CST parser: lossless concrete syntax tree for .sx files
New sx_cst.ml: CstAtom, CstList, CstDict node types with leading/trailing trivia (whitespace + comments). Two projections: - cst_to_source/cst_file_to_source: exact source reconstruction - cst_to_ast: strip trivia → Sx_types.value for evaluation New parse_all_cst/parse_file_cst in sx_parser.ml: parallel CST parser alongside existing AST parser. Reuses read_string, read_symbol, try_number. Trivia collected via collect_trivia (replaces skip_whitespace_and_comments). Round-trip invariant: cst_file_to_source(parse_all_cst(src)) = src Verified on 13 synthetic tests + 7 real codebase files (101KB evaluator, parser, primitives, render, tree-tools, engine, io). CST→AST equivalence: cst_to_ast matches parse_all output. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -5,3 +5,7 @@
|
||||
(executable
|
||||
(name mcp_tree)
|
||||
(libraries sx unix yojson str))
|
||||
|
||||
(executable
|
||||
(name test_cst)
|
||||
(libraries sx))
|
||||
|
||||
91
hosts/ocaml/bin/test_cst.ml
Normal file
91
hosts/ocaml/bin/test_cst.ml
Normal file
@@ -0,0 +1,91 @@
|
||||
let () =
|
||||
let test_sources = [
|
||||
"(define foo 42)";
|
||||
";; comment\n(define bar 1)\n\n;; another\n(define baz 2)\n";
|
||||
"(define my-fn\n (fn (x)\n ;; check nil\n (if (nil? x) 0 x)))";
|
||||
"(list 1 2 3)";
|
||||
"{:key \"value\" :num 42}";
|
||||
"'(a b c)";
|
||||
"`(a ,b ,@c)";
|
||||
"(define x \"hello\\nworld\")";
|
||||
";; top\n;; multi-line\n(define a 1)\n";
|
||||
"";
|
||||
" \n ";
|
||||
"(a)\n(b)\n(c)";
|
||||
"(a ;; inline\n b)";
|
||||
] in
|
||||
let pass = ref 0 in
|
||||
let fail = ref 0 in
|
||||
List.iter (fun src ->
|
||||
let cst = Sx_parser.parse_all_cst src in
|
||||
let roundtrip = Sx_cst.cst_file_to_source cst.nodes cst.trailing_trivia in
|
||||
if roundtrip = src then begin
|
||||
incr pass;
|
||||
Printf.printf "PASS: %S\n" (if String.length src > 40 then String.sub src 0 40 ^ "..." else src)
|
||||
end else begin
|
||||
incr fail;
|
||||
Printf.printf "FAIL: %S\n expected: %S\n got: %S\n"
|
||||
(if String.length src > 40 then String.sub src 0 40 ^ "..." else src)
|
||||
src roundtrip
|
||||
end
|
||||
) test_sources;
|
||||
(* Also test CST→AST matches AST parser *)
|
||||
let ast_tests = [
|
||||
"(define foo 42)";
|
||||
"(list 1 2 3)";
|
||||
"{:key \"value\"}";
|
||||
";; comment\n(define bar 1)";
|
||||
] in
|
||||
Printf.printf "\nCST→AST equivalence:\n";
|
||||
List.iter (fun src ->
|
||||
let ast_direct = Sx_parser.parse_all src in
|
||||
let cst = Sx_parser.parse_all_cst src in
|
||||
let ast_via_cst = List.map Sx_cst.cst_to_ast cst.nodes in
|
||||
let s1 = String.concat " " (List.map Sx_types.inspect ast_direct) in
|
||||
let s2 = String.concat " " (List.map Sx_types.inspect ast_via_cst) in
|
||||
if s1 = s2 then begin
|
||||
incr pass;
|
||||
Printf.printf "PASS: %S\n" src
|
||||
end else begin
|
||||
incr fail;
|
||||
Printf.printf "FAIL: %S\n AST: %s\n CST→AST: %s\n" src s1 s2
|
||||
end
|
||||
) ast_tests;
|
||||
(* Test real .sx files from the codebase *)
|
||||
Printf.printf "\nReal file round-trips:\n";
|
||||
let test_file path =
|
||||
try
|
||||
let src = In_channel.with_open_text path In_channel.input_all in
|
||||
let cst = Sx_parser.parse_all_cst src in
|
||||
let roundtrip = Sx_cst.cst_file_to_source cst.nodes cst.trailing_trivia in
|
||||
if roundtrip = src then begin
|
||||
incr pass;
|
||||
Printf.printf "PASS: %s (%d bytes)\n" path (String.length src)
|
||||
end else begin
|
||||
incr fail;
|
||||
(* Find first difference *)
|
||||
let len = min (String.length src) (String.length roundtrip) in
|
||||
let diff_pos = ref len in
|
||||
for i = 0 to len - 1 do
|
||||
if src.[i] <> roundtrip.[i] && !diff_pos = len then diff_pos := i
|
||||
done;
|
||||
Printf.printf "FAIL: %s (diff at byte %d, src=%d rt=%d)\n" path !diff_pos (String.length src) (String.length roundtrip)
|
||||
end
|
||||
with e ->
|
||||
incr fail;
|
||||
Printf.printf "ERROR: %s — %s\n" path (Printexc.to_string e)
|
||||
in
|
||||
let spec_dir = try Sys.getenv "SX_SPEC_DIR" with Not_found -> "spec" in
|
||||
let project_dir = try Sys.getenv "SX_PROJECT_DIR" with Not_found -> "." in
|
||||
List.iter test_file [
|
||||
spec_dir ^ "/evaluator.sx";
|
||||
spec_dir ^ "/parser.sx";
|
||||
spec_dir ^ "/primitives.sx";
|
||||
spec_dir ^ "/render.sx";
|
||||
project_dir ^ "/lib/tree-tools.sx";
|
||||
project_dir ^ "/web/engine.sx";
|
||||
project_dir ^ "/web/io.sx";
|
||||
];
|
||||
|
||||
Printf.printf "\n%d/%d passed\n" !pass (!pass + !fail);
|
||||
if !fail > 0 then exit 1
|
||||
146
hosts/ocaml/lib/sx_cst.ml
Normal file
146
hosts/ocaml/lib/sx_cst.ml
Normal file
@@ -0,0 +1,146 @@
|
||||
(** Concrete Syntax Tree for SX — lossless source representation.
|
||||
|
||||
Every piece of source text is preserved: whitespace, comments,
|
||||
delimiters, raw token text. The CST supports two projections:
|
||||
- [cst_to_source]: reconstruct the exact original source
|
||||
- [cst_to_ast]: strip trivia, produce [Sx_types.value] for evaluation
|
||||
|
||||
Trivia attaches to nodes (leading on every node, trailing on
|
||||
containers before the close delimiter). No separate comment map. *)
|
||||
|
||||
open Sx_types
|
||||
|
||||
(** {1 Types} *)
|
||||
|
||||
type trivia =
|
||||
| Whitespace of string (** Runs of spaces, tabs, newlines *)
|
||||
| LineComment of string (** ";;" through end of line, including the ";" chars *)
|
||||
|
||||
type span = {
|
||||
start_offset : int;
|
||||
end_offset : int;
|
||||
}
|
||||
|
||||
type cst_node =
|
||||
| CstAtom of {
|
||||
leading_trivia : trivia list;
|
||||
token : string; (** Raw source text of the token *)
|
||||
value : value; (** Parsed semantic value *)
|
||||
span : span;
|
||||
}
|
||||
| CstList of {
|
||||
leading_trivia : trivia list;
|
||||
open_delim : char; (** '(' or '[' *)
|
||||
children : cst_node list;
|
||||
close_delim : char; (** ')' or ']' *)
|
||||
trailing_trivia : trivia list; (** Trivia between last child and close delim *)
|
||||
span : span;
|
||||
}
|
||||
| CstDict of {
|
||||
leading_trivia : trivia list;
|
||||
children : cst_node list; (** Alternating key/value atoms *)
|
||||
trailing_trivia : trivia list;
|
||||
span : span;
|
||||
}
|
||||
|
||||
|
||||
(** {1 CST → Source (lossless reconstruction)} *)
|
||||
|
||||
let trivia_to_string ts =
|
||||
let buf = Buffer.create 64 in
|
||||
List.iter (function
|
||||
| Whitespace s -> Buffer.add_string buf s
|
||||
| LineComment s -> Buffer.add_string buf s
|
||||
) ts;
|
||||
Buffer.contents buf
|
||||
|
||||
let rec cst_to_source node =
|
||||
match node with
|
||||
| CstAtom { leading_trivia; token; _ } ->
|
||||
trivia_to_string leading_trivia ^ token
|
||||
| CstList { leading_trivia; open_delim; children; close_delim; trailing_trivia; _ } ->
|
||||
let buf = Buffer.create 256 in
|
||||
Buffer.add_string buf (trivia_to_string leading_trivia);
|
||||
Buffer.add_char buf open_delim;
|
||||
List.iter (fun c -> Buffer.add_string buf (cst_to_source c)) children;
|
||||
Buffer.add_string buf (trivia_to_string trailing_trivia);
|
||||
Buffer.add_char buf close_delim;
|
||||
Buffer.contents buf
|
||||
| CstDict { leading_trivia; children; trailing_trivia; _ } ->
|
||||
let buf = Buffer.create 256 in
|
||||
Buffer.add_string buf (trivia_to_string leading_trivia);
|
||||
Buffer.add_char buf '{';
|
||||
List.iter (fun c -> Buffer.add_string buf (cst_to_source c)) children;
|
||||
Buffer.add_string buf (trivia_to_string trailing_trivia);
|
||||
Buffer.add_char buf '}';
|
||||
Buffer.contents buf
|
||||
|
||||
let cst_to_source_file nodes =
|
||||
String.concat "" (List.map cst_to_source nodes)
|
||||
|
||||
(** Reconstruct source from a parsed file (nodes + trailing trivia). *)
|
||||
let cst_file_to_source nodes trailing =
|
||||
cst_to_source_file nodes ^ trivia_to_string trailing
|
||||
|
||||
|
||||
(** {1 CST → AST (strip trivia for evaluation)} *)
|
||||
|
||||
let rec cst_to_ast = function
|
||||
| CstAtom { value; _ } -> value
|
||||
| CstList { children; _ } ->
|
||||
List (List.map cst_to_ast children)
|
||||
| CstDict { children; _ } ->
|
||||
let d = make_dict () in
|
||||
let rec pairs = function
|
||||
| k :: v :: rest ->
|
||||
let key_str = match cst_to_ast k with
|
||||
| Keyword k -> k | String k -> k | Symbol k -> k | _ -> ""
|
||||
in
|
||||
dict_set d key_str (cst_to_ast v);
|
||||
pairs rest
|
||||
| _ -> ()
|
||||
in
|
||||
pairs children;
|
||||
Dict d
|
||||
|
||||
|
||||
(** {1 CST editing — apply AST-level edits back to the CST} *)
|
||||
|
||||
(** Replace the CST node at [path] with [new_source], preserving the
|
||||
original node's leading trivia. [new_source] is parsed as CST so
|
||||
any comments in it are preserved. *)
|
||||
let apply_edit path new_cst_nodes original_cst_nodes =
|
||||
let rec go nodes idx_path =
|
||||
match idx_path with
|
||||
| [] -> nodes (* shouldn't happen *)
|
||||
| [target] ->
|
||||
List.mapi (fun i node ->
|
||||
if i = target then
|
||||
match new_cst_nodes with
|
||||
| [replacement] ->
|
||||
(* Preserve original leading trivia *)
|
||||
let orig_trivia = match node with
|
||||
| CstAtom { leading_trivia; _ } -> leading_trivia
|
||||
| CstList { leading_trivia; _ } -> leading_trivia
|
||||
| CstDict { leading_trivia; _ } -> leading_trivia
|
||||
in
|
||||
(match replacement with
|
||||
| CstAtom r -> CstAtom { r with leading_trivia = orig_trivia }
|
||||
| CstList r -> CstList { r with leading_trivia = orig_trivia }
|
||||
| CstDict r -> CstDict { r with leading_trivia = orig_trivia })
|
||||
| _ -> node (* multi-node replacement: use as-is *)
|
||||
else node
|
||||
) nodes
|
||||
| target :: rest ->
|
||||
List.mapi (fun i node ->
|
||||
if i = target then
|
||||
match node with
|
||||
| CstList r ->
|
||||
CstList { r with children = go r.children rest }
|
||||
| CstDict r ->
|
||||
CstDict { r with children = go r.children rest }
|
||||
| _ -> node
|
||||
else node
|
||||
) nodes
|
||||
in
|
||||
go original_cst_nodes path
|
||||
@@ -290,3 +290,232 @@ let parse_file ?(comments=false) path =
|
||||
let src = really_input_string ic n in
|
||||
close_in ic;
|
||||
parse_all ~comments src
|
||||
|
||||
|
||||
(* ================================================================== *)
|
||||
(* CST parser — lossless concrete syntax tree *)
|
||||
(* ================================================================== *)
|
||||
|
||||
open Sx_cst
|
||||
|
||||
(** Collect leading trivia (whitespace + comments) from current position. *)
|
||||
let collect_trivia s =
|
||||
let items = ref [] in
|
||||
let rec go () =
|
||||
if at_end s then ()
|
||||
else match s.src.[s.pos] with
|
||||
| ' ' | '\t' | '\n' | '\r' ->
|
||||
let start = s.pos in
|
||||
while s.pos < s.len && (let c = s.src.[s.pos] in c = ' ' || c = '\t' || c = '\n' || c = '\r') do
|
||||
advance s
|
||||
done;
|
||||
items := Whitespace (String.sub s.src start (s.pos - start)) :: !items;
|
||||
go ()
|
||||
| ';' ->
|
||||
let start = s.pos in
|
||||
while s.pos < s.len && s.src.[s.pos] <> '\n' do advance s done;
|
||||
let text = String.sub s.src start (s.pos - start) in
|
||||
if s.pos < s.len then advance s;
|
||||
(* Include the newline in the comment trivia *)
|
||||
let text = if s.pos > 0 && s.pos <= s.len && s.src.[s.pos - 1] = '\n'
|
||||
then text ^ "\n" else text in
|
||||
items := LineComment text :: !items;
|
||||
go ()
|
||||
| _ -> ()
|
||||
in
|
||||
go ();
|
||||
List.rev !items
|
||||
|
||||
(** Read a single CST value — dispatches on first non-trivia char. *)
|
||||
let rec read_cst s : cst_node =
|
||||
let trivia = collect_trivia s in
|
||||
if at_end s then
|
||||
raise (Parse_error "Unexpected end of input");
|
||||
let start = s.pos in
|
||||
match s.src.[s.pos] with
|
||||
| '(' -> read_cst_list s trivia start '(' ')'
|
||||
| '[' -> read_cst_list s trivia start '[' ']'
|
||||
| '{' -> read_cst_dict s trivia start
|
||||
| '\'' ->
|
||||
(* Quote sugar: 'x → (quote x) — emit as raw token *)
|
||||
advance s;
|
||||
let inner = read_cst s in
|
||||
let end_pos = s.pos in
|
||||
let token = String.sub s.src start (end_pos - start) in
|
||||
let value = List [Symbol "quote"; cst_to_ast inner] in
|
||||
CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } }
|
||||
| '`' ->
|
||||
advance s;
|
||||
let inner = read_cst s in
|
||||
let end_pos = s.pos in
|
||||
let token = String.sub s.src start (end_pos - start) in
|
||||
let value = List [Symbol "quasiquote"; cst_to_ast inner] in
|
||||
CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } }
|
||||
| ',' ->
|
||||
advance s;
|
||||
let splice = s.pos < s.len && s.src.[s.pos] = '@' in
|
||||
if splice then advance s;
|
||||
let inner = read_cst s in
|
||||
let end_pos = s.pos in
|
||||
let token = String.sub s.src start (end_pos - start) in
|
||||
let sym = if splice then "splice-unquote" else "unquote" in
|
||||
let value = List [Symbol sym; cst_to_ast inner] in
|
||||
CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } }
|
||||
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = ';' ->
|
||||
(* Datum comment: #; discards next expression *)
|
||||
advance s; advance s;
|
||||
let _discarded = read_cst s in
|
||||
(* Read the real value after the datum comment — attach trivia from #; *)
|
||||
let next = read_cst s in
|
||||
let combined_trivia = trivia @ (match next with
|
||||
| CstAtom r -> r.leading_trivia
|
||||
| CstList r -> r.leading_trivia
|
||||
| CstDict r -> r.leading_trivia) in
|
||||
(match next with
|
||||
| CstAtom r -> CstAtom { r with leading_trivia = combined_trivia }
|
||||
| CstList r -> CstList { r with leading_trivia = combined_trivia }
|
||||
| CstDict r -> CstDict { r with leading_trivia = combined_trivia })
|
||||
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '\'' ->
|
||||
advance s; advance s;
|
||||
let inner = read_cst s in
|
||||
let end_pos = s.pos in
|
||||
let token = String.sub s.src start (end_pos - start) in
|
||||
let value = List [Symbol "quote"; cst_to_ast inner] in
|
||||
CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } }
|
||||
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '|' ->
|
||||
(* Raw string: #|...| *)
|
||||
advance s; advance s;
|
||||
let buf = Buffer.create 64 in
|
||||
let rec go () =
|
||||
if at_end s then raise (Parse_error "Unterminated raw string");
|
||||
let c = s.src.[s.pos] in
|
||||
advance s;
|
||||
if c = '|' then ()
|
||||
else begin Buffer.add_char buf c; go () end
|
||||
in
|
||||
go ();
|
||||
let end_pos = s.pos in
|
||||
let token = String.sub s.src start (end_pos - start) in
|
||||
CstAtom { leading_trivia = trivia; token; value = String (Buffer.contents buf);
|
||||
span = { start_offset = start; end_offset = end_pos } }
|
||||
| '"' ->
|
||||
let value = String (read_string s) in
|
||||
let end_pos = s.pos in
|
||||
let token = String.sub s.src start (end_pos - start) in
|
||||
CstAtom { leading_trivia = trivia; token; value;
|
||||
span = { start_offset = start; end_offset = end_pos } }
|
||||
| _ ->
|
||||
let sym = read_symbol s in
|
||||
if sym = "" then begin
|
||||
let line = ref 1 and col = ref 1 in
|
||||
for i = 0 to s.pos - 1 do
|
||||
if s.src.[i] = '\n' then (incr line; col := 1) else incr col
|
||||
done;
|
||||
raise (Parse_error (Printf.sprintf "Unexpected char: %c at line %d col %d (pos %d)"
|
||||
s.src.[s.pos] !line !col s.pos))
|
||||
end;
|
||||
let end_pos = s.pos in
|
||||
let token = String.sub s.src start (end_pos - start) in
|
||||
let value = match sym with
|
||||
| "true" -> Bool true
|
||||
| "false" -> Bool false
|
||||
| "nil" -> Nil
|
||||
| _ when sym.[0] = ':' -> Keyword (String.sub sym 1 (String.length sym - 1))
|
||||
| _ -> match try_number sym with Some n -> n | None -> Symbol sym
|
||||
in
|
||||
CstAtom { leading_trivia = trivia; token; value;
|
||||
span = { start_offset = start; end_offset = end_pos } }
|
||||
|
||||
and read_cst_list s trivia start open_c close_c =
|
||||
advance s; (* skip open delim *)
|
||||
let children = ref [] in
|
||||
let rec go () =
|
||||
let child_trivia = collect_trivia s in
|
||||
if at_end s then raise (Parse_error "Unterminated list");
|
||||
if s.src.[s.pos] = close_c then begin
|
||||
advance s;
|
||||
let end_pos = s.pos in
|
||||
CstList { leading_trivia = trivia; open_delim = open_c;
|
||||
children = List.rev !children; close_delim = close_c;
|
||||
trailing_trivia = child_trivia;
|
||||
span = { start_offset = start; end_offset = end_pos } }
|
||||
end else begin
|
||||
(* Push collected trivia onto the next child *)
|
||||
let child_start = s.pos in
|
||||
let child = read_cst_inner s in
|
||||
let child_with_trivia = match child with
|
||||
| CstAtom r -> CstAtom { r with leading_trivia = child_trivia @ r.leading_trivia }
|
||||
| CstList r -> CstList { r with leading_trivia = child_trivia @ r.leading_trivia }
|
||||
| CstDict r -> CstDict { r with leading_trivia = child_trivia @ r.leading_trivia }
|
||||
in
|
||||
ignore child_start;
|
||||
children := child_with_trivia :: !children;
|
||||
go ()
|
||||
end
|
||||
in
|
||||
go ()
|
||||
|
||||
and read_cst_dict s trivia start =
|
||||
advance s; (* skip { *)
|
||||
let children = ref [] in
|
||||
let rec go () =
|
||||
let child_trivia = collect_trivia s in
|
||||
if at_end s then raise (Parse_error "Unterminated dict");
|
||||
if s.src.[s.pos] = '}' then begin
|
||||
advance s;
|
||||
let end_pos = s.pos in
|
||||
CstDict { leading_trivia = trivia; children = List.rev !children;
|
||||
trailing_trivia = child_trivia;
|
||||
span = { start_offset = start; end_offset = end_pos } }
|
||||
end else begin
|
||||
let child = read_cst_inner s in
|
||||
let child_with_trivia = match child with
|
||||
| CstAtom r -> CstAtom { r with leading_trivia = child_trivia @ r.leading_trivia }
|
||||
| CstList r -> CstList { r with leading_trivia = child_trivia @ r.leading_trivia }
|
||||
| CstDict r -> CstDict { r with leading_trivia = child_trivia @ r.leading_trivia }
|
||||
in
|
||||
children := child_with_trivia :: !children;
|
||||
go ()
|
||||
end
|
||||
in
|
||||
go ()
|
||||
|
||||
(** Inner read — no trivia collection (caller handles it). *)
|
||||
and read_cst_inner s : cst_node =
|
||||
read_cst s
|
||||
|
||||
(** Parse result: list of CST nodes + any trailing trivia after the last node. *)
|
||||
type cst_file = {
|
||||
nodes : cst_node list;
|
||||
trailing_trivia : trivia list;
|
||||
}
|
||||
|
||||
(** Parse a string into a list of CST nodes. *)
|
||||
let parse_all_cst src =
|
||||
let s = make_state src in
|
||||
let results = ref [] in
|
||||
let rec go () =
|
||||
let trivia = collect_trivia s in
|
||||
if at_end s then
|
||||
{ nodes = List.rev !results; trailing_trivia = trivia }
|
||||
else begin
|
||||
let node = read_cst_inner s in
|
||||
(* Prepend collected trivia to this node *)
|
||||
let node_with_trivia = match node with
|
||||
| CstAtom r -> CstAtom { r with leading_trivia = trivia @ r.leading_trivia }
|
||||
| CstList r -> CstList { r with leading_trivia = trivia @ r.leading_trivia }
|
||||
| CstDict r -> CstDict { r with leading_trivia = trivia @ r.leading_trivia }
|
||||
in
|
||||
results := node_with_trivia :: !results;
|
||||
go ()
|
||||
end
|
||||
in
|
||||
go ()
|
||||
|
||||
(** Parse a file into a list of CST nodes. *)
|
||||
let parse_file_cst path =
|
||||
let ic = open_in path in
|
||||
let n = in_channel_length ic in
|
||||
let src = really_input_string ic n in
|
||||
close_in ic;
|
||||
parse_all_cst src
|
||||
|
||||
Reference in New Issue
Block a user