CST parser: lossless concrete syntax tree for .sx files

New sx_cst.ml: CstAtom, CstList, CstDict node types with leading/trailing
trivia (whitespace + comments). Two projections:
- cst_to_source/cst_file_to_source: exact source reconstruction
- cst_to_ast: strip trivia → Sx_types.value for evaluation

New parse_all_cst/parse_file_cst in sx_parser.ml: parallel CST parser
alongside existing AST parser. Reuses read_string, read_symbol, try_number.
Trivia collected via collect_trivia (replaces skip_whitespace_and_comments).

Round-trip invariant: cst_file_to_source(parse_all_cst(src)) = src
Verified on 13 synthetic tests + 7 real codebase files (101KB evaluator,
parser, primitives, render, tree-tools, engine, io).

CST→AST equivalence: cst_to_ast matches parse_all output.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-03 18:07:35 +00:00
parent 36acb56a3a
commit 5390df7b0b
4 changed files with 470 additions and 0 deletions

146
hosts/ocaml/lib/sx_cst.ml Normal file
View File

@@ -0,0 +1,146 @@
(** Concrete Syntax Tree for SX — lossless source representation.
Every piece of source text is preserved: whitespace, comments,
delimiters, raw token text. The CST supports two projections:
- [cst_to_source]: reconstruct the exact original source
- [cst_to_ast]: strip trivia, produce [Sx_types.value] for evaluation
Trivia attaches to nodes (leading on every node, trailing on
containers before the close delimiter). No separate comment map. *)
open Sx_types
(** {1 Types} *)
type trivia =
| Whitespace of string (** Runs of spaces, tabs, newlines *)
| LineComment of string (** ";;" through end of line, including the ";" chars *)
type span = {
start_offset : int;
end_offset : int;
}
type cst_node =
| CstAtom of {
leading_trivia : trivia list;
token : string; (** Raw source text of the token *)
value : value; (** Parsed semantic value *)
span : span;
}
| CstList of {
leading_trivia : trivia list;
open_delim : char; (** '(' or '[' *)
children : cst_node list;
close_delim : char; (** ')' or ']' *)
trailing_trivia : trivia list; (** Trivia between last child and close delim *)
span : span;
}
| CstDict of {
leading_trivia : trivia list;
children : cst_node list; (** Alternating key/value atoms *)
trailing_trivia : trivia list;
span : span;
}
(** {1 CST → Source (lossless reconstruction)} *)
let trivia_to_string ts =
let buf = Buffer.create 64 in
List.iter (function
| Whitespace s -> Buffer.add_string buf s
| LineComment s -> Buffer.add_string buf s
) ts;
Buffer.contents buf
let rec cst_to_source node =
match node with
| CstAtom { leading_trivia; token; _ } ->
trivia_to_string leading_trivia ^ token
| CstList { leading_trivia; open_delim; children; close_delim; trailing_trivia; _ } ->
let buf = Buffer.create 256 in
Buffer.add_string buf (trivia_to_string leading_trivia);
Buffer.add_char buf open_delim;
List.iter (fun c -> Buffer.add_string buf (cst_to_source c)) children;
Buffer.add_string buf (trivia_to_string trailing_trivia);
Buffer.add_char buf close_delim;
Buffer.contents buf
| CstDict { leading_trivia; children; trailing_trivia; _ } ->
let buf = Buffer.create 256 in
Buffer.add_string buf (trivia_to_string leading_trivia);
Buffer.add_char buf '{';
List.iter (fun c -> Buffer.add_string buf (cst_to_source c)) children;
Buffer.add_string buf (trivia_to_string trailing_trivia);
Buffer.add_char buf '}';
Buffer.contents buf
let cst_to_source_file nodes =
String.concat "" (List.map cst_to_source nodes)
(** Reconstruct source from a parsed file (nodes + trailing trivia). *)
let cst_file_to_source nodes trailing =
cst_to_source_file nodes ^ trivia_to_string trailing
(** {1 CST → AST (strip trivia for evaluation)} *)
let rec cst_to_ast = function
| CstAtom { value; _ } -> value
| CstList { children; _ } ->
List (List.map cst_to_ast children)
| CstDict { children; _ } ->
let d = make_dict () in
let rec pairs = function
| k :: v :: rest ->
let key_str = match cst_to_ast k with
| Keyword k -> k | String k -> k | Symbol k -> k | _ -> ""
in
dict_set d key_str (cst_to_ast v);
pairs rest
| _ -> ()
in
pairs children;
Dict d
(** {1 CST editing — apply AST-level edits back to the CST} *)
(** Replace the CST node at [path] with [new_source], preserving the
original node's leading trivia. [new_source] is parsed as CST so
any comments in it are preserved. *)
let apply_edit path new_cst_nodes original_cst_nodes =
let rec go nodes idx_path =
match idx_path with
| [] -> nodes (* shouldn't happen *)
| [target] ->
List.mapi (fun i node ->
if i = target then
match new_cst_nodes with
| [replacement] ->
(* Preserve original leading trivia *)
let orig_trivia = match node with
| CstAtom { leading_trivia; _ } -> leading_trivia
| CstList { leading_trivia; _ } -> leading_trivia
| CstDict { leading_trivia; _ } -> leading_trivia
in
(match replacement with
| CstAtom r -> CstAtom { r with leading_trivia = orig_trivia }
| CstList r -> CstList { r with leading_trivia = orig_trivia }
| CstDict r -> CstDict { r with leading_trivia = orig_trivia })
| _ -> node (* multi-node replacement: use as-is *)
else node
) nodes
| target :: rest ->
List.mapi (fun i node ->
if i = target then
match node with
| CstList r ->
CstList { r with children = go r.children rest }
| CstDict r ->
CstDict { r with children = go r.children rest }
| _ -> node
else node
) nodes
in
go original_cst_nodes path

View File

@@ -290,3 +290,232 @@ let parse_file ?(comments=false) path =
let src = really_input_string ic n in
close_in ic;
parse_all ~comments src
(* ================================================================== *)
(* CST parser — lossless concrete syntax tree *)
(* ================================================================== *)
open Sx_cst
(** Collect leading trivia (whitespace + comments) from current position. *)
let collect_trivia s =
let items = ref [] in
let rec go () =
if at_end s then ()
else match s.src.[s.pos] with
| ' ' | '\t' | '\n' | '\r' ->
let start = s.pos in
while s.pos < s.len && (let c = s.src.[s.pos] in c = ' ' || c = '\t' || c = '\n' || c = '\r') do
advance s
done;
items := Whitespace (String.sub s.src start (s.pos - start)) :: !items;
go ()
| ';' ->
let start = s.pos in
while s.pos < s.len && s.src.[s.pos] <> '\n' do advance s done;
let text = String.sub s.src start (s.pos - start) in
if s.pos < s.len then advance s;
(* Include the newline in the comment trivia *)
let text = if s.pos > 0 && s.pos <= s.len && s.src.[s.pos - 1] = '\n'
then text ^ "\n" else text in
items := LineComment text :: !items;
go ()
| _ -> ()
in
go ();
List.rev !items
(** Read a single CST value — dispatches on first non-trivia char. *)
let rec read_cst s : cst_node =
let trivia = collect_trivia s in
if at_end s then
raise (Parse_error "Unexpected end of input");
let start = s.pos in
match s.src.[s.pos] with
| '(' -> read_cst_list s trivia start '(' ')'
| '[' -> read_cst_list s trivia start '[' ']'
| '{' -> read_cst_dict s trivia start
| '\'' ->
(* Quote sugar: 'x → (quote x) — emit as raw token *)
advance s;
let inner = read_cst s in
let end_pos = s.pos in
let token = String.sub s.src start (end_pos - start) in
let value = List [Symbol "quote"; cst_to_ast inner] in
CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } }
| '`' ->
advance s;
let inner = read_cst s in
let end_pos = s.pos in
let token = String.sub s.src start (end_pos - start) in
let value = List [Symbol "quasiquote"; cst_to_ast inner] in
CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } }
| ',' ->
advance s;
let splice = s.pos < s.len && s.src.[s.pos] = '@' in
if splice then advance s;
let inner = read_cst s in
let end_pos = s.pos in
let token = String.sub s.src start (end_pos - start) in
let sym = if splice then "splice-unquote" else "unquote" in
let value = List [Symbol sym; cst_to_ast inner] in
CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } }
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = ';' ->
(* Datum comment: #; discards next expression *)
advance s; advance s;
let _discarded = read_cst s in
(* Read the real value after the datum comment — attach trivia from #; *)
let next = read_cst s in
let combined_trivia = trivia @ (match next with
| CstAtom r -> r.leading_trivia
| CstList r -> r.leading_trivia
| CstDict r -> r.leading_trivia) in
(match next with
| CstAtom r -> CstAtom { r with leading_trivia = combined_trivia }
| CstList r -> CstList { r with leading_trivia = combined_trivia }
| CstDict r -> CstDict { r with leading_trivia = combined_trivia })
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '\'' ->
advance s; advance s;
let inner = read_cst s in
let end_pos = s.pos in
let token = String.sub s.src start (end_pos - start) in
let value = List [Symbol "quote"; cst_to_ast inner] in
CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } }
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '|' ->
(* Raw string: #|...| *)
advance s; advance s;
let buf = Buffer.create 64 in
let rec go () =
if at_end s then raise (Parse_error "Unterminated raw string");
let c = s.src.[s.pos] in
advance s;
if c = '|' then ()
else begin Buffer.add_char buf c; go () end
in
go ();
let end_pos = s.pos in
let token = String.sub s.src start (end_pos - start) in
CstAtom { leading_trivia = trivia; token; value = String (Buffer.contents buf);
span = { start_offset = start; end_offset = end_pos } }
| '"' ->
let value = String (read_string s) in
let end_pos = s.pos in
let token = String.sub s.src start (end_pos - start) in
CstAtom { leading_trivia = trivia; token; value;
span = { start_offset = start; end_offset = end_pos } }
| _ ->
let sym = read_symbol s in
if sym = "" then begin
let line = ref 1 and col = ref 1 in
for i = 0 to s.pos - 1 do
if s.src.[i] = '\n' then (incr line; col := 1) else incr col
done;
raise (Parse_error (Printf.sprintf "Unexpected char: %c at line %d col %d (pos %d)"
s.src.[s.pos] !line !col s.pos))
end;
let end_pos = s.pos in
let token = String.sub s.src start (end_pos - start) in
let value = match sym with
| "true" -> Bool true
| "false" -> Bool false
| "nil" -> Nil
| _ when sym.[0] = ':' -> Keyword (String.sub sym 1 (String.length sym - 1))
| _ -> match try_number sym with Some n -> n | None -> Symbol sym
in
CstAtom { leading_trivia = trivia; token; value;
span = { start_offset = start; end_offset = end_pos } }
and read_cst_list s trivia start open_c close_c =
advance s; (* skip open delim *)
let children = ref [] in
let rec go () =
let child_trivia = collect_trivia s in
if at_end s then raise (Parse_error "Unterminated list");
if s.src.[s.pos] = close_c then begin
advance s;
let end_pos = s.pos in
CstList { leading_trivia = trivia; open_delim = open_c;
children = List.rev !children; close_delim = close_c;
trailing_trivia = child_trivia;
span = { start_offset = start; end_offset = end_pos } }
end else begin
(* Push collected trivia onto the next child *)
let child_start = s.pos in
let child = read_cst_inner s in
let child_with_trivia = match child with
| CstAtom r -> CstAtom { r with leading_trivia = child_trivia @ r.leading_trivia }
| CstList r -> CstList { r with leading_trivia = child_trivia @ r.leading_trivia }
| CstDict r -> CstDict { r with leading_trivia = child_trivia @ r.leading_trivia }
in
ignore child_start;
children := child_with_trivia :: !children;
go ()
end
in
go ()
and read_cst_dict s trivia start =
advance s; (* skip { *)
let children = ref [] in
let rec go () =
let child_trivia = collect_trivia s in
if at_end s then raise (Parse_error "Unterminated dict");
if s.src.[s.pos] = '}' then begin
advance s;
let end_pos = s.pos in
CstDict { leading_trivia = trivia; children = List.rev !children;
trailing_trivia = child_trivia;
span = { start_offset = start; end_offset = end_pos } }
end else begin
let child = read_cst_inner s in
let child_with_trivia = match child with
| CstAtom r -> CstAtom { r with leading_trivia = child_trivia @ r.leading_trivia }
| CstList r -> CstList { r with leading_trivia = child_trivia @ r.leading_trivia }
| CstDict r -> CstDict { r with leading_trivia = child_trivia @ r.leading_trivia }
in
children := child_with_trivia :: !children;
go ()
end
in
go ()
(** Inner read — no trivia collection (caller handles it). *)
and read_cst_inner s : cst_node =
read_cst s
(** Parse result: list of CST nodes + any trailing trivia after the last node. *)
type cst_file = {
nodes : cst_node list;
trailing_trivia : trivia list;
}
(** Parse a string into a list of CST nodes. *)
let parse_all_cst src =
let s = make_state src in
let results = ref [] in
let rec go () =
let trivia = collect_trivia s in
if at_end s then
{ nodes = List.rev !results; trailing_trivia = trivia }
else begin
let node = read_cst_inner s in
(* Prepend collected trivia to this node *)
let node_with_trivia = match node with
| CstAtom r -> CstAtom { r with leading_trivia = trivia @ r.leading_trivia }
| CstList r -> CstList { r with leading_trivia = trivia @ r.leading_trivia }
| CstDict r -> CstDict { r with leading_trivia = trivia @ r.leading_trivia }
in
results := node_with_trivia :: !results;
go ()
end
in
go ()
(** Parse a file into a list of CST nodes. *)
let parse_file_cst path =
let ic = open_in path in
let n = in_channel_length ic in
let src = really_input_string ic n in
close_in ic;
parse_all_cst src