Files
rose-ash/hosts/ocaml/lib/sx_cst.ml
giles 5390df7b0b CST parser: lossless concrete syntax tree for .sx files
New sx_cst.ml: CstAtom, CstList, CstDict node types with leading/trailing
trivia (whitespace + comments). Two projections:
- cst_to_source/cst_file_to_source: exact source reconstruction
- cst_to_ast: strip trivia → Sx_types.value for evaluation

New parse_all_cst/parse_file_cst in sx_parser.ml: parallel CST parser
alongside existing AST parser. Reuses read_string, read_symbol, try_number.
Trivia collected via collect_trivia (replaces skip_whitespace_and_comments).

Round-trip invariant: cst_file_to_source(parse_all_cst(src)) = src
Verified on 13 synthetic tests + 7 real codebase files (101KB evaluator,
parser, primitives, render, tree-tools, engine, io).

CST→AST equivalence: cst_to_ast matches parse_all output.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 18:07:35 +00:00

147 lines
4.9 KiB
OCaml

(** Concrete Syntax Tree for SX — lossless source representation.
Every piece of source text is preserved: whitespace, comments,
delimiters, raw token text. The CST supports two projections:
- [cst_to_source]: reconstruct the exact original source
- [cst_to_ast]: strip trivia, produce [Sx_types.value] for evaluation
Trivia attaches to nodes (leading on every node, trailing on
containers before the close delimiter). No separate comment map. *)
open Sx_types
(** {1 Types} *)
type trivia =
| Whitespace of string (** Runs of spaces, tabs, newlines *)
| LineComment of string (** ";;" through end of line, including the ";" chars *)
type span = {
start_offset : int;
end_offset : int;
}
type cst_node =
| CstAtom of {
leading_trivia : trivia list;
token : string; (** Raw source text of the token *)
value : value; (** Parsed semantic value *)
span : span;
}
| CstList of {
leading_trivia : trivia list;
open_delim : char; (** '(' or '[' *)
children : cst_node list;
close_delim : char; (** ')' or ']' *)
trailing_trivia : trivia list; (** Trivia between last child and close delim *)
span : span;
}
| CstDict of {
leading_trivia : trivia list;
children : cst_node list; (** Alternating key/value atoms *)
trailing_trivia : trivia list;
span : span;
}
(** {1 CST → Source (lossless reconstruction)} *)
let trivia_to_string ts =
let buf = Buffer.create 64 in
List.iter (function
| Whitespace s -> Buffer.add_string buf s
| LineComment s -> Buffer.add_string buf s
) ts;
Buffer.contents buf
let rec cst_to_source node =
match node with
| CstAtom { leading_trivia; token; _ } ->
trivia_to_string leading_trivia ^ token
| CstList { leading_trivia; open_delim; children; close_delim; trailing_trivia; _ } ->
let buf = Buffer.create 256 in
Buffer.add_string buf (trivia_to_string leading_trivia);
Buffer.add_char buf open_delim;
List.iter (fun c -> Buffer.add_string buf (cst_to_source c)) children;
Buffer.add_string buf (trivia_to_string trailing_trivia);
Buffer.add_char buf close_delim;
Buffer.contents buf
| CstDict { leading_trivia; children; trailing_trivia; _ } ->
let buf = Buffer.create 256 in
Buffer.add_string buf (trivia_to_string leading_trivia);
Buffer.add_char buf '{';
List.iter (fun c -> Buffer.add_string buf (cst_to_source c)) children;
Buffer.add_string buf (trivia_to_string trailing_trivia);
Buffer.add_char buf '}';
Buffer.contents buf
let cst_to_source_file nodes =
String.concat "" (List.map cst_to_source nodes)
(** Reconstruct source from a parsed file (nodes + trailing trivia). *)
let cst_file_to_source nodes trailing =
cst_to_source_file nodes ^ trivia_to_string trailing
(** {1 CST → AST (strip trivia for evaluation)} *)
let rec cst_to_ast = function
| CstAtom { value; _ } -> value
| CstList { children; _ } ->
List (List.map cst_to_ast children)
| CstDict { children; _ } ->
let d = make_dict () in
let rec pairs = function
| k :: v :: rest ->
let key_str = match cst_to_ast k with
| Keyword k -> k | String k -> k | Symbol k -> k | _ -> ""
in
dict_set d key_str (cst_to_ast v);
pairs rest
| _ -> ()
in
pairs children;
Dict d
(** {1 CST editing — apply AST-level edits back to the CST} *)
(** Replace the CST node at [path] with [new_source], preserving the
original node's leading trivia. [new_source] is parsed as CST so
any comments in it are preserved. *)
let apply_edit path new_cst_nodes original_cst_nodes =
let rec go nodes idx_path =
match idx_path with
| [] -> nodes (* shouldn't happen *)
| [target] ->
List.mapi (fun i node ->
if i = target then
match new_cst_nodes with
| [replacement] ->
(* Preserve original leading trivia *)
let orig_trivia = match node with
| CstAtom { leading_trivia; _ } -> leading_trivia
| CstList { leading_trivia; _ } -> leading_trivia
| CstDict { leading_trivia; _ } -> leading_trivia
in
(match replacement with
| CstAtom r -> CstAtom { r with leading_trivia = orig_trivia }
| CstList r -> CstList { r with leading_trivia = orig_trivia }
| CstDict r -> CstDict { r with leading_trivia = orig_trivia })
| _ -> node (* multi-node replacement: use as-is *)
else node
) nodes
| target :: rest ->
List.mapi (fun i node ->
if i = target then
match node with
| CstList r ->
CstList { r with children = go r.children rest }
| CstDict r ->
CstDict { r with children = go r.children rest }
| _ -> node
else node
) nodes
in
go original_cst_nodes path