(* New sx_cst.ml: CstAtom, CstList, CstDict node types with leading/trailing
   trivia (whitespace + comments).

   Two projections:
   - cst_to_source / cst_file_to_source: exact source reconstruction
   - cst_to_ast: strip trivia → Sx_types.value for evaluation

   New parse_all_cst / parse_file_cst in sx_parser.ml: parallel CST parser
   alongside the existing AST parser. Reuses read_string, read_symbol,
   try_number. Trivia is collected via collect_trivia (replaces
   skip_whitespace_and_comments).

   Round-trip invariant: cst_file_to_source(parse_all_cst(src)) = src
   Verified on 13 synthetic tests + 7 real codebase files (101KB evaluator,
   parser, primitives, render, tree-tools, engine, io).

   CST→AST equivalence: cst_to_ast matches parse_all output.

   Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

   File: 147 lines, 4.9 KiB, OCaml *)
(** Concrete Syntax Tree for SX — lossless source representation.

    Every piece of source text is preserved: whitespace, comments,
    delimiters, raw token text. The CST supports two projections:
    - [cst_to_source]: reconstruct the exact original source
    - [cst_to_ast]: strip trivia, produce [Sx_types.value] for evaluation

    Trivia attaches to nodes (leading on every node, trailing on
    containers before the close delimiter). No separate comment map. *)

open Sx_types

(** {1 Types} *)

(** Non-semantic source text attached to a CST node. Each constructor
    carries the raw text exactly as it should be written back out. *)
type trivia =
  | Whitespace of string
      (** A run of spaces, tabs and/or newlines. *)
  | LineComment of string
      (** A line comment, from its leading [;] characters through the end
          of the line; the [;] characters are part of the payload. *)
(** Source extent of a node, recorded as two integer offsets.

    NOTE(review): the unit of the offsets (bytes vs. characters) and whether
    [end_offset] is inclusive are not visible in this file — confirm against
    the parser that fills these in. *)
type span = {
  start_offset : int;  (** Offset at which the node begins. *)
  end_offset : int;  (** Offset at which the node ends. *)
}
(** A node of the concrete syntax tree.

    Every node carries the trivia that precedes it ([leading_trivia]) and a
    [span]. The two container forms additionally carry [trailing_trivia]:
    the trivia sitting between the last child and the closing delimiter. *)
type cst_node =
  | CstAtom of {
      leading_trivia : trivia list;
      token : string;  (** The token text exactly as written in the source. *)
      value : value;  (** The semantic value the token was parsed into. *)
      span : span;
    }
      (** A single token. *)
  | CstList of {
      leading_trivia : trivia list;
      open_delim : char;  (** Opening delimiter: ['('] or ['\['] . *)
      children : cst_node list;
      close_delim : char;  (** Closing delimiter: [')'] or ['\]'] . *)
      trailing_trivia : trivia list;
          (** Trivia after the last child, before [close_delim]. *)
      span : span;
    }
      (** A delimited sequence of child nodes. *)
  | CstDict of {
      leading_trivia : trivia list;
      children : cst_node list;
          (** Keys and values, alternating: key, value, key, value, ... *)
      trailing_trivia : trivia list;
          (** Trivia after the last child, before the closing brace. *)
      span : span;
    }
      (** A brace-delimited dictionary literal. *)

(** {1 CST → Source (lossless reconstruction)} *)

(** [trivia_to_string ts] returns the raw text of every trivia item in [ts],
    concatenated in list order. An empty list yields the empty string. *)
let trivia_to_string ts =
  (* Both constructors carry their source text verbatim. *)
  let raw_text = function
    | Whitespace text | LineComment text -> text
  in
  ts |> List.map raw_text |> String.concat ""
(** [cst_to_source node] renders [node] back into source text.

    An atom is its leading trivia followed by its raw token. A container is,
    in order: leading trivia, opening delimiter, every child rendered
    recursively, trailing trivia, closing delimiter. Lists use the delimiters
    stored on the node; dictionaries always use braces. *)
let rec cst_to_source node =
  (* Lists and dicts share one layout and differ only in their delimiters. *)
  let render_container ~leading ~opener ~children ~trailing ~closer =
    String.concat ""
      [ trivia_to_string leading;
        String.make 1 opener;
        String.concat "" (List.map cst_to_source children);
        trivia_to_string trailing;
        String.make 1 closer ]
  in
  match node with
  | CstAtom { leading_trivia; token; _ } ->
    trivia_to_string leading_trivia ^ token
  | CstList
      { leading_trivia; open_delim; children; close_delim; trailing_trivia; _ }
    ->
    render_container ~leading:leading_trivia ~opener:open_delim ~children
      ~trailing:trailing_trivia ~closer:close_delim
  | CstDict { leading_trivia; children; trailing_trivia; _ } ->
    render_container ~leading:leading_trivia ~opener:'{' ~children
      ~trailing:trailing_trivia ~closer:'}'
(** [cst_to_source_file nodes] renders each node of [nodes] with
    [cst_to_source] and joins the results, in order, with no separator. *)
let cst_to_source_file nodes =
  let out = Buffer.create 256 in
  List.iter (fun node -> Buffer.add_string out (cst_to_source node)) nodes;
  Buffer.contents out
(** [cst_file_to_source nodes trailing] rebuilds the text of a whole parsed
    file: the rendered top-level [nodes] followed by [trailing], the trivia
    left over after the last node. *)
let cst_file_to_source nodes trailing =
  let body = cst_to_source_file nodes in
  let tail = trivia_to_string trailing in
  body ^ tail

(** {1 CST → AST (strip trivia for evaluation)} *)

(** [cst_to_ast node] drops all trivia, delimiters and spans from [node] and
    returns the corresponding [Sx_types.value].

    - An atom yields the value stored on it.
    - A list yields [List] of its converted children.
    - A dict yields [Dict]: children are consumed two at a time as key/value
      pairs and stored with [dict_set]. A key that converts to a [Keyword],
      [String] or [Symbol] contributes its text; any other key is stored
      under the empty string. A final child without a partner is ignored. *)
let rec cst_to_ast node =
  match node with
  | CstAtom { value; _ } -> value
  | CstList { children; _ } -> List (List.map cst_to_ast children)
  | CstDict { children; _ } ->
    let table = make_dict () in
    (* Text used as the dictionary key for an already-converted key node. *)
    let key_text = function
      | Keyword name | String name | Symbol name -> name
      | _ -> ""
    in
    let rec fill = function
      | key_node :: value_node :: remaining ->
        let key = key_text (cst_to_ast key_node) in
        dict_set table key (cst_to_ast value_node);
        fill remaining
      | [] | [ _ ] -> ()
    in
    fill children;
    Dict table

(** {1 CST editing — apply AST-level edits back to the CST} *)

(** [apply_edit path new_cst_nodes original_cst_nodes] returns a copy of
    [original_cst_nodes] in which the node addressed by [path] has been
    replaced by [new_cst_nodes].

    [path] is a list of zero-based indices. The first index selects a node
    of [original_cst_nodes]; each following index selects a child of the
    [CstList] or [CstDict] selected so far.

    [new_cst_nodes] are already-parsed CST nodes, so any trivia inside them
    is kept. The first replacement node takes over the leading trivia of the
    node it replaces, which keeps the text that separated the old node from
    whatever came before it. Any further replacement nodes are spliced in
    right after the first one with their own trivia untouched.

    The input is returned structurally unchanged when [path] is empty, when
    [new_cst_nodes] is empty, when an index is out of range, or when the
    path tries to descend into a [CstAtom]. Spans are not recomputed: every
    node keeps the span it already had. *)
let apply_edit path new_cst_nodes original_cst_nodes =
  let leading_trivia_of = function
    | CstAtom { leading_trivia; _ } -> leading_trivia
    | CstList { leading_trivia; _ } -> leading_trivia
    | CstDict { leading_trivia; _ } -> leading_trivia
  in
  let with_leading_trivia leading_trivia = function
    | CstAtom r -> CstAtom { r with leading_trivia }
    | CstList r -> CstList { r with leading_trivia }
    | CstDict r -> CstDict { r with leading_trivia }
  in
  (* The nodes that take the place of [node]. An empty replacement is
     treated as "no edit" rather than as a deletion. *)
  let replacement_for node =
    match new_cst_nodes with
    | [] -> [ node ]
    | first :: rest ->
      with_leading_trivia (leading_trivia_of node) first :: rest
  in
  let rec go nodes idx_path =
    match idx_path with
    | [] -> nodes
    | [ target ] ->
      (* Last path step: map each sibling to a list so that a multi-node
         replacement can be spliced in place. *)
      nodes
      |> List.mapi (fun i node ->
             if i = target then replacement_for node else [ node ])
      |> List.concat
    | target :: rest ->
      (* Intermediate step: descend into the selected container. *)
      List.mapi
        (fun i node ->
          if i <> target then node
          else
            match node with
            | CstList r -> CstList { r with children = go r.children rest }
            | CstDict r -> CstDict { r with children = go r.children rest }
            | CstAtom _ -> node)
        nodes
  in
  go original_cst_nodes path