diff --git a/hosts/ocaml/bin/dune b/hosts/ocaml/bin/dune index b94d477f..1d28418b 100644 --- a/hosts/ocaml/bin/dune +++ b/hosts/ocaml/bin/dune @@ -5,3 +5,7 @@ (executable (name mcp_tree) (libraries sx unix yojson str)) + +(executable + (name test_cst) + (libraries sx)) diff --git a/hosts/ocaml/bin/test_cst.ml b/hosts/ocaml/bin/test_cst.ml new file mode 100644 index 00000000..d875515b --- /dev/null +++ b/hosts/ocaml/bin/test_cst.ml @@ -0,0 +1,91 @@ +let () = + let test_sources = [ + "(define foo 42)"; + ";; comment\n(define bar 1)\n\n;; another\n(define baz 2)\n"; + "(define my-fn\n (fn (x)\n ;; check nil\n (if (nil? x) 0 x)))"; + "(list 1 2 3)"; + "{:key \"value\" :num 42}"; + "'(a b c)"; + "`(a ,b ,@c)"; + "(define x \"hello\\nworld\")"; + ";; top\n;; multi-line\n(define a 1)\n"; + ""; + " \n "; + "(a)\n(b)\n(c)"; + "(a ;; inline\n b)"; + ] in + let pass = ref 0 in + let fail = ref 0 in + List.iter (fun src -> + let cst = Sx_parser.parse_all_cst src in + let roundtrip = Sx_cst.cst_file_to_source cst.nodes cst.trailing_trivia in + if roundtrip = src then begin + incr pass; + Printf.printf "PASS: %S\n" (if String.length src > 40 then String.sub src 0 40 ^ "..." else src) + end else begin + incr fail; + Printf.printf "FAIL: %S\n expected: %S\n got: %S\n" + (if String.length src > 40 then String.sub src 0 40 ^ "..." else src) + src roundtrip + end + ) test_sources; + (* Also test CST→AST matches AST parser *) + let ast_tests = [ + "(define foo 42)"; + "(list 1 2 3)"; + "{:key \"value\"}"; + ";; comment\n(define bar 1)"; + ] in + Printf.printf "\nCST→AST equivalence:\n"; + List.iter (fun src -> + let ast_direct = Sx_parser.parse_all src in + let cst = Sx_parser.parse_all_cst src in + let ast_via_cst = List.map Sx_cst.cst_to_ast cst.nodes in + let s1 = String.concat " " (List.map Sx_types.inspect ast_direct) in + let s2 = String.concat " " (List.map Sx_types.inspect ast_via_cst) in + if s1 = s2 then begin + incr pass; + Printf.printf "PASS: %S\n" src + end else begin + incr fail; + Printf.printf "FAIL: %S\n AST: %s\n CST→AST: %s\n" src s1 s2 + end + ) ast_tests; + (* Test real .sx files from the codebase *) + Printf.printf "\nReal file round-trips:\n"; + let test_file path = + try + let src = In_channel.with_open_text path In_channel.input_all in + let cst = Sx_parser.parse_all_cst src in + let roundtrip = Sx_cst.cst_file_to_source cst.nodes cst.trailing_trivia in + if roundtrip = src then begin + incr pass; + Printf.printf "PASS: %s (%d bytes)\n" path (String.length src) + end else begin + incr fail; + (* Find first difference *) + let len = min (String.length src) (String.length roundtrip) in + let diff_pos = ref len in + for i = 0 to len - 1 do + if src.[i] <> roundtrip.[i] && !diff_pos = len then diff_pos := i + done; + Printf.printf "FAIL: %s (diff at byte %d, src=%d rt=%d)\n" path !diff_pos (String.length src) (String.length roundtrip) + end + with e -> + incr fail; + Printf.printf "ERROR: %s — %s\n" path (Printexc.to_string e) + in + let spec_dir = try Sys.getenv "SX_SPEC_DIR" with Not_found -> "spec" in + let project_dir = try Sys.getenv "SX_PROJECT_DIR" with Not_found -> "." in + List.iter test_file [ + spec_dir ^ "/evaluator.sx"; + spec_dir ^ "/parser.sx"; + spec_dir ^ "/primitives.sx"; + spec_dir ^ "/render.sx"; + project_dir ^ "/lib/tree-tools.sx"; + project_dir ^ "/web/engine.sx"; + project_dir ^ "/web/io.sx"; + ]; + + Printf.printf "\n%d/%d passed\n" !pass (!pass + !fail); + if !fail > 0 then exit 1 diff --git a/hosts/ocaml/lib/sx_cst.ml b/hosts/ocaml/lib/sx_cst.ml new file mode 100644 index 00000000..b73f4594 --- /dev/null +++ b/hosts/ocaml/lib/sx_cst.ml @@ -0,0 +1,146 @@ +(** Concrete Syntax Tree for SX — lossless source representation. + + Every piece of source text is preserved: whitespace, comments, + delimiters, raw token text. The CST supports two projections: + - [cst_to_source]: reconstruct the exact original source + - [cst_to_ast]: strip trivia, produce [Sx_types.value] for evaluation + + Trivia attaches to nodes (leading on every node, trailing on + containers before the close delimiter). No separate comment map. *) + +open Sx_types + +(** {1 Types} *) + +type trivia = + | Whitespace of string (** Runs of spaces, tabs, newlines *) + | LineComment of string (** ";;" through end of line, including the ";" chars *) + +type span = { + start_offset : int; + end_offset : int; +} + +type cst_node = + | CstAtom of { + leading_trivia : trivia list; + token : string; (** Raw source text of the token *) + value : value; (** Parsed semantic value *) + span : span; + } + | CstList of { + leading_trivia : trivia list; + open_delim : char; (** '(' or '[' *) + children : cst_node list; + close_delim : char; (** ')' or ']' *) + trailing_trivia : trivia list; (** Trivia between last child and close delim *) + span : span; + } + | CstDict of { + leading_trivia : trivia list; + children : cst_node list; (** Alternating key/value atoms *) + trailing_trivia : trivia list; + span : span; + } + + +(** {1 CST → Source (lossless reconstruction)} *) + +let trivia_to_string ts = + let buf = Buffer.create 64 in + List.iter (function + | Whitespace s -> Buffer.add_string buf s + | LineComment s -> Buffer.add_string buf s + ) ts; + Buffer.contents buf + +let rec cst_to_source node = + match node with + | CstAtom { leading_trivia; token; _ } -> + trivia_to_string leading_trivia ^ token + | CstList { leading_trivia; open_delim; children; close_delim; trailing_trivia; _ } -> + let buf = Buffer.create 256 in + Buffer.add_string buf (trivia_to_string leading_trivia); + Buffer.add_char buf open_delim; + List.iter (fun c -> Buffer.add_string buf (cst_to_source c)) children; + Buffer.add_string buf (trivia_to_string trailing_trivia); + Buffer.add_char buf close_delim; + Buffer.contents buf + | CstDict { leading_trivia; children; trailing_trivia; _ } -> + let buf = Buffer.create 256 in + Buffer.add_string buf (trivia_to_string leading_trivia); + Buffer.add_char buf '{'; + List.iter (fun c -> Buffer.add_string buf (cst_to_source c)) children; + Buffer.add_string buf (trivia_to_string trailing_trivia); + Buffer.add_char buf '}'; + Buffer.contents buf + +let cst_to_source_file nodes = + String.concat "" (List.map cst_to_source nodes) + +(** Reconstruct source from a parsed file (nodes + trailing trivia). *) +let cst_file_to_source nodes trailing = + cst_to_source_file nodes ^ trivia_to_string trailing + + +(** {1 CST → AST (strip trivia for evaluation)} *) + +let rec cst_to_ast = function + | CstAtom { value; _ } -> value + | CstList { children; _ } -> + List (List.map cst_to_ast children) + | CstDict { children; _ } -> + let d = make_dict () in + let rec pairs = function + | k :: v :: rest -> + let key_str = match cst_to_ast k with + | Keyword k -> k | String k -> k | Symbol k -> k | _ -> "" + in + dict_set d key_str (cst_to_ast v); + pairs rest + | _ -> () + in + pairs children; + Dict d + + +(** {1 CST editing — apply AST-level edits back to the CST} *) + +(** Replace the CST node at [path] with [new_source], preserving the + original node's leading trivia. [new_source] is parsed as CST so + any comments in it are preserved. *) +let apply_edit path new_cst_nodes original_cst_nodes = + let rec go nodes idx_path = + match idx_path with + | [] -> nodes (* shouldn't happen *) + | [target] -> + List.mapi (fun i node -> + if i = target then + match new_cst_nodes with + | [replacement] -> + (* Preserve original leading trivia *) + let orig_trivia = match node with + | CstAtom { leading_trivia; _ } -> leading_trivia + | CstList { leading_trivia; _ } -> leading_trivia + | CstDict { leading_trivia; _ } -> leading_trivia + in + (match replacement with + | CstAtom r -> CstAtom { r with leading_trivia = orig_trivia } + | CstList r -> CstList { r with leading_trivia = orig_trivia } + | CstDict r -> CstDict { r with leading_trivia = orig_trivia }) + | _ -> node (* multi-node replacement: use as-is *) + else node + ) nodes + | target :: rest -> + List.mapi (fun i node -> + if i = target then + match node with + | CstList r -> + CstList { r with children = go r.children rest } + | CstDict r -> + CstDict { r with children = go r.children rest } + | _ -> node + else node + ) nodes + in + go original_cst_nodes path diff --git a/hosts/ocaml/lib/sx_parser.ml b/hosts/ocaml/lib/sx_parser.ml index 9be3f71d..f508dda9 100644 --- a/hosts/ocaml/lib/sx_parser.ml +++ b/hosts/ocaml/lib/sx_parser.ml @@ -290,3 +290,232 @@ let parse_file ?(comments=false) path = let src = really_input_string ic n in close_in ic; parse_all ~comments src + + +(* ================================================================== *) +(* CST parser — lossless concrete syntax tree *) +(* ================================================================== *) + +open Sx_cst + +(** Collect leading trivia (whitespace + comments) from current position. *) +let collect_trivia s = + let items = ref [] in + let rec go () = + if at_end s then () + else match s.src.[s.pos] with + | ' ' | '\t' | '\n' | '\r' -> + let start = s.pos in + while s.pos < s.len && (let c = s.src.[s.pos] in c = ' ' || c = '\t' || c = '\n' || c = '\r') do + advance s + done; + items := Whitespace (String.sub s.src start (s.pos - start)) :: !items; + go () + | ';' -> + let start = s.pos in + while s.pos < s.len && s.src.[s.pos] <> '\n' do advance s done; + let text = String.sub s.src start (s.pos - start) in + if s.pos < s.len then advance s; + (* Include the newline in the comment trivia *) + let text = if s.pos > 0 && s.pos <= s.len && s.src.[s.pos - 1] = '\n' + then text ^ "\n" else text in + items := LineComment text :: !items; + go () + | _ -> () + in + go (); + List.rev !items + +(** Read a single CST value — dispatches on first non-trivia char. *) +let rec read_cst s : cst_node = + let trivia = collect_trivia s in + if at_end s then + raise (Parse_error "Unexpected end of input"); + let start = s.pos in + match s.src.[s.pos] with + | '(' -> read_cst_list s trivia start '(' ')' + | '[' -> read_cst_list s trivia start '[' ']' + | '{' -> read_cst_dict s trivia start + | '\'' -> + (* Quote sugar: 'x → (quote x) — emit as raw token *) + advance s; + let inner = read_cst s in + let end_pos = s.pos in + let token = String.sub s.src start (end_pos - start) in + let value = List [Symbol "quote"; cst_to_ast inner] in + CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } } + | '`' -> + advance s; + let inner = read_cst s in + let end_pos = s.pos in + let token = String.sub s.src start (end_pos - start) in + let value = List [Symbol "quasiquote"; cst_to_ast inner] in + CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } } + | ',' -> + advance s; + let splice = s.pos < s.len && s.src.[s.pos] = '@' in + if splice then advance s; + let inner = read_cst s in + let end_pos = s.pos in + let token = String.sub s.src start (end_pos - start) in + let sym = if splice then "splice-unquote" else "unquote" in + let value = List [Symbol sym; cst_to_ast inner] in + CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } } + | '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = ';' -> + (* Datum comment: #; discards next expression *) + advance s; advance s; + let _discarded = read_cst s in + (* Read the real value after the datum comment — attach trivia from #; *) + let next = read_cst s in + let combined_trivia = trivia @ (match next with + | CstAtom r -> r.leading_trivia + | CstList r -> r.leading_trivia + | CstDict r -> r.leading_trivia) in + (match next with + | CstAtom r -> CstAtom { r with leading_trivia = combined_trivia } + | CstList r -> CstList { r with leading_trivia = combined_trivia } + | CstDict r -> CstDict { r with leading_trivia = combined_trivia }) + | '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '\'' -> + advance s; advance s; + let inner = read_cst s in + let end_pos = s.pos in + let token = String.sub s.src start (end_pos - start) in + let value = List [Symbol "quote"; cst_to_ast inner] in + CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } } + | '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '|' -> + (* Raw string: #|...| *) + advance s; advance s; + let buf = Buffer.create 64 in + let rec go () = + if at_end s then raise (Parse_error "Unterminated raw string"); + let c = s.src.[s.pos] in + advance s; + if c = '|' then () + else begin Buffer.add_char buf c; go () end + in + go (); + let end_pos = s.pos in + let token = String.sub s.src start (end_pos - start) in + CstAtom { leading_trivia = trivia; token; value = String (Buffer.contents buf); + span = { start_offset = start; end_offset = end_pos } } + | '"' -> + let value = String (read_string s) in + let end_pos = s.pos in + let token = String.sub s.src start (end_pos - start) in + CstAtom { leading_trivia = trivia; token; value; + span = { start_offset = start; end_offset = end_pos } } + | _ -> + let sym = read_symbol s in + if sym = "" then begin + let line = ref 1 and col = ref 1 in + for i = 0 to s.pos - 1 do + if s.src.[i] = '\n' then (incr line; col := 1) else incr col + done; + raise (Parse_error (Printf.sprintf "Unexpected char: %c at line %d col %d (pos %d)" + s.src.[s.pos] !line !col s.pos)) + end; + let end_pos = s.pos in + let token = String.sub s.src start (end_pos - start) in + let value = match sym with + | "true" -> Bool true + | "false" -> Bool false + | "nil" -> Nil + | _ when sym.[0] = ':' -> Keyword (String.sub sym 1 (String.length sym - 1)) + | _ -> match try_number sym with Some n -> n | None -> Symbol sym + in + CstAtom { leading_trivia = trivia; token; value; + span = { start_offset = start; end_offset = end_pos } } + +and read_cst_list s trivia start open_c close_c = + advance s; (* skip open delim *) + let children = ref [] in + let rec go () = + let child_trivia = collect_trivia s in + if at_end s then raise (Parse_error "Unterminated list"); + if s.src.[s.pos] = close_c then begin + advance s; + let end_pos = s.pos in + CstList { leading_trivia = trivia; open_delim = open_c; + children = List.rev !children; close_delim = close_c; + trailing_trivia = child_trivia; + span = { start_offset = start; end_offset = end_pos } } + end else begin + (* Push collected trivia onto the next child *) + let child_start = s.pos in + let child = read_cst_inner s in + let child_with_trivia = match child with + | CstAtom r -> CstAtom { r with leading_trivia = child_trivia @ r.leading_trivia } + | CstList r -> CstList { r with leading_trivia = child_trivia @ r.leading_trivia } + | CstDict r -> CstDict { r with leading_trivia = child_trivia @ r.leading_trivia } + in + ignore child_start; + children := child_with_trivia :: !children; + go () + end + in + go () + +and read_cst_dict s trivia start = + advance s; (* skip { *) + let children = ref [] in + let rec go () = + let child_trivia = collect_trivia s in + if at_end s then raise (Parse_error "Unterminated dict"); + if s.src.[s.pos] = '}' then begin + advance s; + let end_pos = s.pos in + CstDict { leading_trivia = trivia; children = List.rev !children; + trailing_trivia = child_trivia; + span = { start_offset = start; end_offset = end_pos } } + end else begin + let child = read_cst_inner s in + let child_with_trivia = match child with + | CstAtom r -> CstAtom { r with leading_trivia = child_trivia @ r.leading_trivia } + | CstList r -> CstList { r with leading_trivia = child_trivia @ r.leading_trivia } + | CstDict r -> CstDict { r with leading_trivia = child_trivia @ r.leading_trivia } + in + children := child_with_trivia :: !children; + go () + end + in + go () + +(** Inner read — no trivia collection (caller handles it). *) +and read_cst_inner s : cst_node = + read_cst s + +(** Parse result: list of CST nodes + any trailing trivia after the last node. *) +type cst_file = { + nodes : cst_node list; + trailing_trivia : trivia list; +} + +(** Parse a string into a list of CST nodes. *) +let parse_all_cst src = + let s = make_state src in + let results = ref [] in + let rec go () = + let trivia = collect_trivia s in + if at_end s then + { nodes = List.rev !results; trailing_trivia = trivia } + else begin + let node = read_cst_inner s in + (* Prepend collected trivia to this node *) + let node_with_trivia = match node with + | CstAtom r -> CstAtom { r with leading_trivia = trivia @ r.leading_trivia } + | CstList r -> CstList { r with leading_trivia = trivia @ r.leading_trivia } + | CstDict r -> CstDict { r with leading_trivia = trivia @ r.leading_trivia } + in + results := node_with_trivia :: !results; + go () + end + in + go () + +(** Parse a file into a list of CST nodes. *) +let parse_file_cst path = + let ic = open_in path in + let n = in_channel_length ic in + let src = really_input_string ic n in + close_in ic; + parse_all_cst src