Delete from sx_types.ml: - Comment of string variant (no longer needed) Delete from sx_parser.ml: - _preserve_comments mutable ref - collect_comment_node function - comment-mode branches in read_value, read_list - ~comments parameter from parse_all and parse_file - skip_whitespace and read_comment (only used by old comment mode) Delete from mcp_tree.ml: - has_interior_comments function - Comment handling in pretty_print_value - pretty_print_file function (replaced by CST write-back) - ~comments parameter from local parse_file Migrate sx_pretty_print, sx_write_file, sx_doc_gen to CST path. Net: -69 lines. 24/24 CST round-trips, 2583/2583 evaluator tests. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
454 lines
15 KiB
OCaml
454 lines
15 KiB
OCaml
(** S-expression parser.
|
|
|
|
Recursive descent over a string, producing [Sx_types.value list].
|
|
Supports: lists, dicts, symbols, keywords, strings (with escapes),
|
|
numbers, booleans, nil, comments, quote/quasiquote/unquote sugar. *)
|
|
|
|
open Sx_types
|
|
|
|
type state = {
|
|
src : string;
|
|
len : int;
|
|
mutable pos : int;
|
|
}
|
|
|
|
let make_state src = { src; len = String.length src; pos = 0 }
|
|
|
|
let peek s = if s.pos < s.len then Some s.src.[s.pos] else None
|
|
let advance s = s.pos <- s.pos + 1
|
|
let at_end s = s.pos >= s.len
|
|
|
|
let skip_whitespace_and_comments s =
|
|
let rec go () =
|
|
if at_end s then ()
|
|
else match s.src.[s.pos] with
|
|
| ' ' | '\t' | '\n' | '\r' -> advance s; go ()
|
|
| ';' ->
|
|
while s.pos < s.len && s.src.[s.pos] <> '\n' do advance s done;
|
|
if s.pos < s.len then advance s;
|
|
go ()
|
|
| _ -> ()
|
|
in go ()
|
|
|
|
(* Character classification — matches spec/parser.sx ident-start/ident-char.
|
|
ident-start: a-z A-Z _ ~ * + - > < = / ! ? &
|
|
ident-char: ident-start plus 0-9 . : / # , *)
|
|
let is_ident_start = function
|
|
| 'a'..'z' | 'A'..'Z' | '_' | '~' | '*' | '+' | '-'
|
|
| '>' | '<' | '=' | '/' | '!' | '?' | '&' -> true
|
|
| _ -> false
|
|
|
|
let is_ident_char = function
|
|
| c when is_ident_start c -> true
|
|
| '0'..'9' | '.' | ':' | '#' | ',' -> true
|
|
| _ -> false
|
|
|
|
(* Symbol reading uses ident_char; first char must be ident_start or digit/colon *)
|
|
let is_symbol_char = is_ident_char
|
|
|
|
let read_string s =
|
|
(* s.pos is on the opening quote *)
|
|
advance s;
|
|
let buf = Buffer.create 64 in
|
|
let rec go () =
|
|
if at_end s then raise (Parse_error "Unterminated string");
|
|
let c = s.src.[s.pos] in
|
|
advance s;
|
|
if c = '"' then Buffer.contents buf
|
|
else if c = '\\' then begin
|
|
if at_end s then raise (Parse_error "Unterminated string escape");
|
|
let esc = s.src.[s.pos] in
|
|
advance s;
|
|
(match esc with
|
|
| 'n' -> Buffer.add_char buf '\n'
|
|
| 't' -> Buffer.add_char buf '\t'
|
|
| 'r' -> Buffer.add_char buf '\r'
|
|
| '"' -> Buffer.add_char buf '"'
|
|
| '\\' -> Buffer.add_char buf '\\'
|
|
| 'u' ->
|
|
(* \uXXXX — read 4 hex digits, encode as UTF-8 *)
|
|
if s.pos + 4 > s.len then raise (Parse_error "Incomplete \\u escape");
|
|
let hex = String.sub s.src s.pos 4 in
|
|
s.pos <- s.pos + 4;
|
|
let code = int_of_string ("0x" ^ hex) in
|
|
let ubuf = Buffer.create 4 in
|
|
Buffer.add_utf_8_uchar ubuf (Uchar.of_int code);
|
|
Buffer.add_string buf (Buffer.contents ubuf)
|
|
| '`' -> Buffer.add_char buf '`'
|
|
| _ -> Buffer.add_char buf '\\'; Buffer.add_char buf esc);
|
|
go ()
|
|
end else begin
|
|
Buffer.add_char buf c;
|
|
go ()
|
|
end
|
|
in go ()
|
|
|
|
let read_symbol s =
|
|
let start = s.pos in
|
|
while s.pos < s.len && is_symbol_char s.src.[s.pos] do advance s done;
|
|
String.sub s.src start (s.pos - start)
|
|
|
|
let try_number str =
|
|
match float_of_string_opt str with
|
|
| Some n -> Some (Number n)
|
|
| None -> None
|
|
|
|
let rec read_value s : value =
|
|
skip_whitespace_and_comments s;
|
|
if at_end s then begin
|
|
let line = ref 1 in
|
|
String.iter (fun c -> if c = '\n' then incr line) s.src;
|
|
raise (Parse_error (Printf.sprintf "Unexpected end of input at line %d (pos %d)" !line s.pos))
|
|
end;
|
|
match s.src.[s.pos] with
|
|
| '(' -> read_list s ')'
|
|
| '[' -> read_list s ']'
|
|
| '{' -> read_dict s
|
|
| '"' -> String (read_string s)
|
|
| '\'' -> advance s; List [Symbol "quote"; read_value s]
|
|
| '`' -> advance s; List [Symbol "quasiquote"; read_value s]
|
|
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = ';' ->
|
|
(* Datum comment: #; discards next expression *)
|
|
advance s; advance s;
|
|
ignore (read_value s);
|
|
read_value s
|
|
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '\'' ->
|
|
(* Quote shorthand: #'expr -> (quote expr) *)
|
|
advance s; advance s;
|
|
List [Symbol "quote"; read_value s]
|
|
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '|' ->
|
|
(* Raw string: #|...| — ends at next | *)
|
|
advance s; advance s;
|
|
let buf = Buffer.create 64 in
|
|
let rec go () =
|
|
if at_end s then raise (Parse_error "Unterminated raw string");
|
|
let c = s.src.[s.pos] in
|
|
advance s;
|
|
if c = '|' then
|
|
String (Buffer.contents buf)
|
|
else begin
|
|
Buffer.add_char buf c;
|
|
go ()
|
|
end
|
|
in go ()
|
|
| ',' ->
|
|
(* Unquote / splice-unquote — matches spec: , always triggers unquote *)
|
|
advance s;
|
|
if s.pos < s.len && s.src.[s.pos] = '@' then begin
|
|
advance s;
|
|
List [Symbol "splice-unquote"; read_value s]
|
|
end else
|
|
List [Symbol "unquote"; read_value s]
|
|
| _ ->
|
|
begin
|
|
(* Symbol, keyword, number, or boolean *)
|
|
let token = read_symbol s in
|
|
if token = "" then begin
|
|
let line = ref 1 and col = ref 1 in
|
|
for i = 0 to s.pos - 1 do
|
|
if s.src.[i] = '\n' then (incr line; col := 1) else incr col
|
|
done;
|
|
raise (Parse_error (Printf.sprintf "Unexpected char: %c at line %d col %d (pos %d)"
|
|
s.src.[s.pos] !line !col s.pos))
|
|
end;
|
|
match token with
|
|
| "true" -> Bool true
|
|
| "false" -> Bool false
|
|
| "nil" -> Nil
|
|
| _ when token.[0] = ':' ->
|
|
Keyword (String.sub token 1 (String.length token - 1))
|
|
| _ ->
|
|
match try_number token with
|
|
| Some n -> n
|
|
| None -> Symbol token
|
|
end
|
|
|
|
and read_list s close_char =
|
|
advance s; (* skip opening paren/bracket *)
|
|
let items = ref [] in
|
|
let rec go () =
|
|
skip_whitespace_and_comments s;
|
|
if at_end s then raise (Parse_error "Unterminated list");
|
|
if s.src.[s.pos] = close_char then begin
|
|
advance s;
|
|
List (List.rev !items)
|
|
end else begin
|
|
items := read_value s :: !items;
|
|
go ()
|
|
end
|
|
in go ()
|
|
|
|
and read_dict s =
|
|
advance s; (* skip { *)
|
|
let d = make_dict () in
|
|
let rec go () =
|
|
skip_whitespace_and_comments s;
|
|
if at_end s then raise (Parse_error "Unterminated dict");
|
|
if s.src.[s.pos] = '}' then begin
|
|
advance s;
|
|
Dict d
|
|
end else begin
|
|
let key = read_value s in
|
|
let key_str = match key with
|
|
| Keyword k -> k
|
|
| String k -> k
|
|
| Symbol k -> k
|
|
| _ -> raise (Parse_error "Dict key must be keyword, string, or symbol")
|
|
in
|
|
let v = read_value s in
|
|
dict_set d key_str v;
|
|
go ()
|
|
end
|
|
in go ()
|
|
|
|
|
|
(** Parse a string into a list of SX values (AST — comments stripped). *)
|
|
let parse_all src =
|
|
let s = make_state src in
|
|
let results = ref [] in
|
|
let rec go () =
|
|
skip_whitespace_and_comments s;
|
|
if at_end s then List.rev !results
|
|
else begin
|
|
results := read_value s :: !results;
|
|
go ()
|
|
end
|
|
in go ()
|
|
|
|
(** Parse a file into a list of SX values (AST — comments stripped). *)
|
|
let parse_file path =
|
|
let ic = open_in path in
|
|
let n = in_channel_length ic in
|
|
let src = really_input_string ic n in
|
|
close_in ic;
|
|
parse_all src
|
|
|
|
|
|
(* ================================================================== *)
|
|
(* CST parser — lossless concrete syntax tree *)
|
|
(* ================================================================== *)
|
|
|
|
open Sx_cst
|
|
|
|
(** Collect leading trivia (whitespace + comments) from current position. *)
|
|
let collect_trivia s =
|
|
let items = ref [] in
|
|
let rec go () =
|
|
if at_end s then ()
|
|
else match s.src.[s.pos] with
|
|
| ' ' | '\t' | '\n' | '\r' ->
|
|
let start = s.pos in
|
|
while s.pos < s.len && (let c = s.src.[s.pos] in c = ' ' || c = '\t' || c = '\n' || c = '\r') do
|
|
advance s
|
|
done;
|
|
items := Whitespace (String.sub s.src start (s.pos - start)) :: !items;
|
|
go ()
|
|
| ';' ->
|
|
let start = s.pos in
|
|
while s.pos < s.len && s.src.[s.pos] <> '\n' do advance s done;
|
|
let text = String.sub s.src start (s.pos - start) in
|
|
if s.pos < s.len then advance s;
|
|
(* Include the newline in the comment trivia *)
|
|
let text = if s.pos > 0 && s.pos <= s.len && s.src.[s.pos - 1] = '\n'
|
|
then text ^ "\n" else text in
|
|
items := LineComment text :: !items;
|
|
go ()
|
|
| _ -> ()
|
|
in
|
|
go ();
|
|
List.rev !items
|
|
|
|
(** Read a single CST value — dispatches on first non-trivia char. *)
|
|
let rec read_cst s : cst_node =
|
|
let trivia = collect_trivia s in
|
|
if at_end s then
|
|
raise (Parse_error "Unexpected end of input");
|
|
let start = s.pos in
|
|
match s.src.[s.pos] with
|
|
| '(' -> read_cst_list s trivia start '(' ')'
|
|
| '[' -> read_cst_list s trivia start '[' ']'
|
|
| '{' -> read_cst_dict s trivia start
|
|
| '\'' ->
|
|
(* Quote sugar: 'x → (quote x) — emit as raw token *)
|
|
advance s;
|
|
let inner = read_cst s in
|
|
let end_pos = s.pos in
|
|
let token = String.sub s.src start (end_pos - start) in
|
|
let value = List [Symbol "quote"; cst_to_ast inner] in
|
|
CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } }
|
|
| '`' ->
|
|
advance s;
|
|
let inner = read_cst s in
|
|
let end_pos = s.pos in
|
|
let token = String.sub s.src start (end_pos - start) in
|
|
let value = List [Symbol "quasiquote"; cst_to_ast inner] in
|
|
CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } }
|
|
| ',' ->
|
|
advance s;
|
|
let splice = s.pos < s.len && s.src.[s.pos] = '@' in
|
|
if splice then advance s;
|
|
let inner = read_cst s in
|
|
let end_pos = s.pos in
|
|
let token = String.sub s.src start (end_pos - start) in
|
|
let sym = if splice then "splice-unquote" else "unquote" in
|
|
let value = List [Symbol sym; cst_to_ast inner] in
|
|
CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } }
|
|
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = ';' ->
|
|
(* Datum comment: #; discards next expression *)
|
|
advance s; advance s;
|
|
let _discarded = read_cst s in
|
|
(* Read the real value after the datum comment — attach trivia from #; *)
|
|
let next = read_cst s in
|
|
let combined_trivia = trivia @ (match next with
|
|
| CstAtom r -> r.leading_trivia
|
|
| CstList r -> r.leading_trivia
|
|
| CstDict r -> r.leading_trivia) in
|
|
(match next with
|
|
| CstAtom r -> CstAtom { r with leading_trivia = combined_trivia }
|
|
| CstList r -> CstList { r with leading_trivia = combined_trivia }
|
|
| CstDict r -> CstDict { r with leading_trivia = combined_trivia })
|
|
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '\'' ->
|
|
advance s; advance s;
|
|
let inner = read_cst s in
|
|
let end_pos = s.pos in
|
|
let token = String.sub s.src start (end_pos - start) in
|
|
let value = List [Symbol "quote"; cst_to_ast inner] in
|
|
CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } }
|
|
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '|' ->
|
|
(* Raw string: #|...| *)
|
|
advance s; advance s;
|
|
let buf = Buffer.create 64 in
|
|
let rec go () =
|
|
if at_end s then raise (Parse_error "Unterminated raw string");
|
|
let c = s.src.[s.pos] in
|
|
advance s;
|
|
if c = '|' then ()
|
|
else begin Buffer.add_char buf c; go () end
|
|
in
|
|
go ();
|
|
let end_pos = s.pos in
|
|
let token = String.sub s.src start (end_pos - start) in
|
|
CstAtom { leading_trivia = trivia; token; value = String (Buffer.contents buf);
|
|
span = { start_offset = start; end_offset = end_pos } }
|
|
| '"' ->
|
|
let value = String (read_string s) in
|
|
let end_pos = s.pos in
|
|
let token = String.sub s.src start (end_pos - start) in
|
|
CstAtom { leading_trivia = trivia; token; value;
|
|
span = { start_offset = start; end_offset = end_pos } }
|
|
| _ ->
|
|
let sym = read_symbol s in
|
|
if sym = "" then begin
|
|
let line = ref 1 and col = ref 1 in
|
|
for i = 0 to s.pos - 1 do
|
|
if s.src.[i] = '\n' then (incr line; col := 1) else incr col
|
|
done;
|
|
raise (Parse_error (Printf.sprintf "Unexpected char: %c at line %d col %d (pos %d)"
|
|
s.src.[s.pos] !line !col s.pos))
|
|
end;
|
|
let end_pos = s.pos in
|
|
let token = String.sub s.src start (end_pos - start) in
|
|
let value = match sym with
|
|
| "true" -> Bool true
|
|
| "false" -> Bool false
|
|
| "nil" -> Nil
|
|
| _ when sym.[0] = ':' -> Keyword (String.sub sym 1 (String.length sym - 1))
|
|
| _ -> match try_number sym with Some n -> n | None -> Symbol sym
|
|
in
|
|
CstAtom { leading_trivia = trivia; token; value;
|
|
span = { start_offset = start; end_offset = end_pos } }
|
|
|
|
and read_cst_list s trivia start open_c close_c =
|
|
advance s; (* skip open delim *)
|
|
let children = ref [] in
|
|
let rec go () =
|
|
let child_trivia = collect_trivia s in
|
|
if at_end s then raise (Parse_error "Unterminated list");
|
|
if s.src.[s.pos] = close_c then begin
|
|
advance s;
|
|
let end_pos = s.pos in
|
|
CstList { leading_trivia = trivia; open_delim = open_c;
|
|
children = List.rev !children; close_delim = close_c;
|
|
trailing_trivia = child_trivia;
|
|
span = { start_offset = start; end_offset = end_pos } }
|
|
end else begin
|
|
(* Push collected trivia onto the next child *)
|
|
let child_start = s.pos in
|
|
let child = read_cst_inner s in
|
|
let child_with_trivia = match child with
|
|
| CstAtom r -> CstAtom { r with leading_trivia = child_trivia @ r.leading_trivia }
|
|
| CstList r -> CstList { r with leading_trivia = child_trivia @ r.leading_trivia }
|
|
| CstDict r -> CstDict { r with leading_trivia = child_trivia @ r.leading_trivia }
|
|
in
|
|
ignore child_start;
|
|
children := child_with_trivia :: !children;
|
|
go ()
|
|
end
|
|
in
|
|
go ()
|
|
|
|
and read_cst_dict s trivia start =
|
|
advance s; (* skip { *)
|
|
let children = ref [] in
|
|
let rec go () =
|
|
let child_trivia = collect_trivia s in
|
|
if at_end s then raise (Parse_error "Unterminated dict");
|
|
if s.src.[s.pos] = '}' then begin
|
|
advance s;
|
|
let end_pos = s.pos in
|
|
CstDict { leading_trivia = trivia; children = List.rev !children;
|
|
trailing_trivia = child_trivia;
|
|
span = { start_offset = start; end_offset = end_pos } }
|
|
end else begin
|
|
let child = read_cst_inner s in
|
|
let child_with_trivia = match child with
|
|
| CstAtom r -> CstAtom { r with leading_trivia = child_trivia @ r.leading_trivia }
|
|
| CstList r -> CstList { r with leading_trivia = child_trivia @ r.leading_trivia }
|
|
| CstDict r -> CstDict { r with leading_trivia = child_trivia @ r.leading_trivia }
|
|
in
|
|
children := child_with_trivia :: !children;
|
|
go ()
|
|
end
|
|
in
|
|
go ()
|
|
|
|
(** Inner read — no trivia collection (caller handles it). *)
|
|
and read_cst_inner s : cst_node =
|
|
read_cst s
|
|
|
|
(** Parse result: list of CST nodes + any trailing trivia after the last node. *)
|
|
type cst_file = {
|
|
nodes : cst_node list;
|
|
trailing_trivia : trivia list;
|
|
}
|
|
|
|
(** Parse a string into a list of CST nodes. *)
|
|
let parse_all_cst src =
|
|
let s = make_state src in
|
|
let results = ref [] in
|
|
let rec go () =
|
|
let trivia = collect_trivia s in
|
|
if at_end s then
|
|
{ nodes = List.rev !results; trailing_trivia = trivia }
|
|
else begin
|
|
let node = read_cst_inner s in
|
|
(* Prepend collected trivia to this node *)
|
|
let node_with_trivia = match node with
|
|
| CstAtom r -> CstAtom { r with leading_trivia = trivia @ r.leading_trivia }
|
|
| CstList r -> CstList { r with leading_trivia = trivia @ r.leading_trivia }
|
|
| CstDict r -> CstDict { r with leading_trivia = trivia @ r.leading_trivia }
|
|
in
|
|
results := node_with_trivia :: !results;
|
|
go ()
|
|
end
|
|
in
|
|
go ()
|
|
|
|
(** Parse a file into a list of CST nodes. *)
|
|
let parse_file_cst path =
|
|
let ic = open_in path in
|
|
let n = in_channel_length ic in
|
|
let src = really_input_string ic n in
|
|
close_in ic;
|
|
parse_all_cst src
|