Files
rose-ash/hosts/ocaml/lib/sx_parser.ml
giles 38556af423 Interior comments, fragment comments, get_siblings + doc_gen comment support
Parser: read_value/read_list now capture Comment nodes inside lists
when ~comments:true. Module-level _preserve_comments ref threads the
flag through the recursive descent without changing signatures.

Pretty printer: has_interior_comments (recursive) forces multi-line
when any nested list contains comments. Comment nodes inside lists
emit as indented comment lines.

Edit tools: separate_comments strips interior comments recursively
via strip_interior_comments before passing to tree-tools (paths stay
correct). extract_fragment_comments parses new source with comments,
attaches leading comments to the target position in the comment map.

sx_get_siblings: injects comments for top-level siblings.

sx_doc_gen: parses with comments, tracks preceding Comment node,
includes cleaned comment text in generated component documentation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 17:00:56 +00:00

293 lines
8.8 KiB
OCaml

(** S-expression parser.
Recursive descent over a string, producing [Sx_types.value list].
Supports: lists, dicts, symbols, keywords, strings (with escapes),
numbers, booleans, nil, comments, quote/quasiquote/unquote sugar. *)
open Sx_types
type state = {
src : string;
len : int;
mutable pos : int;
}
let make_state src = { src; len = String.length src; pos = 0 }
let peek s = if s.pos < s.len then Some s.src.[s.pos] else None
let advance s = s.pos <- s.pos + 1
let at_end s = s.pos >= s.len
let skip_whitespace_and_comments s =
let rec go () =
if at_end s then ()
else match s.src.[s.pos] with
| ' ' | '\t' | '\n' | '\r' -> advance s; go ()
| ';' ->
while s.pos < s.len && s.src.[s.pos] <> '\n' do advance s done;
if s.pos < s.len then advance s;
go ()
| _ -> ()
in go ()
(* Skip whitespace only — leaves comments for capture *)
let skip_whitespace s =
let rec go () =
if at_end s then ()
else match s.src.[s.pos] with
| ' ' | '\t' | '\n' | '\r' -> advance s; go ()
| _ -> ()
in go ()
(* Read a comment line starting at ';', returns text after ";;" (or ";") *)
let read_comment s =
let start = s.pos in
while s.pos < s.len && s.src.[s.pos] <> '\n' do advance s done;
let text = String.sub s.src start (s.pos - start) in
if s.pos < s.len then advance s;
text
(* Character classification — matches spec/parser.sx ident-start/ident-char.
ident-start: a-z A-Z _ ~ * + - > < = / ! ? &
ident-char: ident-start plus 0-9 . : / # , *)
let is_ident_start = function
| 'a'..'z' | 'A'..'Z' | '_' | '~' | '*' | '+' | '-'
| '>' | '<' | '=' | '/' | '!' | '?' | '&' -> true
| _ -> false
let is_ident_char = function
| c when is_ident_start c -> true
| '0'..'9' | '.' | ':' | '#' | ',' -> true
| _ -> false
(* Symbol reading uses ident_char; first char must be ident_start or digit/colon *)
let is_symbol_char = is_ident_char
let read_string s =
(* s.pos is on the opening quote *)
advance s;
let buf = Buffer.create 64 in
let rec go () =
if at_end s then raise (Parse_error "Unterminated string");
let c = s.src.[s.pos] in
advance s;
if c = '"' then Buffer.contents buf
else if c = '\\' then begin
if at_end s then raise (Parse_error "Unterminated string escape");
let esc = s.src.[s.pos] in
advance s;
(match esc with
| 'n' -> Buffer.add_char buf '\n'
| 't' -> Buffer.add_char buf '\t'
| 'r' -> Buffer.add_char buf '\r'
| '"' -> Buffer.add_char buf '"'
| '\\' -> Buffer.add_char buf '\\'
| 'u' ->
(* \uXXXX — read 4 hex digits, encode as UTF-8 *)
if s.pos + 4 > s.len then raise (Parse_error "Incomplete \\u escape");
let hex = String.sub s.src s.pos 4 in
s.pos <- s.pos + 4;
let code = int_of_string ("0x" ^ hex) in
let ubuf = Buffer.create 4 in
Buffer.add_utf_8_uchar ubuf (Uchar.of_int code);
Buffer.add_string buf (Buffer.contents ubuf)
| '`' -> Buffer.add_char buf '`'
| _ -> Buffer.add_char buf '\\'; Buffer.add_char buf esc);
go ()
end else begin
Buffer.add_char buf c;
go ()
end
in go ()
let read_symbol s =
let start = s.pos in
while s.pos < s.len && is_symbol_char s.src.[s.pos] do advance s done;
String.sub s.src start (s.pos - start)
let try_number str =
match float_of_string_opt str with
| Some n -> Some (Number n)
| None -> None
(* Module-level flag — when true, comments are captured as Comment nodes.
Set by parse_all ~comments:true, reset after. *)
let _preserve_comments = ref false
(* Collect consecutive comment lines into a Comment node *)
let collect_comment_node s =
let lines = ref [] in
let rec go () =
skip_whitespace s;
if not (at_end s) && s.src.[s.pos] = ';' then begin
lines := read_comment s :: !lines;
go ()
end
in
go ();
Comment (String.concat "\n" (List.rev !lines))
let rec read_value s : value =
(* In comment-preserving mode, check for comments first *)
if !_preserve_comments then begin
skip_whitespace s;
if not (at_end s) && s.src.[s.pos] = ';' then
collect_comment_node s
else
read_value_core s
end else
read_value_core s
and read_value_core s : value =
skip_whitespace_and_comments s;
if at_end s then begin
let line = ref 1 in
String.iter (fun c -> if c = '\n' then incr line) s.src;
raise (Parse_error (Printf.sprintf "Unexpected end of input at line %d (pos %d)" !line s.pos))
end;
match s.src.[s.pos] with
| '(' -> read_list s ')'
| '[' -> read_list s ']'
| '{' -> read_dict s
| '"' -> String (read_string s)
| '\'' -> advance s; List [Symbol "quote"; read_value s]
| '`' -> advance s; List [Symbol "quasiquote"; read_value s]
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = ';' ->
(* Datum comment: #; discards next expression *)
advance s; advance s;
ignore (read_value s);
read_value s
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '\'' ->
(* Quote shorthand: #'expr -> (quote expr) *)
advance s; advance s;
List [Symbol "quote"; read_value s]
| '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '|' ->
(* Raw string: #|...| — ends at next | *)
advance s; advance s;
let buf = Buffer.create 64 in
let rec go () =
if at_end s then raise (Parse_error "Unterminated raw string");
let c = s.src.[s.pos] in
advance s;
if c = '|' then
String (Buffer.contents buf)
else begin
Buffer.add_char buf c;
go ()
end
in go ()
| ',' ->
(* Unquote / splice-unquote — matches spec: , always triggers unquote *)
advance s;
if s.pos < s.len && s.src.[s.pos] = '@' then begin
advance s;
List [Symbol "splice-unquote"; read_value s]
end else
List [Symbol "unquote"; read_value s]
| _ ->
begin
(* Symbol, keyword, number, or boolean *)
let token = read_symbol s in
if token = "" then begin
let line = ref 1 and col = ref 1 in
for i = 0 to s.pos - 1 do
if s.src.[i] = '\n' then (incr line; col := 1) else incr col
done;
raise (Parse_error (Printf.sprintf "Unexpected char: %c at line %d col %d (pos %d)"
s.src.[s.pos] !line !col s.pos))
end;
match token with
| "true" -> Bool true
| "false" -> Bool false
| "nil" -> Nil
| _ when token.[0] = ':' ->
Keyword (String.sub token 1 (String.length token - 1))
| _ ->
match try_number token with
| Some n -> n
| None -> Symbol token
end
and read_list s close_char =
advance s; (* skip opening paren/bracket *)
let items = ref [] in
let rec go () =
if !_preserve_comments then begin
skip_whitespace s;
if at_end s then raise (Parse_error "Unterminated list");
if s.src.[s.pos] = close_char then begin
advance s;
List (List.rev !items)
end else if s.src.[s.pos] = ';' then begin
items := collect_comment_node s :: !items;
go ()
end else begin
items := read_value_core s :: !items;
go ()
end
end else begin
skip_whitespace_and_comments s;
if at_end s then raise (Parse_error "Unterminated list");
if s.src.[s.pos] = close_char then begin
advance s;
List (List.rev !items)
end else begin
items := read_value s :: !items;
go ()
end
end
in go ()
and read_dict s =
advance s; (* skip { *)
let d = make_dict () in
let rec go () =
skip_whitespace_and_comments s;
if at_end s then raise (Parse_error "Unterminated dict");
if s.src.[s.pos] = '}' then begin
advance s;
Dict d
end else begin
let key = read_value s in
let key_str = match key with
| Keyword k -> k
| String k -> k
| Symbol k -> k
| _ -> raise (Parse_error "Dict key must be keyword, string, or symbol")
in
let v = read_value s in
dict_set d key_str v;
go ()
end
in go ()
(** Parse a string into a list of SX values.
When [~comments:true], comments are preserved as [Comment] nodes —
both at top level and inside lists. Default is [false] (strip). *)
let parse_all ?(comments=false) src =
_preserve_comments := comments;
let s = make_state src in
let results = ref [] in
let rec go () =
if !_preserve_comments then skip_whitespace s
else skip_whitespace_and_comments s;
if at_end s then (
_preserve_comments := false;
List.rev !results
) else begin
results := read_value s :: !results;
go ()
end
in
try go ()
with e -> _preserve_comments := false; raise e
(** Parse a file into a list of SX values. *)
let parse_file ?(comments=false) path =
let ic = open_in path in
let n = in_channel_length ic in
let src = really_input_string ic n in
close_in ic;
parse_all ~comments src