(** S-expression parser. Recursive descent over a string, producing [Sx_types.value list]. Supports: lists, dicts, symbols, keywords, strings (with escapes), numbers, booleans, nil, comments, quote/quasiquote/unquote sugar. *) open Sx_types type state = { src : string; len : int; mutable pos : int; } let make_state src = { src; len = String.length src; pos = 0 } let peek s = if s.pos < s.len then Some s.src.[s.pos] else None let advance s = s.pos <- s.pos + 1 let at_end s = s.pos >= s.len let skip_whitespace_and_comments s = let rec go () = if at_end s then () else match s.src.[s.pos] with | ' ' | '\t' | '\n' | '\r' -> advance s; go () | ';' -> while s.pos < s.len && s.src.[s.pos] <> '\n' do advance s done; if s.pos < s.len then advance s; go () | _ -> () in go () (* Character classification — matches spec/parser.sx ident-start/ident-char. ident-start: a-z A-Z _ ~ * + - > < = / ! ? & ident-char: ident-start plus 0-9 . : / # , *) let is_ident_start = function | 'a'..'z' | 'A'..'Z' | '_' | '~' | '*' | '+' | '-' | '>' | '<' | '=' | '/' | '!' | '?' | '&' -> true | _ -> false let is_ident_char = function | c when is_ident_start c -> true | '0'..'9' | '.' | ':' | '#' | ',' -> true | _ -> false (* Symbol reading uses ident_char; first char must be ident_start or digit/colon *) let is_symbol_char = is_ident_char let read_string s = (* s.pos is on the opening quote *) advance s; let buf = Buffer.create 64 in let rec go () = if at_end s then raise (Parse_error "Unterminated string"); let c = s.src.[s.pos] in advance s; if c = '"' then Buffer.contents buf else if c = '\\' then begin if at_end s then raise (Parse_error "Unterminated string escape"); let esc = s.src.[s.pos] in advance s; (match esc with | 'n' -> Buffer.add_char buf '\n' | 't' -> Buffer.add_char buf '\t' | 'r' -> Buffer.add_char buf '\r' | '"' -> Buffer.add_char buf '"' | '\\' -> Buffer.add_char buf '\\' | 'u' -> (* \uXXXX — read 4 hex digits, encode as UTF-8 *) if s.pos + 4 > s.len then raise (Parse_error "Incomplete \\u escape"); let hex = String.sub s.src s.pos 4 in s.pos <- s.pos + 4; let code = int_of_string ("0x" ^ hex) in let ubuf = Buffer.create 4 in Buffer.add_utf_8_uchar ubuf (Uchar.of_int code); Buffer.add_string buf (Buffer.contents ubuf) | '`' -> Buffer.add_char buf '`' | _ -> Buffer.add_char buf '\\'; Buffer.add_char buf esc); go () end else begin Buffer.add_char buf c; go () end in go () let read_symbol s = let start = s.pos in while s.pos < s.len && is_symbol_char s.src.[s.pos] do advance s done; String.sub s.src start (s.pos - start) let try_number str = match float_of_string_opt str with | Some n -> Some (Number n) | None -> None let rec read_value s : value = skip_whitespace_and_comments s; if at_end s then begin let line = ref 1 in String.iter (fun c -> if c = '\n' then incr line) s.src; raise (Parse_error (Printf.sprintf "Unexpected end of input at line %d (pos %d)" !line s.pos)) end; match s.src.[s.pos] with | '(' -> read_list s ')' | '[' -> read_list s ']' | '{' -> read_dict s | '"' -> String (read_string s) | '\'' -> advance s; List [Symbol "quote"; read_value s] | '`' -> advance s; List [Symbol "quasiquote"; read_value s] | '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = ';' -> (* Datum comment: #; discards next expression *) advance s; advance s; ignore (read_value s); read_value s | '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '\'' -> (* Quote shorthand: #'expr -> (quote expr) *) advance s; advance s; List [Symbol "quote"; read_value s] | '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '|' -> (* Raw string: #|...| — ends at next | *) advance s; advance s; let buf = Buffer.create 64 in let rec go () = if at_end s then raise (Parse_error "Unterminated raw string"); let c = s.src.[s.pos] in advance s; if c = '|' then String (Buffer.contents buf) else begin Buffer.add_char buf c; go () end in go () | ',' -> (* Unquote / splice-unquote — matches spec: , always triggers unquote *) advance s; if s.pos < s.len && s.src.[s.pos] = '@' then begin advance s; List [Symbol "splice-unquote"; read_value s] end else List [Symbol "unquote"; read_value s] | _ -> begin (* Symbol, keyword, number, or boolean *) let token = read_symbol s in if token = "" then begin let line = ref 1 and col = ref 1 in for i = 0 to s.pos - 1 do if s.src.[i] = '\n' then (incr line; col := 1) else incr col done; raise (Parse_error (Printf.sprintf "Unexpected char: %c at line %d col %d (pos %d)" s.src.[s.pos] !line !col s.pos)) end; match token with | "true" -> Bool true | "false" -> Bool false | "nil" -> Nil | _ when token.[0] = ':' -> Keyword (String.sub token 1 (String.length token - 1)) | _ -> match try_number token with | Some n -> n | None -> Symbol token end and read_list s close_char = advance s; (* skip opening paren/bracket *) let items = ref [] in let rec go () = skip_whitespace_and_comments s; if at_end s then raise (Parse_error "Unterminated list"); if s.src.[s.pos] = close_char then begin advance s; List (List.rev !items) end else begin items := read_value s :: !items; go () end in go () and read_dict s = advance s; (* skip { *) let d = make_dict () in let rec go () = skip_whitespace_and_comments s; if at_end s then raise (Parse_error "Unterminated dict"); if s.src.[s.pos] = '}' then begin advance s; Dict d end else begin let key = read_value s in let key_str = match key with | Keyword k -> k | String k -> k | Symbol k -> k | _ -> raise (Parse_error "Dict key must be keyword, string, or symbol") in let v = read_value s in dict_set d key_str v; go () end in go () (** Parse a string into a list of SX values (AST — comments stripped). *) let parse_all src = let s = make_state src in let results = ref [] in let rec go () = skip_whitespace_and_comments s; if at_end s then List.rev !results else begin results := read_value s :: !results; go () end in go () (** Parse a file into a list of SX values (AST — comments stripped). *) let parse_file path = let ic = open_in path in let n = in_channel_length ic in let src = really_input_string ic n in close_in ic; parse_all src (* ================================================================== *) (* CST parser — lossless concrete syntax tree *) (* ================================================================== *) open Sx_cst (** Collect leading trivia (whitespace + comments) from current position. *) let collect_trivia s = let items = ref [] in let rec go () = if at_end s then () else match s.src.[s.pos] with | ' ' | '\t' | '\n' | '\r' -> let start = s.pos in while s.pos < s.len && (let c = s.src.[s.pos] in c = ' ' || c = '\t' || c = '\n' || c = '\r') do advance s done; items := Whitespace (String.sub s.src start (s.pos - start)) :: !items; go () | ';' -> let start = s.pos in while s.pos < s.len && s.src.[s.pos] <> '\n' do advance s done; let text = String.sub s.src start (s.pos - start) in if s.pos < s.len then advance s; (* Include the newline in the comment trivia *) let text = if s.pos > 0 && s.pos <= s.len && s.src.[s.pos - 1] = '\n' then text ^ "\n" else text in items := LineComment text :: !items; go () | _ -> () in go (); List.rev !items (** Read a single CST value — dispatches on first non-trivia char. *) let rec read_cst s : cst_node = let trivia = collect_trivia s in if at_end s then raise (Parse_error "Unexpected end of input"); let start = s.pos in match s.src.[s.pos] with | '(' -> read_cst_list s trivia start '(' ')' | '[' -> read_cst_list s trivia start '[' ']' | '{' -> read_cst_dict s trivia start | '\'' -> (* Quote sugar: 'x → (quote x) — emit as raw token *) advance s; let inner = read_cst s in let end_pos = s.pos in let token = String.sub s.src start (end_pos - start) in let value = List [Symbol "quote"; cst_to_ast inner] in CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } } | '`' -> advance s; let inner = read_cst s in let end_pos = s.pos in let token = String.sub s.src start (end_pos - start) in let value = List [Symbol "quasiquote"; cst_to_ast inner] in CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } } | ',' -> advance s; let splice = s.pos < s.len && s.src.[s.pos] = '@' in if splice then advance s; let inner = read_cst s in let end_pos = s.pos in let token = String.sub s.src start (end_pos - start) in let sym = if splice then "splice-unquote" else "unquote" in let value = List [Symbol sym; cst_to_ast inner] in CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } } | '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = ';' -> (* Datum comment: #; discards next expression *) advance s; advance s; let _discarded = read_cst s in (* Read the real value after the datum comment — attach trivia from #; *) let next = read_cst s in let combined_trivia = trivia @ (match next with | CstAtom r -> r.leading_trivia | CstList r -> r.leading_trivia | CstDict r -> r.leading_trivia) in (match next with | CstAtom r -> CstAtom { r with leading_trivia = combined_trivia } | CstList r -> CstList { r with leading_trivia = combined_trivia } | CstDict r -> CstDict { r with leading_trivia = combined_trivia }) | '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '\'' -> advance s; advance s; let inner = read_cst s in let end_pos = s.pos in let token = String.sub s.src start (end_pos - start) in let value = List [Symbol "quote"; cst_to_ast inner] in CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } } | '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '|' -> (* Raw string: #|...| *) advance s; advance s; let buf = Buffer.create 64 in let rec go () = if at_end s then raise (Parse_error "Unterminated raw string"); let c = s.src.[s.pos] in advance s; if c = '|' then () else begin Buffer.add_char buf c; go () end in go (); let end_pos = s.pos in let token = String.sub s.src start (end_pos - start) in CstAtom { leading_trivia = trivia; token; value = String (Buffer.contents buf); span = { start_offset = start; end_offset = end_pos } } | '"' -> let value = String (read_string s) in let end_pos = s.pos in let token = String.sub s.src start (end_pos - start) in CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } } | _ -> let sym = read_symbol s in if sym = "" then begin let line = ref 1 and col = ref 1 in for i = 0 to s.pos - 1 do if s.src.[i] = '\n' then (incr line; col := 1) else incr col done; raise (Parse_error (Printf.sprintf "Unexpected char: %c at line %d col %d (pos %d)" s.src.[s.pos] !line !col s.pos)) end; let end_pos = s.pos in let token = String.sub s.src start (end_pos - start) in let value = match sym with | "true" -> Bool true | "false" -> Bool false | "nil" -> Nil | _ when sym.[0] = ':' -> Keyword (String.sub sym 1 (String.length sym - 1)) | _ -> match try_number sym with Some n -> n | None -> Symbol sym in CstAtom { leading_trivia = trivia; token; value; span = { start_offset = start; end_offset = end_pos } } and read_cst_list s trivia start open_c close_c = advance s; (* skip open delim *) let children = ref [] in let rec go () = let child_trivia = collect_trivia s in if at_end s then raise (Parse_error "Unterminated list"); if s.src.[s.pos] = close_c then begin advance s; let end_pos = s.pos in CstList { leading_trivia = trivia; open_delim = open_c; children = List.rev !children; close_delim = close_c; trailing_trivia = child_trivia; span = { start_offset = start; end_offset = end_pos } } end else begin (* Push collected trivia onto the next child *) let child_start = s.pos in let child = read_cst_inner s in let child_with_trivia = match child with | CstAtom r -> CstAtom { r with leading_trivia = child_trivia @ r.leading_trivia } | CstList r -> CstList { r with leading_trivia = child_trivia @ r.leading_trivia } | CstDict r -> CstDict { r with leading_trivia = child_trivia @ r.leading_trivia } in ignore child_start; children := child_with_trivia :: !children; go () end in go () and read_cst_dict s trivia start = advance s; (* skip { *) let children = ref [] in let rec go () = let child_trivia = collect_trivia s in if at_end s then raise (Parse_error "Unterminated dict"); if s.src.[s.pos] = '}' then begin advance s; let end_pos = s.pos in CstDict { leading_trivia = trivia; children = List.rev !children; trailing_trivia = child_trivia; span = { start_offset = start; end_offset = end_pos } } end else begin let child = read_cst_inner s in let child_with_trivia = match child with | CstAtom r -> CstAtom { r with leading_trivia = child_trivia @ r.leading_trivia } | CstList r -> CstList { r with leading_trivia = child_trivia @ r.leading_trivia } | CstDict r -> CstDict { r with leading_trivia = child_trivia @ r.leading_trivia } in children := child_with_trivia :: !children; go () end in go () (** Inner read — no trivia collection (caller handles it). *) and read_cst_inner s : cst_node = read_cst s (** Parse result: list of CST nodes + any trailing trivia after the last node. *) type cst_file = { nodes : cst_node list; trailing_trivia : trivia list; } (** Parse a string into a list of CST nodes. *) let parse_all_cst src = let s = make_state src in let results = ref [] in let rec go () = let trivia = collect_trivia s in if at_end s then { nodes = List.rev !results; trailing_trivia = trivia } else begin let node = read_cst_inner s in (* Prepend collected trivia to this node *) let node_with_trivia = match node with | CstAtom r -> CstAtom { r with leading_trivia = trivia @ r.leading_trivia } | CstList r -> CstList { r with leading_trivia = trivia @ r.leading_trivia } | CstDict r -> CstDict { r with leading_trivia = trivia @ r.leading_trivia } in results := node_with_trivia :: !results; go () end in go () (** Parse a file into a list of CST nodes. *) let parse_file_cst path = let ic = open_in path in let n = in_channel_length ic in let src = really_input_string ic n in close_in ic; parse_all_cst src