(** S-expression parser. Recursive descent over a string, producing [Sx_types.value list]. Supports: lists, dicts, symbols, keywords, strings (with escapes), numbers, booleans, nil, comments, quote/quasiquote/unquote sugar. *) open Sx_types type state = { src : string; len : int; mutable pos : int; } let make_state src = { src; len = String.length src; pos = 0 } let peek s = if s.pos < s.len then Some s.src.[s.pos] else None let advance s = s.pos <- s.pos + 1 let at_end s = s.pos >= s.len let skip_whitespace_and_comments s = let rec go () = if at_end s then () else match s.src.[s.pos] with | ' ' | '\t' | '\n' | '\r' -> advance s; go () | ';' -> while s.pos < s.len && s.src.[s.pos] <> '\n' do advance s done; if s.pos < s.len then advance s; go () | _ -> () in go () (* Character classification — matches spec/parser.sx ident-start/ident-char. ident-start: a-z A-Z _ ~ * + - > < = / ! ? & ident-char: ident-start plus 0-9 . : / # , *) let is_ident_start = function | 'a'..'z' | 'A'..'Z' | '_' | '~' | '*' | '+' | '-' | '>' | '<' | '=' | '/' | '!' | '?' | '&' -> true | _ -> false let is_ident_char = function | c when is_ident_start c -> true | '0'..'9' | '.' | ':' | '#' | ',' -> true | _ -> false (* Symbol reading uses ident_char; first char must be ident_start or digit/colon *) let is_symbol_char = is_ident_char let read_string s = (* s.pos is on the opening quote *) advance s; let buf = Buffer.create 64 in let rec go () = if at_end s then raise (Parse_error "Unterminated string"); let c = s.src.[s.pos] in advance s; if c = '"' then Buffer.contents buf else if c = '\\' then begin if at_end s then raise (Parse_error "Unterminated string escape"); let esc = s.src.[s.pos] in advance s; (match esc with | 'n' -> Buffer.add_char buf '\n' | 't' -> Buffer.add_char buf '\t' | 'r' -> Buffer.add_char buf '\r' | '"' -> Buffer.add_char buf '"' | '\\' -> Buffer.add_char buf '\\' | 'u' -> (* \uXXXX — read 4 hex digits, encode as UTF-8 *) if s.pos + 4 > s.len then raise (Parse_error "Incomplete \\u escape"); let hex = String.sub s.src s.pos 4 in s.pos <- s.pos + 4; let code = int_of_string ("0x" ^ hex) in let ubuf = Buffer.create 4 in Buffer.add_utf_8_uchar ubuf (Uchar.of_int code); Buffer.add_string buf (Buffer.contents ubuf) | '`' -> Buffer.add_char buf '`' | _ -> Buffer.add_char buf '\\'; Buffer.add_char buf esc); go () end else begin Buffer.add_char buf c; go () end in go () let read_symbol s = let start = s.pos in while s.pos < s.len && is_symbol_char s.src.[s.pos] do advance s done; String.sub s.src start (s.pos - start) let try_number str = match float_of_string_opt str with | Some n -> Some (Number n) | None -> None let rec read_value s : value = skip_whitespace_and_comments s; if at_end s then raise (Parse_error "Unexpected end of input"); match s.src.[s.pos] with | '(' -> read_list s ')' | '[' -> read_list s ']' | '{' -> read_dict s | '"' -> String (read_string s) | '\'' -> advance s; List [Symbol "quote"; read_value s] | '`' -> advance s; List [Symbol "quasiquote"; read_value s] | '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = ';' -> (* Datum comment: #; discards next expression *) advance s; advance s; ignore (read_value s); read_value s | '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '\'' -> (* Quote shorthand: #'expr -> (quote expr) *) advance s; advance s; List [Symbol "quote"; read_value s] | '#' when s.pos + 1 < s.len && s.src.[s.pos + 1] = '|' -> (* Raw string: #|...| — ends at next | *) advance s; advance s; let buf = Buffer.create 64 in let rec go () = if at_end s then raise (Parse_error "Unterminated raw string"); let c = s.src.[s.pos] in advance s; if c = '|' then String (Buffer.contents buf) else begin Buffer.add_char buf c; go () end in go () | ',' -> (* Unquote / splice-unquote — matches spec: , always triggers unquote *) advance s; if s.pos < s.len && s.src.[s.pos] = '@' then begin advance s; List [Symbol "splice-unquote"; read_value s] end else List [Symbol "unquote"; read_value s] | _ -> begin (* Symbol, keyword, number, or boolean *) let token = read_symbol s in if token = "" then raise (Parse_error ("Unexpected char: " ^ String.make 1 s.src.[s.pos])); match token with | "true" -> Bool true | "false" -> Bool false | "nil" -> Nil | _ when token.[0] = ':' -> Keyword (String.sub token 1 (String.length token - 1)) | _ -> match try_number token with | Some n -> n | None -> Symbol token end and read_list s close_char = advance s; (* skip opening paren/bracket *) let items = ref [] in let rec go () = skip_whitespace_and_comments s; if at_end s then raise (Parse_error "Unterminated list"); if s.src.[s.pos] = close_char then begin advance s; List (List.rev !items) end else begin items := read_value s :: !items; go () end in go () and read_dict s = advance s; (* skip { *) let d = make_dict () in let rec go () = skip_whitespace_and_comments s; if at_end s then raise (Parse_error "Unterminated dict"); if s.src.[s.pos] = '}' then begin advance s; Dict d end else begin let key = read_value s in let key_str = match key with | Keyword k -> k | String k -> k | Symbol k -> k | _ -> raise (Parse_error "Dict key must be keyword, string, or symbol") in let v = read_value s in dict_set d key_str v; go () end in go () (** Parse a string into a list of SX values. *) let parse_all src = let s = make_state src in let results = ref [] in let rec go () = skip_whitespace_and_comments s; if at_end s then List.rev !results else begin results := read_value s :: !results; go () end in go () (** Parse a file into a list of SX values. *) let parse_file path = let ic = open_in path in let n = in_channel_length ic in let src = really_input_string ic n in close_in ic; parse_all src