From 9a090c6e42fedc38466dc38418cf8e92251ae6a2 Mon Sep 17 00:00:00 2001 From: giles Date: Thu, 7 May 2026 23:26:48 +0000 Subject: [PATCH] =?UTF-8?q?ocaml:=20phase=201=20expression=20parser=20(+37?= =?UTF-8?q?=20tests,=2095=20total)=20=E2=80=94=20consumes=20lib/guest/prat?= =?UTF-8?q?t.sx?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Atoms (literals/var/con/unit/list), application (left-assoc), prefix - / not, 29-op precedence table via pratt-op-lookup (incl. keyword-spelled mod/land/ lor/lxor/lsl/lsr/asr), tuples, parens, if/then/else, fun, let, let rec with function shorthand. AST follows Haskell-on-SX (:int / :op / :fun / etc). --- lib/ocaml/parser.sx | 418 ++++++++++++++++++++++++++++++++++++ lib/ocaml/test.sh | 138 +++++++++++- lib/ocaml/tests/tokenize.sx | 13 +- plans/ocaml-on-sx.md | 21 +- 4 files changed, 579 insertions(+), 11 deletions(-) create mode 100644 lib/ocaml/parser.sx diff --git a/lib/ocaml/parser.sx b/lib/ocaml/parser.sx new file mode 100644 index 00000000..5201d17c --- /dev/null +++ b/lib/ocaml/parser.sx @@ -0,0 +1,418 @@ +;; lib/ocaml/parser.sx — OCaml expression parser. +;; +;; Input: token list from (ocaml-tokenize src). +;; Output: an OCaml AST. Nodes are plain lists tagged by a keyword head; +;; keywords serialize to their string name so `(list :var "x")` is the +;; same value as `(list "var" "x")` at runtime. +;; +;; Scope (this iteration — expressions only): +;; atoms int/float/string/char, true/false, unit (), var, con, list literal +;; application left-associative, f x y z +;; prefix -E unary minus, not E +;; infix standard ops via lib/guest/pratt.sx table +;; tuple a, b, c (lower than infix, higher than let/if) +;; parens (e) +;; if if c then t else e (else optional → unit) +;; fun fun x y -> body +;; let let x = e in body (no rec) +;; let f x y = e in body (function shorthand) +;; let rec f x = e in body +;; +;; AST shapes: +;; (:int N) (:float N) (:string S) (:char C) (:bool B) (:unit) +;; (:var NAME) (:con NAME) +;; (:app FN ARG) — binary, chain for multi-arg +;; (:op OP LHS RHS) — binary infix; OP is the source string +;; (:neg E) (:not E) +;; (:tuple ITEMS) +;; (:list ITEMS) +;; (:if C T E) +;; (:fun PARAMS BODY) — PARAMS list of strings (idents) +;; (:let NAME PARAMS EXPR BODY) +;; (:let-rec NAME PARAMS EXPR BODY) + +(define ocaml-tok-type (fn (t) (if (= t nil) "eof" (get t :type)))) + +(define ocaml-tok-value (fn (t) (if (= t nil) nil (get t :value)))) + +;; Standard OCaml binary operator table. +;; Higher precedence = tighter binding. +;; ASSOC is :left or :right. +(define + ocaml-op-table + (list + (list "||" 2 :right) + (list "or" 2 :right) + (list "&&" 3 :right) + (list "&" 3 :right) + (list "=" 4 :left) + (list "<" 4 :left) + (list ">" 4 :left) + (list "<=" 4 :left) + (list ">=" 4 :left) + (list "<>" 4 :left) + (list "==" 4 :left) + (list "!=" 4 :left) + (list "|>" 4 :left) + (list "@" 5 :right) + (list "^" 5 :right) + (list "::" 6 :right) + (list "+" 7 :left) + (list "-" 7 :left) + (list "*" 8 :left) + (list "/" 8 :left) + (list "%" 8 :left) + (list "mod" 8 :left) + (list "land" 8 :left) + (list "lor" 8 :left) + (list "lxor" 8 :left) + (list "**" 9 :right) + (list "lsl" 9 :right) + (list "lsr" 9 :right) + (list "asr" 9 :right))) + +(define + ocaml-binop-prec + (fn + (op) + (let + ((entry (pratt-op-lookup ocaml-op-table op))) + (if (= entry nil) 0 (pratt-op-prec entry))))) + +(define + ocaml-binop-right? + (fn + (op) + (let + ((entry (pratt-op-lookup ocaml-op-table op))) + (and (not (= entry nil)) (= (pratt-op-assoc entry) :right))))) + +;; Some OCaml binops are spelled with keyword tokens (mod / land / lor / +;; lxor / lsl / lsr / asr / or). Recognise both shapes. +(define + ocaml-tok-is-binop? + (fn + (tok) + (let + ((tt (ocaml-tok-type tok)) (tv (ocaml-tok-value tok))) + (cond + ((= tt "op") (not (= (ocaml-binop-prec tv) 0))) + ((= tt "keyword") (not (= (ocaml-binop-prec tv) 0))) + (else false))))) + +(define + ocaml-parse + (fn + (src) + (let + ((tokens (ocaml-tokenize src)) (idx 0) (tok-len 0)) + (begin + (set! tok-len (len tokens)) + (define peek-tok (fn () (nth tokens idx))) + (define advance-tok! (fn () (set! idx (+ idx 1)))) + (define + check-tok? + (fn + (type value) + (let + ((t (peek-tok))) + (and + (= (ocaml-tok-type t) type) + (or (= value nil) (= (ocaml-tok-value t) value)))))) + (define + consume! + (fn + (type value) + (if + (check-tok? type value) + (let ((t (peek-tok))) (begin (advance-tok!) t)) + (error + (str + "ocaml-parse: expected " + type + " " + value + " got " + (ocaml-tok-type (peek-tok)) + " " + (ocaml-tok-value (peek-tok))))))) + (define at-kw? (fn (kw) (check-tok? "keyword" kw))) + (define at-op? (fn (op) (check-tok? "op" op))) + (define parse-expr (fn () nil)) + (define parse-tuple (fn () nil)) + (define parse-binop-rhs (fn (lhs min-prec) lhs)) + (define parse-prefix (fn () nil)) + (define parse-app (fn () nil)) + (define parse-atom (fn () nil)) + (set! + parse-atom + (fn + () + (let + ((t (peek-tok)) + (tt (ocaml-tok-type (peek-tok))) + (tv (ocaml-tok-value (peek-tok)))) + (cond + ((= tt "number") + (begin + (advance-tok!) + (if (= (round tv) tv) (list :int tv) (list :float tv)))) + ((= tt "string") (begin (advance-tok!) (list :string tv))) + ((= tt "char") (begin (advance-tok!) (list :char tv))) + ((and (= tt "keyword") (= tv "true")) + (begin (advance-tok!) (list :bool true))) + ((and (= tt "keyword") (= tv "false")) + (begin (advance-tok!) (list :bool false))) + ((= tt "ident") (begin (advance-tok!) (list :var tv))) + ((= tt "ctor") (begin (advance-tok!) (list :con tv))) + ((and (= tt "op") (= tv "(")) + (begin + (advance-tok!) + (cond + ((at-op? ")") (begin (advance-tok!) (list :unit))) + (else + (let + ((e (parse-expr))) + (begin (consume! "op" ")") e)))))) + ((and (= tt "op") (= tv "[")) + (begin + (advance-tok!) + (cond + ((at-op? "]") (begin (advance-tok!) (list :list))) + (else + (let + ((items (list))) + (begin + (append! items (parse-expr)) + (define + loop + (fn + () + (when + (at-op? ";") + (begin + (advance-tok!) + (when + (not (at-op? "]")) + (begin + (append! items (parse-expr)) + (loop))))))) + (loop) + (consume! "op" "]") + (cons :list items))))))) + ((at-kw? "begin") + (begin + (advance-tok!) + (let + ((e (parse-expr))) + (begin (consume! "keyword" "end") e)))) + (else + (error + (str + "ocaml-parse: unexpected token " + tt + " " + tv + " at idx " + idx))))))) + (define + at-app-start? + (fn + () + (let + ((tt (ocaml-tok-type (peek-tok))) + (tv (ocaml-tok-value (peek-tok)))) + (cond + ((= tt "number") true) + ((= tt "string") true) + ((= tt "char") true) + ((= tt "ident") true) + ((= tt "ctor") true) + ((and (= tt "keyword") (or (= tv "true") (= tv "false") (= tv "begin"))) + true) + ((and (= tt "op") (or (= tv "(") (= tv "["))) true) + (else false))))) + (set! + parse-app + (fn + () + (let + ((head (parse-atom))) + (begin + (define + loop + (fn + () + (when + (at-app-start?) + (let + ((arg (parse-atom))) + (begin (set! head (list :app head arg)) (loop)))))) + (loop) + head)))) + (set! + parse-prefix + (fn + () + (cond + ((at-op? "-") + (begin (advance-tok!) (list :neg (parse-prefix)))) + ((at-kw? "not") + (begin (advance-tok!) (list :not (parse-prefix)))) + (else (parse-app))))) + (set! + parse-binop-rhs + (fn + (lhs min-prec) + (let + ((tok (peek-tok))) + (cond + ((not (ocaml-tok-is-binop? tok)) lhs) + (else + (let + ((op (ocaml-tok-value tok)) + (prec (ocaml-binop-prec (ocaml-tok-value tok)))) + (cond + ((< prec min-prec) lhs) + (else + (begin + (advance-tok!) + (let + ((rhs (parse-prefix)) + (next-min + (if + (ocaml-binop-right? op) + prec + (+ prec 1)))) + (begin + (set! rhs (parse-binop-rhs rhs next-min)) + (parse-binop-rhs (list :op op lhs rhs) min-prec)))))))))))) + (define + parse-binary + (fn + () + (let ((lhs (parse-prefix))) (parse-binop-rhs lhs 1)))) + (set! + parse-tuple + (fn + () + (let + ((first (parse-binary))) + (cond + ((at-op? ",") + (let + ((items (list first))) + (begin + (define + loop + (fn + () + (when + (at-op? ",") + (begin + (advance-tok!) + (append! items (parse-binary)) + (loop))))) + (loop) + (cons :tuple items)))) + (else first))))) + (define + parse-fun + (fn + () + (let + ((params (list))) + (begin + (define + collect-params + (fn + () + (when + (check-tok? "ident" nil) + (begin + (append! params (ocaml-tok-value (peek-tok))) + (advance-tok!) + (collect-params))))) + (collect-params) + (when + (= (len params) 0) + (error "ocaml-parse: fun expects at least one parameter")) + (consume! "op" "->") + (let ((body (parse-expr))) (list :fun params body)))))) + (define + parse-let + (fn + () + (let + ((reccy false)) + (begin + (when + (at-kw? "rec") + (begin (advance-tok!) (set! reccy true))) + (let + ((name (ocaml-tok-value (consume! "ident" nil))) + (params (list))) + (begin + (define + collect-params + (fn + () + (when + (check-tok? "ident" nil) + (begin + (append! params (ocaml-tok-value (peek-tok))) + (advance-tok!) + (collect-params))))) + (collect-params) + (consume! "op" "=") + (let + ((rhs (parse-expr))) + (begin + (consume! "keyword" "in") + (let + ((body (parse-expr))) + (if + reccy + (list :let-rec name params rhs body) + (list :let name params rhs body))))))))))) + (define + parse-if + (fn + () + (let + ((cond-expr (parse-expr))) + (begin + (consume! "keyword" "then") + (let + ((then-expr (parse-expr))) + (cond + ((at-kw? "else") + (begin + (advance-tok!) + (let + ((else-expr (parse-expr))) + (list :if cond-expr then-expr else-expr)))) + (else (list :if cond-expr then-expr (list :unit))))))))) + (set! + parse-expr + (fn + () + (cond + ((at-kw? "fun") (begin (advance-tok!) (parse-fun))) + ((at-kw? "let") (begin (advance-tok!) (parse-let))) + ((at-kw? "if") (begin (advance-tok!) (parse-if))) + (else (parse-tuple))))) + (let + ((result (parse-expr))) + (begin + (when + (not (= (ocaml-tok-type (peek-tok)) "eof")) + (error + (str + "ocaml-parse: trailing tokens at idx " + idx + " — got " + (ocaml-tok-type (peek-tok)) + " " + (ocaml-tok-value (peek-tok))))) + result)))))) diff --git a/lib/ocaml/test.sh b/lib/ocaml/test.sh index 2750fa16..3b2fae6b 100755 --- a/lib/ocaml/test.sh +++ b/lib/ocaml/test.sh @@ -29,7 +29,9 @@ cat > "$TMPFILE" << 'EPOCHS' (epoch 1) (load "lib/guest/lex.sx") (load "lib/guest/prefix.sx") +(load "lib/guest/pratt.sx") (load "lib/ocaml/tokenizer.sx") +(load "lib/ocaml/parser.sx") (load "lib/ocaml/tests/tokenize.sx") ;; ── empty / eof ──────────────────────────────────────────────── @@ -169,6 +171,95 @@ cat > "$TMPFILE" << 'EPOCHS' (epoch 195) (eval "(ocaml-test-tok-value \"x := !y\" 1)") +;; ── Phase 1.parse: parser ────────────────────────────────────── +;; Atoms +(epoch 200) +(eval "(ocaml-parse \"42\")") +(epoch 201) +(eval "(ocaml-parse \"3.14\")") +(epoch 202) +(eval "(ocaml-parse \"\\\"hi\\\"\")") +(epoch 203) +(eval "(ocaml-parse \"'a'\")") +(epoch 204) +(eval "(ocaml-parse \"true\")") +(epoch 205) +(eval "(ocaml-parse \"false\")") +(epoch 206) +(eval "(ocaml-parse \"x\")") +(epoch 207) +(eval "(ocaml-parse \"Some\")") +(epoch 208) +(eval "(ocaml-parse \"()\")") + +;; Application (left-assoc) +(epoch 210) +(eval "(ocaml-parse \"f x\")") +(epoch 211) +(eval "(ocaml-parse \"f x y\")") +(epoch 212) +(eval "(ocaml-parse \"f (g x)\")") +(epoch 213) +(eval "(ocaml-parse \"Some 42\")") + +;; Binops with precedence +(epoch 220) +(eval "(ocaml-parse \"1 + 2\")") +(epoch 221) +(eval "(ocaml-parse \"a + b * c\")") +(epoch 222) +(eval "(ocaml-parse \"a * b + c\")") +(epoch 223) +(eval "(ocaml-parse \"a && b || c\")") +(epoch 224) +(eval "(ocaml-parse \"a = b\")") +(epoch 225) +(eval "(ocaml-parse \"a ^ b ^ c\")") +(epoch 226) +(eval "(ocaml-parse \"a :: b :: []\")") +(epoch 227) +(eval "(ocaml-parse \"(a + b) * c\")") +(epoch 228) +(eval "(ocaml-parse \"a |> f |> g\")") +(epoch 229) +(eval "(ocaml-parse \"x mod 2\")") + +;; Prefix +(epoch 230) +(eval "(ocaml-parse \"-x\")") +(epoch 231) +(eval "(ocaml-parse \"-1 + 2\")") + +;; Tuples & lists +(epoch 240) +(eval "(ocaml-parse \"(1, 2, 3)\")") +(epoch 241) +(eval "(ocaml-parse \"[1; 2; 3]\")") +(epoch 242) +(eval "(ocaml-parse \"[]\")") + +;; if / fun / let / let rec +(epoch 250) +(eval "(ocaml-parse \"if x then 1 else 2\")") +(epoch 251) +(eval "(ocaml-parse \"if c then x\")") +(epoch 252) +(eval "(ocaml-parse \"fun x -> x + 1\")") +(epoch 253) +(eval "(ocaml-parse \"fun x y -> x + y\")") +(epoch 254) +(eval "(ocaml-parse \"let x = 1 in x\")") +(epoch 255) +(eval "(ocaml-parse \"let f x = x + 1 in f 2\")") +(epoch 256) +(eval "(ocaml-parse \"let rec f x = f x in f 1\")") +(epoch 257) +(eval "(ocaml-parse \"let f x y = x + y in f 1 2\")") + +;; begin/end +(epoch 260) +(eval "(ocaml-parse \"begin 1 + 2 end\")") + EPOCHS OUTPUT=$(timeout 60 "$SX_SERVER" < "$TMPFILE" 2>/dev/null) @@ -278,9 +369,54 @@ check 193 "Some is ctor" '"ctor"' check 194 "first |> value" '"|>"' check 195 "ref assign :=" '":="' +# ── Parser tests ──────────────────────────────────────────────── +check 200 "parse int" '("int" 42)' +check 201 "parse float" '("float" 3.14)' +check 202 "parse string" '("string" "hi")' +check 203 "parse char" '("char" "a")' +check 204 "parse true" '("bool" true)' +check 205 "parse false" '("bool" false)' +check 206 "parse var" '("var" "x")' +check 207 "parse ctor" '("con" "Some")' +check 208 "parse unit" '("unit")' + +check 210 "parse f x" '("app" ("var" "f") ("var" "x"))' +check 211 "parse f x y left-assoc" '("app" ("app" ("var" "f") ("var" "x")) ("var" "y"))' +check 212 "parse f (g x)" '("app" ("var" "f") ("app" ("var" "g") ("var" "x")))' +check 213 "parse Some 42" '("app" ("con" "Some") ("int" 42))' + +check 220 "parse 1+2" '("op" "+" ("int" 1) ("int" 2))' +check 221 "parse a + b * c prec" '("op" "+" ("var" "a") ("op" "*"' +check 222 "parse a*b + c prec" '("op" "+" ("op" "*"' +check 223 "parse && / || prec" '("op" "||" ("op" "&&"' +check 224 "parse a = b" '("op" "=" ("var" "a") ("var" "b"))' +check 225 "parse ^ right-assoc" '("op" "^" ("var" "a") ("op" "^"' +check 226 "parse :: right-assoc" '("op" "::" ("var" "a") ("op" "::"' +check 227 "parse parens override" '("op" "*" ("op" "+"' +check 228 "parse |> chain" '("op" "|>" ("op" "|>"' +check 229 "parse mod kw-binop" '("op" "mod" ("var" "x") ("int" 2))' + +check 230 "parse -x" '("neg" ("var" "x"))' +check 231 "parse -1+2" '("op" "+" ("neg" ("int" 1)) ("int" 2))' + +check 240 "parse tuple" '("tuple" ("int" 1) ("int" 2) ("int" 3))' +check 241 "parse list literal" '("list" ("int" 1) ("int" 2) ("int" 3))' +check 242 "parse []" '("list")' + +check 250 "parse if/then/else" '("if" ("var" "x") ("int" 1) ("int" 2))' +check 251 "parse if w/o else" '("if" ("var" "c") ("var" "x") ("unit"))' +check 252 "parse fun x -> ..." '("fun" ("x") ("op" "+" ("var" "x") ("int" 1)))' +check 253 "parse fun x y ->" '("fun" ("x" "y")' +check 254 "parse let x = 1 in x" '("let" "x" () ("int" 1) ("var" "x"))' +check 255 "parse let f x =" '("let" "f" ("x") ("op" "+"' +check 256 "parse let rec f x =" '("let-rec" "f" ("x")' +check 257 "parse let f x y =" '("let" "f" ("x" "y")' + +check 260 "parse begin/end" '("op" "+" ("int" 1) ("int" 2))' + TOTAL=$((PASS + FAIL)) if [ $FAIL -eq 0 ]; then - echo "ok $PASS/$TOTAL OCaml-on-SX tokenizer tests passed" + echo "ok $PASS/$TOTAL OCaml-on-SX tests passed" else echo "FAIL $PASS/$TOTAL passed, $FAIL failed:" echo "" diff --git a/lib/ocaml/tests/tokenize.sx b/lib/ocaml/tests/tokenize.sx index cdf8955a..68235703 100644 --- a/lib/ocaml/tests/tokenize.sx +++ b/lib/ocaml/tests/tokenize.sx @@ -1,9 +1,8 @@ -;; lib/ocaml/tests/tokenize.sx — smoke tests for the OCaml tokenizer. +;; lib/ocaml/tests/tokenize.sx — smoke-test helpers. ;; ;; Tests are exercised via lib/ocaml/test.sh, which drives sx_server.exe -;; over the epoch protocol. This file provides a small evaluator that -;; returns short diagnostic values for each fixture so the runner can -;; grep them out of one batched run. +;; over the epoch protocol. This file provides small accessors so the +;; bash runner can grep short diagnostic values out of one batched run. (define ocaml-test-tok-type @@ -14,3 +13,9 @@ (fn (src i) (get (nth (ocaml-tokenize src) i) :value))) (define ocaml-test-tok-count (fn (src) (len (ocaml-tokenize src)))) + +(define ocaml-test-parse-str (fn (src) (ocaml-parse src))) + +(define + ocaml-test-parse-head + (fn (src) (nth (ocaml-parse src) 0))) diff --git a/plans/ocaml-on-sx.md b/plans/ocaml-on-sx.md index e16759aa..54bd98f2 100644 --- a/plans/ocaml-on-sx.md +++ b/plans/ocaml-on-sx.md @@ -125,12 +125,13 @@ SX CEK evaluator (both JS and OCaml hosts) int/float literals (incl. hex, exponent, underscores), nested block comments `(* ... *)`. _(labels `~label:` / `?label:` and heredoc `{|...|}` deferred — surface tokens already work via `~`/`?` punct + `{`/`|` punct.)_ -- [ ] **Parser:** top-level `let`/`let rec`/`type`/`module`/`exception`/`open`/`include` - declarations; expressions: literals, identifiers, constructor application, - lambda, application (left-assoc), binary ops with precedence table, - `if`/`then`/`else`, `match`/`with`, `try`/`with`, `let`/`in`, `begin`/`end`, - `fun`/`function`, tuples, list literals, record literals/updates, field access, - sequences `;`, unit `()`. +- [~] **Parser:** expressions: literals, identifiers, constructor application, + lambda, application (left-assoc), binary ops with precedence (29 ops via + `lib/guest/pratt.sx`), `if`/`then`/`else`, `let`/`in`, `let rec`, + `fun`/`->`, tuples, list literals, `begin`/`end`, unit `()`. _(Pending: + top-level `let`/`type`/`module`/`exception`/`open`/`include` decls, + `match`/`with`, `try`/`with`, `function`, record literals/updates, + field access, sequences `;`.)_ - [ ] **Patterns:** constructor, literal, variable, wildcard `_`, tuple, list cons `::`, list literal, record, `as`, or-pattern `P1 | P2`, `when` guard. - [ ] OCaml is **not** indentation-sensitive — no layout algorithm needed. @@ -309,6 +310,14 @@ the "mother tongue" closure: OCaml → SX → OCaml. This means: _Newest first._ +- 2026-05-07 Phase 1 — `lib/ocaml/parser.sx` expression parser consuming + `lib/guest/pratt.sx` for binop precedence (29 operators across 8 levels, + incl. keyword-spelled binops `mod`/`land`/`lor`/`lxor`/`lsl`/`lsr`/`asr`). + Atoms (literals + var/con/unit/list), application (left-assoc), prefix + `-`/`not`, tuples, parens, `if`/`then`/`else`, `fun x y -> body`, + `let`/`let rec` with function shorthand. AST shapes match Haskell-on-SX + conventions (`(:int N)` `(:op OP L R)` `(:fun PARAMS BODY)` etc.). Total + 95/95 tests now passing via `lib/ocaml/test.sh`. - 2026-05-07 Phase 1 — `lib/ocaml/tokenizer.sx` consuming `lib/guest/lex.sx` via `prefix-rename`. Covers idents, ctors, 51 keywords, numbers (int / float / hex / exponent / underscored), strings (with escapes), chars (with escapes),