2 Commits

Author SHA1 Message Date
9a090c6e42 ocaml: phase 1 expression parser (+37 tests, 95 total) — consumes lib/guest/pratt.sx
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 55s
Atoms (literals/var/con/unit/list), application (left-assoc), prefix - / not,
29-op precedence table via pratt-op-lookup (incl. keyword-spelled mod/land/
lor/lxor/lsl/lsr/asr), tuples, parens, if/then/else, fun, let, let rec
with function shorthand. AST follows Haskell-on-SX (:int / :op / :fun / etc).
2026-05-07 23:26:48 +00:00
85b7fed4fc ocaml: phase 1 tokenizer (+58 tests) — consumes lib/guest/lex.sx
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 52s
Idents, ctors, 51 keywords, numbers (int/float/hex/exp/underscored),
strings + chars with escapes, type variables, 26 op/punct tokens, and
nested (* ... *) block comments. Tests via epoch protocol against
sx_server.exe.
2026-05-07 23:04:40 +00:00
5 changed files with 1273 additions and 11 deletions

418
lib/ocaml/parser.sx Normal file
View File

@@ -0,0 +1,418 @@
;; lib/ocaml/parser.sx — OCaml expression parser.
;;
;; Input: token list from (ocaml-tokenize src).
;; Output: an OCaml AST. Nodes are plain lists tagged by a keyword head;
;; keywords serialize to their string name so `(list :var "x")` is the
;; same value as `(list "var" "x")` at runtime.
;;
;; Scope (this iteration — expressions only):
;; atoms int/float/string/char, true/false, unit (), var, con, list literal
;; application left-associative, f x y z
;; prefix -E unary minus, not E
;; infix standard ops via lib/guest/pratt.sx table
;; tuple a, b, c (lower than infix, higher than let/if)
;; parens (e)
;; if if c then t else e (else optional → unit)
;; fun fun x y -> body
;; let let x = e in body (no rec)
;; let f x y = e in body (function shorthand)
;; let rec f x = e in body
;;
;; AST shapes:
;; (:int N) (:float N) (:string S) (:char C) (:bool B) (:unit)
;; (:var NAME) (:con NAME)
;; (:app FN ARG) — binary, chain for multi-arg
;; (:op OP LHS RHS) — binary infix; OP is the source string
;; (:neg E) (:not E)
;; (:tuple ITEMS)
;; (:list ITEMS)
;; (:if C T E)
;; (:fun PARAMS BODY) — PARAMS list of strings (idents)
;; (:let NAME PARAMS EXPR BODY)
;; (:let-rec NAME PARAMS EXPR BODY)
(define ocaml-tok-type (fn (t) (if (= t nil) "eof" (get t :type))))
(define ocaml-tok-value (fn (t) (if (= t nil) nil (get t :value))))
;; Standard OCaml binary operator table.
;; Higher precedence = tighter binding.
;; ASSOC is :left or :right.
(define
ocaml-op-table
(list
(list "||" 2 :right)
(list "or" 2 :right)
(list "&&" 3 :right)
(list "&" 3 :right)
(list "=" 4 :left)
(list "<" 4 :left)
(list ">" 4 :left)
(list "<=" 4 :left)
(list ">=" 4 :left)
(list "<>" 4 :left)
(list "==" 4 :left)
(list "!=" 4 :left)
(list "|>" 4 :left)
(list "@" 5 :right)
(list "^" 5 :right)
(list "::" 6 :right)
(list "+" 7 :left)
(list "-" 7 :left)
(list "*" 8 :left)
(list "/" 8 :left)
(list "%" 8 :left)
(list "mod" 8 :left)
(list "land" 8 :left)
(list "lor" 8 :left)
(list "lxor" 8 :left)
(list "**" 9 :right)
(list "lsl" 9 :right)
(list "lsr" 9 :right)
(list "asr" 9 :right)))
(define
ocaml-binop-prec
(fn
(op)
(let
((entry (pratt-op-lookup ocaml-op-table op)))
(if (= entry nil) 0 (pratt-op-prec entry)))))
(define
ocaml-binop-right?
(fn
(op)
(let
((entry (pratt-op-lookup ocaml-op-table op)))
(and (not (= entry nil)) (= (pratt-op-assoc entry) :right)))))
;; Some OCaml binops are spelled with keyword tokens (mod / land / lor /
;; lxor / lsl / lsr / asr / or). Recognise both shapes.
(define
ocaml-tok-is-binop?
(fn
(tok)
(let
((tt (ocaml-tok-type tok)) (tv (ocaml-tok-value tok)))
(cond
((= tt "op") (not (= (ocaml-binop-prec tv) 0)))
((= tt "keyword") (not (= (ocaml-binop-prec tv) 0)))
(else false)))))
(define
ocaml-parse
(fn
(src)
(let
((tokens (ocaml-tokenize src)) (idx 0) (tok-len 0))
(begin
(set! tok-len (len tokens))
(define peek-tok (fn () (nth tokens idx)))
(define advance-tok! (fn () (set! idx (+ idx 1))))
(define
check-tok?
(fn
(type value)
(let
((t (peek-tok)))
(and
(= (ocaml-tok-type t) type)
(or (= value nil) (= (ocaml-tok-value t) value))))))
(define
consume!
(fn
(type value)
(if
(check-tok? type value)
(let ((t (peek-tok))) (begin (advance-tok!) t))
(error
(str
"ocaml-parse: expected "
type
" "
value
" got "
(ocaml-tok-type (peek-tok))
" "
(ocaml-tok-value (peek-tok)))))))
(define at-kw? (fn (kw) (check-tok? "keyword" kw)))
(define at-op? (fn (op) (check-tok? "op" op)))
(define parse-expr (fn () nil))
(define parse-tuple (fn () nil))
(define parse-binop-rhs (fn (lhs min-prec) lhs))
(define parse-prefix (fn () nil))
(define parse-app (fn () nil))
(define parse-atom (fn () nil))
(set!
parse-atom
(fn
()
(let
((t (peek-tok))
(tt (ocaml-tok-type (peek-tok)))
(tv (ocaml-tok-value (peek-tok))))
(cond
((= tt "number")
(begin
(advance-tok!)
(if (= (round tv) tv) (list :int tv) (list :float tv))))
((= tt "string") (begin (advance-tok!) (list :string tv)))
((= tt "char") (begin (advance-tok!) (list :char tv)))
((and (= tt "keyword") (= tv "true"))
(begin (advance-tok!) (list :bool true)))
((and (= tt "keyword") (= tv "false"))
(begin (advance-tok!) (list :bool false)))
((= tt "ident") (begin (advance-tok!) (list :var tv)))
((= tt "ctor") (begin (advance-tok!) (list :con tv)))
((and (= tt "op") (= tv "("))
(begin
(advance-tok!)
(cond
((at-op? ")") (begin (advance-tok!) (list :unit)))
(else
(let
((e (parse-expr)))
(begin (consume! "op" ")") e))))))
((and (= tt "op") (= tv "["))
(begin
(advance-tok!)
(cond
((at-op? "]") (begin (advance-tok!) (list :list)))
(else
(let
((items (list)))
(begin
(append! items (parse-expr))
(define
loop
(fn
()
(when
(at-op? ";")
(begin
(advance-tok!)
(when
(not (at-op? "]"))
(begin
(append! items (parse-expr))
(loop)))))))
(loop)
(consume! "op" "]")
(cons :list items)))))))
((at-kw? "begin")
(begin
(advance-tok!)
(let
((e (parse-expr)))
(begin (consume! "keyword" "end") e))))
(else
(error
(str
"ocaml-parse: unexpected token "
tt
" "
tv
" at idx "
idx)))))))
(define
at-app-start?
(fn
()
(let
((tt (ocaml-tok-type (peek-tok)))
(tv (ocaml-tok-value (peek-tok))))
(cond
((= tt "number") true)
((= tt "string") true)
((= tt "char") true)
((= tt "ident") true)
((= tt "ctor") true)
((and (= tt "keyword") (or (= tv "true") (= tv "false") (= tv "begin")))
true)
((and (= tt "op") (or (= tv "(") (= tv "["))) true)
(else false)))))
(set!
parse-app
(fn
()
(let
((head (parse-atom)))
(begin
(define
loop
(fn
()
(when
(at-app-start?)
(let
((arg (parse-atom)))
(begin (set! head (list :app head arg)) (loop))))))
(loop)
head))))
(set!
parse-prefix
(fn
()
(cond
((at-op? "-")
(begin (advance-tok!) (list :neg (parse-prefix))))
((at-kw? "not")
(begin (advance-tok!) (list :not (parse-prefix))))
(else (parse-app)))))
(set!
parse-binop-rhs
(fn
(lhs min-prec)
(let
((tok (peek-tok)))
(cond
((not (ocaml-tok-is-binop? tok)) lhs)
(else
(let
((op (ocaml-tok-value tok))
(prec (ocaml-binop-prec (ocaml-tok-value tok))))
(cond
((< prec min-prec) lhs)
(else
(begin
(advance-tok!)
(let
((rhs (parse-prefix))
(next-min
(if
(ocaml-binop-right? op)
prec
(+ prec 1))))
(begin
(set! rhs (parse-binop-rhs rhs next-min))
(parse-binop-rhs (list :op op lhs rhs) min-prec))))))))))))
(define
parse-binary
(fn
()
(let ((lhs (parse-prefix))) (parse-binop-rhs lhs 1))))
(set!
parse-tuple
(fn
()
(let
((first (parse-binary)))
(cond
((at-op? ",")
(let
((items (list first)))
(begin
(define
loop
(fn
()
(when
(at-op? ",")
(begin
(advance-tok!)
(append! items (parse-binary))
(loop)))))
(loop)
(cons :tuple items))))
(else first)))))
(define
parse-fun
(fn
()
(let
((params (list)))
(begin
(define
collect-params
(fn
()
(when
(check-tok? "ident" nil)
(begin
(append! params (ocaml-tok-value (peek-tok)))
(advance-tok!)
(collect-params)))))
(collect-params)
(when
(= (len params) 0)
(error "ocaml-parse: fun expects at least one parameter"))
(consume! "op" "->")
(let ((body (parse-expr))) (list :fun params body))))))
(define
parse-let
(fn
()
(let
((reccy false))
(begin
(when
(at-kw? "rec")
(begin (advance-tok!) (set! reccy true)))
(let
((name (ocaml-tok-value (consume! "ident" nil)))
(params (list)))
(begin
(define
collect-params
(fn
()
(when
(check-tok? "ident" nil)
(begin
(append! params (ocaml-tok-value (peek-tok)))
(advance-tok!)
(collect-params)))))
(collect-params)
(consume! "op" "=")
(let
((rhs (parse-expr)))
(begin
(consume! "keyword" "in")
(let
((body (parse-expr)))
(if
reccy
(list :let-rec name params rhs body)
(list :let name params rhs body)))))))))))
(define
parse-if
(fn
()
(let
((cond-expr (parse-expr)))
(begin
(consume! "keyword" "then")
(let
((then-expr (parse-expr)))
(cond
((at-kw? "else")
(begin
(advance-tok!)
(let
((else-expr (parse-expr)))
(list :if cond-expr then-expr else-expr))))
(else (list :if cond-expr then-expr (list :unit)))))))))
(set!
parse-expr
(fn
()
(cond
((at-kw? "fun") (begin (advance-tok!) (parse-fun)))
((at-kw? "let") (begin (advance-tok!) (parse-let)))
((at-kw? "if") (begin (advance-tok!) (parse-if)))
(else (parse-tuple)))))
(let
((result (parse-expr)))
(begin
(when
(not (= (ocaml-tok-type (peek-tok)) "eof"))
(error
(str
"ocaml-parse: trailing tokens at idx "
idx
" — got "
(ocaml-tok-type (peek-tok))
" "
(ocaml-tok-value (peek-tok)))))
result))))))

426
lib/ocaml/test.sh Executable file
View File

@@ -0,0 +1,426 @@
#!/usr/bin/env bash
# Fast OCaml-on-SX test runner — epoch protocol direct to sx_server.exe.
# Mirrors lib/lua/test.sh.
#
# Usage:
# bash lib/ocaml/test.sh # run all tests
# bash lib/ocaml/test.sh -v # verbose
set -uo pipefail
cd "$(git rev-parse --show-toplevel)"
SX_SERVER="${SX_SERVER:-hosts/ocaml/_build/default/bin/sx_server.exe}"
if [ ! -x "$SX_SERVER" ]; then
SX_SERVER="/root/rose-ash/hosts/ocaml/_build/default/bin/sx_server.exe"
fi
if [ ! -x "$SX_SERVER" ]; then
echo "ERROR: sx_server.exe not found. Run: cd hosts/ocaml && dune build"
exit 1
fi
VERBOSE="${1:-}"
PASS=0
FAIL=0
ERRORS=""
TMPFILE=$(mktemp)
trap "rm -f $TMPFILE" EXIT
cat > "$TMPFILE" << 'EPOCHS'
(epoch 1)
(load "lib/guest/lex.sx")
(load "lib/guest/prefix.sx")
(load "lib/guest/pratt.sx")
(load "lib/ocaml/tokenizer.sx")
(load "lib/ocaml/parser.sx")
(load "lib/ocaml/tests/tokenize.sx")
;; ── empty / eof ────────────────────────────────────────────────
(epoch 100)
(eval "(ocaml-test-tok-count \"\")")
(epoch 101)
(eval "(ocaml-test-tok-type \"\" 0)")
;; ── numbers ────────────────────────────────────────────────────
(epoch 110)
(eval "(ocaml-test-tok-type \"42\" 0)")
(epoch 111)
(eval "(ocaml-test-tok-value \"42\" 0)")
(epoch 112)
(eval "(ocaml-test-tok-value \"3.14\" 0)")
(epoch 113)
(eval "(ocaml-test-tok-value \"0xff\" 0)")
(epoch 114)
(eval "(ocaml-test-tok-value \"1e3\" 0)")
(epoch 115)
(eval "(ocaml-test-tok-value \"1_000_000\" 0)")
(epoch 116)
(eval "(ocaml-test-tok-value \"3.14e-2\" 0)")
;; ── identifiers / constructors / keywords ─────────────────────
(epoch 120)
(eval "(ocaml-test-tok-type \"foo\" 0)")
(epoch 121)
(eval "(ocaml-test-tok-value \"foo_bar1\" 0)")
(epoch 122)
(eval "(ocaml-test-tok-type \"Some\" 0)")
(epoch 123)
(eval "(ocaml-test-tok-value \"Some\" 0)")
(epoch 124)
(eval "(ocaml-test-tok-type \"let\" 0)")
(epoch 125)
(eval "(ocaml-test-tok-value \"match\" 0)")
(epoch 126)
(eval "(ocaml-test-tok-type \"true\" 0)")
(epoch 127)
(eval "(ocaml-test-tok-value \"false\" 0)")
(epoch 128)
(eval "(ocaml-test-tok-value \"name'\" 0)")
;; ── strings ────────────────────────────────────────────────────
(epoch 130)
(eval "(ocaml-test-tok-type \"\\\"hi\\\"\" 0)")
(epoch 131)
(eval "(ocaml-test-tok-value \"\\\"hi\\\"\" 0)")
(epoch 132)
(eval "(ocaml-test-tok-value \"\\\"a\\\\nb\\\"\" 0)")
;; ── chars ──────────────────────────────────────────────────────
(epoch 140)
(eval "(ocaml-test-tok-type \"'a'\" 0)")
(epoch 141)
(eval "(ocaml-test-tok-value \"'a'\" 0)")
(epoch 142)
(eval "(ocaml-test-tok-value \"'\\\\n'\" 0)")
;; ── type variables ─────────────────────────────────────────────
(epoch 145)
(eval "(ocaml-test-tok-type \"'a\" 0)")
(epoch 146)
(eval "(ocaml-test-tok-value \"'a\" 0)")
;; ── multi-char operators ───────────────────────────────────────
(epoch 150)
(eval "(ocaml-test-tok-value \"->\" 0)")
(epoch 151)
(eval "(ocaml-test-tok-value \"|>\" 0)")
(epoch 152)
(eval "(ocaml-test-tok-value \"<-\" 0)")
(epoch 153)
(eval "(ocaml-test-tok-value \":=\" 0)")
(epoch 154)
(eval "(ocaml-test-tok-value \"::\" 0)")
(epoch 155)
(eval "(ocaml-test-tok-value \";;\" 0)")
(epoch 156)
(eval "(ocaml-test-tok-value \"@@\" 0)")
(epoch 157)
(eval "(ocaml-test-tok-value \"<>\" 0)")
(epoch 158)
(eval "(ocaml-test-tok-value \"&&\" 0)")
(epoch 159)
(eval "(ocaml-test-tok-value \"||\" 0)")
;; ── single-char punctuation ────────────────────────────────────
(epoch 160)
(eval "(ocaml-test-tok-value \"+\" 0)")
(epoch 161)
(eval "(ocaml-test-tok-value \"|\" 0)")
(epoch 162)
(eval "(ocaml-test-tok-value \";\" 0)")
(epoch 163)
(eval "(ocaml-test-tok-value \"(\" 0)")
(epoch 164)
(eval "(ocaml-test-tok-value \"!\" 0)")
(epoch 165)
(eval "(ocaml-test-tok-value \"@\" 0)")
;; ── comments ───────────────────────────────────────────────────
(epoch 170)
(eval "(ocaml-test-tok-count \"(* hi *)\")")
(epoch 171)
(eval "(ocaml-test-tok-value \"(* c *) 42\" 0)")
(epoch 172)
(eval "(ocaml-test-tok-count \"(* outer (* inner *) end *) 1\")")
(epoch 173)
(eval "(ocaml-test-tok-value \"(* outer (* inner *) end *) 1\" 0)")
;; ── compound expressions ───────────────────────────────────────
(epoch 180)
(eval "(ocaml-test-tok-count \"let x = 1\")")
(epoch 181)
(eval "(ocaml-test-tok-type \"let x = 1\" 0)")
(epoch 182)
(eval "(ocaml-test-tok-value \"let x = 1\" 0)")
(epoch 183)
(eval "(ocaml-test-tok-type \"let x = 1\" 1)")
(epoch 184)
(eval "(ocaml-test-tok-value \"let x = 1\" 2)")
(epoch 185)
(eval "(ocaml-test-tok-value \"let x = 1\" 3)")
(epoch 190)
(eval "(ocaml-test-tok-count \"match x with | None -> 0 | Some y -> y\")")
(epoch 191)
(eval "(ocaml-test-tok-value \"fun x -> x + 1\" 2)")
(epoch 192)
(eval "(ocaml-test-tok-type \"fun x -> x + 1\" 2)")
(epoch 193)
(eval "(ocaml-test-tok-type \"Some 42\" 0)")
(epoch 194)
(eval "(ocaml-test-tok-value \"a |> f |> g\" 1)")
(epoch 195)
(eval "(ocaml-test-tok-value \"x := !y\" 1)")
;; ── Phase 1.parse: parser ──────────────────────────────────────
;; Atoms
(epoch 200)
(eval "(ocaml-parse \"42\")")
(epoch 201)
(eval "(ocaml-parse \"3.14\")")
(epoch 202)
(eval "(ocaml-parse \"\\\"hi\\\"\")")
(epoch 203)
(eval "(ocaml-parse \"'a'\")")
(epoch 204)
(eval "(ocaml-parse \"true\")")
(epoch 205)
(eval "(ocaml-parse \"false\")")
(epoch 206)
(eval "(ocaml-parse \"x\")")
(epoch 207)
(eval "(ocaml-parse \"Some\")")
(epoch 208)
(eval "(ocaml-parse \"()\")")
;; Application (left-assoc)
(epoch 210)
(eval "(ocaml-parse \"f x\")")
(epoch 211)
(eval "(ocaml-parse \"f x y\")")
(epoch 212)
(eval "(ocaml-parse \"f (g x)\")")
(epoch 213)
(eval "(ocaml-parse \"Some 42\")")
;; Binops with precedence
(epoch 220)
(eval "(ocaml-parse \"1 + 2\")")
(epoch 221)
(eval "(ocaml-parse \"a + b * c\")")
(epoch 222)
(eval "(ocaml-parse \"a * b + c\")")
(epoch 223)
(eval "(ocaml-parse \"a && b || c\")")
(epoch 224)
(eval "(ocaml-parse \"a = b\")")
(epoch 225)
(eval "(ocaml-parse \"a ^ b ^ c\")")
(epoch 226)
(eval "(ocaml-parse \"a :: b :: []\")")
(epoch 227)
(eval "(ocaml-parse \"(a + b) * c\")")
(epoch 228)
(eval "(ocaml-parse \"a |> f |> g\")")
(epoch 229)
(eval "(ocaml-parse \"x mod 2\")")
;; Prefix
(epoch 230)
(eval "(ocaml-parse \"-x\")")
(epoch 231)
(eval "(ocaml-parse \"-1 + 2\")")
;; Tuples & lists
(epoch 240)
(eval "(ocaml-parse \"(1, 2, 3)\")")
(epoch 241)
(eval "(ocaml-parse \"[1; 2; 3]\")")
(epoch 242)
(eval "(ocaml-parse \"[]\")")
;; if / fun / let / let rec
(epoch 250)
(eval "(ocaml-parse \"if x then 1 else 2\")")
(epoch 251)
(eval "(ocaml-parse \"if c then x\")")
(epoch 252)
(eval "(ocaml-parse \"fun x -> x + 1\")")
(epoch 253)
(eval "(ocaml-parse \"fun x y -> x + y\")")
(epoch 254)
(eval "(ocaml-parse \"let x = 1 in x\")")
(epoch 255)
(eval "(ocaml-parse \"let f x = x + 1 in f 2\")")
(epoch 256)
(eval "(ocaml-parse \"let rec f x = f x in f 1\")")
(epoch 257)
(eval "(ocaml-parse \"let f x y = x + y in f 1 2\")")
;; begin/end
(epoch 260)
(eval "(ocaml-parse \"begin 1 + 2 end\")")
EPOCHS
OUTPUT=$(timeout 60 "$SX_SERVER" < "$TMPFILE" 2>/dev/null)
check() {
local epoch="$1" desc="$2" expected="$3"
local actual
actual=$(echo "$OUTPUT" | grep -A1 "^(ok-len $epoch " | tail -1)
if [ -z "$actual" ]; then
actual=$(echo "$OUTPUT" | grep "^(ok $epoch " || true)
fi
if [ -z "$actual" ]; then
actual=$(echo "$OUTPUT" | grep "^(error $epoch " || true)
fi
if [ -z "$actual" ]; then
actual="<no output for epoch $epoch>"
fi
if echo "$actual" | grep -qF -- "$expected"; then
PASS=$((PASS + 1))
[ "$VERBOSE" = "-v" ] && echo " ok $desc"
else
FAIL=$((FAIL + 1))
ERRORS+=" FAIL $desc (epoch $epoch)
expected: $expected
actual: $actual
"
fi
}
# empty / eof
check 100 "empty tokens length" '1'
check 101 "empty first is eof" '"eof"'
# numbers
check 110 "int type" '"number"'
check 111 "int value" '42'
check 112 "float value" '3.14'
check 113 "hex value" '255'
check 114 "exponent" '1000'
check 115 "underscored int" '1000000'
check 116 "neg exponent" '0.0314'
# idents / ctors / keywords
check 120 "ident type" '"ident"'
check 121 "ident value" '"foo_bar1"'
check 122 "ctor type" '"ctor"'
check 123 "ctor value" '"Some"'
check 124 "let keyword type" '"keyword"'
check 125 "match keyword value" '"match"'
check 126 "true is keyword" '"keyword"'
check 127 "false value" '"false"'
check 128 "primed ident" "\"name'\""
# strings
check 130 "string type" '"string"'
check 131 "string value" '"hi"'
check 132 "escape sequence" '"a'
# chars
check 140 "char type" '"char"'
check 141 "char value" '"a"'
check 142 "char escape" '"'
# tyvars
check 145 "tyvar type" '"tyvar"'
check 146 "tyvar value" '"a"'
# multi-char ops
check 150 "->" '"->"'
check 151 "|>" '"|>"'
check 152 "<-" '"<-"'
check 153 ":=" '":="'
check 154 "::" '"::"'
check 155 ";;" '";;"'
check 156 "@@" '"@@"'
check 157 "<>" '"<>"'
check 158 "&&" '"&&"'
check 159 "||" '"||"'
# single ops
check 160 "+" '"+"'
check 161 "|" '"|"'
check 162 ";" '";"'
check 163 "(" '"("'
check 164 "!" '"!"'
check 165 "@" '"@"'
# comments
check 170 "block comment alone -> eof" '1'
check 171 "num after block comment" '42'
check 172 "nested comment count" '2'
check 173 "nested comment value" '1'
# compound
check 180 "let x = 1 count" '5'
check 181 "let is keyword" '"keyword"'
check 182 "let value" '"let"'
check 183 "x is ident" '"ident"'
check 184 "= value" '"="'
check 185 "1 value" '1'
check 190 "match expr count" '13'
check 191 "fun -> arrow value" '"->"'
check 192 "fun -> arrow type" '"op"'
check 193 "Some is ctor" '"ctor"'
check 194 "first |> value" '"|>"'
check 195 "ref assign :=" '":="'
# ── Parser tests ────────────────────────────────────────────────
check 200 "parse int" '("int" 42)'
check 201 "parse float" '("float" 3.14)'
check 202 "parse string" '("string" "hi")'
check 203 "parse char" '("char" "a")'
check 204 "parse true" '("bool" true)'
check 205 "parse false" '("bool" false)'
check 206 "parse var" '("var" "x")'
check 207 "parse ctor" '("con" "Some")'
check 208 "parse unit" '("unit")'
check 210 "parse f x" '("app" ("var" "f") ("var" "x"))'
check 211 "parse f x y left-assoc" '("app" ("app" ("var" "f") ("var" "x")) ("var" "y"))'
check 212 "parse f (g x)" '("app" ("var" "f") ("app" ("var" "g") ("var" "x")))'
check 213 "parse Some 42" '("app" ("con" "Some") ("int" 42))'
check 220 "parse 1+2" '("op" "+" ("int" 1) ("int" 2))'
check 221 "parse a + b * c prec" '("op" "+" ("var" "a") ("op" "*"'
check 222 "parse a*b + c prec" '("op" "+" ("op" "*"'
check 223 "parse && / || prec" '("op" "||" ("op" "&&"'
check 224 "parse a = b" '("op" "=" ("var" "a") ("var" "b"))'
check 225 "parse ^ right-assoc" '("op" "^" ("var" "a") ("op" "^"'
check 226 "parse :: right-assoc" '("op" "::" ("var" "a") ("op" "::"'
check 227 "parse parens override" '("op" "*" ("op" "+"'
check 228 "parse |> chain" '("op" "|>" ("op" "|>"'
check 229 "parse mod kw-binop" '("op" "mod" ("var" "x") ("int" 2))'
check 230 "parse -x" '("neg" ("var" "x"))'
check 231 "parse -1+2" '("op" "+" ("neg" ("int" 1)) ("int" 2))'
check 240 "parse tuple" '("tuple" ("int" 1) ("int" 2) ("int" 3))'
check 241 "parse list literal" '("list" ("int" 1) ("int" 2) ("int" 3))'
check 242 "parse []" '("list")'
check 250 "parse if/then/else" '("if" ("var" "x") ("int" 1) ("int" 2))'
check 251 "parse if w/o else" '("if" ("var" "c") ("var" "x") ("unit"))'
check 252 "parse fun x -> ..." '("fun" ("x") ("op" "+" ("var" "x") ("int" 1)))'
check 253 "parse fun x y ->" '("fun" ("x" "y")'
check 254 "parse let x = 1 in x" '("let" "x" () ("int" 1) ("var" "x"))'
check 255 "parse let f x =" '("let" "f" ("x") ("op" "+"'
check 256 "parse let rec f x =" '("let-rec" "f" ("x")'
check 257 "parse let f x y =" '("let" "f" ("x" "y")'
check 260 "parse begin/end" '("op" "+" ("int" 1) ("int" 2))'
TOTAL=$((PASS + FAIL))
if [ $FAIL -eq 0 ]; then
echo "ok $PASS/$TOTAL OCaml-on-SX tests passed"
else
echo "FAIL $PASS/$TOTAL passed, $FAIL failed:"
echo ""
echo "$ERRORS"
fi
[ $FAIL -eq 0 ]

View File

@@ -0,0 +1,21 @@
;; lib/ocaml/tests/tokenize.sx — smoke-test helpers.
;;
;; Tests are exercised via lib/ocaml/test.sh, which drives sx_server.exe
;; over the epoch protocol. This file provides small accessors so the
;; bash runner can grep short diagnostic values out of one batched run.
(define
ocaml-test-tok-type
(fn (src i) (get (nth (ocaml-tokenize src) i) :type)))
(define
ocaml-test-tok-value
(fn (src i) (get (nth (ocaml-tokenize src) i) :value)))
(define ocaml-test-tok-count (fn (src) (len (ocaml-tokenize src))))
(define ocaml-test-parse-str (fn (src) (ocaml-parse src)))
(define
ocaml-test-parse-head
(fn (src) (nth (ocaml-parse src) 0)))

382
lib/ocaml/tokenizer.sx Normal file
View File

@@ -0,0 +1,382 @@
;; lib/ocaml/tokenizer.sx — OCaml lexer.
;;
;; Tokens: ident, ctor (uppercase ident), keyword, number, string, char, op, eof.
;; Token shape: {:type :value :pos} via lex-make-token.
;; OCaml is not indentation-sensitive — no layout pass.
;; Block comments (* ... *) nest. There is no line-comment syntax.
(prefix-rename
"ocaml-"
(quote
((make-token lex-make-token)
(digit? lex-digit?)
(hex-digit? lex-hex-digit?)
(alpha? lex-alpha?)
(alnum? lex-alnum?)
(ident-start? lex-ident-start?)
(ident-char? lex-ident-char?)
(ws? lex-whitespace?))))
(define
ocaml-keywords
(list
"and"
"as"
"assert"
"begin"
"class"
"constraint"
"do"
"done"
"downto"
"else"
"end"
"exception"
"external"
"false"
"for"
"fun"
"function"
"functor"
"if"
"in"
"include"
"inherit"
"initializer"
"lazy"
"let"
"match"
"method"
"module"
"mutable"
"new"
"nonrec"
"object"
"of"
"open"
"or"
"private"
"rec"
"sig"
"struct"
"then"
"to"
"true"
"try"
"type"
"val"
"virtual"
"when"
"while"
"with"
"land"
"lor"
"lxor"
"lsl"
"lsr"
"asr"
"mod"))
(define ocaml-keyword? (fn (word) (contains? ocaml-keywords word)))
(define
ocaml-upper?
(fn (c) (and (not (= c nil)) (>= c "A") (<= c "Z"))))
(define
ocaml-tokenize
(fn
(src)
(let
((tokens (list)) (pos 0) (src-len (len src)))
(define
ocaml-peek
(fn
(offset)
(if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil)))
(define cur (fn () (ocaml-peek 0)))
(define advance! (fn (n) (set! pos (+ pos n))))
(define
push!
(fn
(type value start)
(append! tokens (ocaml-make-token type value start))))
(define
skip-block-comment!
(fn
(depth)
(cond
((>= pos src-len) nil)
((and (= (cur) "*") (= (ocaml-peek 1) ")"))
(begin
(advance! 2)
(when
(> depth 1)
(skip-block-comment! (- depth 1)))))
((and (= (cur) "(") (= (ocaml-peek 1) "*"))
(begin
(advance! 2)
(skip-block-comment! (+ depth 1))))
(else (begin (advance! 1) (skip-block-comment! depth))))))
(define
skip-ws!
(fn
()
(cond
((>= pos src-len) nil)
((ocaml-ws? (cur)) (begin (advance! 1) (skip-ws!)))
((and (= (cur) "(") (= (ocaml-peek 1) "*"))
(begin
(advance! 2)
(skip-block-comment! 1)
(skip-ws!)))
(else nil))))
(define
read-ident
(fn
(start)
(begin
(when
(and (< pos src-len) (ocaml-ident-char? (cur)))
(begin (advance! 1) (read-ident start)))
(when
(and (< pos src-len) (= (cur) "'"))
(begin (advance! 1) (read-ident start)))
(slice src start pos))))
(define
read-decimal-digits!
(fn
()
(when
(and (< pos src-len) (or (ocaml-digit? (cur)) (= (cur) "_")))
(begin (advance! 1) (read-decimal-digits!)))))
(define
read-hex-digits!
(fn
()
(when
(and
(< pos src-len)
(or (ocaml-hex-digit? (cur)) (= (cur) "_")))
(begin (advance! 1) (read-hex-digits!)))))
(define
read-exp-part!
(fn
()
(when
(and (< pos src-len) (or (= (cur) "e") (= (cur) "E")))
(let
((p1 (ocaml-peek 1)))
(when
(or
(and (not (= p1 nil)) (ocaml-digit? p1))
(and
(or (= p1 "+") (= p1 "-"))
(< (+ pos 2) src-len)
(ocaml-digit? (ocaml-peek 2))))
(begin
(advance! 1)
(when
(and
(< pos src-len)
(or (= (cur) "+") (= (cur) "-")))
(advance! 1))
(read-decimal-digits!)))))))
(define
strip-underscores
(fn
(s)
(let
((out (list)) (i 0) (n (len s)))
(begin
(define
loop
(fn
()
(when
(< i n)
(begin
(when
(not (= (nth s i) "_"))
(append! out (nth s i)))
(set! i (+ i 1))
(loop)))))
(loop)
(join "" out)))))
(define
read-number
(fn
(start)
(cond
((and (= (cur) "0") (< (+ pos 1) src-len) (or (= (ocaml-peek 1) "x") (= (ocaml-peek 1) "X")))
(begin
(advance! 2)
(read-hex-digits!)
(let
((raw (slice src (+ start 2) pos)))
(parse-number (str "0x" (strip-underscores raw))))))
(else
(begin
(read-decimal-digits!)
(when
(and
(< pos src-len)
(= (cur) ".")
(or
(>= (+ pos 1) src-len)
(not (= (ocaml-peek 1) "."))))
(begin (advance! 1) (read-decimal-digits!)))
(read-exp-part!)
(parse-number (strip-underscores (slice src start pos))))))))
(define
read-string-literal
(fn
()
(let
((chars (list)))
(begin
(advance! 1)
(define
loop
(fn
()
(cond
((>= pos src-len) nil)
((= (cur) "\\")
(begin
(advance! 1)
(when
(< pos src-len)
(let
((ch (cur)))
(begin
(cond
((= ch "n") (append! chars "\n"))
((= ch "t") (append! chars "\t"))
((= ch "r") (append! chars "\r"))
((= ch "b") (append! chars "\\b"))
((= ch "\\") (append! chars "\\"))
((= ch "'") (append! chars "'"))
((= ch "\"") (append! chars "\""))
((= ch " ") nil)
(else (append! chars ch)))
(advance! 1))))
(loop)))
((= (cur) "\"") (advance! 1))
(else
(begin
(append! chars (cur))
(advance! 1)
(loop))))))
(loop)
(join "" chars)))))
(define
read-char-literal
(fn
()
(begin
(advance! 1)
(let
((value (cond ((= (cur) "\\") (begin (advance! 1) (let ((ch (cur))) (begin (advance! 1) (cond ((= ch "n") "\n") ((= ch "t") "\t") ((= ch "r") "\r") ((= ch "b") "\\b") ((= ch "\\") "\\") ((= ch "'") "'") ((= ch "\"") "\"") (else ch)))))) (else (let ((ch (cur))) (begin (advance! 1) ch))))))
(begin
(when
(and (< pos src-len) (= (cur) "'"))
(advance! 1))
value)))))
(define
try-punct
(fn
(start)
(let
((c (cur))
(c1 (ocaml-peek 1))
(c2 (ocaml-peek 2)))
(cond
((and (= c ";") (= c1 ";"))
(begin (advance! 2) (push! "op" ";;" start) true))
((and (= c "-") (= c1 ">"))
(begin (advance! 2) (push! "op" "->" start) true))
((and (= c "<") (= c1 "-"))
(begin (advance! 2) (push! "op" "<-" start) true))
((and (= c ":") (= c1 "="))
(begin (advance! 2) (push! "op" ":=" start) true))
((and (= c ":") (= c1 ":"))
(begin (advance! 2) (push! "op" "::" start) true))
((and (= c "|") (= c1 "|"))
(begin (advance! 2) (push! "op" "||" start) true))
((and (= c "&") (= c1 "&"))
(begin (advance! 2) (push! "op" "&&" start) true))
((and (= c "<") (= c1 "="))
(begin (advance! 2) (push! "op" "<=" start) true))
((and (= c ">") (= c1 "="))
(begin (advance! 2) (push! "op" ">=" start) true))
((and (= c "<") (= c1 ">"))
(begin (advance! 2) (push! "op" "<>" start) true))
((and (= c "=") (= c1 "="))
(begin (advance! 2) (push! "op" "==" start) true))
((and (= c "!") (= c1 "="))
(begin (advance! 2) (push! "op" "!=" start) true))
((and (= c "|") (= c1 ">"))
(begin (advance! 2) (push! "op" "|>" start) true))
((and (= c "<") (= c1 "|"))
(begin (advance! 2) (push! "op" "<|" start) true))
((and (= c "@") (= c1 "@"))
(begin (advance! 2) (push! "op" "@@" start) true))
((and (= c "*") (= c1 "*"))
(begin (advance! 2) (push! "op" "**" start) true))
((or (= c "+") (= c "-") (= c "*") (= c "/") (= c "%") (= c "^") (= c "<") (= c ">") (= c "=") (= c "(") (= c ")") (= c "{") (= c "}") (= c "[") (= c "]") (= c ";") (= c ":") (= c ",") (= c ".") (= c "|") (= c "!") (= c "&") (= c "@") (= c "?") (= c "~") (= c "#"))
(begin (advance! 1) (push! "op" c start) true))
(else false)))))
(define
step
(fn
()
(begin
(skip-ws!)
(when
(< pos src-len)
(let
((start pos) (c (cur)))
(cond
((ocaml-ident-start? c)
(let
((word (read-ident start)))
(begin
(cond
((ocaml-keyword? word)
(push! "keyword" word start))
((ocaml-upper? c) (push! "ctor" word start))
(else (push! "ident" word start)))
(step))))
((ocaml-digit? c)
(let
((v (read-number start)))
(begin (push! "number" v start) (step))))
((= c "\"")
(let
((s (read-string-literal)))
(begin (push! "string" s start) (step))))
((and (= c "'") (< (+ pos 1) src-len) (or (and (= (ocaml-peek 1) "\\") (< (+ pos 3) src-len) (= (ocaml-peek 3) "'")) (and (not (= (ocaml-peek 1) "\\")) (< (+ pos 2) src-len) (= (ocaml-peek 2) "'"))))
(let
((v (read-char-literal)))
(begin (push! "char" v start) (step))))
((= c "'")
(begin
(advance! 1)
(when
(and (< pos src-len) (ocaml-ident-start? (cur)))
(begin
(advance! 1)
(read-ident (+ start 1))))
(push!
"tyvar"
(slice src (+ start 1) pos)
start)
(step)))
((try-punct start) (step))
(else
(error
(str "ocaml-tokenize: unexpected char " c " at " pos)))))))))
(step)
(push! "eof" nil pos)
tokens)))

View File

@@ -116,20 +116,22 @@ SX CEK evaluator (both JS and OCaml hosts)
### Phase 1 — Tokenizer + parser
- [ ] **Tokenizer:** keywords (`let`, `rec`, `in`, `fun`, `function`, `match`, `with`,
- [x] **Tokenizer:** keywords (`let`, `rec`, `in`, `fun`, `function`, `match`, `with`,
`type`, `of`, `module`, `struct`, `end`, `functor`, `sig`, `open`, `include`,
`if`, `then`, `else`, `begin`, `try`, `exception`, `raise`, `mutable`,
`for`, `while`, `do`, `done`, `and`, `as`, `when`), operators (`->`, `|>`,
`<|`, `@@`, `@`, `:=`, `!`, `::`, `**`, `:`, `;`, `;;`), identifiers (lower,
upper/ctor, labels `~label:`, optional `?label:`), char literals `'c'`,
string literals (escaped + heredoc `{|...|}`), int/float literals,
line comments `(*` nested block comments `*)`.
- [ ] **Parser:** top-level `let`/`let rec`/`type`/`module`/`exception`/`open`/`include`
declarations; expressions: literals, identifiers, constructor application,
lambda, application (left-assoc), binary ops with precedence table,
`if`/`then`/`else`, `match`/`with`, `try`/`with`, `let`/`in`, `begin`/`end`,
`fun`/`function`, tuples, list literals, record literals/updates, field access,
sequences `;`, unit `()`.
upper/ctor), char literals `'c'`, string literals (escaped),
int/float literals (incl. hex, exponent, underscores), nested block
comments `(* ... *)`. _(labels `~label:` / `?label:` and heredoc `{|...|}`
deferred — surface tokens already work via `~`/`?` punct + `{`/`|` punct.)_
- [~] **Parser:** expressions: literals, identifiers, constructor application,
lambda, application (left-assoc), binary ops with precedence (29 ops via
`lib/guest/pratt.sx`), `if`/`then`/`else`, `let`/`in`, `let rec`,
`fun`/`->`, tuples, list literals, `begin`/`end`, unit `()`. _(Pending:
top-level `let`/`type`/`module`/`exception`/`open`/`include` decls,
`match`/`with`, `try`/`with`, `function`, record literals/updates,
field access, sequences `;`.)_
- [ ] **Patterns:** constructor, literal, variable, wildcard `_`, tuple, list cons `::`,
list literal, record, `as`, or-pattern `P1 | P2`, `when` guard.
- [ ] OCaml is **not** indentation-sensitive — no layout algorithm needed.
@@ -308,7 +310,20 @@ the "mother tongue" closure: OCaml → SX → OCaml. This means:
_Newest first._
_(awaiting phase 1)_
- 2026-05-07 Phase 1 — `lib/ocaml/parser.sx` expression parser consuming
`lib/guest/pratt.sx` for binop precedence (29 operators across 8 levels,
incl. keyword-spelled binops `mod`/`land`/`lor`/`lxor`/`lsl`/`lsr`/`asr`).
Atoms (literals + var/con/unit/list), application (left-assoc), prefix
`-`/`not`, tuples, parens, `if`/`then`/`else`, `fun x y -> body`,
`let`/`let rec` with function shorthand. AST shapes match Haskell-on-SX
conventions (`(:int N)` `(:op OP L R)` `(:fun PARAMS BODY)` etc.). Total
95/95 tests now passing via `lib/ocaml/test.sh`.
- 2026-05-07 Phase 1 — `lib/ocaml/tokenizer.sx` consuming `lib/guest/lex.sx`
via `prefix-rename`. Covers idents, ctors, 51 keywords, numbers (int / float
/ hex / exponent / underscored), strings (with escapes), chars (with escapes),
type variables (`'a`), nested block comments, and 26 operator/punct tokens
(incl. `->` `|>` `<-` `:=` `::` `;;` `@@` `<>` `&&` `||` `**` etc.). 58/58
tokenizer tests pass via `lib/ocaml/test.sh` driving `sx_server.exe`.
## Blockers