ocaml: phase 1 tokenizer (+58 tests) — consumes lib/guest/lex.sx
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 52s

Idents, ctors, 56 keywords, numbers (int/float/hex/exp/underscored),
strings + chars with escapes, type variables, 42 op/punct tokens (16
two-character, 26 single-character), and nested (* ... *) block comments.
Tests via epoch protocol against sx_server.exe.
This commit is contained in:
2026-05-07 23:04:40 +00:00
parent 1eb9d0f8d2
commit 85b7fed4fc
4 changed files with 699 additions and 5 deletions

290
lib/ocaml/test.sh Executable file
View File

@@ -0,0 +1,290 @@
#!/usr/bin/env bash
# Fast OCaml-on-SX test runner — epoch protocol direct to sx_server.exe.
# Mirrors lib/lua/test.sh.
#
# Usage:
#   bash lib/ocaml/test.sh      # run all tests
#   bash lib/ocaml/test.sh -v   # verbose
set -uo pipefail
# Work from the repository root so the relative lib/… paths below resolve.
cd "$(git rev-parse --show-toplevel)"
# Server binary: $SX_SERVER if set, else the in-tree dune build output,
# else the same build output under /root/rose-ash.
SX_SERVER="${SX_SERVER:-hosts/ocaml/_build/default/bin/sx_server.exe}"
if [ ! -x "$SX_SERVER" ]; then
  SX_SERVER="/root/rose-ash/hosts/ocaml/_build/default/bin/sx_server.exe"
fi
if [ ! -x "$SX_SERVER" ]; then
  echo "ERROR: sx_server.exe not found. Run: cd hosts/ocaml && dune build"
  exit 1
fi
VERBOSE="${1:-}"
PASS=0
FAIL=0
ERRORS=""
# Scratch file holding the epoch script; bail out early if it cannot be created.
TMPFILE=$(mktemp) || { echo "ERROR: mktemp failed" >&2; exit 1; }
# Single quotes defer expansion to exit time and keep the path quoted for rm,
# so a temp directory containing spaces or glob characters is still removed.
trap 'rm -f "$TMPFILE"' EXIT
cat > "$TMPFILE" << 'EPOCHS'
(epoch 1)
(load "lib/guest/lex.sx")
(load "lib/guest/prefix.sx")
(load "lib/ocaml/tokenizer.sx")
(load "lib/ocaml/tests/tokenize.sx")
;; ── empty / eof ────────────────────────────────────────────────
(epoch 100)
(eval "(ocaml-test-tok-count \"\")")
(epoch 101)
(eval "(ocaml-test-tok-type \"\" 0)")
;; ── numbers ────────────────────────────────────────────────────
(epoch 110)
(eval "(ocaml-test-tok-type \"42\" 0)")
(epoch 111)
(eval "(ocaml-test-tok-value \"42\" 0)")
(epoch 112)
(eval "(ocaml-test-tok-value \"3.14\" 0)")
(epoch 113)
(eval "(ocaml-test-tok-value \"0xff\" 0)")
(epoch 114)
(eval "(ocaml-test-tok-value \"1e3\" 0)")
(epoch 115)
(eval "(ocaml-test-tok-value \"1_000_000\" 0)")
(epoch 116)
(eval "(ocaml-test-tok-value \"3.14e-2\" 0)")
;; ── identifiers / constructors / keywords ─────────────────────
(epoch 120)
(eval "(ocaml-test-tok-type \"foo\" 0)")
(epoch 121)
(eval "(ocaml-test-tok-value \"foo_bar1\" 0)")
(epoch 122)
(eval "(ocaml-test-tok-type \"Some\" 0)")
(epoch 123)
(eval "(ocaml-test-tok-value \"Some\" 0)")
(epoch 124)
(eval "(ocaml-test-tok-type \"let\" 0)")
(epoch 125)
(eval "(ocaml-test-tok-value \"match\" 0)")
(epoch 126)
(eval "(ocaml-test-tok-type \"true\" 0)")
(epoch 127)
(eval "(ocaml-test-tok-value \"false\" 0)")
(epoch 128)
(eval "(ocaml-test-tok-value \"name'\" 0)")
;; ── strings ────────────────────────────────────────────────────
(epoch 130)
(eval "(ocaml-test-tok-type \"\\\"hi\\\"\" 0)")
(epoch 131)
(eval "(ocaml-test-tok-value \"\\\"hi\\\"\" 0)")
(epoch 132)
(eval "(ocaml-test-tok-value \"\\\"a\\\\nb\\\"\" 0)")
;; ── chars ──────────────────────────────────────────────────────
(epoch 140)
(eval "(ocaml-test-tok-type \"'a'\" 0)")
(epoch 141)
(eval "(ocaml-test-tok-value \"'a'\" 0)")
(epoch 142)
(eval "(ocaml-test-tok-value \"'\\\\n'\" 0)")
;; ── type variables ─────────────────────────────────────────────
(epoch 145)
(eval "(ocaml-test-tok-type \"'a\" 0)")
(epoch 146)
(eval "(ocaml-test-tok-value \"'a\" 0)")
;; ── multi-char operators ───────────────────────────────────────
(epoch 150)
(eval "(ocaml-test-tok-value \"->\" 0)")
(epoch 151)
(eval "(ocaml-test-tok-value \"|>\" 0)")
(epoch 152)
(eval "(ocaml-test-tok-value \"<-\" 0)")
(epoch 153)
(eval "(ocaml-test-tok-value \":=\" 0)")
(epoch 154)
(eval "(ocaml-test-tok-value \"::\" 0)")
(epoch 155)
(eval "(ocaml-test-tok-value \";;\" 0)")
(epoch 156)
(eval "(ocaml-test-tok-value \"@@\" 0)")
(epoch 157)
(eval "(ocaml-test-tok-value \"<>\" 0)")
(epoch 158)
(eval "(ocaml-test-tok-value \"&&\" 0)")
(epoch 159)
(eval "(ocaml-test-tok-value \"||\" 0)")
;; ── single-char punctuation ────────────────────────────────────
(epoch 160)
(eval "(ocaml-test-tok-value \"+\" 0)")
(epoch 161)
(eval "(ocaml-test-tok-value \"|\" 0)")
(epoch 162)
(eval "(ocaml-test-tok-value \";\" 0)")
(epoch 163)
(eval "(ocaml-test-tok-value \"(\" 0)")
(epoch 164)
(eval "(ocaml-test-tok-value \"!\" 0)")
(epoch 165)
(eval "(ocaml-test-tok-value \"@\" 0)")
;; ── comments ───────────────────────────────────────────────────
(epoch 170)
(eval "(ocaml-test-tok-count \"(* hi *)\")")
(epoch 171)
(eval "(ocaml-test-tok-value \"(* c *) 42\" 0)")
(epoch 172)
(eval "(ocaml-test-tok-count \"(* outer (* inner *) end *) 1\")")
(epoch 173)
(eval "(ocaml-test-tok-value \"(* outer (* inner *) end *) 1\" 0)")
;; ── compound expressions ───────────────────────────────────────
(epoch 180)
(eval "(ocaml-test-tok-count \"let x = 1\")")
(epoch 181)
(eval "(ocaml-test-tok-type \"let x = 1\" 0)")
(epoch 182)
(eval "(ocaml-test-tok-value \"let x = 1\" 0)")
(epoch 183)
(eval "(ocaml-test-tok-type \"let x = 1\" 1)")
(epoch 184)
(eval "(ocaml-test-tok-value \"let x = 1\" 2)")
(epoch 185)
(eval "(ocaml-test-tok-value \"let x = 1\" 3)")
(epoch 190)
(eval "(ocaml-test-tok-count \"match x with | None -> 0 | Some y -> y\")")
(epoch 191)
(eval "(ocaml-test-tok-value \"fun x -> x + 1\" 2)")
(epoch 192)
(eval "(ocaml-test-tok-type \"fun x -> x + 1\" 2)")
(epoch 193)
(eval "(ocaml-test-tok-type \"Some 42\" 0)")
(epoch 194)
(eval "(ocaml-test-tok-value \"a |> f |> g\" 1)")
(epoch 195)
(eval "(ocaml-test-tok-value \"x := !y\" 1)")
EPOCHS
# Run the whole epoch script through the server in one batch. The run is
# capped at 60 seconds and the server's stderr is discarded; only stdout is
# kept for check() to inspect.
OUTPUT=$(timeout 60 "$SX_SERVER" < "$TMPFILE" 2>/dev/null)
# check EPOCH DESC EXPECTED
#
# Looks up the server's reply for EPOCH in $OUTPUT and records a pass when
# EXPECTED occurs as a fixed substring of the reply *payload*.
#
# Reply shapes recognised (in this order):
#   (ok-len EPOCH …)   header line; the payload is the line that follows it
#   (ok EPOCH VALUE)   payload is VALUE
#   (error EPOCH MSG)  payload is MSG
#
# The "(ok EPOCH " / "(error EPOCH " wrapper is stripped before matching.
# Matching against the whole line would let an expectation such as '1' match
# the epoch number itself (e.g. "(ok 100 …", "(error 100 …"), so the test
# could never fail. A missing reply is always a failure.
#
# Globals read:    OUTPUT, VERBOSE
# Globals updated: PASS, FAIL, ERRORS
check() {
  local epoch="$1" desc="$2" expected="$3"
  local actual="" shown="" line found=0

  # ok-len: take the line after the header. If the header is the last line of
  # output, tail -1 yields the header itself, which is not a payload.
  line=$(printf '%s\n' "$OUTPUT" | grep -A1 "^(ok-len $epoch " | tail -1)
  case "$line" in
    "" | "(ok-len $epoch "*) ;;
    *) actual="$line"; shown="$line"; found=1 ;;
  esac

  if [ "$found" -eq 0 ]; then
    line=$(printf '%s\n' "$OUTPUT" | grep "^(ok $epoch " || true)
    if [ -n "$line" ]; then
      actual="${line#"(ok $epoch "}"
      actual="${actual%)}"          # drop only the wrapper's closing paren
      shown="$actual"
      found=1
    fi
  fi

  if [ "$found" -eq 0 ]; then
    line=$(printf '%s\n' "$OUTPUT" | grep "^(error $epoch " || true)
    if [ -n "$line" ]; then
      actual="${line#"(error $epoch "}"
      actual="${actual%)}"
      shown="error: $actual"
      found=1
    fi
  fi

  if [ "$found" -eq 0 ]; then
    shown="<no output for epoch $epoch>"
  fi

  # Here-string instead of a pipe: with pipefail set, an early exit of
  # `grep -q` must not be able to turn a match into a pipeline failure.
  if [ "$found" -eq 1 ] && grep -qF -- "$expected" <<< "$actual"; then
    PASS=$((PASS + 1))
    if [ "$VERBOSE" = "-v" ]; then
      echo " ok $desc"
    fi
  else
    FAIL=$((FAIL + 1))
    ERRORS+=" FAIL $desc (epoch $epoch)
expected: $expected
actual: $shown
"
  fi
}
# Expectations, one per epoch sent in the script above:
#   check EPOCH DESCRIPTION EXPECTED
# check() tests EXPECTED as a fixed substring (grep -F), so string results
# are written with their surrounding double quotes.
# empty / eof
check 100 "empty tokens length" '1'
check 101 "empty first is eof" '"eof"'
# numbers
check 110 "int type" '"number"'
check 111 "int value" '42'
check 112 "float value" '3.14'
check 113 "hex value" '255'
check 114 "exponent" '1000'
check 115 "underscored int" '1000000'
check 116 "neg exponent" '0.0314'
# idents / ctors / keywords
check 120 "ident type" '"ident"'
check 121 "ident value" '"foo_bar1"'
check 122 "ctor type" '"ctor"'
check 123 "ctor value" '"Some"'
check 124 "let keyword type" '"keyword"'
check 125 "match keyword value" '"match"'
check 126 "true is keyword" '"keyword"'
check 127 "false value" '"false"'
check 128 "primed ident" "\"name'\""
# strings
check 130 "string type" '"string"'
check 131 "string value" '"hi"'
# Only the opening quote and first character are asserted here.
check 132 "escape sequence" '"a'
# chars
check 140 "char type" '"char"'
check 141 "char value" '"a"'
# Only the presence of a double quote is asserted here.
check 142 "char escape" '"'
# tyvars
check 145 "tyvar type" '"tyvar"'
check 146 "tyvar value" '"a"'
# multi-char ops
check 150 "->" '"->"'
check 151 "|>" '"|>"'
check 152 "<-" '"<-"'
check 153 ":=" '":="'
check 154 "::" '"::"'
check 155 ";;" '";;"'
check 156 "@@" '"@@"'
check 157 "<>" '"<>"'
check 158 "&&" '"&&"'
check 159 "||" '"||"'
# single ops
check 160 "+" '"+"'
check 161 "|" '"|"'
check 162 ";" '";"'
check 163 "(" '"("'
check 164 "!" '"!"'
check 165 "@" '"@"'
# comments
check 170 "block comment alone -> eof" '1'
check 171 "num after block comment" '42'
check 172 "nested comment count" '2'
check 173 "nested comment value" '1'
# compound
check 180 "let x = 1 count" '5'
check 181 "let is keyword" '"keyword"'
check 182 "let value" '"let"'
check 183 "x is ident" '"ident"'
check 184 "= value" '"="'
check 185 "1 value" '1'
check 190 "match expr count" '13'
check 191 "fun -> arrow value" '"->"'
check 192 "fun -> arrow type" '"op"'
check 193 "Some is ctor" '"ctor"'
check 194 "first |> value" '"|>"'
check 195 "ref assign :=" '":="'
# Report the tally; the script's exit status is 0 only when nothing failed.
TOTAL=$((PASS + FAIL))
if (( FAIL > 0 )); then
  echo "FAIL $PASS/$TOTAL passed, $FAIL failed:"
  echo ""
  echo "$ERRORS"
else
  echo "ok $PASS/$TOTAL OCaml-on-SX tokenizer tests passed"
fi
(( FAIL == 0 ))

View File

@@ -0,0 +1,16 @@
;; lib/ocaml/tests/tokenize.sx — smoke tests for the OCaml tokenizer.
;;
;; Tests are exercised via lib/ocaml/test.sh, which drives sx_server.exe
;; over the epoch protocol. This file provides a small evaluator that
;; returns short diagnostic values for each fixture so the runner can
;; grep them out of one batched run.
;; :type of the i-th token (0-based) produced by tokenizing src.
(define
  ocaml-test-tok-type
  (fn
    (src i)
    (let
      ((tok (nth (ocaml-tokenize src) i)))
      (get tok :type))))
;; :value of the i-th token (0-based) produced by tokenizing src.
(define
  ocaml-test-tok-value
  (fn
    (src i)
    (let
      ((tok (nth (ocaml-tokenize src) i)))
      (get tok :value))))
;; Number of tokens produced for src, including the trailing eof token.
(define
  ocaml-test-tok-count
  (fn
    (src)
    (let
      ((toks (ocaml-tokenize src)))
      (len toks))))

382
lib/ocaml/tokenizer.sx Normal file
View File

@@ -0,0 +1,382 @@
;; lib/ocaml/tokenizer.sx — OCaml lexer.
;;
;; Tokens: ident, ctor (uppercase ident), keyword, number, string, char, tyvar, op, eof.
;; Token shape: {:type :value :pos} via lex-make-token.
;; OCaml is not indentation-sensitive — no layout pass.
;; Block comments (* ... *) nest. There is no line-comment syntax.
;; Shared lexer helpers from lib/guest/lex.sx, referred to below under an
;; "ocaml-" prefix (ocaml-make-token, ocaml-digit?, ocaml-ws?, …).
;; NOTE(review): presumably each (short-name lex-name) pair makes
;; ocaml-<short-name> an alias of <lex-name> — confirm against
;; lib/guest/prefix.sx. The alpha? and alnum? entries are not referenced
;; anywhere in this file.
(prefix-rename
"ocaml-"
(quote
((make-token lex-make-token)
(digit? lex-digit?)
(hex-digit? lex-hex-digit?)
(alpha? lex-alpha?)
(alnum? lex-alnum?)
(ident-start? lex-ident-start?)
(ident-char? lex-ident-char?)
(ws? lex-whitespace?))))
;; Words the tokenizer emits as "keyword" tokens instead of ident/ctor.
;; Includes true/false and the word-shaped infix operators.
(define
  ocaml-keywords
  (list
    "and" "as" "assert" "begin" "class" "constraint" "do" "done" "downto" "else"
    "end" "exception" "external" "false" "for" "fun" "function" "functor" "if" "in"
    "include" "inherit" "initializer" "lazy" "let" "match" "method" "module" "mutable" "new"
    "nonrec" "object" "of" "open" "or" "private" "rec" "sig" "struct" "then"
    "to" "true" "try" "type" "val" "virtual" "when" "while" "with"
    ;; word-shaped infix operators
    "land" "lor" "lxor" "lsl" "lsr" "asr" "mod"))
;; True when name is one of the entries in ocaml-keywords.
(define
  ocaml-keyword?
  (fn (name) (contains? ocaml-keywords name)))
;; True when c is a character in the range "A".."Z"; false for nil.
(define
  ocaml-upper?
  (fn
    (c)
    (if
      (= c nil)
      false
      (and (>= c "A") (<= c "Z")))))
;; ocaml-tokenize: turn the OCaml source string src into a list of tokens.
;; Tokens are built with ocaml-make-token from a type string, a value and the
;; start position; the list always ends with an "eof" token whose value is nil.
;; A character that starts no known token raises via `error`.
;; State shared by the local helpers below: `tokens` (output list, appended
;; to in place), `pos` (cursor into src) and `src-len`.
(define
ocaml-tokenize
(fn
(src)
(let
((tokens (list)) (pos 0) (src-len (len src)))
;; Character `offset` places ahead of the cursor, or nil when that index is
;; at or beyond the end of src.
(define
  ocaml-peek
  (fn
    (offset)
    (let
      ((idx (+ pos offset)))
      (if (>= idx src-len) nil (nth src idx)))))
;; Character under the cursor, or nil at end of input.
(define cur (fn () (ocaml-peek 0)))
;; Move the cursor forward by n characters; no bounds check is performed.
(define advance! (fn (n) (set! pos (+ pos n))))
;; Build a token from (type, value, start position) and append it to `tokens`.
(define
  push!
  (fn
    (type value start)
    (let
      ((tok (ocaml-make-token type value start)))
      (append! tokens tok))))
;; Consume the rest of a block comment whose opening "(*" has already been
;; skipped. `depth` is the current nesting level: each inner "(*" raises it,
;; each "*)" lowers it, and the "*)" seen at depth 1 ends the comment.
;; Reaching end of input inside a comment simply stops.
(define
  skip-block-comment!
  (fn
    (depth)
    (when
      (< pos src-len)
      (let
        ((c0 (cur)) (c1 (ocaml-peek 1)))
        (cond
          ((and (= c0 "*") (= c1 ")"))
            (begin
              (advance! 2)
              (when (> depth 1) (skip-block-comment! (- depth 1)))))
          ((and (= c0 "(") (= c1 "*"))
            (begin (advance! 2) (skip-block-comment! (+ depth 1))))
          (else
            (begin (advance! 1) (skip-block-comment! depth))))))))
;; Advance past any run of whitespace and (* ... *) block comments.
(define
  skip-ws!
  (fn
    ()
    (when
      (< pos src-len)
      (if
        (ocaml-ws? (cur))
        (begin (advance! 1) (skip-ws!))
        (when
          (and (= (cur) "(") (= (ocaml-peek 1) "*"))
          (begin
            (advance! 2)
            (skip-block-comment! 1)
            (skip-ws!)))))))
;; Consume identifier characters and primes (') from the cursor onward and
;; return the source text from `start` up to the new cursor position.
(define
  read-ident
  (fn
    (start)
    (begin
      (when
        (and
          (< pos src-len)
          (or (ocaml-ident-char? (cur)) (= (cur) "'")))
        (begin (advance! 1) (read-ident start)))
      (slice src start pos))))
;; Advance over a run of decimal digits and "_" separators.
(define
  read-decimal-digits!
  (fn
    ()
    (when
      (< pos src-len)
      (let
        ((c (cur)))
        (when
          (or (= c "_") (ocaml-digit? c))
          (begin (advance! 1) (read-decimal-digits!)))))))
;; Advance over a run of hexadecimal digits and "_" separators.
(define
  read-hex-digits!
  (fn
    ()
    (when
      (< pos src-len)
      (let
        ((c (cur)))
        (when
          (or (= c "_") (ocaml-hex-digit? c))
          (begin (advance! 1) (read-hex-digits!)))))))
;; Consume an exponent suffix at the cursor: "e" or "E", an optional sign,
;; then digits. Nothing is consumed unless at least one digit follows, so a
;; bare "e" after a number is left for the next token.
(define
  read-exp-part!
  (fn
    ()
    (when
      (and (< pos src-len) (or (= (cur) "e") (= (cur) "E")))
      (let
        ((nxt (ocaml-peek 1)))
        (cond
          ;; signed exponent: skip "e" and the sign together
          ((and
             (or (= nxt "+") (= nxt "-"))
             (< (+ pos 2) src-len)
             (ocaml-digit? (ocaml-peek 2)))
            (begin (advance! 2) (read-decimal-digits!)))
          ;; unsigned exponent: skip just the "e"
          ((and (not (= nxt nil)) (ocaml-digit? nxt))
            (begin (advance! 1) (read-decimal-digits!)))
          (else nil))))))
;; Return s with every "_" removed (digit-group separators in literals).
(define
  strip-underscores
  (fn
    (s)
    (let
      ((kept (list)) (n (len s)))
      (begin
        ;; Walk s from index i, collecting every character that is not "_".
        (define
          keep-from
          (fn
            (i)
            (when
              (< i n)
              (let
                ((ch (nth s i)))
                (begin
                  (when (not (= ch "_")) (append! kept ch))
                  (keep-from (+ i 1)))))))
        (keep-from 0)
        (join "" kept)))))
;; Read the numeric literal beginning at `start` (the cursor is on its first
;; digit) and return the result of parse-number on its text with "_" removed.
;; Forms: 0x/0X hexadecimal, or decimal digits with an optional fraction and
;; an optional exponent.
(define
  read-number
  (fn
    (start)
    (let
      ((after (ocaml-peek 1)))
      (if
        (and
          (= (cur) "0")
          (< (+ pos 1) src-len)
          (or (= after "x") (= after "X")))
        (begin
          (advance! 2)
          (read-hex-digits!)
          (parse-number
            (str "0x" (strip-underscores (slice src (+ start 2) pos)))))
        (begin
          (read-decimal-digits!)
          ;; A "." starts a fraction unless it is immediately followed by
          ;; another "." — in that case it is left for the punctuation reader.
          (when
            (and
              (< pos src-len)
              (= (cur) ".")
              (or
                (>= (+ pos 1) src-len)
                (not (= (ocaml-peek 1) "."))))
            (begin (advance! 1) (read-decimal-digits!)))
          (read-exp-part!)
          (parse-number (strip-underscores (slice src start pos))))))))
;; Read a double-quoted string literal. The cursor is on the opening quote on
;; entry and just past the closing quote on return. Returns the decoded
;; contents as a string, built by joining the collected pieces.
;; If the input ends before a closing quote, whatever was collected so far is
;; returned without an error.
(define
read-string-literal
(fn
()
(let
((chars (list)))
(begin
;; skip the opening quote
(advance! 1)
(define
loop
(fn
()
(cond
((>= pos src-len) nil)
;; backslash escape: decode the character that follows
((= (cur) "\\")
(begin
(advance! 1)
(when
(< pos src-len)
(let
((ch (cur)))
(begin
(cond
((= ch "n") (append! chars "\n"))
((= ch "t") (append! chars "\t"))
((= ch "r") (append! chars "\r"))
;; NOTE(review): "\\b" looks like a backslash followed by "b" rather
;; than a backspace character — confirm this is intended.
((= ch "b") (append! chars "\\b"))
((= ch "\\") (append! chars "\\"))
((= ch "'") (append! chars "'"))
((= ch "\"") (append! chars "\""))
;; backslash-space contributes nothing to the value
((= ch " ") nil)
;; any other escaped character is kept as-is, without the backslash
(else (append! chars ch)))
(advance! 1))))
(loop)))
;; closing quote: consume it and stop
((= (cur) "\"") (advance! 1))
(else
(begin
(append! chars (cur))
(advance! 1)
(loop))))))
(loop)
(join "" chars)))))
;; Read a character literal. The cursor is on the opening ' on entry. Returns
;; the character value: for a backslash escape the decoded character (n, t, r,
;; \, ', " are translated; any other escaped character is returned as-is),
;; otherwise the single character itself. A closing ' is consumed when present.
;; NOTE(review): the "b" escape yields "\\b", which looks like a backslash
;; followed by "b" rather than a backspace character — confirm.
(define
read-char-literal
(fn
()
(begin
;; skip the opening quote
(advance! 1)
(let
((value (cond ((= (cur) "\\") (begin (advance! 1) (let ((ch (cur))) (begin (advance! 1) (cond ((= ch "n") "\n") ((= ch "t") "\t") ((= ch "r") "\r") ((= ch "b") "\\b") ((= ch "\\") "\\") ((= ch "'") "'") ((= ch "\"") "\"") (else ch)))))) (else (let ((ch (cur))) (begin (advance! 1) ch))))))
(begin
;; consume the closing quote if it is there
(when
(and (< pos src-len) (= (cur) "'"))
(advance! 1))
value)))))
;; Try to read an operator or punctuation token at the cursor.
;; Two-character operators are tested before single characters, so ";;" is
;; never split into two ";" tokens. On a match the cursor is advanced, an
;; "op" token with the operator text is pushed (positioned at `start`) and
;; true is returned; otherwise nothing is consumed and false is returned.
;; Only one character of lookahead is needed (a previously bound but unused
;; second lookahead has been dropped).
(define
  try-punct
  (fn
    (start)
    (let
      ((c (cur)) (c1 (ocaml-peek 1)))
      (cond
        ((and (= c ";") (= c1 ";"))
          (begin (advance! 2) (push! "op" ";;" start) true))
        ((and (= c "-") (= c1 ">"))
          (begin (advance! 2) (push! "op" "->" start) true))
        ((and (= c "<") (= c1 "-"))
          (begin (advance! 2) (push! "op" "<-" start) true))
        ((and (= c ":") (= c1 "="))
          (begin (advance! 2) (push! "op" ":=" start) true))
        ((and (= c ":") (= c1 ":"))
          (begin (advance! 2) (push! "op" "::" start) true))
        ((and (= c "|") (= c1 "|"))
          (begin (advance! 2) (push! "op" "||" start) true))
        ((and (= c "&") (= c1 "&"))
          (begin (advance! 2) (push! "op" "&&" start) true))
        ((and (= c "<") (= c1 "="))
          (begin (advance! 2) (push! "op" "<=" start) true))
        ((and (= c ">") (= c1 "="))
          (begin (advance! 2) (push! "op" ">=" start) true))
        ((and (= c "<") (= c1 ">"))
          (begin (advance! 2) (push! "op" "<>" start) true))
        ((and (= c "=") (= c1 "="))
          (begin (advance! 2) (push! "op" "==" start) true))
        ((and (= c "!") (= c1 "="))
          (begin (advance! 2) (push! "op" "!=" start) true))
        ((and (= c "|") (= c1 ">"))
          (begin (advance! 2) (push! "op" "|>" start) true))
        ((and (= c "<") (= c1 "|"))
          (begin (advance! 2) (push! "op" "<|" start) true))
        ((and (= c "@") (= c1 "@"))
          (begin (advance! 2) (push! "op" "@@" start) true))
        ((and (= c "*") (= c1 "*"))
          (begin (advance! 2) (push! "op" "**" start) true))
        ;; single-character operators and punctuation
        ((or (= c "+") (= c "-") (= c "*") (= c "/") (= c "%") (= c "^") (= c "<") (= c ">") (= c "=") (= c "(") (= c ")") (= c "{") (= c "}") (= c "[") (= c "]") (= c ";") (= c ":") (= c ",") (= c ".") (= c "|") (= c "!") (= c "&") (= c "@") (= c "?") (= c "~") (= c "#"))
          (begin (advance! 1) (push! "op" c start) true))
        (else false)))))
;; Main loop: skip whitespace and comments, read one token at the cursor, push
;; it, then recurse until the input is exhausted. The token kind is chosen
;; from the first character, in this order: identifier-like word, number,
;; string, char literal, type variable, operator/punctuation. Anything else
;; raises an error naming the character and position.
(define
step
(fn
()
(begin
(skip-ws!)
(when
(< pos src-len)
(let
((start pos) (c (cur)))
(cond
;; word: keyword if listed, else ctor when it starts with A-Z, else ident
((ocaml-ident-start? c)
(let
((word (read-ident start)))
(begin
(cond
((ocaml-keyword? word)
(push! "keyword" word start))
((ocaml-upper? c) (push! "ctor" word start))
(else (push! "ident" word start)))
(step))))
((ocaml-digit? c)
(let
((v (read-number start)))
(begin (push! "number" v start) (step))))
((= c "\"")
(let
((s (read-string-literal)))
(begin (push! "string" s start) (step))))
;; char literal: a ' whose closing ' sits two characters ahead (plain
;; character) or three ahead (backslash + one character). Any other ' falls
;; through to the type-variable case below.
((and (= c "'") (< (+ pos 1) src-len) (or (and (= (ocaml-peek 1) "\\") (< (+ pos 3) src-len) (= (ocaml-peek 3) "'")) (and (not (= (ocaml-peek 1) "\\")) (< (+ pos 2) src-len) (= (ocaml-peek 2) "'"))))
(let
((v (read-char-literal)))
(begin (push! "char" v start) (step))))
;; type variable: the token value is the name without the leading quote.
;; NOTE(review): a ' that is not followed by an identifier start still
;; pushes a "tyvar" token, with the empty slice as its value — confirm
;; that this is intended rather than an error.
((= c "'")
(begin
(advance! 1)
(when
(and (< pos src-len) (ocaml-ident-start? (cur)))
(begin
(advance! 1)
(read-ident (+ start 1))))
(push!
"tyvar"
(slice src (+ start 1) pos)
start)
(step)))
;; try-punct pushes the token itself and reports whether it matched
((try-punct start) (step))
(else
(error
(str "ocaml-tokenize: unexpected char " c " at " pos)))))))))
(step)
(push! "eof" nil pos)
tokens)))