From 85b7fed4fc170f4ef086d6faad97bee1dbc2577b Mon Sep 17 00:00:00 2001 From: giles Date: Thu, 7 May 2026 23:04:40 +0000 Subject: [PATCH] =?UTF-8?q?ocaml:=20phase=201=20tokenizer=20(+58=20tests)?= =?UTF-8?q?=20=E2=80=94=20consumes=20lib/guest/lex.sx?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Idents, ctors, 56 keywords, numbers (int/float/hex/exp/underscored), strings + chars with escapes, type variables, 42 op/punct tokens, and nested (* ... *) block comments. Tests via epoch protocol against sx_server.exe. --- lib/ocaml/test.sh | 290 +++++++++++++++++++++++++++ lib/ocaml/tests/tokenize.sx | 16 ++ lib/ocaml/tokenizer.sx | 382 ++++++++++++++++++++++++++++++++++++ plans/ocaml-on-sx.md | 16 +- 4 files changed, 699 insertions(+), 5 deletions(-) create mode 100755 lib/ocaml/test.sh create mode 100644 lib/ocaml/tests/tokenize.sx create mode 100644 lib/ocaml/tokenizer.sx diff --git a/lib/ocaml/test.sh b/lib/ocaml/test.sh new file mode 100755 index 00000000..2750fa16 --- /dev/null +++ b/lib/ocaml/test.sh @@ -0,0 +1,290 @@ +#!/usr/bin/env bash +# Fast OCaml-on-SX test runner — epoch protocol direct to sx_server.exe. +# Mirrors lib/lua/test.sh. +# +# Usage: +# bash lib/ocaml/test.sh # run all tests +# bash lib/ocaml/test.sh -v # verbose + +set -uo pipefail +cd "$(git rev-parse --show-toplevel)" + +SX_SERVER="${SX_SERVER:-hosts/ocaml/_build/default/bin/sx_server.exe}" +if [ ! -x "$SX_SERVER" ]; then + SX_SERVER="/root/rose-ash/hosts/ocaml/_build/default/bin/sx_server.exe" +fi +if [ ! -x "$SX_SERVER" ]; then + echo "ERROR: sx_server.exe not found. 
Run: cd hosts/ocaml && dune build" + exit 1 +fi + +VERBOSE="${1:-}" +PASS=0 +FAIL=0 +ERRORS="" +TMPFILE=$(mktemp) +trap "rm -f $TMPFILE" EXIT + +cat > "$TMPFILE" << 'EPOCHS' +(epoch 1) +(load "lib/guest/lex.sx") +(load "lib/guest/prefix.sx") +(load "lib/ocaml/tokenizer.sx") +(load "lib/ocaml/tests/tokenize.sx") + +;; ── empty / eof ──────────────────────────────────────────────── +(epoch 100) +(eval "(ocaml-test-tok-count \"\")") +(epoch 101) +(eval "(ocaml-test-tok-type \"\" 0)") + +;; ── numbers ──────────────────────────────────────────────────── +(epoch 110) +(eval "(ocaml-test-tok-type \"42\" 0)") +(epoch 111) +(eval "(ocaml-test-tok-value \"42\" 0)") +(epoch 112) +(eval "(ocaml-test-tok-value \"3.14\" 0)") +(epoch 113) +(eval "(ocaml-test-tok-value \"0xff\" 0)") +(epoch 114) +(eval "(ocaml-test-tok-value \"1e3\" 0)") +(epoch 115) +(eval "(ocaml-test-tok-value \"1_000_000\" 0)") +(epoch 116) +(eval "(ocaml-test-tok-value \"3.14e-2\" 0)") + +;; ── identifiers / constructors / keywords ───────────────────── +(epoch 120) +(eval "(ocaml-test-tok-type \"foo\" 0)") +(epoch 121) +(eval "(ocaml-test-tok-value \"foo_bar1\" 0)") +(epoch 122) +(eval "(ocaml-test-tok-type \"Some\" 0)") +(epoch 123) +(eval "(ocaml-test-tok-value \"Some\" 0)") +(epoch 124) +(eval "(ocaml-test-tok-type \"let\" 0)") +(epoch 125) +(eval "(ocaml-test-tok-value \"match\" 0)") +(epoch 126) +(eval "(ocaml-test-tok-type \"true\" 0)") +(epoch 127) +(eval "(ocaml-test-tok-value \"false\" 0)") +(epoch 128) +(eval "(ocaml-test-tok-value \"name'\" 0)") + +;; ── strings ──────────────────────────────────────────────────── +(epoch 130) +(eval "(ocaml-test-tok-type \"\\\"hi\\\"\" 0)") +(epoch 131) +(eval "(ocaml-test-tok-value \"\\\"hi\\\"\" 0)") +(epoch 132) +(eval "(ocaml-test-tok-value \"\\\"a\\\\nb\\\"\" 0)") + +;; ── chars ────────────────────────────────────────────────────── +(epoch 140) +(eval "(ocaml-test-tok-type \"'a'\" 0)") +(epoch 141) +(eval "(ocaml-test-tok-value \"'a'\" 0)") +(epoch 142) 
+(eval "(ocaml-test-tok-value \"'\\\\n'\" 0)") + +;; ── type variables ───────────────────────────────────────────── +(epoch 145) +(eval "(ocaml-test-tok-type \"'a\" 0)") +(epoch 146) +(eval "(ocaml-test-tok-value \"'a\" 0)") + +;; ── multi-char operators ─────────────────────────────────────── +(epoch 150) +(eval "(ocaml-test-tok-value \"->\" 0)") +(epoch 151) +(eval "(ocaml-test-tok-value \"|>\" 0)") +(epoch 152) +(eval "(ocaml-test-tok-value \"<-\" 0)") +(epoch 153) +(eval "(ocaml-test-tok-value \":=\" 0)") +(epoch 154) +(eval "(ocaml-test-tok-value \"::\" 0)") +(epoch 155) +(eval "(ocaml-test-tok-value \";;\" 0)") +(epoch 156) +(eval "(ocaml-test-tok-value \"@@\" 0)") +(epoch 157) +(eval "(ocaml-test-tok-value \"<>\" 0)") +(epoch 158) +(eval "(ocaml-test-tok-value \"&&\" 0)") +(epoch 159) +(eval "(ocaml-test-tok-value \"||\" 0)") + +;; ── single-char punctuation ──────────────────────────────────── +(epoch 160) +(eval "(ocaml-test-tok-value \"+\" 0)") +(epoch 161) +(eval "(ocaml-test-tok-value \"|\" 0)") +(epoch 162) +(eval "(ocaml-test-tok-value \";\" 0)") +(epoch 163) +(eval "(ocaml-test-tok-value \"(\" 0)") +(epoch 164) +(eval "(ocaml-test-tok-value \"!\" 0)") +(epoch 165) +(eval "(ocaml-test-tok-value \"@\" 0)") + +;; ── comments ─────────────────────────────────────────────────── +(epoch 170) +(eval "(ocaml-test-tok-count \"(* hi *)\")") +(epoch 171) +(eval "(ocaml-test-tok-value \"(* c *) 42\" 0)") +(epoch 172) +(eval "(ocaml-test-tok-count \"(* outer (* inner *) end *) 1\")") +(epoch 173) +(eval "(ocaml-test-tok-value \"(* outer (* inner *) end *) 1\" 0)") + +;; ── compound expressions ─────────────────────────────────────── +(epoch 180) +(eval "(ocaml-test-tok-count \"let x = 1\")") +(epoch 181) +(eval "(ocaml-test-tok-type \"let x = 1\" 0)") +(epoch 182) +(eval "(ocaml-test-tok-value \"let x = 1\" 0)") +(epoch 183) +(eval "(ocaml-test-tok-type \"let x = 1\" 1)") +(epoch 184) +(eval "(ocaml-test-tok-value \"let x = 1\" 2)") +(epoch 185) +(eval 
"(ocaml-test-tok-value \"let x = 1\" 3)") + +(epoch 190) +(eval "(ocaml-test-tok-count \"match x with | None -> 0 | Some y -> y\")") +(epoch 191) +(eval "(ocaml-test-tok-value \"fun x -> x + 1\" 2)") +(epoch 192) +(eval "(ocaml-test-tok-type \"fun x -> x + 1\" 2)") +(epoch 193) +(eval "(ocaml-test-tok-type \"Some 42\" 0)") +(epoch 194) +(eval "(ocaml-test-tok-value \"a |> f |> g\" 1)") +(epoch 195) +(eval "(ocaml-test-tok-value \"x := !y\" 1)") + +EPOCHS + +OUTPUT=$(timeout 60 "$SX_SERVER" < "$TMPFILE" 2>/dev/null) +# check EPOCH DESC EXPECTED: pass when the epoch's response payload contains EXPECTED (fixed-string match). +check() { + local epoch="$1" desc="$2" expected="$3" + local actual + actual=$(echo "$OUTPUT" | grep -A1 "^(ok-len $epoch " | tail -1) + if [ -z "$actual" ]; then + actual=$(echo "$OUTPUT" | sed -n "s/^(ok $epoch //p") + fi + if [ -z "$actual" ]; then + actual=$(echo "$OUTPUT" | sed -n "s/^(error $epoch /(error /p") + fi + + # Match on the payload only: the "(ok N " / "(error N " prefix is dropped + # above so digits of the epoch number can never satisfy a numeric + # expectation (e.g. expected '1' against epoch 100 or 172). 
+ if echo "$actual" | grep -qF -- "$expected"; then + PASS=$((PASS + 1)) + [ "$VERBOSE" = "-v" ] && echo " ok $desc" + else + FAIL=$((FAIL + 1)) + ERRORS+=" FAIL $desc (epoch $epoch) + expected: $expected + actual: $actual +" + fi +} + +# empty / eof +check 100 "empty tokens length" '1' +check 101 "empty first is eof" '"eof"' + +# numbers +check 110 "int type" '"number"' +check 111 "int value" '42' +check 112 "float value" '3.14' +check 113 "hex value" '255' +check 114 "exponent" '1000' +check 115 "underscored int" '1000000' +check 116 "neg exponent" '0.0314' + +# idents / ctors / keywords +check 120 "ident type" '"ident"' +check 121 "ident value" '"foo_bar1"' +check 122 "ctor type" '"ctor"' +check 123 "ctor value" '"Some"' +check 124 "let keyword type" '"keyword"' +check 125 "match keyword value" '"match"' +check 126 "true is keyword" '"keyword"' +check 127 "false value" '"false"' +check 128 "primed ident" "\"name'\"" + +# strings +check 130 "string type" '"string"' +check 131 "string value" '"hi"' +check 132 "escape sequence" '"a' + +# chars +check 140 "char type" '"char"' +check 141 "char 
value" '"a"' +check 142 "char escape" '"' + +# tyvars +check 145 "tyvar type" '"tyvar"' +check 146 "tyvar value" '"a"' + +# multi-char ops +check 150 "->" '"->"' +check 151 "|>" '"|>"' +check 152 "<-" '"<-"' +check 153 ":=" '":="' +check 154 "::" '"::"' +check 155 ";;" '";;"' +check 156 "@@" '"@@"' +check 157 "<>" '"<>"' +check 158 "&&" '"&&"' +check 159 "||" '"||"' + +# single ops +check 160 "+" '"+"' +check 161 "|" '"|"' +check 162 ";" '";"' +check 163 "(" '"("' +check 164 "!" '"!"' +check 165 "@" '"@"' + +# comments +check 170 "block comment alone -> eof" '1' +check 171 "num after block comment" '42' +check 172 "nested comment count" '2' +check 173 "nested comment value" '1' + +# compound +check 180 "let x = 1 count" '5' +check 181 "let is keyword" '"keyword"' +check 182 "let value" '"let"' +check 183 "x is ident" '"ident"' +check 184 "= value" '"="' +check 185 "1 value" '1' + +check 190 "match expr count" '13' +check 191 "fun -> arrow value" '"->"' +check 192 "fun -> arrow type" '"op"' +check 193 "Some is ctor" '"ctor"' +check 194 "first |> value" '"|>"' +check 195 "ref assign :=" '":="' + +TOTAL=$((PASS + FAIL)) +if [ $FAIL -eq 0 ]; then + echo "ok $PASS/$TOTAL OCaml-on-SX tokenizer tests passed" +else + echo "FAIL $PASS/$TOTAL passed, $FAIL failed:" + echo "" + echo "$ERRORS" +fi + +[ $FAIL -eq 0 ] diff --git a/lib/ocaml/tests/tokenize.sx b/lib/ocaml/tests/tokenize.sx new file mode 100644 index 00000000..cdf8955a --- /dev/null +++ b/lib/ocaml/tests/tokenize.sx @@ -0,0 +1,16 @@ +;; lib/ocaml/tests/tokenize.sx — smoke tests for the OCaml tokenizer. +;; +;; Tests are exercised via lib/ocaml/test.sh, which drives sx_server.exe +;; over the epoch protocol. This file provides a small evaluator that +;; returns short diagnostic values for each fixture so the runner can +;; grep them out of one batched run. 
+ +(define + ocaml-test-tok-type + (fn (src i) (get (nth (ocaml-tokenize src) i) :type))) + +(define + ocaml-test-tok-value + (fn (src i) (get (nth (ocaml-tokenize src) i) :value))) + +(define ocaml-test-tok-count (fn (src) (len (ocaml-tokenize src)))) diff --git a/lib/ocaml/tokenizer.sx b/lib/ocaml/tokenizer.sx new file mode 100644 index 00000000..d3882aab --- /dev/null +++ b/lib/ocaml/tokenizer.sx @@ -0,0 +1,382 @@ +;; lib/ocaml/tokenizer.sx — OCaml lexer. +;; +;; Tokens: ident, ctor (uppercase ident), keyword, number, string, char, tyvar, op, eof. +;; Token shape: {:type :value :pos} via lex-make-token. +;; OCaml is not indentation-sensitive — no layout pass. +;; Block comments (* ... *) nest. There is no line-comment syntax. + +(prefix-rename + "ocaml-" + (quote + ((make-token lex-make-token) + (digit? lex-digit?) + (hex-digit? lex-hex-digit?) + (alpha? lex-alpha?) + (alnum? lex-alnum?) + (ident-start? lex-ident-start?) + (ident-char? lex-ident-char?) + (ws? lex-whitespace?)))) + +(define + ocaml-keywords + (list + "and" + "as" + "assert" + "begin" + "class" + "constraint" + "do" + "done" + "downto" + "else" + "end" + "exception" + "external" + "false" + "for" + "fun" + "function" + "functor" + "if" + "in" + "include" + "inherit" + "initializer" + "lazy" + "let" + "match" + "method" + "module" + "mutable" + "new" + "nonrec" + "object" + "of" + "open" + "or" + "private" + "rec" + "sig" + "struct" + "then" + "to" + "true" + "try" + "type" + "val" + "virtual" + "when" + "while" + "with" + "land" + "lor" + "lxor" + "lsl" + "lsr" + "asr" + "mod")) + +(define ocaml-keyword? (fn (word) (contains? ocaml-keywords word))) + +(define + ocaml-upper? + (fn (c) (and (not (= c nil)) (>= c "A") (<= c "Z")))) + +(define + ocaml-tokenize + (fn + (src) + (let + ((tokens (list)) (pos 0) (src-len (len src))) + (define + ocaml-peek + (fn + (offset) + (if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil))) + (define cur (fn () (ocaml-peek 0))) + (define advance! (fn (n) (set! 
pos (+ pos n)))) + (define + push! + (fn + (type value start) + (append! tokens (ocaml-make-token type value start)))) + (define + skip-block-comment! + (fn + (depth) + (cond + ((>= pos src-len) nil) + ((and (= (cur) "*") (= (ocaml-peek 1) ")")) + (begin + (advance! 2) + (when + (> depth 1) + (skip-block-comment! (- depth 1))))) + ((and (= (cur) "(") (= (ocaml-peek 1) "*")) + (begin + (advance! 2) + (skip-block-comment! (+ depth 1)))) + (else (begin (advance! 1) (skip-block-comment! depth)))))) + (define + skip-ws! + (fn + () + (cond + ((>= pos src-len) nil) + ((ocaml-ws? (cur)) (begin (advance! 1) (skip-ws!))) + ((and (= (cur) "(") (= (ocaml-peek 1) "*")) + (begin + (advance! 2) + (skip-block-comment! 1) + (skip-ws!))) + (else nil)))) + (define + read-ident + (fn + (start) + (begin + (when + (and (< pos src-len) (ocaml-ident-char? (cur))) + (begin (advance! 1) (read-ident start))) + (when + (and (< pos src-len) (= (cur) "'")) + (begin (advance! 1) (read-ident start))) + (slice src start pos)))) + (define + read-decimal-digits! + (fn + () + (when + (and (< pos src-len) (or (ocaml-digit? (cur)) (= (cur) "_"))) + (begin (advance! 1) (read-decimal-digits!))))) + (define + read-hex-digits! + (fn + () + (when + (and + (< pos src-len) + (or (ocaml-hex-digit? (cur)) (= (cur) "_"))) + (begin (advance! 1) (read-hex-digits!))))) + (define + read-exp-part! + (fn + () + (when + (and (< pos src-len) (or (= (cur) "e") (= (cur) "E"))) + (let + ((p1 (ocaml-peek 1))) + (when + (or + (and (not (= p1 nil)) (ocaml-digit? p1)) + (and + (or (= p1 "+") (= p1 "-")) + (< (+ pos 2) src-len) + (ocaml-digit? (ocaml-peek 2)))) + (begin + (advance! 1) + (when + (and + (< pos src-len) + (or (= (cur) "+") (= (cur) "-"))) + (advance! 1)) + (read-decimal-digits!))))))) + (define + strip-underscores + (fn + (s) + (let + ((out (list)) (i 0) (n (len s))) + (begin + (define + loop + (fn + () + (when + (< i n) + (begin + (when + (not (= (nth s i) "_")) + (append! out (nth s i))) + (set! 
i (+ i 1)) + (loop))))) + (loop) + (join "" out))))) + (define + read-number + (fn + (start) + (cond + ((and (= (cur) "0") (< (+ pos 1) src-len) (or (= (ocaml-peek 1) "x") (= (ocaml-peek 1) "X"))) + (begin + (advance! 2) + (read-hex-digits!) + (let + ((raw (slice src (+ start 2) pos))) + (parse-number (str "0x" (strip-underscores raw)))))) + (else + (begin + (read-decimal-digits!) + (when + (and + (< pos src-len) + (= (cur) ".") + (or + (>= (+ pos 1) src-len) + (not (= (ocaml-peek 1) ".")))) + (begin (advance! 1) (read-decimal-digits!))) + (read-exp-part!) + (parse-number (strip-underscores (slice src start pos)))))))) + (define + read-string-literal + (fn + () + (let + ((chars (list))) + (begin + (advance! 1) + (define + loop + (fn + () + (cond + ((>= pos src-len) nil) + ((= (cur) "\\") + (begin + (advance! 1) + (when + (< pos src-len) + (let + ((ch (cur))) + (begin + (cond + ((= ch "n") (append! chars "\n")) + ((= ch "t") (append! chars "\t")) + ((= ch "r") (append! chars "\r")) + ((= ch "b") (append! chars "\\b")) + ((= ch "\\") (append! chars "\\")) + ((= ch "'") (append! chars "'")) + ((= ch "\"") (append! chars "\"")) + ((= ch " ") nil) + (else (append! chars ch))) + (advance! 1)))) + (loop))) + ((= (cur) "\"") (advance! 1)) + (else + (begin + (append! chars (cur)) + (advance! 1) + (loop)))))) + (loop) + (join "" chars))))) + (define + read-char-literal + (fn + () + (begin + (advance! 1) + (let + ((value (cond ((= (cur) "\\") (begin (advance! 1) (let ((ch (cur))) (begin (advance! 1) (cond ((= ch "n") "\n") ((= ch "t") "\t") ((= ch "r") "\r") ((= ch "b") "\\b") ((= ch "\\") "\\") ((= ch "'") "'") ((= ch "\"") "\"") (else ch)))))) (else (let ((ch (cur))) (begin (advance! 1) ch)))))) + (begin + (when + (and (< pos src-len) (= (cur) "'")) + (advance! 1)) + value))))) + (define + try-punct + (fn + (start) + (let + ((c (cur)) + (c1 (ocaml-peek 1)) + (c2 (ocaml-peek 2))) + (cond + ((and (= c ";") (= c1 ";")) + (begin (advance! 2) (push! 
"op" ";;" start) true)) + ((and (= c "-") (= c1 ">")) + (begin (advance! 2) (push! "op" "->" start) true)) + ((and (= c "<") (= c1 "-")) + (begin (advance! 2) (push! "op" "<-" start) true)) + ((and (= c ":") (= c1 "=")) + (begin (advance! 2) (push! "op" ":=" start) true)) + ((and (= c ":") (= c1 ":")) + (begin (advance! 2) (push! "op" "::" start) true)) + ((and (= c "|") (= c1 "|")) + (begin (advance! 2) (push! "op" "||" start) true)) + ((and (= c "&") (= c1 "&")) + (begin (advance! 2) (push! "op" "&&" start) true)) + ((and (= c "<") (= c1 "=")) + (begin (advance! 2) (push! "op" "<=" start) true)) + ((and (= c ">") (= c1 "=")) + (begin (advance! 2) (push! "op" ">=" start) true)) + ((and (= c "<") (= c1 ">")) + (begin (advance! 2) (push! "op" "<>" start) true)) + ((and (= c "=") (= c1 "=")) + (begin (advance! 2) (push! "op" "==" start) true)) + ((and (= c "!") (= c1 "=")) + (begin (advance! 2) (push! "op" "!=" start) true)) + ((and (= c "|") (= c1 ">")) + (begin (advance! 2) (push! "op" "|>" start) true)) + ((and (= c "<") (= c1 "|")) + (begin (advance! 2) (push! "op" "<|" start) true)) + ((and (= c "@") (= c1 "@")) + (begin (advance! 2) (push! "op" "@@" start) true)) + ((and (= c "*") (= c1 "*")) + (begin (advance! 2) (push! "op" "**" start) true)) + ((or (= c "+") (= c "-") (= c "*") (= c "/") (= c "%") (= c "^") (= c "<") (= c ">") (= c "=") (= c "(") (= c ")") (= c "{") (= c "}") (= c "[") (= c "]") (= c ";") (= c ":") (= c ",") (= c ".") (= c "|") (= c "!") (= c "&") (= c "@") (= c "?") (= c "~") (= c "#")) + (begin (advance! 1) (push! "op" c start) true)) + (else false))))) + (define + step + (fn + () + (begin + (skip-ws!) + (when + (< pos src-len) + (let + ((start pos) (c (cur))) + (cond + ((ocaml-ident-start? c) + (let + ((word (read-ident start))) + (begin + (cond + ((ocaml-keyword? word) + (push! "keyword" word start)) + ((ocaml-upper? c) (push! "ctor" word start)) + (else (push! "ident" word start))) + (step)))) + ((ocaml-digit? 
c) + (let + ((v (read-number start))) + (begin (push! "number" v start) (step)))) + ((= c "\"") + (let + ((s (read-string-literal))) + (begin (push! "string" s start) (step)))) + ((and (= c "'") (< (+ pos 1) src-len) (or (and (= (ocaml-peek 1) "\\") (< (+ pos 3) src-len) (= (ocaml-peek 3) "'")) (and (not (= (ocaml-peek 1) "\\")) (< (+ pos 2) src-len) (= (ocaml-peek 2) "'")))) + (let + ((v (read-char-literal))) + (begin (push! "char" v start) (step)))) + ((= c "'") + (begin + (advance! 1) + (when + (and (< pos src-len) (ocaml-ident-start? (cur))) + (begin + (advance! 1) + (read-ident (+ start 1)))) + (push! + "tyvar" + (slice src (+ start 1) pos) + start) + (step))) + ((try-punct start) (step)) + (else + (error + (str "ocaml-tokenize: unexpected char " c " at " pos))))))))) + (step) + (push! "eof" nil pos) + tokens))) diff --git a/plans/ocaml-on-sx.md b/plans/ocaml-on-sx.md index 7db06023..e16759aa 100644 --- a/plans/ocaml-on-sx.md +++ b/plans/ocaml-on-sx.md @@ -116,14 +116,15 @@ SX CEK evaluator (both JS and OCaml hosts) ### Phase 1 — Tokenizer + parser -- [ ] **Tokenizer:** keywords (`let`, `rec`, `in`, `fun`, `function`, `match`, `with`, +- [x] **Tokenizer:** keywords (`let`, `rec`, `in`, `fun`, `function`, `match`, `with`, `type`, `of`, `module`, `struct`, `end`, `functor`, `sig`, `open`, `include`, `if`, `then`, `else`, `begin`, `try`, `exception`, `raise`, `mutable`, `for`, `while`, `do`, `done`, `and`, `as`, `when`), operators (`->`, `|>`, `<|`, `@@`, `@`, `:=`, `!`, `::`, `**`, `:`, `;`, `;;`), identifiers (lower, - upper/ctor, labels `~label:`, optional `?label:`), char literals `'c'`, - string literals (escaped + heredoc `{|...|}`), int/float literals, - line comments `(*` nested block comments `*)`. + upper/ctor), char literals `'c'`, string literals (escaped), + int/float literals (incl. hex, exponent, underscores), nested block + comments `(* ... *)`. 
_(labels `~label:` / `?label:` and heredoc `{|...|}` + deferred — surface tokens already work via `~`/`?` punct + `{`/`|` punct.)_ - [ ] **Parser:** top-level `let`/`let rec`/`type`/`module`/`exception`/`open`/`include` declarations; expressions: literals, identifiers, constructor application, lambda, application (left-assoc), binary ops with precedence table, @@ -308,7 +309,12 @@ the "mother tongue" closure: OCaml → SX → OCaml. This means: _Newest first._ -_(awaiting phase 1)_ +- 2026-05-07 Phase 1 — `lib/ocaml/tokenizer.sx` consuming `lib/guest/lex.sx` + via `prefix-rename`. Covers idents, ctors, 56 keywords, numbers (int / float + / hex / exponent / underscored), strings (with escapes), chars (with escapes), + type variables (`'a`), nested block comments, and 42 operator/punct tokens + (incl. `->` `|>` `<-` `:=` `::` `;;` `@@` `<>` `&&` `||` `**` etc.). 58/58 + tokenizer tests pass via `lib/ocaml/test.sh` driving `sx_server.exe`. ## Blockers