ocaml: phase 1 tokenizer (+58 tests) — consumes lib/guest/lex.sx
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 52s
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 52s
Idents, ctors, 56 keywords, numbers (int/float/hex/exp/underscored), strings + chars with escapes, type variables, 42 op/punct tokens (16 two-char + 26 single-char), and nested (* ... *) block comments. Tests via epoch protocol against sx_server.exe.
This commit is contained in:
290
lib/ocaml/test.sh
Executable file
290
lib/ocaml/test.sh
Executable file
@@ -0,0 +1,290 @@
|
||||
#!/usr/bin/env bash
|
||||
# Fast OCaml-on-SX test runner — epoch protocol direct to sx_server.exe.
|
||||
# Mirrors lib/lua/test.sh.
|
||||
#
|
||||
# Usage:
|
||||
# bash lib/ocaml/test.sh # run all tests
|
||||
# bash lib/ocaml/test.sh -v # verbose
|
||||
|
||||
# No `-e` on purpose: failing checks are tallied below instead of aborting.
set -uo pipefail

# Everything that follows (the default SX_SERVER path and the files the epochs
# load) is relative to the repository root, so stop if we cannot get there
# rather than silently running from the caller's directory.
REPO_ROOT="$(git rev-parse --show-toplevel)" || {
  echo "ERROR: not inside a git work tree" >&2
  exit 1
}
cd "$REPO_ROOT" || exit 1

# Server binary: $SX_SERVER if set, else the in-tree build, else the fixed
# absolute checkout path.
SX_SERVER="${SX_SERVER:-hosts/ocaml/_build/default/bin/sx_server.exe}"
if [ ! -x "$SX_SERVER" ]; then
  SX_SERVER="/root/rose-ash/hosts/ocaml/_build/default/bin/sx_server.exe"
fi
if [ ! -x "$SX_SERVER" ]; then
  echo "ERROR: sx_server.exe not found. Run: cd hosts/ocaml && dune build"
  exit 1
fi

VERBOSE="${1:-}"   # "-v" prints one line per passing check
PASS=0
FAIL=0
ERRORS=""
TMPFILE=$(mktemp) || {
  echo "ERROR: mktemp failed" >&2
  exit 1
}
# Single quotes so $TMPFILE is expanded (and quoted) when the trap fires.
trap 'rm -f "$TMPFILE"' EXIT
|
||||
|
||||
cat > "$TMPFILE" << 'EPOCHS'
|
||||
(epoch 1)
|
||||
(load "lib/guest/lex.sx")
|
||||
(load "lib/guest/prefix.sx")
|
||||
(load "lib/ocaml/tokenizer.sx")
|
||||
(load "lib/ocaml/tests/tokenize.sx")
|
||||
|
||||
;; ── empty / eof ────────────────────────────────────────────────
|
||||
(epoch 100)
|
||||
(eval "(ocaml-test-tok-count \"\")")
|
||||
(epoch 101)
|
||||
(eval "(ocaml-test-tok-type \"\" 0)")
|
||||
|
||||
;; ── numbers ────────────────────────────────────────────────────
|
||||
(epoch 110)
|
||||
(eval "(ocaml-test-tok-type \"42\" 0)")
|
||||
(epoch 111)
|
||||
(eval "(ocaml-test-tok-value \"42\" 0)")
|
||||
(epoch 112)
|
||||
(eval "(ocaml-test-tok-value \"3.14\" 0)")
|
||||
(epoch 113)
|
||||
(eval "(ocaml-test-tok-value \"0xff\" 0)")
|
||||
(epoch 114)
|
||||
(eval "(ocaml-test-tok-value \"1e3\" 0)")
|
||||
(epoch 115)
|
||||
(eval "(ocaml-test-tok-value \"1_000_000\" 0)")
|
||||
(epoch 116)
|
||||
(eval "(ocaml-test-tok-value \"3.14e-2\" 0)")
|
||||
|
||||
;; ── identifiers / constructors / keywords ─────────────────────
|
||||
(epoch 120)
|
||||
(eval "(ocaml-test-tok-type \"foo\" 0)")
|
||||
(epoch 121)
|
||||
(eval "(ocaml-test-tok-value \"foo_bar1\" 0)")
|
||||
(epoch 122)
|
||||
(eval "(ocaml-test-tok-type \"Some\" 0)")
|
||||
(epoch 123)
|
||||
(eval "(ocaml-test-tok-value \"Some\" 0)")
|
||||
(epoch 124)
|
||||
(eval "(ocaml-test-tok-type \"let\" 0)")
|
||||
(epoch 125)
|
||||
(eval "(ocaml-test-tok-value \"match\" 0)")
|
||||
(epoch 126)
|
||||
(eval "(ocaml-test-tok-type \"true\" 0)")
|
||||
(epoch 127)
|
||||
(eval "(ocaml-test-tok-value \"false\" 0)")
|
||||
(epoch 128)
|
||||
(eval "(ocaml-test-tok-value \"name'\" 0)")
|
||||
|
||||
;; ── strings ────────────────────────────────────────────────────
|
||||
(epoch 130)
|
||||
(eval "(ocaml-test-tok-type \"\\\"hi\\\"\" 0)")
|
||||
(epoch 131)
|
||||
(eval "(ocaml-test-tok-value \"\\\"hi\\\"\" 0)")
|
||||
(epoch 132)
|
||||
(eval "(ocaml-test-tok-value \"\\\"a\\\\nb\\\"\" 0)")
|
||||
|
||||
;; ── chars ──────────────────────────────────────────────────────
|
||||
(epoch 140)
|
||||
(eval "(ocaml-test-tok-type \"'a'\" 0)")
|
||||
(epoch 141)
|
||||
(eval "(ocaml-test-tok-value \"'a'\" 0)")
|
||||
(epoch 142)
|
||||
(eval "(ocaml-test-tok-value \"'\\\\n'\" 0)")
|
||||
|
||||
;; ── type variables ─────────────────────────────────────────────
|
||||
(epoch 145)
|
||||
(eval "(ocaml-test-tok-type \"'a\" 0)")
|
||||
(epoch 146)
|
||||
(eval "(ocaml-test-tok-value \"'a\" 0)")
|
||||
|
||||
;; ── multi-char operators ───────────────────────────────────────
|
||||
(epoch 150)
|
||||
(eval "(ocaml-test-tok-value \"->\" 0)")
|
||||
(epoch 151)
|
||||
(eval "(ocaml-test-tok-value \"|>\" 0)")
|
||||
(epoch 152)
|
||||
(eval "(ocaml-test-tok-value \"<-\" 0)")
|
||||
(epoch 153)
|
||||
(eval "(ocaml-test-tok-value \":=\" 0)")
|
||||
(epoch 154)
|
||||
(eval "(ocaml-test-tok-value \"::\" 0)")
|
||||
(epoch 155)
|
||||
(eval "(ocaml-test-tok-value \";;\" 0)")
|
||||
(epoch 156)
|
||||
(eval "(ocaml-test-tok-value \"@@\" 0)")
|
||||
(epoch 157)
|
||||
(eval "(ocaml-test-tok-value \"<>\" 0)")
|
||||
(epoch 158)
|
||||
(eval "(ocaml-test-tok-value \"&&\" 0)")
|
||||
(epoch 159)
|
||||
(eval "(ocaml-test-tok-value \"||\" 0)")
|
||||
|
||||
;; ── single-char punctuation ────────────────────────────────────
|
||||
(epoch 160)
|
||||
(eval "(ocaml-test-tok-value \"+\" 0)")
|
||||
(epoch 161)
|
||||
(eval "(ocaml-test-tok-value \"|\" 0)")
|
||||
(epoch 162)
|
||||
(eval "(ocaml-test-tok-value \";\" 0)")
|
||||
(epoch 163)
|
||||
(eval "(ocaml-test-tok-value \"(\" 0)")
|
||||
(epoch 164)
|
||||
(eval "(ocaml-test-tok-value \"!\" 0)")
|
||||
(epoch 165)
|
||||
(eval "(ocaml-test-tok-value \"@\" 0)")
|
||||
|
||||
;; ── comments ───────────────────────────────────────────────────
|
||||
(epoch 170)
|
||||
(eval "(ocaml-test-tok-count \"(* hi *)\")")
|
||||
(epoch 171)
|
||||
(eval "(ocaml-test-tok-value \"(* c *) 42\" 0)")
|
||||
(epoch 172)
|
||||
(eval "(ocaml-test-tok-count \"(* outer (* inner *) end *) 1\")")
|
||||
(epoch 173)
|
||||
(eval "(ocaml-test-tok-value \"(* outer (* inner *) end *) 1\" 0)")
|
||||
|
||||
;; ── compound expressions ───────────────────────────────────────
|
||||
(epoch 180)
|
||||
(eval "(ocaml-test-tok-count \"let x = 1\")")
|
||||
(epoch 181)
|
||||
(eval "(ocaml-test-tok-type \"let x = 1\" 0)")
|
||||
(epoch 182)
|
||||
(eval "(ocaml-test-tok-value \"let x = 1\" 0)")
|
||||
(epoch 183)
|
||||
(eval "(ocaml-test-tok-type \"let x = 1\" 1)")
|
||||
(epoch 184)
|
||||
(eval "(ocaml-test-tok-value \"let x = 1\" 2)")
|
||||
(epoch 185)
|
||||
(eval "(ocaml-test-tok-value \"let x = 1\" 3)")
|
||||
|
||||
(epoch 190)
|
||||
(eval "(ocaml-test-tok-count \"match x with | None -> 0 | Some y -> y\")")
|
||||
(epoch 191)
|
||||
(eval "(ocaml-test-tok-value \"fun x -> x + 1\" 2)")
|
||||
(epoch 192)
|
||||
(eval "(ocaml-test-tok-type \"fun x -> x + 1\" 2)")
|
||||
(epoch 193)
|
||||
(eval "(ocaml-test-tok-type \"Some 42\" 0)")
|
||||
(epoch 194)
|
||||
(eval "(ocaml-test-tok-value \"a |> f |> g\" 1)")
|
||||
(epoch 195)
|
||||
(eval "(ocaml-test-tok-value \"x := !y\" 1)")
|
||||
|
||||
EPOCHS
|
||||
|
||||
OUTPUT=$(timeout 60 "$SX_SERVER" < "$TMPFILE" 2>/dev/null)
|
||||
|
||||
# check EPOCH DESC EXPECTED
#
# Finds the server reply for EPOCH in $OUTPUT and counts a pass when the reply
# payload contains EXPECTED as a fixed substring (grep -F). Replies are looked
# up in this order:
#   1. the line that follows an "(ok-len EPOCH " header,
#   2. an "(ok EPOCH " line,
#   3. an "(error EPOCH " line.
# For (2) and (3) the "(ok EPOCH " / "(error EPOCH " prefix is removed before
# matching; otherwise the digits of the epoch number itself could satisfy a
# short expectation such as '1' or '2'.
#
# Globals read:    OUTPUT, VERBOSE
# Globals updated: PASS, FAIL, ERRORS
check() {
  local epoch="$1" desc="$2" expected="$3"
  local actual prefix

  actual=$(printf '%s\n' "$OUTPUT" | grep -A1 "^(ok-len $epoch " | tail -1)
  if [ -z "$actual" ]; then
    prefix="(ok $epoch "
    actual=$(printf '%s\n' "$OUTPUT" | grep "^(ok $epoch " || true)
    actual="${actual#"$prefix"}"
  fi
  if [ -z "$actual" ]; then
    prefix="(error $epoch "
    actual=$(printf '%s\n' "$OUTPUT" | grep "^(error $epoch " || true)
    actual="${actual#"$prefix"}"
  fi
  if [ -z "$actual" ]; then
    actual="<no output for epoch $epoch>"
  fi

  if printf '%s\n' "$actual" | grep -qF -- "$expected"; then
    PASS=$((PASS + 1))
    if [ "$VERBOSE" = "-v" ]; then
      echo "  ok $desc"
    fi
  else
    FAIL=$((FAIL + 1))
    ERRORS+="  FAIL $desc (epoch $epoch)
    expected: $expected
    actual:   $actual
"
  fi
}
|
||||
|
||||
# empty / eof
|
||||
check 100 "empty tokens length" '1'
|
||||
check 101 "empty first is eof" '"eof"'
|
||||
|
||||
# numbers
|
||||
check 110 "int type" '"number"'
|
||||
check 111 "int value" '42'
|
||||
check 112 "float value" '3.14'
|
||||
check 113 "hex value" '255'
|
||||
check 114 "exponent" '1000'
|
||||
check 115 "underscored int" '1000000'
|
||||
check 116 "neg exponent" '0.0314'
|
||||
|
||||
# idents / ctors / keywords
|
||||
check 120 "ident type" '"ident"'
|
||||
check 121 "ident value" '"foo_bar1"'
|
||||
check 122 "ctor type" '"ctor"'
|
||||
check 123 "ctor value" '"Some"'
|
||||
check 124 "let keyword type" '"keyword"'
|
||||
check 125 "match keyword value" '"match"'
|
||||
check 126 "true is keyword" '"keyword"'
|
||||
check 127 "false value" '"false"'
|
||||
check 128 "primed ident" "\"name'\""
|
||||
|
||||
# strings
|
||||
check 130 "string type" '"string"'
|
||||
check 131 "string value" '"hi"'
|
||||
check 132 "escape sequence" '"a'
|
||||
|
||||
# chars
|
||||
check 140 "char type" '"char"'
|
||||
check 141 "char value" '"a"'
|
||||
check 142 "char escape" '"'
|
||||
|
||||
# tyvars
|
||||
check 145 "tyvar type" '"tyvar"'
|
||||
check 146 "tyvar value" '"a"'
|
||||
|
||||
# multi-char ops
|
||||
check 150 "->" '"->"'
|
||||
check 151 "|>" '"|>"'
|
||||
check 152 "<-" '"<-"'
|
||||
check 153 ":=" '":="'
|
||||
check 154 "::" '"::"'
|
||||
check 155 ";;" '";;"'
|
||||
check 156 "@@" '"@@"'
|
||||
check 157 "<>" '"<>"'
|
||||
check 158 "&&" '"&&"'
|
||||
check 159 "||" '"||"'
|
||||
|
||||
# single ops
|
||||
check 160 "+" '"+"'
|
||||
check 161 "|" '"|"'
|
||||
check 162 ";" '";"'
|
||||
check 163 "(" '"("'
|
||||
check 164 "!" '"!"'
|
||||
check 165 "@" '"@"'
|
||||
|
||||
# comments
|
||||
check 170 "block comment alone -> eof" '1'
|
||||
check 171 "num after block comment" '42'
|
||||
check 172 "nested comment count" '2'
|
||||
check 173 "nested comment value" '1'
|
||||
|
||||
# compound
|
||||
check 180 "let x = 1 count" '5'
|
||||
check 181 "let is keyword" '"keyword"'
|
||||
check 182 "let value" '"let"'
|
||||
check 183 "x is ident" '"ident"'
|
||||
check 184 "= value" '"="'
|
||||
check 185 "1 value" '1'
|
||||
|
||||
check 190 "match expr count" '13'
|
||||
check 191 "fun -> arrow value" '"->"'
|
||||
check 192 "fun -> arrow type" '"op"'
|
||||
check 193 "Some is ctor" '"ctor"'
|
||||
check 194 "first |> value" '"|>"'
|
||||
check 195 "ref assign :=" '":="'
|
||||
|
||||
# Print the tally; when anything failed, also dump the collected diagnostics.
TOTAL=$((PASS + FAIL))
case "$FAIL" in
  0)
    echo "ok $PASS/$TOTAL OCaml-on-SX tokenizer tests passed"
    ;;
  *)
    echo "FAIL $PASS/$TOTAL passed, $FAIL failed:"
    echo ""
    echo "$ERRORS"
    ;;
esac

# Last command decides the script's exit status: success only with no failures.
[ $FAIL -eq 0 ]
|
||||
16
lib/ocaml/tests/tokenize.sx
Normal file
16
lib/ocaml/tests/tokenize.sx
Normal file
@@ -0,0 +1,16 @@
|
||||
;; lib/ocaml/tests/tokenize.sx — smoke tests for the OCaml tokenizer.
|
||||
;;
|
||||
;; Tests are exercised via lib/ocaml/test.sh, which drives sx_server.exe
|
||||
;; over the epoch protocol. This file provides a small evaluator that
|
||||
;; returns short diagnostic values for each fixture so the runner can
|
||||
;; grep them out of one batched run.
|
||||
|
||||
;; :type field of the i-th token produced by tokenizing `src`.
(define ocaml-test-tok-type
  (fn (src i)
    (let ((tok (nth (ocaml-tokenize src) i)))
      (get tok :type))))
|
||||
|
||||
;; :value field of the i-th token produced by tokenizing `src`.
(define ocaml-test-tok-value
  (fn (src i)
    (let ((tok (nth (ocaml-tokenize src) i)))
      (get tok :value))))
|
||||
|
||||
;; Number of tokens produced for `src`; the tokenizer always ends the list
;; with an eof token, so the empty string yields 1.
(define ocaml-test-tok-count
  (fn (src)
    (let ((toks (ocaml-tokenize src)))
      (len toks))))
|
||||
382
lib/ocaml/tokenizer.sx
Normal file
382
lib/ocaml/tokenizer.sx
Normal file
@@ -0,0 +1,382 @@
|
||||
;; lib/ocaml/tokenizer.sx — OCaml lexer.
|
||||
;;
|
||||
;; Tokens: ident, ctor (uppercase ident), keyword, number, string, char, tyvar, op, eof.
|
||||
;; Token shape: {:type :value :pos} via lex-make-token.
|
||||
;; OCaml is not indentation-sensitive — no layout pass.
|
||||
;; Block comments (* ... *) nest. There is no line-comment syntax.
|
||||
|
||||
;; Pull the shared lexer helpers in under ocaml-* names. Each entry pairs a
;; short name with the lex-* helper it refers to.
;; NOTE(review): prefix-rename is defined outside this file; presumably it
;; binds "ocaml-" + short-name to the listed helper — the code below relies on
;; ocaml-make-token, ocaml-digit?, ocaml-hex-digit?, ocaml-ident-start?,
;; ocaml-ident-char? and ocaml-ws? existing after this call.
(prefix-rename
  "ocaml-"
  (quote
    ((make-token   lex-make-token)
     (digit?       lex-digit?)
     (hex-digit?   lex-hex-digit?)
     (alpha?       lex-alpha?)
     (alnum?       lex-alnum?)
     (ident-start? lex-ident-start?)
     (ident-char?  lex-ident-char?)
     (ws?          lex-whitespace?))))
|
||||
|
||||
;; Reserved words. An identifier-shaped lexeme found in this list is emitted
;; as a "keyword" token instead of "ident"/"ctor". The last row holds the
;; word-shaped infix operators.
(define ocaml-keywords
  (list
    "and" "as" "assert" "begin" "class" "constraint" "do" "done"
    "downto" "else" "end" "exception" "external" "false" "for" "fun"
    "function" "functor" "if" "in" "include" "inherit" "initializer" "lazy"
    "let" "match" "method" "module" "mutable" "new" "nonrec" "object"
    "of" "open" "or" "private" "rec" "sig" "struct" "then"
    "to" "true" "try" "type" "val" "virtual" "when" "while"
    "with"
    "land" "lor" "lxor" "lsl" "lsr" "asr" "mod"))
|
||||
|
||||
;; True when `word` appears in ocaml-keywords.
(define ocaml-keyword?
  (fn (word)
    (contains? ocaml-keywords word)))
|
||||
|
||||
;; True when `c` is a character in the range "A".."Z"; nil is never upper-case.
(define ocaml-upper?
  (fn (c)
    (if (= c nil)
      false
      (and (>= c "A") (<= c "Z")))))
|
||||
|
||||
(define
|
||||
ocaml-tokenize
|
||||
(fn
|
||||
(src)
|
||||
(let
|
||||
((tokens (list)) (pos 0) (src-len (len src)))
|
||||
;; Character `offset` positions past the cursor, or nil when that index lies
;; at or beyond the end of src.
(define ocaml-peek
  (fn (offset)
    (let ((idx (+ pos offset)))
      (if (>= idx src-len)
        nil
        (nth src idx)))))
|
||||
;; Character under the cursor (nil once the cursor has reached the end).
(define cur
  (fn ()
    (ocaml-peek 0)))

;; Move the cursor forward by n characters.
(define advance!
  (fn (n)
    (set! pos (+ n pos))))
|
||||
;; Build a token from (type value start) and append it to the output list.
(define push!
  (fn (type value start)
    (let ((tok (ocaml-make-token type value start)))
      (append! tokens tok))))
|
||||
;; Consume the remainder of a block comment whose opener has already been
;; read. `depth` is the number of comments currently open; an inner "(*"
;; raises it and each "*)" lowers it, and scanning stops once the outermost
;; comment closes. Reaching end of input simply stops the scan.
(define skip-block-comment!
  (fn (depth)
    (when (< pos src-len)
      (let ((c0 (cur)) (c1 (ocaml-peek 1)))
        (cond
          ;; closer: leave one level, keep going only if still nested
          ((and (= c0 "*") (= c1 ")"))
            (begin
              (advance! 2)
              (if (> depth 1)
                (skip-block-comment! (- depth 1))
                nil)))
          ;; nested opener
          ((and (= c0 "(") (= c1 "*"))
            (begin
              (advance! 2)
              (skip-block-comment! (+ depth 1))))
          ;; ordinary comment text
          (else
            (begin
              (advance! 1)
              (skip-block-comment! depth))))))))
|
||||
;; Advance past any run of whitespace and block comments, in any mixture.
(define skip-ws!
  (fn ()
    (when (< pos src-len)
      (cond
        ((ocaml-ws? (cur))
          (begin
            (advance! 1)
            (skip-ws!)))
        ;; "(*" opens a comment: skip it at depth 1, then look for more blanks
        ((and (= (cur) "(") (= (ocaml-peek 1) "*"))
          (begin
            (advance! 2)
            (skip-block-comment! 1)
            (skip-ws!)))
        (else nil)))))
|
||||
;; Consume identifier characters and primes (') from the cursor onward and
;; return the text from `start` up to the new cursor position.
(define read-ident
  (fn (start)
    (begin
      (when
        (and
          (< pos src-len)
          (or (ocaml-ident-char? (cur)) (= (cur) "'")))
        (begin
          (advance! 1)
          (read-ident start)))
      (slice src start pos))))
|
||||
;; Advance over a run of decimal digits and "_" separators.
(define read-decimal-digits!
  (fn ()
    (cond
      ((>= pos src-len) nil)
      ((or (= (cur) "_") (ocaml-digit? (cur)))
        (begin
          (advance! 1)
          (read-decimal-digits!)))
      (else nil))))
|
||||
;; Advance over a run of hexadecimal digits and "_" separators.
(define read-hex-digits!
  (fn ()
    (cond
      ((>= pos src-len) nil)
      ((or (= (cur) "_") (ocaml-hex-digit? (cur)))
        (begin
          (advance! 1)
          (read-hex-digits!)))
      (else nil))))
|
||||
;; Consume an exponent suffix (e / E, optional sign, digits) if one starts at
;; the cursor. Nothing is consumed unless a digit follows the marker (or the
;; sign), so in "1e" or "1e+" the "e" is left for the next token.
(define read-exp-part!
  (fn ()
    (let ((marker (cur)) (next (ocaml-peek 1)))
      (let ((signed? (or (= next "+") (= next "-"))))
        (when
          (and
            (or (= marker "e") (= marker "E"))
            (if signed?
              (and (< (+ pos 2) src-len) (ocaml-digit? (ocaml-peek 2)))
              (and (not (= next nil)) (ocaml-digit? next))))
          (begin
            ;; step over the marker, and over the sign when there is one
            (advance! (if signed? 2 1))
            (read-decimal-digits!)))))))
|
||||
;; Return `s` with every "_" removed (digit-group separators in numerals).
(define strip-underscores
  (fn (s)
    (let ((kept (list)) (total (len s)))
      (begin
        (define keep-from!
          (fn (k)
            (when (< k total)
              (let ((ch (nth s k)))
                (begin
                  (if (= ch "_")
                    nil
                    (append! kept ch))
                  (keep-from! (+ k 1)))))))
        (keep-from! 0)
        (join "" kept)))))
|
||||
;; Read the numeric literal that begins at `start` (the cursor is on its first
;; digit) and return the value produced by parse-number.
;;   0x / 0X prefix -> hex digits, parsed from "0x" + digits
;;   otherwise      -> digits [ "." digits ] [ exponent ]
;; "_" separators are removed before parsing. A "." that is immediately
;; followed by another "." is not treated as a fraction point.
(define read-number
  (fn (start)
    (let ((marker (ocaml-peek 1)))
      (if (and (= (cur) "0") (or (= marker "x") (= marker "X")))
        (begin
          (advance! 2)
          (read-hex-digits!)
          (parse-number
            (str "0x" (strip-underscores (slice src (+ start 2) pos)))))
        (begin
          (read-decimal-digits!)
          (when
            (and (= (cur) ".") (not (= (ocaml-peek 1) ".")))
            (begin
              (advance! 1)
              (read-decimal-digits!)))
          (read-exp-part!)
          (parse-number (strip-underscores (slice src start pos))))))))
|
||||
;; Read a double-quoted string literal. The cursor is on the opening quote;
;; on return it is just past the closing quote (or at end of input when the
;; literal is unterminated). Returns the decoded contents.
;;
;; Escapes handled: \n \t \r \\ \' \" and "\ " (a space). A backslash
;; directly followed by a newline is a line continuation: the newline and
;; the spaces/tabs that start the next line are skipped. Any other escaped
;; character is kept as-is.
;; NOTE(review): \b yields the two characters backslash + b, not a backspace
;; character — left unchanged here; confirm whether SX can express char 8.
(define read-string-literal
  (fn ()
    (let ((chars (list)))
      (begin
        (advance! 1) ; opening quote
        ;; skip spaces and tabs (used after a backslash-newline)
        (define skip-blanks!
          (fn ()
            (when
              (and (< pos src-len) (or (= (cur) " ") (= (cur) "\t")))
              (begin
                (advance! 1)
                (skip-blanks!)))))
        (define loop
          (fn ()
            (cond
              ((>= pos src-len) nil)
              ((= (cur) "\\")
                (begin
                  (advance! 1)
                  ;; a trailing backslash at end of input is dropped
                  (when (< pos src-len)
                    (let ((ch (cur)))
                      (begin
                        (advance! 1)
                        (cond
                          ((= ch "n") (append! chars "\n"))
                          ((= ch "t") (append! chars "\t"))
                          ((= ch "r") (append! chars "\r"))
                          ((= ch "b") (append! chars "\\b"))
                          ((= ch "\\") (append! chars "\\"))
                          ((= ch "'") (append! chars "'"))
                          ((= ch "\"") (append! chars "\""))
                          ;; "\ " denotes a space character
                          ((= ch " ") (append! chars " "))
                          ;; backslash-newline: line continuation
                          ((= ch "\n") (skip-blanks!))
                          (else (append! chars ch))))))
                  (loop)))
              ;; closing quote ends the literal
              ((= (cur) "\"") (advance! 1))
              (else
                (begin
                  (append! chars (cur))
                  (advance! 1)
                  (loop))))))
        (loop)
        (join "" chars)))))
|
||||
;; Read a character literal. The cursor is on the opening quote; the caller
;; has already checked that a closing quote sits in the expected place.
;; Returns the decoded character as a one-character string (for \b, the
;; two-character text backslash + b).
(define read-char-literal
  (fn ()
    (begin
      (advance! 1) ; opening quote
      (let ((escaped? (= (cur) "\\")))
        (begin
          (when escaped? (advance! 1))
          (let ((ch (cur)))
            (begin
              (advance! 1)
              ;; closing quote, when present
              (when (= (cur) "'") (advance! 1))
              (if escaped?
                (cond
                  ((= ch "n") "\n")
                  ((= ch "t") "\t")
                  ((= ch "r") "\r")
                  ((= ch "b") "\\b")
                  ((= ch "\\") "\\")
                  ((= ch "'") "'")
                  ((= ch "\"") "\"")
                  (else ch))
                ch))))))))
|
||||
;; Two-character operators, tried before any single-character token so that
;; e.g. "->" is never split into "-" and ">".
(define two-char-ops
  (list
    ";;" "->" "<-" ":=" "::" "||" "&&" "<=" ">=" "<>" "==" "!=" "|>" "<|"
    "@@" "**"))

;; Single-character operators and punctuation.
(define one-char-ops
  (list
    "+" "-" "*" "/" "%" "^" "<" ">" "=" "(" ")" "{" "}" "[" "]" ";" ":"
    "," "." "|" "!" "&" "@" "?" "~" "#"))

;; Try to read an operator/punctuation token at the cursor. On success push
;; an "op" token whose value is the operator text, advance past it, and
;; return true; otherwise leave the cursor untouched and return false.
(define try-punct
  (fn (start)
    (let ((c (cur)) (c1 (ocaml-peek 1)))
      ;; only form a two-character candidate when a second character exists
      (let ((pair (if (= c1 nil) nil (str c c1))))
        (cond
          ((and (not (= pair nil)) (contains? two-char-ops pair))
            (begin
              (advance! 2)
              (push! "op" pair start)
              true))
          ((contains? one-char-ops c)
            (begin
              (advance! 1)
              (push! "op" c start)
              true))
          (else false))))))
|
||||
;; Main loop: skip blanks/comments, read one token at the cursor, push it,
;; and repeat until the input is exhausted. The first character decides the
;; token class, tried in this order: identifier/keyword/constructor, number,
;; string, char literal, type variable, operator. Anything else is an error.
(define step
  (fn ()
    (begin
      (skip-ws!)
      (when (< pos src-len)
        (let ((start pos) (c (cur)) (p1 (ocaml-peek 1)))
          ;; A quote opens a char literal only when the closing quote sits
          ;; where expected: 'x' (offset 2) or '\x' (offset 3). Otherwise a
          ;; leading quote is treated as a type variable.
          (let ((char-lit?
                  (and
                    (= c "'")
                    (not (= p1 nil))
                    (if (= p1 "\\")
                      (= (ocaml-peek 3) "'")
                      (= (ocaml-peek 2) "'")))))
            (cond
              ((ocaml-ident-start? c)
                (let ((word (read-ident start)))
                  (begin
                    (push!
                      (cond
                        ((ocaml-keyword? word) "keyword")
                        ((ocaml-upper? c) "ctor")
                        (else "ident"))
                      word
                      start)
                    (step))))
              ((ocaml-digit? c)
                (begin
                  (push! "number" (read-number start) start)
                  (step)))
              ((= c "\"")
                (begin
                  (push! "string" (read-string-literal) start)
                  (step)))
              (char-lit?
                (begin
                  (push! "char" (read-char-literal) start)
                  (step)))
              ((= c "'")
                (begin
                  (advance! 1)
                  (when
                    (and (< pos src-len) (ocaml-ident-start? (cur)))
                    (begin
                      (advance! 1)
                      (read-ident (+ start 1))))
                  ;; value is the name without the leading quote
                  (push! "tyvar" (slice src (+ start 1) pos) start)
                  (step)))
              ((try-punct start) (step))
              (else
                (error
                  (str "ocaml-tokenize: unexpected char " c " at " pos))))))))))
|
||||
(step)
|
||||
(push! "eof" nil pos)
|
||||
tokens)))
|
||||
@@ -116,14 +116,15 @@ SX CEK evaluator (both JS and OCaml hosts)
|
||||
|
||||
### Phase 1 — Tokenizer + parser
|
||||
|
||||
- [ ] **Tokenizer:** keywords (`let`, `rec`, `in`, `fun`, `function`, `match`, `with`,
|
||||
- [x] **Tokenizer:** keywords (`let`, `rec`, `in`, `fun`, `function`, `match`, `with`,
|
||||
`type`, `of`, `module`, `struct`, `end`, `functor`, `sig`, `open`, `include`,
|
||||
`if`, `then`, `else`, `begin`, `try`, `exception`, `raise`, `mutable`,
|
||||
`for`, `while`, `do`, `done`, `and`, `as`, `when`), operators (`->`, `|>`,
|
||||
`<|`, `@@`, `@`, `:=`, `!`, `::`, `**`, `:`, `;`, `;;`), identifiers (lower,
|
||||
upper/ctor, labels `~label:`, optional `?label:`), char literals `'c'`,
|
||||
string literals (escaped + heredoc `{|...|}`), int/float literals,
|
||||
line comments `(*` nested block comments `*)`.
|
||||
upper/ctor), char literals `'c'`, string literals (escaped),
|
||||
int/float literals (incl. hex, exponent, underscores), nested block
|
||||
comments `(* ... *)`. _(labels `~label:` / `?label:` and heredoc `{|...|}`
|
||||
deferred — surface tokens already work via `~`/`?` punct + `{`/`|` punct.)_
|
||||
- [ ] **Parser:** top-level `let`/`let rec`/`type`/`module`/`exception`/`open`/`include`
|
||||
declarations; expressions: literals, identifiers, constructor application,
|
||||
lambda, application (left-assoc), binary ops with precedence table,
|
||||
@@ -308,7 +309,12 @@ the "mother tongue" closure: OCaml → SX → OCaml. This means:
|
||||
|
||||
_Newest first._
|
||||
|
||||
_(awaiting phase 1)_
|
||||
- 2026-05-07 Phase 1 — `lib/ocaml/tokenizer.sx` consuming `lib/guest/lex.sx`
|
||||
via `prefix-rename`. Covers idents, ctors, 56 keywords, numbers (int / float
|
||||
/ hex / exponent / underscored), strings (with escapes), chars (with escapes),
|
||||
type variables (`'a`), nested block comments, and 42 operator/punct tokens
|
||||
(incl. `->` `|>` `<-` `:=` `::` `;;` `@@` `<>` `&&` `||` `**` etc.). 58/58
|
||||
tokenizer tests pass via `lib/ocaml/test.sh` driving `sx_server.exe`.
|
||||
|
||||
## Blockers
|
||||
|
||||
|
||||
Reference in New Issue
Block a user