smalltalk: tokenizer + 63 tests
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Has been cancelled
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Has been cancelled
This commit is contained in:
99
lib/smalltalk/test.sh
Executable file
99
lib/smalltalk/test.sh
Executable file
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env bash
# Fast Smalltalk-on-SX test runner — pipes directly to sx_server.exe.
# Mirrors lib/haskell/test.sh.
#
# Usage:
#   bash lib/smalltalk/test.sh                     # run all tests
#   bash lib/smalltalk/test.sh -v                  # verbose
#   bash lib/smalltalk/test.sh tests/tokenize.sx   # run one file
#
# A test-file argument is looked up relative to the repository root, then
# relative to the directory the script was started from, then relative to
# lib/smalltalk/ (so the short form shown above works).
#
# Exit status: 0 when no test failed, non-zero otherwise.

# No `-e`: failures of the server and of grep are handled explicitly below.
set -uo pipefail

ORIG_PWD=$PWD
REPO_ROOT=$(git rev-parse --show-toplevel) || {
  echo "ERROR: not inside a git work tree" >&2
  exit 1
}
cd "$REPO_ROOT" || exit 1

SX_SERVER="hosts/ocaml/_build/default/bin/sx_server.exe"
if [[ ! -x "$SX_SERVER" ]]; then
  # In a linked worktree the binary may only exist in the main checkout,
  # which `git worktree list` reports first. The porcelain format keeps
  # paths that contain spaces intact.
  MAIN_ROOT=$(git worktree list --porcelain | sed -n '1s/^worktree //p')
  if [[ -x "$MAIN_ROOT/$SX_SERVER" ]]; then
    SX_SERVER="$MAIN_ROOT/$SX_SERVER"
  else
    echo "ERROR: sx_server.exe not found. Run: cd hosts/ocaml && dune build" >&2
    exit 1
  fi
fi

VERBOSE=""
FILES=()
for arg in "$@"; do
  case "$arg" in
    -v|--verbose) VERBOSE=1 ;;
    *) FILES+=("$arg") ;;
  esac
done

if (( ${#FILES[@]} == 0 )); then
  mapfile -t FILES < <(find lib/smalltalk/tests -maxdepth 2 -name '*.sx' | sort)
fi

# One scratch file is reused for every server run and removed on any exit path.
TMPFILE=$(mktemp) || {
  echo "ERROR: mktemp failed" >&2
  exit 1
}
trap 'rm -f -- "$TMPFILE"' EXIT

# Shape of the summary value printed by the server: "(PASS FAIL)".
SUMMARY_RE='^\(([0-9]+) ([0-9]+)\)'

#######################################
# Print the first existing location of a test file.
# Globals:   ORIG_PWD (read)
# Arguments: $1 - path as given on the command line
# Outputs:   resolved path on stdout
# Returns:   0 if found, 1 otherwise
#######################################
resolve_test_file() {
  local arg=$1 candidate
  for candidate in "$arg" "$ORIG_PWD/$arg" "lib/smalltalk/$arg"; do
    if [[ -f "$candidate" ]]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 1
}

#######################################
# Run one sx_server session: load the tokenizer, load a test file, then
# evaluate an expression in epoch 3.
# Globals:   TMPFILE, SX_SERVER (read)
# Arguments: $1 - test file to load, $2 - SX expression to evaluate
# Outputs:   the server's combined stdout/stderr
# Returns:   exit status of `timeout`/the server
#######################################
run_sx() {
  local file=$1 expr=$2
  cat > "$TMPFILE" <<EPOCHS
(epoch 1)
(load "lib/smalltalk/tokenizer.sx")
(epoch 2)
(load "$file")
(epoch 3)
(eval "$expr")
EPOCHS
  timeout 60 "$SX_SERVER" < "$TMPFILE" 2>&1
}

TOTAL_PASS=0
TOTAL_FAIL=0
FAILED_FILES=()

# The ${arr[@]+...} form keeps `set -u` quiet on an empty array (bash < 4.4).
for ARG in ${FILES[@]+"${FILES[@]}"}; do
  FILE=$(resolve_test_file "$ARG") || { echo "skip $ARG (not found)"; continue; }

  # A crash or timeout is detected below through the missing summary line.
  OUTPUT=$(run_sx "$FILE" "(list st-test-pass st-test-fail)" || true)

  # The value is either on the line after "(ok-len 3 …)" or inline as
  # "(ok 3 (P F))".
  LINE=$(printf '%s\n' "$OUTPUT" | awk '/^\(ok-len 3 / {getline; print; exit}')
  if [[ -z "$LINE" ]]; then
    LINE=$(printf '%s\n' "$OUTPUT" | grep -E '^\(ok 3 \([0-9]+ [0-9]+\)\)' | tail -1 \
      | sed -E 's/^\(ok 3 //; s/\)$//')
  fi

  # Anything that is not "(P F)" counts as one failure for this file.
  if [[ ! "$LINE" =~ $SUMMARY_RE ]]; then
    echo "X $FILE: could not extract summary"
    printf '%s\n' "$OUTPUT" | tail -30
    TOTAL_FAIL=$((TOTAL_FAIL + 1))
    FAILED_FILES+=("$FILE")
    continue
  fi
  P=${BASH_REMATCH[1]}
  F=${BASH_REMATCH[2]}
  TOTAL_PASS=$((TOTAL_PASS + P))
  TOTAL_FAIL=$((TOTAL_FAIL + F))

  if (( F > 0 )); then
    FAILED_FILES+=("$FILE")
    printf 'X %-40s %d/%d\n' "$FILE" "$P" "$((P+F))"
    # Second session just to list the names of the failing tests.
    FAILS=$(run_sx "$FILE" "(map (fn (f) (get f :name)) st-test-fails)" \
      | grep -E '^\(ok 3 ' || true)
    echo " $FAILS"
  elif [[ "$VERBOSE" == "1" ]]; then
    printf 'OK %-40s %d passed\n' "$FILE" "$P"
  fi
done

TOTAL=$((TOTAL_PASS + TOTAL_FAIL))
if (( TOTAL_FAIL == 0 )); then
  echo "OK $TOTAL_PASS/$TOTAL smalltalk-on-sx tests passed"
else
  echo "FAIL $TOTAL_PASS/$TOTAL passed, $TOTAL_FAIL failed in: ${FAILED_FILES[*]}"
fi

# The final status doubles as the script's exit code.
(( TOTAL_FAIL == 0 ))
|
||||
362
lib/smalltalk/tests/tokenize.sx
Normal file
362
lib/smalltalk/tests/tokenize.sx
Normal file
@@ -0,0 +1,362 @@
|
||||
;; Smalltalk tokenizer tests.
|
||||
;;
|
||||
;; Lightweight runner: each test checks actual vs expected with structural
|
||||
;; equality and accumulates pass/fail counters. Final summary read by
|
||||
;; lib/smalltalk/test.sh.
|
||||
|
||||
;; Structural equality used by the test runner.
;;   - values that are already `=` are equal;
;;   - two dicts are equal when they have the same number of keys and every
;;     key of `a` exists in `b` with a deep-equal value;
;;   - two lists are equal when they have the same length and are pairwise
;;     deep-equal;
;;   - anything else is unequal.
(define
  st-deep=?
  (fn
    (a b)
    (cond
      ((= a b) true)
      ((and (dict? a) (dict? b))
        (let
          ((a-keys (keys a)) (b-keys (keys b)))
          (if
            (= (len a-keys) (len b-keys))
            (every?
              (fn (k) (and (has-key? b k) (st-deep=? (get a k) (get b k))))
              a-keys)
            false)))
      ((and (list? a) (list? b))
        (let
          ((n (len a)))
          (begin
            ;; True when elements i..n-1 of both lists match; stops at the
            ;; first mismatch.
            (define
              same-from?
              (fn
                (i)
                (if
                  (< i n)
                  (if
                    (st-deep=? (nth a i) (nth b i))
                    (same-from? (+ i 1))
                    false)
                  true)))
            (if (= n (len b)) (same-from? 0) false))))
      (:else false))))
|
||||
|
||||
;; Runner state: pass/fail counters plus one record per failed test.
;; lib/smalltalk/test.sh reads all three after loading this file.
(define st-test-pass 0)
(define st-test-fail 0)
(define st-test-fails (list))

;; Record the outcome of one test.
;;   name     — label reported for a failure
;;   actual   — value produced by the code under test
;;   expected — value it should be deep-equal to (see st-deep=?)
(define
  st-test
  (fn
    (name actual expected)
    (if
      (not (st-deep=? actual expected))
      (begin
        (set! st-test-fail (+ st-test-fail 1))
        (append! st-test-fails {:actual actual :expected expected :name name}))
      (set! st-test-pass (+ st-test-pass 1)))))
|
||||
|
||||
;; Tokenize `src`, drop every "eof" token, and reduce each remaining token
;; to a dict holding only :type and :value (positions are left out so the
;; expectations below stay short).
(define
  st-toks
  (fn
    (src)
    (let
      ((real-tokens
         (filter
           (fn (t) (not (= (get t :type) "eof")))
           (st-tokenize src))))
      (map
        (fn (t) {:type (get t :type) :value (get t :value)})
        real-tokens))))
|
||||
|
||||
;; ── 1. Whitespace / empty ──
;; Input without any token yields an empty list once eof is stripped.
(st-test "empty input" (st-toks "") (list))
(st-test "all whitespace" (st-toks " \t\n ") (list))

;; ── 2. Identifiers ──
(st-test "lowercase ident" (st-toks "foo") (list {:type "ident" :value "foo"}))
(st-test "capitalised ident" (st-toks "Foo") (list {:type "ident" :value "Foo"}))
(st-test "underscore ident" (st-toks "_x") (list {:type "ident" :value "_x"}))
(st-test "digits in ident" (st-toks "foo123") (list {:type "ident" :value "foo123"}))
(st-test "two idents separated" (st-toks "foo bar")
  (list {:type "ident" :value "foo"} {:type "ident" :value "bar"}))

;; ── 3. Keyword selectors ──
;; A keyword token keeps its trailing colon; `:=` never turns the preceding
;; identifier into a keyword.
(st-test "keyword selector" (st-toks "foo:") (list {:type "keyword" :value "foo:"}))
(st-test "keyword call" (st-toks "x at: 1")
  (list
    {:type "ident" :value "x"}
    {:type "keyword" :value "at:"}
    {:type "number" :value 1}))
(st-test "two-keyword chain stays separate" (st-toks "at: 1 put: 2")
  (list
    {:type "keyword" :value "at:"}
    {:type "number" :value 1}
    {:type "keyword" :value "put:"}
    {:type "number" :value 2}))
(st-test "ident then assign — not a keyword" (st-toks "x := 1")
  (list
    {:type "ident" :value "x"}
    {:type "assign" :value ":="}
    {:type "number" :value 1}))

;; ── 4. Numbers ──
;; Plain literals carry a numeric :value; radix literals carry a dict with
;; the base, the digit text and the computed value.
(st-test "integer" (st-toks "42") (list {:type "number" :value 42}))
(st-test "float" (st-toks "3.14") (list {:type "number" :value 3.14}))
(st-test "hex radix" (st-toks "16rFF")
  (list
    {:type "number" :value {:radix 16 :digits "FF" :value 255 :kind "radix"}}))
(st-test "binary radix" (st-toks "2r1011")
  (list
    {:type "number" :value {:radix 2 :digits "1011" :value 11 :kind "radix"}}))
(st-test "exponent" (st-toks "1e3") (list {:type "number" :value 1000}))
(st-test "negative exponent (parser handles minus)" (st-toks "1.5e-2")
  (list {:type "number" :value 0.015}))
|
||||
|
||||
;; ── 5. Strings ──
;; The :value is the string content: quotes removed, '' collapsed to '.
(st-test "simple string" (st-toks "'hi'") (list {:type "string" :value "hi"}))
(st-test "empty string" (st-toks "''") (list {:type "string" :value ""}))
(st-test "doubled-quote escape" (st-toks "'a''b'") (list {:type "string" :value "a'b"}))

;; ── 6. Characters ──
;; `$` takes the very next character, whatever it is.
(st-test "char literal letter" (st-toks "$a") (list {:type "char" :value "a"}))
(st-test "char literal punct" (st-toks "$$") (list {:type "char" :value "$"}))
(st-test "char literal space" (st-toks "$ ") (list {:type "char" :value " "}))

;; ── 7. Symbols ──
;; The :value never includes the leading `#`.
(st-test "symbol ident" (st-toks "#foo") (list {:type "symbol" :value "foo"}))
(st-test "symbol binary" (st-toks "#+") (list {:type "symbol" :value "+"}))
(st-test "symbol arrow" (st-toks "#->") (list {:type "symbol" :value "->"}))
(st-test "symbol keyword chain" (st-toks "#at:put:")
  (list {:type "symbol" :value "at:put:"}))
(st-test "quoted symbol with spaces" (st-toks "#'foo bar'")
  (list {:type "symbol" :value "foo bar"}))

;; ── 8. Literal arrays / byte arrays ──
;; Only the opener is a dedicated token; the contents are ordinary tokens.
(st-test "literal array open" (st-toks "#(1 2)")
  (list
    {:type "array-open" :value "#("}
    {:type "number" :value 1}
    {:type "number" :value 2}
    {:type "rparen" :value ")"}))
(st-test "byte array open" (st-toks "#[1 2 3]")
  (list
    {:type "byte-array-open" :value "#["}
    {:type "number" :value 1}
    {:type "number" :value 2}
    {:type "number" :value 3}
    {:type "rbracket" :value "]"}))
|
||||
|
||||
;; ── 9. Binary selectors ──
;; Adjacent binary characters form a single token.
(st-test "plus" (st-toks "+") (list {:type "binary" :value "+"}))
(st-test "minus" (st-toks "-") (list {:type "binary" :value "-"}))
(st-test "star" (st-toks "*") (list {:type "binary" :value "*"}))
(st-test "double-equal" (st-toks "==") (list {:type "binary" :value "=="}))
(st-test "leq" (st-toks "<=") (list {:type "binary" :value "<="}))
(st-test "geq" (st-toks ">=") (list {:type "binary" :value ">="}))
(st-test "neq" (st-toks "~=") (list {:type "binary" :value "~="}))
(st-test "arrow" (st-toks "->") (list {:type "binary" :value "->"}))
(st-test "comma" (st-toks ",") (list {:type "binary" :value ","}))
(st-test "binary in expression" (st-toks "a + b")
  (list
    {:type "ident" :value "a"}
    {:type "binary" :value "+"}
    {:type "ident" :value "b"}))

;; ── 10. Punctuation ──
(st-test "lparen" (st-toks "(") (list {:type "lparen" :value "("}))
(st-test "rparen" (st-toks ")") (list {:type "rparen" :value ")"}))
(st-test "lbracket" (st-toks "[") (list {:type "lbracket" :value "["}))
(st-test "rbracket" (st-toks "]") (list {:type "rbracket" :value "]"}))
(st-test "lbrace" (st-toks "{") (list {:type "lbrace" :value "{"}))
(st-test "rbrace" (st-toks "}") (list {:type "rbrace" :value "}"}))
(st-test "period" (st-toks ".") (list {:type "period" :value "."}))
(st-test "semi" (st-toks ";") (list {:type "semi" :value ";"}))
(st-test "bar" (st-toks "|") (list {:type "bar" :value "|"}))
(st-test "caret" (st-toks "^") (list {:type "caret" :value "^"}))
(st-test "bang" (st-toks "!") (list {:type "bang" :value "!"}))
(st-test "colon" (st-toks ":") (list {:type "colon" :value ":"}))
(st-test "assign" (st-toks ":=") (list {:type "assign" :value ":="}))

;; ── 11. Comments ──
;; Double-quoted comments produce no token, wherever they appear.
(st-test "comment skipped" (st-toks "\"hello\"") (list))
(st-test "comment between tokens" (st-toks "a \"comment\" b")
  (list {:type "ident" :value "a"} {:type "ident" :value "b"}))
(st-test "multi-line comment" (st-toks "\"line1\nline2\"42")
  (list {:type "number" :value 42}))

;; ── 12. Compound expressions ──
(st-test "block with params" (st-toks "[:a :b | a + b]")
  (list
    {:type "lbracket" :value "["}
    {:type "colon" :value ":"}
    {:type "ident" :value "a"}
    {:type "colon" :value ":"}
    {:type "ident" :value "b"}
    {:type "bar" :value "|"}
    {:type "ident" :value "a"}
    {:type "binary" :value "+"}
    {:type "ident" :value "b"}
    {:type "rbracket" :value "]"}))
(st-test "cascade" (st-toks "x m1; m2")
  (list
    {:type "ident" :value "x"}
    {:type "ident" :value "m1"}
    {:type "semi" :value ";"}
    {:type "ident" :value "m2"}))
(st-test "method body return" (st-toks "^ self foo")
  (list
    {:type "caret" :value "^"}
    {:type "ident" :value "self"}
    {:type "ident" :value "foo"}))
(st-test "class declaration head" (st-toks "Object subclass: #Foo")
  (list
    {:type "ident" :value "Object"}
    {:type "keyword" :value "subclass:"}
    {:type "symbol" :value "Foo"}))
(st-test "temp declaration" (st-toks "| t1 t2 |")
  (list
    {:type "bar" :value "|"}
    {:type "ident" :value "t1"}
    {:type "ident" :value "t2"}
    {:type "bar" :value "|"}))
(st-test "chunk separator" (st-toks "Foo bar !")
  (list
    {:type "ident" :value "Foo"}
    {:type "ident" :value "bar"}
    {:type "bang" :value "!"}))
(st-test "keyword call with binary precedence" (st-toks "x foo: 1 + 2")
  (list
    {:type "ident" :value "x"}
    {:type "keyword" :value "foo:"}
    {:type "number" :value 1}
    {:type "binary" :value "+"}
    {:type "number" :value 2}))

;; Value of the file: the (pass fail) pair.
(list st-test-pass st-test-fail)
|
||||
366
lib/smalltalk/tokenizer.sx
Normal file
366
lib/smalltalk/tokenizer.sx
Normal file
@@ -0,0 +1,366 @@
|
||||
;; Smalltalk tokenizer.
|
||||
;;
|
||||
;; Token types:
|
||||
;; ident identifier (foo, Foo, _x)
|
||||
;; keyword selector keyword (foo:) — value is "foo:" with the colon
|
||||
;; binary binary selector chars run together (+, ==, ->, <=, ~=, ...)
|
||||
;; number integer or float; radix integers like 16rFF supported
|
||||
;; string 'hello''world' style
|
||||
;; char $c
|
||||
;; symbol #foo, #foo:bar:, #+, #'with spaces'
|
||||
;; array-open #(
|
||||
;; byte-array-open #[
|
||||
;; lparen rparen lbracket rbracket lbrace rbrace
|
||||
;; period semi bar caret colon assign bang
|
||||
;; eof
|
||||
;;
|
||||
;; Comments "…" are skipped.
|
||||
|
||||
;; Build a token dict: `type` is the token-type string, `value` its payload,
;; `pos` the offset in the source where the token starts.
(define
  st-make-token
  (fn
    (type value pos)
    {:type type :value value :pos pos}))
|
||||
|
||||
;; True when `c` is one of "0".."9"; nil (end of input) is not a digit.
(define
  st-digit?
  (fn
    (c)
    (if (= c nil) false (and (>= c "0") (<= c "9")))))
|
||||
|
||||
;; True when `c` is an ASCII letter (a-z or A-Z); nil is not a letter.
(define
  st-letter?
  (fn
    (c)
    (cond
      ((= c nil) false)
      ((and (>= c "a") (<= c "z")) true)
      (else (and (>= c "A") (<= c "Z"))))))
|
||||
|
||||
;; True when `c` may begin an identifier: a letter or an underscore.
(define
  st-ident-start?
  (fn
    (c)
    (if (st-letter? c) true (= c "_"))))
|
||||
|
||||
;; True when `c` may continue an identifier: letter, underscore or digit.
(define
  st-ident-char?
  (fn
    (c)
    (if (st-ident-start? c) true (st-digit? c))))
|
||||
|
||||
;; True when `c` is a space, tab, newline or carriage return.
(define
  st-ws?
  (fn
    (c)
    (cond
      ((= c " ") true)
      ((= c "\t") true)
      ((= c "\n") true)
      (else (= c "\r")))))
|
||||
|
||||
;; Characters that can appear in a binary selector.
(define
  st-binary-chars
  (list
    "+" "-" "*" "/" "\\" "~" "<"
    ">" "=" "@" "%" "&" "?" ","))

;; True when `c` is one of st-binary-chars; nil is rejected before the
;; membership test.
(define
  st-binary-char?
  (fn
    (c)
    (if (= c nil) false (contains? st-binary-chars c))))
|
||||
|
||||
;; True when `c` can be a digit of a radix literal: "0".."9" or "A".."Z".
;; Lower-case letters and nil are rejected. The digit is not checked against
;; any particular base here.
(define
  st-radix-digit?
  (fn
    (c)
    (cond
      ((= c nil) false)
      ((and (>= c "0") (<= c "9")) true)
      (else (and (>= c "A") (<= c "Z"))))))
|
||||
|
||||
;; Tokenize Smalltalk source text.
;;
;;   src — the source as a string.
;;
;; Returns the list of token dicts built by st-make-token, in source order,
;; always ending with an "eof" token whose :value is nil.
;;
;; Raises (via `error`) for a `$` at end of input, for a `#` that starts
;; neither a symbol nor an array literal, and for any character no rule
;; accepts.
;;
;; Whitespace and "…" comments are skipped. An unterminated comment or
;; string literal is not reported; it simply extends to the end of input.
(define
  st-tokenize
  (fn
    (src)
    (let
      ((tokens (list)) (pos 0) (src-len (len src)))
      ;; Character `offset` places ahead of the cursor, or nil past the end.
      (define
        pk
        (fn
          (offset)
          (if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil)))
      (define cur (fn () (pk 0)))
      (define advance! (fn (n) (set! pos (+ pos n))))
      (define
        push!
        (fn
          (type value start)
          (append! tokens (st-make-token type value start))))
      ;; Called with the cursor just past the opening quote of a comment;
      ;; consumes up to and including the closing quote.
      (define
        skip-comment!
        (fn
          ()
          (cond
            ((>= pos src-len) nil)
            ((= (cur) "\"") (advance! 1))
            (else (begin (advance! 1) (skip-comment!))))))
      ;; Skip any mix of whitespace and comments.
      (define
        skip-ws!
        (fn
          ()
          (cond
            ((>= pos src-len) nil)
            ((st-ws? (cur)) (begin (advance! 1) (skip-ws!)))
            ((= (cur) "\"") (begin (advance! 1) (skip-comment!) (skip-ws!)))
            (else nil))))
      (define
        read-ident-chars!
        (fn
          ()
          (when
            (and (< pos src-len) (st-ident-char? (cur)))
            (begin (advance! 1) (read-ident-chars!)))))
      (define
        read-decimal-digits!
        (fn
          ()
          (when
            (and (< pos src-len) (st-digit? (cur)))
            (begin (advance! 1) (read-decimal-digits!)))))
      (define
        read-radix-digits!
        (fn
          ()
          (when
            (and (< pos src-len) (st-radix-digit? (cur)))
            (begin (advance! 1) (read-radix-digits!)))))
      ;; Consume an exponent (e/E, optional sign, digits), but only when at
      ;; least one digit follows; otherwise the `e` is left for the next token.
      (define
        read-exp-part!
        (fn
          ()
          (when
            (and
              (< pos src-len)
              (or (= (cur) "e") (= (cur) "E"))
              (let
                ((p1 (pk 1)) (p2 (pk 2)))
                (or
                  (st-digit? p1)
                  (and (or (= p1 "+") (= p1 "-")) (st-digit? p2)))))
            (begin
              (advance! 1)
              (when
                (and (< pos src-len) (or (= (cur) "+") (= (cur) "-")))
                (advance! 1))
              (read-decimal-digits!)))))
      ;; Read a number starting at `start` (cursor on its first digit).
      ;; Returns a plain number, or for a radix literal a dict
      ;; {:radix :digits :value :kind "radix"}.
      (define
        read-number
        (fn
          (start)
          (begin
            (read-decimal-digits!)
            (cond
              ;; Radix literal. The `r` only counts when a radix digit follows
              ;; it; otherwise the digits read so far are a complete integer
              ;; and the `r` begins the next token.
              ((and
                 (< pos src-len)
                 (= (cur) "r")
                 (st-radix-digit? (pk 1)))
                (let
                  ((base-str (slice src start pos)))
                  (begin
                    (advance! 1)
                    (let
                      ((rstart pos))
                      (begin
                        (read-radix-digits!)
                        (let
                          ((digits (slice src rstart pos)))
                          {:radix (parse-number base-str)
                           :digits digits
                           :value (parse-radix base-str digits)
                           :kind "radix"}))))))
              ;; Fraction: the `.` must be followed by a digit, so a
              ;; statement-ending period is not swallowed.
              ((and
                 (< pos src-len)
                 (= (cur) ".")
                 (st-digit? (pk 1)))
                (begin
                  (advance! 1)
                  (read-decimal-digits!)
                  (read-exp-part!)
                  (parse-number (slice src start pos))))
              (else
                (begin
                  (read-exp-part!)
                  (parse-number (slice src start pos))))))))
      ;; Value of the digit string `digits` in the base written in `base-str`.
      ;; "0".."9" map to 0..9 and "A".."Z" to 10..35; any other character
      ;; contributes 0.
      (define
        parse-radix
        (fn
          (base-str digits)
          (let
            ((base (parse-number base-str))
             (n-len (len digits))
             (idx 0)
             (acc 0))
            (begin
              (define
                rd-loop
                (fn
                  ()
                  (when
                    (< idx n-len)
                    (let
                      ((c (nth digits idx)))
                      (let
                        ((d (cond
                              ((and (>= c "0") (<= c "9")) (- (char-code c) 48))
                              ((and (>= c "A") (<= c "Z")) (- (char-code c) 55))
                              (else 0))))
                        (begin
                          (set! acc (+ (* acc base) d))
                          (set! idx (+ idx 1))
                          (rd-loop)))))))
              (rd-loop)
              acc))))
      ;; Cursor on the opening quote. Returns the string content with each
      ;; doubled quote collapsed to one; the cursor ends after the closing
      ;; quote.
      (define
        read-string
        (fn
          ()
          (let
            ((chars (list)))
            (begin
              (advance! 1)
              (define
                loop
                (fn
                  ()
                  (cond
                    ((>= pos src-len) nil)
                    ((= (cur) "'")
                      (cond
                        ((= (pk 1) "'")
                          (begin
                            (append! chars "'")
                            (advance! 2)
                            (loop)))
                        (else (advance! 1))))
                    (else
                      (begin (append! chars (cur)) (advance! 1) (loop))))))
              (loop)
              (join "" chars)))))
      ;; Consume a run of binary-selector characters and return its text
      ;; (the empty string when the cursor is not on one).
      (define
        read-binary-run!
        (fn
          ()
          (let
            ((start pos))
            (begin
              (define
                bin-loop
                (fn
                  ()
                  (when
                    (and (< pos src-len) (st-binary-char? (cur)))
                    (begin (advance! 1) (bin-loop)))))
              (bin-loop)
              (slice src start pos)))))
      ;; Cursor just past `#`; `start` is the offset of the `#` itself.
      ;; Pushes one "symbol" token.
      (define
        read-symbol
        (fn
          (start)
          (cond
            ;; Quoted symbol: #'whatever'
            ((= (cur) "'")
              (let ((s (read-string))) (push! "symbol" s start)))
            ;; Binary-char symbol: #+, #==, #->, #|
            ((or (st-binary-char? (cur)) (= (cur) "|"))
              (let
                ((b (read-binary-run!)))
                (cond
                  ((= b "")
                    ;; lone | wasn't binary; consume it
                    (begin (advance! 1) (push! "symbol" "|" start)))
                  (else (push! "symbol" b start)))))
            ;; Identifier or keyword chain: #foo, #foo:bar:
            ((st-ident-start? (cur))
              (let
                ((id-start pos))
                (begin
                  (read-ident-chars!)
                  (define
                    kw-loop
                    (fn
                      ()
                      (when
                        (and (< pos src-len) (= (cur) ":"))
                        (begin
                          (advance! 1)
                          (when
                            (and (< pos src-len) (st-ident-start? (cur)))
                            (begin (read-ident-chars!) (kw-loop)))))))
                  (kw-loop)
                  (push! "symbol" (slice src id-start pos) start))))
            (else
              (error (str "st-tokenize: bad symbol at " pos))))))
      ;; Main loop: skip blanks, emit one token, repeat until end of input.
      (define
        step
        (fn
          ()
          (begin
            (skip-ws!)
            (when
              (< pos src-len)
              (let
                ((start pos) (c (cur)))
                (cond
                  ;; Identifier or keyword
                  ((st-ident-start? c)
                    (begin
                      (read-ident-chars!)
                      (let
                        ((word (slice src start pos)))
                        (cond
                          ;; ident immediately followed by ':' (and not ':=') => keyword
                          ((and
                             (< pos src-len)
                             (= (cur) ":")
                             (not (= (pk 1) "=")))
                            (begin
                              (advance! 1)
                              (push! "keyword" (str word ":") start)))
                          (else (push! "ident" word start))))
                      (step)))
                  ;; Number
                  ((st-digit? c)
                    (let
                      ((v (read-number start)))
                      (begin (push! "number" v start) (step))))
                  ;; String
                  ((= c "'")
                    (let
                      ((s (read-string)))
                      (begin (push! "string" s start) (step))))
                  ;; Character literal: `$` plus whatever character follows
                  ((= c "$")
                    (cond
                      ((>= (+ pos 1) src-len)
                        (error (str "st-tokenize: $ at end of input")))
                      (else
                        (begin
                          (advance! 1)
                          (push! "char" (cur) start)
                          (advance! 1)
                          (step)))))
                  ;; Symbol or array literal
                  ((= c "#")
                    (cond
                      ((= (pk 1) "(")
                        (begin (advance! 2) (push! "array-open" "#(" start) (step)))
                      ((= (pk 1) "[")
                        (begin (advance! 2) (push! "byte-array-open" "#[" start) (step)))
                      (else
                        (begin (advance! 1) (read-symbol start) (step)))))
                  ;; Assignment := or bare colon
                  ((= c ":")
                    (cond
                      ((= (pk 1) "=")
                        (begin (advance! 2) (push! "assign" ":=" start) (step)))
                      (else
                        (begin (advance! 1) (push! "colon" ":" start) (step)))))
                  ;; Single-char structural punctuation
                  ((= c "(") (begin (advance! 1) (push! "lparen" "(" start) (step)))
                  ((= c ")") (begin (advance! 1) (push! "rparen" ")" start) (step)))
                  ((= c "[") (begin (advance! 1) (push! "lbracket" "[" start) (step)))
                  ((= c "]") (begin (advance! 1) (push! "rbracket" "]" start) (step)))
                  ((= c "{") (begin (advance! 1) (push! "lbrace" "{" start) (step)))
                  ((= c "}") (begin (advance! 1) (push! "rbrace" "}" start) (step)))
                  ((= c ".") (begin (advance! 1) (push! "period" "." start) (step)))
                  ((= c ";") (begin (advance! 1) (push! "semi" ";" start) (step)))
                  ((= c "|") (begin (advance! 1) (push! "bar" "|" start) (step)))
                  ((= c "^") (begin (advance! 1) (push! "caret" "^" start) (step)))
                  ((= c "!") (begin (advance! 1) (push! "bang" "!" start) (step)))
                  ;; Binary selector run
                  ((st-binary-char? c)
                    (let
                      ((b (read-binary-run!)))
                      (begin (push! "binary" b start) (step))))
                  (else
                    (error
                      (str "st-tokenize: unexpected char " c " at " pos)))))))))
      (step)
      (push! "eof" nil pos)
      tokens)))
|
||||
@@ -50,7 +50,7 @@ Core mapping:
|
||||
## Roadmap
|
||||
|
||||
### Phase 1 — tokenizer + parser
|
||||
- [ ] Tokenizer: identifiers, keywords (`foo:`), binary selectors (`+`, `==`, `,`, `->`, `~=` etc.), numbers (radix `16r1F`, scaled `1.5s2`), strings `'…''…'`, characters `$c`, symbols `#foo` `#'foo bar'` `#+`, byte arrays `#[1 2 3]`, literal arrays `#(1 #foo 'x')`, comments `"…"`
|
||||
- [x] Tokenizer: identifiers, keywords (`foo:`), binary selectors (`+`, `==`, `,`, `->`, `~=` etc.), numbers (radix `16r1F`; **scaled `1.5s2` deferred**), strings `'…''…'`, characters `$c`, symbols `#foo` `#'foo bar'` `#+`, byte arrays `#[1 2 3]` (open token), literal arrays `#(1 #foo 'x')` (open token), comments `"…"`
|
||||
- [ ] Parser: chunk format (`! !` separators), class definitions (`Object subclass: #X instanceVariableNames: '…' classVariableNames: '…' …`), method definitions (`extend: #Foo with: 'bar ^self'`), pragmas `<primitive: 1>`, blocks `[:a :b | | t1 t2 | …]`, cascades, message precedence (unary > binary > keyword)
|
||||
- [ ] Unit tests in `lib/smalltalk/tests/parse.sx`
|
||||
|
||||
@@ -107,7 +107,7 @@ Core mapping:
|
||||
|
||||
_Newest first. Agent appends on every commit._
|
||||
|
||||
- _(none yet)_
|
||||
- 2026-04-25: tokenizer + 63 tests (`lib/smalltalk/tokenizer.sx`, `lib/smalltalk/tests/tokenize.sx`, `lib/smalltalk/test.sh`). All token types covered except scaled decimals `1.5s2` (deferred). `#(` and `#[` emit open tokens; literal-array contents lexed as ordinary tokens for the parser to interpret.
|
||||
|
||||
## Blockers
|
||||
|
||||
|
||||
Reference in New Issue
Block a user