ruby: Phase 1 tokenizer (+107 tests)
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Has been cancelled

lib/ruby/tokenizer.sx — rb-tokenize: keywords, identifiers (@/@@/$/const),
numbers (dec/hex/oct/bin/float), strings (dq with raw interpolation, sq),
symbols, %w/%i, operators (all compound forms), punctuation, comments,
line/col tracking. Plus test runner test.sh and 107 passing tests.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-25 18:13:05 +00:00
parent 3316d402fd
commit 96019e9fe8
4 changed files with 846 additions and 2 deletions

85
lib/ruby/test.sh Executable file
View File

@@ -0,0 +1,85 @@
#!/usr/bin/env bash
# Ruby-on-SX test runner.
#
# For each test file, feeds sx_server a three-epoch script (load the tokenizer,
# load the test file, evaluate the pass/fail counters), parses the reported
# "(PASS FAIL)" pair and prints a summary. Exits 0 only if nothing failed.
#
# Usage (paths are tried from the repository root, then from lib/ruby/):
#   bash lib/ruby/test.sh                      # run all tests
#   bash lib/ruby/test.sh -v                   # verbose
#   bash lib/ruby/test.sh tests/tokenizer.sx   # single file
set -euo pipefail
cd "$(git rev-parse --show-toplevel)"

SX_SERVER="hosts/ocaml/_build/default/bin/sx_server.exe"
if [ ! -x "$SX_SERVER" ]; then
  # Not built in this checkout: fall back to the first entry printed by
  # `git worktree list` (the main worktree).
  MAIN_ROOT=$(git worktree list | head -1 | awk '{print $1}')
  if [ -x "$MAIN_ROOT/$SX_SERVER" ]; then
    SX_SERVER="$MAIN_ROOT/$SX_SERVER"
  else
    echo "ERROR: sx_server.exe not found." >&2
    exit 1
  fi
fi

VERBOSE=""
FILES=()
for arg in "$@"; do
  case "$arg" in
    -v|--verbose) VERBOSE=1 ;;
    *) FILES+=("$arg") ;;
  esac
done
if [ ${#FILES[@]} -eq 0 ]; then
  mapfile -t FILES < <(find lib/ruby/tests -maxdepth 2 -name '*.sx' | sort)
fi

# One scratch file, reused for every test file and removed on any exit path.
TMPFILE=$(mktemp)
trap 'rm -f -- "$TMPFILE"' EXIT

# Matches a "(PASS FAIL)" summary prefix and captures both counts.
SUMMARY_RE='^\(([0-9]+) ([0-9]+)\)'

TOTAL_PASS=0
TOTAL_FAIL=0
FAILED_FILES=()
# The ${arr[@]+...} form keeps `set -u` quiet on an empty array in older bash.
for FILE in ${FILES[@]+"${FILES[@]}"}; do
  # Accept paths relative to lib/ruby/, as shown in the usage text.
  if [ ! -f "$FILE" ] && [ -f "lib/ruby/$FILE" ]; then
    FILE="lib/ruby/$FILE"
  fi
  [ -f "$FILE" ] || { echo "skip $FILE (not found)"; continue; }
  cat > "$TMPFILE" <<EPOCHS
(epoch 1)
(load "lib/ruby/tokenizer.sx")
(epoch 2)
(load "$FILE")
(epoch 3)
(eval "(list rb-test-pass rb-test-fail)")
EPOCHS
  # A crash or timeout shows up below as a missing summary, so the server's
  # own exit status is deliberately ignored here.
  OUTPUT=$(timeout 60 "$SX_SERVER" < "$TMPFILE" 2>&1 || true)

  # The summary is either the line following an "(ok-len 3 ...)" header ...
  LINE=$(awk '/^\(ok-len 3 / {getline; print; exit}' <<<"$OUTPUT")
  if [ -z "$LINE" ]; then
    # ... or inline as "(ok 3 (PASS FAIL))". grep exits 1 when nothing
    # matches; with `set -e -o pipefail` that would abort the whole run before
    # the failure is reported, hence the `|| true`.
    LINE=$(grep -E '^\(ok 3 \([0-9]+ [0-9]+\)\)' <<<"$OUTPUT" | tail -1 \
      | sed -E 's/^\(ok 3 //; s/\)$//' || true)
  fi

  # An empty or malformed summary counts as one failure for this file.
  if [[ ! $LINE =~ $SUMMARY_RE ]]; then
    echo "$FILE: could not extract summary"
    tail -20 <<<"$OUTPUT"
    TOTAL_FAIL=$((TOTAL_FAIL + 1))
    FAILED_FILES+=("$FILE")
    continue
  fi
  # Force base 10 so a count with a leading zero is not read as octal.
  P=$((10#${BASH_REMATCH[1]}))
  F=$((10#${BASH_REMATCH[2]}))

  TOTAL_PASS=$((TOTAL_PASS + P))
  TOTAL_FAIL=$((TOTAL_FAIL + F))
  if [ "$F" -gt 0 ]; then
    FAILED_FILES+=("$FILE")
    printf '✗ %-40s %d/%d\n' "$FILE" "$P" "$((P+F))"
  elif [ "$VERBOSE" = "1" ]; then
    printf '✓ %-40s %d passed\n' "$FILE" "$P"
  fi
done

TOTAL=$((TOTAL_PASS + TOTAL_FAIL))
if [ "$TOTAL_FAIL" -eq 0 ]; then
  echo "$TOTAL_PASS/$TOTAL ruby-on-sx tests passed"
else
  echo "$TOTAL_PASS/$TOTAL passed, $TOTAL_FAIL failed in: ${FAILED_FILES[*]}"
fi
[ "$TOTAL_FAIL" -eq 0 ]

210
lib/ruby/tests/tokenizer.sx Normal file
View File

@@ -0,0 +1,210 @@
;; Ruby tokenizer tests.
;; Final value: (list rb-test-pass rb-test-fail); failure details accumulate in rb-test-fails.
;; Structural equality used by the test helpers.
;; Values that are already `=` compare equal. Two dicts are equal when they
;; have the same number of keys and every key of `a` exists in `b` with a
;; deep-equal value. Two lists are equal when they have the same length and
;; are deep-equal element by element. Anything else is unequal.
(define rb-deep=?
  (fn (a b)
    (cond
      ((= a b) true)
      ((and (dict? a) (dict? b))
        (let ((a-keys (keys a)))
          (if (= (len a-keys) (len (keys b)))
            (every?
              (fn (key)
                (if (has-key? b key)
                  (rb-deep=? (get a key) (get b key))
                  false))
              a-keys)
            false)))
      ((and (list? a) (list? b))
        (if (= (len a) (len b))
          (let ((count (len a)))
            ;; Walk both lists in step; stop at the first mismatch.
            (define rb-de-from
              (fn (idx)
                (if (>= idx count)
                  true
                  (if (rb-deep=? (nth a idx) (nth b idx))
                    (rb-de-from (+ idx 1))
                    false))))
            (rb-de-from 0))
          false))
      (:else false))))
;; Running results for the whole file, updated by rb-test:
;; count of passing tests, count of failing tests, and one
;; {:name :actual :expected} record per failure.
(define rb-test-pass 0)
(define rb-test-fail 0)
(define rb-test-fails (list))
;; Record one test: bump rb-test-pass when `actual` is deep-equal to
;; `expected`; otherwise bump rb-test-fail and append the details to
;; rb-test-fails.
(define rb-test
  (fn (name actual expected)
    (cond
      ((rb-deep=? actual expected)
        (set! rb-test-pass (+ rb-test-pass 1)))
      (:else
        (do
          (set! rb-test-fail (+ rb-test-fail 1))
          (append! rb-test-fails {:name name :actual actual :expected expected}))))))
;; Helper: tokenize `src`, discard "newline" and "eof" tokens, and reduce each
;; remaining token to a {:value :type} dict (line/col are dropped).
(define rb-toks
  (fn (src)
    (let ((significant?
            (fn (tok)
              (let ((ty (get tok "type")))
                (and (not (= ty "newline")) (not (= ty "eof"))))))
          (summarize
            (fn (tok) {:value (get tok "value") :type (get tok "type")})))
      (map summarize (filter significant? (rb-tokenize src))))))
;; Helper: the list of token types for `src`, in order, with newline/eof
;; tokens already removed by rb-toks.
(define rb-types
  (fn (src)
    (let ((tok-type (fn (tok) (get tok "type"))))
      (map tok-type (rb-toks src)))))
;; Helpers: type / value of the very first token produced for `src`.
;; Unlike rb-toks these read the raw token list, so nothing is filtered out.
(define rb-first-type
  (fn (src)
    (let ((first-tok (get (rb-tokenize src) 0)))
      (get first-tok "type"))))
(define rb-first-value
  (fn (src)
    (let ((first-tok (get (rb-tokenize src) 0)))
      (get first-tok "value"))))
;; ── 1. Keywords ───────────────────────────────────────────────────
;; Reserved words come back with type "keyword" and their source text as value.
(rb-test "keyword def" (rb-toks "def") (list {:value "def" :type "keyword"}))
(rb-test "keyword end" (rb-toks "end") (list {:value "end" :type "keyword"}))
(rb-test "keyword class" (rb-toks "class") (list {:value "class" :type "keyword"}))
(rb-test "keyword if" (rb-toks "if") (list {:value "if" :type "keyword"}))
(rb-test "keyword while" (rb-toks "while") (list {:value "while" :type "keyword"}))
(rb-test "keyword nil" (rb-toks "nil") (list {:value "nil" :type "keyword"}))
(rb-test "keyword true" (rb-toks "true") (list {:value "true" :type "keyword"}))
(rb-test "keyword false" (rb-toks "false") (list {:value "false" :type "keyword"}))
(rb-test "keyword return" (rb-toks "return") (list {:value "return" :type "keyword"}))
(rb-test "keyword yield" (rb-toks "yield") (list {:value "yield" :type "keyword"}))
(rb-test "keyword begin" (rb-toks "begin") (list {:value "begin" :type "keyword"}))
(rb-test "keyword rescue" (rb-toks "rescue") (list {:value "rescue" :type "keyword"}))
(rb-test "keyword self" (rb-toks "self") (list {:value "self" :type "keyword"}))
(rb-test "keyword super" (rb-toks "super") (list {:value "super" :type "keyword"}))
;; ── 2. Identifiers ────────────────────────────────────────────────
;; A trailing ? or ! belongs to the identifier; "defined?" is the one such
;; word that is classified as a keyword.
(rb-test "ident simple" (rb-toks "foo") (list {:value "foo" :type "ident"}))
(rb-test "ident underscore" (rb-toks "_foo") (list {:value "_foo" :type "ident"}))
(rb-test "ident with digit" (rb-toks "foo2") (list {:value "foo2" :type "ident"}))
(rb-test "ident predicate" (rb-toks "empty?") (list {:value "empty?" :type "ident"}))
(rb-test "ident bang" (rb-toks "save!") (list {:value "save!" :type "ident"}))
(rb-test "defined?" (rb-toks "defined?") (list {:value "defined?" :type "keyword"}))
;; ── 3. Constants ──────────────────────────────────────────────────
;; Words starting with an uppercase letter are typed "const".
(rb-test "const simple" (rb-toks "Foo") (list {:value "Foo" :type "const"}))
(rb-test "const upcase" (rb-toks "MY_CONST") (list {:value "MY_CONST" :type "const"}))
(rb-test "const class" (rb-toks "String") (list {:value "String" :type "const"}))
;; ── 4. Sigil variables ───────────────────────────────────────────
;; The sigil (@, @@, $) is kept as part of the token value.
(rb-test "ivar" (rb-toks "@name") (list {:value "@name" :type "ivar"}))
(rb-test "cvar" (rb-toks "@@count") (list {:value "@@count" :type "cvar"}))
(rb-test "gvar" (rb-toks "$global") (list {:value "$global" :type "gvar"}))
;; ── 5. Integers ───────────────────────────────────────────────────
;; Integer tokens carry a numeric value, whatever the source radix.
(rb-test "int decimal" (rb-first-value "42") 42)
(rb-test "int zero" (rb-first-value "0") 0)
(rb-test "int underscore" (rb-first-value "1_000") 1000)
(rb-test "int hex" (rb-first-value "0xFF") 255)
(rb-test "int hex lower" (rb-first-value "0xff") 255)
(rb-test "int octal" (rb-first-value "0o17") 15)
(rb-test "int binary" (rb-first-value "0b1010") 10)
(rb-test "int type" (rb-first-type "42") "int")
;; ── 6. Floats ─────────────────────────────────────────────────────
;; Float tokens keep their source text as a string value (not a number).
(rb-test "float simple" (rb-first-type "3.14") "float")
(rb-test "float value" (rb-first-value "3.14") "3.14")
(rb-test "float exp" (rb-first-type "1.5e10") "float")
(rb-test "float exp value" (rb-first-value "1.5e10") "1.5e10")
;; ── 7. Strings ────────────────────────────────────────────────────
;; Values exclude the surrounding quotes. Double-quoted strings decode escapes
;; and keep #{...} interpolation as raw text; single-quoted strings only
;; decode \\ and \'.
(rb-test "dq string" (rb-first-value "\"hello\"") "hello")
(rb-test "dq string type" (rb-first-type "\"hello\"") "string")
(rb-test "sq string" (rb-first-value "'world'") "world")
(rb-test "dq escape nl" (rb-first-value "\"a\\nb\"") "a\nb")
(rb-test "dq escape tab" (rb-first-value "\"a\\tb\"") "a\tb")
(rb-test "dq escape quote" (rb-first-value "\"a\\\"b\"") "a\"b")
(rb-test "sq no escape" (rb-first-value "'a\\nb'") "a\\nb")
(rb-test "sq escape backslash" (rb-first-value "'a\\\\'") "a\\")
(rb-test "dq interp kept" (rb-first-value "\"#{x}\"") "#{x}")
;; ── 8. Symbols ────────────────────────────────────────────────────
;; Symbol values omit the leading colon (and the quotes of quoted symbols).
(rb-test "symbol simple" (rb-first-type ":foo") "symbol")
(rb-test "symbol value" (rb-first-value ":foo") "foo")
(rb-test "symbol predicate" (rb-first-value ":empty?") "empty?")
(rb-test "symbol dq" (rb-first-value ":\"hello world\"") "hello world")
(rb-test "symbol sq" (rb-first-value ":'hello'") "hello")
;; ── 9. %w and %i literals ────────────────────────────────────────
;; Both produce a list of strings; only the token type differs
;; ("words" for %w, "isymbols" for %i).
(rb-test "%w bracket" (rb-first-type "%w[a b c]") "words")
(rb-test "%w value" (rb-first-value "%w[a b c]") (list "a" "b" "c"))
(rb-test "%w paren" (rb-first-value "%w(x y)") (list "x" "y"))
(rb-test "%i bracket" (rb-first-type "%i[a b]") "isymbols")
(rb-test "%i value" (rb-first-value "%i[foo bar]") (list "foo" "bar"))
;; ── 10. Punctuation ───────────────────────────────────────────────
;; Each punctuation mark has its own token type; a lone | is "pipe", not "op".
(rb-test "dot" (rb-first-type ".") "dot")
(rb-test "dotdot" (rb-first-type "..") "dotdot")
(rb-test "dotdotdot" (rb-first-type "...") "dotdotdot")
(rb-test "dcolon" (rb-first-type "::") "dcolon")
(rb-test "comma" (rb-first-type ",") "comma")
(rb-test "semi" (rb-first-type ";") "semi")
(rb-test "lparen" (rb-first-type "(") "lparen")
(rb-test "rparen" (rb-first-type ")") "rparen")
(rb-test "lbracket" (rb-first-type "[") "lbracket")
(rb-test "rbracket" (rb-first-type "]") "rbracket")
(rb-test "lbrace" (rb-first-type "{") "lbrace")
(rb-test "rbrace" (rb-first-type "}") "rbrace")
(rb-test "pipe" (rb-first-type "|") "pipe")
;; ── 11. Operators ─────────────────────────────────────────────────
;; Operator tokens carry the operator text as their value; multi-character
;; operators must come back as a single token (longest match).
(rb-test "op plus" (rb-first-value "+") "+")
(rb-test "op minus" (rb-first-value "-") "-")
(rb-test "op star" (rb-first-value "*") "*")
(rb-test "op slash" (rb-first-value "/") "/")
(rb-test "op eq" (rb-first-value "=") "=")
(rb-test "op eqeq" (rb-first-value "==") "==")
(rb-test "op neq" (rb-first-value "!=") "!=")
(rb-test "op lt" (rb-first-value "<") "<")
(rb-test "op gt" (rb-first-value ">") ">")
(rb-test "op lte" (rb-first-value "<=") "<=")
(rb-test "op gte" (rb-first-value ">=") ">=")
(rb-test "op spaceship" (rb-first-value "<=>") "<=>")
(rb-test "op tripleq" (rb-first-value "===") "===")
(rb-test "op match" (rb-first-value "=~") "=~")
(rb-test "op nomatch" (rb-first-value "!~") "!~")
(rb-test "op lshift" (rb-first-value "<<") "<<")
(rb-test "op rshift" (rb-first-value ">>") ">>")
(rb-test "op and" (rb-first-value "&&") "&&")
(rb-test "op or" (rb-first-value "||") "||")
(rb-test "op power" (rb-first-value "**") "**")
(rb-test "op plus-eq" (rb-first-value "+=") "+=")
(rb-test "op minus-eq" (rb-first-value "-=") "-=")
(rb-test "op arrow" (rb-first-value "->") "->")
(rb-test "op hash-rocket" (rb-first-value "=>") "=>")
;; ── 12. Comments ──────────────────────────────────────────────────
;; A # comment produces no token of its own.
(rb-test "comment skipped" (len (rb-toks "# this is a comment")) 0)
(rb-test "comment mid-line" (rb-types "x = 1 # comment") (list "ident" "op" "int"))
;; ── 13. Multi-token sequences ─────────────────────────────────────
;; Only the sequence of token types is checked here.
(rb-test "method call" (rb-types "foo.bar")
(list "ident" "dot" "ident"))
(rb-test "class def" (rb-types "class Foo")
(list "keyword" "const"))
(rb-test "method def" (rb-types "def greet(name)")
(list "keyword" "ident" "lparen" "ident" "rparen"))
(rb-test "assignment" (rb-types "x = 42")
(list "ident" "op" "int"))
(rb-test "block params" (rb-types "|x, y|")
(list "pipe" "ident" "comma" "ident" "pipe"))
(rb-test "scope resolution" (rb-types "Foo::Bar")
(list "const" "dcolon" "const"))
(rb-test "range" (rb-types "1..10")
(list "int" "dotdot" "int"))
(rb-test "exclusive range" (rb-types "1...10")
(list "int" "dotdotdot" "int"))
;; ── 14. Line/col tracking ────────────────────────────────────────
;; Raw token list for "hello\nworld": index 0 is `hello`, index 1 the newline
;; token, index 2 `world`. Lines and columns are 1-based.
(define rb-tok1 (get (rb-tokenize "hello\nworld") 0))
(define rb-tok2 (get (rb-tokenize "hello\nworld") 2))
(rb-test "line track start" (get rb-tok1 "line") 1)
(rb-test "line track second" (get rb-tok2 "line") 2)
(rb-test "col track start" (get rb-tok1 "col") 1)
;; Final value of the file: the (pass fail) pair that test.sh parses.
(list rb-test-pass rb-test-fail)

549
lib/ruby/tokenizer.sx Normal file
View File

@@ -0,0 +1,549 @@
;; Ruby tokenizer for Ruby 2.7 subset.
;; Token: {:type T :value V :line L :col C}
;;
;; Types: keyword ident ivar cvar gvar const
;; int float string symbol
;; op dot dotdot dotdotdot dcolon colon
;; lparen rparen lbracket rbracket lbrace rbrace
;; comma semi pipe newline words isymbols eof
;; ── Character code table ──────────────────────────────────────────
;; rb-ord-table maps each one-character string with code 0..127 to that code.
(define rb-ord-table
  (let ((table (dict)))
    (define rb-fill-from
      (fn (code)
        (when (< code 128)
          (do
            (dict-set! table (char-from-code code) code)
            (rb-fill-from (+ code 1))))))
    (rb-fill-from 0)
    table))
;; Code of character `c`, or 0 when `c` is not in the table.
(define rb-ord
  (fn (c)
    (let ((code (get rb-ord-table c)))
      (if (nil? code) 0 code))))
;; ── Character predicates ──────────────────────────────────────────
;; Each predicate takes a one-character string. The range-based ones answer
;; false for any non-string argument.
(define rb-digit?
  (fn (c)
    (if (string? c)
      (let ((code (rb-ord c)))
        (and (>= code 48) (<= code 57)))
      false)))
(define rb-hex-digit?
  (fn (c)
    (if (string? c)
      (let ((code (rb-ord c)))
        (or (and (>= code 48) (<= code 57))      ;; 0-9
            (and (>= code 97) (<= code 102))     ;; a-f
            (and (>= code 65) (<= code 70))))    ;; A-F
      false)))
(define rb-octal-digit?
  (fn (c)
    (if (string? c)
      (let ((code (rb-ord c)))
        (and (>= code 48) (<= code 55)))
      false)))
(define rb-binary-digit?
  (fn (c) (if (= c "0") true (= c "1"))))
(define rb-lower?
  (fn (c)
    (if (string? c)
      (let ((code (rb-ord c)))
        (and (>= code 97) (<= code 122)))
      false)))
(define rb-upper?
  (fn (c)
    (if (string? c)
      (let ((code (rb-ord c)))
        (and (>= code 65) (<= code 90)))
      false)))
;; Letters and underscore may start an identifier; digits may only continue one.
(define rb-ident-start?
  (fn (c)
    (cond
      ((rb-lower? c) true)
      ((rb-upper? c) true)
      (:else (= c "_")))))
(define rb-ident-cont?
  (fn (c)
    (cond
      ((rb-lower? c) true)
      ((rb-upper? c) true)
      ((rb-digit? c) true)
      (:else (= c "_")))))
;; Horizontal whitespace only: newline is excluded because it is a token.
(define rb-space?
  (fn (c)
    (cond
      ((= c " ") true)
      ((= c "\t") true)
      (:else (= c "\r")))))
;; ── Reserved words ────────────────────────────────────────────────
;; Words that are tokenized as "keyword" rather than "ident"/"const".
(define rb-keywords
  (list
    "__ENCODING__" "__LINE__" "__FILE__" "BEGIN" "END"
    "alias" "and" "begin" "break" "case" "class"
    "def" "defined?" "do" "else" "elsif" "end" "ensure"
    "false" "for" "if" "in" "module" "next" "nil" "not" "or"
    "redo" "rescue" "retry" "return" "self" "super"
    "then" "true" "undef" "unless" "until" "when" "while" "yield"))
;; True when `w` is one of rb-keywords.
(define rb-keyword?
  (fn (w)
    (contains? rb-keywords w)))
;; ── Token constructor ─────────────────────────────────────────────
;; Build a token dict with keys :type, :value, :line and :col from the four
;; arguments, in that order.
(define rb-make-token
(fn (type value line col) {:type type :value value :line line :col col}))
;; ── Radix number parser ───────────────────────────────────────────
;; Fold the digit characters of `s` into an integer in base `radix`.
;; Recognized digits are 0-9, a-f and A-F; any other character is skipped
;; without affecting the result. Digits are not checked against `radix`.
(define rb-parse-radix
  (fn (s radix)
    (let ((total 0) (limit (len s)))
      ;; Numeric value of a character code, or nil when it is not a digit.
      (define rb-rad-digit
        (fn (code)
          (cond
            ((and (>= code 48) (<= code 57)) (- code 48))
            ((and (>= code 97) (<= code 102)) (+ 10 (- code 97)))
            ((and (>= code 65) (<= code 70)) (+ 10 (- code 65)))
            (:else nil))))
      (define rb-rad-step
        (fn (idx)
          (when (< idx limit)
            (do
              (let ((digit (rb-rad-digit (rb-ord (substring s idx (+ idx 1))))))
                (when (not (nil? digit))
                  (set! total (+ (* total radix) digit))))
              (rb-rad-step (+ idx 1))))))
      (rb-rad-step 0)
      total)))
;; ── Strip underscores from numeric literals ───────────────────────
;; Return `s` with every "_" character removed.
(define rb-strip-underscores
  (fn (s)
    (let ((kept (list)) (limit (len s)))
      (define rb-su-step
        (fn (idx)
          (when (< idx limit)
            (do
              (let ((ch (substring s idx (+ idx 1))))
                (if (= ch "_")
                  nil
                  (append! kept ch)))
              (rb-su-step (+ idx 1))))))
      (rb-su-step 0)
      (join "" kept))))
;; ── Main tokenizer ────────────────────────────────────────────────
(define rb-tokenize
(fn (src)
(let ((tokens (list))
(pos 0)
(line 1)
(col 1)
(src-len (len src)))
;; Character `offset` positions ahead of the cursor, or nil past end of input.
(define rb-peek
  (fn (offset)
    (let ((at (+ pos offset)))
      (if (>= at src-len)
        nil
        (substring src at (+ at 1))))))
;; Character under the cursor (nil at end of input).
(define rb-cur
  (fn ()
    (rb-peek 0)))
;; Move the cursor one character forward and keep line/col in sync:
;; stepping over "\n" starts a new line at column 1, anything else moves one
;; column right.
(define rb-advance!
  (fn ()
    (let ((leaving-newline (= (rb-cur) "\n")))
      (set! pos (+ pos 1))
      (if leaving-newline
        (do (set! line (+ line 1)) (set! col 1))
        (set! col (+ col 1))))))
;; Advance `n` characters (no-op for n <= 0).
(define rb-advance-n!
  (fn (n)
    (if (<= n 0)
      nil
      (do
        (rb-advance!)
        (rb-advance-n! (- n 1))))))
;; Append a token of the given type/value, stamped with the position where
;; the token started, to the output list.
(define rb-push!
  (fn (type value tok-line tok-col)
    (let ((tok (rb-make-token type value tok-line tok-col)))
      (append! tokens tok))))
;; Consume characters while `pred` holds for the current one (or until end of
;; input) and return the consumed text.
(define rb-read-while
  (fn (pred)
    (let ((from pos))
      (define rb-rw-step
        (fn ()
          (cond
            ((>= pos src-len) nil)
            ((pred (rb-cur)) (do (rb-advance!) (rb-rw-step)))
            (:else nil))))
      (rb-rw-step)
      (substring src from pos))))
;; Skip to the end of the current line. The terminating "\n" is left in place
;; so the scanner still emits a newline token for it.
(define rb-skip-line-comment!
  (fn ()
    (rb-read-while (fn (c) (not (= c "\n"))))
    nil))
;; Decode one backslash escape inside a double-quoted string. On entry the
;; cursor is on the backslash; on exit it is just past the escaped character.
;; Returns the replacement text. Escapes not listed here are returned
;; verbatim, backslash included.
(define rb-read-escape
  (fn ()
    (rb-advance!)
    (let ((c (rb-cur)))
      ;; Every escape handled here is exactly one character long, so step
      ;; over it first and then pick the replacement.
      (rb-advance!)
      (cond
        ((= c "n") "\n")
        ((= c "t") "\t")
        ((= c "r") "\r")
        ((= c "\\") "\\")
        ((= c "'") "'")
        ((= c "\"") "\"")
        ((= c "a") (char-from-code 7))
        ((= c "b") (char-from-code 8))
        ((= c "f") (char-from-code 12))
        ((= c "v") (char-from-code 11))
        ((= c "e") (char-from-code 27))
        ((= c "s") " ")
        ((= c "0") (char-from-code 0))
        (:else (str "\\" c))))))
;; Read a single-quoted string. On entry the cursor is on the opening quote;
;; the closing quote is consumed. Only \\ and \' are escapes (the backslash is
;; dropped); every other character, including a lone backslash, is copied
;; as-is. An unterminated string ends at end of input.
(define rb-read-sq-string
  (fn ()
    (let ((chunks (list)))
      (rb-advance!)
      ;; True when the cursor is on a backslash that escapes \ or '.
      (define rb-sq-escaped?
        (fn ()
          (if (= (rb-cur) "\\")
            (let ((nxt (rb-peek 1)))
              (or (= nxt "\\") (= nxt "'")))
            false)))
      ;; Copy the current character to the result and step past it.
      (define rb-sq-take!
        (fn ()
          (append! chunks (rb-cur))
          (rb-advance!)))
      (define rb-sq-loop
        (fn ()
          (cond
            ((>= pos src-len) nil)
            ((= (rb-cur) "'") (rb-advance!))
            (:else
              (do
                (when (rb-sq-escaped?)
                  (rb-advance!))
                (rb-sq-take!)
                (rb-sq-loop))))))
      (rb-sq-loop)
      (join "" chunks))))
;; Read a double-quoted string. On entry the cursor is on the opening quote;
;; the closing quote is consumed. Returns the string contents with escapes
;; decoded by rb-read-escape and any #{...} interpolation copied through as
;; raw text (braces included). An unterminated string ends at end of input.
(define rb-read-dq-string
(fn ()
(let ((parts (list)))
;; Skip the opening quote.
(rb-advance!)
(define rb-dq-loop
(fn ()
(cond
((>= pos src-len) nil)
((= (rb-cur) "\"") (rb-advance!))
((= (rb-cur) "\\")
(do
(append! parts (rb-read-escape))
(rb-dq-loop)))
((and (= (rb-cur) "#") (= (rb-peek 1) "{"))
(do
(append! parts "#{")
(rb-advance-n! 2)
;; depth counts unclosed "{" inside the interpolation, starting at 1 for
;; the "#{" itself.
;; NOTE(review): only braces are tracked, so a "}" or a quote inside a nested
;; string literal is not treated specially.
(let ((depth 1))
(define rb-interp-inner
(fn ()
(when (and (< pos src-len) (> depth 0))
(do
(let ((c (rb-cur)))
(cond
((= c "{")
(do
(set! depth (+ depth 1))
(append! parts c)
(rb-advance!)))
((= c "}")
(do
(set! depth (- depth 1))
;; The "}" that brings depth to 0 is left under the cursor; it is copied
;; and consumed just after this inner loop returns.
(when (> depth 0)
(do (append! parts c) (rb-advance!)))))
(:else
(do (append! parts c) (rb-advance!)))))
(rb-interp-inner)))))
(rb-interp-inner))
;; Copy the closing "}" of the interpolation, if input did not run out first.
(when (= (rb-cur) "}")
(do (append! parts "}") (rb-advance!)))
(rb-dq-loop)))
(:else
(do
(append! parts (rb-cur))
(rb-advance!)
(rb-dq-loop))))))
(rb-dq-loop)
(join "" parts))))
;; Read the body of a %w / %i literal and return its words as a list of
;; strings. On entry the cursor is on the "%"; the closing delimiter is
;; consumed when present.
(define rb-read-percent-words
(fn ()
;; Skip "%" and the kind letter; the cursor is then on the opening delimiter.
(rb-advance-n! 2)
(let ((open-ch (rb-cur)))
;; Bracket-style openers close with their partner; any other delimiter
;; closes with the same character.
(let ((close-ch
(cond
((= open-ch "[") "]")
((= open-ch "(") ")")
((= open-ch "{") "}")
((= open-ch "<") ">")
(:else open-ch))))
(rb-advance!)
(let ((items (list)))
;; Skip whitespace (including newlines) between words.
(define rb-pw-skip
(fn ()
(when (and (< pos src-len) (or (rb-space? (rb-cur)) (= (rb-cur) "\n")))
(do (rb-advance!) (rb-pw-skip)))))
;; Collect characters into wparts until whitespace, the closing delimiter or
;; end of input, then append the joined word to items.
(define rb-pw-word
(fn (wparts)
(if (or (>= pos src-len)
(rb-space? (rb-cur))
(= (rb-cur) "\n")
(= (rb-cur) close-ch))
(append! items (join "" wparts))
(do
(append! wparts (rb-cur))
(rb-advance!)
(rb-pw-word wparts)))))
;; Alternate skipping whitespace and reading a word until the closing
;; delimiter or end of input. The first close-ch seen ends the literal:
;; nested delimiters are not counted.
(define rb-pw-loop
(fn ()
(rb-pw-skip)
(when (and (< pos src-len) (not (= (rb-cur) close-ch)))
(do
(rb-pw-word (list))
(rb-pw-loop)))))
(rb-pw-loop)
(when (= (rb-cur) close-ch) (rb-advance!))
items)))))
;; Read an identifier-like word starting at the cursor and return its text.
;; A trailing "?" is included unless it is followed by "="; a trailing "!" is
;; included unless it is followed by "=" or "~" (so `a!=b` and `a!~b` keep
;; their operators).
(define rb-read-ident-word
  (fn ()
    (let ((from pos))
      (rb-read-while rb-ident-cont?)
      (let ((nxt (rb-peek 1)))
        (when (and (= (rb-cur) "?") (not (= nxt "=")))
          (rb-advance!)))
      (let ((nxt (rb-peek 1)))
        (when (and (= (rb-cur) "!") (not (= nxt "=")) (not (= nxt "~")))
          (rb-advance!)))
      (substring src from pos))))
;; Read one numeric literal starting at the cursor and push it as an "int"
;; token (numeric value) or a "float" token (source text with underscores
;; removed), stamped with tok-line/tok-col.
;;
;; Forms handled:
;;   0b… / 0o… / 0x… (either letter case)  -> int in base 2 / 8 / 16
;;   0 followed by an octal digit          -> int in base 8 (Ruby's legacy octal)
;;   decimal digits                        -> int
;;   digits "." digits and/or an exponent  -> float
;; "_" separators are accepted in every integer form.
(define rb-read-number!
  (fn (tok-line tok-col)
    (let ((start pos)
          ;; Widen a digit predicate so it also accepts the "_" separator.
          (with-underscore
            (fn (digit?) (fn (c) (or (digit? c) (= c "_"))))))
      ;; Skip a prefix of `prefix-len` characters, read the digits that follow
      ;; and push the int. rb-parse-radix skips "_" because it ignores every
      ;; character that is not a digit.
      (define rb-read-radix!
        (fn (prefix-len digit? radix)
          (rb-advance-n! prefix-len)
          (let ((digits (rb-read-while (with-underscore digit?))))
            (rb-push! "int" (rb-parse-radix digits radix) tok-line tok-col))))
      ;; True when the cursor is on e/E that really starts an exponent, i.e. it
      ;; is followed by a digit or by a sign and a digit. Without this check a
      ;; bare "e" after a number would yield a malformed float such as "1e".
      (define rb-exponent-follows?
        (fn ()
          (and (or (= (rb-cur) "e") (= (rb-cur) "E"))
               (or (rb-digit? (rb-peek 1))
                   (and (or (= (rb-peek 1) "+") (= (rb-peek 1) "-"))
                        (rb-digit? (rb-peek 2)))))))
      (let ((zero-first (= (rb-cur) "0")) (p (rb-peek 1)))
        (cond
          ((and zero-first (or (= p "b") (= p "B")))
            (rb-read-radix! 2 rb-binary-digit? 2))
          ((and zero-first (or (= p "o") (= p "O")))
            (rb-read-radix! 2 rb-octal-digit? 8))
          ((and zero-first (or (= p "x") (= p "X")))
            (rb-read-radix! 2 rb-hex-digit? 16))
          ((and zero-first (rb-octal-digit? p))
            (rb-read-radix! 1 rb-octal-digit? 8))
          (:else
            (do
              (rb-read-while (with-underscore rb-digit?))
              (let ((is-float false))
                ;; A "." only starts a fraction when a digit follows, so
                ;; `1..10` and `3.times` stay integers.
                (when (and (= (rb-cur) ".") (rb-digit? (rb-peek 1)))
                  (do
                    (set! is-float true)
                    (rb-advance!)
                    (rb-read-while (with-underscore rb-digit?))))
                (when (rb-exponent-follows?)
                  (do
                    (set! is-float true)
                    (rb-advance!)
                    (when (or (= (rb-cur) "+") (= (rb-cur) "-"))
                      (rb-advance!))
                    (rb-read-while rb-digit?)))
                (let ((num-str (rb-strip-underscores (substring src start pos))))
                  (if is-float
                    (rb-push! "float" num-str tok-line tok-col)
                    (rb-push! "int" (parse-int num-str) tok-line tok-col)))))))))))
;; Read one operator at the cursor using longest match: a three-character
;; operator wins over a two-character one, which wins over a single
;; character. Everything is pushed as an "op" token whose value is the
;; operator text, except a lone "|", which is pushed as a "pipe" token.
(define rb-read-op!
  (let ((three-char-ops
          (list "<=>" "===" "**=" "<<=" ">>=" "&&=" "||="))
        (two-char-ops
          (list "**" "==" "!=" "<=" ">=" "=~" "!~" "<<" ">>" "&&" "||"
                "+=" "-=" "*=" "/=" "%=" "&=" "|=" "^=" "->" "=>")))
    (fn (tok-line tok-col)
      (let ((c0 (rb-cur)) (c1 (rb-peek 1)) (c2 (rb-peek 2)))
        ;; pair / triple are nil when the input is too short to supply them.
        (let ((pair (if (nil? c1) nil (str c0 c1))))
          (let ((triple (if (or (nil? pair) (nil? c2)) nil (str pair c2))))
            (cond
              ((and (not (nil? triple)) (contains? three-char-ops triple))
                (do (rb-advance-n! 3) (rb-push! "op" triple tok-line tok-col)))
              ((and (not (nil? pair)) (contains? two-char-ops pair))
                (do (rb-advance-n! 2) (rb-push! "op" pair tok-line tok-col)))
              ((= c0 "|")
                (do (rb-advance!) (rb-push! "pipe" "|" tok-line tok-col)))
              (:else
                (do (rb-advance!) (rb-push! "op" c0 tok-line tok-col))))))))))
;; Token type for a single-character punctuation mark, or nil when `c` is not
;; one of them.
(define rb-punct-type
  (fn (c)
    (cond
      ((= c ",") "comma")
      ((= c ";") "semi")
      ((= c "(") "lparen")
      ((= c ")") "rparen")
      ((= c "[") "lbracket")
      ((= c "]") "rbracket")
      ((= c "{") "lbrace")
      ((= c "}") "rbrace")
      (:else nil))))
;; True when `c` can open a %w / %i literal: it must exist and be neither an
;; identifier character nor whitespace. This keeps `n%i` and `a % width` as a
;; modulo operator instead of swallowing the rest of the input as a literal.
(define rb-percent-delim?
  (fn (c)
    (and (string? c)
         (not (rb-ident-cont? c))
         (not (rb-space? c))
         (not (= c "\n")))))
;; Main scanning loop: look at the character under the cursor, emit at most
;; one token for it, and recurse until end of input. Every token is stamped
;; with the line/col captured before its text is consumed. Branch order
;; matters: "::" is tested before ":", and %w/%i before the generic operators.
;; NOTE(review): characters with no branch of their own (for example "?" and
;; "\") are skipped without emitting a token — confirm that is intended.
(define rb-scan!
  (fn ()
    (cond
      ((>= pos src-len) nil)
      ((rb-space? (rb-cur)) (do (rb-advance!) (rb-scan!)))
      ((= (rb-cur) "#") (do (rb-skip-line-comment!) (rb-scan!)))
      ((= (rb-cur) "\n")
        (do
          (let ((l line) (c col))
            (rb-advance!)
            (rb-push! "newline" nil l c))
          (rb-scan!)))
      ((rb-digit? (rb-cur))
        (do
          (let ((l line) (c col))
            (rb-read-number! l c))
          (rb-scan!)))
      ;; Words: keyword, constant (uppercase first letter) or identifier.
      ((rb-ident-start? (rb-cur))
        (do
          (let ((l line) (c col))
            (let ((w (rb-read-ident-word)))
              (if (rb-keyword? w)
                (rb-push! "keyword" w l c)
                (if (rb-upper? (substring w 0 1))
                  (rb-push! "const" w l c)
                  (rb-push! "ident" w l c)))))
          (rb-scan!)))
      ;; @name -> ivar, @@name -> cvar; the sigil stays in the value.
      ((= (rb-cur) "@")
        (do
          (let ((l line) (c col))
            (if (= (rb-peek 1) "@")
              (do
                (rb-advance-n! 2)
                (let ((name (rb-read-while rb-ident-cont?)))
                  (rb-push! "cvar" (str "@@" name) l c)))
              (do
                (rb-advance!)
                (let ((name (rb-read-while rb-ident-cont?)))
                  (rb-push! "ivar" (str "@" name) l c)))))
          (rb-scan!)))
      ((= (rb-cur) "$")
        (do
          (let ((l line) (c col))
            (rb-advance!)
            (let ((name (rb-read-while rb-ident-cont?)))
              (rb-push! "gvar" (str "$" name) l c)))
          (rb-scan!)))
      ((= (rb-cur) "\"")
        (do
          (let ((l line) (c col))
            (rb-push! "string" (rb-read-dq-string) l c))
          (rb-scan!)))
      ((= (rb-cur) "'")
        (do
          (let ((l line) (c col))
            (rb-push! "string" (rb-read-sq-string) l c))
          (rb-scan!)))
      ((and (= (rb-cur) ":") (= (rb-peek 1) ":"))
        (do
          (let ((l line) (c col))
            (rb-advance-n! 2)
            (rb-push! "dcolon" "::" l c))
          (rb-scan!)))
      ;; ":" followed by a quote or an identifier start is a symbol; any other
      ;; ":" is a plain colon token.
      ((= (rb-cur) ":")
        (do
          (let ((l line) (c col))
            (rb-advance!)
            (cond
              ((= (rb-cur) "\"")
                (rb-push! "symbol" (rb-read-dq-string) l c))
              ((= (rb-cur) "'")
                (rb-push! "symbol" (rb-read-sq-string) l c))
              ((rb-ident-start? (rb-cur))
                (let ((name (rb-read-ident-word)))
                  (rb-push! "symbol" name l c)))
              (:else
                (rb-push! "colon" ":" l c))))
          (rb-scan!)))
      ;; %w / %W -> "words", %i / %I -> "isymbols", but only when a real
      ;; delimiter follows the kind letter.
      ((and (= (rb-cur) "%")
            (let ((p (rb-peek 1)))
              (or (= p "w") (= p "W") (= p "i") (= p "I")))
            (rb-percent-delim? (rb-peek 2)))
        (do
          (let ((l line) (c col))
            (let ((kind (rb-peek 1)))
              (let ((items (rb-read-percent-words)))
                (if (or (= kind "i") (= kind "I"))
                  (rb-push! "isymbols" items l c)
                  (rb-push! "words" items l c)))))
          (rb-scan!)))
      ((= (rb-cur) ".")
        (do
          (let ((l line) (c col))
            (cond
              ((and (= (rb-peek 1) ".") (= (rb-peek 2) "."))
                (do (rb-advance-n! 3) (rb-push! "dotdotdot" "..." l c)))
              ((= (rb-peek 1) ".")
                (do (rb-advance-n! 2) (rb-push! "dotdot" ".." l c)))
              (:else
                (do (rb-advance!) (rb-push! "dot" "." l c)))))
          (rb-scan!)))
      ;; Single-character punctuation: the token value is the character itself.
      ((not (nil? (rb-punct-type (rb-cur))))
        (do
          (let ((l line) (c col) (ch (rb-cur)))
            (rb-push! (rb-punct-type ch) ch l c)
            (rb-advance!))
          (rb-scan!)))
      ((or (= (rb-cur) "+") (= (rb-cur) "-") (= (rb-cur) "*")
           (= (rb-cur) "/") (= (rb-cur) "%") (= (rb-cur) "=")
           (= (rb-cur) "!") (= (rb-cur) "<") (= (rb-cur) ">")
           (= (rb-cur) "&") (= (rb-cur) "^") (= (rb-cur) "~")
           (= (rb-cur) "|"))
        (do
          (let ((l line) (c col)) (rb-read-op! l c))
          (rb-scan!)))
      (:else (do (rb-advance!) (rb-scan!))))))
(rb-scan!)
(rb-push! "eof" nil line col)
tokens)))

View File

@@ -51,7 +51,7 @@ Core mapping:
## Roadmap
### Phase 1 — tokenizer + parser
- [ ] Tokenizer: keywords (`def end class module if unless while until do return yield begin rescue ensure case when then else elsif`), identifiers (lowercase = local/method, `@` = ivar, `@@` = cvar, `$` = global, uppercase = constant), numbers (int, float, `0x` `0o` `0b`, `_` separators), strings (`"…"` interpolation, `'…'` literal, `%w[a b c]`, `%i[a b c]`), symbols `:foo` `:"…"`, operators (`+ - * / % ** == != < > <= >= <=> === =~ !~ << >> & | ^ ~ ! && || and or not`), `:: . , ; ( ) [ ] { } -> => |`, comments `#`
- [x] Tokenizer: keywords (`def end class module if unless while until do return yield begin rescue ensure case when then else elsif`), identifiers (lowercase = local/method, `@` = ivar, `@@` = cvar, `$` = global, uppercase = constant), numbers (int, float, `0x` `0o` `0b`, `_` separators), strings (`"…"` interpolation, `'…'` literal, `%w[a b c]`, `%i[a b c]`), symbols `:foo` `:"…"`, operators (`+ - * / % ** == != < > <= >= <=> === =~ !~ << >> & | ^ ~ ! && || and or not`), `:: . , ; ( ) [ ] { } -> => |`, comments `#`
- [ ] Parser: program is sequence of statements separated by newlines or `;`; method def `def name(args) … end`; class `class Foo < Bar … end`; module `module M … end`; block `do |a, b| … end` and `{ |a, b| … }`; call sugar (no parens), `obj.method`, `Mod::Const`; arg shapes (positional, default, splat `*args`, double-splat `**opts`, block `&blk`)
- [ ] If/while/case expressions (return values), `unless`/`until`, postfix modifiers
- [ ] Begin/rescue/ensure/retry, raise, raise with class+message
@@ -117,7 +117,7 @@ Core mapping:
_Newest first._
- _(none yet)_
- 2026-04-25: Phase 1 tokenizer complete — `lib/ruby/tokenizer.sx` + `lib/ruby/tests/tokenizer.sx` (107/107 tests). Keywords, identifiers (@ivar @@cvar $gvar), numbers (dec/hex/octal/binary/float), strings (dq with interpolation kept raw, sq), symbols, %w/%i literals, operators (all compound forms), punctuation, comments, line/col tracking.
## Blockers