diff --git a/lib/ruby/test.sh b/lib/ruby/test.sh new file mode 100755 index 00000000..861e1c62 --- /dev/null +++ b/lib/ruby/test.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash +# Ruby-on-SX test runner. +# Usage: +# bash lib/ruby/test.sh # run all tests +# bash lib/ruby/test.sh -v # verbose +# bash lib/ruby/test.sh tests/tokenizer.sx # single file + +set -euo pipefail +cd "$(git rev-parse --show-toplevel)" + +SX_SERVER="hosts/ocaml/_build/default/bin/sx_server.exe" +if [ ! -x "$SX_SERVER" ]; then + MAIN_ROOT=$(git worktree list | head -1 | awk '{print $1}') + if [ -x "$MAIN_ROOT/$SX_SERVER" ]; then + SX_SERVER="$MAIN_ROOT/$SX_SERVER" + else + echo "ERROR: sx_server.exe not found." + exit 1 + fi +fi + +VERBOSE="" +FILES=() +for arg in "$@"; do + case "$arg" in + -v|--verbose) VERBOSE=1 ;; + *) FILES+=("$arg") ;; + esac +done + +if [ ${#FILES[@]} -eq 0 ]; then + mapfile -t FILES < <(find lib/ruby/tests -maxdepth 2 -name '*.sx' | sort) +fi + +TOTAL_PASS=0 +TOTAL_FAIL=0 +FAILED_FILES=() + +for FILE in "${FILES[@]}"; do + [ -f "$FILE" ] || { echo "skip $FILE (not found)"; continue; } + TMPFILE=$(mktemp) + cat > "$TMPFILE" <&1 || true) + rm -f "$TMPFILE" + + LINE=$(echo "$OUTPUT" | awk '/^\(ok-len 3 / {getline; print; exit}') + if [ -z "$LINE" ]; then + LINE=$(echo "$OUTPUT" | grep -E '^\(ok 3 \([0-9]+ [0-9]+\)\)' | tail -1 \ + | sed -E 's/^\(ok 3 //; s/\)$//') + fi + if [ -z "$LINE" ]; then + echo "✗ $FILE: could not extract summary" + echo "$OUTPUT" | tail -20 + TOTAL_FAIL=$((TOTAL_FAIL + 1)) + FAILED_FILES+=("$FILE") + continue + fi + P=$(echo "$LINE" | sed -E 's/^\(([0-9]+) ([0-9]+)\).*/\1/') + F=$(echo "$LINE" | sed -E 's/^\(([0-9]+) ([0-9]+)\).*/\2/') + TOTAL_PASS=$((TOTAL_PASS + P)) + TOTAL_FAIL=$((TOTAL_FAIL + F)) + if [ "$F" -gt 0 ]; then + FAILED_FILES+=("$FILE") + printf '✗ %-40s %d/%d\n' "$FILE" "$P" "$((P+F))" + elif [ "$VERBOSE" = "1" ]; then + printf '✓ %-40s %d passed\n' "$FILE" "$P" + fi +done + +TOTAL=$((TOTAL_PASS + TOTAL_FAIL)) +if [ $TOTAL_FAIL -eq 0 ]; 
then + echo "✓ $TOTAL_PASS/$TOTAL ruby-on-sx tests passed" +else + echo "✗ $TOTAL_PASS/$TOTAL passed, $TOTAL_FAIL failed in: ${FAILED_FILES[*]}" +fi + +[ $TOTAL_FAIL -eq 0 ] diff --git a/lib/ruby/tests/tokenizer.sx b/lib/ruby/tests/tokenizer.sx new file mode 100644 index 00000000..b1411ec2 --- /dev/null +++ b/lib/ruby/tests/tokenizer.sx @@ -0,0 +1,210 @@ +;; Ruby tokenizer tests. +;; Final value: (list rb-test-pass rb-test-fail); failure details accumulate in rb-test-fails. + +(define rb-deep=? + (fn (a b) + (cond + ((= a b) true) + ((and (dict? a) (dict? b)) + (let ((ak (keys a)) (bk (keys b))) + (if (not (= (len ak) (len bk))) + false + (every? + (fn (k) (and (has-key? b k) (rb-deep=? (get a k) (get b k)))) + ak)))) + ((and (list? a) (list? b)) + (if (not (= (len a) (len b))) + false + (let ((i 0) (ok true)) + (define rb-de-loop + (fn () + (when (and ok (< i (len a))) + (do + (when (not (rb-deep=? (nth a i) (nth b i))) + (set! ok false)) + (set! i (+ i 1)) + (rb-de-loop))))) + (rb-de-loop) + ok))) + (:else false)))) + +(define rb-test-pass 0) +(define rb-test-fail 0) +(define rb-test-fails (list)) + +(define rb-test + (fn (name actual expected) + (if (rb-deep=? actual expected) + (set! rb-test-pass (+ rb-test-pass 1)) + (do + (set! rb-test-fail (+ rb-test-fail 1)) + (append! rb-test-fails {:name name :actual actual :expected expected}))))) + +;; Helper: tokenize, drop newline+eof, return {:type :value} pairs +(define rb-toks + (fn (src) + (map + (fn (tok) {:value (get tok "value") :type (get tok "type")}) + (filter + (fn (tok) + (let ((ty (get tok "type"))) + (not (or (= ty "newline") (= ty "eof"))))) + (rb-tokenize src))))) + +;; Helper: get just types +(define rb-types + (fn (src) (map (fn (t) (get t "type")) (rb-toks src)))) + +;; Helper: get first token type +(define rb-first-type + (fn (src) (get (get (rb-tokenize src) 0) "type"))) + +(define rb-first-value + (fn (src) (get (get (rb-tokenize src) 0) "value"))) + +;; ── 1. 
Keywords ────────────────────────────────────────────────── +(rb-test "keyword def" (rb-toks "def") (list {:value "def" :type "keyword"})) +(rb-test "keyword end" (rb-toks "end") (list {:value "end" :type "keyword"})) +(rb-test "keyword class" (rb-toks "class") (list {:value "class" :type "keyword"})) +(rb-test "keyword if" (rb-toks "if") (list {:value "if" :type "keyword"})) +(rb-test "keyword while" (rb-toks "while") (list {:value "while" :type "keyword"})) +(rb-test "keyword nil" (rb-toks "nil") (list {:value "nil" :type "keyword"})) +(rb-test "keyword true" (rb-toks "true") (list {:value "true" :type "keyword"})) +(rb-test "keyword false" (rb-toks "false") (list {:value "false" :type "keyword"})) +(rb-test "keyword return" (rb-toks "return") (list {:value "return" :type "keyword"})) +(rb-test "keyword yield" (rb-toks "yield") (list {:value "yield" :type "keyword"})) +(rb-test "keyword begin" (rb-toks "begin") (list {:value "begin" :type "keyword"})) +(rb-test "keyword rescue" (rb-toks "rescue") (list {:value "rescue" :type "keyword"})) +(rb-test "keyword self" (rb-toks "self") (list {:value "self" :type "keyword"})) +(rb-test "keyword super" (rb-toks "super") (list {:value "super" :type "keyword"})) + +;; ── 2. Identifiers ──────────────────────────────────────────────── +(rb-test "ident simple" (rb-toks "foo") (list {:value "foo" :type "ident"})) +(rb-test "ident underscore" (rb-toks "_foo") (list {:value "_foo" :type "ident"})) +(rb-test "ident with digit" (rb-toks "foo2") (list {:value "foo2" :type "ident"})) +(rb-test "ident predicate" (rb-toks "empty?") (list {:value "empty?" :type "ident"})) +(rb-test "ident bang" (rb-toks "save!") (list {:value "save!" :type "ident"})) +(rb-test "defined?" (rb-toks "defined?") (list {:value "defined?" :type "keyword"})) + +;; ── 3. 
Constants ────────────────────────────────────────────────── +(rb-test "const simple" (rb-toks "Foo") (list {:value "Foo" :type "const"})) +(rb-test "const upcase" (rb-toks "MY_CONST") (list {:value "MY_CONST" :type "const"})) +(rb-test "const class" (rb-toks "String") (list {:value "String" :type "const"})) + +;; ── 4. Sigil variables ─────────────────────────────────────────── +(rb-test "ivar" (rb-toks "@name") (list {:value "@name" :type "ivar"})) +(rb-test "cvar" (rb-toks "@@count") (list {:value "@@count" :type "cvar"})) +(rb-test "gvar" (rb-toks "$global") (list {:value "$global" :type "gvar"})) + +;; ── 5. Integers ─────────────────────────────────────────────────── +(rb-test "int decimal" (rb-first-value "42") 42) +(rb-test "int zero" (rb-first-value "0") 0) +(rb-test "int underscore" (rb-first-value "1_000") 1000) +(rb-test "int hex" (rb-first-value "0xFF") 255) +(rb-test "int hex lower" (rb-first-value "0xff") 255) +(rb-test "int octal" (rb-first-value "0o17") 15) +(rb-test "int binary" (rb-first-value "0b1010") 10) +(rb-test "int type" (rb-first-type "42") "int") + +;; ── 6. Floats ───────────────────────────────────────────────────── +(rb-test "float simple" (rb-first-type "3.14") "float") +(rb-test "float value" (rb-first-value "3.14") "3.14") +(rb-test "float exp" (rb-first-type "1.5e10") "float") +(rb-test "float exp value" (rb-first-value "1.5e10") "1.5e10") + +;; ── 7. 
Strings ──────────────────────────────────────────────────── +(rb-test "dq string" (rb-first-value "\"hello\"") "hello") +(rb-test "dq string type" (rb-first-type "\"hello\"") "string") +(rb-test "sq string" (rb-first-value "'world'") "world") +(rb-test "dq escape nl" (rb-first-value "\"a\\nb\"") "a\nb") +(rb-test "dq escape tab" (rb-first-value "\"a\\tb\"") "a\tb") +(rb-test "dq escape quote" (rb-first-value "\"a\\\"b\"") "a\"b") +(rb-test "sq no escape" (rb-first-value "'a\\nb'") "a\\nb") +(rb-test "sq escape backslash" (rb-first-value "'a\\\\'") "a\\") +(rb-test "dq interp kept" (rb-first-value "\"#{x}\"") "#{x}") + +;; ── 8. Symbols ──────────────────────────────────────────────────── +(rb-test "symbol simple" (rb-first-type ":foo") "symbol") +(rb-test "symbol value" (rb-first-value ":foo") "foo") +(rb-test "symbol predicate" (rb-first-value ":empty?") "empty?") +(rb-test "symbol dq" (rb-first-value ":\"hello world\"") "hello world") +(rb-test "symbol sq" (rb-first-value ":'hello'") "hello") + +;; ── 9. %w and %i literals ──────────────────────────────────────── +(rb-test "%w bracket" (rb-first-type "%w[a b c]") "words") +(rb-test "%w value" (rb-first-value "%w[a b c]") (list "a" "b" "c")) +(rb-test "%w paren" (rb-first-value "%w(x y)") (list "x" "y")) +(rb-test "%i bracket" (rb-first-type "%i[a b]") "isymbols") +(rb-test "%i value" (rb-first-value "%i[foo bar]") (list "foo" "bar")) + +;; ── 10. 
Punctuation ─────────────────────────────────────────────── +(rb-test "dot" (rb-first-type ".") "dot") +(rb-test "dotdot" (rb-first-type "..") "dotdot") +(rb-test "dotdotdot" (rb-first-type "...") "dotdotdot") +(rb-test "dcolon" (rb-first-type "::") "dcolon") +(rb-test "comma" (rb-first-type ",") "comma") +(rb-test "semi" (rb-first-type ";") "semi") +(rb-test "lparen" (rb-first-type "(") "lparen") +(rb-test "rparen" (rb-first-type ")") "rparen") +(rb-test "lbracket" (rb-first-type "[") "lbracket") +(rb-test "rbracket" (rb-first-type "]") "rbracket") +(rb-test "lbrace" (rb-first-type "{") "lbrace") +(rb-test "rbrace" (rb-first-type "}") "rbrace") +(rb-test "pipe" (rb-first-type "|") "pipe") + +;; ── 11. Operators ───────────────────────────────────────────────── +(rb-test "op plus" (rb-first-value "+") "+") +(rb-test "op minus" (rb-first-value "-") "-") +(rb-test "op star" (rb-first-value "*") "*") +(rb-test "op slash" (rb-first-value "/") "/") +(rb-test "op eq" (rb-first-value "=") "=") +(rb-test "op eqeq" (rb-first-value "==") "==") +(rb-test "op neq" (rb-first-value "!=") "!=") +(rb-test "op lt" (rb-first-value "<") "<") +(rb-test "op gt" (rb-first-value ">") ">") +(rb-test "op lte" (rb-first-value "<=") "<=") +(rb-test "op gte" (rb-first-value ">=") ">=") +(rb-test "op spaceship" (rb-first-value "<=>") "<=>") +(rb-test "op tripleq" (rb-first-value "===") "===") +(rb-test "op match" (rb-first-value "=~") "=~") +(rb-test "op nomatch" (rb-first-value "!~") "!~") +(rb-test "op lshift" (rb-first-value "<<") "<<") +(rb-test "op rshift" (rb-first-value ">>") ">>") +(rb-test "op and" (rb-first-value "&&") "&&") +(rb-test "op or" (rb-first-value "||") "||") +(rb-test "op power" (rb-first-value "**") "**") +(rb-test "op plus-eq" (rb-first-value "+=") "+=") +(rb-test "op minus-eq" (rb-first-value "-=") "-=") +(rb-test "op arrow" (rb-first-value "->") "->") +(rb-test "op hash-rocket" (rb-first-value "=>") "=>") + +;; ── 12. 
Comments ────────────────────────────────────────────────── +(rb-test "comment skipped" (len (rb-toks "# this is a comment")) 0) +(rb-test "comment mid-line" (rb-types "x = 1 # comment") (list "ident" "op" "int")) + +;; ── 13. Multi-token sequences ───────────────────────────────────── +(rb-test "method call" (rb-types "foo.bar") + (list "ident" "dot" "ident")) +(rb-test "class def" (rb-types "class Foo") + (list "keyword" "const")) +(rb-test "method def" (rb-types "def greet(name)") + (list "keyword" "ident" "lparen" "ident" "rparen")) +(rb-test "assignment" (rb-types "x = 42") + (list "ident" "op" "int")) +(rb-test "block params" (rb-types "|x, y|") + (list "pipe" "ident" "comma" "ident" "pipe")) +(rb-test "scope resolution" (rb-types "Foo::Bar") + (list "const" "dcolon" "const")) +(rb-test "range" (rb-types "1..10") + (list "int" "dotdot" "int")) +(rb-test "exclusive range" (rb-types "1...10") + (list "int" "dotdotdot" "int")) + +;; ── 14. Line/col tracking ──────────────────────────────────────── +(define rb-tok1 (get (rb-tokenize "hello\nworld") 0)) +(define rb-tok2 (get (rb-tokenize "hello\nworld") 2)) +(rb-test "line track start" (get rb-tok1 "line") 1) +(rb-test "line track second" (get rb-tok2 "line") 2) +(rb-test "col track start" (get rb-tok1 "col") 1) + +(list rb-test-pass rb-test-fail) diff --git a/lib/ruby/tokenizer.sx b/lib/ruby/tokenizer.sx new file mode 100644 index 00000000..c8965201 --- /dev/null +++ b/lib/ruby/tokenizer.sx @@ -0,0 +1,549 @@ +;; Ruby tokenizer for Ruby 2.7 subset. +;; Token: {:type T :value V :line L :col C} +;; +;; Types: keyword ident ivar cvar gvar const +;; int float string symbol +;; op dot dotdot dotdotdot dcolon colon +;; lparen rparen lbracket rbracket lbrace rbrace +;; comma semi pipe newline words isymbols eof + +;; ── Character code table ────────────────────────────────────────── +(define rb-ord-table + (let ((t (dict)) (i 0)) + (define rb-build-table + (fn () + (when (< i 128) + (do + (dict-set! 
t (char-from-code i) i) + (set! i (+ i 1)) + (rb-build-table))))) + (rb-build-table) + t)) + +(define rb-ord (fn (c) (or (get rb-ord-table c) 0))) + +;; ── Character predicates ────────────────────────────────────────── +(define rb-digit? + (fn (c) (and (string? c) (>= (rb-ord c) 48) (<= (rb-ord c) 57)))) + +(define rb-hex-digit? + (fn (c) + (and (string? c) + (or (and (>= (rb-ord c) 48) (<= (rb-ord c) 57)) + (and (>= (rb-ord c) 97) (<= (rb-ord c) 102)) + (and (>= (rb-ord c) 65) (<= (rb-ord c) 70)))))) + +(define rb-octal-digit? + (fn (c) (and (string? c) (>= (rb-ord c) 48) (<= (rb-ord c) 55)))) + +(define rb-binary-digit? (fn (c) (or (= c "0") (= c "1")))) + +(define rb-lower? + (fn (c) (and (string? c) (>= (rb-ord c) 97) (<= (rb-ord c) 122)))) + +(define rb-upper? + (fn (c) (and (string? c) (>= (rb-ord c) 65) (<= (rb-ord c) 90)))) + +(define rb-ident-start? + (fn (c) (or (rb-lower? c) (rb-upper? c) (= c "_")))) + +(define rb-ident-cont? + (fn (c) (or (rb-lower? c) (rb-upper? c) (rb-digit? c) (= c "_")))) + +(define rb-space? (fn (c) (or (= c " ") (= c "\t") (= c "\r")))) + +;; ── Reserved words ──────────────────────────────────────────────── +(define rb-keywords + (list "__ENCODING__" "__LINE__" "__FILE__" + "BEGIN" "END" + "alias" "and" + "begin" "break" + "case" "class" + "def" "defined?" "do" + "else" "elsif" "end" "ensure" + "false" "for" + "if" "in" + "module" + "next" "nil" "not" + "or" + "redo" "rescue" "retry" "return" + "self" "super" + "then" "true" + "undef" "unless" "until" + "when" "while" + "yield")) + +(define rb-keyword? (fn (w) (contains? 
rb-keywords w))) + +;; ── Token constructor ───────────────────────────────────────────── +(define rb-make-token + (fn (type value line col) {:type type :value value :line line :col col})) + +;; ── Radix number parser ─────────────────────────────────────────── +(define rb-parse-radix + (fn (s radix) + (let ((n (len s)) (i 0) (acc 0)) + (define rb-rad-loop + (fn () + (when (< i n) + (do + (let ((c (substring s i (+ i 1)))) + (cond + ((and (>= (rb-ord c) 48) (<= (rb-ord c) 57)) + (set! acc (+ (* acc radix) (- (rb-ord c) 48)))) + ((and (>= (rb-ord c) 97) (<= (rb-ord c) 102)) + (set! acc (+ (* acc radix) (+ 10 (- (rb-ord c) 97))))) + ((and (>= (rb-ord c) 65) (<= (rb-ord c) 70)) + (set! acc (+ (* acc radix) (+ 10 (- (rb-ord c) 65))))))) + (set! i (+ i 1)) + (rb-rad-loop))))) + (rb-rad-loop) + acc))) + +;; ── Strip underscores from numeric literals ─────────────────────── +(define rb-strip-underscores + (fn (s) + (let ((n (len s)) (i 0) (parts (list))) + (define rb-su-loop + (fn () + (when (< i n) + (do + (let ((c (substring s i (+ i 1)))) + (when (not (= c "_")) + (append! parts c))) + (set! i (+ i 1)) + (rb-su-loop))))) + (rb-su-loop) + (join "" parts)))) + +;; ── Main tokenizer ──────────────────────────────────────────────── +(define rb-tokenize + (fn (src) + (let ((tokens (list)) + (pos 0) + (line 1) + (col 1) + (src-len (len src))) + (define rb-peek + (fn (offset) + (if (< (+ pos offset) src-len) + (substring src (+ pos offset) (+ pos offset 1)) + nil))) + (define rb-cur (fn () (rb-peek 0))) + (define rb-advance! + (fn () + (let ((c (rb-cur))) + (set! pos (+ pos 1)) + (if (= c "\n") + (do (set! line (+ line 1)) (set! col 1)) + (set! col (+ col 1)))))) + (define rb-advance-n! + (fn (n) + (when (> n 0) + (do (rb-advance!) (rb-advance-n! (- n 1)))))) + (define rb-push! + (fn (type value tok-line tok-col) + (append! 
tokens (rb-make-token type value tok-line tok-col)))) + (define rb-read-while + (fn (pred) + (let ((start pos)) + (define rb-rw-loop + (fn () + (when (and (< pos src-len) (pred (rb-cur))) + (do (rb-advance!) (rb-rw-loop))))) + (rb-rw-loop) + (substring src start pos)))) + (define rb-skip-line-comment! + (fn () + (define rb-slc-loop + (fn () + (when (and (< pos src-len) (not (= (rb-cur) "\n"))) + (do (rb-advance!) (rb-slc-loop))))) + (rb-slc-loop))) + (define rb-read-escape + (fn () + (rb-advance!) + (let ((c (rb-cur))) + (cond + ((= c "n") (do (rb-advance!) "\n")) + ((= c "t") (do (rb-advance!) "\t")) + ((= c "r") (do (rb-advance!) "\r")) + ((= c "\\") (do (rb-advance!) "\\")) + ((= c "'") (do (rb-advance!) "'")) + ((= c "\"") (do (rb-advance!) "\"")) + ((= c "a") (do (rb-advance!) (char-from-code 7))) + ((= c "b") (do (rb-advance!) (char-from-code 8))) + ((= c "f") (do (rb-advance!) (char-from-code 12))) + ((= c "v") (do (rb-advance!) (char-from-code 11))) + ((= c "e") (do (rb-advance!) (char-from-code 27))) + ((= c "s") (do (rb-advance!) " ")) + ((= c "0") (do (rb-advance!) (char-from-code 0))) + (:else (do (rb-advance!) (str "\\" c))))))) + (define rb-read-sq-string + (fn () + (let ((parts (list))) + (rb-advance!) + (define rb-sq-loop + (fn () + (cond + ((>= pos src-len) nil) + ((= (rb-cur) "'") (rb-advance!)) + ((and (= (rb-cur) "\\") + (let ((n (rb-peek 1))) + (or (= n "\\") (= n "'")))) + (do + (rb-advance!) + (append! parts (rb-cur)) + (rb-advance!) + (rb-sq-loop))) + (:else + (do + (append! parts (rb-cur)) + (rb-advance!) + (rb-sq-loop)))))) + (rb-sq-loop) + (join "" parts)))) + (define rb-read-dq-string + (fn () + (let ((parts (list))) + (rb-advance!) + (define rb-dq-loop + (fn () + (cond + ((>= pos src-len) nil) + ((= (rb-cur) "\"") (rb-advance!)) + ((= (rb-cur) "\\") + (do + (append! parts (rb-read-escape)) + (rb-dq-loop))) + ((and (= (rb-cur) "#") (= (rb-peek 1) "{")) + (do + (append! parts "#{") + (rb-advance-n! 
2) + (let ((depth 1)) + (define rb-interp-inner + (fn () + (when (and (< pos src-len) (> depth 0)) + (do + (let ((c (rb-cur))) + (cond + ((= c "{") + (do + (set! depth (+ depth 1)) + (append! parts c) + (rb-advance!))) + ((= c "}") + (do + (set! depth (- depth 1)) + (when (> depth 0) + (do (append! parts c) (rb-advance!))))) + (:else + (do (append! parts c) (rb-advance!))))) + (rb-interp-inner))))) + (rb-interp-inner)) + (when (= (rb-cur) "}") + (do (append! parts "}") (rb-advance!))) + (rb-dq-loop))) + (:else + (do + (append! parts (rb-cur)) + (rb-advance!) + (rb-dq-loop)))))) + (rb-dq-loop) + (join "" parts)))) + (define rb-read-percent-words + (fn () + (rb-advance-n! 2) + (let ((open-ch (rb-cur))) + (let ((close-ch + (cond + ((= open-ch "[") "]") + ((= open-ch "(") ")") + ((= open-ch "{") "}") + ((= open-ch "<") ">") + (:else open-ch)))) + (rb-advance!) + (let ((items (list))) + (define rb-pw-skip + (fn () + (when (and (< pos src-len) (or (rb-space? (rb-cur)) (= (rb-cur) "\n"))) + (do (rb-advance!) (rb-pw-skip))))) + (define rb-pw-word + (fn (wparts) + (if (or (>= pos src-len) + (rb-space? (rb-cur)) + (= (rb-cur) "\n") + (= (rb-cur) close-ch)) + (append! items (join "" wparts)) + (do + (append! wparts (rb-cur)) + (rb-advance!) + (rb-pw-word wparts))))) + (define rb-pw-loop + (fn () + (rb-pw-skip) + (when (and (< pos src-len) (not (= (rb-cur) close-ch))) + (do + (rb-pw-word (list)) + (rb-pw-loop))))) + (rb-pw-loop) + (when (= (rb-cur) close-ch) (rb-advance!)) + items))))) + (define rb-read-ident-word + (fn () + (let ((start pos)) + (rb-read-while rb-ident-cont?) + (when (and (= (rb-cur) "?") (not (= (rb-peek 1) "="))) + (rb-advance!)) + (when (and (= (rb-cur) "!") (not (or (= (rb-peek 1) "=") (= (rb-peek 1) "~")))) + (rb-advance!)) + (substring src start pos)))) + (define rb-read-number! + (fn (tok-line tok-col) + (let ((start pos)) + (cond + ((and (= (rb-cur) "0") (let ((p (rb-peek 1))) (or (= p "b") (= p "B")))) + (do + (rb-advance-n! 
2) + (let ((bin-str (rb-read-while rb-binary-digit?))) + (rb-push! "int" (rb-parse-radix bin-str 2) tok-line tok-col)))) + ((and (= (rb-cur) "0") (let ((p (rb-peek 1))) (or (= p "o") (= p "O")))) + (do + (rb-advance-n! 2) + (let ((oct-str (rb-read-while rb-octal-digit?))) + (rb-push! "int" (rb-parse-radix oct-str 8) tok-line tok-col)))) + ((and (= (rb-cur) "0") (let ((p (rb-peek 1))) (or (= p "x") (= p "X")))) + (do + (rb-advance-n! 2) + (let ((hex-str (rb-read-while rb-hex-digit?))) + (rb-push! "int" (rb-parse-radix hex-str 16) tok-line tok-col)))) + (:else + (do + (rb-read-while (fn (c) (or (rb-digit? c) (= c "_")))) + (let ((is-float false)) + (when (and (= (rb-cur) ".") (rb-digit? (rb-peek 1))) + (do + (set! is-float true) + (rb-advance!) + (rb-read-while (fn (c) (or (rb-digit? c) (= c "_")))))) + (when (or (= (rb-cur) "e") (= (rb-cur) "E")) + (do + (set! is-float true) + (rb-advance!) + (when (or (= (rb-cur) "+") (= (rb-cur) "-")) + (rb-advance!)) + (rb-read-while rb-digit?))) + (let ((num-str (rb-strip-underscores (substring src start pos)))) + (if is-float + (rb-push! "float" num-str tok-line tok-col) + (rb-push! "int" (parse-int num-str) tok-line tok-col)))))))))) + (define rb-read-op! + (fn (tok-line tok-col) + (let ((c0 (rb-cur)) (c1 (rb-peek 1)) (c2 (rb-peek 2))) + (cond + ((and (= c0 "<") (= c1 "=") (= c2 ">")) + (do (rb-advance-n! 3) (rb-push! "op" "<=>" tok-line tok-col))) + ((and (= c0 "=") (= c1 "=") (= c2 "=")) + (do (rb-advance-n! 3) (rb-push! "op" "===" tok-line tok-col))) + ((and (= c0 "*") (= c1 "*") (= c2 "=")) + (do (rb-advance-n! 3) (rb-push! "op" "**=" tok-line tok-col))) + ((and (= c0 "<") (= c1 "<") (= c2 "=")) + (do (rb-advance-n! 3) (rb-push! "op" "<<=" tok-line tok-col))) + ((and (= c0 ">") (= c1 ">") (= c2 "=")) + (do (rb-advance-n! 3) (rb-push! "op" ">>=" tok-line tok-col))) + ((and (= c0 "&") (= c1 "&") (= c2 "=")) + (do (rb-advance-n! 3) (rb-push! 
"op" "&&=" tok-line tok-col))) + ((and (= c0 "|") (= c1 "|") (= c2 "=")) + (do (rb-advance-n! 3) (rb-push! "op" "||=" tok-line tok-col))) + ((and (= c0 "*") (= c1 "*")) + (do (rb-advance-n! 2) (rb-push! "op" "**" tok-line tok-col))) + ((and (= c0 "=") (= c1 "=")) + (do (rb-advance-n! 2) (rb-push! "op" "==" tok-line tok-col))) + ((and (= c0 "!") (= c1 "=")) + (do (rb-advance-n! 2) (rb-push! "op" "!=" tok-line tok-col))) + ((and (= c0 "<") (= c1 "=")) + (do (rb-advance-n! 2) (rb-push! "op" "<=" tok-line tok-col))) + ((and (= c0 ">") (= c1 "=")) + (do (rb-advance-n! 2) (rb-push! "op" ">=" tok-line tok-col))) + ((and (= c0 "=") (= c1 "~")) + (do (rb-advance-n! 2) (rb-push! "op" "=~" tok-line tok-col))) + ((and (= c0 "!") (= c1 "~")) + (do (rb-advance-n! 2) (rb-push! "op" "!~" tok-line tok-col))) + ((and (= c0 "<") (= c1 "<")) + (do (rb-advance-n! 2) (rb-push! "op" "<<" tok-line tok-col))) + ((and (= c0 ">") (= c1 ">")) + (do (rb-advance-n! 2) (rb-push! "op" ">>" tok-line tok-col))) + ((and (= c0 "&") (= c1 "&")) + (do (rb-advance-n! 2) (rb-push! "op" "&&" tok-line tok-col))) + ((and (= c0 "|") (= c1 "|")) + (do (rb-advance-n! 2) (rb-push! "op" "||" tok-line tok-col))) + ((and (= c0 "+") (= c1 "=")) + (do (rb-advance-n! 2) (rb-push! "op" "+=" tok-line tok-col))) + ((and (= c0 "-") (= c1 "=")) + (do (rb-advance-n! 2) (rb-push! "op" "-=" tok-line tok-col))) + ((and (= c0 "*") (= c1 "=")) + (do (rb-advance-n! 2) (rb-push! "op" "*=" tok-line tok-col))) + ((and (= c0 "/") (= c1 "=")) + (do (rb-advance-n! 2) (rb-push! "op" "/=" tok-line tok-col))) + ((and (= c0 "%") (= c1 "=")) + (do (rb-advance-n! 2) (rb-push! "op" "%=" tok-line tok-col))) + ((and (= c0 "&") (= c1 "=")) + (do (rb-advance-n! 2) (rb-push! "op" "&=" tok-line tok-col))) + ((and (= c0 "|") (= c1 "=")) + (do (rb-advance-n! 2) (rb-push! "op" "|=" tok-line tok-col))) + ((and (= c0 "^") (= c1 "=")) + (do (rb-advance-n! 2) (rb-push! "op" "^=" tok-line tok-col))) + ((and (= c0 "-") (= c1 ">")) + (do (rb-advance-n! 
2) (rb-push! "op" "->" tok-line tok-col))) + ((and (= c0 "=") (= c1 ">")) + (do (rb-advance-n! 2) (rb-push! "op" "=>" tok-line tok-col))) + ((and (= c0 "|") (nil? c1)) + (do (rb-advance!) (rb-push! "pipe" "|" tok-line tok-col))) + ((= c0 "|") + (do (rb-advance!) (rb-push! "pipe" "|" tok-line tok-col))) + (:else + (do (rb-advance!) (rb-push! "op" c0 tok-line tok-col))))))) + (define rb-scan! + (fn () + (cond + ((>= pos src-len) nil) + ((rb-space? (rb-cur)) (do (rb-advance!) (rb-scan!))) + ((= (rb-cur) "#") (do (rb-skip-line-comment!) (rb-scan!))) + ((= (rb-cur) "\n") + (do + (let ((l line) (c col)) + (rb-advance!) + (rb-push! "newline" nil l c)) + (rb-scan!))) + ((rb-digit? (rb-cur)) + (do + (let ((l line) (c col)) + (rb-read-number! l c)) + (rb-scan!))) + ((rb-ident-start? (rb-cur)) + (do + (let ((l line) (c col)) + (let ((w (rb-read-ident-word))) + (if (rb-keyword? w) + (rb-push! "keyword" w l c) + (if (rb-upper? (substring w 0 1)) + (rb-push! "const" w l c) + (rb-push! "ident" w l c))))) + (rb-scan!))) + ((= (rb-cur) "@") + (do + (let ((l line) (c col)) + (if (= (rb-peek 1) "@") + (do + (rb-advance-n! 2) + (let ((name (rb-read-while rb-ident-cont?))) + (rb-push! "cvar" (str "@@" name) l c))) + (do + (rb-advance!) + (let ((name (rb-read-while rb-ident-cont?))) + (rb-push! "ivar" (str "@" name) l c))))) + (rb-scan!))) + ((= (rb-cur) "$") + (do + (let ((l line) (c col)) + (rb-advance!) + (let ((name (rb-read-while rb-ident-cont?))) + (rb-push! "gvar" (str "$" name) l c))) + (rb-scan!))) + ((= (rb-cur) "\"") + (do + (let ((l line) (c col)) + (rb-push! "string" (rb-read-dq-string) l c)) + (rb-scan!))) + ((= (rb-cur) "'") + (do + (let ((l line) (c col)) + (rb-push! "string" (rb-read-sq-string) l c)) + (rb-scan!))) + ((and (= (rb-cur) ":") (= (rb-peek 1) ":")) + (do + (let ((l line) (c col)) + (rb-advance-n! 2) + (rb-push! "dcolon" "::" l c)) + (rb-scan!))) + ((= (rb-cur) ":") + (do + (let ((l line) (c col)) + (rb-advance!) + (cond + ((= (rb-cur) "\"") + (rb-push! 
"symbol" (rb-read-dq-string) l c)) + ((= (rb-cur) "'") + (rb-push! "symbol" (rb-read-sq-string) l c)) + ((rb-ident-start? (rb-cur)) + (let ((name (rb-read-ident-word))) + (rb-push! "symbol" name l c))) + (:else + (rb-push! "colon" ":" l c)))) + (rb-scan!))) + ((and (= (rb-cur) "%") + (let ((p (rb-peek 1))) + (or (= p "w") (= p "W") (= p "i") (= p "I")))) + (do + (let ((l line) (c col)) + (let ((kind (rb-peek 1))) + (let ((items (rb-read-percent-words))) + (if (or (= kind "i") (= kind "I")) + (rb-push! "isymbols" items l c) + (rb-push! "words" items l c))))) + (rb-scan!))) + ((= (rb-cur) ".") + (do + (let ((l line) (c col)) + (cond + ((and (= (rb-peek 1) ".") (= (rb-peek 2) ".")) + (do (rb-advance-n! 3) (rb-push! "dotdotdot" "..." l c))) + ((= (rb-peek 1) ".") + (do (rb-advance-n! 2) (rb-push! "dotdot" ".." l c))) + (:else + (do (rb-advance!) (rb-push! "dot" "." l c))))) + (rb-scan!))) + ((= (rb-cur) ",") + (do + (let ((l line) (c col)) (rb-push! "comma" "," l c) (rb-advance!)) + (rb-scan!))) + ((= (rb-cur) ";") + (do + (let ((l line) (c col)) (rb-push! "semi" ";" l c) (rb-advance!)) + (rb-scan!))) + ((= (rb-cur) "(") + (do + (let ((l line) (c col)) (rb-push! "lparen" "(" l c) (rb-advance!)) + (rb-scan!))) + ((= (rb-cur) ")") + (do + (let ((l line) (c col)) (rb-push! "rparen" ")" l c) (rb-advance!)) + (rb-scan!))) + ((= (rb-cur) "[") + (do + (let ((l line) (c col)) (rb-push! "lbracket" "[" l c) (rb-advance!)) + (rb-scan!))) + ((= (rb-cur) "]") + (do + (let ((l line) (c col)) (rb-push! "rbracket" "]" l c) (rb-advance!)) + (rb-scan!))) + ((= (rb-cur) "{") + (do + (let ((l line) (c col)) (rb-push! "lbrace" "{" l c) (rb-advance!)) + (rb-scan!))) + ((= (rb-cur) "}") + (do + (let ((l line) (c col)) (rb-push! 
"rbrace" "}" l c) (rb-advance!)) + (rb-scan!))) + ((or (= (rb-cur) "+") (= (rb-cur) "-") (= (rb-cur) "*") + (= (rb-cur) "/") (= (rb-cur) "%") (= (rb-cur) "=") + (= (rb-cur) "!") (= (rb-cur) "<") (= (rb-cur) ">") + (= (rb-cur) "&") (= (rb-cur) "^") (= (rb-cur) "~") + (= (rb-cur) "|")) + (do + (let ((l line) (c col)) (rb-read-op! l c)) + (rb-scan!))) + (:else (do (rb-advance!) (rb-scan!)))))) + (rb-scan!) + (rb-push! "eof" nil line col) + tokens))) diff --git a/plans/ruby-on-sx.md b/plans/ruby-on-sx.md index c10a4035..30d49960 100644 --- a/plans/ruby-on-sx.md +++ b/plans/ruby-on-sx.md @@ -51,7 +51,7 @@ Core mapping: ## Roadmap ### Phase 1 — tokenizer + parser -- [ ] Tokenizer: keywords (`def end class module if unless while until do return yield begin rescue ensure case when then else elsif`), identifiers (lowercase = local/method, `@` = ivar, `@@` = cvar, `$` = global, uppercase = constant), numbers (int, float, `0x` `0o` `0b`, `_` separators), strings (`"…"` interpolation, `'…'` literal, `%w[a b c]`, `%i[a b c]`), symbols `:foo` `:"…"`, operators (`+ - * / % ** == != < > <= >= <=> === =~ !~ << >> & | ^ ~ ! && || and or not`), `:: . , ; ( ) [ ] { } -> => |`, comments `#` +- [x] Tokenizer: keywords (`def end class module if unless while until do return yield begin rescue ensure case when then else elsif`), identifiers (lowercase = local/method, `@` = ivar, `@@` = cvar, `$` = global, uppercase = constant), numbers (int, float, `0x` `0o` `0b`, `_` separators), strings (`"…"` interpolation, `'…'` literal, `%w[a b c]`, `%i[a b c]`), symbols `:foo` `:"…"`, operators (`+ - * / % ** == != < > <= >= <=> === =~ !~ << >> & | ^ ~ ! && || and or not`), `:: . 
, ; ( ) [ ] { } -> => |`, comments `#` - [ ] Parser: program is sequence of statements separated by newlines or `;`; method def `def name(args) … end`; class `class Foo < Bar … end`; module `module M … end`; block `do |a, b| … end` and `{ |a, b| … }`; call sugar (no parens), `obj.method`, `Mod::Const`; arg shapes (positional, default, splat `*args`, double-splat `**opts`, block `&blk`) - [ ] If/while/case expressions (return values), `unless`/`until`, postfix modifiers - [ ] Begin/rescue/ensure/retry, raise, raise with class+message @@ -117,7 +117,7 @@ Core mapping: _Newest first._ -- _(none yet)_ +- 2026-04-25: Phase 1 tokenizer complete — `lib/ruby/tokenizer.sx` + `lib/ruby/tests/tokenizer.sx` (107/107 tests). Keywords, identifiers (@ivar @@cvar $gvar), numbers (dec/hex/octal/binary/float), strings (dq with interpolation kept raw, sq), symbols, %w/%i literals, operators (all compound forms), punctuation, comments, line/col tracking. ## Blockers