;; Ruby tokenizer for Ruby 2.7 subset.
;; Token: {:type T :value V :line L :col C}
;;
;; Types: keyword ident ivar cvar gvar const
;;        int float string symbol
;;        op dot dotdot dotdotdot dcolon colon
;;        lparen rparen lbracket rbracket lbrace rbrace
;;        comma semi pipe newline words isymbols eof

;; ── Character code table ──────────────────────────────────────────

;; Maps each one-character string with code 0..127 to its code.
(define rb-ord-table
  (let ((t (dict)) (i 0))
    (define rb-build-table
      (fn ()
        (when (< i 128)
          (do
            (dict-set! t (char-from-code i) i)
            (set! i (+ i 1))
            (rb-build-table)))))
    (rb-build-table)
    t))

;; Code of character c, or 0 when c is not in the table (nil, or a
;; character outside 0..127).
(define rb-ord (fn (c) (or (get rb-ord-table c) 0)))

;; ── Character predicates ──────────────────────────────────────────
;; All predicates accept nil (end of input) and answer false for it.

(define rb-digit?
  (fn (c) (and (string? c) (>= (rb-ord c) 48) (<= (rb-ord c) 57))))

(define rb-hex-digit?
  (fn (c)
    (and (string? c)
         (or (and (>= (rb-ord c) 48) (<= (rb-ord c) 57))
             (and (>= (rb-ord c) 97) (<= (rb-ord c) 102))
             (and (>= (rb-ord c) 65) (<= (rb-ord c) 70))))))

(define rb-octal-digit?
  (fn (c) (and (string? c) (>= (rb-ord c) 48) (<= (rb-ord c) 55))))

(define rb-binary-digit? (fn (c) (or (= c "0") (= c "1"))))

(define rb-lower?
  (fn (c) (and (string? c) (>= (rb-ord c) 97) (<= (rb-ord c) 122))))

(define rb-upper?
  (fn (c) (and (string? c) (>= (rb-ord c) 65) (<= (rb-ord c) 90))))

(define rb-ident-start?
  (fn (c) (or (rb-lower? c) (rb-upper? c) (= c "_"))))

(define rb-ident-cont?
  (fn (c) (or (rb-lower? c) (rb-upper? c) (rb-digit? c) (= c "_"))))

;; Horizontal whitespace only; "\n" is a token of its own.
(define rb-space? (fn (c) (or (= c " ") (= c "\t") (= c "\r"))))

;; True when c can open a %w / %i literal: it must exist and must not be
;; a letter, a digit or whitespace. Without this check `x %width` would
;; be read as a percent literal delimited by "i".
(define rb-percent-delim?
  (fn (c)
    (and (string? c)
         (not (rb-lower? c))
         (not (rb-upper? c))
         (not (rb-digit? c))
         (not (rb-space? c))
         (not (= c "\n")))))

;; ── Reserved words ────────────────────────────────────────────────

(define rb-keywords
  (list "__ENCODING__" "__LINE__" "__FILE__" "BEGIN" "END"
        "alias" "and" "begin" "break" "case" "class" "def" "defined?"
        "do" "else" "elsif" "end" "ensure" "false" "for" "if" "in"
        "module" "next" "nil" "not" "or" "redo" "rescue" "retry"
        "return" "self" "super" "then" "true" "undef" "unless" "until"
        "when" "while" "yield"))

(define rb-keyword? (fn (w) (contains? rb-keywords w)))

;; ── Token constructor ─────────────────────────────────────────────

(define rb-make-token
  (fn (type value line col)
    {:type type :value value :line line :col col}))

;; ── Radix number parser ───────────────────────────────────────────

;; Folds the digits of s into an integer in the given radix. Accepts
;; 0-9, a-f and A-F; any other character is skipped. Digits are not
;; validated against radix, so callers pre-filter the string.
(define rb-parse-radix
  (fn (s radix)
    (let ((n (len s)) (i 0) (acc 0))
      (define rb-rad-loop
        (fn ()
          (when (< i n)
            (do
              (let ((c (substring s i (+ i 1))))
                (cond
                  ((and (>= (rb-ord c) 48) (<= (rb-ord c) 57))
                   (set! acc (+ (* acc radix) (- (rb-ord c) 48))))
                  ((and (>= (rb-ord c) 97) (<= (rb-ord c) 102))
                   (set! acc (+ (* acc radix) (+ 10 (- (rb-ord c) 97)))))
                  ((and (>= (rb-ord c) 65) (<= (rb-ord c) 70))
                   (set! acc (+ (* acc radix) (+ 10 (- (rb-ord c) 65)))))))
              (set! i (+ i 1))
              (rb-rad-loop)))))
      (rb-rad-loop)
      acc)))

;; ── Strip underscores from numeric literals ───────────────────────

(define rb-strip-underscores
  (fn (s)
    (let ((n (len s)) (i 0) (parts (list)))
      (define rb-su-loop
        (fn ()
          (when (< i n)
            (do
              (let ((c (substring s i (+ i 1))))
                (when (not (= c "_"))
                  (append! parts c)))
              (set! i (+ i 1))
              (rb-su-loop)))))
      (rb-su-loop)
      (join "" parts))))

;; ── Main tokenizer ────────────────────────────────────────────────

;; Tokenizes the string src and returns the list of tokens, always
;; terminated by an "eof" token. Line and column numbers start at 1.
;; Characters that match no rule are skipped without producing a token.
(define rb-tokenize
  (fn (src)
    (let ((tokens (list)) (pos 0) (line 1) (col 1) (src-len (len src)))

      ;; Character at pos+offset as a one-character string, or nil past
      ;; the end of input.
      (define rb-peek
        (fn (offset)
          (if (< (+ pos offset) src-len)
            (substring src (+ pos offset) (+ pos offset 1))
            nil)))

      (define rb-cur (fn () (rb-peek 0)))

      ;; Consumes one character and updates line/col. A no-op at end of
      ;; input, so pos never runs past src-len and the eof token keeps
      ;; an accurate position.
      (define rb-advance!
        (fn ()
          (when (< pos src-len)
            (let ((c (rb-cur)))
              (set! pos (+ pos 1))
              (if (= c "\n")
                (do (set! line (+ line 1)) (set! col 1))
                (set! col (+ col 1)))))))

      (define rb-advance-n!
        (fn (n)
          (when (> n 0)
            (do (rb-advance!) (rb-advance-n! (- n 1))))))

      (define rb-push!
        (fn (type value tok-line tok-col)
          (append! tokens (rb-make-token type value tok-line tok-col))))

      ;; Consumes characters while pred holds; returns the consumed text.
      (define rb-read-while
        (fn (pred)
          (let ((start pos))
            (define rb-rw-loop
              (fn ()
                (when (and (< pos src-len) (pred (rb-cur)))
                  (do (rb-advance!) (rb-rw-loop)))))
            (rb-rw-loop)
            (substring src start pos))))

      ;; Skips to, but not past, the next "\n".
      (define rb-skip-line-comment!
        (fn ()
          (define rb-slc-loop
            (fn ()
              (when (and (< pos src-len) (not (= (rb-cur) "\n")))
                (do (rb-advance!) (rb-slc-loop)))))
          (rb-slc-loop)))

      ;; Called with the cursor on a backslash. Consumes the escape and
      ;; returns its replacement text. Unknown escapes are returned
      ;; verbatim, backslash included.
      (define rb-read-escape
        (fn ()
          (rb-advance!)
          (let ((c (rb-cur)))
            (cond
              ;; Backslash was the last character of the input.
              ((nil? c) "\\")
              ((= c "n") (do (rb-advance!) "\n"))
              ((= c "t") (do (rb-advance!) "\t"))
              ((= c "r") (do (rb-advance!) "\r"))
              ((= c "\\") (do (rb-advance!) "\\"))
              ((= c "'") (do (rb-advance!) "'"))
              ((= c "\"") (do (rb-advance!) "\""))
              ((= c "a") (do (rb-advance!) (char-from-code 7)))
              ((= c "b") (do (rb-advance!) (char-from-code 8)))
              ((= c "f") (do (rb-advance!) (char-from-code 12)))
              ((= c "v") (do (rb-advance!) (char-from-code 11)))
              ((= c "e") (do (rb-advance!) (char-from-code 27)))
              ((= c "s") (do (rb-advance!) " "))
              ((= c "0") (do (rb-advance!) (char-from-code 0)))
              (:else (do (rb-advance!) (str "\\" c)))))))

      ;; Reads a single-quoted string starting at the opening quote.
      ;; Only \\ and \' are treated as escapes. An unterminated string
      ;; ends at end of input.
      (define rb-read-sq-string
        (fn ()
          (let ((parts (list)))
            (rb-advance!)
            (define rb-sq-loop
              (fn ()
                (cond
                  ((>= pos src-len) nil)
                  ((= (rb-cur) "'") (rb-advance!))
                  ((and (= (rb-cur) "\\")
                        (let ((n (rb-peek 1))) (or (= n "\\") (= n "'"))))
                   (do
                     (rb-advance!)
                     (append! parts (rb-cur))
                     (rb-advance!)
                     (rb-sq-loop)))
                  (:else
                   (do (append! parts (rb-cur)) (rb-advance!) (rb-sq-loop))))))
            (rb-sq-loop)
            (join "" parts))))

      ;; Reads a double-quoted string starting at the opening quote.
      ;; Escapes are decoded. "#{...}" interpolations are copied through
      ;; verbatim, braces included, and nested braces are balanced by
      ;; depth counting.
      (define rb-read-dq-string
        (fn ()
          (let ((parts (list)))
            (rb-advance!)
            (define rb-dq-loop
              (fn ()
                (cond
                  ((>= pos src-len) nil)
                  ((= (rb-cur) "\"") (rb-advance!))
                  ((= (rb-cur) "\\")
                   (do (append! parts (rb-read-escape)) (rb-dq-loop)))
                  ((and (= (rb-cur) "#") (= (rb-peek 1) "{"))
                   (do
                     (append! parts "#{")
                     (rb-advance-n! 2)
                     (let ((depth 1))
                       (define rb-interp-inner
                         (fn ()
                           (when (and (< pos src-len) (> depth 0))
                             (do
                               (let ((c (rb-cur)))
                                 (cond
                                   ((= c "{")
                                    (do
                                      (set! depth (+ depth 1))
                                      (append! parts c)
                                      (rb-advance!)))
                                   ((= c "}")
                                    (do
                                      (set! depth (- depth 1))
                                      ;; The brace that closes the
                                      ;; interpolation is left for the
                                      ;; code after this loop.
                                      (when (> depth 0)
                                        (do (append! parts c) (rb-advance!)))))
                                   (:else
                                    (do (append! parts c) (rb-advance!)))))
                               (rb-interp-inner)))))
                       (rb-interp-inner))
                     (when (= (rb-cur) "}")
                       (do (append! parts "}") (rb-advance!)))
                     (rb-dq-loop)))
                  (:else
                   (do (append! parts (rb-cur)) (rb-advance!) (rb-dq-loop))))))
            (rb-dq-loop)
            (join "" parts))))

      ;; Reads the body of a %w / %W / %i / %I literal starting at the
      ;; "%". Returns the list of whitespace-separated words. Bracket
      ;; delimiters pair up; any other delimiter closes with itself.
      (define rb-read-percent-words
        (fn ()
          (rb-advance-n! 2)
          (let ((open-ch (rb-cur)))
            (let ((close-ch (cond
                              ((= open-ch "[") "]")
                              ((= open-ch "(") ")")
                              ((= open-ch "{") "}")
                              ((= open-ch "<") ">")
                              (:else open-ch))))
              (rb-advance!)
              (let ((items (list)))
                (define rb-pw-skip
                  (fn ()
                    (when (and (< pos src-len)
                               (or (rb-space? (rb-cur)) (= (rb-cur) "\n")))
                      (do (rb-advance!) (rb-pw-skip)))))
                (define rb-pw-word
                  (fn (wparts)
                    (if (or (>= pos src-len)
                            (rb-space? (rb-cur))
                            (= (rb-cur) "\n")
                            (= (rb-cur) close-ch))
                      (append! items (join "" wparts))
                      (do
                        (append! wparts (rb-cur))
                        (rb-advance!)
                        (rb-pw-word wparts)))))
                (define rb-pw-loop
                  (fn ()
                    (rb-pw-skip)
                    (when (and (< pos src-len) (not (= (rb-cur) close-ch)))
                      (do (rb-pw-word (list)) (rb-pw-loop)))))
                (rb-pw-loop)
                (when (= (rb-cur) close-ch) (rb-advance!))
                items)))))

      ;; Reads an identifier, including one trailing "?" and/or "!"
      ;; unless that character starts "?=", "!=" or "!~".
      (define rb-read-ident-word
        (fn ()
          (let ((start pos))
            (rb-read-while rb-ident-cont?)
            (when (and (= (rb-cur) "?") (not (= (rb-peek 1) "=")))
              (rb-advance!))
            (when (and (= (rb-cur) "!")
                       (not (or (= (rb-peek 1) "=") (= (rb-peek 1) "~"))))
              (rb-advance!))
            (substring src start pos))))

      ;; Reads the digits of a 0b/0o/0x literal (prefix already
      ;; consumed). "_" separators are accepted and removed, as they are
      ;; for decimal literals.
      (define rb-read-radix-digits
        (fn (digit?)
          (rb-strip-underscores
            (rb-read-while (fn (c) (or (digit? c) (= c "_")))))))

      ;; Reads a numeric literal at the cursor and pushes an "int" token
      ;; (value: parsed integer) or a "float" token (value: literal text
      ;; with underscores removed).
      (define rb-read-number!
        (fn (tok-line tok-col)
          (let ((start pos))
            (cond
              ((and (= (rb-cur) "0")
                    (let ((p (rb-peek 1))) (or (= p "b") (= p "B"))))
               (do
                 (rb-advance-n! 2)
                 (let ((bin-str (rb-read-radix-digits rb-binary-digit?)))
                   (rb-push! "int" (rb-parse-radix bin-str 2) tok-line tok-col))))
              ((and (= (rb-cur) "0")
                    (let ((p (rb-peek 1))) (or (= p "o") (= p "O"))))
               (do
                 (rb-advance-n! 2)
                 (let ((oct-str (rb-read-radix-digits rb-octal-digit?)))
                   (rb-push! "int" (rb-parse-radix oct-str 8) tok-line tok-col))))
              ((and (= (rb-cur) "0")
                    (let ((p (rb-peek 1))) (or (= p "x") (= p "X"))))
               (do
                 (rb-advance-n! 2)
                 (let ((hex-str (rb-read-radix-digits rb-hex-digit?)))
                   (rb-push! "int" (rb-parse-radix hex-str 16) tok-line tok-col))))
              (:else
               (do
                 (rb-read-while (fn (c) (or (rb-digit? c) (= c "_"))))
                 (let ((is-float false))
                   ;; A "." only starts a fraction when a digit follows,
                   ;; so `1..2` and `1.foo` stay integers.
                   (when (and (= (rb-cur) ".") (rb-digit? (rb-peek 1)))
                     (do
                       (set! is-float true)
                       (rb-advance!)
                       (rb-read-while (fn (c) (or (rb-digit? c) (= c "_"))))))
                   ;; Take an exponent only when it has digits; a bare
                   ;; "e" is left for the next token.
                   (when (and (or (= (rb-cur) "e") (= (rb-cur) "E"))
                              (or (rb-digit? (rb-peek 1))
                                  (and (or (= (rb-peek 1) "+") (= (rb-peek 1) "-"))
                                       (rb-digit? (rb-peek 2)))))
                     (do
                       (set! is-float true)
                       (rb-advance!)
                       (when (or (= (rb-cur) "+") (= (rb-cur) "-"))
                         (rb-advance!))
                       (rb-read-while rb-digit?)))
                   (let ((num-str (rb-strip-underscores (substring src start pos))))
                     (if is-float
                       (rb-push! "float" num-str tok-line tok-col)
                       (rb-push! "int" (parse-int num-str) tok-line tok-col))))))))))

      ;; Reads an operator at the cursor, longest match first. A lone
      ;; "|" becomes a "pipe" token; everything else is an "op" token.
      (define rb-read-op!
        (fn (tok-line tok-col)
          (let ((c0 (rb-cur)) (c1 (rb-peek 1)) (c2 (rb-peek 2)))
            ;; Consume n characters and push them as the operator text.
            (define rb-op-emit!
              (fn (n text)
                (do
                  (rb-advance-n! n)
                  (rb-push! "op" text tok-line tok-col))))
            (cond
              ;; three-character operators
              ((and (= c0 "<") (= c1 "=") (= c2 ">")) (rb-op-emit! 3 "<=>"))
              ((and (= c0 "=") (= c1 "=") (= c2 "=")) (rb-op-emit! 3 "==="))
              ((and (= c0 "*") (= c1 "*") (= c2 "=")) (rb-op-emit! 3 "**="))
              ((and (= c0 "<") (= c1 "<") (= c2 "=")) (rb-op-emit! 3 "<<="))
              ((and (= c0 ">") (= c1 ">") (= c2 "=")) (rb-op-emit! 3 ">>="))
              ((and (= c0 "&") (= c1 "&") (= c2 "=")) (rb-op-emit! 3 "&&="))
              ((and (= c0 "|") (= c1 "|") (= c2 "=")) (rb-op-emit! 3 "||="))
              ;; two-character operators
              ((and (= c0 "*") (= c1 "*")) (rb-op-emit! 2 "**"))
              ((and (= c0 "=") (= c1 "=")) (rb-op-emit! 2 "=="))
              ((and (= c0 "!") (= c1 "=")) (rb-op-emit! 2 "!="))
              ((and (= c0 "<") (= c1 "=")) (rb-op-emit! 2 "<="))
              ((and (= c0 ">") (= c1 "=")) (rb-op-emit! 2 ">="))
              ((and (= c0 "=") (= c1 "~")) (rb-op-emit! 2 "=~"))
              ((and (= c0 "!") (= c1 "~")) (rb-op-emit! 2 "!~"))
              ((and (= c0 "<") (= c1 "<")) (rb-op-emit! 2 "<<"))
              ((and (= c0 ">") (= c1 ">")) (rb-op-emit! 2 ">>"))
              ((and (= c0 "&") (= c1 "&")) (rb-op-emit! 2 "&&"))
              ((and (= c0 "|") (= c1 "|")) (rb-op-emit! 2 "||"))
              ((and (= c0 "+") (= c1 "=")) (rb-op-emit! 2 "+="))
              ((and (= c0 "-") (= c1 "=")) (rb-op-emit! 2 "-="))
              ((and (= c0 "*") (= c1 "=")) (rb-op-emit! 2 "*="))
              ((and (= c0 "/") (= c1 "=")) (rb-op-emit! 2 "/="))
              ((and (= c0 "%") (= c1 "=")) (rb-op-emit! 2 "%="))
              ((and (= c0 "&") (= c1 "=")) (rb-op-emit! 2 "&="))
              ((and (= c0 "|") (= c1 "=")) (rb-op-emit! 2 "|="))
              ((and (= c0 "^") (= c1 "=")) (rb-op-emit! 2 "^="))
              ((and (= c0 "-") (= c1 ">")) (rb-op-emit! 2 "->"))
              ((and (= c0 "=") (= c1 ">")) (rb-op-emit! 2 "=>"))
              ;; single characters
              ((= c0 "|")
               (do (rb-advance!) (rb-push! "pipe" "|" tok-line tok-col)))
              (:else (rb-op-emit! 1 c0))))))

      ;; Pushes a one-character punctuation token at the cursor, then
      ;; consumes the character.
      (define rb-punct!
        (fn (type text)
          (do
            (rb-push! type text line col)
            (rb-advance!))))

      ;; Main loop: dispatches on the current character until end of
      ;; input. Each clause captures line/col before consuming anything
      ;; so tokens carry their start position.
      (define rb-scan!
        (fn ()
          (cond
            ((>= pos src-len) nil)
            ((rb-space? (rb-cur)) (do (rb-advance!) (rb-scan!)))
            ((= (rb-cur) "#") (do (rb-skip-line-comment!) (rb-scan!)))
            ((= (rb-cur) "\n")
             (do
               (let ((l line) (c col))
                 (rb-advance!)
                 (rb-push! "newline" nil l c))
               (rb-scan!)))
            ((rb-digit? (rb-cur))
             (do
               (let ((l line) (c col))
                 (rb-read-number! l c))
               (rb-scan!)))
            ((rb-ident-start? (rb-cur))
             (do
               (let ((l line) (c col))
                 (let ((w (rb-read-ident-word)))
                   (if (rb-keyword? w)
                     (rb-push! "keyword" w l c)
                     (if (rb-upper? (substring w 0 1))
                       (rb-push! "const" w l c)
                       (rb-push! "ident" w l c)))))
               (rb-scan!)))
            ((= (rb-cur) "@")
             (do
               (let ((l line) (c col))
                 (if (= (rb-peek 1) "@")
                   (do
                     (rb-advance-n! 2)
                     (let ((name (rb-read-while rb-ident-cont?)))
                       (rb-push! "cvar" (str "@@" name) l c)))
                   (do
                     (rb-advance!)
                     (let ((name (rb-read-while rb-ident-cont?)))
                       (rb-push! "ivar" (str "@" name) l c)))))
               (rb-scan!)))
            ((= (rb-cur) "$")
             (do
               (let ((l line) (c col))
                 (rb-advance!)
                 (let ((name (rb-read-while rb-ident-cont?)))
                   (rb-push! "gvar" (str "$" name) l c)))
               (rb-scan!)))
            ((= (rb-cur) "\"")
             (do
               (let ((l line) (c col))
                 (rb-push! "string" (rb-read-dq-string) l c))
               (rb-scan!)))
            ((= (rb-cur) "'")
             (do
               (let ((l line) (c col))
                 (rb-push! "string" (rb-read-sq-string) l c))
               (rb-scan!)))
            ((and (= (rb-cur) ":") (= (rb-peek 1) ":"))
             (do
               (let ((l line) (c col))
                 (rb-advance-n! 2)
                 (rb-push! "dcolon" "::" l c))
               (rb-scan!)))
            ((= (rb-cur) ":")
             (do
               (let ((l line) (c col))
                 (rb-advance!)
                 (cond
                   ((= (rb-cur) "\"")
                    (rb-push! "symbol" (rb-read-dq-string) l c))
                   ((= (rb-cur) "'")
                    (rb-push! "symbol" (rb-read-sq-string) l c))
                   ((rb-ident-start? (rb-cur))
                    (let ((name (rb-read-ident-word)))
                      (rb-push! "symbol" name l c)))
                   (:else (rb-push! "colon" ":" l c))))
               (rb-scan!)))
            ;; %w / %W / %i / %I literal, only when a real delimiter
            ;; follows the kind letter. Otherwise "%" falls through to
            ;; the operator clause below.
            ((and (= (rb-cur) "%")
                  (let ((p (rb-peek 1)))
                    (or (= p "w") (= p "W") (= p "i") (= p "I")))
                  (rb-percent-delim? (rb-peek 2)))
             (do
               (let ((l line) (c col))
                 (let ((kind (rb-peek 1)))
                   (let ((items (rb-read-percent-words)))
                     (if (or (= kind "i") (= kind "I"))
                       (rb-push! "isymbols" items l c)
                       (rb-push! "words" items l c)))))
               (rb-scan!)))
            ((= (rb-cur) ".")
             (do
               (let ((l line) (c col))
                 (cond
                   ((and (= (rb-peek 1) ".") (= (rb-peek 2) "."))
                    (do (rb-advance-n! 3) (rb-push! "dotdotdot" "..." l c)))
                   ((= (rb-peek 1) ".")
                    (do (rb-advance-n! 2) (rb-push! "dotdot" ".." l c)))
                   (:else
                    (do (rb-advance!) (rb-push! "dot" "." l c)))))
               (rb-scan!)))
            ((= (rb-cur) ",") (do (rb-punct! "comma" ",") (rb-scan!)))
            ((= (rb-cur) ";") (do (rb-punct! "semi" ";") (rb-scan!)))
            ((= (rb-cur) "(") (do (rb-punct! "lparen" "(") (rb-scan!)))
            ((= (rb-cur) ")") (do (rb-punct! "rparen" ")") (rb-scan!)))
            ((= (rb-cur) "[") (do (rb-punct! "lbracket" "[") (rb-scan!)))
            ((= (rb-cur) "]") (do (rb-punct! "rbracket" "]") (rb-scan!)))
            ((= (rb-cur) "{") (do (rb-punct! "lbrace" "{") (rb-scan!)))
            ((= (rb-cur) "}") (do (rb-punct! "rbrace" "}") (rb-scan!)))
            ((or (= (rb-cur) "+") (= (rb-cur) "-") (= (rb-cur) "*")
                 (= (rb-cur) "/") (= (rb-cur) "%") (= (rb-cur) "=")
                 (= (rb-cur) "!") (= (rb-cur) "<") (= (rb-cur) ">")
                 (= (rb-cur) "&") (= (rb-cur) "^") (= (rb-cur) "~")
                 (= (rb-cur) "|"))
             (do
               (let ((l line) (c col))
                 (rb-read-op! l c))
               (rb-scan!)))
            ;; Unrecognised character: skipped without producing a token.
            (:else (do (rb-advance!) (rb-scan!))))))

      (rb-scan!)
      (rb-push! "eof" nil line col)
      tokens)))