Files
rose-ash/lib/smalltalk/tokenizer.sx
giles 4e7d2183ad
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Has been cancelled
smalltalk: tokenizer + 63 tests
2026-04-25 00:19:23 +00:00

367 lines
12 KiB
Plaintext

;; Smalltalk tokenizer.
;;
;; Token types:
;; ident identifier (foo, Foo, _x)
;; keyword selector keyword (foo:) — value is "foo:" with the colon
;; binary binary selector chars run together (+, ==, ->, <=, ~=, ...)
;; number integer or float; radix integers like 16rFF supported
;; string 'hello''world' style
;; char $c
;; symbol #foo, #foo:bar:, #+, #'with spaces'
;; array-open #(
;; byte-array-open #[
;; lparen rparen lbracket rbracket lbrace rbrace
;; period semi bar caret colon assign bang
;; eof
;;
;; Comments "…" are skipped.
(define st-make-token (fn (type value pos) {:type type :value value :pos pos}))
(define st-digit? (fn (c) (and (not (= c nil)) (>= c "0") (<= c "9"))))
(define
st-letter?
(fn
(c)
(and
(not (= c nil))
(or (and (>= c "a") (<= c "z")) (and (>= c "A") (<= c "Z"))))))
(define st-ident-start? (fn (c) (or (st-letter? c) (= c "_"))))
(define st-ident-char? (fn (c) (or (st-ident-start? c) (st-digit? c))))
(define st-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r"))))
(define
st-binary-chars
(list "+" "-" "*" "/" "\\" "~" "<" ">" "=" "@" "%" "&" "?" ","))
(define
st-binary-char?
(fn (c) (and (not (= c nil)) (contains? st-binary-chars c))))
(define
st-radix-digit?
(fn
(c)
(and
(not (= c nil))
(or (st-digit? c) (and (>= c "A") (<= c "Z"))))))
(define
st-tokenize
(fn
(src)
(let
((tokens (list)) (pos 0) (src-len (len src)))
(define
pk
(fn
(offset)
(if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil)))
(define cur (fn () (pk 0)))
(define advance! (fn (n) (set! pos (+ pos n))))
(define
push!
(fn
(type value start)
(append! tokens (st-make-token type value start))))
(define
skip-comment!
(fn
()
(cond
((>= pos src-len) nil)
((= (cur) "\"") (advance! 1))
(else (begin (advance! 1) (skip-comment!))))))
(define
skip-ws!
(fn
()
(cond
((>= pos src-len) nil)
((st-ws? (cur)) (begin (advance! 1) (skip-ws!)))
((= (cur) "\"") (begin (advance! 1) (skip-comment!) (skip-ws!)))
(else nil))))
(define
read-ident-chars!
(fn
()
(when
(and (< pos src-len) (st-ident-char? (cur)))
(begin (advance! 1) (read-ident-chars!)))))
(define
read-decimal-digits!
(fn
()
(when
(and (< pos src-len) (st-digit? (cur)))
(begin (advance! 1) (read-decimal-digits!)))))
(define
read-radix-digits!
(fn
()
(when
(and (< pos src-len) (st-radix-digit? (cur)))
(begin (advance! 1) (read-radix-digits!)))))
(define
read-exp-part!
(fn
()
(when
(and
(< pos src-len)
(or (= (cur) "e") (= (cur) "E"))
(let
((p1 (pk 1)) (p2 (pk 2)))
(or
(st-digit? p1)
(and (or (= p1 "+") (= p1 "-")) (st-digit? p2)))))
(begin
(advance! 1)
(when
(and (< pos src-len) (or (= (cur) "+") (= (cur) "-")))
(advance! 1))
(read-decimal-digits!)))))
(define
read-number
(fn
(start)
(begin
(read-decimal-digits!)
(cond
((and (< pos src-len) (= (cur) "r"))
(let
((base-str (slice src start pos)))
(begin
(advance! 1)
(let
((rstart pos))
(begin
(read-radix-digits!)
(let
((digits (slice src rstart pos)))
{:radix (parse-number base-str)
:digits digits
:value (parse-radix base-str digits)
:kind "radix"}))))))
((and
(< pos src-len)
(= (cur) ".")
(st-digit? (pk 1)))
(begin
(advance! 1)
(read-decimal-digits!)
(read-exp-part!)
(parse-number (slice src start pos))))
(else
(begin
(read-exp-part!)
(parse-number (slice src start pos))))))))
(define
parse-radix
(fn
(base-str digits)
(let
((base (parse-number base-str))
(chars digits)
(n-len (len digits))
(idx 0)
(acc 0))
(begin
(define
rd-loop
(fn
()
(when
(< idx n-len)
(let
((c (nth chars idx)))
(let
((d (cond
((and (>= c "0") (<= c "9")) (- (char-code c) 48))
((and (>= c "A") (<= c "Z")) (- (char-code c) 55))
(else 0))))
(begin
(set! acc (+ (* acc base) d))
(set! idx (+ idx 1))
(rd-loop)))))))
(rd-loop)
acc))))
(define
read-string
(fn
()
(let
((chars (list)))
(begin
(advance! 1)
(define
loop
(fn
()
(cond
((>= pos src-len) nil)
((= (cur) "'")
(cond
((= (pk 1) "'")
(begin
(append! chars "'")
(advance! 2)
(loop)))
(else (advance! 1))))
(else
(begin (append! chars (cur)) (advance! 1) (loop))))))
(loop)
(join "" chars)))))
(define
read-binary-run!
(fn
()
(let
((start pos))
(begin
(define
bin-loop
(fn
()
(when
(and (< pos src-len) (st-binary-char? (cur)))
(begin (advance! 1) (bin-loop)))))
(bin-loop)
(slice src start pos)))))
(define
read-symbol
(fn
(start)
(cond
;; Quoted symbol: #'whatever'
((= (cur) "'")
(let ((s (read-string))) (push! "symbol" s start)))
;; Binary-char symbol: #+, #==, #->, #|
((or (st-binary-char? (cur)) (= (cur) "|"))
(let ((b (read-binary-run!)))
(cond
((= b "")
;; lone | wasn't binary; consume it
(begin (advance! 1) (push! "symbol" "|" start)))
(else (push! "symbol" b start)))))
;; Identifier or keyword chain: #foo, #foo:bar:
((st-ident-start? (cur))
(let ((id-start pos))
(begin
(read-ident-chars!)
(define
kw-loop
(fn
()
(when
(and (< pos src-len) (= (cur) ":"))
(begin
(advance! 1)
(when
(and (< pos src-len) (st-ident-start? (cur)))
(begin (read-ident-chars!) (kw-loop)))))))
(kw-loop)
(push! "symbol" (slice src id-start pos) start))))
(else
(error
(str "st-tokenize: bad symbol at " pos))))))
(define
step
(fn
()
(begin
(skip-ws!)
(when
(< pos src-len)
(let
((start pos) (c (cur)))
(cond
;; Identifier or keyword
((st-ident-start? c)
(begin
(read-ident-chars!)
(let
((word (slice src start pos)))
(cond
;; ident immediately followed by ':' (and not ':=') => keyword
((and
(< pos src-len)
(= (cur) ":")
(not (= (pk 1) "=")))
(begin
(advance! 1)
(push!
"keyword"
(str word ":")
start)))
(else (push! "ident" word start))))
(step)))
;; Number
((st-digit? c)
(let
((v (read-number start)))
(begin (push! "number" v start) (step))))
;; String
((= c "'")
(let
((s (read-string)))
(begin (push! "string" s start) (step))))
;; Character literal
((= c "$")
(cond
((>= (+ pos 1) src-len)
(error (str "st-tokenize: $ at end of input")))
(else
(begin
(advance! 1)
(push! "char" (cur) start)
(advance! 1)
(step)))))
;; Symbol or array literal
((= c "#")
(cond
((= (pk 1) "(")
(begin (advance! 2) (push! "array-open" "#(" start) (step)))
((= (pk 1) "[")
(begin (advance! 2) (push! "byte-array-open" "#[" start) (step)))
(else
(begin (advance! 1) (read-symbol start) (step)))))
;; Assignment := or bare colon
((= c ":")
(cond
((= (pk 1) "=")
(begin (advance! 2) (push! "assign" ":=" start) (step)))
(else
(begin (advance! 1) (push! "colon" ":" start) (step)))))
;; Single-char structural punctuation
((= c "(") (begin (advance! 1) (push! "lparen" "(" start) (step)))
((= c ")") (begin (advance! 1) (push! "rparen" ")" start) (step)))
((= c "[") (begin (advance! 1) (push! "lbracket" "[" start) (step)))
((= c "]") (begin (advance! 1) (push! "rbracket" "]" start) (step)))
((= c "{") (begin (advance! 1) (push! "lbrace" "{" start) (step)))
((= c "}") (begin (advance! 1) (push! "rbrace" "}" start) (step)))
((= c ".") (begin (advance! 1) (push! "period" "." start) (step)))
((= c ";") (begin (advance! 1) (push! "semi" ";" start) (step)))
((= c "|") (begin (advance! 1) (push! "bar" "|" start) (step)))
((= c "^") (begin (advance! 1) (push! "caret" "^" start) (step)))
((= c "!") (begin (advance! 1) (push! "bang" "!" start) (step)))
;; Binary selector run
((st-binary-char? c)
(let
((b (read-binary-run!)))
(begin (push! "binary" b start) (step))))
(else
(error
(str
"st-tokenize: unexpected char "
c
" at "
pos)))))))))
(step)
(push! "eof" nil pos)
tokens)))