smalltalk: tokenizer + 63 tests
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Has been cancelled
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Has been cancelled
This commit is contained in:
366
lib/smalltalk/tokenizer.sx
Normal file
366
lib/smalltalk/tokenizer.sx
Normal file
@@ -0,0 +1,366 @@
|
||||
;; Smalltalk tokenizer.
|
||||
;;
|
||||
;; Token types:
|
||||
;; ident identifier (foo, Foo, _x)
|
||||
;; keyword selector keyword (foo:) — value is "foo:" with the colon
|
||||
;; binary binary selector chars run together (+, ==, ->, <=, ~=, ...)
|
||||
;; number integer or float; radix integers like 16rFF supported
|
||||
;; string 'hello''world' style
|
||||
;; char $c
|
||||
;; symbol #foo, #foo:bar:, #+, #'with spaces'
|
||||
;; array-open #(
|
||||
;; byte-array-open #[
|
||||
;; lparen rparen lbracket rbracket lbrace rbrace
|
||||
;; period semi bar caret colon assign bang
|
||||
;; eof
|
||||
;;
|
||||
;; Comments "…" are skipped.
|
||||
|
||||
(define st-make-token (fn (type value pos) {:type type :value value :pos pos}))
|
||||
|
||||
(define st-digit? (fn (c) (and (not (= c nil)) (>= c "0") (<= c "9"))))
|
||||
|
||||
(define
|
||||
st-letter?
|
||||
(fn
|
||||
(c)
|
||||
(and
|
||||
(not (= c nil))
|
||||
(or (and (>= c "a") (<= c "z")) (and (>= c "A") (<= c "Z"))))))
|
||||
|
||||
(define st-ident-start? (fn (c) (or (st-letter? c) (= c "_"))))
|
||||
|
||||
(define st-ident-char? (fn (c) (or (st-ident-start? c) (st-digit? c))))
|
||||
|
||||
(define st-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r"))))
|
||||
|
||||
(define
|
||||
st-binary-chars
|
||||
(list "+" "-" "*" "/" "\\" "~" "<" ">" "=" "@" "%" "&" "?" ","))
|
||||
|
||||
(define
|
||||
st-binary-char?
|
||||
(fn (c) (and (not (= c nil)) (contains? st-binary-chars c))))
|
||||
|
||||
(define
|
||||
st-radix-digit?
|
||||
(fn
|
||||
(c)
|
||||
(and
|
||||
(not (= c nil))
|
||||
(or (st-digit? c) (and (>= c "A") (<= c "Z"))))))
|
||||
|
||||
(define
|
||||
st-tokenize
|
||||
(fn
|
||||
(src)
|
||||
(let
|
||||
((tokens (list)) (pos 0) (src-len (len src)))
|
||||
(define
|
||||
pk
|
||||
(fn
|
||||
(offset)
|
||||
(if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil)))
|
||||
(define cur (fn () (pk 0)))
|
||||
(define advance! (fn (n) (set! pos (+ pos n))))
|
||||
(define
|
||||
push!
|
||||
(fn
|
||||
(type value start)
|
||||
(append! tokens (st-make-token type value start))))
|
||||
(define
|
||||
skip-comment!
|
||||
(fn
|
||||
()
|
||||
(cond
|
||||
((>= pos src-len) nil)
|
||||
((= (cur) "\"") (advance! 1))
|
||||
(else (begin (advance! 1) (skip-comment!))))))
|
||||
(define
|
||||
skip-ws!
|
||||
(fn
|
||||
()
|
||||
(cond
|
||||
((>= pos src-len) nil)
|
||||
((st-ws? (cur)) (begin (advance! 1) (skip-ws!)))
|
||||
((= (cur) "\"") (begin (advance! 1) (skip-comment!) (skip-ws!)))
|
||||
(else nil))))
|
||||
(define
|
||||
read-ident-chars!
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(and (< pos src-len) (st-ident-char? (cur)))
|
||||
(begin (advance! 1) (read-ident-chars!)))))
|
||||
(define
|
||||
read-decimal-digits!
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(and (< pos src-len) (st-digit? (cur)))
|
||||
(begin (advance! 1) (read-decimal-digits!)))))
|
||||
(define
|
||||
read-radix-digits!
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(and (< pos src-len) (st-radix-digit? (cur)))
|
||||
(begin (advance! 1) (read-radix-digits!)))))
|
||||
(define
|
||||
read-exp-part!
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(and
|
||||
(< pos src-len)
|
||||
(or (= (cur) "e") (= (cur) "E"))
|
||||
(let
|
||||
((p1 (pk 1)) (p2 (pk 2)))
|
||||
(or
|
||||
(st-digit? p1)
|
||||
(and (or (= p1 "+") (= p1 "-")) (st-digit? p2)))))
|
||||
(begin
|
||||
(advance! 1)
|
||||
(when
|
||||
(and (< pos src-len) (or (= (cur) "+") (= (cur) "-")))
|
||||
(advance! 1))
|
||||
(read-decimal-digits!)))))
|
||||
(define
|
||||
read-number
|
||||
(fn
|
||||
(start)
|
||||
(begin
|
||||
(read-decimal-digits!)
|
||||
(cond
|
||||
((and (< pos src-len) (= (cur) "r"))
|
||||
(let
|
||||
((base-str (slice src start pos)))
|
||||
(begin
|
||||
(advance! 1)
|
||||
(let
|
||||
((rstart pos))
|
||||
(begin
|
||||
(read-radix-digits!)
|
||||
(let
|
||||
((digits (slice src rstart pos)))
|
||||
{:radix (parse-number base-str)
|
||||
:digits digits
|
||||
:value (parse-radix base-str digits)
|
||||
:kind "radix"}))))))
|
||||
((and
|
||||
(< pos src-len)
|
||||
(= (cur) ".")
|
||||
(st-digit? (pk 1)))
|
||||
(begin
|
||||
(advance! 1)
|
||||
(read-decimal-digits!)
|
||||
(read-exp-part!)
|
||||
(parse-number (slice src start pos))))
|
||||
(else
|
||||
(begin
|
||||
(read-exp-part!)
|
||||
(parse-number (slice src start pos))))))))
|
||||
(define
|
||||
parse-radix
|
||||
(fn
|
||||
(base-str digits)
|
||||
(let
|
||||
((base (parse-number base-str))
|
||||
(chars digits)
|
||||
(n-len (len digits))
|
||||
(idx 0)
|
||||
(acc 0))
|
||||
(begin
|
||||
(define
|
||||
rd-loop
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(< idx n-len)
|
||||
(let
|
||||
((c (nth chars idx)))
|
||||
(let
|
||||
((d (cond
|
||||
((and (>= c "0") (<= c "9")) (- (char-code c) 48))
|
||||
((and (>= c "A") (<= c "Z")) (- (char-code c) 55))
|
||||
(else 0))))
|
||||
(begin
|
||||
(set! acc (+ (* acc base) d))
|
||||
(set! idx (+ idx 1))
|
||||
(rd-loop)))))))
|
||||
(rd-loop)
|
||||
acc))))
|
||||
(define
|
||||
read-string
|
||||
(fn
|
||||
()
|
||||
(let
|
||||
((chars (list)))
|
||||
(begin
|
||||
(advance! 1)
|
||||
(define
|
||||
loop
|
||||
(fn
|
||||
()
|
||||
(cond
|
||||
((>= pos src-len) nil)
|
||||
((= (cur) "'")
|
||||
(cond
|
||||
((= (pk 1) "'")
|
||||
(begin
|
||||
(append! chars "'")
|
||||
(advance! 2)
|
||||
(loop)))
|
||||
(else (advance! 1))))
|
||||
(else
|
||||
(begin (append! chars (cur)) (advance! 1) (loop))))))
|
||||
(loop)
|
||||
(join "" chars)))))
|
||||
(define
|
||||
read-binary-run!
|
||||
(fn
|
||||
()
|
||||
(let
|
||||
((start pos))
|
||||
(begin
|
||||
(define
|
||||
bin-loop
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(and (< pos src-len) (st-binary-char? (cur)))
|
||||
(begin (advance! 1) (bin-loop)))))
|
||||
(bin-loop)
|
||||
(slice src start pos)))))
|
||||
(define
|
||||
read-symbol
|
||||
(fn
|
||||
(start)
|
||||
(cond
|
||||
;; Quoted symbol: #'whatever'
|
||||
((= (cur) "'")
|
||||
(let ((s (read-string))) (push! "symbol" s start)))
|
||||
;; Binary-char symbol: #+, #==, #->, #|
|
||||
((or (st-binary-char? (cur)) (= (cur) "|"))
|
||||
(let ((b (read-binary-run!)))
|
||||
(cond
|
||||
((= b "")
|
||||
;; lone | wasn't binary; consume it
|
||||
(begin (advance! 1) (push! "symbol" "|" start)))
|
||||
(else (push! "symbol" b start)))))
|
||||
;; Identifier or keyword chain: #foo, #foo:bar:
|
||||
((st-ident-start? (cur))
|
||||
(let ((id-start pos))
|
||||
(begin
|
||||
(read-ident-chars!)
|
||||
(define
|
||||
kw-loop
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(and (< pos src-len) (= (cur) ":"))
|
||||
(begin
|
||||
(advance! 1)
|
||||
(when
|
||||
(and (< pos src-len) (st-ident-start? (cur)))
|
||||
(begin (read-ident-chars!) (kw-loop)))))))
|
||||
(kw-loop)
|
||||
(push! "symbol" (slice src id-start pos) start))))
|
||||
(else
|
||||
(error
|
||||
(str "st-tokenize: bad symbol at " pos))))))
|
||||
(define
|
||||
step
|
||||
(fn
|
||||
()
|
||||
(begin
|
||||
(skip-ws!)
|
||||
(when
|
||||
(< pos src-len)
|
||||
(let
|
||||
((start pos) (c (cur)))
|
||||
(cond
|
||||
;; Identifier or keyword
|
||||
((st-ident-start? c)
|
||||
(begin
|
||||
(read-ident-chars!)
|
||||
(let
|
||||
((word (slice src start pos)))
|
||||
(cond
|
||||
;; ident immediately followed by ':' (and not ':=') => keyword
|
||||
((and
|
||||
(< pos src-len)
|
||||
(= (cur) ":")
|
||||
(not (= (pk 1) "=")))
|
||||
(begin
|
||||
(advance! 1)
|
||||
(push!
|
||||
"keyword"
|
||||
(str word ":")
|
||||
start)))
|
||||
(else (push! "ident" word start))))
|
||||
(step)))
|
||||
;; Number
|
||||
((st-digit? c)
|
||||
(let
|
||||
((v (read-number start)))
|
||||
(begin (push! "number" v start) (step))))
|
||||
;; String
|
||||
((= c "'")
|
||||
(let
|
||||
((s (read-string)))
|
||||
(begin (push! "string" s start) (step))))
|
||||
;; Character literal
|
||||
((= c "$")
|
||||
(cond
|
||||
((>= (+ pos 1) src-len)
|
||||
(error (str "st-tokenize: $ at end of input")))
|
||||
(else
|
||||
(begin
|
||||
(advance! 1)
|
||||
(push! "char" (cur) start)
|
||||
(advance! 1)
|
||||
(step)))))
|
||||
;; Symbol or array literal
|
||||
((= c "#")
|
||||
(cond
|
||||
((= (pk 1) "(")
|
||||
(begin (advance! 2) (push! "array-open" "#(" start) (step)))
|
||||
((= (pk 1) "[")
|
||||
(begin (advance! 2) (push! "byte-array-open" "#[" start) (step)))
|
||||
(else
|
||||
(begin (advance! 1) (read-symbol start) (step)))))
|
||||
;; Assignment := or bare colon
|
||||
((= c ":")
|
||||
(cond
|
||||
((= (pk 1) "=")
|
||||
(begin (advance! 2) (push! "assign" ":=" start) (step)))
|
||||
(else
|
||||
(begin (advance! 1) (push! "colon" ":" start) (step)))))
|
||||
;; Single-char structural punctuation
|
||||
((= c "(") (begin (advance! 1) (push! "lparen" "(" start) (step)))
|
||||
((= c ")") (begin (advance! 1) (push! "rparen" ")" start) (step)))
|
||||
((= c "[") (begin (advance! 1) (push! "lbracket" "[" start) (step)))
|
||||
((= c "]") (begin (advance! 1) (push! "rbracket" "]" start) (step)))
|
||||
((= c "{") (begin (advance! 1) (push! "lbrace" "{" start) (step)))
|
||||
((= c "}") (begin (advance! 1) (push! "rbrace" "}" start) (step)))
|
||||
((= c ".") (begin (advance! 1) (push! "period" "." start) (step)))
|
||||
((= c ";") (begin (advance! 1) (push! "semi" ";" start) (step)))
|
||||
((= c "|") (begin (advance! 1) (push! "bar" "|" start) (step)))
|
||||
((= c "^") (begin (advance! 1) (push! "caret" "^" start) (step)))
|
||||
((= c "!") (begin (advance! 1) (push! "bang" "!" start) (step)))
|
||||
;; Binary selector run
|
||||
((st-binary-char? c)
|
||||
(let
|
||||
((b (read-binary-run!)))
|
||||
(begin (push! "binary" b start) (step))))
|
||||
(else
|
||||
(error
|
||||
(str
|
||||
"st-tokenize: unexpected char "
|
||||
c
|
||||
" at "
|
||||
pos)))))))))
|
||||
(step)
|
||||
(push! "eof" nil pos)
|
||||
tokens)))
|
||||
Reference in New Issue
Block a user