Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Has been cancelled
367 lines
12 KiB
Plaintext
367 lines
12 KiB
Plaintext
;; Smalltalk tokenizer.
|
|
;;
|
|
;; Token types:
|
|
;; ident identifier (foo, Foo, _x)
|
|
;; keyword selector keyword (foo:) — value is "foo:" with the colon
|
|
;; binary binary selector chars run together (+, ==, ->, <=, ~=, ...)
|
|
;; number integer or float; radix integers like 16rFF supported
|
|
;; string 'hello''world' style
|
|
;; char $c
|
|
;; symbol #foo, #foo:bar:, #+, #'with spaces'
|
|
;; array-open #(
|
|
;; byte-array-open #[
|
|
;; lparen rparen lbracket rbracket lbrace rbrace
|
|
;; period semi bar caret colon assign bang
|
|
;; eof
|
|
;;
|
|
;; Comments "…" are skipped.
|
|
|
|
(define st-make-token (fn (type value pos) {:type type :value value :pos pos}))
|
|
|
|
(define st-digit? (fn (c) (and (not (= c nil)) (>= c "0") (<= c "9"))))
|
|
|
|
(define
|
|
st-letter?
|
|
(fn
|
|
(c)
|
|
(and
|
|
(not (= c nil))
|
|
(or (and (>= c "a") (<= c "z")) (and (>= c "A") (<= c "Z"))))))
|
|
|
|
(define st-ident-start? (fn (c) (or (st-letter? c) (= c "_"))))
|
|
|
|
(define st-ident-char? (fn (c) (or (st-ident-start? c) (st-digit? c))))
|
|
|
|
(define st-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r"))))
|
|
|
|
(define
|
|
st-binary-chars
|
|
(list "+" "-" "*" "/" "\\" "~" "<" ">" "=" "@" "%" "&" "?" ","))
|
|
|
|
(define
|
|
st-binary-char?
|
|
(fn (c) (and (not (= c nil)) (contains? st-binary-chars c))))
|
|
|
|
(define
|
|
st-radix-digit?
|
|
(fn
|
|
(c)
|
|
(and
|
|
(not (= c nil))
|
|
(or (st-digit? c) (and (>= c "A") (<= c "Z"))))))
|
|
|
|
(define
|
|
st-tokenize
|
|
(fn
|
|
(src)
|
|
(let
|
|
((tokens (list)) (pos 0) (src-len (len src)))
|
|
(define
|
|
pk
|
|
(fn
|
|
(offset)
|
|
(if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil)))
|
|
(define cur (fn () (pk 0)))
|
|
(define advance! (fn (n) (set! pos (+ pos n))))
|
|
(define
|
|
push!
|
|
(fn
|
|
(type value start)
|
|
(append! tokens (st-make-token type value start))))
|
|
(define
|
|
skip-comment!
|
|
(fn
|
|
()
|
|
(cond
|
|
((>= pos src-len) nil)
|
|
((= (cur) "\"") (advance! 1))
|
|
(else (begin (advance! 1) (skip-comment!))))))
|
|
(define
|
|
skip-ws!
|
|
(fn
|
|
()
|
|
(cond
|
|
((>= pos src-len) nil)
|
|
((st-ws? (cur)) (begin (advance! 1) (skip-ws!)))
|
|
((= (cur) "\"") (begin (advance! 1) (skip-comment!) (skip-ws!)))
|
|
(else nil))))
|
|
(define
|
|
read-ident-chars!
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< pos src-len) (st-ident-char? (cur)))
|
|
(begin (advance! 1) (read-ident-chars!)))))
|
|
(define
|
|
read-decimal-digits!
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< pos src-len) (st-digit? (cur)))
|
|
(begin (advance! 1) (read-decimal-digits!)))))
|
|
(define
|
|
read-radix-digits!
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< pos src-len) (st-radix-digit? (cur)))
|
|
(begin (advance! 1) (read-radix-digits!)))))
|
|
(define
|
|
read-exp-part!
|
|
(fn
|
|
()
|
|
(when
|
|
(and
|
|
(< pos src-len)
|
|
(or (= (cur) "e") (= (cur) "E"))
|
|
(let
|
|
((p1 (pk 1)) (p2 (pk 2)))
|
|
(or
|
|
(st-digit? p1)
|
|
(and (or (= p1 "+") (= p1 "-")) (st-digit? p2)))))
|
|
(begin
|
|
(advance! 1)
|
|
(when
|
|
(and (< pos src-len) (or (= (cur) "+") (= (cur) "-")))
|
|
(advance! 1))
|
|
(read-decimal-digits!)))))
|
|
(define
|
|
read-number
|
|
(fn
|
|
(start)
|
|
(begin
|
|
(read-decimal-digits!)
|
|
(cond
|
|
((and (< pos src-len) (= (cur) "r"))
|
|
(let
|
|
((base-str (slice src start pos)))
|
|
(begin
|
|
(advance! 1)
|
|
(let
|
|
((rstart pos))
|
|
(begin
|
|
(read-radix-digits!)
|
|
(let
|
|
((digits (slice src rstart pos)))
|
|
{:radix (parse-number base-str)
|
|
:digits digits
|
|
:value (parse-radix base-str digits)
|
|
:kind "radix"}))))))
|
|
((and
|
|
(< pos src-len)
|
|
(= (cur) ".")
|
|
(st-digit? (pk 1)))
|
|
(begin
|
|
(advance! 1)
|
|
(read-decimal-digits!)
|
|
(read-exp-part!)
|
|
(parse-number (slice src start pos))))
|
|
(else
|
|
(begin
|
|
(read-exp-part!)
|
|
(parse-number (slice src start pos))))))))
|
|
(define
|
|
parse-radix
|
|
(fn
|
|
(base-str digits)
|
|
(let
|
|
((base (parse-number base-str))
|
|
(chars digits)
|
|
(n-len (len digits))
|
|
(idx 0)
|
|
(acc 0))
|
|
(begin
|
|
(define
|
|
rd-loop
|
|
(fn
|
|
()
|
|
(when
|
|
(< idx n-len)
|
|
(let
|
|
((c (nth chars idx)))
|
|
(let
|
|
((d (cond
|
|
((and (>= c "0") (<= c "9")) (- (char-code c) 48))
|
|
((and (>= c "A") (<= c "Z")) (- (char-code c) 55))
|
|
(else 0))))
|
|
(begin
|
|
(set! acc (+ (* acc base) d))
|
|
(set! idx (+ idx 1))
|
|
(rd-loop)))))))
|
|
(rd-loop)
|
|
acc))))
|
|
(define
|
|
read-string
|
|
(fn
|
|
()
|
|
(let
|
|
((chars (list)))
|
|
(begin
|
|
(advance! 1)
|
|
(define
|
|
loop
|
|
(fn
|
|
()
|
|
(cond
|
|
((>= pos src-len) nil)
|
|
((= (cur) "'")
|
|
(cond
|
|
((= (pk 1) "'")
|
|
(begin
|
|
(append! chars "'")
|
|
(advance! 2)
|
|
(loop)))
|
|
(else (advance! 1))))
|
|
(else
|
|
(begin (append! chars (cur)) (advance! 1) (loop))))))
|
|
(loop)
|
|
(join "" chars)))))
|
|
(define
|
|
read-binary-run!
|
|
(fn
|
|
()
|
|
(let
|
|
((start pos))
|
|
(begin
|
|
(define
|
|
bin-loop
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< pos src-len) (st-binary-char? (cur)))
|
|
(begin (advance! 1) (bin-loop)))))
|
|
(bin-loop)
|
|
(slice src start pos)))))
|
|
(define
|
|
read-symbol
|
|
(fn
|
|
(start)
|
|
(cond
|
|
;; Quoted symbol: #'whatever'
|
|
((= (cur) "'")
|
|
(let ((s (read-string))) (push! "symbol" s start)))
|
|
;; Binary-char symbol: #+, #==, #->, #|
|
|
((or (st-binary-char? (cur)) (= (cur) "|"))
|
|
(let ((b (read-binary-run!)))
|
|
(cond
|
|
((= b "")
|
|
;; lone | wasn't binary; consume it
|
|
(begin (advance! 1) (push! "symbol" "|" start)))
|
|
(else (push! "symbol" b start)))))
|
|
;; Identifier or keyword chain: #foo, #foo:bar:
|
|
((st-ident-start? (cur))
|
|
(let ((id-start pos))
|
|
(begin
|
|
(read-ident-chars!)
|
|
(define
|
|
kw-loop
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< pos src-len) (= (cur) ":"))
|
|
(begin
|
|
(advance! 1)
|
|
(when
|
|
(and (< pos src-len) (st-ident-start? (cur)))
|
|
(begin (read-ident-chars!) (kw-loop)))))))
|
|
(kw-loop)
|
|
(push! "symbol" (slice src id-start pos) start))))
|
|
(else
|
|
(error
|
|
(str "st-tokenize: bad symbol at " pos))))))
|
|
(define
|
|
step
|
|
(fn
|
|
()
|
|
(begin
|
|
(skip-ws!)
|
|
(when
|
|
(< pos src-len)
|
|
(let
|
|
((start pos) (c (cur)))
|
|
(cond
|
|
;; Identifier or keyword
|
|
((st-ident-start? c)
|
|
(begin
|
|
(read-ident-chars!)
|
|
(let
|
|
((word (slice src start pos)))
|
|
(cond
|
|
;; ident immediately followed by ':' (and not ':=') => keyword
|
|
((and
|
|
(< pos src-len)
|
|
(= (cur) ":")
|
|
(not (= (pk 1) "=")))
|
|
(begin
|
|
(advance! 1)
|
|
(push!
|
|
"keyword"
|
|
(str word ":")
|
|
start)))
|
|
(else (push! "ident" word start))))
|
|
(step)))
|
|
;; Number
|
|
((st-digit? c)
|
|
(let
|
|
((v (read-number start)))
|
|
(begin (push! "number" v start) (step))))
|
|
;; String
|
|
((= c "'")
|
|
(let
|
|
((s (read-string)))
|
|
(begin (push! "string" s start) (step))))
|
|
;; Character literal
|
|
((= c "$")
|
|
(cond
|
|
((>= (+ pos 1) src-len)
|
|
(error (str "st-tokenize: $ at end of input")))
|
|
(else
|
|
(begin
|
|
(advance! 1)
|
|
(push! "char" (cur) start)
|
|
(advance! 1)
|
|
(step)))))
|
|
;; Symbol or array literal
|
|
((= c "#")
|
|
(cond
|
|
((= (pk 1) "(")
|
|
(begin (advance! 2) (push! "array-open" "#(" start) (step)))
|
|
((= (pk 1) "[")
|
|
(begin (advance! 2) (push! "byte-array-open" "#[" start) (step)))
|
|
(else
|
|
(begin (advance! 1) (read-symbol start) (step)))))
|
|
;; Assignment := or bare colon
|
|
((= c ":")
|
|
(cond
|
|
((= (pk 1) "=")
|
|
(begin (advance! 2) (push! "assign" ":=" start) (step)))
|
|
(else
|
|
(begin (advance! 1) (push! "colon" ":" start) (step)))))
|
|
;; Single-char structural punctuation
|
|
((= c "(") (begin (advance! 1) (push! "lparen" "(" start) (step)))
|
|
((= c ")") (begin (advance! 1) (push! "rparen" ")" start) (step)))
|
|
((= c "[") (begin (advance! 1) (push! "lbracket" "[" start) (step)))
|
|
((= c "]") (begin (advance! 1) (push! "rbracket" "]" start) (step)))
|
|
((= c "{") (begin (advance! 1) (push! "lbrace" "{" start) (step)))
|
|
((= c "}") (begin (advance! 1) (push! "rbrace" "}" start) (step)))
|
|
((= c ".") (begin (advance! 1) (push! "period" "." start) (step)))
|
|
((= c ";") (begin (advance! 1) (push! "semi" ";" start) (step)))
|
|
((= c "|") (begin (advance! 1) (push! "bar" "|" start) (step)))
|
|
((= c "^") (begin (advance! 1) (push! "caret" "^" start) (step)))
|
|
((= c "!") (begin (advance! 1) (push! "bang" "!" start) (step)))
|
|
;; Binary selector run
|
|
((st-binary-char? c)
|
|
(let
|
|
((b (read-binary-run!)))
|
|
(begin (push! "binary" b start) (step))))
|
|
(else
|
|
(error
|
|
(str
|
|
"st-tokenize: unexpected char "
|
|
c
|
|
" at "
|
|
pos)))))))))
|
|
(step)
|
|
(push! "eof" nil pos)
|
|
tokens)))
|