;; Smalltalk tokenizer. ;; ;; Token types: ;; ident identifier (foo, Foo, _x) ;; keyword selector keyword (foo:) — value is "foo:" with the colon ;; binary binary selector chars run together (+, ==, ->, <=, ~=, ...) ;; number integer or float; radix integers like 16rFF supported ;; string 'hello''world' style ;; char $c ;; symbol #foo, #foo:bar:, #+, #'with spaces' ;; array-open #( ;; byte-array-open #[ ;; lparen rparen lbracket rbracket lbrace rbrace ;; period semi bar caret colon assign bang ;; eof ;; ;; Comments "…" are skipped. (define st-make-token (fn (type value pos) {:type type :value value :pos pos})) (define st-digit? (fn (c) (and (not (= c nil)) (>= c "0") (<= c "9")))) (define st-letter? (fn (c) (and (not (= c nil)) (or (and (>= c "a") (<= c "z")) (and (>= c "A") (<= c "Z")))))) (define st-ident-start? (fn (c) (or (st-letter? c) (= c "_")))) (define st-ident-char? (fn (c) (or (st-ident-start? c) (st-digit? c)))) (define st-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r")))) (define st-binary-chars (list "+" "-" "*" "/" "\\" "~" "<" ">" "=" "@" "%" "&" "?" ",")) (define st-binary-char? (fn (c) (and (not (= c nil)) (contains? st-binary-chars c)))) (define st-radix-digit? (fn (c) (and (not (= c nil)) (or (st-digit? c) (and (>= c "A") (<= c "Z")))))) (define st-tokenize (fn (src) (let ((tokens (list)) (pos 0) (src-len (len src))) (define pk (fn (offset) (if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil))) (define cur (fn () (pk 0))) (define advance! (fn (n) (set! pos (+ pos n)))) (define push! (fn (type value start) (append! tokens (st-make-token type value start)))) (define skip-comment! (fn () (cond ((>= pos src-len) nil) ((= (cur) "\"") (advance! 1)) (else (begin (advance! 1) (skip-comment!)))))) (define skip-ws! (fn () (cond ((>= pos src-len) nil) ((st-ws? (cur)) (begin (advance! 1) (skip-ws!))) ((= (cur) "\"") (begin (advance! 1) (skip-comment!) (skip-ws!))) (else nil)))) (define read-ident-chars! (fn () (when (and (< pos src-len) (st-ident-char? (cur))) (begin (advance! 1) (read-ident-chars!))))) (define read-decimal-digits! (fn () (when (and (< pos src-len) (st-digit? (cur))) (begin (advance! 1) (read-decimal-digits!))))) (define read-radix-digits! (fn () (when (and (< pos src-len) (st-radix-digit? (cur))) (begin (advance! 1) (read-radix-digits!))))) (define read-exp-part! (fn () (when (and (< pos src-len) (or (= (cur) "e") (= (cur) "E")) (let ((p1 (pk 1)) (p2 (pk 2))) (or (st-digit? p1) (and (or (= p1 "+") (= p1 "-")) (st-digit? p2))))) (begin (advance! 1) (when (and (< pos src-len) (or (= (cur) "+") (= (cur) "-"))) (advance! 1)) (read-decimal-digits!))))) (define read-number (fn (start) (begin (read-decimal-digits!) (cond ((and (< pos src-len) (= (cur) "r")) (let ((base-str (slice src start pos))) (begin (advance! 1) (let ((rstart pos)) (begin (read-radix-digits!) (let ((digits (slice src rstart pos))) {:radix (parse-number base-str) :digits digits :value (parse-radix base-str digits) :kind "radix"})))))) ((and (< pos src-len) (= (cur) ".") (st-digit? (pk 1))) (begin (advance! 1) (read-decimal-digits!) (read-exp-part!) (parse-number (slice src start pos)))) (else (begin (read-exp-part!) (parse-number (slice src start pos)))))))) (define parse-radix (fn (base-str digits) (let ((base (parse-number base-str)) (chars digits) (n-len (len digits)) (idx 0) (acc 0)) (begin (define rd-loop (fn () (when (< idx n-len) (let ((c (nth chars idx))) (let ((d (cond ((and (>= c "0") (<= c "9")) (- (char-code c) 48)) ((and (>= c "A") (<= c "Z")) (- (char-code c) 55)) (else 0)))) (begin (set! acc (+ (* acc base) d)) (set! idx (+ idx 1)) (rd-loop))))))) (rd-loop) acc)))) (define read-string (fn () (let ((chars (list))) (begin (advance! 1) (define loop (fn () (cond ((>= pos src-len) nil) ((= (cur) "'") (cond ((= (pk 1) "'") (begin (append! chars "'") (advance! 2) (loop))) (else (advance! 1)))) (else (begin (append! chars (cur)) (advance! 1) (loop)))))) (loop) (join "" chars))))) (define read-binary-run! (fn () (let ((start pos)) (begin (define bin-loop (fn () (when (and (< pos src-len) (st-binary-char? (cur))) (begin (advance! 1) (bin-loop))))) (bin-loop) (slice src start pos))))) (define read-symbol (fn (start) (cond ;; Quoted symbol: #'whatever' ((= (cur) "'") (let ((s (read-string))) (push! "symbol" s start))) ;; Binary-char symbol: #+, #==, #->, #| ((or (st-binary-char? (cur)) (= (cur) "|")) (let ((b (read-binary-run!))) (cond ((= b "") ;; lone | wasn't binary; consume it (begin (advance! 1) (push! "symbol" "|" start))) (else (push! "symbol" b start))))) ;; Identifier or keyword chain: #foo, #foo:bar: ((st-ident-start? (cur)) (let ((id-start pos)) (begin (read-ident-chars!) (define kw-loop (fn () (when (and (< pos src-len) (= (cur) ":")) (begin (advance! 1) (when (and (< pos src-len) (st-ident-start? (cur))) (begin (read-ident-chars!) (kw-loop))))))) (kw-loop) (push! "symbol" (slice src id-start pos) start)))) (else (error (str "st-tokenize: bad symbol at " pos)))))) (define step (fn () (begin (skip-ws!) (when (< pos src-len) (let ((start pos) (c (cur))) (cond ;; Identifier or keyword ((st-ident-start? c) (begin (read-ident-chars!) (let ((word (slice src start pos))) (cond ;; ident immediately followed by ':' (and not ':=') => keyword ((and (< pos src-len) (= (cur) ":") (not (= (pk 1) "="))) (begin (advance! 1) (push! "keyword" (str word ":") start))) (else (push! "ident" word start)))) (step))) ;; Number ((st-digit? c) (let ((v (read-number start))) (begin (push! "number" v start) (step)))) ;; String ((= c "'") (let ((s (read-string))) (begin (push! "string" s start) (step)))) ;; Character literal ((= c "$") (cond ((>= (+ pos 1) src-len) (error (str "st-tokenize: $ at end of input"))) (else (begin (advance! 1) (push! "char" (cur) start) (advance! 1) (step))))) ;; Symbol or array literal ((= c "#") (cond ((= (pk 1) "(") (begin (advance! 2) (push! "array-open" "#(" start) (step))) ((= (pk 1) "[") (begin (advance! 2) (push! "byte-array-open" "#[" start) (step))) (else (begin (advance! 1) (read-symbol start) (step))))) ;; Assignment := or bare colon ((= c ":") (cond ((= (pk 1) "=") (begin (advance! 2) (push! "assign" ":=" start) (step))) (else (begin (advance! 1) (push! "colon" ":" start) (step))))) ;; Single-char structural punctuation ((= c "(") (begin (advance! 1) (push! "lparen" "(" start) (step))) ((= c ")") (begin (advance! 1) (push! "rparen" ")" start) (step))) ((= c "[") (begin (advance! 1) (push! "lbracket" "[" start) (step))) ((= c "]") (begin (advance! 1) (push! "rbracket" "]" start) (step))) ((= c "{") (begin (advance! 1) (push! "lbrace" "{" start) (step))) ((= c "}") (begin (advance! 1) (push! "rbrace" "}" start) (step))) ((= c ".") (begin (advance! 1) (push! "period" "." start) (step))) ((= c ";") (begin (advance! 1) (push! "semi" ";" start) (step))) ((= c "|") (begin (advance! 1) (push! "bar" "|" start) (step))) ((= c "^") (begin (advance! 1) (push! "caret" "^" start) (step))) ((= c "!") (begin (advance! 1) (push! "bang" "!" start) (step))) ;; Binary selector run ((st-binary-char? c) (let ((b (read-binary-run!))) (begin (push! "binary" b start) (step)))) (else (error (str "st-tokenize: unexpected char " c " at " pos))))))))) (step) (push! "eof" nil pos) tokens)))