;; Common Lisp tokenizer ;; ;; Tokens: {:type T :value V :pos P} ;; ;; Types: ;; "symbol" — FOO, PKG:SYM, PKG::SYM, T, NIL (upcase) ;; "keyword" — :foo (value is upcase name without colon) ;; "integer" — 42, -5, #xFF, #b1010, #o17 (string) ;; "float" — 3.14, 1.0e10 (string) ;; "ratio" — 1/3 (string "N/D") ;; "string" — unescaped content ;; "char" — single-character string ;; "lparen" "rparen" "quote" "backquote" "comma" "comma-at" ;; "hash-quote" — #' ;; "hash-paren" — #( ;; "uninterned" — #:foo (upcase name) ;; "dot" — standalone . (dotted pair separator) ;; "eof" (define cl-make-tok (fn (type value pos) {:type type :value value :pos pos})) ;; ── char ordinal table ──────────────────────────────────────────── (define cl-ord-table (let ((t (dict)) (i 0)) (define cl-fill (fn () (when (< i 128) (do (dict-set! t (char-from-code i) i) (set! i (+ i 1)) (cl-fill))))) (cl-fill) t)) (define cl-ord (fn (c) (or (get cl-ord-table c) 0))) ;; ── character predicates ────────────────────────────────────────── (define cl-digit? (fn (c) (and (>= (cl-ord c) 48) (<= (cl-ord c) 57)))) (define cl-hex? (fn (c) (or (cl-digit? c) (and (>= (cl-ord c) 65) (<= (cl-ord c) 70)) (and (>= (cl-ord c) 97) (<= (cl-ord c) 102))))) (define cl-octal? (fn (c) (and (>= (cl-ord c) 48) (<= (cl-ord c) 55)))) (define cl-binary? (fn (c) (or (= c "0") (= c "1")))) (define cl-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r")))) (define cl-alpha? (fn (c) (or (and (>= (cl-ord c) 65) (<= (cl-ord c) 90)) (and (>= (cl-ord c) 97) (<= (cl-ord c) 122))))) ;; Characters that end a token (whitespace + terminating macro chars) (define cl-terminating? (fn (c) (or (cl-ws? c) (= c "(") (= c ")") (= c "\"") (= c ";") (= c "`") (= c ",")))) ;; Symbol constituent: not terminating, not reader-special (define cl-sym-char? (fn (c) (not (or (cl-terminating? c) (= c "#") (= c "|") (= c "\\") (= c "'"))))) ;; ── named character table ───────────────────────────────────────── (define cl-named-chars {:space " " :newline "\n" :tab "\t" :return "\r" :backspace (char-from-code 8) :rubout (char-from-code 127) :delete (char-from-code 127) :escape (char-from-code 27) :altmode (char-from-code 27) :null (char-from-code 0) :nul (char-from-code 0) :page (char-from-code 12) :formfeed (char-from-code 12)}) ;; ── main tokenizer ──────────────────────────────────────────────── (define cl-tokenize (fn (src) (let ((pos 0) (n (string-length src)) (toks (list))) (define at (fn () (if (< pos n) (substring src pos (+ pos 1)) nil))) (define peek1 (fn () (if (< (+ pos 1) n) (substring src (+ pos 1) (+ pos 2)) nil))) (define adv (fn () (set! pos (+ pos 1)))) ;; Advance while predicate holds; return substring from start to end (define read-while (fn (pred) (let ((start pos)) (define rw-loop (fn () (when (and (at) (pred (at))) (do (adv) (rw-loop))))) (rw-loop) (substring src start pos)))) (define skip-line (fn () (when (and (at) (not (= (at) "\n"))) (do (adv) (skip-line))))) (define skip-block (fn (depth) (when (at) (cond ((and (= (at) "#") (= (peek1) "|")) (do (adv) (adv) (skip-block (+ depth 1)))) ((and (= (at) "|") (= (peek1) "#")) (do (adv) (adv) (when (> depth 1) (skip-block (- depth 1))))) (:else (do (adv) (skip-block depth))))))) ;; Read string literal — called with pos just past opening " (define read-str (fn (acc) (if (not (at)) acc (cond ((= (at) "\"") (do (adv) acc)) ((= (at) "\\") (do (adv) (let ((e (at))) (adv) (read-str (str acc (cond ((= e "n") "\n") ((= e "t") "\t") ((= e "r") "\r") ((= e "\"") "\"") ((= e "\\") "\\") (:else e))))))) (:else (let ((c (at))) (adv) (read-str (str acc c)))))))) ;; Read #\ char literal — called with pos just past the backslash (define read-char-lit (fn () (let ((first (at))) (adv) (let ((rest (if (and (at) (cl-alpha? (at))) (read-while cl-alpha?) ""))) (if (= rest "") first (let ((name (downcase (str first rest)))) (or (get cl-named-chars name) first))))))) ;; Number scanner — called with pos just past first digit(s). ;; acc holds what was already consumed (first digit or sign+digit). (define scan-num (fn (p acc) (let ((more (read-while cl-digit?))) (set! acc (str acc more)) (cond ;; ratio N/D ((and (at) (= (at) "/") (peek1) (cl-digit? (peek1))) (do (adv) (let ((denom (read-while cl-digit?))) {:type "ratio" :value (str acc "/" denom) :pos p}))) ;; float: decimal point N.M[eE] ((and (at) (= (at) ".") (peek1) (cl-digit? (peek1))) (do (adv) (let ((frac (read-while cl-digit?))) (set! acc (str acc "." frac)) (when (and (at) (or (= (at) "e") (= (at) "E"))) (do (set! acc (str acc (at))) (adv) (when (and (at) (or (= (at) "+") (= (at) "-"))) (do (set! acc (str acc (at))) (adv))) (set! acc (str acc (read-while cl-digit?))))) {:type "float" :value acc :pos p}))) ;; float: exponent only NeE ((and (at) (or (= (at) "e") (= (at) "E"))) (do (set! acc (str acc (at))) (adv) (when (and (at) (or (= (at) "+") (= (at) "-"))) (do (set! acc (str acc (at))) (adv))) (set! acc (str acc (read-while cl-digit?))) {:type "float" :value acc :pos p})) (:else {:type "integer" :value acc :pos p}))))) (define read-radix (fn (letter p) (let ((pred (cond ((or (= letter "x") (= letter "X")) cl-hex?) ((or (= letter "b") (= letter "B")) cl-binary?) ((or (= letter "o") (= letter "O")) cl-octal?) (:else cl-digit?)))) {:type "integer" :value (str "#" letter (read-while pred)) :pos p}))) (define emit (fn (tok) (append! toks tok))) (define scan (fn () (when (< pos n) (let ((c (at)) (p pos)) (cond ((cl-ws? c) (do (adv) (scan))) ((= c ";") (do (adv) (skip-line) (scan))) ((= c "(") (do (adv) (emit (cl-make-tok "lparen" "(" p)) (scan))) ((= c ")") (do (adv) (emit (cl-make-tok "rparen" ")" p)) (scan))) ((= c "'") (do (adv) (emit (cl-make-tok "quote" "'" p)) (scan))) ((= c "`") (do (adv) (emit (cl-make-tok "backquote" "`" p)) (scan))) ((= c ",") (do (adv) (if (= (at) "@") (do (adv) (emit (cl-make-tok "comma-at" ",@" p))) (emit (cl-make-tok "comma" "," p))) (scan))) ((= c "\"") (do (adv) (emit (cl-make-tok "string" (read-str "") p)) (scan))) ;; :keyword ((= c ":") (do (adv) (emit (cl-make-tok "keyword" (upcase (read-while cl-sym-char?)) p)) (scan))) ;; dispatch macro # ((= c "#") (do (adv) (let ((d (at))) (cond ((= d "'") (do (adv) (emit (cl-make-tok "hash-quote" "#'" p)) (scan))) ((= d "(") (do (adv) (emit (cl-make-tok "hash-paren" "#(" p)) (scan))) ((= d ":") (do (adv) (emit (cl-make-tok "uninterned" (upcase (read-while cl-sym-char?)) p)) (scan))) ((= d "|") (do (adv) (skip-block 1) (scan))) ((= d "\\") (do (adv) (emit (cl-make-tok "char" (read-char-lit) p)) (scan))) ((or (= d "x") (= d "X")) (do (adv) (emit (read-radix d p)) (scan))) ((or (= d "b") (= d "B")) (do (adv) (emit (read-radix d p)) (scan))) ((or (= d "o") (= d "O")) (do (adv) (emit (read-radix d p)) (scan))) (:else (scan)))))) ;; standalone dot, float .5, or symbol starting with dots ((= c ".") (do (adv) (cond ((or (not (at)) (cl-terminating? (at))) (do (emit (cl-make-tok "dot" "." p)) (scan))) ((cl-digit? (at)) (do (emit (cl-make-tok "float" (str "0." (read-while cl-digit?)) p)) (scan))) (:else (do (emit (cl-make-tok "symbol" (upcase (str "." (read-while cl-sym-char?))) p)) (scan)))))) ;; sign followed by digit → number ((and (or (= c "+") (= c "-")) (peek1) (cl-digit? (peek1))) (do (adv) (let ((first-d (at))) (adv) (emit (scan-num p (str c first-d)))) (scan))) ;; decimal digit → number ((cl-digit? c) (do (adv) (emit (scan-num p c)) (scan))) ;; symbol constituent (includes bare +, -, etc.) ((cl-sym-char? c) (do (emit (cl-make-tok "symbol" (upcase (read-while cl-sym-char?)) p)) (scan))) (:else (do (adv) (scan)))))))) (scan) (append! toks (cl-make-tok "eof" nil n)) toks)))