Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Has been cancelled
lib/common-lisp/reader.sx — CL tokenizer: symbols with package qualification (pkg:sym/pkg::sym), integers, floats, ratios, hex/ binary/octal (#xFF/#b1010/#o17), strings with escapes, #\ char literals (named + bare), reader macros (#' #( #: ,@), line and nested block comments. lib/common-lisp/tests/read.sx — 79 tests, all green. lib/common-lisp/test.sh — test runner (sx_server pipe protocol). Key SX gotcha: use str not concat for string building. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
382 lines
12 KiB
Plaintext
382 lines
12 KiB
Plaintext
;; Common Lisp tokenizer
|
|
;;
|
|
;; Tokens: {:type T :value V :pos P}
|
|
;;
|
|
;; Types:
|
|
;; "symbol" — FOO, PKG:SYM, PKG::SYM, T, NIL (upcase)
|
|
;; "keyword" — :foo (value is upcase name without colon)
|
|
;; "integer" — 42, -5, #xFF, #b1010, #o17 (string)
|
|
;; "float" — 3.14, 1.0e10 (string)
|
|
;; "ratio" — 1/3 (string "N/D")
|
|
;; "string" — unescaped content
|
|
;; "char" — single-character string
|
|
;; "lparen" "rparen" "quote" "backquote" "comma" "comma-at"
|
|
;; "hash-quote" — #'
|
|
;; "hash-paren" — #(
|
|
;; "uninterned" — #:foo (upcase name)
|
|
;; "dot" — standalone . (dotted pair separator)
|
|
;; "eof"
|
|
|
|
(define cl-make-tok (fn (type value pos) {:type type :value value :pos pos}))
|
|
|
|
;; ── char ordinal table ────────────────────────────────────────────
|
|
|
|
(define
|
|
cl-ord-table
|
|
(let
|
|
((t (dict)) (i 0))
|
|
(define
|
|
cl-fill
|
|
(fn
|
|
()
|
|
(when
|
|
(< i 128)
|
|
(do
|
|
(dict-set! t (char-from-code i) i)
|
|
(set! i (+ i 1))
|
|
(cl-fill)))))
|
|
(cl-fill)
|
|
t))
|
|
|
|
(define cl-ord (fn (c) (or (get cl-ord-table c) 0)))
|
|
|
|
;; ── character predicates ──────────────────────────────────────────
|
|
|
|
(define cl-digit? (fn (c) (and (>= (cl-ord c) 48) (<= (cl-ord c) 57))))
|
|
|
|
(define
|
|
cl-hex?
|
|
(fn
|
|
(c)
|
|
(or
|
|
(cl-digit? c)
|
|
(and (>= (cl-ord c) 65) (<= (cl-ord c) 70))
|
|
(and (>= (cl-ord c) 97) (<= (cl-ord c) 102)))))
|
|
|
|
(define cl-octal? (fn (c) (and (>= (cl-ord c) 48) (<= (cl-ord c) 55))))
|
|
|
|
(define cl-binary? (fn (c) (or (= c "0") (= c "1"))))
|
|
|
|
(define cl-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r"))))
|
|
|
|
(define
|
|
cl-alpha?
|
|
(fn
|
|
(c)
|
|
(or
|
|
(and (>= (cl-ord c) 65) (<= (cl-ord c) 90))
|
|
(and (>= (cl-ord c) 97) (<= (cl-ord c) 122)))))
|
|
|
|
;; Characters that end a token (whitespace + terminating macro chars)
|
|
(define
|
|
cl-terminating?
|
|
(fn
|
|
(c)
|
|
(or
|
|
(cl-ws? c)
|
|
(= c "(")
|
|
(= c ")")
|
|
(= c "\"")
|
|
(= c ";")
|
|
(= c "`")
|
|
(= c ","))))
|
|
|
|
;; Symbol constituent: not terminating, not reader-special
|
|
(define
|
|
cl-sym-char?
|
|
(fn
|
|
(c)
|
|
(not
|
|
(or
|
|
(cl-terminating? c)
|
|
(= c "#")
|
|
(= c "|")
|
|
(= c "\\")
|
|
(= c "'")))))
|
|
|
|
;; ── named character table ─────────────────────────────────────────
|
|
|
|
(define
|
|
cl-named-chars
|
|
{:space " "
|
|
:newline "\n"
|
|
:tab "\t"
|
|
:return "\r"
|
|
:backspace (char-from-code 8)
|
|
:rubout (char-from-code 127)
|
|
:delete (char-from-code 127)
|
|
:escape (char-from-code 27)
|
|
:altmode (char-from-code 27)
|
|
:null (char-from-code 0)
|
|
:nul (char-from-code 0)
|
|
:page (char-from-code 12)
|
|
:formfeed (char-from-code 12)})
|
|
|
|
;; ── main tokenizer ────────────────────────────────────────────────
|
|
|
|
(define
|
|
cl-tokenize
|
|
(fn
|
|
(src)
|
|
(let
|
|
((pos 0) (n (string-length src)) (toks (list)))
|
|
|
|
(define at (fn () (if (< pos n) (substring src pos (+ pos 1)) nil)))
|
|
(define peek1 (fn () (if (< (+ pos 1) n) (substring src (+ pos 1) (+ pos 2)) nil)))
|
|
(define adv (fn () (set! pos (+ pos 1))))
|
|
|
|
;; Advance while predicate holds; return substring from start to end
|
|
(define
|
|
read-while
|
|
(fn
|
|
(pred)
|
|
(let
|
|
((start pos))
|
|
(define
|
|
rw-loop
|
|
(fn
|
|
()
|
|
(when
|
|
(and (at) (pred (at)))
|
|
(do (adv) (rw-loop)))))
|
|
(rw-loop)
|
|
(substring src start pos))))
|
|
|
|
(define
|
|
skip-line
|
|
(fn
|
|
()
|
|
(when
|
|
(and (at) (not (= (at) "\n")))
|
|
(do (adv) (skip-line)))))
|
|
|
|
(define
|
|
skip-block
|
|
(fn
|
|
(depth)
|
|
(when
|
|
(at)
|
|
(cond
|
|
((and (= (at) "#") (= (peek1) "|"))
|
|
(do (adv) (adv) (skip-block (+ depth 1))))
|
|
((and (= (at) "|") (= (peek1) "#"))
|
|
(do
|
|
(adv)
|
|
(adv)
|
|
(when (> depth 1) (skip-block (- depth 1)))))
|
|
(:else (do (adv) (skip-block depth)))))))
|
|
|
|
;; Read string literal — called with pos just past opening "
|
|
(define
|
|
read-str
|
|
(fn
|
|
(acc)
|
|
(if
|
|
(not (at))
|
|
acc
|
|
(cond
|
|
((= (at) "\"") (do (adv) acc))
|
|
((= (at) "\\")
|
|
(do
|
|
(adv)
|
|
(let
|
|
((e (at)))
|
|
(adv)
|
|
(read-str
|
|
(str
|
|
acc
|
|
(cond
|
|
((= e "n") "\n")
|
|
((= e "t") "\t")
|
|
((= e "r") "\r")
|
|
((= e "\"") "\"")
|
|
((= e "\\") "\\")
|
|
(:else e)))))))
|
|
(:else
|
|
(let
|
|
((c (at)))
|
|
(adv)
|
|
(read-str (str acc c))))))))
|
|
|
|
;; Read #\ char literal — called with pos just past the backslash
|
|
(define
|
|
read-char-lit
|
|
(fn
|
|
()
|
|
(let
|
|
((first (at)))
|
|
(adv)
|
|
(let
|
|
((rest (if (and (at) (cl-alpha? (at))) (read-while cl-alpha?) "")))
|
|
(if
|
|
(= rest "")
|
|
first
|
|
(let
|
|
((name (downcase (str first rest))))
|
|
(or (get cl-named-chars name) first)))))))
|
|
|
|
;; Number scanner — called with pos just past first digit(s).
|
|
;; acc holds what was already consumed (first digit or sign+digit).
|
|
(define
|
|
scan-num
|
|
(fn
|
|
(p acc)
|
|
(let
|
|
((more (read-while cl-digit?)))
|
|
(set! acc (str acc more))
|
|
(cond
|
|
;; ratio N/D
|
|
((and (at) (= (at) "/") (peek1) (cl-digit? (peek1)))
|
|
(do
|
|
(adv)
|
|
(let
|
|
((denom (read-while cl-digit?)))
|
|
{:type "ratio" :value (str acc "/" denom) :pos p})))
|
|
;; float: decimal point N.M[eE]
|
|
((and (at) (= (at) ".") (peek1) (cl-digit? (peek1)))
|
|
(do
|
|
(adv)
|
|
(let
|
|
((frac (read-while cl-digit?)))
|
|
(set! acc (str acc "." frac))
|
|
(when
|
|
(and (at) (or (= (at) "e") (= (at) "E")))
|
|
(do
|
|
(set! acc (str acc (at)))
|
|
(adv)
|
|
(when
|
|
(and (at) (or (= (at) "+") (= (at) "-")))
|
|
(do (set! acc (str acc (at))) (adv)))
|
|
(set! acc (str acc (read-while cl-digit?)))))
|
|
{:type "float" :value acc :pos p})))
|
|
;; float: exponent only NeE
|
|
((and (at) (or (= (at) "e") (= (at) "E")))
|
|
(do
|
|
(set! acc (str acc (at)))
|
|
(adv)
|
|
(when
|
|
(and (at) (or (= (at) "+") (= (at) "-")))
|
|
(do (set! acc (str acc (at))) (adv)))
|
|
(set! acc (str acc (read-while cl-digit?)))
|
|
{:type "float" :value acc :pos p}))
|
|
(:else {:type "integer" :value acc :pos p})))))
|
|
|
|
(define
|
|
read-radix
|
|
(fn
|
|
(letter p)
|
|
(let
|
|
((pred
|
|
(cond
|
|
((or (= letter "x") (= letter "X")) cl-hex?)
|
|
((or (= letter "b") (= letter "B")) cl-binary?)
|
|
((or (= letter "o") (= letter "O")) cl-octal?)
|
|
(:else cl-digit?))))
|
|
{:type "integer"
|
|
:value (str "#" letter (read-while pred))
|
|
:pos p})))
|
|
|
|
(define emit (fn (tok) (append! toks tok)))
|
|
|
|
(define
|
|
scan
|
|
(fn
|
|
()
|
|
(when
|
|
(< pos n)
|
|
(let
|
|
((c (at)) (p pos))
|
|
(cond
|
|
((cl-ws? c) (do (adv) (scan)))
|
|
((= c ";") (do (adv) (skip-line) (scan)))
|
|
((= c "(") (do (adv) (emit (cl-make-tok "lparen" "(" p)) (scan)))
|
|
((= c ")") (do (adv) (emit (cl-make-tok "rparen" ")" p)) (scan)))
|
|
((= c "'") (do (adv) (emit (cl-make-tok "quote" "'" p)) (scan)))
|
|
((= c "`") (do (adv) (emit (cl-make-tok "backquote" "`" p)) (scan)))
|
|
((= c ",")
|
|
(do
|
|
(adv)
|
|
(if
|
|
(= (at) "@")
|
|
(do (adv) (emit (cl-make-tok "comma-at" ",@" p)))
|
|
(emit (cl-make-tok "comma" "," p)))
|
|
(scan)))
|
|
((= c "\"")
|
|
(do
|
|
(adv)
|
|
(emit (cl-make-tok "string" (read-str "") p))
|
|
(scan)))
|
|
;; :keyword
|
|
((= c ":")
|
|
(do
|
|
(adv)
|
|
(emit (cl-make-tok "keyword" (upcase (read-while cl-sym-char?)) p))
|
|
(scan)))
|
|
;; dispatch macro #
|
|
((= c "#")
|
|
(do
|
|
(adv)
|
|
(let
|
|
((d (at)))
|
|
(cond
|
|
((= d "'") (do (adv) (emit (cl-make-tok "hash-quote" "#'" p)) (scan)))
|
|
((= d "(") (do (adv) (emit (cl-make-tok "hash-paren" "#(" p)) (scan)))
|
|
((= d ":")
|
|
(do
|
|
(adv)
|
|
(emit
|
|
(cl-make-tok "uninterned" (upcase (read-while cl-sym-char?)) p))
|
|
(scan)))
|
|
((= d "|") (do (adv) (skip-block 1) (scan)))
|
|
((= d "\\")
|
|
(do (adv) (emit (cl-make-tok "char" (read-char-lit) p)) (scan)))
|
|
((or (= d "x") (= d "X"))
|
|
(do (adv) (emit (read-radix d p)) (scan)))
|
|
((or (= d "b") (= d "B"))
|
|
(do (adv) (emit (read-radix d p)) (scan)))
|
|
((or (= d "o") (= d "O"))
|
|
(do (adv) (emit (read-radix d p)) (scan)))
|
|
(:else (scan))))))
|
|
;; standalone dot, float .5, or symbol starting with dots
|
|
((= c ".")
|
|
(do
|
|
(adv)
|
|
(cond
|
|
((or (not (at)) (cl-terminating? (at)))
|
|
(do (emit (cl-make-tok "dot" "." p)) (scan)))
|
|
((cl-digit? (at))
|
|
(do
|
|
(emit
|
|
(cl-make-tok "float" (str "0." (read-while cl-digit?)) p))
|
|
(scan)))
|
|
(:else
|
|
(do
|
|
(emit
|
|
(cl-make-tok "symbol" (upcase (str "." (read-while cl-sym-char?))) p))
|
|
(scan))))))
|
|
;; sign followed by digit → number
|
|
((and (or (= c "+") (= c "-")) (peek1) (cl-digit? (peek1)))
|
|
(do
|
|
(adv)
|
|
(let
|
|
((first-d (at)))
|
|
(adv)
|
|
(emit (scan-num p (str c first-d))))
|
|
(scan)))
|
|
;; decimal digit → number
|
|
((cl-digit? c)
|
|
(do
|
|
(adv)
|
|
(emit (scan-num p c))
|
|
(scan)))
|
|
;; symbol constituent (includes bare +, -, etc.)
|
|
((cl-sym-char? c)
|
|
(do
|
|
(emit (cl-make-tok "symbol" (upcase (read-while cl-sym-char?)) p))
|
|
(scan)))
|
|
(:else (do (adv) (scan))))))))
|
|
|
|
(scan)
|
|
(append! toks (cl-make-tok "eof" nil n))
|
|
toks)))
|