Files
rose-ash/lib/scheme/parser.sx
giles c919d9a0d7
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 21s
scheme: Phase 1 parser — R7RS lexical reader + 62 tests [consumes-lex]
lib/scheme/parser.sx — reader for R7RS-small lexical syntax:
- numbers (int/float/exp)
- booleans #t / #f / #true / #false
- strings with standard escapes
- symbols (permissive — any non-delimiter)
- characters #\c, #\space, #\newline, #\tab, etc.
- vectors #(...)
- proper lists (dotted-pair deferred to Phase 3 with lambda rest-args)
- reader macros: 'X `X ,X ,@X → (quote X) (quasiquote X) etc.
  (Scheme conventions — lowercase, no $ prefix)
- line comments ;
- nestable block comments #| ... |#
- datum comments #;<datum>

AST shape mirrors Kernel: numbers/booleans/lists pass through;
strings wrapped as {:scm-string ...} to distinguish from symbols
(bare SX strings); chars as {:scm-char ...}; vectors as
{:scm-vector (list ...)}.

62 tests in lib/scheme/tests/parse.sx cover atom kinds, escape
sequences, quote/quasiquote/unquote/unquote-splicing, all three
comment flavours, and classic Scheme idioms (lambda, define, let,
if-cond).

Note: SX cond branches evaluate only the LAST expression, so
multi-mutation branches need explicit (do ...) or (begin ...)
wrappers — caught during block-comment debugging.

chisel: consumes-lex (lex-digit?, lex-whitespace? from
lib/guest/lex.sx); pratt not consumed (no operator precedence
in Scheme).
2026-05-13 19:58:30 +00:00

364 lines
11 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
;; lib/scheme/parser.sx — R7RS-small lexical reader.
;;
;; Reads numbers, booleans, characters, strings, symbols, vectors,
;; proper lists, and the standard reader macros (' ` , ,@). Line
;; comments `;`, datum comments `#;`, and nestable block comments
;; `#| ... |#` are all stripped.
;;
;; Dotted-pair syntax `(a b . c)` is deferred to Phase 3 — it's only
;; needed once lambdas with rest args land.
;;
;; AST shape
;; ---------
;; number → SX number
;; #t / #true → SX true ; #f / #false → SX false
;; () → SX empty list
;; "..." → {:scm-string "..."} wrapped so symbols stay bare
;; #\c → {:scm-char "c"} single-char (or named: space/newline/etc.)
;; #(1 2 3) → {:scm-vector (list 1 2 3)}
;; foo → "foo" bare SX string for symbols
;; (a b c) → SX list of parsed forms
;; 'X → (list "quote" X)
;; `X → (list "quasiquote" X)
;; ,X → (list "unquote" X)
;; ,@X → (list "unquote-splicing" X)
;;
;; Public API
;; (scheme-parse SRC) — first form; errors on extra trailing input
;; (scheme-parse-all SRC) — all top-level forms, as SX list
;; (scheme-string? V) — recognise wrapped string literal
;; (scheme-string-value V) — extract the underlying string
;; (scheme-char? V) / (scheme-char-value V)
;; (scheme-vector? V) / (scheme-vector-elements V)
;;
;; Consumes: lib/guest/lex.sx (lex-digit?, lex-whitespace?)
(define scheme-string-make (fn (s) {:scm-string s}))
(define
scheme-string?
(fn (v) (and (dict? v) (string? (get v :scm-string)))))
(define scheme-string-value (fn (v) (get v :scm-string)))
(define scheme-char-make (fn (s) {:scm-char s}))
(define
scheme-char?
(fn (v) (and (dict? v) (string? (get v :scm-char)))))
(define scheme-char-value (fn (v) (get v :scm-char)))
(define scheme-vector-make (fn (xs) {:scm-vector xs}))
(define
scheme-vector?
(fn (v) (and (dict? v) (list? (get v :scm-vector)))))
(define scheme-vector-elements (fn (v) (get v :scm-vector)))
;; Atom delimiters: characters that end a symbol or numeric token.
(define
scm-delim?
(fn
(c)
(or
(nil? c)
(lex-whitespace? c)
(= c "(")
(= c ")")
(= c "\"")
(= c ";")
(= c "'")
(= c "`")
(= c ",")
(= c "|"))))
;; Numeric grammar: [+-]? (digit+ ('.' digit+)? | '.' digit+) ([eE][+-]?digit+)?
(define
scm-numeric?
(fn
(s)
(let
((n (string-length s)))
(cond
((= n 0) false)
(:else
(let
((c0 (substring s 0 1)))
(let
((start (if (or (= c0 "+") (= c0 "-")) 1 0)))
(scm-num-body? s start n))))))))
(define
scm-num-body?
(fn
(s start n)
(cond
((>= start n) false)
((= (substring s start (+ start 1)) ".")
(scm-num-need-digits? s (+ start 1) n false))
((lex-digit? (substring s start (+ start 1)))
(scm-num-int-tail? s (+ start 1) n))
(:else false))))
(define
scm-num-int-tail?
(fn
(s i n)
(cond
((>= i n) true)
((lex-digit? (substring s i (+ i 1)))
(scm-num-int-tail? s (+ i 1) n))
((= (substring s i (+ i 1)) ".")
(scm-num-need-digits? s (+ i 1) n true))
((or (= (substring s i (+ i 1)) "e") (= (substring s i (+ i 1)) "E"))
(scm-num-exp-sign? s (+ i 1) n))
(:else false))))
(define
scm-num-need-digits?
(fn
(s i n had-int)
(cond
((>= i n) had-int)
((lex-digit? (substring s i (+ i 1)))
(scm-num-frac-tail? s (+ i 1) n))
(:else false))))
(define
scm-num-frac-tail?
(fn
(s i n)
(cond
((>= i n) true)
((lex-digit? (substring s i (+ i 1)))
(scm-num-frac-tail? s (+ i 1) n))
((or (= (substring s i (+ i 1)) "e") (= (substring s i (+ i 1)) "E"))
(scm-num-exp-sign? s (+ i 1) n))
(:else false))))
(define
scm-num-exp-sign?
(fn
(s i n)
(cond
((>= i n) false)
((or (= (substring s i (+ i 1)) "+") (= (substring s i (+ i 1)) "-"))
(scm-num-exp-digits? s (+ i 1) n false))
(:else (scm-num-exp-digits? s i n false)))))
(define
scm-num-exp-digits?
(fn
(s i n had)
(cond
((>= i n) had)
((lex-digit? (substring s i (+ i 1)))
(scm-num-exp-digits? s (+ i 1) n true))
(:else false))))
;; Named character literals: #\space, #\newline, etc.
(define
scm-named-char
(fn
(name)
(cond
((= name "space") " ")
((= name "newline") "\n")
((= name "tab") "\t")
((= name "return") "\r")
((= name "null") "")
((= name "nul") "")
((= name "delete") "")
((= name "rubout") "")
((= name "backspace") "")
((= name "escape") "")
((= name "alarm") "")
(:else nil))))
;; Reader: closure over (src, pos).
(define
scm-make-reader
(fn
(src)
(let
((pos 0) (n (string-length src)))
(define
at
(fn () (if (< pos n) (substring src pos (+ pos 1)) nil)))
(define
peek2
(fn
()
(if
(< (+ pos 1) n)
(substring src (+ pos 1) (+ pos 2))
nil)))
(define adv (fn () (set! pos (+ pos 1))))
(define
skip-line
(fn () (when (and (at) (not (= (at) "\n"))) (adv) (skip-line))))
(define
skip-block-comment
(fn
(depth)
(cond
((nil? (at)) (error "scheme-parse: unterminated block comment"))
((and (= (at) "|") (= (peek2) "#"))
(do (adv) (adv)
(cond
((> depth 1) (skip-block-comment (- depth 1)))
(:else nil))))
((and (= (at) "#") (= (peek2) "|"))
(do (adv) (adv) (skip-block-comment (+ depth 1))))
(:else (do (adv) (skip-block-comment depth))))))
(define
skip-ws
(fn
()
(cond
((nil? (at)) nil)
((lex-whitespace? (at)) (do (adv) (skip-ws)))
((= (at) ";") (do (adv) (skip-line) (skip-ws)))
((and (= (at) "#") (= (peek2) "|"))
(do (adv) (adv) (skip-block-comment 1) (skip-ws)))
(:else nil))))
(define
read-string-body
(fn
(acc)
(cond
((nil? (at)) (error "scheme-parse: unterminated string"))
((= (at) "\"") (do (adv) acc))
((= (at) "\\")
(do
(adv)
(let
((c (at)))
(when (nil? c) (error "scheme-parse: trailing backslash"))
(adv)
(read-string-body
(str
acc
(cond
((= c "n") "\n")
((= c "t") "\t")
((= c "r") "\r")
((= c "\"") "\"")
((= c "\\") "\\")
((= c "a") "")
((= c "b") "")
(:else c)))))))
(:else
(let ((c (at))) (adv) (read-string-body (str acc c)))))))
(define
read-atom-body
(fn
(acc)
(cond
((scm-delim? (at)) acc)
(:else (let ((c (at))) (adv) (read-atom-body (str acc c)))))))
(define
classify-atom
(fn (s) (cond ((scm-numeric? s) (string->number s)) (:else s))))
(define
read-hash
(fn
()
(let
((c (at)))
(cond
((= c "t")
(do
(adv)
(when (= (at) "r") (do (adv) (adv) (adv) (adv)))
true))
((= c "f")
(do
(adv)
(when
(= (at) "a")
(do (adv) (adv) (adv) (adv) (adv)))
false))
((= c "\\") (do (adv) (read-char-lit)))
((= c "(")
(do (adv) (scheme-vector-make (read-list (list)))))
((= c ";") (do (adv) (read-form) (read-form)))
(:else (error (str "scheme-parse: unknown # form: #" c)))))))
(define
read-char-lit
(fn
()
(let
((first-c (at)))
(when (nil? first-c) (error "scheme-parse: end after #\\"))
(adv)
(cond
((scm-delim? (at)) (scheme-char-make first-c))
(:else
(let
((name (str first-c (read-atom-body ""))))
(let
((named (scm-named-char name)))
(cond
((not (nil? named)) (scheme-char-make named))
((= (string-length name) 1)
(scheme-char-make name))
(:else
(error (str "scheme-parse: unknown char #\\" name)))))))))))
(define
read-form
(fn
()
(skip-ws)
(cond
((nil? (at)) :scm-eof)
((= (at) ")") (error "scheme-parse: unexpected ')'"))
((= (at) "(") (do (adv) (read-list (list))))
((= (at) "\"")
(do (adv) (scheme-string-make (read-string-body ""))))
((= (at) "#") (do (adv) (read-hash)))
((= (at) "'") (do (adv) (list "quote" (read-form))))
((= (at) "`") (do (adv) (list "quasiquote" (read-form))))
((= (at) ",")
(do
(adv)
(cond
((= (at) "@")
(do (adv) (list "unquote-splicing" (read-form))))
(:else (list "unquote" (read-form))))))
(:else (classify-atom (read-atom-body ""))))))
(define
read-list
(fn
(acc)
(skip-ws)
(cond
((nil? (at)) (error "scheme-parse: unterminated list"))
((= (at) ")") (do (adv) acc))
(:else (read-list (append acc (list (read-form))))))))
(define
read-all
(fn
(acc)
(skip-ws)
(if (nil? (at)) acc (read-all (append acc (list (read-form)))))))
{:read-form read-form :read-all read-all})))
(define
scheme-parse-all
(fn (src) ((get (scm-make-reader src) :read-all) (list))))
(define
scheme-parse
(fn
(src)
(let
((r (scm-make-reader src)))
(let
((form ((get r :read-form))))
(cond
((= form :scm-eof) (error "scheme-parse: empty input"))
(:else
(let
((next ((get r :read-form))))
(if
(= next :scm-eof)
form
(error "scheme-parse: trailing input after first form")))))))))