lib/parser-combinators.sx — 46 definitions in 8 layers:
0. Result constructors (make-ok, make-fail, ok?, result-value, etc.)
1. Primitives (satisfy, parse-char, any-char, parse-string, eof)
2. Core combinators (fmap, parse-bind, seq, alt, label, lazy-parser)
3. Repetition (many, many1, optional, skip-many)
4. Structural (between, sep-by, sep-by1, skip-left, skip-right,
not-followed-by, look-ahead)
5. Character classes (digit, letter, alpha-num, whitespace, skip-spaces)
6. Literal parsers (number-literal, string-literal, identifier)
7. Run function (run-parser)
8. SX tokenizer (sx-comment, sx-keyword, sx-symbol, sx-number,
sx-string, sx-token, sx-tokenize)
Self-tests by tokenizing SX: (define x 42), {:ok true}, (+ 1 (* 2 3)),
comments, negative numbers, nested parens, recursive grammars.
No evaluator changes. Pure HO functions + thunks for lazy recursion.
2868/2868 tests, zero failures.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
485 lines
13 KiB
Plaintext
485 lines
13 KiB
Plaintext
;; Parser Combinator Library — Pure SX
|
|
;;
|
|
;; A parser is (fn (input pos) result) where:
|
|
;; Success: {:ok true :value val :pos new-pos}
|
|
;; Failure: {:ok false :expected desc :pos pos}
|
|
;;
|
|
;; Combinators compose parsers: seq, alt, many, sep-by, between, etc.
|
|
;; Recursive grammars use (lazy-parser (fn () ...)) thunks.
|
|
|
|
;; ── Layer 0: Result constructors ──────────────────────────────────
|
|
|
|
(define make-ok (fn (val pos) {:pos pos :ok true :value val}))
|
|
(define make-fail (fn (expected pos) {:pos pos :ok false :expected expected}))
|
|
(define ok? (fn (r) (get r "ok")))
|
|
(define result-value (fn (r) (get r "value")))
|
|
(define result-pos (fn (r) (get r "pos")))
|
|
(define result-expected (fn (r) (get r "expected")))
|
|
|
|
;; ── Layer 1: Primitive parsers ────────────────────────────────────
|
|
|
|
(define
|
|
satisfy
|
|
(fn
|
|
(pred desc)
|
|
(fn
|
|
(input pos)
|
|
(if
|
|
(< pos (len input))
|
|
(let
|
|
((ch (nth input pos)))
|
|
(if (pred ch) (make-ok ch (+ pos 1)) (make-fail desc pos)))
|
|
(make-fail desc pos)))))
|
|
|
|
(define
|
|
parse-char
|
|
(fn (ch) (satisfy (fn (c) (= c ch)) (str "'" ch "'"))))
|
|
|
|
(define any-char (satisfy (fn (c) true) "any character"))
|
|
|
|
(define
|
|
parse-string
|
|
(fn
|
|
(target)
|
|
(let
|
|
((target-len (len target)))
|
|
(fn
|
|
(input pos)
|
|
(if
|
|
(<= (+ pos target-len) (len input))
|
|
(if
|
|
(= (slice input pos (+ pos target-len)) target)
|
|
(make-ok target (+ pos target-len))
|
|
(make-fail (str "\"" target "\"") pos))
|
|
(make-fail (str "\"" target "\"") pos))))))
|
|
|
|
(define
|
|
eof
|
|
(fn
|
|
(input pos)
|
|
(if
|
|
(>= pos (len input))
|
|
(make-ok nil pos)
|
|
(make-fail "end of input" pos))))
|
|
|
|
;; ── Layer 2: Core combinators ─────────────────────────────────────
|
|
|
|
(define
|
|
fmap
|
|
(fn
|
|
(f parser)
|
|
(fn
|
|
(input pos)
|
|
(let
|
|
((r (parser input pos)))
|
|
(if (ok? r) (make-ok (f (result-value r)) (result-pos r)) r)))))
|
|
|
|
(define
|
|
parse-bind
|
|
(fn
|
|
(parser f)
|
|
(fn
|
|
(input pos)
|
|
(let
|
|
((r (parser input pos)))
|
|
(if (ok? r) ((f (result-value r)) input (result-pos r)) r)))))
|
|
|
|
(define
|
|
seq
|
|
(fn
|
|
(parsers)
|
|
(fn
|
|
(input pos)
|
|
(let
|
|
((results (list)) (cur-pos pos) (failed nil))
|
|
(for-each
|
|
(fn
|
|
(p)
|
|
(when
|
|
(not failed)
|
|
(let
|
|
((r (p input cur-pos)))
|
|
(if
|
|
(ok? r)
|
|
(do
|
|
(append! results (result-value r))
|
|
(set! cur-pos (result-pos r)))
|
|
(set! failed r)))))
|
|
parsers)
|
|
(if failed failed (make-ok results cur-pos))))))
|
|
|
|
(define seq2 (fn (p1 p2) (seq (list p1 p2))))
|
|
|
|
(define
|
|
alt
|
|
(fn
|
|
(parsers)
|
|
(fn
|
|
(input pos)
|
|
(let
|
|
((best-fail nil))
|
|
(define
|
|
loop
|
|
(fn
|
|
(ps)
|
|
(if
|
|
(empty? ps)
|
|
(or best-fail (make-fail "no alternatives" pos))
|
|
(let
|
|
((r ((first ps) input pos)))
|
|
(if
|
|
(ok? r)
|
|
r
|
|
(do
|
|
(when
|
|
(or
|
|
(not best-fail)
|
|
(> (result-pos r) (result-pos best-fail)))
|
|
(set! best-fail r))
|
|
(loop (rest ps))))))))
|
|
(loop parsers)))))
|
|
|
|
(define alt2 (fn (p1 p2) (alt (list p1 p2))))
|
|
|
|
(define
|
|
label
|
|
(fn
|
|
(name parser)
|
|
(fn
|
|
(input pos)
|
|
(let ((r (parser input pos))) (if (ok? r) r (make-fail name pos))))))
|
|
|
|
(define lazy-parser (fn (thunk) (fn (input pos) ((thunk) input pos))))
|
|
|
|
;; ── Layer 3: Repetition combinators ───────────────────────────────
|
|
|
|
(define
|
|
many
|
|
(fn
|
|
(parser)
|
|
(fn
|
|
(input pos)
|
|
(let
|
|
((results (list)) (cur-pos pos))
|
|
(define
|
|
loop
|
|
(fn
|
|
()
|
|
(let
|
|
((r (parser input cur-pos)))
|
|
(if
|
|
(ok? r)
|
|
(do
|
|
(append! results (result-value r))
|
|
(set! cur-pos (result-pos r))
|
|
(loop))
|
|
(make-ok results cur-pos)))))
|
|
(loop)))))
|
|
|
|
(define
|
|
many1
|
|
(fn
|
|
(parser)
|
|
(fn
|
|
(input pos)
|
|
(let
|
|
((r (parser input pos)))
|
|
(if
|
|
(ok? r)
|
|
(let
|
|
((rest-r ((many parser) input (result-pos r))))
|
|
(make-ok
|
|
(cons (result-value r) (result-value rest-r))
|
|
(result-pos rest-r)))
|
|
r)))))
|
|
|
|
(define
|
|
optional
|
|
(fn
|
|
(parser)
|
|
(fn
|
|
(input pos)
|
|
(let ((r (parser input pos))) (if (ok? r) r (make-ok nil pos))))))
|
|
|
|
(define
|
|
skip-many
|
|
(fn
|
|
(parser)
|
|
(fn
|
|
(input pos)
|
|
(let
|
|
((cur-pos pos))
|
|
(define
|
|
loop
|
|
(fn
|
|
()
|
|
(let
|
|
((r (parser input cur-pos)))
|
|
(if
|
|
(ok? r)
|
|
(do (set! cur-pos (result-pos r)) (loop))
|
|
(make-ok nil cur-pos)))))
|
|
(loop)))))
|
|
|
|
;; ── Layer 4: Structural combinators ───────────────────────────────
|
|
|
|
(define
|
|
between
|
|
(fn
|
|
(open close body)
|
|
(fmap (fn (results) (nth results 1)) (seq (list open body close)))))
|
|
|
|
(define
|
|
sep-by1
|
|
(fn
|
|
(parser sep)
|
|
(fn
|
|
(input pos)
|
|
(let
|
|
((first-r (parser input pos)))
|
|
(if
|
|
(ok? first-r)
|
|
(let
|
|
((rest-r ((many (fmap (fn (pair) (nth pair 1)) (seq (list sep parser)))) input (result-pos first-r))))
|
|
(make-ok
|
|
(cons (result-value first-r) (result-value rest-r))
|
|
(result-pos rest-r)))
|
|
first-r)))))
|
|
|
|
(define
|
|
sep-by
|
|
(fn
|
|
(parser sep)
|
|
(alt
|
|
(list (sep-by1 parser sep) (fn (input pos) (make-ok (list) pos))))))
|
|
|
|
(define
|
|
skip-left
|
|
(fn
|
|
(skip keep)
|
|
(fmap (fn (results) (nth results 1)) (seq (list skip keep)))))
|
|
|
|
(define
|
|
skip-right
|
|
(fn
|
|
(keep skip)
|
|
(fmap (fn (results) (first results)) (seq (list keep skip)))))
|
|
|
|
(define
|
|
not-followed-by
|
|
(fn
|
|
(parser)
|
|
(fn
|
|
(input pos)
|
|
(let
|
|
((r (parser input pos)))
|
|
(if (ok? r) (make-fail "not followed by" pos) (make-ok nil pos))))))
|
|
|
|
(define
|
|
look-ahead
|
|
(fn
|
|
(parser)
|
|
(fn
|
|
(input pos)
|
|
(let
|
|
((r (parser input pos)))
|
|
(if (ok? r) (make-ok (result-value r) pos) r)))))
|
|
|
|
;; ── Layer 5: Character class parsers ──────────────────────────────
|
|
|
|
(define digit (satisfy (fn (c) (and (>= c "0") (<= c "9"))) "digit"))
|
|
|
|
(define
|
|
letter
|
|
(satisfy
|
|
(fn
|
|
(c)
|
|
(or (and (>= c "a") (<= c "z")) (and (>= c "A") (<= c "Z"))))
|
|
"letter"))
|
|
|
|
(define alpha-num (alt2 letter digit))
|
|
|
|
(define
|
|
whitespace-char
|
|
(satisfy
|
|
(fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r")))
|
|
"whitespace"))
|
|
|
|
(define skip-spaces (skip-many whitespace-char))
|
|
|
|
;; ── Layer 6: Literal parsers ──────────────────────────────────────
|
|
|
|
(define
|
|
number-literal
|
|
(fn
|
|
(input pos)
|
|
(let
|
|
((start pos) (cur pos) (input-len (len input)))
|
|
(when
|
|
(and (< cur input-len) (= (nth input cur) "-"))
|
|
(set! cur (+ cur 1)))
|
|
(let
|
|
((digit-start cur))
|
|
(define
|
|
scan-digits
|
|
(fn
|
|
()
|
|
(when
|
|
(and
|
|
(< cur input-len)
|
|
(>= (nth input cur) "0")
|
|
(<= (nth input cur) "9"))
|
|
(set! cur (+ cur 1))
|
|
(scan-digits))))
|
|
(scan-digits)
|
|
(if
|
|
(= cur digit-start)
|
|
(make-fail "number" pos)
|
|
(do
|
|
(when
|
|
(and (< cur input-len) (= (nth input cur) "."))
|
|
(set! cur (+ cur 1))
|
|
(scan-digits))
|
|
(make-ok (parse-float (slice input start cur)) cur)))))))
|
|
|
|
(define
|
|
string-literal
|
|
(fn
|
|
(input pos)
|
|
(if
|
|
(and (< pos (len input)) (= (nth input pos) "\""))
|
|
(let
|
|
((cur (+ pos 1)) (input-len (len input)) (chars (list)))
|
|
(define
|
|
loop
|
|
(fn
|
|
()
|
|
(if
|
|
(>= cur input-len)
|
|
(make-fail "closing quote" cur)
|
|
(let
|
|
((ch (nth input cur)))
|
|
(cond
|
|
(= ch "\"")
|
|
(make-ok (join "" chars) (+ cur 1))
|
|
(= ch "\\")
|
|
(if
|
|
(>= (+ cur 1) input-len)
|
|
(make-fail "escape character" cur)
|
|
(let
|
|
((next (nth input (+ cur 1))))
|
|
(cond
|
|
(= next "n")
|
|
(do
|
|
(append! chars "\n")
|
|
(set! cur (+ cur 2))
|
|
(loop))
|
|
(= next "t")
|
|
(do
|
|
(append! chars "\t")
|
|
(set! cur (+ cur 2))
|
|
(loop))
|
|
(= next "\\")
|
|
(do
|
|
(append! chars "\\")
|
|
(set! cur (+ cur 2))
|
|
(loop))
|
|
(= next "\"")
|
|
(do
|
|
(append! chars "\"")
|
|
(set! cur (+ cur 2))
|
|
(loop))
|
|
:else (do
|
|
(append! chars next)
|
|
(set! cur (+ cur 2))
|
|
(loop)))))
|
|
:else (do (append! chars ch) (set! cur (+ cur 1)) (loop)))))))
|
|
(loop))
|
|
(make-fail "string" pos))))
|
|
|
|
(define
|
|
identifier
|
|
(let
|
|
((id-start (satisfy (fn (c) (or (and (>= c "a") (<= c "z")) (and (>= c "A") (<= c "Z")) (= c "_") (= c "~"))) "identifier start"))
|
|
(id-char
|
|
(satisfy
|
|
(fn
|
|
(c)
|
|
(or
|
|
(and (>= c "a") (<= c "z"))
|
|
(and (>= c "A") (<= c "Z"))
|
|
(and (>= c "0") (<= c "9"))
|
|
(= c "-")
|
|
(= c "_")
|
|
(= c "?")
|
|
(= c "!")
|
|
(= c "~")
|
|
(= c "/")))
|
|
"identifier char")))
|
|
(fmap
|
|
(fn (pair) (str (first pair) (join "" (nth pair 1))))
|
|
(seq (list id-start (many id-char))))))
|
|
|
|
;; ── Layer 7: Run parser ───────────────────────────────────────────
|
|
|
|
(define run-parser (fn (parser input) (parser input 0)))
|
|
|
|
;; ── Layer 8: SX tokenizer ─────────────────────────────────────────
|
|
|
|
(define
|
|
sx-comment
|
|
(fn
|
|
(input pos)
|
|
(if
|
|
(and (< pos (len input)) (= (nth input pos) ";"))
|
|
(let
|
|
((cur (+ pos 1)) (input-len (len input)))
|
|
(define
|
|
loop
|
|
(fn
|
|
()
|
|
(if
|
|
(or (>= cur input-len) (= (nth input cur) "\n"))
|
|
(make-ok {:type "comment"} cur)
|
|
(do (set! cur (+ cur 1)) (loop)))))
|
|
(loop))
|
|
(make-fail "comment" pos))))
|
|
|
|
(define
|
|
sx-keyword
|
|
(parse-bind
|
|
(parse-char ":")
|
|
(fn (colon) (fmap (fn (name) {:value (str ":" name) :type "keyword"}) identifier))))
|
|
|
|
(define
|
|
sx-symbol
|
|
(let
|
|
((sym-char (satisfy (fn (c) (or (and (>= c "a") (<= c "z")) (and (>= c "A") (<= c "Z")) (and (>= c "0") (<= c "9")) (= c "-") (= c "_") (= c "?") (= c "!") (= c "+") (= c "*") (= c "/") (= c "<") (= c ">") (= c "=") (= c "~") (= c "&") (= c ".") (= c "#"))) "symbol char")))
|
|
(fmap (fn (chars) {:value (join "" chars) :type "symbol"}) (many1 sym-char))))
|
|
|
|
(define sx-number (fmap (fn (n) {:value n :type "number"}) number-literal))
|
|
|
|
(define sx-string (fmap (fn (s) {:value s :type "string"}) string-literal))
|
|
|
|
(define
|
|
sx-token
|
|
(alt
|
|
(list
|
|
sx-comment
|
|
sx-number
|
|
sx-string
|
|
sx-keyword
|
|
(fmap (fn (c) {:type "open-paren"}) (parse-char "("))
|
|
(fmap (fn (c) {:type "close-paren"}) (parse-char ")"))
|
|
(fmap (fn (c) {:type "open-brace"}) (parse-char "{"))
|
|
(fmap (fn (c) {:type "close-brace"}) (parse-char "}"))
|
|
(fmap (fn (c) {:type "quote"}) (parse-char "'"))
|
|
sx-symbol)))
|
|
|
|
(define
|
|
sx-tokenize
|
|
(fmap
|
|
(fn
|
|
(tokens)
|
|
(filter (fn (t) (not (= (get t "type") "comment"))) tokens))
|
|
(skip-left skip-spaces (many (skip-right sx-token skip-spaces))))) |