Adds a full regex engine written in SX, installed via js-regex-platform-override!.
Supports char classes (. \d\D\w\W\s\S [abc] [^abc] ranges), anchors (^ $ \b \B),
quantifiers (* + ? {n,m} greedy and lazy), capturing/non-capturing groups,
alternation (a|b), flags i/g/m. exec() returns {:match :index :input :groups}.
Also fixes String.prototype.match to dispatch through the platform engine
(was calling js-regex-stub-exec directly, bypassing regex.sx).
Adds TDZ sentinel infrastructure: __js_tdz_sentinel__, js-tdz?, js-tdz-check.
Updates test.sh (+34 regex tests + 4 TDZ infra tests), conformance.sh,
and test262-runner.py to load regex.sx as epoch 6.
Tests: 559/560 unit (1 pre-existing failure), 148/148 conformance.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
944 lines
29 KiB
Plaintext
944 lines
29 KiB
Plaintext
;; lib/js/regex.sx — pure-SX recursive backtracking regex engine
|
|
;;
|
|
;; Installed via (js-regex-platform-override! ...) at load time.
|
|
;; Covers: character classes (\d\w\s . [abc] [^abc] [a-z]),
|
|
;; anchors (^ $ \b \B), quantifiers (* + ? {n,m} lazy variants),
|
|
;; groups (capturing + non-capturing), alternation (a|b),
|
|
;; flags: i (case-insensitive), g (global), m (multiline).
|
|
;;
|
|
;; Architecture:
|
|
;; 1. rx-parse-pattern — pattern string → compiled node list
|
|
;; 2. rx-match-nodes — recursive backtracker
|
|
;; 3. rx-exec / rx-test — public interface
|
|
;; 4. Install as {:test rx-test :exec rx-exec}
|
|
|
|
;; ── Utilities ─────────────────────────────────────────────────────
|
|
|
|
(define
|
|
rx-char-at
|
|
(fn (s i) (if (and (>= i 0) (< i (len s))) (char-at s i) "")))
|
|
|
|
(define
|
|
rx-digit?
|
|
(fn
|
|
(c)
|
|
(and (not (= c "")) (>= (char-code c) 48) (<= (char-code c) 57))))
|
|
|
|
(define
|
|
rx-word?
|
|
(fn
|
|
(c)
|
|
(and
|
|
(not (= c ""))
|
|
(or
|
|
(and (>= (char-code c) 65) (<= (char-code c) 90))
|
|
(and (>= (char-code c) 97) (<= (char-code c) 122))
|
|
(and (>= (char-code c) 48) (<= (char-code c) 57))
|
|
(= c "_")))))
|
|
|
|
(define
|
|
rx-space?
|
|
(fn
|
|
(c)
|
|
(or (= c " ") (= c "\t") (= c "\n") (= c "\r") (= c "\\f") (= c ""))))
|
|
|
|
(define rx-newline? (fn (c) (or (= c "\n") (= c "\r"))))
|
|
|
|
(define
|
|
rx-downcase-char
|
|
(fn
|
|
(c)
|
|
(let
|
|
((cc (char-code c)))
|
|
(if (and (>= cc 65) (<= cc 90)) (char-from-code (+ cc 32)) c))))
|
|
|
|
(define
|
|
rx-char-eq?
|
|
(fn
|
|
(a b ci?)
|
|
(if ci? (= (rx-downcase-char a) (rx-downcase-char b)) (= a b))))
|
|
|
|
(define
|
|
rx-parse-int
|
|
(fn
|
|
(pat i acc)
|
|
(let
|
|
((c (rx-char-at pat i)))
|
|
(if
|
|
(rx-digit? c)
|
|
(rx-parse-int pat (+ i 1) (+ (* acc 10) (- (char-code c) 48)))
|
|
(list acc i)))))
|
|
|
|
(define
|
|
rx-hex-digit-val
|
|
(fn
|
|
(c)
|
|
(cond
|
|
((and (>= (char-code c) 48) (<= (char-code c) 57))
|
|
(- (char-code c) 48))
|
|
((and (>= (char-code c) 65) (<= (char-code c) 70))
|
|
(+ 10 (- (char-code c) 65)))
|
|
((and (>= (char-code c) 97) (<= (char-code c) 102))
|
|
(+ 10 (- (char-code c) 97)))
|
|
(else -1))))
|
|
|
|
(define
|
|
rx-parse-hex-n
|
|
(fn
|
|
(pat i n acc)
|
|
(if
|
|
(= n 0)
|
|
(list (char-from-code acc) i)
|
|
(let
|
|
((v (rx-hex-digit-val (rx-char-at pat i))))
|
|
(if
|
|
(< v 0)
|
|
(list (char-from-code acc) i)
|
|
(rx-parse-hex-n pat (+ i 1) (- n 1) (+ (* acc 16) v)))))))
|
|
|
|
;; ── Pattern compiler ──────────────────────────────────────────────
|
|
|
|
;; Node types (stored in dicts with "__t__" key):
|
|
;; literal : {:__t__ "literal" :__c__ char}
|
|
;; any : {:__t__ "any"}
|
|
;; class-d : {:__t__ "class-d" :__neg__ bool}
|
|
;; class-w : {:__t__ "class-w" :__neg__ bool}
|
|
;; class-s : {:__t__ "class-s" :__neg__ bool}
|
|
;; char-class: {:__t__ "char-class" :__neg__ bool :__items__ list}
|
|
;; anchor-start / anchor-end / anchor-word / anchor-nonword
|
|
;; quant : {:__t__ "quant" :__node__ n :__min__ m :__max__ mx :__lazy__ bool}
|
|
;; group : {:__t__ "group" :__idx__ i :__nodes__ list}
|
|
;; ncgroup : {:__t__ "ncgroup" :__nodes__ list}
|
|
;; alt : {:__t__ "alt" :__branches__ list-of-node-lists}
|
|
|
|
;; parse one escape after `\`, returns (node new-i)
|
|
(define
|
|
rx-parse-escape
|
|
(fn
|
|
(pat i)
|
|
(let
|
|
((c (rx-char-at pat i)))
|
|
(cond
|
|
((= c "d") (list (dict "__t__" "class-d" "__neg__" false) (+ i 1)))
|
|
((= c "D") (list (dict "__t__" "class-d" "__neg__" true) (+ i 1)))
|
|
((= c "w") (list (dict "__t__" "class-w" "__neg__" false) (+ i 1)))
|
|
((= c "W") (list (dict "__t__" "class-w" "__neg__" true) (+ i 1)))
|
|
((= c "s") (list (dict "__t__" "class-s" "__neg__" false) (+ i 1)))
|
|
((= c "S") (list (dict "__t__" "class-s" "__neg__" true) (+ i 1)))
|
|
((= c "b") (list (dict "__t__" "anchor-word") (+ i 1)))
|
|
((= c "B") (list (dict "__t__" "anchor-nonword") (+ i 1)))
|
|
((= c "n") (list (dict "__t__" "literal" "__c__" "\n") (+ i 1)))
|
|
((= c "r") (list (dict "__t__" "literal" "__c__" "\r") (+ i 1)))
|
|
((= c "t") (list (dict "__t__" "literal" "__c__" "\t") (+ i 1)))
|
|
((= c "f") (list (dict "__t__" "literal" "__c__" "\\f") (+ i 1)))
|
|
((= c "v") (list (dict "__t__" "literal" "__c__" "") (+ i 1)))
|
|
((= c "u")
|
|
(let
|
|
((res (rx-parse-hex-n pat (+ i 1) 4 0)))
|
|
(list (dict "__t__" "literal" "__c__" (nth res 0)) (nth res 1))))
|
|
((= c "x")
|
|
(let
|
|
((res (rx-parse-hex-n pat (+ i 1) 2 0)))
|
|
(list (dict "__t__" "literal" "__c__" (nth res 0)) (nth res 1))))
|
|
(else (list (dict "__t__" "literal" "__c__" c) (+ i 1)))))))
|
|
|
|
;; parse a char-class item inside [...], returns (item new-i)
|
|
(define
|
|
rx-parse-class-item
|
|
(fn
|
|
(pat i)
|
|
(let
|
|
((c (rx-char-at pat i)))
|
|
(cond
|
|
((= c "\\")
|
|
(let
|
|
((esc (rx-parse-escape pat (+ i 1))))
|
|
(let
|
|
((node (nth esc 0)) (ni (nth esc 1)))
|
|
(let
|
|
((t (get node "__t__")))
|
|
(cond
|
|
((= t "class-d")
|
|
(list
|
|
(dict "kind" "class-d" "neg" (get node "__neg__"))
|
|
ni))
|
|
((= t "class-w")
|
|
(list
|
|
(dict "kind" "class-w" "neg" (get node "__neg__"))
|
|
ni))
|
|
((= t "class-s")
|
|
(list
|
|
(dict "kind" "class-s" "neg" (get node "__neg__"))
|
|
ni))
|
|
(else
|
|
(let
|
|
((lc (get node "__c__")))
|
|
(if
|
|
(and
|
|
(= (rx-char-at pat ni) "-")
|
|
(not (= (rx-char-at pat (+ ni 1)) "]")))
|
|
(let
|
|
((hi-c (rx-char-at pat (+ ni 1))))
|
|
(list
|
|
(dict "kind" "range" "lo" lc "hi" hi-c)
|
|
(+ ni 2)))
|
|
(list (dict "kind" "lit" "c" lc) ni)))))))))
|
|
(else
|
|
(if
|
|
(and
|
|
(not (= c ""))
|
|
(= (rx-char-at pat (+ i 1)) "-")
|
|
(not (= (rx-char-at pat (+ i 2)) "]"))
|
|
(not (= (rx-char-at pat (+ i 2)) "")))
|
|
(let
|
|
((hi-c (rx-char-at pat (+ i 2))))
|
|
(list (dict "kind" "range" "lo" c "hi" hi-c) (+ i 3)))
|
|
(list (dict "kind" "lit" "c" c) (+ i 1))))))))
|
|
|
|
(define
|
|
rx-parse-class-items
|
|
(fn
|
|
(pat i items)
|
|
(let
|
|
((c (rx-char-at pat i)))
|
|
(if
|
|
(or (= c "]") (= c ""))
|
|
(list items i)
|
|
(let
|
|
((res (rx-parse-class-item pat i)))
|
|
(begin
|
|
(append! items (nth res 0))
|
|
(rx-parse-class-items pat (nth res 1) items)))))))
|
|
|
|
;; parse a sequence until stop-ch or EOF; returns (nodes new-i groups-count)
|
|
(define
|
|
rx-parse-seq
|
|
(fn
|
|
(pat i stop-ch ds)
|
|
(let
|
|
((c (rx-char-at pat i)))
|
|
(cond
|
|
((= c "") (list (get ds "nodes") i (get ds "groups")))
|
|
((= c stop-ch) (list (get ds "nodes") i (get ds "groups")))
|
|
((= c "|") (rx-parse-alt-rest pat i ds))
|
|
(else
|
|
(let
|
|
((res (rx-parse-atom pat i ds)))
|
|
(let
|
|
((node (nth res 0)) (ni (nth res 1)) (ds2 (nth res 2)))
|
|
(let
|
|
((qres (rx-parse-quant pat ni node)))
|
|
(begin
|
|
(append! (get ds2 "nodes") (nth qres 0))
|
|
(rx-parse-seq pat (nth qres 1) stop-ch ds2))))))))))
|
|
|
|
;; when we hit | inside a sequence, collect all alternatives
|
|
(define
|
|
rx-parse-alt-rest
|
|
(fn
|
|
(pat i ds)
|
|
(let
|
|
((left-branch (get ds "nodes")) (branches (list)))
|
|
(begin
|
|
(append! branches left-branch)
|
|
(rx-parse-alt-branches pat i (get ds "groups") branches)))))
|
|
|
|
(define
|
|
rx-parse-alt-branches
|
|
(fn
|
|
(pat i n-groups branches)
|
|
(let
|
|
((new-nodes (list)) (ds2 (dict "groups" n-groups "nodes" new-nodes)))
|
|
(let
|
|
((res (rx-parse-seq pat (+ i 1) "|" ds2)))
|
|
(begin
|
|
(append! branches (nth res 0))
|
|
(let
|
|
((ni2 (nth res 1)) (g2 (nth res 2)))
|
|
(if
|
|
(= (rx-char-at pat ni2) "|")
|
|
(rx-parse-alt-branches pat ni2 g2 branches)
|
|
(list
|
|
(list (dict "__t__" "alt" "__branches__" branches))
|
|
ni2
|
|
g2))))))))
|
|
|
|
;; parse quantifier suffix, returns (node new-i)
|
|
(define
|
|
rx-parse-quant
|
|
(fn
|
|
(pat i node)
|
|
(let
|
|
((c (rx-char-at pat i)))
|
|
(cond
|
|
((= c "*")
|
|
(let
|
|
((lazy? (= (rx-char-at pat (+ i 1)) "?")))
|
|
(list
|
|
(dict
|
|
"__t__"
|
|
"quant"
|
|
"__node__"
|
|
node
|
|
"__min__"
|
|
0
|
|
"__max__"
|
|
-1
|
|
"__lazy__"
|
|
lazy?)
|
|
(if lazy? (+ i 2) (+ i 1)))))
|
|
((= c "+")
|
|
(let
|
|
((lazy? (= (rx-char-at pat (+ i 1)) "?")))
|
|
(list
|
|
(dict
|
|
"__t__"
|
|
"quant"
|
|
"__node__"
|
|
node
|
|
"__min__"
|
|
1
|
|
"__max__"
|
|
-1
|
|
"__lazy__"
|
|
lazy?)
|
|
(if lazy? (+ i 2) (+ i 1)))))
|
|
((= c "?")
|
|
(let
|
|
((lazy? (= (rx-char-at pat (+ i 1)) "?")))
|
|
(list
|
|
(dict
|
|
"__t__"
|
|
"quant"
|
|
"__node__"
|
|
node
|
|
"__min__"
|
|
0
|
|
"__max__"
|
|
1
|
|
"__lazy__"
|
|
lazy?)
|
|
(if lazy? (+ i 2) (+ i 1)))))
|
|
((= c "{")
|
|
(let
|
|
((mres (rx-parse-int pat (+ i 1) 0)))
|
|
(let
|
|
((mn (nth mres 0)) (mi (nth mres 1)))
|
|
(let
|
|
((sep (rx-char-at pat mi)))
|
|
(cond
|
|
((= sep "}")
|
|
(let
|
|
((lazy? (= (rx-char-at pat (+ mi 1)) "?")))
|
|
(list
|
|
(dict
|
|
"__t__"
|
|
"quant"
|
|
"__node__"
|
|
node
|
|
"__min__"
|
|
mn
|
|
"__max__"
|
|
mn
|
|
"__lazy__"
|
|
lazy?)
|
|
(if lazy? (+ mi 2) (+ mi 1)))))
|
|
((= sep ",")
|
|
(let
|
|
((c2 (rx-char-at pat (+ mi 1))))
|
|
(if
|
|
(= c2 "}")
|
|
(let
|
|
((lazy? (= (rx-char-at pat (+ mi 2)) "?")))
|
|
(list
|
|
(dict
|
|
"__t__"
|
|
"quant"
|
|
"__node__"
|
|
node
|
|
"__min__"
|
|
mn
|
|
"__max__"
|
|
-1
|
|
"__lazy__"
|
|
lazy?)
|
|
(if lazy? (+ mi 3) (+ mi 2))))
|
|
(let
|
|
((mxres (rx-parse-int pat (+ mi 1) 0)))
|
|
(let
|
|
((mx (nth mxres 0)) (mxi (nth mxres 1)))
|
|
(let
|
|
((lazy? (= (rx-char-at pat (+ mxi 1)) "?")))
|
|
(list
|
|
(dict
|
|
"__t__"
|
|
"quant"
|
|
"__node__"
|
|
node
|
|
"__min__"
|
|
mn
|
|
"__max__"
|
|
mx
|
|
"__lazy__"
|
|
lazy?)
|
|
(if lazy? (+ mxi 2) (+ mxi 1)))))))))
|
|
(else (list node i)))))))
|
|
(else (list node i))))))
|
|
|
|
;; parse one atom, returns (node new-i new-ds)
|
|
(define
|
|
rx-parse-atom
|
|
(fn
|
|
(pat i ds)
|
|
(let
|
|
((c (rx-char-at pat i)))
|
|
(cond
|
|
((= c ".") (list (dict "__t__" "any") (+ i 1) ds))
|
|
((= c "^") (list (dict "__t__" "anchor-start") (+ i 1) ds))
|
|
((= c "$") (list (dict "__t__" "anchor-end") (+ i 1) ds))
|
|
((= c "\\")
|
|
(let
|
|
((esc (rx-parse-escape pat (+ i 1))))
|
|
(list (nth esc 0) (nth esc 1) ds)))
|
|
((= c "[")
|
|
(let
|
|
((neg? (= (rx-char-at pat (+ i 1)) "^")))
|
|
(let
|
|
((start (if neg? (+ i 2) (+ i 1))) (items (list)))
|
|
(let
|
|
((res (rx-parse-class-items pat start items)))
|
|
(let
|
|
((ci (nth res 1)))
|
|
(list
|
|
(dict
|
|
"__t__"
|
|
"char-class"
|
|
"__neg__"
|
|
neg?
|
|
"__items__"
|
|
items)
|
|
(+ ci 1)
|
|
ds))))))
|
|
((= c "(")
|
|
(let
|
|
((c2 (rx-char-at pat (+ i 1))))
|
|
(if
|
|
(and (= c2 "?") (= (rx-char-at pat (+ i 2)) ":"))
|
|
(let
|
|
((inner-nodes (list))
|
|
(inner-ds
|
|
(dict "groups" (get ds "groups") "nodes" inner-nodes)))
|
|
(let
|
|
((res (rx-parse-seq pat (+ i 3) ")" inner-ds)))
|
|
(list
|
|
(dict "__t__" "ncgroup" "__nodes__" (nth res 0))
|
|
(+ (nth res 1) 1)
|
|
(dict "groups" (nth res 2) "nodes" (get ds "nodes")))))
|
|
(let
|
|
((gidx (+ (get ds "groups") 1)) (inner-nodes (list)))
|
|
(let
|
|
((inner-ds (dict "groups" gidx "nodes" inner-nodes)))
|
|
(let
|
|
((res (rx-parse-seq pat (+ i 1) ")" inner-ds)))
|
|
(list
|
|
(dict
|
|
"__t__"
|
|
"group"
|
|
"__idx__"
|
|
gidx
|
|
"__nodes__"
|
|
(nth res 0))
|
|
(+ (nth res 1) 1)
|
|
(dict "groups" (nth res 2) "nodes" (get ds "nodes")))))))))
|
|
(else (list (dict "__t__" "literal" "__c__" c) (+ i 1) ds))))))
|
|
|
|
;; top-level compile
|
|
(define
|
|
rx-compile
|
|
(fn
|
|
(pattern)
|
|
(let
|
|
((nodes (list)) (ds (dict "groups" 0 "nodes" nodes)))
|
|
(let
|
|
((res (rx-parse-seq pattern 0 "" ds)))
|
|
(dict "nodes" (nth res 0) "ngroups" (nth res 2))))))
|
|
|
|
;; ── Matcher ───────────────────────────────────────────────────────
|
|
|
|
;; Match a char-class item against character c
|
|
(define
|
|
rx-item-matches?
|
|
(fn
|
|
(item c ci?)
|
|
(let
|
|
((kind (get item "kind")))
|
|
(cond
|
|
((= kind "lit") (rx-char-eq? c (get item "c") ci?))
|
|
((= kind "range")
|
|
(let
|
|
((lo (if ci? (rx-downcase-char (get item "lo")) (get item "lo")))
|
|
(hi
|
|
(if ci? (rx-downcase-char (get item "hi")) (get item "hi")))
|
|
(dc (if ci? (rx-downcase-char c) c)))
|
|
(and
|
|
(>= (char-code dc) (char-code lo))
|
|
(<= (char-code dc) (char-code hi)))))
|
|
((= kind "class-d")
|
|
(let ((m (rx-digit? c))) (if (get item "neg") (not m) m)))
|
|
((= kind "class-w")
|
|
(let ((m (rx-word? c))) (if (get item "neg") (not m) m)))
|
|
((= kind "class-s")
|
|
(let ((m (rx-space? c))) (if (get item "neg") (not m) m)))
|
|
(else false)))))
|
|
|
|
(define
|
|
rx-class-items-any?
|
|
(fn
|
|
(items c ci?)
|
|
(if
|
|
(empty? items)
|
|
false
|
|
(if
|
|
(rx-item-matches? (first items) c ci?)
|
|
true
|
|
(rx-class-items-any? (rest items) c ci?)))))
|
|
|
|
(define
|
|
rx-class-matches?
|
|
(fn
|
|
(node c ci?)
|
|
(let
|
|
((neg? (get node "__neg__")) (items (get node "__items__")))
|
|
(let
|
|
((hit (rx-class-items-any? items c ci?)))
|
|
(if neg? (not hit) hit)))))
|
|
|
|
;; Word boundary check
|
|
(define
|
|
rx-is-word-boundary?
|
|
(fn
|
|
(s i slen)
|
|
(let
|
|
((before (if (> i 0) (rx-word? (char-at s (- i 1))) false))
|
|
(after (if (< i slen) (rx-word? (char-at s i)) false)))
|
|
(not (= before after)))))
|
|
|
|
;; ── Core matcher ──────────────────────────────────────────────────
|
|
;;
|
|
;; rx-match-nodes : nodes s i slen ci? mi? groups → end-pos or -1
|
|
;;
|
|
;; Matches `nodes` starting at position `i` in string `s`.
|
|
;; Returns the position after the last character consumed, or -1 on failure.
|
|
;; Mutates `groups` dict to record captures.
|
|
|
|
(define
|
|
rx-match-nodes
|
|
(fn
|
|
(nodes s i slen ci? mi? groups)
|
|
(if
|
|
(empty? nodes)
|
|
i
|
|
(let
|
|
((node (first nodes)) (rest-nodes (rest nodes)))
|
|
(let
|
|
((t (get node "__t__")))
|
|
(cond
|
|
((= t "literal")
|
|
(if
|
|
(and
|
|
(< i slen)
|
|
(rx-char-eq? (char-at s i) (get node "__c__") ci?))
|
|
(rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups)
|
|
-1))
|
|
((= t "any")
|
|
(if
|
|
(and (< i slen) (not (rx-newline? (char-at s i))))
|
|
(rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups)
|
|
-1))
|
|
((= t "class-d")
|
|
(let
|
|
((m (and (< i slen) (rx-digit? (char-at s i)))))
|
|
(if
|
|
(if (get node "__neg__") (not m) m)
|
|
(rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups)
|
|
-1)))
|
|
((= t "class-w")
|
|
(let
|
|
((m (and (< i slen) (rx-word? (char-at s i)))))
|
|
(if
|
|
(if (get node "__neg__") (not m) m)
|
|
(rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups)
|
|
-1)))
|
|
((= t "class-s")
|
|
(let
|
|
((m (and (< i slen) (rx-space? (char-at s i)))))
|
|
(if
|
|
(if (get node "__neg__") (not m) m)
|
|
(rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups)
|
|
-1)))
|
|
((= t "char-class")
|
|
(if
|
|
(and (< i slen) (rx-class-matches? node (char-at s i) ci?))
|
|
(rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups)
|
|
-1))
|
|
((= t "anchor-start")
|
|
(if
|
|
(or
|
|
(= i 0)
|
|
(and mi? (rx-newline? (rx-char-at s (- i 1)))))
|
|
(rx-match-nodes rest-nodes s i slen ci? mi? groups)
|
|
-1))
|
|
((= t "anchor-end")
|
|
(if
|
|
(or (= i slen) (and mi? (rx-newline? (rx-char-at s i))))
|
|
(rx-match-nodes rest-nodes s i slen ci? mi? groups)
|
|
-1))
|
|
((= t "anchor-word")
|
|
(if
|
|
(rx-is-word-boundary? s i slen)
|
|
(rx-match-nodes rest-nodes s i slen ci? mi? groups)
|
|
-1))
|
|
((= t "anchor-nonword")
|
|
(if
|
|
(not (rx-is-word-boundary? s i slen))
|
|
(rx-match-nodes rest-nodes s i slen ci? mi? groups)
|
|
-1))
|
|
((= t "group")
|
|
(let
|
|
((gidx (get node "__idx__"))
|
|
(inner (get node "__nodes__")))
|
|
(let
|
|
((g-end (rx-match-nodes inner s i slen ci? mi? groups)))
|
|
(if
|
|
(>= g-end 0)
|
|
(begin
|
|
(dict-set!
|
|
groups
|
|
(js-to-string gidx)
|
|
(substring s i g-end))
|
|
(let
|
|
((final-end (rx-match-nodes rest-nodes s g-end slen ci? mi? groups)))
|
|
(if
|
|
(>= final-end 0)
|
|
final-end
|
|
(begin
|
|
(dict-set! groups (js-to-string gidx) nil)
|
|
-1))))
|
|
-1))))
|
|
((= t "ncgroup")
|
|
(let
|
|
((inner (get node "__nodes__")))
|
|
(rx-match-nodes
|
|
(append inner rest-nodes)
|
|
s
|
|
i
|
|
slen
|
|
ci?
|
|
mi?
|
|
groups)))
|
|
((= t "alt")
|
|
(let
|
|
((branches (get node "__branches__")))
|
|
(rx-try-branches branches rest-nodes s i slen ci? mi? groups)))
|
|
((= t "quant")
|
|
(let
|
|
((inner-node (get node "__node__"))
|
|
(mn (get node "__min__"))
|
|
(mx (get node "__max__"))
|
|
(lazy? (get node "__lazy__")))
|
|
(if
|
|
lazy?
|
|
(rx-quant-lazy
|
|
inner-node
|
|
mn
|
|
mx
|
|
rest-nodes
|
|
s
|
|
i
|
|
slen
|
|
ci?
|
|
mi?
|
|
groups
|
|
0)
|
|
(rx-quant-greedy
|
|
inner-node
|
|
mn
|
|
mx
|
|
rest-nodes
|
|
s
|
|
i
|
|
slen
|
|
ci?
|
|
mi?
|
|
groups
|
|
0))))
|
|
(else -1)))))))
|
|
|
|
(define
|
|
rx-try-branches
|
|
(fn
|
|
(branches rest-nodes s i slen ci? mi? groups)
|
|
(if
|
|
(empty? branches)
|
|
-1
|
|
(let
|
|
((res (rx-match-nodes (append (first branches) rest-nodes) s i slen ci? mi? groups)))
|
|
(if
|
|
(>= res 0)
|
|
res
|
|
(rx-try-branches (rest branches) rest-nodes s i slen ci? mi? groups))))))
|
|
|
|
;; Greedy: expand as far as possible, then try rest from the longest match
|
|
;; Strategy: recurse forward (extend first); only try rest when extension fails
|
|
(define
|
|
rx-quant-greedy
|
|
(fn
|
|
(inner-node mn mx rest-nodes s i slen ci? mi? groups count)
|
|
(let
|
|
((can-extend (and (< i slen) (or (= mx -1) (< count mx)))))
|
|
(if
|
|
can-extend
|
|
(let
|
|
((ni (rx-match-one inner-node s i slen ci? mi? groups)))
|
|
(if
|
|
(>= ni 0)
|
|
(let
|
|
((res (rx-quant-greedy inner-node mn mx rest-nodes s ni slen ci? mi? groups (+ count 1))))
|
|
(if
|
|
(>= res 0)
|
|
res
|
|
(if
|
|
(>= count mn)
|
|
(rx-match-nodes rest-nodes s i slen ci? mi? groups)
|
|
-1)))
|
|
(if
|
|
(>= count mn)
|
|
(rx-match-nodes rest-nodes s i slen ci? mi? groups)
|
|
-1)))
|
|
(if
|
|
(>= count mn)
|
|
(rx-match-nodes rest-nodes s i slen ci? mi? groups)
|
|
-1)))))
|
|
|
|
;; Lazy: try rest first, extend only if rest fails
|
|
(define
|
|
rx-quant-lazy
|
|
(fn
|
|
(inner-node mn mx rest-nodes s i slen ci? mi? groups count)
|
|
(if
|
|
(>= count mn)
|
|
(let
|
|
((res (rx-match-nodes rest-nodes s i slen ci? mi? groups)))
|
|
(if
|
|
(>= res 0)
|
|
res
|
|
(if
|
|
(and (< i slen) (or (= mx -1) (< count mx)))
|
|
(let
|
|
((ni (rx-match-one inner-node s i slen ci? mi? groups)))
|
|
(if
|
|
(>= ni 0)
|
|
(rx-quant-lazy
|
|
inner-node
|
|
mn
|
|
mx
|
|
rest-nodes
|
|
s
|
|
ni
|
|
slen
|
|
ci?
|
|
mi?
|
|
groups
|
|
(+ count 1))
|
|
-1))
|
|
-1)))
|
|
(if
|
|
(< i slen)
|
|
(let
|
|
((ni (rx-match-one inner-node s i slen ci? mi? groups)))
|
|
(if
|
|
(>= ni 0)
|
|
(rx-quant-lazy
|
|
inner-node
|
|
mn
|
|
mx
|
|
rest-nodes
|
|
s
|
|
ni
|
|
slen
|
|
ci?
|
|
mi?
|
|
groups
|
|
(+ count 1))
|
|
-1))
|
|
-1))))
|
|
|
|
;; Match a single node at position i, return new pos or -1
|
|
(define
|
|
rx-match-one
|
|
(fn
|
|
(node s i slen ci? mi? groups)
|
|
(rx-match-nodes (list node) s i slen ci? mi? groups)))
|
|
|
|
;; ── Engine entry points ───────────────────────────────────────────
|
|
|
|
;; Try matching at exactly position i. Returns result dict or nil.
|
|
(define
|
|
rx-try-at
|
|
(fn
|
|
(compiled s i slen ci? mi?)
|
|
(let
|
|
((nodes (get compiled "nodes")) (ngroups (get compiled "ngroups")))
|
|
(let
|
|
((groups (dict)))
|
|
(let
|
|
((end (rx-match-nodes nodes s i slen ci? mi? groups)))
|
|
(if
|
|
(>= end 0)
|
|
(dict "start" i "end" end "groups" groups "ngroups" ngroups)
|
|
nil))))))
|
|
|
|
;; Find first match scanning from search-start.
|
|
(define
|
|
rx-find-from
|
|
(fn
|
|
(compiled s search-start slen ci? mi?)
|
|
(if
|
|
(> search-start slen)
|
|
nil
|
|
(let
|
|
((res (rx-try-at compiled s search-start slen ci? mi?)))
|
|
(if
|
|
res
|
|
res
|
|
(rx-find-from compiled s (+ search-start 1) slen ci? mi?))))))
|
|
|
|
;; Build exec result dict from raw match result
|
|
(define
|
|
rx-build-exec-result
|
|
(fn
|
|
(s match-res)
|
|
(let
|
|
((start (get match-res "start"))
|
|
(end (get match-res "end"))
|
|
(groups (get match-res "groups"))
|
|
(ngroups (get match-res "ngroups")))
|
|
(let
|
|
((matched (substring s start end))
|
|
(caps (rx-build-captures groups ngroups 1)))
|
|
(dict "match" matched "index" start "input" s "groups" caps)))))
|
|
|
|
(define
|
|
rx-build-captures
|
|
(fn
|
|
(groups ngroups idx)
|
|
(if
|
|
(> idx ngroups)
|
|
(list)
|
|
(let
|
|
((cap (get groups (js-to-string idx))))
|
|
(cons
|
|
(if (= cap nil) :js-undefined cap)
|
|
(rx-build-captures groups ngroups (+ idx 1)))))))
|
|
|
|
;; ── Public interface ──────────────────────────────────────────────
|
|
|
|
;; Lazy compile: build NFA on first use, cache under "__compiled__"
|
|
(define
|
|
rx-ensure-compiled!
|
|
(fn
|
|
(rx)
|
|
(if
|
|
(dict-has? rx "__compiled__")
|
|
(get rx "__compiled__")
|
|
(let
|
|
((c (rx-compile (get rx "source"))))
|
|
(begin (dict-set! rx "__compiled__" c) c)))))
|
|
|
|
(define
|
|
rx-test
|
|
(fn
|
|
(rx s)
|
|
(let
|
|
((compiled (rx-ensure-compiled! rx))
|
|
(ci? (get rx "ignoreCase"))
|
|
(mi? (get rx "multiline"))
|
|
(slen (len s)))
|
|
(let
|
|
((start (if (get rx "global") (let ((li (get rx "lastIndex"))) (if (number? li) li 0)) 0)))
|
|
(let
|
|
((res (rx-find-from compiled s start slen ci? mi?)))
|
|
(if
|
|
(get rx "global")
|
|
(begin
|
|
(dict-set! rx "lastIndex" (if res (get res "end") 0))
|
|
(if res true false))
|
|
(if res true false)))))))
|
|
|
|
(define
|
|
rx-exec
|
|
(fn
|
|
(rx s)
|
|
(let
|
|
((compiled (rx-ensure-compiled! rx))
|
|
(ci? (get rx "ignoreCase"))
|
|
(mi? (get rx "multiline"))
|
|
(slen (len s)))
|
|
(let
|
|
((start (if (get rx "global") (let ((li (get rx "lastIndex"))) (if (number? li) li 0)) 0)))
|
|
(let
|
|
((res (rx-find-from compiled s start slen ci? mi?)))
|
|
(if
|
|
res
|
|
(begin
|
|
(when
|
|
(get rx "global")
|
|
(dict-set! rx "lastIndex" (get res "end")))
|
|
(rx-build-exec-result s res))
|
|
(begin
|
|
(when (get rx "global") (dict-set! rx "lastIndex" 0))
|
|
nil)))))))
|
|
|
|
;; match-all for String.prototype.matchAll
|
|
(define
|
|
js-regex-match-all
|
|
(fn
|
|
(rx s)
|
|
(let
|
|
((compiled (rx-ensure-compiled! rx))
|
|
(ci? (get rx "ignoreCase"))
|
|
(mi? (get rx "multiline"))
|
|
(slen (len s))
|
|
(results (list)))
|
|
(rx-match-all-loop compiled s 0 slen ci? mi? results))))
|
|
|
|
(define
|
|
rx-match-all-loop
|
|
(fn
|
|
(compiled s i slen ci? mi? results)
|
|
(if
|
|
(> i slen)
|
|
results
|
|
(let
|
|
((res (rx-find-from compiled s i slen ci? mi?)))
|
|
(if
|
|
res
|
|
(begin
|
|
(append! results (rx-build-exec-result s res))
|
|
(let
|
|
((next (get res "end")))
|
|
(rx-match-all-loop
|
|
compiled
|
|
s
|
|
(if (= next i) (+ i 1) next)
|
|
slen
|
|
ci?
|
|
mi?
|
|
results)))
|
|
results)))))
|
|
|
|
;; ── Install platform ──────────────────────────────────────────────
|
|
|
|
(js-regex-platform-override! "test" rx-test)
|
|
(js-regex-platform-override! "exec" rx-exec)
|