;; lib/js/regex.sx — pure-SX recursive backtracking regex engine ;; ;; Installed via (js-regex-platform-override! ...) at load time. ;; Covers: character classes (\d\w\s . [abc] [^abc] [a-z]), ;; anchors (^ $ \b \B), quantifiers (* + ? {n,m} lazy variants), ;; groups (capturing + non-capturing), alternation (a|b), ;; flags: i (case-insensitive), g (global), m (multiline). ;; ;; Architecture: ;; 1. rx-parse-pattern — pattern string → compiled node list ;; 2. rx-match-nodes — recursive backtracker ;; 3. rx-exec / rx-test — public interface ;; 4. Install as {:test rx-test :exec rx-exec} ;; ── Utilities ───────────────────────────────────────────────────── (define rx-char-at (fn (s i) (if (and (>= i 0) (< i (len s))) (char-at s i) ""))) (define rx-digit? (fn (c) (and (not (= c "")) (>= (char-code c) 48) (<= (char-code c) 57)))) (define rx-word? (fn (c) (and (not (= c "")) (or (and (>= (char-code c) 65) (<= (char-code c) 90)) (and (>= (char-code c) 97) (<= (char-code c) 122)) (and (>= (char-code c) 48) (<= (char-code c) 57)) (= c "_"))))) (define rx-space? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r") (= c "\\f") (= c "")))) (define rx-newline? (fn (c) (or (= c "\n") (= c "\r")))) (define rx-downcase-char (fn (c) (let ((cc (char-code c))) (if (and (>= cc 65) (<= cc 90)) (char-from-code (+ cc 32)) c)))) (define rx-char-eq? (fn (a b ci?) (if ci? (= (rx-downcase-char a) (rx-downcase-char b)) (= a b)))) (define rx-parse-int (fn (pat i acc) (let ((c (rx-char-at pat i))) (if (rx-digit? c) (rx-parse-int pat (+ i 1) (+ (* acc 10) (- (char-code c) 48))) (list acc i))))) (define rx-hex-digit-val (fn (c) (cond ((and (>= (char-code c) 48) (<= (char-code c) 57)) (- (char-code c) 48)) ((and (>= (char-code c) 65) (<= (char-code c) 70)) (+ 10 (- (char-code c) 65))) ((and (>= (char-code c) 97) (<= (char-code c) 102)) (+ 10 (- (char-code c) 97))) (else -1)))) (define rx-parse-hex-n (fn (pat i n acc) (if (= n 0) (list (char-from-code acc) i) (let ((v (rx-hex-digit-val (rx-char-at pat i)))) (if (< v 0) (list (char-from-code acc) i) (rx-parse-hex-n pat (+ i 1) (- n 1) (+ (* acc 16) v))))))) ;; ── Pattern compiler ────────────────────────────────────────────── ;; Node types (stored in dicts with "__t__" key): ;; literal : {:__t__ "literal" :__c__ char} ;; any : {:__t__ "any"} ;; class-d : {:__t__ "class-d" :__neg__ bool} ;; class-w : {:__t__ "class-w" :__neg__ bool} ;; class-s : {:__t__ "class-s" :__neg__ bool} ;; char-class: {:__t__ "char-class" :__neg__ bool :__items__ list} ;; anchor-start / anchor-end / anchor-word / anchor-nonword ;; quant : {:__t__ "quant" :__node__ n :__min__ m :__max__ mx :__lazy__ bool} ;; group : {:__t__ "group" :__idx__ i :__nodes__ list} ;; ncgroup : {:__t__ "ncgroup" :__nodes__ list} ;; alt : {:__t__ "alt" :__branches__ list-of-node-lists} ;; parse one escape after `\`, returns (node new-i) (define rx-parse-escape (fn (pat i) (let ((c (rx-char-at pat i))) (cond ((= c "d") (list (dict "__t__" "class-d" "__neg__" false) (+ i 1))) ((= c "D") (list (dict "__t__" "class-d" "__neg__" true) (+ i 1))) ((= c "w") (list (dict "__t__" "class-w" "__neg__" false) (+ i 1))) ((= c "W") (list (dict "__t__" "class-w" "__neg__" true) (+ i 1))) ((= c "s") (list (dict "__t__" "class-s" "__neg__" false) (+ i 1))) ((= c "S") (list (dict "__t__" "class-s" "__neg__" true) (+ i 1))) ((= c "b") (list (dict "__t__" "anchor-word") (+ i 1))) ((= c "B") (list (dict "__t__" "anchor-nonword") (+ i 1))) ((= c "n") (list (dict "__t__" "literal" "__c__" "\n") (+ i 1))) ((= c "r") (list (dict "__t__" "literal" "__c__" "\r") (+ i 1))) ((= c "t") (list (dict "__t__" "literal" "__c__" "\t") (+ i 1))) ((= c "f") (list (dict "__t__" "literal" "__c__" "\\f") (+ i 1))) ((= c "v") (list (dict "__t__" "literal" "__c__" "") (+ i 1))) ((= c "u") (let ((res (rx-parse-hex-n pat (+ i 1) 4 0))) (list (dict "__t__" "literal" "__c__" (nth res 0)) (nth res 1)))) ((= c "x") (let ((res (rx-parse-hex-n pat (+ i 1) 2 0))) (list (dict "__t__" "literal" "__c__" (nth res 0)) (nth res 1)))) (else (list (dict "__t__" "literal" "__c__" c) (+ i 1))))))) ;; parse a char-class item inside [...], returns (item new-i) (define rx-parse-class-item (fn (pat i) (let ((c (rx-char-at pat i))) (cond ((= c "\\") (let ((esc (rx-parse-escape pat (+ i 1)))) (let ((node (nth esc 0)) (ni (nth esc 1))) (let ((t (get node "__t__"))) (cond ((= t "class-d") (list (dict "kind" "class-d" "neg" (get node "__neg__")) ni)) ((= t "class-w") (list (dict "kind" "class-w" "neg" (get node "__neg__")) ni)) ((= t "class-s") (list (dict "kind" "class-s" "neg" (get node "__neg__")) ni)) (else (let ((lc (get node "__c__"))) (if (and (= (rx-char-at pat ni) "-") (not (= (rx-char-at pat (+ ni 1)) "]"))) (let ((hi-c (rx-char-at pat (+ ni 1)))) (list (dict "kind" "range" "lo" lc "hi" hi-c) (+ ni 2))) (list (dict "kind" "lit" "c" lc) ni))))))))) (else (if (and (not (= c "")) (= (rx-char-at pat (+ i 1)) "-") (not (= (rx-char-at pat (+ i 2)) "]")) (not (= (rx-char-at pat (+ i 2)) ""))) (let ((hi-c (rx-char-at pat (+ i 2)))) (list (dict "kind" "range" "lo" c "hi" hi-c) (+ i 3))) (list (dict "kind" "lit" "c" c) (+ i 1)))))))) (define rx-parse-class-items (fn (pat i items) (let ((c (rx-char-at pat i))) (if (or (= c "]") (= c "")) (list items i) (let ((res (rx-parse-class-item pat i))) (begin (append! items (nth res 0)) (rx-parse-class-items pat (nth res 1) items))))))) ;; parse a sequence until stop-ch or EOF; returns (nodes new-i groups-count) (define rx-parse-seq (fn (pat i stop-ch ds) (let ((c (rx-char-at pat i))) (cond ((= c "") (list (get ds "nodes") i (get ds "groups"))) ((= c stop-ch) (list (get ds "nodes") i (get ds "groups"))) ((= c "|") (rx-parse-alt-rest pat i ds)) (else (let ((res (rx-parse-atom pat i ds))) (let ((node (nth res 0)) (ni (nth res 1)) (ds2 (nth res 2))) (let ((qres (rx-parse-quant pat ni node))) (begin (append! (get ds2 "nodes") (nth qres 0)) (rx-parse-seq pat (nth qres 1) stop-ch ds2)))))))))) ;; when we hit | inside a sequence, collect all alternatives (define rx-parse-alt-rest (fn (pat i ds) (let ((left-branch (get ds "nodes")) (branches (list))) (begin (append! branches left-branch) (rx-parse-alt-branches pat i (get ds "groups") branches))))) (define rx-parse-alt-branches (fn (pat i n-groups branches) (let ((new-nodes (list)) (ds2 (dict "groups" n-groups "nodes" new-nodes))) (let ((res (rx-parse-seq pat (+ i 1) "|" ds2))) (begin (append! branches (nth res 0)) (let ((ni2 (nth res 1)) (g2 (nth res 2))) (if (= (rx-char-at pat ni2) "|") (rx-parse-alt-branches pat ni2 g2 branches) (list (list (dict "__t__" "alt" "__branches__" branches)) ni2 g2)))))))) ;; parse quantifier suffix, returns (node new-i) (define rx-parse-quant (fn (pat i node) (let ((c (rx-char-at pat i))) (cond ((= c "*") (let ((lazy? (= (rx-char-at pat (+ i 1)) "?"))) (list (dict "__t__" "quant" "__node__" node "__min__" 0 "__max__" -1 "__lazy__" lazy?) (if lazy? (+ i 2) (+ i 1))))) ((= c "+") (let ((lazy? (= (rx-char-at pat (+ i 1)) "?"))) (list (dict "__t__" "quant" "__node__" node "__min__" 1 "__max__" -1 "__lazy__" lazy?) (if lazy? (+ i 2) (+ i 1))))) ((= c "?") (let ((lazy? (= (rx-char-at pat (+ i 1)) "?"))) (list (dict "__t__" "quant" "__node__" node "__min__" 0 "__max__" 1 "__lazy__" lazy?) (if lazy? (+ i 2) (+ i 1))))) ((= c "{") (let ((mres (rx-parse-int pat (+ i 1) 0))) (let ((mn (nth mres 0)) (mi (nth mres 1))) (let ((sep (rx-char-at pat mi))) (cond ((= sep "}") (let ((lazy? (= (rx-char-at pat (+ mi 1)) "?"))) (list (dict "__t__" "quant" "__node__" node "__min__" mn "__max__" mn "__lazy__" lazy?) (if lazy? (+ mi 2) (+ mi 1))))) ((= sep ",") (let ((c2 (rx-char-at pat (+ mi 1)))) (if (= c2 "}") (let ((lazy? (= (rx-char-at pat (+ mi 2)) "?"))) (list (dict "__t__" "quant" "__node__" node "__min__" mn "__max__" -1 "__lazy__" lazy?) (if lazy? (+ mi 3) (+ mi 2)))) (let ((mxres (rx-parse-int pat (+ mi 1) 0))) (let ((mx (nth mxres 0)) (mxi (nth mxres 1))) (let ((lazy? (= (rx-char-at pat (+ mxi 1)) "?"))) (list (dict "__t__" "quant" "__node__" node "__min__" mn "__max__" mx "__lazy__" lazy?) (if lazy? (+ mxi 2) (+ mxi 1))))))))) (else (list node i))))))) (else (list node i)))))) ;; parse one atom, returns (node new-i new-ds) (define rx-parse-atom (fn (pat i ds) (let ((c (rx-char-at pat i))) (cond ((= c ".") (list (dict "__t__" "any") (+ i 1) ds)) ((= c "^") (list (dict "__t__" "anchor-start") (+ i 1) ds)) ((= c "$") (list (dict "__t__" "anchor-end") (+ i 1) ds)) ((= c "\\") (let ((esc (rx-parse-escape pat (+ i 1)))) (list (nth esc 0) (nth esc 1) ds))) ((= c "[") (let ((neg? (= (rx-char-at pat (+ i 1)) "^"))) (let ((start (if neg? (+ i 2) (+ i 1))) (items (list))) (let ((res (rx-parse-class-items pat start items))) (let ((ci (nth res 1))) (list (dict "__t__" "char-class" "__neg__" neg? "__items__" items) (+ ci 1) ds)))))) ((= c "(") (let ((c2 (rx-char-at pat (+ i 1)))) (if (and (= c2 "?") (= (rx-char-at pat (+ i 2)) ":")) (let ((inner-nodes (list)) (inner-ds (dict "groups" (get ds "groups") "nodes" inner-nodes))) (let ((res (rx-parse-seq pat (+ i 3) ")" inner-ds))) (list (dict "__t__" "ncgroup" "__nodes__" (nth res 0)) (+ (nth res 1) 1) (dict "groups" (nth res 2) "nodes" (get ds "nodes"))))) (let ((gidx (+ (get ds "groups") 1)) (inner-nodes (list))) (let ((inner-ds (dict "groups" gidx "nodes" inner-nodes))) (let ((res (rx-parse-seq pat (+ i 1) ")" inner-ds))) (list (dict "__t__" "group" "__idx__" gidx "__nodes__" (nth res 0)) (+ (nth res 1) 1) (dict "groups" (nth res 2) "nodes" (get ds "nodes"))))))))) (else (list (dict "__t__" "literal" "__c__" c) (+ i 1) ds)))))) ;; top-level compile (define rx-compile (fn (pattern) (let ((nodes (list)) (ds (dict "groups" 0 "nodes" nodes))) (let ((res (rx-parse-seq pattern 0 "" ds))) (dict "nodes" (nth res 0) "ngroups" (nth res 2)))))) ;; ── Matcher ─────────────────────────────────────────────────────── ;; Match a char-class item against character c (define rx-item-matches? (fn (item c ci?) (let ((kind (get item "kind"))) (cond ((= kind "lit") (rx-char-eq? c (get item "c") ci?)) ((= kind "range") (let ((lo (if ci? (rx-downcase-char (get item "lo")) (get item "lo"))) (hi (if ci? (rx-downcase-char (get item "hi")) (get item "hi"))) (dc (if ci? (rx-downcase-char c) c))) (and (>= (char-code dc) (char-code lo)) (<= (char-code dc) (char-code hi))))) ((= kind "class-d") (let ((m (rx-digit? c))) (if (get item "neg") (not m) m))) ((= kind "class-w") (let ((m (rx-word? c))) (if (get item "neg") (not m) m))) ((= kind "class-s") (let ((m (rx-space? c))) (if (get item "neg") (not m) m))) (else false))))) (define rx-class-items-any? (fn (items c ci?) (if (empty? items) false (if (rx-item-matches? (first items) c ci?) true (rx-class-items-any? (rest items) c ci?))))) (define rx-class-matches? (fn (node c ci?) (let ((neg? (get node "__neg__")) (items (get node "__items__"))) (let ((hit (rx-class-items-any? items c ci?))) (if neg? (not hit) hit))))) ;; Word boundary check (define rx-is-word-boundary? (fn (s i slen) (let ((before (if (> i 0) (rx-word? (char-at s (- i 1))) false)) (after (if (< i slen) (rx-word? (char-at s i)) false))) (not (= before after))))) ;; ── Core matcher ────────────────────────────────────────────────── ;; ;; rx-match-nodes : nodes s i slen ci? mi? groups → end-pos or -1 ;; ;; Matches `nodes` starting at position `i` in string `s`. ;; Returns the position after the last character consumed, or -1 on failure. ;; Mutates `groups` dict to record captures. (define rx-match-nodes (fn (nodes s i slen ci? mi? groups) (if (empty? nodes) i (let ((node (first nodes)) (rest-nodes (rest nodes))) (let ((t (get node "__t__"))) (cond ((= t "literal") (if (and (< i slen) (rx-char-eq? (char-at s i) (get node "__c__") ci?)) (rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups) -1)) ((= t "any") (if (and (< i slen) (not (rx-newline? (char-at s i)))) (rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups) -1)) ((= t "class-d") (let ((m (and (< i slen) (rx-digit? (char-at s i))))) (if (if (get node "__neg__") (not m) m) (rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups) -1))) ((= t "class-w") (let ((m (and (< i slen) (rx-word? (char-at s i))))) (if (if (get node "__neg__") (not m) m) (rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups) -1))) ((= t "class-s") (let ((m (and (< i slen) (rx-space? (char-at s i))))) (if (if (get node "__neg__") (not m) m) (rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups) -1))) ((= t "char-class") (if (and (< i slen) (rx-class-matches? node (char-at s i) ci?)) (rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups) -1)) ((= t "anchor-start") (if (or (= i 0) (and mi? (rx-newline? (rx-char-at s (- i 1))))) (rx-match-nodes rest-nodes s i slen ci? mi? groups) -1)) ((= t "anchor-end") (if (or (= i slen) (and mi? (rx-newline? (rx-char-at s i)))) (rx-match-nodes rest-nodes s i slen ci? mi? groups) -1)) ((= t "anchor-word") (if (rx-is-word-boundary? s i slen) (rx-match-nodes rest-nodes s i slen ci? mi? groups) -1)) ((= t "anchor-nonword") (if (not (rx-is-word-boundary? s i slen)) (rx-match-nodes rest-nodes s i slen ci? mi? groups) -1)) ((= t "group") (let ((gidx (get node "__idx__")) (inner (get node "__nodes__"))) (let ((g-end (rx-match-nodes inner s i slen ci? mi? groups))) (if (>= g-end 0) (begin (dict-set! groups (js-to-string gidx) (substring s i g-end)) (let ((final-end (rx-match-nodes rest-nodes s g-end slen ci? mi? groups))) (if (>= final-end 0) final-end (begin (dict-set! groups (js-to-string gidx) nil) -1)))) -1)))) ((= t "ncgroup") (let ((inner (get node "__nodes__"))) (rx-match-nodes (append inner rest-nodes) s i slen ci? mi? groups))) ((= t "alt") (let ((branches (get node "__branches__"))) (rx-try-branches branches rest-nodes s i slen ci? mi? groups))) ((= t "quant") (let ((inner-node (get node "__node__")) (mn (get node "__min__")) (mx (get node "__max__")) (lazy? (get node "__lazy__"))) (if lazy? (rx-quant-lazy inner-node mn mx rest-nodes s i slen ci? mi? groups 0) (rx-quant-greedy inner-node mn mx rest-nodes s i slen ci? mi? groups 0)))) (else -1))))))) (define rx-try-branches (fn (branches rest-nodes s i slen ci? mi? groups) (if (empty? branches) -1 (let ((res (rx-match-nodes (append (first branches) rest-nodes) s i slen ci? mi? groups))) (if (>= res 0) res (rx-try-branches (rest branches) rest-nodes s i slen ci? mi? groups)))))) ;; Greedy: expand as far as possible, then try rest from the longest match ;; Strategy: recurse forward (extend first); only try rest when extension fails (define rx-quant-greedy (fn (inner-node mn mx rest-nodes s i slen ci? mi? groups count) (let ((can-extend (and (< i slen) (or (= mx -1) (< count mx))))) (if can-extend (let ((ni (rx-match-one inner-node s i slen ci? mi? groups))) (if (>= ni 0) (let ((res (rx-quant-greedy inner-node mn mx rest-nodes s ni slen ci? mi? groups (+ count 1)))) (if (>= res 0) res (if (>= count mn) (rx-match-nodes rest-nodes s i slen ci? mi? groups) -1))) (if (>= count mn) (rx-match-nodes rest-nodes s i slen ci? mi? groups) -1))) (if (>= count mn) (rx-match-nodes rest-nodes s i slen ci? mi? groups) -1))))) ;; Lazy: try rest first, extend only if rest fails (define rx-quant-lazy (fn (inner-node mn mx rest-nodes s i slen ci? mi? groups count) (if (>= count mn) (let ((res (rx-match-nodes rest-nodes s i slen ci? mi? groups))) (if (>= res 0) res (if (and (< i slen) (or (= mx -1) (< count mx))) (let ((ni (rx-match-one inner-node s i slen ci? mi? groups))) (if (>= ni 0) (rx-quant-lazy inner-node mn mx rest-nodes s ni slen ci? mi? groups (+ count 1)) -1)) -1))) (if (< i slen) (let ((ni (rx-match-one inner-node s i slen ci? mi? groups))) (if (>= ni 0) (rx-quant-lazy inner-node mn mx rest-nodes s ni slen ci? mi? groups (+ count 1)) -1)) -1)))) ;; Match a single node at position i, return new pos or -1 (define rx-match-one (fn (node s i slen ci? mi? groups) (rx-match-nodes (list node) s i slen ci? mi? groups))) ;; ── Engine entry points ─────────────────────────────────────────── ;; Try matching at exactly position i. Returns result dict or nil. (define rx-try-at (fn (compiled s i slen ci? mi?) (let ((nodes (get compiled "nodes")) (ngroups (get compiled "ngroups"))) (let ((groups (dict))) (let ((end (rx-match-nodes nodes s i slen ci? mi? groups))) (if (>= end 0) (dict "start" i "end" end "groups" groups "ngroups" ngroups) nil)))))) ;; Find first match scanning from search-start. (define rx-find-from (fn (compiled s search-start slen ci? mi?) (if (> search-start slen) nil (let ((res (rx-try-at compiled s search-start slen ci? mi?))) (if res res (rx-find-from compiled s (+ search-start 1) slen ci? mi?)))))) ;; Build exec result dict from raw match result (define rx-build-exec-result (fn (s match-res) (let ((start (get match-res "start")) (end (get match-res "end")) (groups (get match-res "groups")) (ngroups (get match-res "ngroups"))) (let ((matched (substring s start end)) (caps (rx-build-captures groups ngroups 1))) (dict "match" matched "index" start "input" s "groups" caps))))) (define rx-build-captures (fn (groups ngroups idx) (if (> idx ngroups) (list) (let ((cap (get groups (js-to-string idx)))) (cons (if (= cap nil) :js-undefined cap) (rx-build-captures groups ngroups (+ idx 1))))))) ;; ── Public interface ────────────────────────────────────────────── ;; Lazy compile: build NFA on first use, cache under "__compiled__" (define rx-ensure-compiled! (fn (rx) (if (dict-has? rx "__compiled__") (get rx "__compiled__") (let ((c (rx-compile (get rx "source")))) (begin (dict-set! rx "__compiled__" c) c))))) (define rx-test (fn (rx s) (let ((compiled (rx-ensure-compiled! rx)) (ci? (get rx "ignoreCase")) (mi? (get rx "multiline")) (slen (len s))) (let ((start (if (get rx "global") (let ((li (get rx "lastIndex"))) (if (number? li) li 0)) 0))) (let ((res (rx-find-from compiled s start slen ci? mi?))) (if (get rx "global") (begin (dict-set! rx "lastIndex" (if res (get res "end") 0)) (if res true false)) (if res true false))))))) (define rx-exec (fn (rx s) (let ((compiled (rx-ensure-compiled! rx)) (ci? (get rx "ignoreCase")) (mi? (get rx "multiline")) (slen (len s))) (let ((start (if (get rx "global") (let ((li (get rx "lastIndex"))) (if (number? li) li 0)) 0))) (let ((res (rx-find-from compiled s start slen ci? mi?))) (if res (begin (when (get rx "global") (dict-set! rx "lastIndex" (get res "end"))) (rx-build-exec-result s res)) (begin (when (get rx "global") (dict-set! rx "lastIndex" 0)) nil))))))) ;; match-all for String.prototype.matchAll (define js-regex-match-all (fn (rx s) (let ((compiled (rx-ensure-compiled! rx)) (ci? (get rx "ignoreCase")) (mi? (get rx "multiline")) (slen (len s)) (results (list))) (rx-match-all-loop compiled s 0 slen ci? mi? results)))) (define rx-match-all-loop (fn (compiled s i slen ci? mi? results) (if (> i slen) results (let ((res (rx-find-from compiled s i slen ci? mi?))) (if res (begin (append! results (rx-build-exec-result s res)) (let ((next (get res "end"))) (rx-match-all-loop compiled s (if (= next i) (+ i 1) next) slen ci? mi? results))) results))))) ;; ── Install platform ────────────────────────────────────────────── (js-regex-platform-override! "test" rx-test) (js-regex-platform-override! "exec" rx-exec)