diff --git a/lib/js/conformance.sh b/lib/js/conformance.sh index c6f91502..b181e9ad 100755 --- a/lib/js/conformance.sh +++ b/lib/js/conformance.sh @@ -49,6 +49,8 @@ trap "rm -f $TMPFILE" EXIT echo '(load "lib/js/transpile.sx")' echo '(epoch 5)' echo '(load "lib/js/runtime.sx")' + echo '(epoch 6)' + echo '(load "lib/js/regex.sx")' epoch=100 for f in "${FIXTURES[@]}"; do diff --git a/lib/js/regex.sx b/lib/js/regex.sx new file mode 100644 index 00000000..c56957ca --- /dev/null +++ b/lib/js/regex.sx @@ -0,0 +1,943 @@ +;; lib/js/regex.sx — pure-SX recursive backtracking regex engine +;; +;; Installed via (js-regex-platform-override! ...) at load time. +;; Covers: character classes (\d\w\s . [abc] [^abc] [a-z]), +;; anchors (^ $ \b \B), quantifiers (* + ? {n,m} lazy variants), +;; groups (capturing + non-capturing), alternation (a|b), +;; flags: i (case-insensitive), g (global), m (multiline). +;; +;; Architecture: +;; 1. rx-parse-pattern — pattern string → compiled node list +;; 2. rx-match-nodes — recursive backtracker +;; 3. rx-exec / rx-test — public interface +;; 4. Install as {:test rx-test :exec rx-exec} + +;; ── Utilities ───────────────────────────────────────────────────── + +(define + rx-char-at + (fn (s i) (if (and (>= i 0) (< i (len s))) (char-at s i) ""))) + +(define + rx-digit? + (fn + (c) + (and (not (= c "")) (>= (char-code c) 48) (<= (char-code c) 57)))) + +(define + rx-word? + (fn + (c) + (and + (not (= c "")) + (or + (and (>= (char-code c) 65) (<= (char-code c) 90)) + (and (>= (char-code c) 97) (<= (char-code c) 122)) + (and (>= (char-code c) 48) (<= (char-code c) 57)) + (= c "_"))))) + +(define + rx-space? + (fn + (c) + (or (= c " ") (= c "\t") (= c "\n") (= c "\r") (= c "\\f") (= c "")))) + +(define rx-newline? (fn (c) (or (= c "\n") (= c "\r")))) + +(define + rx-downcase-char + (fn + (c) + (let + ((cc (char-code c))) + (if (and (>= cc 65) (<= cc 90)) (char-from-code (+ cc 32)) c)))) + +(define + rx-char-eq? + (fn + (a b ci?) + (if ci? (= (rx-downcase-char a) (rx-downcase-char b)) (= a b)))) + +(define + rx-parse-int + (fn + (pat i acc) + (let + ((c (rx-char-at pat i))) + (if + (rx-digit? c) + (rx-parse-int pat (+ i 1) (+ (* acc 10) (- (char-code c) 48))) + (list acc i))))) + +(define + rx-hex-digit-val + (fn + (c) + (cond + ((and (>= (char-code c) 48) (<= (char-code c) 57)) + (- (char-code c) 48)) + ((and (>= (char-code c) 65) (<= (char-code c) 70)) + (+ 10 (- (char-code c) 65))) + ((and (>= (char-code c) 97) (<= (char-code c) 102)) + (+ 10 (- (char-code c) 97))) + (else -1)))) + +(define + rx-parse-hex-n + (fn + (pat i n acc) + (if + (= n 0) + (list (char-from-code acc) i) + (let + ((v (rx-hex-digit-val (rx-char-at pat i)))) + (if + (< v 0) + (list (char-from-code acc) i) + (rx-parse-hex-n pat (+ i 1) (- n 1) (+ (* acc 16) v))))))) + +;; ── Pattern compiler ────────────────────────────────────────────── + +;; Node types (stored in dicts with "__t__" key): +;; literal : {:__t__ "literal" :__c__ char} +;; any : {:__t__ "any"} +;; class-d : {:__t__ "class-d" :__neg__ bool} +;; class-w : {:__t__ "class-w" :__neg__ bool} +;; class-s : {:__t__ "class-s" :__neg__ bool} +;; char-class: {:__t__ "char-class" :__neg__ bool :__items__ list} +;; anchor-start / anchor-end / anchor-word / anchor-nonword +;; quant : {:__t__ "quant" :__node__ n :__min__ m :__max__ mx :__lazy__ bool} +;; group : {:__t__ "group" :__idx__ i :__nodes__ list} +;; ncgroup : {:__t__ "ncgroup" :__nodes__ list} +;; alt : {:__t__ "alt" :__branches__ list-of-node-lists} + +;; parse one escape after `\`, returns (node new-i) +(define + rx-parse-escape + (fn + (pat i) + (let + ((c (rx-char-at pat i))) + (cond + ((= c "d") (list (dict "__t__" "class-d" "__neg__" false) (+ i 1))) + ((= c "D") (list (dict "__t__" "class-d" "__neg__" true) (+ i 1))) + ((= c "w") (list (dict "__t__" "class-w" "__neg__" false) (+ i 1))) + ((= c "W") (list (dict "__t__" "class-w" "__neg__" true) (+ i 1))) + ((= c "s") (list (dict "__t__" "class-s" "__neg__" false) (+ i 1))) + ((= c "S") (list (dict "__t__" "class-s" "__neg__" true) (+ i 1))) + ((= c "b") (list (dict "__t__" "anchor-word") (+ i 1))) + ((= c "B") (list (dict "__t__" "anchor-nonword") (+ i 1))) + ((= c "n") (list (dict "__t__" "literal" "__c__" "\n") (+ i 1))) + ((= c "r") (list (dict "__t__" "literal" "__c__" "\r") (+ i 1))) + ((= c "t") (list (dict "__t__" "literal" "__c__" "\t") (+ i 1))) + ((= c "f") (list (dict "__t__" "literal" "__c__" "\\f") (+ i 1))) + ((= c "v") (list (dict "__t__" "literal" "__c__" "") (+ i 1))) + ((= c "u") + (let + ((res (rx-parse-hex-n pat (+ i 1) 4 0))) + (list (dict "__t__" "literal" "__c__" (nth res 0)) (nth res 1)))) + ((= c "x") + (let + ((res (rx-parse-hex-n pat (+ i 1) 2 0))) + (list (dict "__t__" "literal" "__c__" (nth res 0)) (nth res 1)))) + (else (list (dict "__t__" "literal" "__c__" c) (+ i 1))))))) + +;; parse a char-class item inside [...], returns (item new-i) +(define + rx-parse-class-item + (fn + (pat i) + (let + ((c (rx-char-at pat i))) + (cond + ((= c "\\") + (let + ((esc (rx-parse-escape pat (+ i 1)))) + (let + ((node (nth esc 0)) (ni (nth esc 1))) + (let + ((t (get node "__t__"))) + (cond + ((= t "class-d") + (list + (dict "kind" "class-d" "neg" (get node "__neg__")) + ni)) + ((= t "class-w") + (list + (dict "kind" "class-w" "neg" (get node "__neg__")) + ni)) + ((= t "class-s") + (list + (dict "kind" "class-s" "neg" (get node "__neg__")) + ni)) + (else + (let + ((lc (get node "__c__"))) + (if + (and + (= (rx-char-at pat ni) "-") + (not (= (rx-char-at pat (+ ni 1)) "]"))) + (let + ((hi-c (rx-char-at pat (+ ni 1)))) + (list + (dict "kind" "range" "lo" lc "hi" hi-c) + (+ ni 2))) + (list (dict "kind" "lit" "c" lc) ni))))))))) + (else + (if + (and + (not (= c "")) + (= (rx-char-at pat (+ i 1)) "-") + (not (= (rx-char-at pat (+ i 2)) "]")) + (not (= (rx-char-at pat (+ i 2)) ""))) + (let + ((hi-c (rx-char-at pat (+ i 2)))) + (list (dict "kind" "range" "lo" c "hi" hi-c) (+ i 3))) + (list (dict "kind" "lit" "c" c) (+ i 1)))))))) + +(define + rx-parse-class-items + (fn + (pat i items) + (let + ((c (rx-char-at pat i))) + (if + (or (= c "]") (= c "")) + (list items i) + (let + ((res (rx-parse-class-item pat i))) + (begin + (append! items (nth res 0)) + (rx-parse-class-items pat (nth res 1) items))))))) + +;; parse a sequence until stop-ch or EOF; returns (nodes new-i groups-count) +(define + rx-parse-seq + (fn + (pat i stop-ch ds) + (let + ((c (rx-char-at pat i))) + (cond + ((= c "") (list (get ds "nodes") i (get ds "groups"))) + ((= c stop-ch) (list (get ds "nodes") i (get ds "groups"))) + ((= c "|") (rx-parse-alt-rest pat i ds)) + (else + (let + ((res (rx-parse-atom pat i ds))) + (let + ((node (nth res 0)) (ni (nth res 1)) (ds2 (nth res 2))) + (let + ((qres (rx-parse-quant pat ni node))) + (begin + (append! (get ds2 "nodes") (nth qres 0)) + (rx-parse-seq pat (nth qres 1) stop-ch ds2)))))))))) + +;; when we hit | inside a sequence, collect all alternatives +(define + rx-parse-alt-rest + (fn + (pat i ds) + (let + ((left-branch (get ds "nodes")) (branches (list))) + (begin + (append! branches left-branch) + (rx-parse-alt-branches pat i (get ds "groups") branches))))) + +(define + rx-parse-alt-branches + (fn + (pat i n-groups branches) + (let + ((new-nodes (list)) (ds2 (dict "groups" n-groups "nodes" new-nodes))) + (let + ((res (rx-parse-seq pat (+ i 1) "|" ds2))) + (begin + (append! branches (nth res 0)) + (let + ((ni2 (nth res 1)) (g2 (nth res 2))) + (if + (= (rx-char-at pat ni2) "|") + (rx-parse-alt-branches pat ni2 g2 branches) + (list + (list (dict "__t__" "alt" "__branches__" branches)) + ni2 + g2)))))))) + +;; parse quantifier suffix, returns (node new-i) +(define + rx-parse-quant + (fn + (pat i node) + (let + ((c (rx-char-at pat i))) + (cond + ((= c "*") + (let + ((lazy? (= (rx-char-at pat (+ i 1)) "?"))) + (list + (dict + "__t__" + "quant" + "__node__" + node + "__min__" + 0 + "__max__" + -1 + "__lazy__" + lazy?) + (if lazy? (+ i 2) (+ i 1))))) + ((= c "+") + (let + ((lazy? (= (rx-char-at pat (+ i 1)) "?"))) + (list + (dict + "__t__" + "quant" + "__node__" + node + "__min__" + 1 + "__max__" + -1 + "__lazy__" + lazy?) + (if lazy? (+ i 2) (+ i 1))))) + ((= c "?") + (let + ((lazy? (= (rx-char-at pat (+ i 1)) "?"))) + (list + (dict + "__t__" + "quant" + "__node__" + node + "__min__" + 0 + "__max__" + 1 + "__lazy__" + lazy?) + (if lazy? (+ i 2) (+ i 1))))) + ((= c "{") + (let + ((mres (rx-parse-int pat (+ i 1) 0))) + (let + ((mn (nth mres 0)) (mi (nth mres 1))) + (let + ((sep (rx-char-at pat mi))) + (cond + ((= sep "}") + (let + ((lazy? (= (rx-char-at pat (+ mi 1)) "?"))) + (list + (dict + "__t__" + "quant" + "__node__" + node + "__min__" + mn + "__max__" + mn + "__lazy__" + lazy?) + (if lazy? (+ mi 2) (+ mi 1))))) + ((= sep ",") + (let + ((c2 (rx-char-at pat (+ mi 1)))) + (if + (= c2 "}") + (let + ((lazy? (= (rx-char-at pat (+ mi 2)) "?"))) + (list + (dict + "__t__" + "quant" + "__node__" + node + "__min__" + mn + "__max__" + -1 + "__lazy__" + lazy?) + (if lazy? (+ mi 3) (+ mi 2)))) + (let + ((mxres (rx-parse-int pat (+ mi 1) 0))) + (let + ((mx (nth mxres 0)) (mxi (nth mxres 1))) + (let + ((lazy? (= (rx-char-at pat (+ mxi 1)) "?"))) + (list + (dict + "__t__" + "quant" + "__node__" + node + "__min__" + mn + "__max__" + mx + "__lazy__" + lazy?) + (if lazy? (+ mxi 2) (+ mxi 1))))))))) + (else (list node i))))))) + (else (list node i)))))) + +;; parse one atom, returns (node new-i new-ds) +(define + rx-parse-atom + (fn + (pat i ds) + (let + ((c (rx-char-at pat i))) + (cond + ((= c ".") (list (dict "__t__" "any") (+ i 1) ds)) + ((= c "^") (list (dict "__t__" "anchor-start") (+ i 1) ds)) + ((= c "$") (list (dict "__t__" "anchor-end") (+ i 1) ds)) + ((= c "\\") + (let + ((esc (rx-parse-escape pat (+ i 1)))) + (list (nth esc 0) (nth esc 1) ds))) + ((= c "[") + (let + ((neg? (= (rx-char-at pat (+ i 1)) "^"))) + (let + ((start (if neg? (+ i 2) (+ i 1))) (items (list))) + (let + ((res (rx-parse-class-items pat start items))) + (let + ((ci (nth res 1))) + (list + (dict + "__t__" + "char-class" + "__neg__" + neg? + "__items__" + items) + (+ ci 1) + ds)))))) + ((= c "(") + (let + ((c2 (rx-char-at pat (+ i 1)))) + (if + (and (= c2 "?") (= (rx-char-at pat (+ i 2)) ":")) + (let + ((inner-nodes (list)) + (inner-ds + (dict "groups" (get ds "groups") "nodes" inner-nodes))) + (let + ((res (rx-parse-seq pat (+ i 3) ")" inner-ds))) + (list + (dict "__t__" "ncgroup" "__nodes__" (nth res 0)) + (+ (nth res 1) 1) + (dict "groups" (nth res 2) "nodes" (get ds "nodes"))))) + (let + ((gidx (+ (get ds "groups") 1)) (inner-nodes (list))) + (let + ((inner-ds (dict "groups" gidx "nodes" inner-nodes))) + (let + ((res (rx-parse-seq pat (+ i 1) ")" inner-ds))) + (list + (dict + "__t__" + "group" + "__idx__" + gidx + "__nodes__" + (nth res 0)) + (+ (nth res 1) 1) + (dict "groups" (nth res 2) "nodes" (get ds "nodes"))))))))) + (else (list (dict "__t__" "literal" "__c__" c) (+ i 1) ds)))))) + +;; top-level compile +(define + rx-compile + (fn + (pattern) + (let + ((nodes (list)) (ds (dict "groups" 0 "nodes" nodes))) + (let + ((res (rx-parse-seq pattern 0 "" ds))) + (dict "nodes" (nth res 0) "ngroups" (nth res 2)))))) + +;; ── Matcher ─────────────────────────────────────────────────────── + +;; Match a char-class item against character c +(define + rx-item-matches? + (fn + (item c ci?) + (let + ((kind (get item "kind"))) + (cond + ((= kind "lit") (rx-char-eq? c (get item "c") ci?)) + ((= kind "range") + (let + ((lo (if ci? (rx-downcase-char (get item "lo")) (get item "lo"))) + (hi + (if ci? (rx-downcase-char (get item "hi")) (get item "hi"))) + (dc (if ci? (rx-downcase-char c) c))) + (and + (>= (char-code dc) (char-code lo)) + (<= (char-code dc) (char-code hi))))) + ((= kind "class-d") + (let ((m (rx-digit? c))) (if (get item "neg") (not m) m))) + ((= kind "class-w") + (let ((m (rx-word? c))) (if (get item "neg") (not m) m))) + ((= kind "class-s") + (let ((m (rx-space? c))) (if (get item "neg") (not m) m))) + (else false))))) + +(define + rx-class-items-any? + (fn + (items c ci?) + (if + (empty? items) + false + (if + (rx-item-matches? (first items) c ci?) + true + (rx-class-items-any? (rest items) c ci?))))) + +(define + rx-class-matches? + (fn + (node c ci?) + (let + ((neg? (get node "__neg__")) (items (get node "__items__"))) + (let + ((hit (rx-class-items-any? items c ci?))) + (if neg? (not hit) hit))))) + +;; Word boundary check +(define + rx-is-word-boundary? + (fn + (s i slen) + (let + ((before (if (> i 0) (rx-word? (char-at s (- i 1))) false)) + (after (if (< i slen) (rx-word? (char-at s i)) false))) + (not (= before after))))) + +;; ── Core matcher ────────────────────────────────────────────────── +;; +;; rx-match-nodes : nodes s i slen ci? mi? groups → end-pos or -1 +;; +;; Matches `nodes` starting at position `i` in string `s`. +;; Returns the position after the last character consumed, or -1 on failure. +;; Mutates `groups` dict to record captures. + +(define + rx-match-nodes + (fn + (nodes s i slen ci? mi? groups) + (if + (empty? nodes) + i + (let + ((node (first nodes)) (rest-nodes (rest nodes))) + (let + ((t (get node "__t__"))) + (cond + ((= t "literal") + (if + (and + (< i slen) + (rx-char-eq? (char-at s i) (get node "__c__") ci?)) + (rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups) + -1)) + ((= t "any") + (if + (and (< i slen) (not (rx-newline? (char-at s i)))) + (rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups) + -1)) + ((= t "class-d") + (let + ((m (and (< i slen) (rx-digit? (char-at s i))))) + (if + (if (get node "__neg__") (not m) m) + (rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups) + -1))) + ((= t "class-w") + (let + ((m (and (< i slen) (rx-word? (char-at s i))))) + (if + (if (get node "__neg__") (not m) m) + (rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups) + -1))) + ((= t "class-s") + (let + ((m (and (< i slen) (rx-space? (char-at s i))))) + (if + (if (get node "__neg__") (not m) m) + (rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups) + -1))) + ((= t "char-class") + (if + (and (< i slen) (rx-class-matches? node (char-at s i) ci?)) + (rx-match-nodes rest-nodes s (+ i 1) slen ci? mi? groups) + -1)) + ((= t "anchor-start") + (if + (or + (= i 0) + (and mi? (rx-newline? (rx-char-at s (- i 1))))) + (rx-match-nodes rest-nodes s i slen ci? mi? groups) + -1)) + ((= t "anchor-end") + (if + (or (= i slen) (and mi? (rx-newline? (rx-char-at s i)))) + (rx-match-nodes rest-nodes s i slen ci? mi? groups) + -1)) + ((= t "anchor-word") + (if + (rx-is-word-boundary? s i slen) + (rx-match-nodes rest-nodes s i slen ci? mi? groups) + -1)) + ((= t "anchor-nonword") + (if + (not (rx-is-word-boundary? s i slen)) + (rx-match-nodes rest-nodes s i slen ci? mi? groups) + -1)) + ((= t "group") + (let + ((gidx (get node "__idx__")) + (inner (get node "__nodes__"))) + (let + ((g-end (rx-match-nodes inner s i slen ci? mi? groups))) + (if + (>= g-end 0) + (begin + (dict-set! + groups + (js-to-string gidx) + (substring s i g-end)) + (let + ((final-end (rx-match-nodes rest-nodes s g-end slen ci? mi? groups))) + (if + (>= final-end 0) + final-end + (begin + (dict-set! groups (js-to-string gidx) nil) + -1)))) + -1)))) + ((= t "ncgroup") + (let + ((inner (get node "__nodes__"))) + (rx-match-nodes + (append inner rest-nodes) + s + i + slen + ci? + mi? + groups))) + ((= t "alt") + (let + ((branches (get node "__branches__"))) + (rx-try-branches branches rest-nodes s i slen ci? mi? groups))) + ((= t "quant") + (let + ((inner-node (get node "__node__")) + (mn (get node "__min__")) + (mx (get node "__max__")) + (lazy? (get node "__lazy__"))) + (if + lazy? + (rx-quant-lazy + inner-node + mn + mx + rest-nodes + s + i + slen + ci? + mi? + groups + 0) + (rx-quant-greedy + inner-node + mn + mx + rest-nodes + s + i + slen + ci? + mi? + groups + 0)))) + (else -1))))))) + +(define + rx-try-branches + (fn + (branches rest-nodes s i slen ci? mi? groups) + (if + (empty? branches) + -1 + (let + ((res (rx-match-nodes (append (first branches) rest-nodes) s i slen ci? mi? groups))) + (if + (>= res 0) + res + (rx-try-branches (rest branches) rest-nodes s i slen ci? mi? groups)))))) + +;; Greedy: expand as far as possible, then try rest from the longest match +;; Strategy: recurse forward (extend first); only try rest when extension fails +(define + rx-quant-greedy + (fn + (inner-node mn mx rest-nodes s i slen ci? mi? groups count) + (let + ((can-extend (and (< i slen) (or (= mx -1) (< count mx))))) + (if + can-extend + (let + ((ni (rx-match-one inner-node s i slen ci? mi? groups))) + (if + (>= ni 0) + (let + ((res (rx-quant-greedy inner-node mn mx rest-nodes s ni slen ci? mi? groups (+ count 1)))) + (if + (>= res 0) + res + (if + (>= count mn) + (rx-match-nodes rest-nodes s i slen ci? mi? groups) + -1))) + (if + (>= count mn) + (rx-match-nodes rest-nodes s i slen ci? mi? groups) + -1))) + (if + (>= count mn) + (rx-match-nodes rest-nodes s i slen ci? mi? groups) + -1))))) + +;; Lazy: try rest first, extend only if rest fails +(define + rx-quant-lazy + (fn + (inner-node mn mx rest-nodes s i slen ci? mi? groups count) + (if + (>= count mn) + (let + ((res (rx-match-nodes rest-nodes s i slen ci? mi? groups))) + (if + (>= res 0) + res + (if + (and (< i slen) (or (= mx -1) (< count mx))) + (let + ((ni (rx-match-one inner-node s i slen ci? mi? groups))) + (if + (>= ni 0) + (rx-quant-lazy + inner-node + mn + mx + rest-nodes + s + ni + slen + ci? + mi? + groups + (+ count 1)) + -1)) + -1))) + (if + (< i slen) + (let + ((ni (rx-match-one inner-node s i slen ci? mi? groups))) + (if + (>= ni 0) + (rx-quant-lazy + inner-node + mn + mx + rest-nodes + s + ni + slen + ci? + mi? + groups + (+ count 1)) + -1)) + -1)))) + +;; Match a single node at position i, return new pos or -1 +(define + rx-match-one + (fn + (node s i slen ci? mi? groups) + (rx-match-nodes (list node) s i slen ci? mi? groups))) + +;; ── Engine entry points ─────────────────────────────────────────── + +;; Try matching at exactly position i. Returns result dict or nil. +(define + rx-try-at + (fn + (compiled s i slen ci? mi?) + (let + ((nodes (get compiled "nodes")) (ngroups (get compiled "ngroups"))) + (let + ((groups (dict))) + (let + ((end (rx-match-nodes nodes s i slen ci? mi? groups))) + (if + (>= end 0) + (dict "start" i "end" end "groups" groups "ngroups" ngroups) + nil)))))) + +;; Find first match scanning from search-start. +(define + rx-find-from + (fn + (compiled s search-start slen ci? mi?) + (if + (> search-start slen) + nil + (let + ((res (rx-try-at compiled s search-start slen ci? mi?))) + (if + res + res + (rx-find-from compiled s (+ search-start 1) slen ci? mi?)))))) + +;; Build exec result dict from raw match result +(define + rx-build-exec-result + (fn + (s match-res) + (let + ((start (get match-res "start")) + (end (get match-res "end")) + (groups (get match-res "groups")) + (ngroups (get match-res "ngroups"))) + (let + ((matched (substring s start end)) + (caps (rx-build-captures groups ngroups 1))) + (dict "match" matched "index" start "input" s "groups" caps))))) + +(define + rx-build-captures + (fn + (groups ngroups idx) + (if + (> idx ngroups) + (list) + (let + ((cap (get groups (js-to-string idx)))) + (cons + (if (= cap nil) :js-undefined cap) + (rx-build-captures groups ngroups (+ idx 1))))))) + +;; ── Public interface ────────────────────────────────────────────── + +;; Lazy compile: build NFA on first use, cache under "__compiled__" +(define + rx-ensure-compiled! + (fn + (rx) + (if + (dict-has? rx "__compiled__") + (get rx "__compiled__") + (let + ((c (rx-compile (get rx "source")))) + (begin (dict-set! rx "__compiled__" c) c))))) + +(define + rx-test + (fn + (rx s) + (let + ((compiled (rx-ensure-compiled! rx)) + (ci? (get rx "ignoreCase")) + (mi? (get rx "multiline")) + (slen (len s))) + (let + ((start (if (get rx "global") (let ((li (get rx "lastIndex"))) (if (number? li) li 0)) 0))) + (let + ((res (rx-find-from compiled s start slen ci? mi?))) + (if + (get rx "global") + (begin + (dict-set! rx "lastIndex" (if res (get res "end") 0)) + (if res true false)) + (if res true false))))))) + +(define + rx-exec + (fn + (rx s) + (let + ((compiled (rx-ensure-compiled! rx)) + (ci? (get rx "ignoreCase")) + (mi? (get rx "multiline")) + (slen (len s))) + (let + ((start (if (get rx "global") (let ((li (get rx "lastIndex"))) (if (number? li) li 0)) 0))) + (let + ((res (rx-find-from compiled s start slen ci? mi?))) + (if + res + (begin + (when + (get rx "global") + (dict-set! rx "lastIndex" (get res "end"))) + (rx-build-exec-result s res)) + (begin + (when (get rx "global") (dict-set! rx "lastIndex" 0)) + nil))))))) + +;; match-all for String.prototype.matchAll +(define + js-regex-match-all + (fn + (rx s) + (let + ((compiled (rx-ensure-compiled! rx)) + (ci? (get rx "ignoreCase")) + (mi? (get rx "multiline")) + (slen (len s)) + (results (list))) + (rx-match-all-loop compiled s 0 slen ci? mi? results)))) + +(define + rx-match-all-loop + (fn + (compiled s i slen ci? mi? results) + (if + (> i slen) + results + (let + ((res (rx-find-from compiled s i slen ci? mi?))) + (if + res + (begin + (append! results (rx-build-exec-result s res)) + (let + ((next (get res "end"))) + (rx-match-all-loop + compiled + s + (if (= next i) (+ i 1) next) + slen + ci? + mi? + results))) + results))))) + +;; ── Install platform ────────────────────────────────────────────── + +(js-regex-platform-override! "test" rx-test) +(js-regex-platform-override! "exec" rx-exec) diff --git a/lib/js/runtime.sx b/lib/js/runtime.sx index e1021cc2..1872b3e9 100644 --- a/lib/js/runtime.sx +++ b/lib/js/runtime.sx @@ -2032,7 +2032,15 @@ (&rest args) (cond ((= (len args) 0) nil) - ((js-regex? (nth args 0)) (js-regex-stub-exec (nth args 0) s)) + ((js-regex? (nth args 0)) + (let + ((rx (nth args 0))) + (let + ((impl (get __js_regex_platform__ "exec"))) + (if + (js-undefined? impl) + (js-regex-stub-exec rx s) + (impl rx s))))) (else (let ((needle (js-to-string (nth args 0)))) @@ -2041,7 +2049,7 @@ (if (= idx -1) nil - (let ((res (list))) (append! res needle) res)))))))) + (let ((res (list))) (begin (append! res needle) res))))))))) ((= name "at") (fn (i) @@ -2099,6 +2107,20 @@ ((= name "toWellFormed") (fn () s)) (else js-undefined)))) +(define __js_tdz_sentinel__ (dict "__tdz__" true)) + +(define js-tdz? (fn (v) (and (dict? v) (dict-has? v "__tdz__")))) + +(define + js-tdz-check + (fn + (name val) + (if + (js-tdz? val) + (raise + (TypeError (str "Cannot access '" name "' before initialization"))) + val))) + (define js-string-slice (fn diff --git a/lib/js/test.sh b/lib/js/test.sh index de6caea5..80cb135a 100755 --- a/lib/js/test.sh +++ b/lib/js/test.sh @@ -33,6 +33,8 @@ cat > "$TMPFILE" << 'EPOCHS' (load "lib/js/transpile.sx") (epoch 5) (load "lib/js/runtime.sx") +(epoch 6) +(load "lib/js/regex.sx") ;; ── Phase 0: stubs still behave ───────────────────────────────── (epoch 10) @@ -1323,6 +1325,108 @@ cat > "$TMPFILE" << 'EPOCHS' (epoch 3505) (eval "(js-eval \"var a = {length: 3, 0: 10, 1: 20, 2: 30}; var sum = 0; Array.prototype.forEach.call(a, function(x){sum += x;}); sum\")") +;; ── Phase 12: Regex engine ──────────────────────────────────────── +;; Platform is installed (test key is a function, not undefined) +(epoch 5000) +(eval "(js-undefined? (get __js_regex_platform__ \"test\"))") +(epoch 5001) +(eval "(js-eval \"/foo/.test('hi foo bar')\")") +(epoch 5002) +(eval "(js-eval \"/foo/.test('hi bar')\")") +;; Case-insensitive flag +(epoch 5003) +(eval "(js-eval \"/FOO/i.test('hello foo world')\")") +;; Anchors +(epoch 5004) +(eval "(js-eval \"/^hello/.test('hello world')\")") +(epoch 5005) +(eval "(js-eval \"/^hello/.test('say hello')\")") +(epoch 5006) +(eval "(js-eval \"/world$/.test('hello world')\")") +;; Character classes +(epoch 5007) +(eval "(js-eval \"/\\\\d+/.test('abc 123')\")") +(epoch 5008) +(eval "(js-eval \"/\\\\w+/.test('hello')\")") +(epoch 5009) +(eval "(js-eval \"/[abc]/.test('dog')\")") +(epoch 5010) +(eval "(js-eval \"/[abc]/.test('cat')\")") +;; Quantifiers +(epoch 5011) +(eval "(js-eval \"/a*b/.test('b')\")") +(epoch 5012) +(eval "(js-eval \"/a+b/.test('b')\")") +(epoch 5013) +(eval "(js-eval \"/a{2,3}/.test('aa')\")") +(epoch 5014) +(eval "(js-eval \"/a{2,3}/.test('a')\")") +;; Dot +(epoch 5015) +(eval "(js-eval \"/h.llo/.test('hello')\")") +(epoch 5016) +(eval "(js-eval \"/h.llo/.test('hllo')\")") +;; exec result +(epoch 5017) +(eval "(js-eval \"var m = /foo(\\\\w+)/.exec('foobar'); m.match\")") +(epoch 5018) +(eval "(js-eval \"var m = /foo(\\\\w+)/.exec('foobar'); m.index\")") +(epoch 5019) +(eval "(js-eval \"var m = /foo(\\\\w+)/.exec('foobar'); m.groups[0]\")") +;; Alternation +(epoch 5020) +(eval "(js-eval \"/cat|dog/.test('I have a dog')\")") +(epoch 5021) +(eval "(js-eval \"/cat|dog/.test('I have a fish')\")") +;; Non-capturing group +(epoch 5022) +(eval "(js-eval \"/(?:foo)+/.test('foofoo')\")") +;; Negated char class +(epoch 5023) +(eval "(js-eval \"/[^abc]/.test('d')\")") +(epoch 5024) +(eval "(js-eval \"/[^abc]/.test('a')\")") +;; Range inside char class +(epoch 5025) +(eval "(js-eval \"/[a-z]+/.test('hello')\")") +;; Word boundary +(epoch 5026) +(eval "(js-eval \"/\\\\bword\\\\b/.test('a word here')\")") +(epoch 5027) +(eval "(js-eval \"/\\\\bword\\\\b/.test('password')\")") +;; Lazy quantifier +(epoch 5028) +(eval "(js-eval \"var m = /a+?/.exec('aaa'); m.match\")") +;; Global flag exec +(epoch 5029) +(eval "(js-eval \"var r=/\\\\d+/g; r.exec('a1b2'); r.exec('a1b2').match\")") +;; String.prototype.match with regex +(epoch 5030) +(eval "(js-eval \"'hello world'.match(/\\\\w+/).match\")") +;; String.prototype.search +(epoch 5031) +(eval "(js-eval \"'hello world'.search(/world/)\")") +;; String.prototype.replace with regex +(epoch 5032) +(eval "(js-eval \"'hello world'.replace(/world/, 'there')\")") +;; multiline anchor +(epoch 5033) +(eval "(js-eval \"/^bar/m.test('foo\\nbar')\")") + +;; ── Phase 13: let/const TDZ infrastructure ─────────────────────── +;; The TDZ sentinel and checker are defined in runtime.sx. +;; let/const bindings work normally after initialization. +(epoch 5100) +(eval "(js-eval \"let x = 5; x\")") +(epoch 5101) +(eval "(js-eval \"const y = 42; y\")") +;; TDZ sentinel exists and is detectable +(epoch 5102) +(eval "(js-tdz? __js_tdz_sentinel__)") +;; js-tdz-check passes through non-sentinel values +(epoch 5103) +(eval "(js-tdz-check \"x\" 42)") + EPOCHS @@ -2042,6 +2146,48 @@ check 3503 "indexOf.call arrLike" '1' check 3504 "filter.call arrLike" '"2,3"' check 3505 "forEach.call arrLike sum" '60' +# ── Phase 12: Regex engine ──────────────────────────────────────── +check 5000 "regex platform installed" 'false' +check 5001 "/foo/ matches" 'true' +check 5002 "/foo/ no match" 'false' +check 5003 "/FOO/i case-insensitive" 'true' +check 5004 "/^hello/ anchor match" 'true' +check 5005 "/^hello/ anchor no-match" 'false' +check 5006 "/world$/ end anchor" 'true' +check 5007 "/\\d+/ digit class" 'true' +check 5008 "/\\w+/ word class" 'true' +check 5009 "/[abc]/ class no-match" 'false' +check 5010 "/[abc]/ class match" 'true' +check 5011 "/a*b/ zero-or-more" 'true' +check 5012 "/a+b/ one-or-more no-match" 'false' +check 5013 "/a{2,3}/ quant match" 'true' +check 5014 "/a{2,3}/ quant no-match" 'false' +check 5015 "dot matches any" 'true' +check 5016 "dot requires char" 'false' +check 5017 "exec match string" '"foobar"' +check 5018 "exec match index" '0' +check 5019 "exec capture group" '"bar"' +check 5020 "alternation cat|dog match" 'true' +check 5021 "alternation cat|dog no-match" 'false' +check 5022 "non-capturing group" 'true' +check 5023 "negated class match" 'true' +check 5024 "negated class no-match" 'false' +check 5025 "range [a-z]+" 'true' +check 5026 "word boundary match" 'true' +check 5027 "word boundary no-match" 'false' +check 5028 "lazy quantifier" '"a"' +check 5029 "global exec advances" '"2"' +check 5030 "String.match regex" '"hello"' +check 5031 "String.search regex" '6' +check 5032 "String.replace regex" '"hello there"' +check 5033 "multiline anchor" 'true' + +# ── Phase 13: let/const TDZ infrastructure ─────────────────────── +check 5100 "let binding initialized" '5' +check 5101 "const binding initialized" '42' +check 5102 "TDZ sentinel is detectable" 'true' +check 5103 "tdz-check passes non-sentinel" '42' + TOTAL=$((PASS + FAIL)) if [ $FAIL -eq 0 ]; then echo "✓ $PASS/$TOTAL JS-on-SX tests passed" diff --git a/lib/js/test262-runner.py b/lib/js/test262-runner.py index 9a0807b7..0b803c37 100644 --- a/lib/js/test262-runner.py +++ b/lib/js/test262-runner.py @@ -798,6 +798,7 @@ class ServerSession: self._run_and_collect(3, '(load "lib/js/parser.sx")', timeout=60.0) self._run_and_collect(4, '(load "lib/js/transpile.sx")', timeout=60.0) self._run_and_collect(5, '(load "lib/js/runtime.sx")', timeout=60.0) + self._run_and_collect(50, '(load "lib/js/regex.sx")', timeout=60.0) # Preload the stub harness — use precomputed SX cache when available # (huge win: ~15s js-eval HARNESS_STUB → ~0s load precomputed .sx). cache_rel = _harness_cache_rel_path()