diff --git a/hosts/javascript/platform.py b/hosts/javascript/platform.py
index 8a3bb406..dc8f1e63 100644
--- a/hosts/javascript/platform.py
+++ b/hosts/javascript/platform.py
@@ -1702,6 +1702,87 @@ PRIMITIVES_JS_MODULES: dict[str, str] = {
       src.data.forEach(function(v, k) { dst.data.set(k, v); });
       return null;
     };
+''',
+    "stdlib.regexp": '''
+    // stdlib.regexp — native JS RegExp wrappers
+    function SxRegexp(source, flags) {
+      this._regexp = true;
+      this.source = source;
+      this.flags = flags || "";
+    }
+    // Compile on first use and cache on the SxRegexp. Only the i/m/s flags
+    // are forwarded to RegExp; other flag characters are ignored.
+    function sxRxCompile(rx) {
+      if (!rx._compiled) {
+        var jsFlags = "";
+        if (rx.flags.indexOf("i") >= 0) jsFlags += "i";
+        if (rx.flags.indexOf("m") >= 0) jsFlags += "m";
+        if (rx.flags.indexOf("s") >= 0) jsFlags += "s";
+        rx._compiled = new RegExp(rx.source, jsFlags);
+      }
+      return rx._compiled;
+    }
+    // Fresh global copy for match-all / replace-all, so lastIndex state
+    // never lands on the cached regexp.
+    function sxRxGlobal(rx) {
+      var compiled = sxRxCompile(rx);
+      return new RegExp(compiled.source, "g" + compiled.flags);
+    }
+    // Match dict for a RegExp match array (NIL when there is no match).
+    // Groups that did not participate are reported as "".
+    function sxRxMatchDict(m, input) {
+      if (!m) return NIL;
+      var groups = [];
+      for (var i = 1; i < m.length; i++) groups.push(m[i] !== undefined ? m[i] : "");
+      return {"match": m[0], "start": m.index, "end": m.index + m[0].length,
+              "input": input, "groups": groups};
+    }
+    PRIMITIVES["make-regexp"] = function(src, flags) {
+      var rx = new SxRegexp(src, flags || "");
+      // Compile eagerly so an invalid pattern is rejected here, with the
+      // same message as the OCaml host, instead of at first use.
+      try {
+        sxRxCompile(rx);
+      } catch (e) {
+        throw new Error("make-regexp: invalid pattern: " + src);
+      }
+      return rx;
+    };
+    PRIMITIVES["regexp?"] = function(v) { return v instanceof SxRegexp; };
+    PRIMITIVES["regexp-source"] = function(rx) { return rx.source; };
+    PRIMITIVES["regexp-flags"] = function(rx) { return rx.flags; };
+    PRIMITIVES["regexp-match"] = function(rx, s) {
+      // The cached regexp is never global, so match() yields the first match
+      // (with index and groups) and keeps no lastIndex state.
+      return sxRxMatchDict(s.match(sxRxCompile(rx)), s);
+    };
+    PRIMITIVES["regexp-match-all"] = function(rx, s) {
+      var re = sxRxGlobal(rx);
+      var results = [], m;
+      while ((m = re.exec(s)) !== null) {
+        results.push(sxRxMatchDict(m, s));
+        // An empty match does not advance lastIndex; step past it by hand.
+        if (m[0].length === 0) re.lastIndex++;
+      }
+      return results;
+    };
+    PRIMITIVES["regexp-replace"] = function(rx, s, replacement) {
+      return s.replace(sxRxCompile(rx), replacement);
+    };
+    PRIMITIVES["regexp-replace-all"] = function(rx, s, replacement) {
+      return s.replace(sxRxGlobal(rx), replacement);
+    };
+    PRIMITIVES["regexp-split"] = function(rx, s) {
+      var re = sxRxCompile(rx);
+      var parts = s.split(re);
+      // split() splices every capture group (possibly undefined) in after
+      // each chunk, so chunks sit at multiples of (group count + 1).
+      // Appending an empty alternative makes exec("") always succeed, and
+      // the length of that match array is exactly that stride.
+      var stride = new RegExp(re.source + "|", re.flags).exec("").length;
+      if (stride === 1) return parts;
+      return parts.filter(function(chunk, i) { return i % stride === 0; });
+    };
 ''',
     "stdlib.sets": '''
     // stdlib.sets — structural sets keyed by write-to-string serialization
@@ -1802,6 +1883,7 @@ PLATFORM_JS_PRE = '''
     if (x._string_buffer) return "string-buffer";
     if (x._hash_table) return "hash-table";
     if (x._sxset) return "set";
+    if (x._regexp) return "regexp";
     if (x._rational) return "rational";
     if (typeof Node !== "undefined" && x instanceof Node) return "dom-node";
     if (Array.isArray(x)) return "list";
diff --git a/hosts/ocaml/lib/sx_primitives.ml b/hosts/ocaml/lib/sx_primitives.ml
index b7d8dfea..b4fced92 100644
--- a/hosts/ocaml/lib/sx_primitives.ml
+++ b/hosts/ocaml/lib/sx_primitives.ml
@@ -2224,6 +2224,118 @@ let () =
       String flags
     | _ -> raise (Eval_error "regex-flags: (regex)"));
 
+  (* make-regexp / regexp? / regexp-match / regexp-match-all / regexp-replace / regexp-replace-all / regexp-split *)
+  (* Translate an SX flags string into Re.Pcre compile flags; characters
+     other than i, m and s are ignored. *)
+  let parse_re_flags flags =
+    let opts = ref [] in
+    String.iter (function
+      | 'i' -> opts := `CASELESS :: !opts
+      | 'm' -> opts := `MULTILINE :: !opts
+      | 's' -> opts := `DOTALL :: !opts
+      | _ -> ()) flags;
+    !opts
+  in
+  (* Compile [source] under [flags]; a pattern that does not compile is
+     reported as an Eval_error. *)
+  let make_regexp_value source flags =
+    let opts = parse_re_flags flags in
+    try
+      let compiled = Re.compile (Re.Pcre.re ~flags:opts source) in
+      SxRegexp (source, flags, compiled)
+    with
+    (* Resource exhaustion is not a pattern error; let it propagate. *)
+    | (Out_of_memory | Stack_overflow) as e -> raise e
+    | _ -> raise (Eval_error ("make-regexp: invalid pattern: " ^ source))
+  in
+  (* Build the match dict for group set [g]: "match", "start", "end", "input"
+     and "groups" (capture groups 1..n; a group that took no part is ""). *)
+  let match_dict g input =
+    let d = Hashtbl.create 4 in
+    Hashtbl.replace d "match" (String (Re.Group.get g 0));
+    Hashtbl.replace d "start" (Integer (Re.Group.start g 0));
+    Hashtbl.replace d "end" (Integer (Re.Group.stop g 0));
+    Hashtbl.replace d "input" (String input);
+    let count = Re.Group.nb_groups g in
+    let groups = ref [] in
+    for i = count - 1 downto 1 do
+      let s = try Re.Group.get g i with Not_found -> "" in
+      groups := String s :: !groups
+    done;
+    Hashtbl.replace d "groups" (List !groups);
+    Dict d
+  in
+  (* Expand a replacement template against match [g]: $& is the whole match,
+     $1..$9 a capture group, $$ a literal dollar. $0 and references to groups
+     the pattern does not define are copied through unchanged, matching
+     String.prototype.replace on the JavaScript host. Any other $x is copied as is. *)
+  let expand_template replacement g =
+    let n = String.length replacement in
+    let buf = Buffer.create n in
+    let i = ref 0 in
+    while !i < n do
+      let c = replacement.[!i] in
+      if c = '$' && !i + 1 < n then
+        (match replacement.[!i + 1] with
+         | '&' -> Buffer.add_string buf (Re.Group.get g 0); i := !i + 2
+         | '$' -> Buffer.add_char buf '$'; i := !i + 2
+         | d when d >= '1' && d <= '9'
+                  && Char.code d - Char.code '0' < Re.Group.nb_groups g ->
+           let idx = Char.code d - Char.code '0' in
+           (* A defined group that took no part in the match expands to "". *)
+           (try Buffer.add_string buf (Re.Group.get g idx) with Not_found -> ());
+           i := !i + 2
+         | _ -> Buffer.add_char buf c; incr i)
+      else (Buffer.add_char buf c; incr i)
+    done;
+    Buffer.contents buf
+  in
+  register "make-regexp" (fun args ->
+    match args with
+    | [String src] -> make_regexp_value src ""
+    | [String src; String flags] -> make_regexp_value src flags
+    | _ -> raise (Eval_error "make-regexp: (pattern [flags])"));
+  register "regexp?" (fun args ->
+    match args with
+    | [SxRegexp _] -> Bool true
+    | [_] -> Bool false
+    | _ -> raise (Eval_error "regexp?: 1 arg"));
+  register "regexp-source" (fun args ->
+    match args with
+    | [SxRegexp (src, _, _)] -> String src
+    | _ -> raise (Eval_error "regexp-source: expected regexp"));
+  register "regexp-flags" (fun args ->
+    match args with
+    | [SxRegexp (_, flags, _)] -> String flags
+    | _ -> raise (Eval_error "regexp-flags: expected regexp"));
+  register "regexp-match" (fun args ->
+    match args with
+    | [SxRegexp (_, _, re); String s] ->
+      (match Re.exec_opt re s with
+       | None -> Nil
+       | Some g -> match_dict g s)
+    | _ -> raise (Eval_error "regexp-match: (regexp string)"));
+  register "regexp-match-all" (fun args ->
+    match args with
+    | [SxRegexp (_, _, re); String s] ->
+      List (List.map (fun g -> match_dict g s) (Re.all re s))
+    | _ -> raise (Eval_error "regexp-match-all: (regexp string)"));
+  register "regexp-replace" (fun args ->
+    match args with
+    | [SxRegexp (_, _, re); String s; String replacement] ->
+      (* ~all:false limits the substitution to the first match. *)
+      String (Re.replace ~all:false re ~f:(expand_template replacement) s)
+    | _ -> raise (Eval_error "regexp-replace: (regexp string replacement)"));
+  register "regexp-replace-all" (fun args ->
+    match args with
+    | [SxRegexp (_, _, re); String s; String replacement] ->
+      String (Re.replace re ~f:(expand_template replacement) s)
+    | _ -> raise (Eval_error "regexp-replace-all: (regexp string replacement)"));
+  register "regexp-split" (fun args ->
+    match args with
+    | [SxRegexp (_, _, re); String s] ->
+      List (List.map (fun x -> String x) (Re.split re s))
+    | _ -> raise (Eval_error "regexp-split: (regexp string)"));
   (* Bitwise operations *)
   register "bitwise-and" (fun args ->
     match args with
diff --git a/hosts/ocaml/lib/sx_types.ml b/hosts/ocaml/lib/sx_types.ml
index 5f4f3ccd..cb1360b3 100644
--- a/hosts/ocaml/lib/sx_types.ml
+++ b/hosts/ocaml/lib/sx_types.ml
@@ -80,6 +80,7 @@ and value =
   | Port of sx_port (** String port — input (string cursor) or output (buffer). *)
   | Rational of int * int (** Exact rational: numerator, denominator (reduced, denom>0). *)
   | SxSet of (string, value) Hashtbl.t (** Mutable set keyed by inspect(value). *)
+  | SxRegexp of string * string * Re.re (** Regexp: source, flags, compiled. *)
 
 (** String input port: source string + mutable cursor position. *)
 and sx_port_kind =
@@ -516,6 +517,7 @@ let type_of = function
   | Port { sp_kind = PortOutput _; _ } -> "output-port"
   | Rational _ -> "rational"
   | SxSet _ -> "set"
+  | SxRegexp _ -> "regexp"
 
 let is_nil = function Nil -> true | _ -> false
 let is_lambda = function Lambda _ -> true | _ -> false
@@ -879,3 +881,4 @@ let rec inspect = function
     Printf.sprintf "" (Buffer.length buf) (if sp_closed then ":closed" else "")
   | Rational (n, d) -> Printf.sprintf "%d/%d" n d
   | SxSet ht -> Printf.sprintf "" (Hashtbl.length ht)
+  | SxRegexp (src, flags, _) -> Printf.sprintf "#/%s/%s" src flags
diff --git a/shared/static/scripts/sx-browser.js b/shared/static/scripts/sx-browser.js
index 40929353..eb83c190 100644
--- a/shared/static/scripts/sx-browser.js
+++ b/shared/static/scripts/sx-browser.js
@@ -41,7 +41,7 @@
   // =========================================================================
 
   var NIL = Object.freeze({ _nil: true, toString: function() { return "nil"; } });
-  var SX_VERSION = "2026-05-01T18:42:40Z";
+  var SX_VERSION = "2026-05-01T18:54:28Z";
 
   function isNil(x) { return x === NIL || x === null || x === undefined; }
   function isSxTruthy(x) { return x !== false && !isNil(x); }
@@ -185,6 +185,7 @@
     if (x._string_buffer) return "string-buffer";
     if (x._hash_table) return "hash-table";
     if (x._sxset) return "set";
+    if (x._regexp) return "regexp";
     if (x._rational) return "rational";
     if (typeof Node !== "undefined" && x instanceof Node) return "dom-node";
     if (Array.isArray(x)) return "list";
@@ -1097,6 +1098,87 @@
   };
 
+
+  // stdlib.regexp — native JS RegExp wrappers
+  function SxRegexp(source, flags) {
+    this._regexp = true;
+    this.source = source;
+    this.flags = flags || "";
+  }
+  // Compile on first use and cache on the SxRegexp. Only the i/m/s flags
+  // are forwarded to RegExp; other flag characters are ignored.
+  function sxRxCompile(rx) {
+    if (!rx._compiled) {
+      var jsFlags = "";
+      if (rx.flags.indexOf("i") >= 0) jsFlags += "i";
+      if (rx.flags.indexOf("m") >= 0) jsFlags += "m";
+      if (rx.flags.indexOf("s") >= 0) jsFlags += "s";
+      rx._compiled = new RegExp(rx.source, jsFlags);
+    }
+    return rx._compiled;
+  }
+  // Fresh global copy for match-all / replace-all, so lastIndex state
+  // never lands on the cached regexp.
+  function sxRxGlobal(rx) {
+    var compiled = sxRxCompile(rx);
+    return new RegExp(compiled.source, "g" + compiled.flags);
+  }
+  // Match dict for a RegExp match array (NIL when there is no match).
+  // Groups that did not participate are reported as "".
+  function sxRxMatchDict(m, input) {
+    if (!m) return NIL;
+    var groups = [];
+    for (var i = 1; i < m.length; i++) groups.push(m[i] !== undefined ? m[i] : "");
+    return {"match": m[0], "start": m.index, "end": m.index + m[0].length,
+            "input": input, "groups": groups};
+  }
+  PRIMITIVES["make-regexp"] = function(src, flags) {
+    var rx = new SxRegexp(src, flags || "");
+    // Compile eagerly so an invalid pattern is rejected here, with the
+    // same message as the OCaml host, instead of at first use.
+    try {
+      sxRxCompile(rx);
+    } catch (e) {
+      throw new Error("make-regexp: invalid pattern: " + src);
+    }
+    return rx;
+  };
+  PRIMITIVES["regexp?"] = function(v) { return v instanceof SxRegexp; };
+  PRIMITIVES["regexp-source"] = function(rx) { return rx.source; };
+  PRIMITIVES["regexp-flags"] = function(rx) { return rx.flags; };
+  PRIMITIVES["regexp-match"] = function(rx, s) {
+    // The cached regexp is never global, so match() yields the first match
+    // (with index and groups) and keeps no lastIndex state.
+    return sxRxMatchDict(s.match(sxRxCompile(rx)), s);
+  };
+  PRIMITIVES["regexp-match-all"] = function(rx, s) {
+    var re = sxRxGlobal(rx);
+    var results = [], m;
+    while ((m = re.exec(s)) !== null) {
+      results.push(sxRxMatchDict(m, s));
+      // An empty match does not advance lastIndex; step past it by hand.
+      if (m[0].length === 0) re.lastIndex++;
+    }
+    return results;
+  };
+  PRIMITIVES["regexp-replace"] = function(rx, s, replacement) {
+    return s.replace(sxRxCompile(rx), replacement);
+  };
+  PRIMITIVES["regexp-replace-all"] = function(rx, s, replacement) {
+    return s.replace(sxRxGlobal(rx), replacement);
+  };
+  PRIMITIVES["regexp-split"] = function(rx, s) {
+    var re = sxRxCompile(rx);
+    var parts = s.split(re);
+    // split() splices every capture group (possibly undefined) in after
+    // each chunk, so chunks sit at multiples of (group count + 1).
+    // Appending an empty alternative makes exec("") always succeed, and
+    // the length of that match array is exactly that stride.
+    var stride = new RegExp(re.source + "|", re.flags).exec("").length;
+    if (stride === 1) return parts;
+    return parts.filter(function(chunk, i) { return i % stride === 0; });
+  };
+
 
   // stdlib.sets — structural sets keyed by write-to-string serialization
   function SxSet() { this.data = new Map(); this._sxset = true; }
   SxSet.prototype._type = "set";
diff --git a/spec/primitives.sx b/spec/primitives.sx
index 698396b6..59306a18 100644
--- a/spec/primitives.sx
+++ b/spec/primitives.sx
@@ -1196,3 +1196,59 @@
   :params (s fn)
   :returns "set"
   :doc "New set of results of (fn val) for each element in s.")
+
+(define-module :stdlib.regexp)
+
+(define-primitive
+  "make-regexp"
+  :params ((pattern :as string) &rest (flags :as string))
+  :returns "regexp"
+  :doc "Compile regexp from pattern string and optional flags string (\"i\" case-insensitive, \"m\" multiline, \"s\" dotall).")
+
+(define-primitive
+  "regexp?"
+  :params (v)
+  :returns "boolean"
+  :doc "True if v is a compiled regexp.")
+
+(define-primitive
+  "regexp-source"
+  :params ((re :as regexp))
+  :returns "string"
+  :doc "Pattern string of a regexp.")
+
+(define-primitive
+  "regexp-flags"
+  :params ((re :as regexp))
+  :returns "string"
+  :doc "Flags string of a regexp.")
+
+(define-primitive
+  "regexp-match"
+  :params ((re :as regexp) (str :as string))
+  :returns "any"
+  :doc "First match of re in str. Returns {:match \"...\" :start N :end N :input str :groups (...)} or nil.")
+
+(define-primitive
+  "regexp-match-all"
+  :params ((re :as regexp) (str :as string))
+  :returns "list"
+  :doc "All non-overlapping matches of re in str as a list of match dicts.")
+
+(define-primitive
+  "regexp-replace"
+  :params ((re :as regexp) (str :as string) (replacement :as string))
+  :returns "string"
+  :doc "Replace first match of re in str with replacement.
$& = whole match, $1..$9 = groups, $$ = literal $.")
+
+(define-primitive
+  "regexp-replace-all"
+  :params ((re :as regexp) (str :as string) (replacement :as string))
+  :returns "string"
+  :doc "Replace all matches of re in str with replacement (same $ substitutions as regexp-replace).")
+
+(define-primitive
+  "regexp-split"
+  :params ((re :as regexp) (str :as string))
+  :returns "list"
+  :doc "Split str on every match of re; returns list of strings.")
diff --git a/spec/tests/test-regexp.sx b/spec/tests/test-regexp.sx
new file mode 100644
index 00000000..f883a47e
--- /dev/null
+++ b/spec/tests/test-regexp.sx
@@ -0,0 +1,216 @@
+;; ==========================================================================
+;; test-regexp.sx — Tests for regexp primitives
+;; ==========================================================================
+
+;; --------------------------------------------------------------------------
+;; make-regexp / regexp?
+;; --------------------------------------------------------------------------
+
+(defsuite
+  "regexp:create"
+  (deftest "make-regexp returns regexp" (assert (regexp? (make-regexp "abc"))))
+  (deftest
+    "make-regexp with flags"
+    (assert (regexp? (make-regexp "[a-z]+" "i"))))
+  (deftest "regexp? true for regexp" (assert (regexp? (make-regexp "x"))))
+  (deftest "regexp? false for string" (assert (not (regexp? "abc"))))
+  (deftest "regexp? false for nil" (assert (not (regexp? nil))))
+  (deftest
+    "regexp-source"
+    (assert= (regexp-source (make-regexp "hello")) "hello"))
+  (deftest
+    "regexp-flags"
+    (assert= (regexp-flags (make-regexp "x" "im")) "im"))
+  (deftest
+    "regexp-flags empty string"
+    (assert= (regexp-flags (make-regexp "x")) "")))
+
+;; --------------------------------------------------------------------------
+;; regexp-match — basic
+;; --------------------------------------------------------------------------
+
+(defsuite
+  "regexp:match"
+  (deftest
+    "match returns dict"
+    (let
+      ((m (regexp-match (make-regexp "hel+o") "hello world")))
+      (assert (dict? m))))
+  (deftest
+    "match :match key"
+    (let
+      ((m (regexp-match (make-regexp "hel+o") "say hello")))
+      (assert= (get m "match") "hello")))
+  (deftest
+    "match :start key"
+    (let
+      ((m (regexp-match (make-regexp "lo") "hello")))
+      (assert= (get m "start") 3)))
+  (deftest
+    "match :end key"
+    (let
+      ((m (regexp-match (make-regexp "lo") "hello")))
+      (assert= (get m "end") 5)))
+  (deftest
+    "match :input key"
+    (let
+      ((m (regexp-match (make-regexp "lo") "hello")))
+      (assert= (get m "input") "hello")))
+  (deftest
+    "no match returns nil"
+    (assert-nil (regexp-match (make-regexp "xyz") "hello")))
+  (deftest
+    "match at start"
+    (let
+      ((m (regexp-match (make-regexp "^hel") "hello")))
+      (assert= (get m "start") 0)))
+  (deftest
+    "match digit pattern"
+    (let
+      ((m (regexp-match (make-regexp "[0-9]+") "abc 123 def")))
+      (assert= (get m "match") "123"))))
+
+;; --------------------------------------------------------------------------
+;; regexp-match — groups
+;; --------------------------------------------------------------------------
+
+(defsuite
+  "regexp:groups"
+  (deftest
+    "no capture groups → empty list"
+    (let
+      ((m (regexp-match (make-regexp "hello") "hello world")))
+      (assert= (length (get m "groups")) 0)))
+  (deftest
+    "one capture group"
+    (let
+      ((m (regexp-match (make-regexp "([0-9]+)") "price: 42")))
+      (assert= (first (get m "groups")) "42")))
+  (deftest
+    "two capture groups"
+    (let
+      ((m (regexp-match (make-regexp "([a-z]+)=([0-9]+)") "x=10")))
+      (let
+        ((gs (get m "groups")))
+        (assert
+          (and (= (first gs) "x") (= (first (rest gs)) "10")))))))
+
+;; --------------------------------------------------------------------------
+;; regexp-match-all
+;; --------------------------------------------------------------------------
+
+(defsuite
+  "regexp:match-all"
+  (deftest
+    "match-all returns list"
+    (let
+      ((ms (regexp-match-all (make-regexp "[0-9]+") "1 and 2 and 3")))
+      (assert (list? ms))))
+  (deftest
+    "match-all count"
+    (assert=
+      (length (regexp-match-all (make-regexp "[0-9]+") "1 and 2 and 3"))
+      3))
+  (deftest
+    "match-all first match"
+    (let
+      ((ms (regexp-match-all (make-regexp "[0-9]+") "10 20 30")))
+      (assert= (get (first ms) "match") "10")))
+  (deftest
+    "match-all empty when no match"
+    (assert=
+      (length (regexp-match-all (make-regexp "xyz") "hello"))
+      0)))
+
+;; --------------------------------------------------------------------------
+;; regexp-replace / regexp-replace-all
+;; --------------------------------------------------------------------------
+
+(defsuite
+  "regexp:replace"
+  (deftest
+    "replace first match"
+    (assert= (regexp-replace (make-regexp "o+") "foobar boo" "0") "f0bar boo"))
+  (deftest
+    "replace no match returns original"
+    (assert= (regexp-replace (make-regexp "xyz") "hello" "X") "hello"))
+  (deftest
+    "replace-all all matches"
+    (assert= (regexp-replace-all (make-regexp "o") "foo boo" "0") "f00 b00"))
+  (deftest
+    "replace with $& (whole match)"
+    (assert=
+      (regexp-replace (make-regexp "[0-9]+") "price 42" "[$&]")
+      "price [42]"))
+  (deftest
+    "replace with $1..$9 (groups)"
+    (assert=
+      (regexp-replace (make-regexp "([a-z]+)=([0-9]+)") "x=10" "$2=$1")
+      "10=x"))
+  (deftest
+    "replace with $$ (literal dollar)"
+    (assert=
+      (regexp-replace (make-regexp "[0-9]+") "cost 5" "$$$&")
+      "cost $5"))
+  (deftest
+    "replace-all expands $& per match"
+    (assert=
+      (regexp-replace-all (make-regexp "[0-9]") "a1b2" "[$&]")
+      "a[1]b[2]"))
+  (deftest
+    "replace-all removes digits"
+    (assert=
+      (regexp-replace-all (make-regexp "[0-9]") "a1b2c3" "")
+      "abc")))
+
+;; --------------------------------------------------------------------------
+;; regexp-split
+;; --------------------------------------------------------------------------
+
+(defsuite
+  "regexp:split"
+  (deftest
+    "split on whitespace"
+    (let
+      ((parts (regexp-split (make-regexp " +") "hello world foo")))
+      (assert= (length parts) 3)))
+  (deftest
+    "split first part"
+    (let
+      ((parts (regexp-split (make-regexp ",") "a,b,c")))
+      (assert= (first parts) "a")))
+  (deftest
+    "split last part"
+    (let
+      ((parts (regexp-split (make-regexp ",") "a,b,c")))
+      (assert= (first (rest (rest parts))) "c")))
+  (deftest
+    "split no match → single element"
+    (let
+      ((parts (regexp-split (make-regexp ",") "hello")))
+      (assert= (length parts) 1))))
+
+;; --------------------------------------------------------------------------
+;; flags
+;; --------------------------------------------------------------------------
+
+(defsuite
+  "regexp:flags"
+  (deftest
+    "case-insensitive flag"
+    (let
+      ((m (regexp-match (make-regexp "HELLO" "i") "hello world")))
+      (assert (not (nil? m)))))
+  (deftest
+    "case-sensitive without flag"
+    (assert-nil (regexp-match (make-regexp "HELLO") "hello world")))
+  (deftest
+    "dotall flag lets . match newline"
+    (let
+      ((m (regexp-match (make-regexp "a.b" "s") "a\nb")))
+      (assert (not (nil? m)))))
+  (deftest
+    "multiline ^ matches line starts"
+    (let
+      ((ms (regexp-match-all (make-regexp "^[a-z]" "m") "a\nb\nc")))
+      (assert= (length ms) 3))))