sx primitives: add regex-* (Re + Re.Pcre backed)

Adds regex-compile/test/exec/match-all/replace/replace-fn/split/source/flags.
Opaque dict handle {:__regex__ true :id :source :flags}; compiled Re.re
cached in a primitives-local table. Replacement supports $&, $1-$9, $$.
Flags: i (CASELESS), m (MULTILINE), s (DOTALL). g is a runtime flag handled
in replace. u (unicode) skipped for now.

Unblocks js-on-sx's regex-platform-override! hook — the JS RegExp shim can
now delegate to real regex instead of the substring stub.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-24 15:12:13 +00:00
parent 4be90bf21f
commit 81022784bc

View File

@@ -1696,4 +1696,179 @@ let () =
| [thunk] ->
(try !Sx_types._cek_call_ref thunk Nil
with _ -> Nil)
| _ -> Nil)
| _ -> Nil);
(* === Regex === wrapping Re + Re.Pcre *)
let regex_table : (int, Re.re * string * string) Hashtbl.t = Hashtbl.create 32 in
let regex_next_id = ref 0 in
let parse_flags flags =
let opts = ref [] in
String.iter (function
| 'i' -> opts := `CASELESS :: !opts
| 'm' -> opts := `MULTILINE :: !opts
| 's' -> opts := `DOTALL :: !opts
| _ -> ()) flags;
!opts
in
let make_regex_value id source flags =
let d = Hashtbl.create 4 in
Hashtbl.replace d "__regex__" (Bool true);
Hashtbl.replace d "id" (Number (float_of_int id));
Hashtbl.replace d "source" (String source);
Hashtbl.replace d "flags" (String flags);
Dict d
in
let regex_of_value = function
| Dict d ->
(match Hashtbl.find_opt d "id" with
| Some (Number n) ->
(match Hashtbl.find_opt regex_table (int_of_float n) with
| Some r -> r
| None -> raise (Eval_error "regex: handle not found"))
| _ -> raise (Eval_error "regex: missing id"))
| _ -> raise (Eval_error "regex: expected regex dict")
in
let group_to_dict g input =
let d = Hashtbl.create 4 in
Hashtbl.replace d "match" (String (Re.Group.get g 0));
Hashtbl.replace d "index" (Number (float_of_int (Re.Group.start g 0)));
Hashtbl.replace d "input" (String input);
let count = Re.Group.nb_groups g in
let groups = ref [] in
for i = count - 1 downto 1 do
let s = try Re.Group.get g i with Not_found -> "" in
groups := String s :: !groups
done;
Hashtbl.replace d "groups" (List !groups);
Dict d
in
register "regex-compile" (fun args ->
match args with
| [String source; String flags] | [String source; String flags; _] ->
let opts = parse_flags flags in
(try
let re = Re.compile (Re.Pcre.re ~flags:opts source) in
let id = !regex_next_id in
incr regex_next_id;
Hashtbl.replace regex_table id (re, source, flags);
make_regex_value id source flags
with _ -> raise (Eval_error ("regex-compile: invalid pattern " ^ source)))
| [String source] ->
(try
let re = Re.compile (Re.Pcre.re source) in
let id = !regex_next_id in
incr regex_next_id;
Hashtbl.replace regex_table id (re, source, "");
make_regex_value id source ""
with _ -> raise (Eval_error ("regex-compile: invalid pattern " ^ source)))
| _ -> raise (Eval_error "regex-compile: (source flags)"));
register "regex-test" (fun args ->
match args with
| [rx; String s] ->
let (re, _, _) = regex_of_value rx in
Bool (Re.execp re s)
| _ -> raise (Eval_error "regex-test: (regex string)"));
register "regex-exec" (fun args ->
let (rx, s, start) = match args with
| [rx; String s] -> (rx, s, 0)
| [rx; String s; Number n] -> (rx, s, int_of_float n)
| _ -> raise (Eval_error "regex-exec: (regex string start?)")
in
let (re, _, _) = regex_of_value rx in
try
let g = Re.exec ~pos:start re s in
group_to_dict g s
with Not_found -> Nil);
register "regex-match-all" (fun args ->
match args with
| [rx; String s] ->
let (re, _, _) = regex_of_value rx in
let all = Re.all re s in
List (List.map (fun g -> group_to_dict g s) all)
| _ -> raise (Eval_error "regex-match-all: (regex string)"));
register "regex-replace" (fun args ->
match args with
| [rx; String s; String replacement] ->
let (re, _, flags) = regex_of_value rx in
let expand g =
let buf = Buffer.create (String.length replacement) in
let i = ref 0 in
let n = String.length replacement in
while !i < n do
let c = replacement.[!i] in
if c = '$' && !i + 1 < n then
(match replacement.[!i + 1] with
| '&' -> Buffer.add_string buf (Re.Group.get g 0); i := !i + 2
| '$' -> Buffer.add_char buf '$'; i := !i + 2
| c when c >= '0' && c <= '9' ->
let idx = Char.code c - Char.code '0' in
(try Buffer.add_string buf (Re.Group.get g idx) with Not_found -> ());
i := !i + 2
| _ -> Buffer.add_char buf c; incr i)
else (Buffer.add_char buf c; incr i)
done;
Buffer.contents buf
in
let global = String.contains flags 'g' in
if global then
String (Re.replace re ~f:expand s)
else
(match Re.exec_opt re s with
| None -> String s
| Some g ->
let repl = expand g in
let before = String.sub s 0 (Re.Group.start g 0) in
let after_start = Re.Group.stop g 0 in
let after = String.sub s after_start (String.length s - after_start) in
String (before ^ repl ^ after))
| _ -> raise (Eval_error "regex-replace: (regex string replacement)"));
register "regex-replace-fn" (fun args ->
match args with
| [rx; String s; f] ->
let (re, _, flags) = regex_of_value rx in
let call_fn g =
let match_str = Re.Group.get g 0 in
let count = Re.Group.nb_groups g in
let groups_before = ref [] in
for i = count - 1 downto 1 do
let v = try String (Re.Group.get g i) with Not_found -> Nil in
groups_before := v :: !groups_before
done;
let idx = Number (float_of_int (Re.Group.start g 0)) in
let all_args = [String match_str] @ !groups_before @ [idx; String s] in
match !Sx_types._cek_call_ref f (List all_args) with
| String s -> s
| Number n -> Sx_types.format_number n
| v -> Sx_types.inspect v
in
let global = String.contains flags 'g' in
if global then
String (Re.replace re ~f:call_fn s)
else
(match Re.exec_opt re s with
| None -> String s
| Some g ->
let repl = call_fn g in
let before = String.sub s 0 (Re.Group.start g 0) in
let after_start = Re.Group.stop g 0 in
let after = String.sub s after_start (String.length s - after_start) in
String (before ^ repl ^ after))
| _ -> raise (Eval_error "regex-replace-fn: (regex string fn)"));
register "regex-split" (fun args ->
match args with
| [rx; String s] ->
let (re, _, _) = regex_of_value rx in
List (List.map (fun x -> String x) (Re.split re s))
| _ -> raise (Eval_error "regex-split: (regex string)"));
register "regex-source" (fun args ->
match args with
| [rx] ->
let (_, source, _) = regex_of_value rx in
String source
| _ -> raise (Eval_error "regex-source: (regex)"));
register "regex-flags" (fun args ->
match args with
| [rx] ->
let (_, _, flags) = regex_of_value rx in
String flags
| _ -> raise (Eval_error "regex-flags: (regex)"))