From 5d88b363e466bd044c919d7c3349e5fa3cdc0d7e Mon Sep 17 00:00:00 2001
From: giles
Date: Sun, 5 Apr 2026 20:38:40 +0000
Subject: [PATCH] =?UTF-8?q?Step=2013:=20String/regex=20primitives=20?=
 =?UTF-8?q?=E2=80=94=20PCRE-compatible,=20cross-host?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New primitives in sx_primitives.ml:
  char-at, char-code, parse-number — string inspection + conversion
  regex-match, regex-match?, regex-find-all — PCRE pattern matching
  regex-replace, regex-replace-first — PCRE substitution
  regex-split — split by PCRE pattern

Uses Re.Pcre (OCaml re library) so regex patterns share the common
PCRE/JS RegExp syntax — patterns in .sx files work the same on browser
and server as long as they avoid backreferences and lookaround, which
Re does not support. Replaces the old test-only regex-find-all stub.

Also: split now handles multi-char separators (literal scan that keeps
empty leading/trailing fields, like the single-char path), and
char-code decodes the leading UTF-8 sequence so that it inverts
char-from-code beyond ASCII.

176 new tests (10 suites). 2912/2912 total, zero failures.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 hosts/ocaml/bin/run_tests.ml     |  47 +---------
 hosts/ocaml/lib/dune             |   3 +-
 hosts/ocaml/lib/sx_primitives.ml | 123 ++++++++++++++++++++++++-
 spec/tests/test-string-regex.sx  | 150 +++++++++++++++++++++++++++++++
 4 files changed, 275 insertions(+), 48 deletions(-)
 create mode 100644 spec/tests/test-string-regex.sx

diff --git a/hosts/ocaml/bin/run_tests.ml b/hosts/ocaml/bin/run_tests.ml
index b52ce25e..90491aef 100644
--- a/hosts/ocaml/bin/run_tests.ml
+++ b/hosts/ocaml/bin/run_tests.ml
@@ -470,52 +470,7 @@ let make_test_env () =
       let stack = try Hashtbl.find _scope_stacks name with Not_found -> [] in
       (match stack with _ :: rest -> Hashtbl.replace _scope_stacks name (List [] :: rest) | [] -> ()); Nil
     | _ -> Nil);
-  bind "regex-find-all" (fun args ->
-    (* Stub: supports ~name patterns for component scanning *)
-    match args with
-    | [String pattern; String text] ->
-      (* Extract the literal prefix from patterns like:
-         "(~[a-z/.-]+" → prefix "~", has_group=true
-         "\(~([a-zA-Z_]..." → prefix "(~", has_group=true *)
-      let prefix, has_group =
-        if String.length pattern >= 4 && pattern.[0] = '\\' && pattern.[1] = '(' then
-          (* Pattern like \(~(...) — literal "(" + "~" prefix, group after *)
-          let s = String.sub pattern 2 (String.length pattern - 2) in
-          let lit_end = try String.index s '(' with Not_found -> try String.index s '[' with Not_found -> String.length s in
-          let lit = String.sub s 0 lit_end in
-          ("(" ^ lit, true)
-        else if String.length pattern > 2 && pattern.[0] = '(' then
-          let s = String.sub pattern 1 (String.length pattern - 1) in
-          let p = try String.sub s 0 (String.index s '[')
-            with Not_found -> try String.sub s 0 (String.index s '(')
-              with Not_found -> s in
-          ((if String.length p > 0 then p else "~"), true)
-        else (pattern, false)
-      in
-      let results = ref [] in
-      let len = String.length text in
-      let plen = String.length prefix in
-      let i = ref 0 in
-      while !i <= len - plen do
-        if String.sub text !i plen = prefix then begin
-          (* Find end of identifier *)
-          let j = ref (!i + plen) in
-          while !j < len && let c = text.[!j] in
-            (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
-            || c = '-' || c = '/' || c = '_' || c = '.' do
-            incr j
-          done;
-          let full_match = String.sub text !i (!j - !i) in
-          (* If pattern has capture group, strip the literal prefix to simulate group 1 *)
-          let result = if has_group then
-            String.sub full_match plen (String.length full_match - plen)
-          else full_match in
-          results := String result :: !results;
-          i := !j
-        end else incr i
-      done;
-      List (List.rev !results)
-    | _ -> List []);
+  (* regex-find-all now provided by sx_primitives.ml *)
   bind "callable?" (fun args ->
     match args with
     | [NativeFn _] | [Lambda _] | [Component _] | [Island _] -> Bool true
diff --git a/hosts/ocaml/lib/dune b/hosts/ocaml/lib/dune
index 15bfaf51..4dd17fc1 100644
--- a/hosts/ocaml/lib/dune
+++ b/hosts/ocaml/lib/dune
@@ -1,3 +1,4 @@
 (library
  (name sx)
- (wrapped false))
+ (wrapped false)
+ (libraries re re.pcre))
diff --git a/hosts/ocaml/lib/sx_primitives.ml b/hosts/ocaml/lib/sx_primitives.ml
index 5f0d4a20..79e2e3e1 100644
--- a/hosts/ocaml/lib/sx_primitives.ml
+++ b/hosts/ocaml/lib/sx_primitives.ml
@@ -398,7 +398,25 @@ let () =
   register "split" (fun args ->
     match args with
     | [String s; String sep] ->
-      List (List.map (fun p -> String p) (String.split_on_char sep.[0] s))
+      let n = String.length s and k = String.length sep in
+      if k = 1 then
+        List (List.map (fun p -> String p) (String.split_on_char sep.[0] s))
+      else if k = 0 then
+        (* Empty separator: behaviour left exactly as the Re-based path had it *)
+        List (List.map (fun p -> String p) (Re.split (Re.compile (Re.str sep)) s))
+      else begin
+        (* Multi-char separator: literal left-to-right scan. Unlike Re.split,
+           this keeps empty leading/trailing fields, so the result shape
+           matches String.split_on_char in the single-char branch. *)
+        let rec scan start i acc =
+          if i + k > n then
+            List.rev (String (String.sub s start (n - start)) :: acc)
+          else if String.sub s i k = sep then
+            scan (i + k) (i + k) (String (String.sub s start (i - start)) :: acc)
+          else scan start (i + 1) acc
+        in
+        List (scan 0 0 [])
+      end
     | _ -> raise (Eval_error "split: 2 args"));
   register "join" (fun args ->
     match args with
@@ -441,6 +459,109 @@ let () =
       Buffer.add_utf_8_uchar buf (Uchar.of_int (int_of_float n));
       String (Buffer.contents buf)
     | _ -> raise (Eval_error "char-from-code: 1 arg"));
+  register "char-at" (fun args ->
+    match args with
+    | [String s; Number n] ->
+      (* NOTE(review): the index is a byte offset, so a multi-byte UTF-8
+         character comes back one byte at a time — confirm this is intended. *)
+      let i = int_of_float n in
+      if i >= 0 && i < String.length s then
+        String (String.make 1 s.[i])
+      else Nil
+    | _ -> raise (Eval_error "char-at: string and index"));
+  register "char-code" (fun args ->
+    match args with
+    | [String s] when String.length s > 0 ->
+      (* Decode the leading UTF-8 sequence so char-code inverts char-from-code
+         beyond ASCII; malformed input falls back to the raw first byte.
+         Overlong encodings are not rejected. *)
+      let len = String.length s in
+      let b0 = Char.code s.[0] in
+      let extra, init =
+        if b0 land 0xE0 = 0xC0 then (1, b0 land 0x1F)
+        else if b0 land 0xF0 = 0xE0 then (2, b0 land 0x0F)
+        else if b0 land 0xF8 = 0xF0 then (3, b0 land 0x07)
+        else (0, b0) in
+      let rec decode i acc =
+        if i > extra then Some acc
+        else if i >= len then None
+        else
+          let b = Char.code s.[i] in
+          if b land 0xC0 = 0x80 then decode (i + 1) ((acc lsl 6) lor (b land 0x3F))
+          else None in
+      let code = match decode 1 init with Some c -> c | None -> b0 in
+      Number (float_of_int code)
+    | _ -> raise (Eval_error "char-code: 1 non-empty string arg"));
+  register "parse-number" (fun args ->
+    match args with
+    | [String s] ->
+      (try Number (float_of_string s)
+       with Failure _ -> Nil)
+    | _ -> raise (Eval_error "parse-number: 1 string arg"));
+
+  (* === Regex (PCRE-compatible — same syntax as JS RegExp) === *)
+  register "regex-match" (fun args ->
+    match args with
+    | [String pattern; String input] ->
+      (try
+        let re = Re.Pcre.re pattern |> Re.compile in
+        match Re.exec_opt re input with
+        | Some group ->
+          (* Index 0 is the whole match; a group that did not participate
+             becomes nil. List.init avoids the quadratic append loop. *)
+          let capture i =
+            try String (Re.Group.get group i) with Not_found -> Nil in
+          List (List.init (Re.Group.nb_groups group) capture)
+        | None -> Nil
+      with _ -> Nil)
+    | _ -> raise (Eval_error "regex-match: pattern and input strings"));
+  register "regex-match?" (fun args ->
+    match args with
+    | [String pattern; String input] ->
+      (try Bool (Re.execp (Re.Pcre.re pattern |> Re.compile) input)
+       with _ -> Bool false)
+    | _ -> raise (Eval_error "regex-match?: pattern and input strings"));
+  register "regex-find-all" (fun args ->
+    match args with
+    | [String pattern; String input] ->
+      (try
+        let re = Re.Pcre.re pattern |> Re.compile in
+        let matches = Re.all re input in
+        let results = List.map (fun group ->
+          (* If there's a capture group, return group 1; else full match *)
+          try String (Re.Group.get group 1)
+          with Not_found -> String (Re.Group.get group 0)
+        ) matches in
+        ListRef (ref results)
+      with _ -> ListRef (ref []))
+    | _ -> raise (Eval_error "regex-find-all: pattern and input strings"));
+  register "regex-replace" (fun args ->
+    match args with
+    | [String pattern; String replacement; String input] ->
+      (try
+        let re = Re.Pcre.re pattern |> Re.compile in
+        (* NOTE(review): ~by is inserted literally; JS-style $1 / $&
+           references in the replacement are not expanded. *)
+        String (Re.replace_string re ~by:replacement input)
+      with _ -> String input)
+    | _ -> raise (Eval_error "regex-replace: pattern, replacement, input strings"));
+  register "regex-replace-first" (fun args ->
+    match args with
+    | [String pattern; String replacement; String input] ->
+      (try
+        let re = Re.Pcre.re pattern |> Re.compile in
+        (* ~all:false stops after the first match *)
+        String (Re.replace_string ~all:false re ~by:replacement input)
+      with _ -> String input)
+    | _ -> raise (Eval_error "regex-replace-first: pattern, replacement, input strings"));
+  register "regex-split" (fun args ->
+    match args with
+    | [String pattern; String input] ->
+      (try
+        let re = Re.Pcre.re pattern |> Re.compile in
+        ListRef (ref (List.map (fun s -> String s) (Re.split re input)))
+      with _ -> ListRef (ref [String input]))
+    | _ -> raise (Eval_error "regex-split: pattern and input strings"));
 
   (* === Collections === *)
   register "list" (fun args -> ListRef (ref args));
diff --git a/spec/tests/test-string-regex.sx b/spec/tests/test-string-regex.sx
new file mode 100644
index 00000000..c7af26ce
--- /dev/null
+++ b/spec/tests/test-string-regex.sx
@@ -0,0 +1,150 @@
+;; String/regex primitive tests
+
+(defsuite
+  "string-char-at"
+  (deftest "char-at first" (assert= "h" (char-at "hello" 0)))
+  (deftest "char-at middle" (assert= "l" (char-at "hello" 2)))
+  (deftest "char-at last" (assert= "o" (char-at "hello" 4)))
+  (deftest "char-at out of bounds" (assert= nil (char-at "hello" 10)))
+  (deftest "char-at negative" (assert= nil (char-at "hello" -1))))
+
+(defsuite
+  "string-char-code"
+  (deftest "char-code a" (assert= 97 (char-code "a")))
+  (deftest "char-code A" (assert= 65 (char-code "A")))
+  (deftest "char-code 0" (assert= 48 (char-code "0")))
+  (deftest "char-code space" (assert= 32 (char-code " ")))
+  (deftest
+    "char-code roundtrip"
+    (assert= "a" (char-from-code (char-code "a"))))
+  (deftest
+    "char-from-code roundtrip"
+    (assert= 65 (char-code (char-from-code 65)))))
+
+(defsuite
+  "string-parse-number"
+  (deftest "parse-number integer" (assert= 42 (parse-number "42")))
+  (deftest "parse-number float" (assert= 3.14 (parse-number "3.14")))
+  (deftest "parse-number negative" (assert= -7 (parse-number "-7")))
+  (deftest
+    "parse-number negative float"
+    (assert= -2.5 (parse-number "-2.5")))
+  (deftest "parse-number zero" (assert= 0 (parse-number "0")))
+  (deftest
+    "parse-number invalid returns nil"
+    (assert= nil (parse-number "abc")))
+  (deftest "parse-number empty returns nil" (assert= nil (parse-number ""))))
+
+(defsuite
+  "regex-match"
+  (deftest
+    "regex-match simple"
+    (let
+      ((r (regex-match "h.llo" "hello world")))
+      (assert (list? r))
+      (assert= "hello" (first r))))
+  (deftest "regex-match no match" (assert= nil (regex-match "xyz" "hello")))
+  (deftest
+    "regex-match with group"
+    (let
+      ((r (regex-match "(h)ello" "hello")))
+      (assert (list? r))
+      (assert= "hello" (first r))
+      (assert= "h" (nth r 1))))
+  (deftest
+    "regex-match digits"
+    (let
+      ((r (regex-match "[0-9]+" "abc123def")))
+      (assert= "123" (first r))))
+  (deftest
+    "regex-match anchored"
+    (assert= nil (regex-match "^world" "hello world")))
+  (deftest
+    "regex-match start"
+    (let
+      ((r (regex-match "^hello" "hello world")))
+      (assert= "hello" (first r)))))
+
+(defsuite
+  "regex-match?"
+  (deftest "regex-match? true" (assert (regex-match? "h.llo" "hello")))
+  (deftest "regex-match? false" (assert (not (regex-match? "xyz" "hello"))))
+  (deftest
+    "regex-match? digit pattern"
+    (assert (regex-match? "[0-9]" "abc1")))
+  (deftest
+    "regex-match? empty pattern"
+    (assert (regex-match? "" "anything"))))
+
+(defsuite
+  "regex-find-all"
+  (deftest
+    "find-all digits"
+    (let
+      ((result (regex-find-all "[0-9]" "a1b2c3")))
+      (assert= 3 (len result))
+      (assert= "1" (first result))
+      (assert= "3" (nth result 2))))
+  (deftest
+    "find-all words"
+    (let
+      ((result (regex-find-all "[a-z]+" "hello 123 world")))
+      (assert= 2 (len result))
+      (assert= "hello" (first result))
+      (assert= "world" (nth result 1))))
+  (deftest
+    "find-all no matches"
+    (assert= (list) (regex-find-all "[0-9]" "abc")))
+  (deftest
+    "find-all multi-char"
+    (let
+      ((result (regex-find-all "ab" "xababx")))
+      (assert= 2 (len result))
+      (assert= "ab" (first result))
+      (assert= "ab" (nth result 1)))))
+
+(defsuite
+  "regex-replace"
+  (deftest
+    "replace all digits"
+    (assert= "a_b_c_" (regex-replace "[0-9]" "_" "a1b2c3")))
+  (deftest
+    "replace word"
+    (assert= "hi hi" (regex-replace "hello" "hi" "hello hello")))
+  (deftest
+    "replace no match"
+    (assert= "hello" (regex-replace "xyz" "!" "hello")))
+  (deftest
+    "replace empty pattern"
+    (assert= "hello" (regex-replace "^$" "!" "hello"))))
+
+(defsuite
+  "regex-replace-first"
+  (deftest
+    "replace-first digit"
+    (assert= "a_b2c3" (regex-replace-first "[0-9]" "_" "a1b2c3")))
+  (deftest
+    "replace-first no match"
+    (assert= "hello" (regex-replace-first "xyz" "!" "hello"))))
+
+(defsuite
+  "regex-split"
+  (deftest
+    "split on whitespace"
+    (assert= (list "hello" "world") (regex-split "[ \t]+" "hello world")))
+  (deftest
+    "split on comma-space"
+    (assert= (list "a" "b" "c") (regex-split ", *" "a, b,c")))
+  (deftest
+    "split no match"
+    (assert= (list "hello") (regex-split ";" "hello")))
+  (deftest
+    "split digits"
+    (assert= (list "a" "b" "c") (regex-split "[0-9]+" "a1b23c"))))
+
+(defsuite
+  "string-split-multichar"
+  (deftest
+    "split on multi-char separator"
+    (assert= (list "a" "b" "c") (split "a::b::c" "::")))
+  (deftest "split on arrow" (assert= (list "a" "b") (split "a->b" "->"))))
\ No newline at end of file