diff --git a/lib/ocaml/baseline/expected.json b/lib/ocaml/baseline/expected.json index 6a9c8c82..6b361776 100644 --- a/lib/ocaml/baseline/expected.json +++ b/lib/ocaml/baseline/expected.json @@ -122,6 +122,7 @@ "quickselect.ml": 5, "quicksort.ml": 44, "roman.ml": 44, + "rolling_hash.ml": 6, "reverse_int.ml": 54329, "rpn.ml": 9, "run_decode.ml": 21, diff --git a/lib/ocaml/baseline/rolling_hash.ml b/lib/ocaml/baseline/rolling_hash.ml new file mode 100644 index 00000000..2b370cf9 --- /dev/null +++ b/lib/ocaml/baseline/rolling_hash.ml @@ -0,0 +1,38 @@ +let mod_p = 1000003 +let base = 257 + +let rolling_match text pat = + let n = String.length text in + let m = String.length pat in + if m > n then 0 + else begin + let pat_hash = ref 0 in + let win_hash = ref 0 in + let high = ref 1 in + for _ = 0 to m - 2 do + high := (!high * base) mod mod_p + done; + for i = 0 to m - 1 do + pat_hash := (!pat_hash * base + Char.code pat.[i]) mod mod_p; + win_hash := (!win_hash * base + Char.code text.[i]) mod mod_p + done; + let count = ref 0 in + for i = 0 to n - m do + if !win_hash = !pat_hash then begin + let ok = ref true in + for j = 0 to m - 1 do + if text.[i + j] <> pat.[j] then ok := false + done; + if !ok then count := !count + 1 + end; + if i < n - m then begin + let drop = (Char.code text.[i] * !high) mod mod_p in + win_hash := ((!win_hash - drop + mod_p) * base + Char.code text.[i + m]) mod mod_p + end + done; + !count + end + +;; + +rolling_match "abcabcabcabcabcabc" "abc" diff --git a/plans/ocaml-on-sx.md b/plans/ocaml-on-sx.md index eb1d0725..97c7934e 100644 --- a/plans/ocaml-on-sx.md +++ b/plans/ocaml-on-sx.md @@ -407,6 +407,15 @@ _Newest first._ binary search tree (`type 'a tree = Leaf | Node of 'a * 'a tree * 'a tree`) with insert + in-order traversal. Tests parametric ADT, recursive match, List.append, List.fold_left. +- 2026-05-10 Phase 5.1 — rolling_hash.ml baseline (Rabin-Karp + rolling hash for substring matching, count "abc" in + "abcabcabcabcabcabc" = 6). 
Polynomial hash mod 1000003 with + base 257; precompute base^(m-1), then slide the window, updating the + hash in O(1) per step. On a hash match, verify with an O(m) + character-by-character comparison to rule out false positives. Tests `for _ = 0 to m - 2 do … done` + with an unused loop variable, char-code arithmetic, modular subtraction that adds mod_p before `mod` + so the intermediate never goes negative, complex nested if/begin/end branching. + 151 baseline programs total. - 2026-05-10 Phase 5.1 — huffman.ml baseline (Huffman tree weighted path length on letters {(5,a) (9,b) (12,c) (13,d) (16,e) (45,f)} = 224). Builds optimal prefix code by repeatedly merging the two