From 3759aad7a640f39afddaf682d47f9b5539f361dc Mon Sep 17 00:00:00 2001 From: giles Date: Sat, 9 May 2026 19:47:21 +0000 Subject: [PATCH] ocaml: phase 5.1 anagram_groups.ml baseline (group by canonical anagram, 3 groups) canonical builds a sorted-letter string representation: let canonical s = let chars = Array.make 26 0 in for i = 0 to String.length s - 1 do let k = Char.code s.[i] - Char.code 'a' in if k >= 0 && k < 26 then chars.(k) <- chars.(k) + 1 done; expand into a-z order via a Buffer For 'eat', 'tea', 'ate' -> all canonicalise to 'aet'. For 'tan', 'nat' -> 'ant'. For 'bat' -> 'abt'. group_anagrams iterates over the input, accumulating per-key string lists; final answer is Hashtbl.length (number of distinct groups): ['eat'; 'tea'; 'tan'; 'ate'; 'nat'; 'bat'] -> 3 groups 99 baseline programs total. --- lib/ocaml/baseline/anagram_groups.ml | 29 ++++++++++++++++++++++++++++ lib/ocaml/baseline/expected.json | 1 + plans/ocaml-on-sx.md | 8 ++++++++ 3 files changed, 38 insertions(+) create mode 100644 lib/ocaml/baseline/anagram_groups.ml diff --git a/lib/ocaml/baseline/anagram_groups.ml b/lib/ocaml/baseline/anagram_groups.ml new file mode 100644 index 00000000..eab32fe5 --- /dev/null +++ b/lib/ocaml/baseline/anagram_groups.ml @@ -0,0 +1,29 @@ +let canonical s = + let chars = Array.make 26 0 in + for i = 0 to String.length s - 1 do + let k = Char.code s.[i] - Char.code 'a' in + if k >= 0 && k < 26 then chars.(k) <- chars.(k) + 1 + done; + let buf = Buffer.create 26 in + for i = 0 to 25 do + for _ = 1 to chars.(i) do + Buffer.add_string buf (String.make 1 (Char.chr (i + Char.code 'a'))) + done + done; + Buffer.contents buf + +let group_anagrams xs = + let h = Hashtbl.create 8 in + List.iter (fun s -> + let k = canonical s in + let cur = match Hashtbl.find_opt h k with + | Some xs -> xs + | None -> [] + in + Hashtbl.replace h k (s :: cur) + ) xs; + Hashtbl.length h + +;; + +group_anagrams ["eat"; "tea"; "tan"; "ate"; "nat"; "bat"] diff --git 
a/lib/ocaml/baseline/expected.json b/lib/ocaml/baseline/expected.json index 7a5bb436..bc8170b1 100644 --- a/lib/ocaml/baseline/expected.json +++ b/lib/ocaml/baseline/expected.json @@ -2,6 +2,7 @@ "ackermann.ml": 125, "adler32.ml": 300286872, "anagram_check.ml": 2, + "anagram_groups.ml": 3, "anagrams.ml": 3, "atm.ml": 120, "bag.ml": 3, diff --git a/plans/ocaml-on-sx.md b/plans/ocaml-on-sx.md index e73aeb8b..3c78bdef 100644 --- a/plans/ocaml-on-sx.md +++ b/plans/ocaml-on-sx.md @@ -407,6 +407,14 @@ _Newest first._ binary search tree (`type 'a tree = Leaf | Node of 'a * 'a tree * 'a tree`) with insert + in-order traversal. Tests parametric ADT, recursive match, List.append, List.fold_left. +- 2026-05-09 Phase 5.1 — anagram_groups.ml baseline (group strings + by canonical anagram form, ["eat";"tea";"tan";"ate";"nat";"bat"] + has 3 groups). canonical builds a sorted-letter string + representation: count letters, then expand into a-z order. Used + as Hashtbl key. group_anagrams iterates over the input list, accumulating + per-key string lists; final answer is Hashtbl.length (number of + distinct groups). Tests count-then-expand canonical pattern + + Hashtbl as multimap. 99 baseline programs total. - 2026-05-09 Phase 5.1 — monotonic.ml baseline (monotonicity check, 4/5 inputs monotonic). Tracks two bool refs (inc, dec). Each pair of consecutive elements: if `h < prev` clear `inc`, if `h > prev`