From 911a2f57c07ff99bef4e986a682b31af220cb6ea Mon Sep 17 00:00:00 2001
From: giles <giles.bradshaw@rose-ash.com>
Date: Sat, 6 Jun 2026 22:50:19 +0000
Subject: [PATCH] search: stemming (suffix stripping) + 18 tests

Deterministic English suffix stripping (stem), stemText/stemTokens, indexStemmed.
Worked around two haskell-on-sx string gotchas: take/drop over a String yield
char codes (rebuild via joinChars . map chr), and isSuffixOf's reverse trips ++
(manual suffix compare). 196/196.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 lib/search/api.sx           |  6 +++--
 lib/search/conformance.conf |  2 ++
 lib/search/scoreboard.json  |  9 +++----
 lib/search/scoreboard.md    |  3 ++-
 lib/search/stem.sx          | 15 ++++++++++++
 lib/search/tests/stem.sx    | 47 +++++++++++++++++++++++++++++++++++++
 plans/search-on-sx.md       |  8 ++++++-
 7 files changed, 82 insertions(+), 8 deletions(-)
 create mode 100644 lib/search/stem.sx
 create mode 100644 lib/search/tests/stem.sx

diff --git a/lib/search/api.sx b/lib/search/api.sx
index 7abbe781..5ac85924 100644
--- a/lib/search/api.sx
+++ b/lib/search/api.sx
@@ -6,7 +6,7 @@
 ;; rankTfIdf, rankBm25, topNTfIdf, topNBm25, fedIndex, aclFilter, searchTfIdfAcl,
 ;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf,
 ;; paginate, pageTfIdf, pageBm25, resultCount, editDist, fuzzyTerms, fuzzyDocs,
-;; fuzzyRankTfIdf, highlight, snippet.
+;; fuzzyRankTfIdf, highlight, snippet, stem, stemText, stemTokens, indexStemmed.
 
 (define
   search/src
@@ -29,4 +29,6 @@
     "\n"
     search/fuzzy-src
     "\n"
-    search/highlight-src))
+    search/highlight-src
+    "\n"
+    search/stem-src))
diff --git a/lib/search/conformance.conf b/lib/search/conformance.conf
index 28c7ddf6..8c5375b7 100644
--- a/lib/search/conformance.conf
+++ b/lib/search/conformance.conf
@@ -28,6 +28,7 @@ PRELOADS=(
   lib/search/page.sx
   lib/search/fuzzy.sx
   lib/search/highlight.sx
+  lib/search/stem.sx
   lib/search/api.sx
   lib/search/testlib.sx
 )
@@ -42,4 +43,5 @@ SUITES=(
   "page:lib/search/tests/page.sx"
   "fuzzy:lib/search/tests/fuzzy.sx"
   "highlight:lib/search/tests/highlight.sx"
+  "stem:lib/search/tests/stem.sx"
 )
diff --git a/lib/search/scoreboard.json b/lib/search/scoreboard.json
index a3ebb24c..4c88e5e3 100644
--- a/lib/search/scoreboard.json
+++ b/lib/search/scoreboard.json
@@ -1,8 +1,8 @@
 {
   "lang": "search",
-  "total_passed": 178,
+  "total_passed": 196,
   "total_failed": 0,
-  "total": 178,
+  "total": 196,
   "suites": [
     {"name":"index","passed":18,"failed":0,"total":18},
     {"name":"boolean","passed":28,"failed":0,"total":28},
@@ -12,7 +12,8 @@
     {"name":"prefix","passed":14,"failed":0,"total":14},
     {"name":"page","passed":12,"failed":0,"total":12},
     {"name":"fuzzy","passed":18,"failed":0,"total":18},
-    {"name":"highlight","passed":12,"failed":0,"total":12}
+    {"name":"highlight","passed":12,"failed":0,"total":12},
+    {"name":"stem","passed":18,"failed":0,"total":18}
   ],
-  "generated": "2026-06-06T22:07:05+00:00"
+  "generated": "2026-06-06T22:49:33+00:00"
 }
diff --git a/lib/search/scoreboard.md b/lib/search/scoreboard.md
index 767c5fc2..7e20b449 100644
--- a/lib/search/scoreboard.md
+++ b/lib/search/scoreboard.md
@@ -1,6 +1,6 @@
 # search scoreboard
 
-**178 / 178 passing** (0 failure(s)).
+**196 / 196 passing** (0 failure(s)).
 
 | Suite | Passed | Total | Status |
 |-------|--------|-------|--------|
@@ -13,3 +13,4 @@
 | page | 12 | 12 | ok |
 | fuzzy | 18 | 18 | ok |
 | highlight | 12 | 12 | ok |
+| stem | 18 | 18 | ok |
diff --git a/lib/search/stem.sx b/lib/search/stem.sx
new file mode 100644
index 00000000..816c3269
--- /dev/null
+++ b/lib/search/stem.sx
@@ -0,0 +1,15 @@
+;; search stemming — Haskell source fragment. Depends on tokenize + index.
+;; Lightweight, deterministic English suffix stripping (recall-improving
+;; normalizer). Rules are checked most-specific first; conservative length guards
+;; avoid mangling short words. Not a full Porter stemmer.
+;; Gotcha: take/drop over a String yield char CODES (ints), not char strings, so
+;; rebuild strings with `stStr = joinChars . map chr`. (isSuffixOf's reverse also
+;; trips `++` on the String representation, hence the manual stEnds.)
+;;   stem        :: String -> String
+;;   stemText    :: String -> String        (tokenize + stem + rejoin)
+;;   stemTokens  :: String -> [String]
+;;   indexStemmed:: DocId -> String -> Index -> Index   (index the stemmed text)
+
+(define
+  search/stem-src
+  "stStr cs = joinChars (map chr cs)\nstEnds suf w = let n = length w in let m = length suf in if m > n then False else stStr (drop (n - m) w) == suf\nstDropEnd k w = stStr (take (length w - k) w)\nstem w = if stEnds \"ies\" w && length w >= 5 then stDropEnd 3 w ++ \"y\" else if stEnds \"ss\" w then w else if stEnds \"es\" w && length w >= 5 then stDropEnd 2 w else if stEnds \"s\" w && length w >= 4 then stDropEnd 1 w else if stEnds \"ing\" w && length w >= 6 then stDropEnd 3 w else if stEnds \"ed\" w && length w >= 5 then stDropEnd 2 w else w\nstemTokens s = map stem (tokens s)\nstemText s = unwords (stemTokens s)\nindexStemmed d text idx = indexDoc d (stemText text) idx\n")
diff --git a/lib/search/tests/stem.sx b/lib/search/tests/stem.sx
new file mode 100644
index 00000000..cffd6c36
--- /dev/null
+++ b/lib/search/tests/stem.sx
@@ -0,0 +1,47 @@
+;; Extension — stemming (suffix stripping). Scalar string results wrapped in [].
+
+(define
+  stem-cases
+  (list
+    (list "stem plural s" "[stem \"cats\"]" (list "cat"))
+    (list "stem plural dogs" "[stem \"dogs\"]" (list "dog"))
+    (list "stem keeps ss" "[stem \"pass\"]" (list "pass"))
+    (list "stem short s unchanged" "[stem \"is\"]" (list "is"))
+    (list "stem es boxes" "[stem \"boxes\"]" (list "box"))
+    (list "stem es wishes" "[stem \"wishes\"]" (list "wish"))
+    (list "stem ies cities" "[stem \"cities\"]" (list "city"))
+    (list "stem ies parties" "[stem \"parties\"]" (list "party"))
+    (list "stem ing jumping" "[stem \"jumping\"]" (list "jump"))
+    (list "stem ing running literal" "[stem \"running\"]" (list "runn"))
+    (list "stem ed jumped" "[stem \"jumped\"]" (list "jump"))
+    (list "stem ed wanted" "[stem \"wanted\"]" (list "want"))
+    (list "stem short ed unchanged" "[stem \"red\"]" (list "red"))
+    (list "stem no suffix" "[stem \"cat\"]" (list "cat"))
+    (list
+      "stemText normalizes and stems"
+      "[stemText \"Cats Running!\"]"
+      (list "cat runn"))
+    (list
+      "stemTokens list"
+      "stemTokens \"boxes and cats\""
+      (list "box" "and" "cat"))
+    (list
+      "indexStemmed unifies plural"
+      "map fst (lookupTerm \"cat\" (indexStemmed 2 \"a cat\" (indexStemmed 1 \"the cats\" emptyIndex)))"
+      (list 1 2))
+    (list
+      "indexStemmed stem query"
+      "map fst (lookupTerm (stem \"boxes\") (indexStemmed 1 \"many boxes\" emptyIndex))"
+      (list 1))))
+
+(define
+  stem-results
+  (search-batch "" (map (fn (c) (nth c 1)) stem-cases)))
+
+(map-indexed
+  (fn
+    (i c)
+    (hk-test (nth c 0) (nth stem-results i) (nth c 2)))
+  stem-cases)
+
+{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
diff --git a/plans/search-on-sx.md b/plans/search-on-sx.md
index b2702653..c2c71b7b 100644
--- a/plans/search-on-sx.md
+++ b/plans/search-on-sx.md
@@ -116,10 +116,16 @@ lib/search/index.sx                     lib/search/eval.sx
 - [x] result pagination (offset / limit) — `paginate`, `pageTfIdf`, `pageBm25`,
   `resultCount` — 12 tests
 - [x] snippet / highlight generation (`highlight`, `snippet`) — 12 tests
-- [ ] stemming (suffix stripping) — recall-improving normalizer
+- [x] stemming (suffix stripping) — `stem`, `stemText`, `stemTokens`, `indexStemmed`
+  — 18 tests
 
 ## Progress log
 
+- **Extension: stemming (196/196 total).** Deterministic English suffix stripping
+  (`stem`), `stemText`/`stemTokens`, `indexStemmed`. Two haskell-on-sx gotchas: take/drop
+  over a String yield char CODES not char strings (rebuild via `joinChars . map chr`),
+  and isSuffixOf's `reverse` trips `++` on the String repr (manual suffix compare). All
+  five planned extensions now done; the loop can keep adding search vocabulary. 18 tests.
 - **Extension: highlight/snippet (178/178 total).** `highlight terms text` marks
   query-matching (normalized) tokens with [..]; `snippet ctx terms text` extracts a
   context window around the first match. 12 tests.