From 911a2f57c07ff99bef4e986a682b31af220cb6ea Mon Sep 17 00:00:00 2001 From: giles Date: Sat, 6 Jun 2026 22:50:19 +0000 Subject: [PATCH] search: stemming (suffix stripping) + 18 tests Deterministic English suffix stripping (stem), stemText/stemTokens, indexStemmed. Worked around two haskell-on-sx string gotchas: take/drop over a String yield char codes (rebuild via joinChars . map chr), and isSuffixOf's reverse trips ++ (manual suffix compare). 196/196. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/search/api.sx | 6 +++-- lib/search/conformance.conf | 2 ++ lib/search/scoreboard.json | 9 +++---- lib/search/scoreboard.md | 3 ++- lib/search/stem.sx | 15 ++++++++++++ lib/search/tests/stem.sx | 47 +++++++++++++++++++++++++++++++++++++ plans/search-on-sx.md | 8 ++++++- 7 files changed, 82 insertions(+), 8 deletions(-) create mode 100644 lib/search/stem.sx create mode 100644 lib/search/tests/stem.sx diff --git a/lib/search/api.sx b/lib/search/api.sx index 7abbe781..5ac85924 100644 --- a/lib/search/api.sx +++ b/lib/search/api.sx @@ -6,7 +6,7 @@ ;; rankTfIdf, rankBm25, topNTfIdf, topNBm25, fedIndex, aclFilter, searchTfIdfAcl, ;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf, ;; paginate, pageTfIdf, pageBm25, resultCount, editDist, fuzzyTerms, fuzzyDocs, -;; fuzzyRankTfIdf, highlight, snippet. +;; fuzzyRankTfIdf, highlight, snippet, stem, stemText, stemTokens, indexStemmed. (define search/src @@ -29,4 +29,6 @@ "\n" search/fuzzy-src "\n" - search/highlight-src)) + search/highlight-src + "\n" + search/stem-src)) diff --git a/lib/search/conformance.conf b/lib/search/conformance.conf index 28c7ddf6..8c5375b7 100644 --- a/lib/search/conformance.conf +++ b/lib/search/conformance.conf @@ -28,6 +28,7 @@ PRELOADS=( lib/search/page.sx lib/search/fuzzy.sx lib/search/highlight.sx + lib/search/stem.sx lib/search/api.sx lib/search/testlib.sx ) @@ -42,4 +43,5 @@ SUITES=( "page:lib/search/tests/page.sx" "fuzzy:lib/search/tests/fuzzy.sx" "highlight:lib/search/tests/highlight.sx" + "stem:lib/search/tests/stem.sx" ) diff --git a/lib/search/scoreboard.json b/lib/search/scoreboard.json index a3ebb24c..4c88e5e3 100644 --- a/lib/search/scoreboard.json +++ b/lib/search/scoreboard.json @@ -1,8 +1,8 @@ { "lang": "search", - "total_passed": 178, + "total_passed": 196, "total_failed": 0, - "total": 178, + "total": 196, "suites": [ {"name":"index","passed":18,"failed":0,"total":18}, {"name":"boolean","passed":28,"failed":0,"total":28}, @@ -12,7 +12,8 @@ {"name":"prefix","passed":14,"failed":0,"total":14}, {"name":"page","passed":12,"failed":0,"total":12}, {"name":"fuzzy","passed":18,"failed":0,"total":18}, - {"name":"highlight","passed":12,"failed":0,"total":12} + {"name":"highlight","passed":12,"failed":0,"total":12}, + {"name":"stem","passed":18,"failed":0,"total":18} ], - "generated": "2026-06-06T22:07:05+00:00" + "generated": "2026-06-06T22:49:33+00:00" } diff --git a/lib/search/scoreboard.md b/lib/search/scoreboard.md index 767c5fc2..7e20b449 100644 --- a/lib/search/scoreboard.md +++ b/lib/search/scoreboard.md @@ -1,6 +1,6 @@ # search scoreboard -**178 / 178 passing** (0 failure(s)). +**196 / 196 passing** (0 failure(s)). | Suite | Passed | Total | Status | |-------|--------|-------|--------| @@ -13,3 +13,4 @@ | page | 12 | 12 | ok | | fuzzy | 18 | 18 | ok | | highlight | 12 | 12 | ok | +| stem | 18 | 18 | ok | diff --git a/lib/search/stem.sx b/lib/search/stem.sx new file mode 100644 index 00000000..816c3269 --- /dev/null +++ b/lib/search/stem.sx @@ -0,0 +1,15 @@ +;; search stemming — Haskell source fragment. Depends on tokenize + index. +;; Lightweight, deterministic English suffix stripping (recall-improving +;; normalizer). Rules are checked most-specific first; conservative length guards +;; avoid mangling short words. Not a full Porter stemmer. +;; Gotcha: take/drop over a String yield char CODES (ints), not char strings, so +;; rebuild strings with `stStr = joinChars . map chr`. (isSuffixOf's reverse also +;; trips `++` on the String representation, hence the manual stEnds.) +;; stem :: String -> String +;; stemText :: String -> String (tokenize + stem + rejoin) +;; stemTokens :: String -> [String] +;; indexStemmed:: DocId -> String -> Index -> Index (index the stemmed text) + +(define + search/stem-src + "stStr cs = joinChars (map chr cs)\nstEnds suf w = let n = length w in let m = length suf in if m > n then False else stStr (drop (n - m) w) == suf\nstDropEnd k w = stStr (take (length w - k) w)\nstem w = if stEnds \"ies\" w && length w >= 5 then stDropEnd 3 w ++ \"y\" else if stEnds \"ss\" w then w else if stEnds \"es\" w && length w >= 5 then stDropEnd 2 w else if stEnds \"s\" w && length w >= 4 then stDropEnd 1 w else if stEnds \"ing\" w && length w >= 6 then stDropEnd 3 w else if stEnds \"ed\" w && length w >= 5 then stDropEnd 2 w else w\nstemTokens s = map stem (tokens s)\nstemText s = unwords (stemTokens s)\nindexStemmed d text idx = indexDoc d (stemText text) idx\n") diff --git a/lib/search/tests/stem.sx b/lib/search/tests/stem.sx new file mode 100644 index 00000000..cffd6c36 --- /dev/null +++ b/lib/search/tests/stem.sx @@ -0,0 +1,47 @@ +;; Extension — stemming (suffix stripping). Scalar string results wrapped in []. + +(define + stem-cases + (list + (list "stem plural s" "[stem \"cats\"]" (list "cat")) + (list "stem plural dogs" "[stem \"dogs\"]" (list "dog")) + (list "stem keeps ss" "[stem \"pass\"]" (list "pass")) + (list "stem short s unchanged" "[stem \"is\"]" (list "is")) + (list "stem es boxes" "[stem \"boxes\"]" (list "box")) + (list "stem es wishes" "[stem \"wishes\"]" (list "wish")) + (list "stem ies cities" "[stem \"cities\"]" (list "city")) + (list "stem ies parties" "[stem \"parties\"]" (list "party")) + (list "stem ing jumping" "[stem \"jumping\"]" (list "jump")) + (list "stem ing running literal" "[stem \"running\"]" (list "runn")) + (list "stem ed jumped" "[stem \"jumped\"]" (list "jump")) + (list "stem ed wanted" "[stem \"wanted\"]" (list "want")) + (list "stem short ed unchanged" "[stem \"red\"]" (list "red")) + (list "stem no suffix" "[stem \"cat\"]" (list "cat")) + (list + "stemText normalizes and stems" + "[stemText \"Cats Running!\"]" + (list "cat runn")) + (list + "stemTokens list" + "stemTokens \"boxes and cats\"" + (list "box" "and" "cat")) + (list + "indexStemmed unifies plural" + "map fst (lookupTerm \"cat\" (indexStemmed 2 \"a cat\" (indexStemmed 1 \"the cats\" emptyIndex)))" + (list 1 2)) + (list + "indexStemmed stem query" + "map fst (lookupTerm (stem \"boxes\") (indexStemmed 1 \"many boxes\" emptyIndex))" + (list 1)))) + +(define + stem-results + (search-batch "" (map (fn (c) (nth c 1)) stem-cases))) + +(map-indexed + (fn + (i c) + (hk-test (nth c 0) (nth stem-results i) (nth c 2))) + stem-cases) + +{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails} diff --git a/plans/search-on-sx.md b/plans/search-on-sx.md index b2702653..c2c71b7b 100644 --- a/plans/search-on-sx.md +++ b/plans/search-on-sx.md @@ -116,10 +116,16 @@ lib/search/index.sx lib/search/eval.sx - [x] result pagination (offset / limit) — `paginate`, `pageTfIdf`, `pageBm25`, `resultCount` — 12 tests - [x] snippet / highlight generation (`highlight`, `snippet`) — 12 tests -- [ ] stemming (suffix stripping) — recall-improving normalizer +- [x] stemming (suffix stripping) — `stem`, `stemText`, `stemTokens`, `indexStemmed` + — 18 tests ## Progress log +- **Extension: stemming (196/196 total).** Deterministic English suffix stripping + (`stem`), `stemText`/`stemTokens`, `indexStemmed`. Two haskell-on-sx gotchas: take/drop + over a String yield char CODES not char strings (rebuild via `joinChars . map chr`), + and isSuffixOf's `reverse` trips `++` on the String repr (manual suffix compare). All + five planned extensions now done; the loop can keep adding search vocabulary. 18 tests. - **Extension: highlight/snippet (178/178 total).** `highlight terms text` marks query-matching (normalized) tokens with [..]; `snippet ctx terms text` extracts a context window around the first match. 12 tests.