search: stemming (suffix stripping) + 18 tests
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 16s
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 16s
Deterministic English suffix stripping (stem), stemText/stemTokens, indexStemmed. Worked around two haskell-on-sx string gotchas: take/drop over a String yield char codes (rebuild via joinChars . map chr), and isSuffixOf's reverse trips ++ (manual suffix compare). 196/196. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -6,7 +6,7 @@
|
||||
;; rankTfIdf, rankBm25, topNTfIdf, topNBm25, fedIndex, aclFilter, searchTfIdfAcl,
|
||||
;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf,
|
||||
;; paginate, pageTfIdf, pageBm25, resultCount, editDist, fuzzyTerms, fuzzyDocs,
|
||||
;; fuzzyRankTfIdf, highlight, snippet.
|
||||
;; fuzzyRankTfIdf, highlight, snippet, stem, stemText, stemTokens, indexStemmed.
|
||||
|
||||
(define
|
||||
search/src
|
||||
@@ -29,4 +29,6 @@
|
||||
"\n"
|
||||
search/fuzzy-src
|
||||
"\n"
|
||||
search/highlight-src))
|
||||
search/highlight-src
|
||||
"\n"
|
||||
search/stem-src))
|
||||
|
||||
@@ -28,6 +28,7 @@ PRELOADS=(
|
||||
lib/search/page.sx
|
||||
lib/search/fuzzy.sx
|
||||
lib/search/highlight.sx
|
||||
lib/search/stem.sx
|
||||
lib/search/api.sx
|
||||
lib/search/testlib.sx
|
||||
)
|
||||
@@ -42,4 +43,5 @@ SUITES=(
|
||||
"page:lib/search/tests/page.sx"
|
||||
"fuzzy:lib/search/tests/fuzzy.sx"
|
||||
"highlight:lib/search/tests/highlight.sx"
|
||||
"stem:lib/search/tests/stem.sx"
|
||||
)
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
{
|
||||
"lang": "search",
|
||||
"total_passed": 178,
|
||||
"total_passed": 196,
|
||||
"total_failed": 0,
|
||||
"total": 178,
|
||||
"total": 196,
|
||||
"suites": [
|
||||
{"name":"index","passed":18,"failed":0,"total":18},
|
||||
{"name":"boolean","passed":28,"failed":0,"total":28},
|
||||
@@ -12,7 +12,8 @@
|
||||
{"name":"prefix","passed":14,"failed":0,"total":14},
|
||||
{"name":"page","passed":12,"failed":0,"total":12},
|
||||
{"name":"fuzzy","passed":18,"failed":0,"total":18},
|
||||
{"name":"highlight","passed":12,"failed":0,"total":12}
|
||||
{"name":"highlight","passed":12,"failed":0,"total":12},
|
||||
{"name":"stem","passed":18,"failed":0,"total":18}
|
||||
],
|
||||
"generated": "2026-06-06T22:07:05+00:00"
|
||||
"generated": "2026-06-06T22:49:33+00:00"
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# search scoreboard
|
||||
|
||||
**178 / 178 passing** (0 failure(s)).
|
||||
**196 / 196 passing** (0 failure(s)).
|
||||
|
||||
| Suite | Passed | Total | Status |
|
||||
|-------|--------|-------|--------|
|
||||
@@ -13,3 +13,4 @@
|
||||
| page | 12 | 12 | ok |
|
||||
| fuzzy | 18 | 18 | ok |
|
||||
| highlight | 12 | 12 | ok |
|
||||
| stem | 18 | 18 | ok |
|
||||
|
||||
15
lib/search/stem.sx
Normal file
15
lib/search/stem.sx
Normal file
@@ -0,0 +1,15 @@
|
||||
;; search stemming — Haskell source fragment. Depends on tokenize + index.
|
||||
;; Lightweight, deterministic English suffix stripping (recall-improving
|
||||
;; normalizer). Rules are checked most-specific first; conservative length guards
|
||||
;; avoid mangling short words. Not a full Porter stemmer.
|
||||
;; Gotcha: take/drop over a String yield char CODES (ints), not char strings, so
|
||||
;; rebuild strings with `stStr = joinChars . map chr`. (isSuffixOf's reverse also
|
||||
;; trips `++` on the String representation, hence the manual stEnds.)
|
||||
;; stem :: String -> String
|
||||
;; stemText :: String -> String (tokenize + stem + rejoin)
|
||||
;; stemTokens :: String -> [String]
|
||||
;; indexStemmed:: DocId -> String -> Index -> Index (index the stemmed text)
|
||||
|
||||
(define
|
||||
search/stem-src
|
||||
"stStr cs = joinChars (map chr cs)\nstEnds suf w = let n = length w in let m = length suf in if m > n then False else stStr (drop (n - m) w) == suf\nstDropEnd k w = stStr (take (length w - k) w)\nstem w = if stEnds \"ies\" w && length w >= 5 then stDropEnd 3 w ++ \"y\" else if stEnds \"ss\" w then w else if stEnds \"es\" w && length w >= 5 then stDropEnd 2 w else if stEnds \"s\" w && length w >= 4 then stDropEnd 1 w else if stEnds \"ing\" w && length w >= 6 then stDropEnd 3 w else if stEnds \"ed\" w && length w >= 5 then stDropEnd 2 w else w\nstemTokens s = map stem (tokens s)\nstemText s = unwords (stemTokens s)\nindexStemmed d text idx = indexDoc d (stemText text) idx\n")
|
||||
47
lib/search/tests/stem.sx
Normal file
47
lib/search/tests/stem.sx
Normal file
@@ -0,0 +1,47 @@
|
||||
;; Extension — stemming (suffix stripping). Scalar string results wrapped in [].
|
||||
|
||||
(define
|
||||
stem-cases
|
||||
(list
|
||||
(list "stem plural s" "[stem \"cats\"]" (list "cat"))
|
||||
(list "stem plural dogs" "[stem \"dogs\"]" (list "dog"))
|
||||
(list "stem keeps ss" "[stem \"pass\"]" (list "pass"))
|
||||
(list "stem short s unchanged" "[stem \"is\"]" (list "is"))
|
||||
(list "stem es boxes" "[stem \"boxes\"]" (list "box"))
|
||||
(list "stem es wishes" "[stem \"wishes\"]" (list "wish"))
|
||||
(list "stem ies cities" "[stem \"cities\"]" (list "city"))
|
||||
(list "stem ies parties" "[stem \"parties\"]" (list "party"))
|
||||
(list "stem ing jumping" "[stem \"jumping\"]" (list "jump"))
|
||||
(list "stem ing running literal" "[stem \"running\"]" (list "runn"))
|
||||
(list "stem ed jumped" "[stem \"jumped\"]" (list "jump"))
|
||||
(list "stem ed wanted" "[stem \"wanted\"]" (list "want"))
|
||||
(list "stem short ed unchanged" "[stem \"red\"]" (list "red"))
|
||||
(list "stem no suffix" "[stem \"cat\"]" (list "cat"))
|
||||
(list
|
||||
"stemText normalizes and stems"
|
||||
"[stemText \"Cats Running!\"]"
|
||||
(list "cat runn"))
|
||||
(list
|
||||
"stemTokens list"
|
||||
"stemTokens \"boxes and cats\""
|
||||
(list "box" "and" "cat"))
|
||||
(list
|
||||
"indexStemmed unifies plural"
|
||||
"map fst (lookupTerm \"cat\" (indexStemmed 2 \"a cat\" (indexStemmed 1 \"the cats\" emptyIndex)))"
|
||||
(list 1 2))
|
||||
(list
|
||||
"indexStemmed stem query"
|
||||
"map fst (lookupTerm (stem \"boxes\") (indexStemmed 1 \"many boxes\" emptyIndex))"
|
||||
(list 1))))
|
||||
|
||||
(define
|
||||
stem-results
|
||||
(search-batch "" (map (fn (c) (nth c 1)) stem-cases)))
|
||||
|
||||
(map-indexed
|
||||
(fn
|
||||
(i c)
|
||||
(hk-test (nth c 0) (nth stem-results i) (nth c 2)))
|
||||
stem-cases)
|
||||
|
||||
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
Reference in New Issue
Block a user