;; search stemming — Haskell source fragment. Depends on tokenize + index. ;; Lightweight, deterministic English suffix stripping (recall-improving ;; normalizer). Rules are checked most-specific first; conservative length guards ;; avoid mangling short words. Not a full Porter stemmer. ;; Gotcha: take/drop over a String yield char CODES (ints), not char strings, so ;; rebuild strings with `stStr = joinChars . map chr`. (isSuffixOf's reverse also ;; trips `++` on the String representation, hence the manual stEnds.) ;; stem :: String -> String ;; stemText :: String -> String (tokenize + stem + rejoin) ;; stemTokens :: String -> [String] ;; indexStemmed:: DocId -> String -> Index -> Index (index the stemmed text) (define search/stem-src "stStr cs = joinChars (map chr cs)\nstEnds suf w = let n = length w in let m = length suf in if m > n then False else stStr (drop (n - m) w) == suf\nstDropEnd k w = stStr (take (length w - k) w)\nstem w = if stEnds \"ies\" w && length w >= 5 then stDropEnd 3 w ++ \"y\" else if stEnds \"ss\" w then w else if stEnds \"es\" w && length w >= 5 then stDropEnd 2 w else if stEnds \"s\" w && length w >= 4 then stDropEnd 1 w else if stEnds \"ing\" w && length w >= 6 then stDropEnd 3 w else if stEnds \"ed\" w && length w >= 5 then stDropEnd 2 w else w\nstemTokens s = map stem (tokens s)\nstemText s = unwords (stemTokens s)\nindexStemmed d text idx = indexDoc d (stemText text) idx\n")