From 5945b51cfd5a271b761b9aa34765b1fcad622050 Mon Sep 17 00:00:00 2001 From: giles Date: Sat, 6 Jun 2026 21:47:56 +0000 Subject: [PATCH] search: fuzzy matching via edit distance + 18 tests editDist as an O(m*n) row-based Levenshtein DP (naive recursion is exponential and times out under load); fuzzyTerms/fuzzyDocs/fuzzyRankTfIdf expand a term to indexed terms within a max edit distance. 166/166. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/search/api.sx | 7 +++- lib/search/conformance.conf | 2 + lib/search/fuzzy.sx | 12 ++++++ lib/search/scoreboard.json | 9 +++-- lib/search/scoreboard.md | 3 +- lib/search/tests/fuzzy.sx | 74 +++++++++++++++++++++++++++++++++++++ plans/search-on-sx.md | 7 +++- 7 files changed, 106 insertions(+), 8 deletions(-) create mode 100644 lib/search/fuzzy.sx create mode 100644 lib/search/tests/fuzzy.sx diff --git a/lib/search/api.sx b/lib/search/api.sx index cef49db4..c55c7f31 100644 --- a/lib/search/api.sx +++ b/lib/search/api.sx @@ -5,7 +5,8 @@ ;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery, searchQuery, ;; rankTfIdf, rankBm25, topNTfIdf, topNBm25, fedIndex, aclFilter, searchTfIdfAcl, ;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf, -;; paginate, pageTfIdf, pageBm25, resultCount. +;; paginate, pageTfIdf, pageBm25, resultCount, editDist, fuzzyTerms, fuzzyDocs, +;; fuzzyRankTfIdf. (define search/src @@ -24,4 +25,6 @@ "\n" search/prefix-src "\n" - search/page-src)) + search/page-src + "\n" + search/fuzzy-src)) diff --git a/lib/search/conformance.conf b/lib/search/conformance.conf index 79b14819..0fef2c39 100644 --- a/lib/search/conformance.conf +++ b/lib/search/conformance.conf @@ -26,6 +26,7 @@ PRELOADS=( lib/search/fed.sx lib/search/prefix.sx lib/search/page.sx + lib/search/fuzzy.sx lib/search/api.sx lib/search/testlib.sx ) @@ -38,4 +39,5 @@ SUITES=( "integration:lib/search/tests/integration.sx" "prefix:lib/search/tests/prefix.sx" "page:lib/search/tests/page.sx" + "fuzzy:lib/search/tests/fuzzy.sx" ) diff --git a/lib/search/fuzzy.sx b/lib/search/fuzzy.sx new file mode 100644 index 00000000..9a757abb --- /dev/null +++ b/lib/search/fuzzy.sx @@ -0,0 +1,12 @@ +;; search fuzzy matching — Haskell source fragment. Depends on index + rank. +;; Levenshtein edit distance (O(m*n) row-based DP — the naive recursive version is +;; exponential and far too slow under load) expands a query term to all indexed +;; terms within a max distance, then unions / ranks their docs. +;; editDist :: String -> String -> Int +;; fuzzyTerms :: Int -> String -> Index -> [Term] (sorted) +;; fuzzyDocs :: Int -> String -> Index -> [DocId] (sorted union) +;; fuzzyRankTfIdf :: Int -> String -> Index -> [DocId] + +(define + search/fuzzy-src + "edMin3 a b c = min a (min b c)\nedCost x y = if x == y then 0 else 1\nedUpto i n = if i > n then [] else i : edUpto (i + 1) n\nedLast [x] = x\nedLast (x:xs) = edLast xs\nedNrow x [] prev left = []\nedNrow x (y:ys) prev left = let v = edMin3 (head (tail prev) + 1) (left + 1) (head prev + edCost x y) in v : edNrow x ys (tail prev) v\nedRow x ys prev = let f = head prev + 1 in f : edNrow x ys prev f\nedRows [] ys prev = prev\nedRows (x:xs) ys prev = edRows xs ys (edRow x ys prev)\neditDist xs ys = edLast (edRows xs ys (edUpto 0 (length ys)))\nqWithinDist maxd term t = editDist term t <= maxd\nfuzzyTerms maxd term idx = filter (qWithinDist maxd term) (allTerms idx)\nfuzzyDocs maxd term idx = foldl (candStep idx) [] (fuzzyTerms maxd term idx)\nfuzzyRankTfIdf maxd term idx = rankTfIdf (fuzzyTerms maxd term idx) idx\n") diff --git a/lib/search/scoreboard.json b/lib/search/scoreboard.json index 16472224..b0baf95a 100644 --- a/lib/search/scoreboard.json +++ b/lib/search/scoreboard.json @@ -1,8 +1,8 @@ { "lang": "search", - "total_passed": 148, + "total_passed": 166, "total_failed": 0, - "total": 148, + "total": 166, "suites": [ {"name":"index","passed":18,"failed":0,"total":18}, {"name":"boolean","passed":28,"failed":0,"total":28}, @@ -10,7 +10,8 @@ {"name":"rank","passed":23,"failed":0,"total":23}, {"name":"integration","passed":21,"failed":0,"total":21}, {"name":"prefix","passed":14,"failed":0,"total":14}, - {"name":"page","passed":12,"failed":0,"total":12} + {"name":"page","passed":12,"failed":0,"total":12}, + {"name":"fuzzy","passed":18,"failed":0,"total":18} ], - "generated": "2026-06-06T20:54:50+00:00" + "generated": "2026-06-06T21:47:28+00:00" } diff --git a/lib/search/scoreboard.md b/lib/search/scoreboard.md index 9cdc93b3..74440558 100644 --- a/lib/search/scoreboard.md +++ b/lib/search/scoreboard.md @@ -1,6 +1,6 @@ # search scoreboard -**148 / 148 passing** (0 failure(s)). +**166 / 166 passing** (0 failure(s)). | Suite | Passed | Total | Status | |-------|--------|-------|--------| @@ -11,3 +11,4 @@ | integration | 21 | 21 | ok | | prefix | 14 | 14 | ok | | page | 12 | 12 | ok | +| fuzzy | 18 | 18 | ok | diff --git a/lib/search/tests/fuzzy.sx b/lib/search/tests/fuzzy.sx new file mode 100644 index 00000000..0b5c3fbd --- /dev/null +++ b/lib/search/tests/fuzzy.sx @@ -0,0 +1,74 @@ +;; Extension — fuzzy matching via Levenshtein edit distance. +;; Corpus: 1 "color flavor" 2 "colour kitten" 3 "colored" +;; allTerms: color colored colour flavor kitten + +(define + fuzzy-setup + "idx = indexDoc 3 \"colored\" (indexDoc 2 \"colour kitten\" (indexDoc 1 \"color flavor\" emptyIndex))\n") + +(define + fuzzy-cases + (list + (list + "editDist substitution" + "[editDist \"kitten\" \"sitten\"]" + (list 1)) + (list "editDist equal" "[editDist \"abc\" \"abc\"]" (list 0)) + (list "editDist deletion" "[editDist \"abc\" \"ab\"]" (list 1)) + (list "editDist insertion" "[editDist \"ab\" \"abc\"]" (list 1)) + (list "editDist from empty" "[editDist \"\" \"abc\"]" (list 3)) + (list "editDist both empty" "[editDist \"\" \"\"]" (list 0)) + (list + "editDist classic" + "[editDist \"kitten\" \"sitting\"]" + (list 3)) + (list + "editDist color colour" + "[editDist \"color\" \"colour\"]" + (list 1)) + (list + "editDist color colored" + "[editDist \"color\" \"colored\"]" + (list 2)) + (list + "fuzzy terms dist 1" + "fuzzyTerms 1 \"color\" idx" + (list "color" "colour")) + (list + "fuzzy terms dist 2" + "fuzzyTerms 2 \"color\" idx" + (list "color" "colored" "colour")) + (list "fuzzy terms exact" "fuzzyTerms 0 \"color\" idx" (list "color")) + (list + "fuzzy terms other word" + "fuzzyTerms 1 \"flavour\" idx" + (list "flavor")) + (list + "fuzzy docs dist 1" + "fuzzyDocs 1 \"color\" idx" + (list 1 2)) + (list + "fuzzy docs dist 2" + "fuzzyDocs 2 \"color\" idx" + (list 1 2 3)) + (list "fuzzy docs none" "fuzzyDocs 1 \"zzzzz\" idx" (list)) + (list + "fuzzy rank dist 1" + "fuzzyRankTfIdf 1 \"color\" idx" + (list 1 2)) + (list + "fuzzy rank dist 2" + "fuzzyRankTfIdf 2 \"color\" idx" + (list 1 2 3)))) + +(define + fuzzy-results + (search-batch fuzzy-setup (map (fn (c) (nth c 1)) fuzzy-cases))) + +(map-indexed + (fn + (i c) + (hk-test (nth c 0) (nth fuzzy-results i) (nth c 2))) + fuzzy-cases) + +{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails} diff --git a/plans/search-on-sx.md b/plans/search-on-sx.md index 32444f20..791c04dc 100644 --- a/plans/search-on-sx.md +++ b/plans/search-on-sx.md @@ -111,7 +111,8 @@ lib/search/index.sx lib/search/eval.sx ## Extensions (post-roadmap, search-shaped vocabulary) - [x] prefix / wildcard queries (`prefixTerms`, `prefixDocs`, `prefixRankTfIdf`) — 14 tests -- [ ] fuzzy matching — edit distance term expansion +- [x] fuzzy matching — edit distance term expansion (`editDist`, `fuzzyTerms`, + `fuzzyDocs`, `fuzzyRankTfIdf`) — 18 tests - [x] result pagination (offset / limit) — `paginate`, `pageTfIdf`, `pageBm25`, `resultCount` — 12 tests - [ ] snippet / highlight generation @@ -119,6 +120,10 @@ lib/search/index.sx lib/search/eval.sx ## Progress log +- **Extension: fuzzy matching (166/166 total).** Levenshtein `editDist` as an O(m*n) + row-based DP (the naive recursive version is exponential and times out under load), + `fuzzyTerms`/`fuzzyDocs`/`fuzzyRankTfIdf` expand a term to indexed terms within a max + edit distance. 18 tests. - **Extension: pagination (148/148 total).** `paginate off lim` windows a ranked list (take lim . drop off); `pageTfIdf`/`pageBm25` + `resultCount`. 12 tests. Note the full conformance now runs 8 suites sequentially and needs an overall timeout ~1900s