From 9d3b775b2583ea1fe9d8e4f0ef54675794e24bbd Mon Sep 17 00:00:00 2001 From: giles Date: Sat, 6 Jun 2026 20:22:23 +0000 Subject: [PATCH] search: prefix/wildcard queries + 14 tests prefixTerms matches indexed terms by prefix (allTerms + isPrefixOf); prefixDocs unions their docs; prefixRankTfIdf ranks via the matched terms. 136/136. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/search/api.sx | 6 ++-- lib/search/conformance.conf | 2 ++ lib/search/prefix.sx | 10 ++++++ lib/search/scoreboard.json | 9 +++--- lib/search/scoreboard.md | 3 +- lib/search/tests/prefix.sx | 63 +++++++++++++++++++++++++++++++++++++ plans/search-on-sx.md | 11 +++++++ 7 files changed, 97 insertions(+), 7 deletions(-) create mode 100644 lib/search/prefix.sx create mode 100644 lib/search/tests/prefix.sx diff --git a/lib/search/api.sx b/lib/search/api.sx index a9a3fe12..84918b5e 100644 --- a/lib/search/api.sx +++ b/lib/search/api.sx @@ -4,7 +4,7 @@ ;; interpreter. Public Haskell entry points: indexDoc, lookupTerm, deleteDoc, ;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery, searchQuery, ;; rankTfIdf, rankBm25, topNTfIdf, topNBm25, fedIndex, aclFilter, searchTfIdfAcl, -;; topNTfIdfAcl, searchBm25Acl. +;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf. (define search/src @@ -19,4 +19,6 @@ "\n" search/rank-src "\n" - search/fed-src)) + search/fed-src + "\n" + search/prefix-src)) diff --git a/lib/search/conformance.conf b/lib/search/conformance.conf index b2ef2f74..c5d09b5c 100644 --- a/lib/search/conformance.conf +++ b/lib/search/conformance.conf @@ -24,6 +24,7 @@ PRELOADS=( lib/search/parse.sx lib/search/rank.sx lib/search/fed.sx + lib/search/prefix.sx lib/search/api.sx lib/search/testlib.sx ) @@ -34,4 +35,5 @@ SUITES=( "parse:lib/search/tests/parse.sx" "rank:lib/search/tests/rank.sx" "integration:lib/search/tests/integration.sx" + "prefix:lib/search/tests/prefix.sx" ) diff --git a/lib/search/prefix.sx b/lib/search/prefix.sx new file mode 100644 index 00000000..d50a5b1b --- /dev/null +++ b/lib/search/prefix.sx @@ -0,0 +1,10 @@ +;; search prefix / wildcard queries — Haskell source fragment. Depends on index + +;; rank (reuses candStep / rankTfIdf). A prefix matches every indexed term that +;; starts with it; the matching terms are unioned (OR) into a docid set. +;; prefixTerms :: String -> Index -> [Term] (sorted, from allTerms) +;; prefixDocs :: String -> Index -> [DocId] (sorted union) +;; prefixRankTfIdf :: String -> Index -> [DocId] (ranked by the matched terms) + +(define + search/prefix-src + "prefixTerms pre idx = filter (isPrefixOf pre) (allTerms idx)\nprefixDocs pre idx = foldl (candStep idx) [] (prefixTerms pre idx)\nprefixRankTfIdf pre idx = rankTfIdf (prefixTerms pre idx) idx\n") diff --git a/lib/search/scoreboard.json b/lib/search/scoreboard.json index d1cb07da..df5e60d7 100644 --- a/lib/search/scoreboard.json +++ b/lib/search/scoreboard.json @@ -1,14 +1,15 @@ { "lang": "search", - "total_passed": 122, + "total_passed": 136, "total_failed": 0, - "total": 122, + "total": 136, "suites": [ {"name":"index","passed":18,"failed":0,"total":18}, {"name":"boolean","passed":28,"failed":0,"total":28}, {"name":"parse","passed":32,"failed":0,"total":32}, {"name":"rank","passed":23,"failed":0,"total":23}, - {"name":"integration","passed":21,"failed":0,"total":21} + {"name":"integration","passed":21,"failed":0,"total":21}, + {"name":"prefix","passed":14,"failed":0,"total":14} ], - "generated": "2026-06-06T20:07:30+00:00" + "generated": "2026-06-06T20:21:41+00:00" } diff --git a/lib/search/scoreboard.md b/lib/search/scoreboard.md index 03a1d66c..0578f296 100644 --- a/lib/search/scoreboard.md +++ b/lib/search/scoreboard.md @@ -1,6 +1,6 @@ # search scoreboard -**122 / 122 passing** (0 failure(s)). +**136 / 136 passing** (0 failure(s)). | Suite | Passed | Total | Status | |-------|--------|-------|--------| @@ -9,3 +9,4 @@ | parse | 32 | 32 | ok | | rank | 23 | 23 | ok | | integration | 21 | 21 | ok | +| prefix | 14 | 14 | ok | diff --git a/lib/search/tests/prefix.sx b/lib/search/tests/prefix.sx new file mode 100644 index 00000000..97776491 --- /dev/null +++ b/lib/search/tests/prefix.sx @@ -0,0 +1,63 @@ +;; Extension — prefix / wildcard queries. +;; Corpus: 1 "alpha alpine" 2 "beta apple" 3 "banana alpha" +;; allTerms sorted: alpha alpine apple banana beta + +(define + prefix-setup + "idx = indexDoc 3 \"banana alpha\" (indexDoc 2 \"beta apple\" (indexDoc 1 \"alpha alpine\" emptyIndex))\n") + +(define + prefix-cases + (list + (list + "prefix terms two matches" + "prefixTerms \"al\" idx" + (list "alpha" "alpine")) + (list + "prefix terms narrower" + "prefixTerms \"alp\" idx" + (list "alpha" "alpine")) + (list + "prefix terms wide" + "prefixTerms \"a\" idx" + (list "alpha" "alpine" "apple")) + (list "prefix terms single" "prefixTerms \"ban\" idx" (list "banana")) + (list "prefix terms exact term" "prefixTerms \"beta\" idx" (list "beta")) + (list "prefix terms none" "prefixTerms \"z\" idx" (list)) + (list + "prefix docs union" + "prefixDocs \"al\" idx" + (list 1 3)) + (list "prefix docs single term" "prefixDocs \"ban\" idx" (list 3)) + (list + "prefix docs wide" + "prefixDocs \"a\" idx" + (list 1 2 3)) + (list "prefix docs none" "prefixDocs \"z\" idx" (list)) + (list + "prefix docs exact" + "prefixDocs \"alpha\" idx" + (list 1 3)) + (list + "prefix rank ranks by matched terms" + "prefixRankTfIdf \"al\" idx" + (list 1 3)) + (list + "prefix rank single doc" + "prefixRankTfIdf \"ban\" idx" + (list 3)) + (list "prefix rank empty" "prefixRankTfIdf \"z\" idx" (list)))) + +(define + prefix-results + (search-batch + prefix-setup + (map (fn (c) (nth c 1)) prefix-cases))) + +(map-indexed + (fn + (i c) + (hk-test (nth c 0) (nth prefix-results i) (nth c 2))) + prefix-cases) + +{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails} diff --git a/plans/search-on-sx.md b/plans/search-on-sx.md index 7324db74..2f440bd0 100644 --- a/plans/search-on-sx.md +++ b/plans/search-on-sx.md @@ -108,8 +108,19 @@ lib/search/index.sx lib/search/eval.sx - [x] `lib/search/tests/integration.sx` — 21 cases: index merge, cross-peer df/lookup, position preservation, boolean/phrase over the merge, ACL filter + top-N + bm25 +## Extensions (post-roadmap, search-shaped vocabulary) + +- [x] prefix / wildcard queries (`prefixTerms`, `prefixDocs`, `prefixRankTfIdf`) — 14 tests +- [ ] fuzzy matching — edit distance term expansion +- [ ] result pagination (offset / limit) +- [ ] snippet / highlight generation +- [ ] stemming (suffix stripping) — recall-improving normalizer + ## Progress log +- **Extension: prefix/wildcard queries (136/136 total).** `prefixTerms` matches every + indexed term starting with a prefix (via allTerms + isPrefixOf); `prefixDocs` unions + their docs; `prefixRankTfIdf` ranks treating the matched terms as the query. 14 tests. - **Phase 4 complete — federation + ACL (122/122 total). Roadmap done.** `fedIndex` merges per-peer inverted indices (union posting lists per term) after relabelling local DocIds to global `gid = peer*1000 + local` — the bijection makes (peer,doc-id)