search: boolean-filtered ranked search + 11 tests

searchRankTfIdf/searchRankBm25 parse a boolean query, filter docs via evalQuery, then rank the survivors by relevance over the query's leaf terms (queryTerms) — the filter-then-rank pattern. Suite total: 225/225 passing.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 23:58:37 +00:00
parent cfa68c3db3
commit db2a5dc6ab
7 changed files with 98 additions and 7 deletions
--- a/lib/search/api.sx
+++ b/lib/search/api.sx
@@ -7,7 +7,8 @@
 ;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf,
 ;; paginate, pageTfIdf, pageBm25, resultCount, editDist, fuzzyTerms, fuzzyDocs,
 ;; fuzzyRankTfIdf, highlight, snippet, stem, stemText, stemTokens, indexStemmed,
-;; nearDocs, expandTerm, synDocs, synRankTfIdf.
+;; nearDocs, expandTerm, synDocs, synRankTfIdf, queryTerms, searchRankTfIdf,
+;; searchRankBm25.

 (define
  search/src
@@ -36,4 +37,6 @@
    "\n"
    search/near-src
    "\n"
-    search/syn-src))
+    search/syn-src
+    "\n"
+    search/rankq-src))
--- a/lib/search/conformance.conf
+++ b/lib/search/conformance.conf
@@ -31,6 +31,7 @@ PRELOADS=(
  lib/search/stem.sx
  lib/search/near.sx
  lib/search/syn.sx
+  lib/search/rankq.sx
  lib/search/api.sx
  lib/search/testlib.sx
 )
@@ -48,4 +49,5 @@ SUITES=(
  "stem:lib/search/tests/stem.sx"
  "near:lib/search/tests/near.sx"
  "syn:lib/search/tests/syn.sx"
+  "rankq:lib/search/tests/rankq.sx"
 )
--- a/lib/search/rankq.sx
+++ b/lib/search/rankq.sx
@@ -0,0 +1,11 @@
+;; search boolean-filtered ranked search — Haskell source fragment.
+;; Depends on parse (parseQuery/Query), query (evalQuery), rank (tfidfDoc/bm25Doc/
+;; cmpScore). Filters docs with evalQuery, then sorts the survivors with cmpScore on
+;; scores over the query's leaf terms (duplicates kept; Not and Phrase terms included).
+;;   queryTerms       :: Query -> [Term]
+;;   searchRankTfIdf  :: String -> Index -> [DocId]
+;;   searchRankBm25   :: Float -> Float -> String -> Index -> [DocId]
+
+(define
+  search/rankq-src
+  "queryTerms (Term t) = [t]\nqueryTerms (And a b) = queryTerms a ++ queryTerms b\nqueryTerms (Or a b) = queryTerms a ++ queryTerms b\nqueryTerms (Not a) = queryTerms a\nqueryTerms (Phrase ts) = ts\nmkSubPair f terms idx d = (f terms idx d, d)\nrankSubsetWith f terms docs idx = map snd (sortBy cmpScore (map (mkSubPair f terms idx) docs))\nsearchRankTfIdf s idx = let q = parseQuery s in rankSubsetWith tfidfDoc (queryTerms q) (evalQuery idx q) idx\nsearchRankBm25 k1 b s idx = let q = parseQuery s in rankSubsetWith (bm25Doc k1 b) (queryTerms q) (evalQuery idx q) idx\n")
--- a/lib/search/scoreboard.json
+++ b/lib/search/scoreboard.json
@@ -1,8 +1,8 @@
 {
  "lang": "search",
-  "total_passed": 214,
+  "total_passed": 225,
  "total_failed": 0,
-  "total": 214,
+  "total": 225,
  "suites": [
    {"name":"index","passed":18,"failed":0,"total":18},
    {"name":"boolean","passed":28,"failed":0,"total":28},
@@ -15,7 +15,8 @@
    {"name":"highlight","passed":12,"failed":0,"total":12},
    {"name":"stem","passed":18,"failed":0,"total":18},
    {"name":"near","passed":9,"failed":0,"total":9},
-    {"name":"syn","passed":9,"failed":0,"total":9}
+    {"name":"syn","passed":9,"failed":0,"total":9},
+    {"name":"rankq","passed":11,"failed":0,"total":11}
  ],
-  "generated": "2026-06-06T23:25:35+00:00"
+  "generated": "2026-06-06T23:58:05+00:00"
 }
--- a/lib/search/scoreboard.md
+++ b/lib/search/scoreboard.md
@@ -1,6 +1,6 @@
 # search scoreboard

-**214 / 214 passing** (0 failure(s)).
+**225 / 225 passing** (0 failure(s)).

 | Suite | Passed | Total | Status |
 |-------|--------|-------|--------|
@@ -16,3 +16,4 @@
 | stem | 18 | 18 | ok |
 | near | 9 | 9 | ok |
 | syn | 9 | 9 | ok |
+| rankq | 11 | 11 | ok |
--- a/lib/search/tests/rankq.sx
+++ b/lib/search/tests/rankq.sx
@@ -0,0 +1,67 @@
+;; Extension — boolean-filtered ranked search (filter then rank by relevance).
+;; Corpus bound to idx by rankq-setup (doc id, text, then per-term occurrence counts):
+;;   1 "apple apple banana"   apple2 banana1
+;;   2 "apple cherry"         apple1 cherry1
+;;   3 "banana cherry"        banana1 cherry1
+;;   4 "apple banana cherry"  apple1 banana1 cherry1
+
+(define
+  rankq-setup
+  "idx = indexDoc 4 \"apple banana cherry\" (indexDoc 3 \"banana cherry\" (indexDoc 2 \"apple cherry\" (indexDoc 1 \"apple apple banana\" emptyIndex)))\n")
+;; Test table: each case is (test-name haskell-expression expected-value).
+(define
+  rankq-cases
+  (list
+    (list
+      "queryTerms and"
+      "queryTerms (parseQuery \"apple AND banana\")"
+      (list "apple" "banana"))
+    (list
+      "queryTerms or not"
+      "queryTerms (parseQuery \"a OR NOT b\")"
+      (list "a" "b"))
+    (list
+      "queryTerms phrase"
+      "queryTerms (parseQuery \"\\\"x y\\\" OR z\")"
+      (list "x" "y" "z"))
+    (list
+      "and filter ranked by tf"
+      "searchRankTfIdf \"apple AND banana\" idx"
+      (list 1 4))
+    (list
+      "single term ranked tie"
+      "searchRankTfIdf \"cherry\" idx"
+      (list 2 3 4))
+    (list
+      "or filter ranked"
+      "searchRankTfIdf \"apple OR banana\" idx"
+      (list 1 4 2 3))
+    (list
+      "and-not narrows then ranks"
+      "searchRankTfIdf \"apple AND NOT banana\" idx"
+      (list 2))
+    (list
+      "phrase filter ranked"
+      "searchRankTfIdf \"\\\"apple banana\\\"\" idx"
+      (list 1 4))
+    (list "no matches" "searchRankTfIdf \"zzz\" idx" (list))
+    (list
+      "bm25 boolean ranked subset"
+      "sort (searchRankBm25 1.5 0.75 \"apple OR banana\" idx)"
+      (list 1 2 3 4))
+    (list
+      "bm25 and filter"
+      "searchRankBm25 1.5 0.75 \"apple AND NOT banana\" idx"
+      (list 2))))
+;; Passes rankq-setup plus every case's expression (element 1) to search-batch; presumably one result per expression — confirm in testlib.
+(define
+  rankq-results
+  (search-batch rankq-setup (map (fn (c) (nth c 1)) rankq-cases)))
+;; For case i, calls hk-test with its name, the i-th batch result, and its expected value (presumably records pass/fail — confirm).
+(map-indexed
+  (fn
+    (i c)
+    (hk-test (nth c 0) (nth rankq-results i) (nth c 2)))
+  rankq-cases)
+;; Final value of the file: the hk-test counters/failures as a map (presumably read by the conformance runner — confirm).
+{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
--- a/plans/search-on-sx.md
+++ b/plans/search-on-sx.md
@@ -120,9 +120,15 @@ lib/search/index.sx                     lib/search/eval.sx
  — 18 tests
 - [x] proximity / NEAR — `nearDocs k t1 t2` (unordered, within k positions) — 9 tests
 - [x] synonym / query expansion — `expandTerm`, `synDocs`, `synRankTfIdf` — 9 tests
+- [x] boolean-filtered ranked search — `queryTerms`, `searchRankTfIdf`,
+  `searchRankBm25` (filter by boolean query, rank survivors by relevance) — 11 tests

 ## Progress log

+- **Extension: boolean-filtered ranked search (225/225 total).** `searchRankTfIdf`/
+  `searchRankBm25` parse a boolean query, filter docs via evalQuery, then rank the
+  survivors by relevance over the query's leaf terms (`queryTerms`) — the real-world
+  filter-then-rank pattern. 11 tests.
 - **Extension: synonyms/query expansion (214/214 total).** A synonym map
  `[(Term,[Term])]` expands a query term to itself + synonyms (`expandTerm`); `synDocs`
  unions, `synRankTfIdf` ranks the expanded set. 9 tests.