diff --git a/lib/search/api.sx b/lib/search/api.sx index 14ba1774..29f445af 100644 --- a/lib/search/api.sx +++ b/lib/search/api.sx @@ -7,7 +7,8 @@ ;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf, ;; paginate, pageTfIdf, pageBm25, resultCount, editDist, fuzzyTerms, fuzzyDocs, ;; fuzzyRankTfIdf, highlight, snippet, stem, stemText, stemTokens, indexStemmed, -;; nearDocs, expandTerm, synDocs, synRankTfIdf. +;; nearDocs, expandTerm, synDocs, synRankTfIdf, queryTerms, searchRankTfIdf, +;; searchRankBm25. (define search/src @@ -36,4 +37,6 @@ "\n" search/near-src "\n" - search/syn-src)) + search/syn-src + "\n" + search/rankq-src)) diff --git a/lib/search/conformance.conf b/lib/search/conformance.conf index 96d38540..9c7b006e 100644 --- a/lib/search/conformance.conf +++ b/lib/search/conformance.conf @@ -31,6 +31,7 @@ PRELOADS=( lib/search/stem.sx lib/search/near.sx lib/search/syn.sx + lib/search/rankq.sx lib/search/api.sx lib/search/testlib.sx ) @@ -48,4 +49,5 @@ SUITES=( "stem:lib/search/tests/stem.sx" "near:lib/search/tests/near.sx" "syn:lib/search/tests/syn.sx" + "rankq:lib/search/tests/rankq.sx" ) diff --git a/lib/search/rankq.sx b/lib/search/rankq.sx new file mode 100644 index 00000000..77b70468 --- /dev/null +++ b/lib/search/rankq.sx @@ -0,0 +1,11 @@ +;; search boolean-filtered ranked search — Haskell source fragment. +;; Depends on parse (parseQuery/Query), query (evalQuery), rank (tfidfDoc/bm25Doc/ +;; cmpScore). Filters by the boolean query, then ranks the surviving docs by +;; relevance over the query's leaf terms — the real-world filter-then-rank pattern. 
+;; queryTerms :: Query -> [Term] -- leaf terms in left-to-right order; terms under Not and a Phrase's term list are included, duplicates are kept +;; searchRankTfIdf :: String -> Index -> [DocId] -- query string, index; parses, filters with evalQuery, orders the matches by tfidfDoc over queryTerms using cmpScore +;; searchRankBm25 :: Float -> Float -> String -> Index -> [DocId] -- k1, b, query string, index; same pipeline scored with bm25Doc k1 b + +(define + search/rankq-src + "queryTerms (Term t) = [t]\nqueryTerms (And a b) = queryTerms a ++ queryTerms b\nqueryTerms (Or a b) = queryTerms a ++ queryTerms b\nqueryTerms (Not a) = queryTerms a\nqueryTerms (Phrase ts) = ts\nmkSubPair f terms idx d = (f terms idx d, d)\nrankSubsetWith f terms docs idx = map snd (sortBy cmpScore (map (mkSubPair f terms idx) docs))\nsearchRankTfIdf s idx = let q = parseQuery s in rankSubsetWith tfidfDoc (queryTerms q) (evalQuery idx q) idx\nsearchRankBm25 k1 b s idx = let q = parseQuery s in rankSubsetWith (bm25Doc k1 b) (queryTerms q) (evalQuery idx q) idx\n") diff --git a/lib/search/scoreboard.json b/lib/search/scoreboard.json index 6f965309..3ea5b5ee 100644 --- a/lib/search/scoreboard.json +++ b/lib/search/scoreboard.json @@ -1,8 +1,8 @@ { "lang": "search", - "total_passed": 214, + "total_passed": 225, "total_failed": 0, - "total": 214, + "total": 225, "suites": [ {"name":"index","passed":18,"failed":0,"total":18}, {"name":"boolean","passed":28,"failed":0,"total":28}, @@ -15,7 +15,8 @@ {"name":"highlight","passed":12,"failed":0,"total":12}, {"name":"stem","passed":18,"failed":0,"total":18}, {"name":"near","passed":9,"failed":0,"total":9}, - {"name":"syn","passed":9,"failed":0,"total":9} + {"name":"syn","passed":9,"failed":0,"total":9}, + {"name":"rankq","passed":11,"failed":0,"total":11} ], - "generated": "2026-06-06T23:25:35+00:00" + "generated": "2026-06-06T23:58:05+00:00" } diff --git a/lib/search/scoreboard.md b/lib/search/scoreboard.md index 0f54edbb..2cc7fd9c 100644 --- a/lib/search/scoreboard.md +++ b/lib/search/scoreboard.md @@ -1,6 +1,6 @@ # search scoreboard -**214 / 214 passing** (0 failure(s)). +**225 / 225 passing** (0 failure(s)).
| Suite | Passed | Total | Status | |-------|--------|-------|--------| @@ -16,3 +16,4 @@ | stem | 18 | 18 | ok | | near | 9 | 9 | ok | | syn | 9 | 9 | ok | +| rankq | 11 | 11 | ok | diff --git a/lib/search/tests/rankq.sx b/lib/search/tests/rankq.sx new file mode 100644 index 00000000..dd360310 --- /dev/null +++ b/lib/search/tests/rankq.sx @@ -0,0 +1,67 @@ +;; Extension — boolean-filtered ranked search (filter then rank by relevance). Each case below is (label, expression string passed to search-batch, expected value). +;; Corpus (doc id, text, per-term occurrence counts): +;; 1 "apple apple banana" apple2 banana1 +;; 2 "apple cherry" apple1 cherry1 +;; 3 "banana cherry" banana1 cherry1 +;; 4 "apple banana cherry" apple1 banana1 cherry1 + +(define + rankq-setup + "idx = indexDoc 4 \"apple banana cherry\" (indexDoc 3 \"banana cherry\" (indexDoc 2 \"apple cherry\" (indexDoc 1 \"apple apple banana\" emptyIndex)))\n") + +(define + rankq-cases + (list + (list + "queryTerms and" + "queryTerms (parseQuery \"apple AND banana\")" + (list "apple" "banana")) + (list + "queryTerms or not" + "queryTerms (parseQuery \"a OR NOT b\")" + (list "a" "b")) + (list + "queryTerms phrase" + "queryTerms (parseQuery \"\\\"x y\\\" OR z\")" + (list "x" "y" "z")) + (list + "and filter ranked by tf" + "searchRankTfIdf \"apple AND banana\" idx" + (list 1 4)) + (list + "single term ranked tie" + "searchRankTfIdf \"cherry\" idx" + (list 2 3 4)) + (list + "or filter ranked" + "searchRankTfIdf \"apple OR banana\" idx" + (list 1 4 2 3)) + (list + "and-not narrows then ranks" + "searchRankTfIdf \"apple AND NOT banana\" idx" + (list 2)) + (list + "phrase filter ranked" + "searchRankTfIdf \"\\\"apple banana\\\"\" idx" + (list 1 4)) + (list "no matches" "searchRankTfIdf \"zzz\" idx" (list)) + (list + "bm25 boolean ranked subset" + "sort (searchRankBm25 1.5 0.75 \"apple OR banana\" idx)" + (list 1 2 3 4)) + (list + "bm25 and filter" + "searchRankBm25 1.5 0.75 \"apple AND NOT banana\" idx" + (list 2)))) + +(define + rankq-results + (search-batch rankq-setup (map (fn (c) (nth c 1)) rankq-cases))) + +(map-indexed + (fn + (i c) + 
(hk-test (nth c 0) (nth rankq-results i) (nth c 2))) + rankq-cases) + +{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails} diff --git a/plans/search-on-sx.md b/plans/search-on-sx.md index 2e62c53b..cf8c530a 100644 --- a/plans/search-on-sx.md +++ b/plans/search-on-sx.md @@ -120,9 +120,15 @@ lib/search/index.sx lib/search/eval.sx — 18 tests - [x] proximity / NEAR — `nearDocs k t1 t2` (unordered, within k positions) — 9 tests - [x] synonym / query expansion — `expandTerm`, `synDocs`, `synRankTfIdf` — 9 tests +- [x] boolean-filtered ranked search — `queryTerms`, `searchRankTfIdf`, + `searchRankBm25` (filter by boolean query, rank survivors by relevance) — 11 tests ## Progress log +- **Extension: boolean-filtered ranked search (225/225 total).** `searchRankTfIdf`/ + `searchRankBm25` parse a boolean query, filter docs via evalQuery, then rank the + survivors by relevance over the query's leaf terms (`queryTerms`) — the real-world + filter-then-rank pattern. 11 tests. - **Extension: synonyms/query expansion (214/214 total).** A synonym map `[(Term,[Term])]` expands a query term to itself + synonyms (`expandTerm`); `synDocs` unions, `synRankTfIdf` ranks the expanded set. 9 tests.