search: Phase 3 ranking TF-IDF + BM25 + top-N + 23 tests

rankTfIdf and rankBm25 (configurable k1/b) over the candidate set, float scores with deterministic DocId tiebreak; topNTfIdf/topNBm25. df/idf derived from posting-list length. Tests cover tf/idf behavior, a BM25-vs-TF-IDF flip from length-norm + tf-saturation, the b-parameter effect, tiebreak stability. 101/101. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 19:56:50 +00:00
parent 4c84decc01
commit a3f9d4f6c9
7 changed files with 132 additions and 14 deletions
--- a/lib/search/api.sx
+++ b/lib/search/api.sx
@@ -2,7 +2,8 @@
 ;; Tests and callers concatenate `search/src` with their own top-level bindings
 ;; (e.g. "result = lookupTerm \"cat\" idx\n") and evaluate via the haskell-on-sx
 ;; interpreter. Public Haskell entry points: indexDoc, lookupTerm, deleteDoc,
-;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery, searchQuery.
+;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery, searchQuery,
+;; rankTfIdf, rankBm25, topNTfIdf, topNBm25.

 (define
  search/src
@@ -13,4 +14,6 @@
    "\n"
    search/query-src
    "\n"
-    search/parse-src))
+    search/parse-src
+    "\n"
+    search/rank-src))
--- a/lib/search/conformance.conf
+++ b/lib/search/conformance.conf
@@ -22,6 +22,7 @@ PRELOADS=(
  lib/search/index.sx
  lib/search/query.sx
  lib/search/parse.sx
+  lib/search/rank.sx
  lib/search/api.sx
  lib/search/testlib.sx
 )
@@ -30,4 +31,5 @@ SUITES=(
  "index:lib/search/tests/index.sx"
  "boolean:lib/search/tests/boolean.sx"
  "parse:lib/search/tests/parse.sx"
+  "rank:lib/search/tests/rank.sx"
 )
--- a/lib/search/rank.sx
+++ b/lib/search/rank.sx
@@ -0,0 +1,14 @@
+;; search ranking — Haskell source fragment. Depends on tokenize + index + query.
+;; Ranked retrieval over the candidate set (docs containing any query term).
+;; Scores are floats; ties broken by DocId ascending (deterministic).
+;; BM25: rankBm25 passes avgDocLen (0 on an empty index) in once per query and docLen once per candidate doc.
+;;   numDocs    :: Index -> Int
+;;   docFreq    :: Term -> Index -> Int           (from index)
+;;   docLen     :: DocId -> Index -> Int
+;;   rankTfIdf  :: [Term] -> Index -> [DocId]
+;;   topNTfIdf  :: Int -> [Term] -> Index -> [DocId]
+;;   rankBm25   :: Float -> Float -> [Term] -> Index -> [DocId]   (k1, b)
+;;   topNBm25   :: Int -> Float -> Float -> [Term] -> Index -> [DocId]
+(define
+  search/rank-src
+  "numDocs idx = length (allDocs idx)\ntfIn t d idx = length (posIn t d idx)\nqIdf n df = if df == 0 then 0 else log (n / df)\nidf t idx = qIdf (numDocs idx) (docFreq t idx)\ntermScoreTf idx d t = tfIn t d idx * idf t idx\ntfidfDoc ts idx d = sum (map (termScoreTf idx d) ts)\ncandStep idx acc t = sortedUnion acc (docsWith t idx)\ncandDocs ts idx = foldl (candStep idx) [] ts\ncmpScore p1 p2 = if fst p1 > fst p2 then LT else if fst p1 < fst p2 then GT else compare (snd p1) (snd p2)\nmkPair f ts idx d = (f ts idx d, d)\nrankWith f ts idx = map snd (sortBy cmpScore (map (mkPair f ts idx) (candDocs ts idx)))\nrankTfIdf ts idx = rankWith tfidfDoc ts idx\ntopNTfIdf n ts idx = take n (rankTfIdf ts idx)\ntfAt d idx t = tfIn t d idx\ndocLen d idx = sum (map (tfAt d idx) (allTerms idx))\nlenAt idx d = docLen d idx\navgDocLen idx = if numDocs idx == 0 then 0 else (sum (map (lenAt idx) (allDocs idx)) / numDocs idx)\nbm25idf t idx = log ((numDocs idx - docFreq t idx + 0.5) / (docFreq t idx + 0.5) + 1)\nbm25TermDl k1 b avgdl dl idx d t = bm25idf t idx * (tfIn t d idx * (k1 + 1)) / (tfIn t d idx + k1 * (1 - b + b * dl / avgdl))\nbm25Term k1 b avgdl idx d t = bm25TermDl k1 b avgdl (docLen d idx) idx d t\nbm25DocAvg k1 b avgdl ts idx d = sum (map (bm25TermDl k1 b avgdl (docLen d idx) idx d) ts)\nbm25Doc k1 b ts idx d = bm25DocAvg k1 b (avgDocLen idx) ts idx d\nrankBm25 k1 b ts idx = rankWith (bm25DocAvg k1 b (avgDocLen idx)) ts idx\ntopNBm25 n k1 b ts idx = take n (rankBm25 k1 b ts idx)\n")
--- a/lib/search/scoreboard.json
+++ b/lib/search/scoreboard.json
@@ -1,12 +1,13 @@
 {
  "lang": "search",
-  "total_passed": 78,
+  "total_passed": 101,
  "total_failed": 0,
-  "total": 78,
+  "total": 101,
  "suites": [
    {"name":"index","passed":18,"failed":0,"total":18},
    {"name":"boolean","passed":28,"failed":0,"total":28},
-    {"name":"parse","passed":32,"failed":0,"total":32}
+    {"name":"parse","passed":32,"failed":0,"total":32},
+    {"name":"rank","passed":23,"failed":0,"total":23}
  ],
-  "generated": "2026-06-06T19:42:39+00:00"
+  "generated": "2026-06-06T19:56:08+00:00"
 }
--- a/lib/search/scoreboard.md
+++ b/lib/search/scoreboard.md
@@ -1,9 +1,10 @@
 # search scoreboard

-**78 / 78 passing** (0 failure(s)).
+**101 / 101 passing** (0 failure(s)).

 | Suite | Passed | Total | Status |
 |-------|--------|-------|--------|
 | index | 18 | 18 | ok |
 | boolean | 28 | 28 | ok |
 | parse | 32 | 32 | ok |
+| rank | 23 | 23 | ok |
--- a/lib/search/tests/rank.sx
+++ b/lib/search/tests/rank.sx
@@ -0,0 +1,90 @@
+;; Phase 3 — ranking (TF-IDF, BM25, top-N). Deterministic: ties broken by DocId.
+;; Corpora (DocId "text"), built by rank-setup below:
+;;   idx1: 1 "alpha alpha alpha gamma"  2 "alpha"  3 "beta"
+;;   idx2: 1 "cat"  2 "cat cat dog elephant frog grape"  3 "zzz"
+;;   idx3: 1 "kite"  2 "kite"   (identical docs -> tiebreak)
+;; rank-setup: Haskell bindings for idx1..idx3, handed to search-batch with the case expressions.
+(define
+  rank-setup
+  "idx1 = indexDoc 3 \"beta\" (indexDoc 2 \"alpha\" (indexDoc 1 \"alpha alpha alpha gamma\" emptyIndex))\nidx2 = indexDoc 3 \"zzz\" (indexDoc 2 \"cat cat dog elephant frog grape\" (indexDoc 1 \"cat\" emptyIndex))\nidx3 = indexDoc 2 \"kite\" (indexDoc 1 \"kite\" emptyIndex)\n")
+
+(define
+  rank-cases
+  (list ;; each case: (name, Haskell expression, expected result list)
+    (list ;; alpha: tf 3 in doc 1 vs tf 1 in doc 2, same idf
+      "tfidf tf ordering"
+      "rankTfIdf [\"alpha\"] idx1"
+      (list 1 2))
+    (list ;; at tf 1, doc 3 (beta, df 1) outranks doc 2 (alpha, df 2)
+      "tfidf rare term boosts"
+      "rankTfIdf [\"alpha\", \"beta\"] idx1"
+      (list 1 3 2))
+    (list
+      "tfidf single-doc term"
+      "rankTfIdf [\"gamma\"] idx1"
+      (list 1))
+    (list "tfidf absent term empty" "rankTfIdf [\"nope\"] idx1" (list))
+    (list "tfidf empty query empty" "rankTfIdf [] idx1" (list)) ;; no terms -> empty candidate set
+    (list ;; docs 1 and 3 each match one df-1 term once -> equal scores, DocId order
+      "tfidf candidate union tie by docid"
+      "rankTfIdf [\"beta\", \"gamma\"] idx1"
+      (list 1 3))
+    (list ;; tf 2 (doc 2) beats tf 1 (doc 1); tfidfDoc has no length normalisation
+      "tfidf tf ordering idx2"
+      "rankTfIdf [\"cat\"] idx2"
+      (list 2 1))
+    (list "topN tfidf 1" "topNTfIdf 1 [\"alpha\"] idx1" (list 1))
+    (list
+      "topN tfidf 2"
+      "topNTfIdf 2 [\"alpha\", \"beta\"] idx1"
+      (list 1 3))
+    (list
+      "topN exceeds results"
+      "topNTfIdf 10 [\"gamma\"] idx1"
+      (list 1))
+    (list "topN zero" "topNTfIdf 0 [\"alpha\"] idx1" (list))
+    (list ;; same query as "tfidf tf ordering idx2": doc 2 is 6 tokens vs avgdl 8/3 (assuming one token per word), so doc 1 wins
+      "bm25 tf+length flips tfidf"
+      "rankBm25 1.5 0.75 [\"cat\"] idx2"
+      (list 1 2))
+    (list ;; b = 0 drops the length term (1 - b + b * len / avgdl = 1), so tf 2 wins again
+      "bm25 b=0 ignores length"
+      "rankBm25 1.5 0.0 [\"cat\"] idx2"
+      (list 2 1))
+    (list
+      "bm25 alpha idx1"
+      "rankBm25 1.5 0.75 [\"alpha\"] idx1"
+      (list 1 2))
+    (list "bm25 absent empty" "rankBm25 1.5 0.75 [\"nope\"] idx1" (list))
+    (list
+      "bm25 single-doc term"
+      "rankBm25 1.5 0.75 [\"gamma\"] idx1"
+      (list 1))
+    (list "bm25 topN 1" "topNBm25 1 1.5 0.75 [\"cat\"] idx2" (list 1))
+    (list ;; result is sorted, so this checks candidate membership only, not BM25 order
+      "bm25 same candidate set"
+      "sort (rankBm25 1.5 0.75 [\"alpha\", \"beta\"] idx1)"
+      (list 1 2 3))
+    (list ;; kite is in both docs: idf = log (2 / 2) = 0, so both scores are 0
+      "tfidf stable tiebreak"
+      "rankTfIdf [\"kite\"] idx3"
+      (list 1 2))
+    (list ;; identical tf and length -> equal BM25 scores, DocId order
+      "bm25 stable tiebreak"
+      "rankBm25 1.5 0.75 [\"kite\"] idx3"
+      (list 1 2))
+    (list "numDocs" "[numDocs idx1]" (list 3))
+    (list "docLen counts tokens" "[docLen 1 idx1]" (list 4))
+    (list "docFreq via index" "[docFreq \"alpha\" idx1]" (list 2))))
+
+(define
+  rank-results
+  (search-batch rank-setup (map (fn (tc) (nth tc 1)) rank-cases)))
+;; Check result k against the expected list of case k (name, expression, expected).
+(map-indexed
+  (fn
+    (k tc)
+    (hk-test (nth tc 0) (nth rank-results k) (nth tc 2)))
+  rank-cases)
+;; Last form: map exposing the hk-test-* bindings.
+{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
--- a/plans/search-on-sx.md
+++ b/plans/search-on-sx.md
@@ -10,7 +10,7 @@ extension that merges per-peer indices.

 ## Status (rolling)

-`bash lib/search/conformance.sh` → **78/78** (Phases 1–2 complete)
+`bash lib/search/conformance.sh` → **101/101** (Phases 1–3 complete)

 ## Ground rules

@@ -89,12 +89,13 @@ lib/search/index.sx                     lib/search/eval.sx

 ## Phase 3 — Ranking

- [ ] document frequency tracking — extend index with `df` per term
- [ ] TF-IDF scoring
- [ ] BM25 scoring (configurable k1, b)
- [ ] top-N retrieval (heap-based)
- [ ] `lib/search/tests/rank.sx` — 20+ cases: TF-IDF behavior, BM25 vs TF-IDF,
-  ranking stability, top-N correctness
+- [x] document frequency — `docFreq`/`idf`/`bm25idf` derived from the index
+  (posting-list length); no separate df store needed
+- [x] TF-IDF scoring (`rankTfIdf`)
+- [x] BM25 scoring, configurable k1/b (`rankBm25 k1 b`)
+- [x] top-N retrieval (`topNTfIdf`/`topNBm25` — sortBy + take; stable DocId tiebreak)
+- [x] `lib/search/tests/rank.sx` — 23 cases: TF-IDF tf/idf behavior, BM25 length-norm
+  and tf-saturation flips vs TF-IDF, b-parameter effect, tiebreak stability, top-N

 ## Phase 4 — ACL filter + federation

@@ -105,6 +106,12 @@ lib/search/index.sx                     lib/search/eval.sx

 ## Progress log

+- **Phase 3 complete — ranking (101/101 total).** TF-IDF (`rankTfIdf`) and BM25
+  (`rankBm25 k1 b`) over the candidate set (docs containing any query term), scores
+  as floats with deterministic DocId-ascending tiebreak; `topNTfIdf`/`topNBm25` via
+  sortBy+take. df/idf derived from posting-list length (no separate df store). 23
+  tests incl. a BM25-vs-TF-IDF flip (length-norm + tf-saturation) and the b-parameter
+  effect. Float division/`log`/float literals all work in haskell-on-sx.
 - **Phase 2 complete — parser (78/78 total).** Query tokenizer (ord-based
  delimiters, quoted phrases) + recursive-descent parser with OR<AND<NOT precedence,
  implicit AND on adjacency, parens, case-insensitive keywords. `parseQuery`,