From a3f9d4f6c90e66d6efd282e726eb5297d6328e44 Mon Sep 17 00:00:00 2001 From: giles Date: Sat, 6 Jun 2026 19:56:50 +0000 Subject: [PATCH] search: Phase 3 ranking TF-IDF + BM25 + top-N + 23 tests rankTfIdf and rankBm25 (configurable k1/b) over the candidate set, float scores with deterministic DocId tiebreak; topNTfIdf/topNBm25. df/idf derived from posting-list length. Tests cover tf/idf behavior, a BM25-vs-TF-IDF flip from length-norm + tf-saturation, the b-parameter effect, tiebreak stability. 101/101. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/search/api.sx | 7 ++- lib/search/conformance.conf | 2 + lib/search/rank.sx | 14 ++++++ lib/search/scoreboard.json | 9 ++-- lib/search/scoreboard.md | 3 +- lib/search/tests/rank.sx | 90 +++++++++++++++++++++++++++++++++++++ plans/search-on-sx.md | 21 ++++++--- 7 files changed, 132 insertions(+), 14 deletions(-) create mode 100644 lib/search/rank.sx create mode 100644 lib/search/tests/rank.sx diff --git a/lib/search/api.sx b/lib/search/api.sx index 5a275f4d..2eaeac96 100644 --- a/lib/search/api.sx +++ b/lib/search/api.sx @@ -2,7 +2,8 @@ ;; Tests and callers concatenate `search/src` with their own top-level bindings ;; (e.g. "result = lookupTerm \"cat\" idx\n") and evaluate via the haskell-on-sx ;; interpreter. Public Haskell entry points: indexDoc, lookupTerm, deleteDoc, -;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery, searchQuery. +;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery, searchQuery, +;; rankTfIdf, rankBm25, topNTfIdf, topNBm25. (define search/src @@ -13,4 +14,6 @@ "\n" search/query-src "\n" - search/parse-src)) + search/parse-src + "\n" + search/rank-src)) diff --git a/lib/search/conformance.conf b/lib/search/conformance.conf index 6e9e8309..9793c9cc 100644 --- a/lib/search/conformance.conf +++ b/lib/search/conformance.conf @@ -22,6 +22,7 @@ PRELOADS=( lib/search/index.sx lib/search/query.sx lib/search/parse.sx + lib/search/rank.sx lib/search/api.sx lib/search/testlib.sx ) @@ -30,4 +31,5 @@ SUITES=( "index:lib/search/tests/index.sx" "boolean:lib/search/tests/boolean.sx" "parse:lib/search/tests/parse.sx" + "rank:lib/search/tests/rank.sx" ) diff --git a/lib/search/rank.sx b/lib/search/rank.sx new file mode 100644 index 00000000..efe40bb5 --- /dev/null +++ b/lib/search/rank.sx @@ -0,0 +1,14 @@ +;; search ranking — Haskell source fragment. Depends on tokenize + index + query. +;; Ranked retrieval over the candidate set (docs containing any query term). +;; Scores are floats; ties broken by DocId ascending (deterministic). +;; numDocs :: Index -> Int +;; docFreq :: Term -> Index -> Int (from index) +;; docLen :: DocId -> Index -> Int +;; rankTfIdf :: [Term] -> Index -> [DocId] +;; topNTfIdf :: Int -> [Term] -> Index -> [DocId] +;; rankBm25 :: Float -> Float -> [Term] -> Index -> [DocId] (k1, b) +;; topNBm25 :: Int -> Float -> Float -> [Term] -> Index -> [DocId] + +(define + search/rank-src + "numDocs idx = length (allDocs idx)\ntfIn t d idx = length (posIn t d idx)\nqIdf n df = if df == 0 then 0 else log (n / df)\nidf t idx = qIdf (numDocs idx) (docFreq t idx)\ntermScoreTf idx d t = tfIn t d idx * idf t idx\ntfidfDoc ts idx d = sum (map (termScoreTf idx d) ts)\ncandStep idx acc t = sortedUnion acc (docsWith t idx)\ncandDocs ts idx = foldl (candStep idx) [] ts\ncmpScore p1 p2 = if fst p1 > fst p2 then LT else if fst p1 < fst p2 then GT else compare (snd p1) (snd p2)\nmkPair f ts idx d = (f ts idx d, d)\nrankWith f ts idx = map snd (sortBy cmpScore (map (mkPair f ts idx) (candDocs ts idx)))\nrankTfIdf ts idx = rankWith tfidfDoc ts idx\ntopNTfIdf n ts idx = take n (rankTfIdf ts idx)\ntfAt d idx t = tfIn t d idx\ndocLen d idx = sum (map (tfAt d idx) (allTerms idx))\nlenAt idx d = docLen d idx\navgDocLen idx = sum (map (lenAt idx) (allDocs idx)) / numDocs idx\nbm25idf t idx = log ((numDocs idx - docFreq t idx + 0.5) / (docFreq t idx + 0.5) + 1)\nbm25Term k1 b avgdl idx d t = bm25idf t idx * (tfIn t d idx * (k1 + 1)) / (tfIn t d idx + k1 * (1 - b + b * docLen d idx / avgdl))\nbm25Doc k1 b ts idx d = sum (map (bm25Term k1 b (avgDocLen idx) idx d) ts)\nrankBm25 k1 b ts idx = rankWith (bm25Doc k1 b) ts idx\ntopNBm25 n k1 b ts idx = take n (rankBm25 k1 b ts idx)\n") diff --git a/lib/search/scoreboard.json b/lib/search/scoreboard.json index 4aab2a38..eb9509f9 100644 --- a/lib/search/scoreboard.json +++ b/lib/search/scoreboard.json @@ -1,12 +1,13 @@ { "lang": "search", - "total_passed": 78, + "total_passed": 101, "total_failed": 0, - "total": 78, + "total": 101, "suites": [ {"name":"index","passed":18,"failed":0,"total":18}, {"name":"boolean","passed":28,"failed":0,"total":28}, - {"name":"parse","passed":32,"failed":0,"total":32} + {"name":"parse","passed":32,"failed":0,"total":32}, + {"name":"rank","passed":23,"failed":0,"total":23} ], - "generated": "2026-06-06T19:42:39+00:00" + "generated": "2026-06-06T19:56:08+00:00" } diff --git a/lib/search/scoreboard.md b/lib/search/scoreboard.md index 0a71fd42..747a4d04 100644 --- a/lib/search/scoreboard.md +++ b/lib/search/scoreboard.md @@ -1,9 +1,10 @@ # search scoreboard -**78 / 78 passing** (0 failure(s)). +**101 / 101 passing** (0 failure(s)). | Suite | Passed | Total | Status | |-------|--------|-------|--------| | index | 18 | 18 | ok | | boolean | 28 | 28 | ok | | parse | 32 | 32 | ok | +| rank | 23 | 23 | ok | diff --git a/lib/search/tests/rank.sx b/lib/search/tests/rank.sx new file mode 100644 index 00000000..6200106f --- /dev/null +++ b/lib/search/tests/rank.sx @@ -0,0 +1,90 @@ +;; Phase 3 — ranking (TF-IDF, BM25, top-N). Deterministic: ties broken by DocId. +;; Corpora: +;; idx1: 1 "alpha alpha alpha gamma" 2 "alpha" 3 "beta" +;; idx2: 1 "cat" 2 "cat cat dog elephant frog grape" 3 "zzz" +;; idx3: 1 "kite" 2 "kite" (identical docs -> tiebreak) + +(define + rank-setup + "idx1 = indexDoc 3 \"beta\" (indexDoc 2 \"alpha\" (indexDoc 1 \"alpha alpha alpha gamma\" emptyIndex))\nidx2 = indexDoc 3 \"zzz\" (indexDoc 2 \"cat cat dog elephant frog grape\" (indexDoc 1 \"cat\" emptyIndex))\nidx3 = indexDoc 2 \"kite\" (indexDoc 1 \"kite\" emptyIndex)\n") + +(define + rank-cases + (list + (list + "tfidf tf ordering" + "rankTfIdf [\"alpha\"] idx1" + (list 1 2)) + (list + "tfidf rare term boosts" + "rankTfIdf [\"alpha\", \"beta\"] idx1" + (list 1 3 2)) + (list + "tfidf single-doc term" + "rankTfIdf [\"gamma\"] idx1" + (list 1)) + (list "tfidf absent term empty" "rankTfIdf [\"nope\"] idx1" (list)) + (list "tfidf empty query empty" "rankTfIdf [] idx1" (list)) + (list + "tfidf candidate union tie by docid" + "rankTfIdf [\"beta\", \"gamma\"] idx1" + (list 1 3)) + (list + "tfidf tf ordering idx2" + "rankTfIdf [\"cat\"] idx2" + (list 2 1)) + (list "topN tfidf 1" "topNTfIdf 1 [\"alpha\"] idx1" (list 1)) + (list + "topN tfidf 2" + "topNTfIdf 2 [\"alpha\", \"beta\"] idx1" + (list 1 3)) + (list + "topN exceeds results" + "topNTfIdf 10 [\"gamma\"] idx1" + (list 1)) + (list "topN zero" "topNTfIdf 0 [\"alpha\"] idx1" (list)) + (list + "bm25 tf+length flips tfidf" + "rankBm25 1.5 0.75 [\"cat\"] idx2" + (list 1 2)) + (list + "bm25 b=0 ignores length" + "rankBm25 1.5 0.0 [\"cat\"] idx2" + (list 2 1)) + (list + "bm25 alpha idx1" + "rankBm25 1.5 0.75 [\"alpha\"] idx1" + (list 1 2)) + (list "bm25 absent empty" "rankBm25 1.5 0.75 [\"nope\"] idx1" (list)) + (list + "bm25 single-doc term" + "rankBm25 1.5 0.75 [\"gamma\"] idx1" + (list 1)) + (list "bm25 topN 1" "topNBm25 1 1.5 0.75 [\"cat\"] idx2" (list 1)) + (list + "bm25 same candidate set" + "sort (rankBm25 1.5 0.75 [\"alpha\", \"beta\"] idx1)" + (list 1 2 3)) + (list + "tfidf stable tiebreak" + "rankTfIdf [\"kite\"] idx3" + (list 1 2)) + (list + "bm25 stable tiebreak" + "rankBm25 1.5 0.75 [\"kite\"] idx3" + (list 1 2)) + (list "numDocs" "[numDocs idx1]" (list 3)) + (list "docLen counts tokens" "[docLen 1 idx1]" (list 4)) + (list "docFreq via index" "[docFreq \"alpha\" idx1]" (list 2)))) + +(define + rank-results + (search-batch rank-setup (map (fn (c) (nth c 1)) rank-cases))) + +(map-indexed + (fn + (i c) + (hk-test (nth c 0) (nth rank-results i) (nth c 2))) + rank-cases) + +{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails} diff --git a/plans/search-on-sx.md b/plans/search-on-sx.md index 1ebb57b6..d1c0689b 100644 --- a/plans/search-on-sx.md +++ b/plans/search-on-sx.md @@ -10,7 +10,7 @@ extension that merges per-peer indices. ## Status (rolling) -`bash lib/search/conformance.sh` → **78/78** (Phases 1–2 complete) +`bash lib/search/conformance.sh` → **101/101** (Phases 1–3 complete) ## Ground rules @@ -89,12 +89,13 @@ lib/search/index.sx lib/search/eval.sx ## Phase 3 — Ranking -- [ ] document frequency tracking — extend index with `df` per term -- [ ] TF-IDF scoring -- [ ] BM25 scoring (configurable k1, b) -- [ ] top-N retrieval (heap-based) -- [ ] `lib/search/tests/rank.sx` — 20+ cases: TF-IDF behavior, BM25 vs TF-IDF, - ranking stability, top-N correctness +- [x] document frequency — `docFreq`/`idf`/`bm25idf` derived from the index + (posting-list length); no separate df store needed +- [x] TF-IDF scoring (`rankTfIdf`) +- [x] BM25 scoring, configurable k1/b (`rankBm25 k1 b`) +- [x] top-N retrieval (`topNTfIdf`/`topNBm25` — sortBy + take; stable DocId tiebreak) +- [x] `lib/search/tests/rank.sx` — 23 cases: TF-IDF tf/idf behavior, BM25 length-norm + + tf-saturation flips vs TF-IDF, b-parameter effect, tiebreak stability, top-N ## Phase 4 — ACL filter + federation @@ -105,6 +106,12 @@ lib/search/index.sx lib/search/eval.sx ## Progress log +- **Phase 3 complete — ranking (101/101 total).** TF-IDF (`rankTfIdf`) and BM25 + (`rankBm25 k1 b`) over the candidate set (docs containing any query term), scores + as floats with deterministic DocId-ascending tiebreak; `topNTfIdf`/`topNBm25` via + sortBy+take. df/idf derived from posting-list length (no separate df store). 23 + tests incl. a BM25-vs-TF-IDF flip (length-norm + tf-saturation) and the b-parameter + effect. Float division/`log`/float literals all work in haskell-on-sx. - **Phase 2 complete — parser (78/78 total).** Query tokenizer (ord-based delimiters, quoted phrases) + recursive-descent parser with OR