search: Phase 3 ranking TF-IDF + BM25 + top-N + 23 tests
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 37s

rankTfIdf and rankBm25 (configurable k1/b) over the candidate set, float scores
with deterministic DocId tiebreak; topNTfIdf/topNBm25. df/idf derived from
posting-list length. Tests cover tf/idf behavior, a BM25-vs-TF-IDF flip from
length-norm + tf-saturation, the b-parameter effect, tiebreak stability. 101/101.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-06 19:56:50 +00:00
parent 4c84decc01
commit a3f9d4f6c9
7 changed files with 132 additions and 14 deletions

View File

@@ -2,7 +2,8 @@
;; Tests and callers concatenate `search/src` with their own top-level bindings
;; (e.g. "result = lookupTerm \"cat\" idx\n") and evaluate via the haskell-on-sx
;; interpreter. Public Haskell entry points: indexDoc, lookupTerm, deleteDoc,
;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery, searchQuery.
;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery, searchQuery,
;; rankTfIdf, rankBm25, topNTfIdf, topNBm25.
(define
search/src
@@ -13,4 +14,6 @@
"\n"
search/query-src
"\n"
search/parse-src))
search/parse-src
"\n"
search/rank-src))

View File

@@ -22,6 +22,7 @@ PRELOADS=(
lib/search/index.sx
lib/search/query.sx
lib/search/parse.sx
lib/search/rank.sx
lib/search/api.sx
lib/search/testlib.sx
)
@@ -30,4 +31,5 @@ SUITES=(
"index:lib/search/tests/index.sx"
"boolean:lib/search/tests/boolean.sx"
"parse:lib/search/tests/parse.sx"
"rank:lib/search/tests/rank.sx"
)

14
lib/search/rank.sx Normal file
View File

@@ -0,0 +1,14 @@
;; search ranking — Haskell source fragment. Depends on tokenize + index + query.
;; Ranked retrieval over the candidate set (docs containing any query term).
;; Scores are floats; ties broken by DocId ascending (deterministic).
;; numDocs :: Index -> Int
;; docFreq :: Term -> Index -> Int (from index)
;; docLen :: DocId -> Index -> Int
;; rankTfIdf :: [Term] -> Index -> [DocId]
;; topNTfIdf :: Int -> [Term] -> Index -> [DocId]
;; rankBm25 :: Float -> Float -> [Term] -> Index -> [DocId] (k1, b)
;; topNBm25 :: Int -> Float -> Float -> [Term] -> Index -> [DocId]
(define
search/rank-src
"numDocs idx = length (allDocs idx)\ntfIn t d idx = length (posIn t d idx)\nqIdf n df = if df == 0 then 0 else log (n / df)\nidf t idx = qIdf (numDocs idx) (docFreq t idx)\ntermScoreTf idx d t = tfIn t d idx * idf t idx\ntfidfDoc ts idx d = sum (map (termScoreTf idx d) ts)\ncandStep idx acc t = sortedUnion acc (docsWith t idx)\ncandDocs ts idx = foldl (candStep idx) [] ts\ncmpScore p1 p2 = if fst p1 > fst p2 then LT else if fst p1 < fst p2 then GT else compare (snd p1) (snd p2)\nmkPair f ts idx d = (f ts idx d, d)\nrankWith f ts idx = map snd (sortBy cmpScore (map (mkPair f ts idx) (candDocs ts idx)))\nrankTfIdf ts idx = rankWith tfidfDoc ts idx\ntopNTfIdf n ts idx = take n (rankTfIdf ts idx)\ntfAt d idx t = tfIn t d idx\ndocLen d idx = sum (map (tfAt d idx) (allTerms idx))\nlenAt idx d = docLen d idx\navgDocLen idx = sum (map (lenAt idx) (allDocs idx)) / numDocs idx\nbm25idf t idx = log ((numDocs idx - docFreq t idx + 0.5) / (docFreq t idx + 0.5) + 1)\nbm25Term k1 b avgdl idx d t = bm25idf t idx * (tfIn t d idx * (k1 + 1)) / (tfIn t d idx + k1 * (1 - b + b * docLen d idx / avgdl))\nbm25Doc k1 b ts idx d = sum (map (bm25Term k1 b (avgDocLen idx) idx d) ts)\nrankBm25 k1 b ts idx = rankWith (bm25Doc k1 b) ts idx\ntopNBm25 n k1 b ts idx = take n (rankBm25 k1 b ts idx)\n")

View File

@@ -1,12 +1,13 @@
{
"lang": "search",
"total_passed": 78,
"total_passed": 101,
"total_failed": 0,
"total": 78,
"total": 101,
"suites": [
{"name":"index","passed":18,"failed":0,"total":18},
{"name":"boolean","passed":28,"failed":0,"total":28},
{"name":"parse","passed":32,"failed":0,"total":32}
{"name":"parse","passed":32,"failed":0,"total":32},
{"name":"rank","passed":23,"failed":0,"total":23}
],
"generated": "2026-06-06T19:42:39+00:00"
"generated": "2026-06-06T19:56:08+00:00"
}

View File

@@ -1,9 +1,10 @@
# search scoreboard
**78 / 78 passing** (0 failure(s)).
**101 / 101 passing** (0 failure(s)).
| Suite | Passed | Total | Status |
|-------|--------|-------|--------|
| index | 18 | 18 | ok |
| boolean | 28 | 28 | ok |
| parse | 32 | 32 | ok |
| rank | 23 | 23 | ok |

90
lib/search/tests/rank.sx Normal file
View File

@@ -0,0 +1,90 @@
;; Phase 3 — ranking (TF-IDF, BM25, top-N). Deterministic: ties broken by DocId.
;; Corpora:
;; idx1: 1 "alpha alpha alpha gamma" 2 "alpha" 3 "beta"
;; idx2: 1 "cat" 2 "cat cat dog elephant frog grape" 3 "zzz"
;; idx3: 1 "kite" 2 "kite" (identical docs -> tiebreak)
(define
rank-setup
"idx1 = indexDoc 3 \"beta\" (indexDoc 2 \"alpha\" (indexDoc 1 \"alpha alpha alpha gamma\" emptyIndex))\nidx2 = indexDoc 3 \"zzz\" (indexDoc 2 \"cat cat dog elephant frog grape\" (indexDoc 1 \"cat\" emptyIndex))\nidx3 = indexDoc 2 \"kite\" (indexDoc 1 \"kite\" emptyIndex)\n")
(define
rank-cases
(list
(list
"tfidf tf ordering"
"rankTfIdf [\"alpha\"] idx1"
(list 1 2))
(list
"tfidf rare term boosts"
"rankTfIdf [\"alpha\", \"beta\"] idx1"
(list 1 3 2))
(list
"tfidf single-doc term"
"rankTfIdf [\"gamma\"] idx1"
(list 1))
(list "tfidf absent term empty" "rankTfIdf [\"nope\"] idx1" (list))
(list "tfidf empty query empty" "rankTfIdf [] idx1" (list))
(list
"tfidf candidate union tie by docid"
"rankTfIdf [\"beta\", \"gamma\"] idx1"
(list 1 3))
(list
"tfidf tf ordering idx2"
"rankTfIdf [\"cat\"] idx2"
(list 2 1))
(list "topN tfidf 1" "topNTfIdf 1 [\"alpha\"] idx1" (list 1))
(list
"topN tfidf 2"
"topNTfIdf 2 [\"alpha\", \"beta\"] idx1"
(list 1 3))
(list
"topN exceeds results"
"topNTfIdf 10 [\"gamma\"] idx1"
(list 1))
(list "topN zero" "topNTfIdf 0 [\"alpha\"] idx1" (list))
(list
"bm25 tf+length flips tfidf"
"rankBm25 1.5 0.75 [\"cat\"] idx2"
(list 1 2))
(list
"bm25 b=0 ignores length"
"rankBm25 1.5 0.0 [\"cat\"] idx2"
(list 2 1))
(list
"bm25 alpha idx1"
"rankBm25 1.5 0.75 [\"alpha\"] idx1"
(list 1 2))
(list "bm25 absent empty" "rankBm25 1.5 0.75 [\"nope\"] idx1" (list))
(list
"bm25 single-doc term"
"rankBm25 1.5 0.75 [\"gamma\"] idx1"
(list 1))
(list "bm25 topN 1" "topNBm25 1 1.5 0.75 [\"cat\"] idx2" (list 1))
(list
"bm25 same candidate set"
"sort (rankBm25 1.5 0.75 [\"alpha\", \"beta\"] idx1)"
(list 1 2 3))
(list
"tfidf stable tiebreak"
"rankTfIdf [\"kite\"] idx3"
(list 1 2))
(list
"bm25 stable tiebreak"
"rankBm25 1.5 0.75 [\"kite\"] idx3"
(list 1 2))
(list "numDocs" "[numDocs idx1]" (list 3))
(list "docLen counts tokens" "[docLen 1 idx1]" (list 4))
(list "docFreq via index" "[docFreq \"alpha\" idx1]" (list 2))))
(define
rank-results
(search-batch rank-setup (map (fn (c) (nth c 1)) rank-cases)))
(map-indexed
(fn
(i c)
(hk-test (nth c 0) (nth rank-results i) (nth c 2)))
rank-cases)
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}

View File

@@ -10,7 +10,7 @@ extension that merges per-peer indices.
## Status (rolling)
`bash lib/search/conformance.sh`**78/78** (Phases 12 complete)
`bash lib/search/conformance.sh`**101/101** (Phases 13 complete)
## Ground rules
@@ -89,12 +89,13 @@ lib/search/index.sx lib/search/eval.sx
## Phase 3 — Ranking
- [ ] document frequency tracking — extend index with `df` per term
- [ ] TF-IDF scoring
- [ ] BM25 scoring (configurable k1, b)
- [ ] top-N retrieval (heap-based)
- [ ] `lib/search/tests/rank.sx` — 20+ cases: TF-IDF behavior, BM25 vs TF-IDF,
ranking stability, top-N correctness
- [x] document frequency — `docFreq`/`idf`/`bm25idf` derived from the index
(posting-list length); no separate df store needed
- [x] TF-IDF scoring (`rankTfIdf`)
- [x] BM25 scoring, configurable k1/b (`rankBm25 k1 b`)
- [x] top-N retrieval (`topNTfIdf`/`topNBm25` — sortBy + take; stable DocId tiebreak)
- [x] `lib/search/tests/rank.sx` — 23 cases: TF-IDF tf/idf behavior, BM25 length-norm
+ tf-saturation flips vs TF-IDF, b-parameter effect, tiebreak stability, top-N
## Phase 4 — ACL filter + federation
@@ -105,6 +106,12 @@ lib/search/index.sx lib/search/eval.sx
## Progress log
- **Phase 3 complete — ranking (101/101 total).** TF-IDF (`rankTfIdf`) and BM25
(`rankBm25 k1 b`) over the candidate set (docs containing any query term), scores
as floats with deterministic DocId-ascending tiebreak; `topNTfIdf`/`topNBm25` via
sortBy+take. df/idf derived from posting-list length (no separate df store). 23
tests incl. a BM25-vs-TF-IDF flip (length-norm + tf-saturation) and the b-parameter
effect. Float division/`log`/float literals all work in haskell-on-sx.
- **Phase 2 complete — parser (78/78 total).** Query tokenizer (ord-based
delimiters, quoted phrases) + recursive-descent parser with OR<AND<NOT precedence,
implicit AND on adjacency, parens, case-insensitive keywords. `parseQuery`,