search: Phase 3 ranking TF-IDF + BM25 + top-N + 23 tests
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 37s
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 37s
rankTfIdf and rankBm25 (configurable k1/b) over the candidate set, float scores with deterministic DocId tiebreak; topNTfIdf/topNBm25. df/idf derived from posting-list length. Tests cover tf/idf behavior, a BM25-vs-TF-IDF flip from length-norm + tf-saturation, the b-parameter effect, tiebreak stability. 101/101. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,7 +2,8 @@
|
||||
;; Tests and callers concatenate `search/src` with their own top-level bindings
|
||||
;; (e.g. "result = lookupTerm \"cat\" idx\n") and evaluate via the haskell-on-sx
|
||||
;; interpreter. Public Haskell entry points: indexDoc, lookupTerm, deleteDoc,
|
||||
;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery, searchQuery.
|
||||
;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery, searchQuery,
|
||||
;; rankTfIdf, rankBm25, topNTfIdf, topNBm25.
|
||||
|
||||
(define
|
||||
search/src
|
||||
@@ -13,4 +14,6 @@
|
||||
"\n"
|
||||
search/query-src
|
||||
"\n"
|
||||
search/parse-src))
|
||||
search/parse-src
|
||||
"\n"
|
||||
search/rank-src))
|
||||
|
||||
@@ -22,6 +22,7 @@ PRELOADS=(
|
||||
lib/search/index.sx
|
||||
lib/search/query.sx
|
||||
lib/search/parse.sx
|
||||
lib/search/rank.sx
|
||||
lib/search/api.sx
|
||||
lib/search/testlib.sx
|
||||
)
|
||||
@@ -30,4 +31,5 @@ SUITES=(
|
||||
"index:lib/search/tests/index.sx"
|
||||
"boolean:lib/search/tests/boolean.sx"
|
||||
"parse:lib/search/tests/parse.sx"
|
||||
"rank:lib/search/tests/rank.sx"
|
||||
)
|
||||
|
||||
14
lib/search/rank.sx
Normal file
14
lib/search/rank.sx
Normal file
@@ -0,0 +1,14 @@
|
||||
;; search ranking — Haskell source fragment. Depends on tokenize + index + query.
|
||||
;; Ranked retrieval over the candidate set (docs containing any query term).
|
||||
;; Scores are floats; ties broken by DocId ascending (deterministic).
|
||||
;; numDocs :: Index -> Int
|
||||
;; docFreq :: Term -> Index -> Int (from index)
|
||||
;; docLen :: DocId -> Index -> Int
|
||||
;; rankTfIdf :: [Term] -> Index -> [DocId]
|
||||
;; topNTfIdf :: Int -> [Term] -> Index -> [DocId]
|
||||
;; rankBm25 :: Float -> Float -> [Term] -> Index -> [DocId] (k1, b)
|
||||
;; topNBm25 :: Int -> Float -> Float -> [Term] -> Index -> [DocId]
|
||||
|
||||
;; search/rank-src — the ranking layer, held as one string of Haskell source.
;; Bindings defined by the string, in order:
;;   numDocs, tfIn          length of allDocs idx; length of posIn t d idx.
;;   qIdf, idf              log (n / df), or 0 when df == 0; idf feeds it numDocs and docFreq.
;;   termScoreTf, tfidfDoc  tf * idf for one term; summed over the query terms for one doc.
;;   candStep, candDocs     fold sortedUnion over docsWith t idx for every query term.
;;   cmpScore               pairs are (score, doc): the larger score sorts first, equal
;;                          scores fall back to compare on the second component.
;;   mkPair, rankWith       score each candidate, sortBy cmpScore, keep the second components.
;;   rankTfIdf, topNTfIdf   rankWith tfidfDoc; take n of that ranking.
;;   tfAt, docLen           docLen sums tfIn over allTerms idx for one doc.
;;   lenAt, avgDocLen       sum of docLen over allDocs idx divided by numDocs idx.
;;   bm25idf                log ((N - df + 0.5) / (df + 0.5) + 1).
;;   bm25Term, bm25Doc      idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * docLen / avgdl)),
;;                          summed over the query terms.
;;   rankBm25, topNBm25     rankWith (bm25Doc k1 b); take n of that ranking.
;; NOTE(review): avgDocLen divides by numDocs idx with no zero guard; it is only
;; referenced from bm25Doc, i.e. per candidate doc — confirm an empty index can
;; never force it in haskell-on-sx.
;; NOTE(review): bm25Doc passes (avgDocLen idx) for every scored doc and bm25Term
;; calls docLen for every term — confirm whether the interpreter shares or
;; recomputes these.
(define
|
||||
search/rank-src
|
||||
"numDocs idx = length (allDocs idx)\ntfIn t d idx = length (posIn t d idx)\nqIdf n df = if df == 0 then 0 else log (n / df)\nidf t idx = qIdf (numDocs idx) (docFreq t idx)\ntermScoreTf idx d t = tfIn t d idx * idf t idx\ntfidfDoc ts idx d = sum (map (termScoreTf idx d) ts)\ncandStep idx acc t = sortedUnion acc (docsWith t idx)\ncandDocs ts idx = foldl (candStep idx) [] ts\ncmpScore p1 p2 = if fst p1 > fst p2 then LT else if fst p1 < fst p2 then GT else compare (snd p1) (snd p2)\nmkPair f ts idx d = (f ts idx d, d)\nrankWith f ts idx = map snd (sortBy cmpScore (map (mkPair f ts idx) (candDocs ts idx)))\nrankTfIdf ts idx = rankWith tfidfDoc ts idx\ntopNTfIdf n ts idx = take n (rankTfIdf ts idx)\ntfAt d idx t = tfIn t d idx\ndocLen d idx = sum (map (tfAt d idx) (allTerms idx))\nlenAt idx d = docLen d idx\navgDocLen idx = sum (map (lenAt idx) (allDocs idx)) / numDocs idx\nbm25idf t idx = log ((numDocs idx - docFreq t idx + 0.5) / (docFreq t idx + 0.5) + 1)\nbm25Term k1 b avgdl idx d t = bm25idf t idx * (tfIn t d idx * (k1 + 1)) / (tfIn t d idx + k1 * (1 - b + b * docLen d idx / avgdl))\nbm25Doc k1 b ts idx d = sum (map (bm25Term k1 b (avgDocLen idx) idx d) ts)\nrankBm25 k1 b ts idx = rankWith (bm25Doc k1 b) ts idx\ntopNBm25 n k1 b ts idx = take n (rankBm25 k1 b ts idx)\n")
|
||||
@@ -1,12 +1,13 @@
|
||||
{
|
||||
"lang": "search",
|
||||
"total_passed": 78,
|
||||
"total_passed": 101,
|
||||
"total_failed": 0,
|
||||
"total": 78,
|
||||
"total": 101,
|
||||
"suites": [
|
||||
{"name":"index","passed":18,"failed":0,"total":18},
|
||||
{"name":"boolean","passed":28,"failed":0,"total":28},
|
||||
{"name":"parse","passed":32,"failed":0,"total":32}
|
||||
{"name":"parse","passed":32,"failed":0,"total":32},
|
||||
{"name":"rank","passed":23,"failed":0,"total":23}
|
||||
],
|
||||
"generated": "2026-06-06T19:42:39+00:00"
|
||||
"generated": "2026-06-06T19:56:08+00:00"
|
||||
}
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
# search scoreboard
|
||||
|
||||
**78 / 78 passing** (0 failure(s)).
|
||||
**101 / 101 passing** (0 failure(s)).
|
||||
|
||||
| Suite | Passed | Total | Status |
|
||||
|-------|--------|-------|--------|
|
||||
| index | 18 | 18 | ok |
|
||||
| boolean | 28 | 28 | ok |
|
||||
| parse | 32 | 32 | ok |
|
||||
| rank | 23 | 23 | ok |
|
||||
|
||||
90
lib/search/tests/rank.sx
Normal file
90
lib/search/tests/rank.sx
Normal file
@@ -0,0 +1,90 @@
|
||||
;; Phase 3 — ranking (TF-IDF, BM25, top-N). Deterministic: ties broken by DocId.
|
||||
;; Corpora:
|
||||
;; idx1: 1 "alpha alpha alpha gamma" 2 "alpha" 3 "beta"
|
||||
;; idx2: 1 "cat" 2 "cat cat dog elephant frog grape" 3 "zzz"
|
||||
;; idx3: 1 "kite" 2 "kite" (identical docs -> tiebreak)
|
||||
|
||||
;; rank-setup — Haskell bindings shared by every ranking case, as one source string.
;; It binds idx1, idx2 and idx3, each built by nesting indexDoc calls around
;; emptyIndex (innermost call is doc 1). It is passed as the first argument to
;; search-batch when the cases are evaluated.
(define
|
||||
rank-setup
|
||||
"idx1 = indexDoc 3 \"beta\" (indexDoc 2 \"alpha\" (indexDoc 1 \"alpha alpha alpha gamma\" emptyIndex))\nidx2 = indexDoc 3 \"zzz\" (indexDoc 2 \"cat cat dog elephant frog grape\" (indexDoc 1 \"cat\" emptyIndex))\nidx3 = indexDoc 2 \"kite\" (indexDoc 1 \"kite\" emptyIndex)\n")
|
||||
|
||||
;; rank-cases — table of the 23 ranking test cases. Each entry is a 3-element list:
;;   0: test name
;;   1: Haskell expression (string) that refers to idx1 / idx2 / idx3
;;   2: expected value, written as an SX list
;; The scalar checks at the end (numDocs, docLen, docFreq) wrap their expression
;; in [...] so the expected value is still a one-element list.
(define
|
||||
rank-cases
|
||||
(list
|
||||
(list
|
||||
"tfidf tf ordering"
|
||||
"rankTfIdf [\"alpha\"] idx1"
|
||||
(list 1 2))
|
||||
(list
|
||||
"tfidf rare term boosts"
|
||||
"rankTfIdf [\"alpha\", \"beta\"] idx1"
|
||||
(list 1 3 2))
|
||||
(list
|
||||
"tfidf single-doc term"
|
||||
"rankTfIdf [\"gamma\"] idx1"
|
||||
(list 1))
|
||||
(list "tfidf absent term empty" "rankTfIdf [\"nope\"] idx1" (list))
|
||||
(list "tfidf empty query empty" "rankTfIdf [] idx1" (list))
|
||||
(list
|
||||
"tfidf candidate union tie by docid"
|
||||
"rankTfIdf [\"beta\", \"gamma\"] idx1"
|
||||
(list 1 3))
|
||||
(list
|
||||
"tfidf tf ordering idx2"
|
||||
"rankTfIdf [\"cat\"] idx2"
|
||||
(list 2 1))
|
||||
(list "topN tfidf 1" "topNTfIdf 1 [\"alpha\"] idx1" (list 1))
|
||||
(list
|
||||
"topN tfidf 2"
|
||||
"topNTfIdf 2 [\"alpha\", \"beta\"] idx1"
|
||||
(list 1 3))
|
||||
(list
|
||||
"topN exceeds results"
|
||||
"topNTfIdf 10 [\"gamma\"] idx1"
|
||||
(list 1))
|
||||
(list "topN zero" "topNTfIdf 0 [\"alpha\"] idx1" (list))
|
||||
(list
|
||||
"bm25 tf+length flips tfidf"
|
||||
"rankBm25 1.5 0.75 [\"cat\"] idx2"
|
||||
(list 1 2))
|
||||
(list
|
||||
"bm25 b=0 ignores length"
|
||||
"rankBm25 1.5 0.0 [\"cat\"] idx2"
|
||||
(list 2 1))
|
||||
(list
|
||||
"bm25 alpha idx1"
|
||||
"rankBm25 1.5 0.75 [\"alpha\"] idx1"
|
||||
(list 1 2))
|
||||
(list "bm25 absent empty" "rankBm25 1.5 0.75 [\"nope\"] idx1" (list))
|
||||
(list
|
||||
"bm25 single-doc term"
|
||||
"rankBm25 1.5 0.75 [\"gamma\"] idx1"
|
||||
(list 1))
|
||||
(list "bm25 topN 1" "topNBm25 1 1.5 0.75 [\"cat\"] idx2" (list 1))
|
||||
(list
|
||||
"bm25 same candidate set"
|
||||
"sort (rankBm25 1.5 0.75 [\"alpha\", \"beta\"] idx1)"
|
||||
(list 1 2 3))
|
||||
(list
|
||||
"tfidf stable tiebreak"
|
||||
"rankTfIdf [\"kite\"] idx3"
|
||||
(list 1 2))
|
||||
(list
|
||||
"bm25 stable tiebreak"
|
||||
"rankBm25 1.5 0.75 [\"kite\"] idx3"
|
||||
(list 1 2))
|
||||
(list "numDocs" "[numDocs idx1]" (list 3))
|
||||
(list "docLen counts tokens" "[docLen 1 idx1]" (list 4))
|
||||
(list "docFreq via index" "[docFreq \"alpha\" idx1]" (list 2))))
|
||||
|
||||
;; rank-results — hands search-batch the shared setup source plus the expression
;; string (element 1) of every case. The test driver indexes the result by case
;; position.
;; NOTE(review): presumably search-batch returns one result per expression, in
;; order — confirm in testlib.sx.
(define
  rank-results
  (search-batch
    rank-setup
    (map (fn (test-case) (nth test-case 1)) rank-cases)))
|
||||
|
||||
;; Test driver: for the case at each position, call hk-test with the case name
;; (element 0), the batch result at the same position, and the expected value
;; (element 2).
(map-indexed
  (fn (position test-case)
    (hk-test
      (nth test-case 0)
      (nth rank-results position)
      (nth test-case 2)))
  rank-cases)
|
||||
|
||||
;; Final expression of the suite: a map exposing hk-test-fail, hk-test-pass and
;; hk-test-fails under the keys :fail, :pass and :fails.
;; NOTE(review): presumably these are the tallies hk-test maintains and the
;; value the conformance runner reads — confirm in testlib.sx / conformance.sh.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -10,7 +10,7 @@ extension that merges per-peer indices.
|
||||
|
||||
## Status (rolling)
|
||||
|
||||
`bash lib/search/conformance.sh` → **78/78** (Phases 1–2 complete)
|
||||
`bash lib/search/conformance.sh` → **101/101** (Phases 1–3 complete)
|
||||
|
||||
## Ground rules
|
||||
|
||||
@@ -89,12 +89,13 @@ lib/search/index.sx lib/search/eval.sx
|
||||
|
||||
## Phase 3 — Ranking
|
||||
|
||||
- [ ] document frequency tracking — extend index with `df` per term
|
||||
- [ ] TF-IDF scoring
|
||||
- [ ] BM25 scoring (configurable k1, b)
|
||||
- [ ] top-N retrieval (heap-based)
|
||||
- [ ] `lib/search/tests/rank.sx` — 20+ cases: TF-IDF behavior, BM25 vs TF-IDF,
|
||||
ranking stability, top-N correctness
|
||||
- [x] document frequency — `docFreq`/`idf`/`bm25idf` derived from the index
|
||||
(posting-list length); no separate df store needed
|
||||
- [x] TF-IDF scoring (`rankTfIdf`)
|
||||
- [x] BM25 scoring, configurable k1/b (`rankBm25 k1 b`)
|
||||
- [x] top-N retrieval (`topNTfIdf`/`topNBm25` — sortBy + take; stable DocId tiebreak)
|
||||
- [x] `lib/search/tests/rank.sx` — 23 cases: TF-IDF tf/idf behavior, BM25 length-norm
|
||||
+ tf-saturation flips vs TF-IDF, b-parameter effect, tiebreak stability, top-N
|
||||
|
||||
## Phase 4 — ACL filter + federation
|
||||
|
||||
@@ -105,6 +106,12 @@ lib/search/index.sx lib/search/eval.sx
|
||||
|
||||
## Progress log
|
||||
|
||||
- **Phase 3 complete — ranking (101/101 total).** TF-IDF (`rankTfIdf`) and BM25
|
||||
(`rankBm25 k1 b`) over the candidate set (docs containing any query term), scores
|
||||
as floats with deterministic DocId-ascending tiebreak; `topNTfIdf`/`topNBm25` via
|
||||
sortBy+take. df/idf derived from posting-list length (no separate df store). 23
|
||||
tests incl. a BM25-vs-TF-IDF flip (length-norm + tf-saturation) and the b-parameter
|
||||
effect. Float division/`log`/float literals all work in haskell-on-sx.
|
||||
- **Phase 2 complete — parser (78/78 total).** Query tokenizer (ord-based
|
||||
delimiters, quoted phrases) + recursive-descent parser with OR<AND<NOT precedence,
|
||||
implicit AND on adjacency, parens, case-insensitive keywords. `parseQuery`,
|
||||
|
||||
Reference in New Issue
Block a user