Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 37s
rankTfIdf and rankBm25 (configurable k1/b) over the candidate set, float scores with deterministic DocId tiebreak; topNTfIdf/topNBm25. df/idf derived from posting-list length. Tests cover tf/idf behavior, a BM25-vs-TF-IDF flip from length-norm + tf-saturation, the b-parameter effect, tiebreak stability. 101/101. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
15 lines
1.9 KiB
Plaintext
15 lines
1.9 KiB
Plaintext
;; search ranking — Haskell source fragment. Depends on tokenize + index + query.
|
|
;; Ranked retrieval over the candidate set (docs containing any query term).
|
|
;; Scores are floats; ties broken by DocId ascending (deterministic).
|
|
;; numDocs :: Index -> Int
|
|
;; docFreq :: Term -> Index -> Int (from index)
|
|
;; docLen :: DocId -> Index -> Int
|
|
;; rankTfIdf :: [Term] -> Index -> [DocId]
|
|
;; topNTfIdf :: Int -> [Term] -> Index -> [DocId]
|
|
;; rankBm25 :: Float -> Float -> [Term] -> Index -> [DocId] (k1, b)
|
|
;; topNBm25 :: Int -> Float -> Float -> [Term] -> Index -> [DocId]
|
|
|
|
(define
|
|
search/rank-src
|
|
"numDocs idx = length (allDocs idx)\ntfIn t d idx = length (posIn t d idx)\nqIdf n df = if df == 0 then 0 else log (n / df)\nidf t idx = qIdf (numDocs idx) (docFreq t idx)\ntermScoreTf idx d t = tfIn t d idx * idf t idx\ntfidfDoc ts idx d = sum (map (termScoreTf idx d) ts)\ncandStep idx acc t = sortedUnion acc (docsWith t idx)\ncandDocs ts idx = foldl (candStep idx) [] ts\ncmpScore p1 p2 = if fst p1 > fst p2 then LT else if fst p1 < fst p2 then GT else compare (snd p1) (snd p2)\nmkPair f ts idx d = (f ts idx d, d)\nrankWith f ts idx = map snd (sortBy cmpScore (map (mkPair f ts idx) (candDocs ts idx)))\nrankTfIdf ts idx = rankWith tfidfDoc ts idx\ntopNTfIdf n ts idx = take n (rankTfIdf ts idx)\ntfAt d idx t = tfIn t d idx\ndocLen d idx = sum (map (tfAt d idx) (allTerms idx))\nlenAt idx d = docLen d idx\navgDocLen idx = sum (map (lenAt idx) (allDocs idx)) / numDocs idx\nbm25idf t idx = log ((numDocs idx - docFreq t idx + 0.5) / (docFreq t idx + 0.5) + 1)\nbm25Term k1 b avgdl idx d t = bm25idf t idx * (tfIn t d idx * (k1 + 1)) / (tfIn t d idx + k1 * (1 - b + b * docLen d idx / avgdl))\nbm25Doc k1 b ts idx d = sum (map (bm25Term k1 b (avgDocLen idx) idx d) ts)\nrankBm25 k1 b ts idx = rankWith (bm25Doc k1 b) ts idx\ntopNBm25 n k1 b ts idx = take n (rankBm25 k1 b ts idx)\n")
|