From 77ab827b91d29c9bec992bedd4c7b8e23252103e Mon Sep 17 00:00:00 2001 From: giles Date: Sat, 6 Jun 2026 20:08:08 +0000 Subject: [PATCH] search: Phase 4 federation merge + ACL post-filter + 21 tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fedIndex merges per-peer inverted indices (union posting lists per term) after relabelling local DocIds to global gid = peer*1000 + local — dedupe by (peer,doc-id) is automatic and positions survive, so ranking runs once over the merge and interleaves peers by score. ACL is a post-rank filter over an injected permit predicate (searchTfIdfAcl/topNTfIdfAcl/searchBm25Acl). Roadmap complete, 122/122. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/search/api.sx | 7 ++- lib/search/conformance.conf | 2 + lib/search/fed.sx | 16 +++++ lib/search/scoreboard.json | 9 +-- lib/search/scoreboard.md | 3 +- lib/search/tests/integration.sx | 102 ++++++++++++++++++++++++++++++++ plans/search-on-sx.md | 21 +++++-- 7 files changed, 148 insertions(+), 12 deletions(-) create mode 100644 lib/search/fed.sx create mode 100644 lib/search/tests/integration.sx diff --git a/lib/search/api.sx b/lib/search/api.sx index 2eaeac96..a9a3fe12 100644 --- a/lib/search/api.sx +++ b/lib/search/api.sx @@ -3,7 +3,8 @@ ;; (e.g. "result = lookupTerm \"cat\" idx\n") and evaluate via the haskell-on-sx ;; interpreter. Public Haskell entry points: indexDoc, lookupTerm, deleteDoc, ;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery, searchQuery, -;; rankTfIdf, rankBm25, topNTfIdf, topNBm25. +;; rankTfIdf, rankBm25, topNTfIdf, topNBm25, fedIndex, aclFilter, searchTfIdfAcl, +;; topNTfIdfAcl, searchBm25Acl. 
(define search/src @@ -16,4 +17,6 @@ "\n" search/parse-src "\n" - search/rank-src)) + search/rank-src + "\n" + search/fed-src)) diff --git a/lib/search/conformance.conf b/lib/search/conformance.conf index 9793c9cc..b2ef2f74 100644 --- a/lib/search/conformance.conf +++ b/lib/search/conformance.conf @@ -23,6 +23,7 @@ PRELOADS=( lib/search/query.sx lib/search/parse.sx lib/search/rank.sx + lib/search/fed.sx lib/search/api.sx lib/search/testlib.sx ) @@ -32,4 +33,5 @@ SUITES=( "boolean:lib/search/tests/boolean.sx" "parse:lib/search/tests/parse.sx" "rank:lib/search/tests/rank.sx" + "integration:lib/search/tests/integration.sx" ) diff --git a/lib/search/fed.sx b/lib/search/fed.sx new file mode 100644 index 00000000..36b59462 --- /dev/null +++ b/lib/search/fed.sx @@ -0,0 +1,16 @@ +;; search federation + ACL — Haskell source fragment. Depends on index + rank. +;; Federation merges per-peer INDICES (not ranked results): each peer's local +;; DocIds are relabelled to global ids `gid peer local = peer*1000 + local` +;; (dedupe by (peer,doc-id) is guaranteed only for 0 <= local < 1000), then posting lists +;; are unioned per term. Ranking then runs once over the merged index, which is +;; rank-correct. ACL is a post-rank filter: an injected `permit :: DocId -> Bool` +;; predicate (viewer baked in by the caller) — never baked into the index. 
+;; fedIndex :: [(PeerId, Index)] -> Index (on equal gids the newly merged posting replaces the existing one) +;; aclFilter :: (DocId -> Bool) -> [DocId] -> [DocId] +;; searchTfIdfAcl :: (DocId -> Bool) -> [Term] -> Index -> [DocId] +;; topNTfIdfAcl :: Int -> (DocId -> Bool) -> [Term] -> Index -> [DocId] +;; searchBm25Acl :: (DocId -> Bool) -> Float -> Float -> [Term] -> Index -> [DocId] + +(define + search/fed-src + "gid peer local = peer * 1000 + local\nfedRelabelPosting peer p = (gid peer (fst p), snd p)\nfedRelabelEntry peer e = (fst e, map (fedRelabelPosting peer) (snd e))\nfedRelabelIndex peer idx = map (fedRelabelEntry peer) idx\nfedInsP p [] = [p]\nfedInsP p (q:qs) = if fst p < fst q then p : q : qs else if fst p == fst q then p : qs else q : fedInsP p qs\nfedMergePL a b = foldr fedInsP b a\nfedInsTerm t pl [] = [(t, pl)]\nfedInsTerm t pl (x:xs) = if t < fst x then (t, pl) : x : xs else if t == fst x then (fst x, fedMergePL pl (snd x)) : xs else x : fedInsTerm t pl xs\nfedMergeEntry idx e = fedInsTerm (fst e) (snd e) idx\nfedMergeTwo a b = foldl fedMergeEntry a b\nfedAddPeer acc pair = fedMergeTwo acc (fedRelabelIndex (fst pair) (snd pair))\nfedIndex pairs = foldl fedAddPeer emptyIndex pairs\naclFilter permit docs = filter permit docs\nsearchTfIdfAcl permit ts idx = aclFilter permit (rankTfIdf ts idx)\ntopNTfIdfAcl n permit ts idx = take n (aclFilter permit (rankTfIdf ts idx))\nsearchBm25Acl permit k1 b ts idx = aclFilter permit (rankBm25 k1 b ts idx)\n") diff --git a/lib/search/scoreboard.json b/lib/search/scoreboard.json index eb9509f9..d1cb07da 100644 --- a/lib/search/scoreboard.json +++ b/lib/search/scoreboard.json @@ -1,13 +1,14 @@ { "lang": "search", - "total_passed": 101, + "total_passed": 122, "total_failed": 0, - "total": 101, + "total": 122, "suites": [ {"name":"index","passed":18,"failed":0,"total":18}, {"name":"boolean","passed":28,"failed":0,"total":28}, {"name":"parse","passed":32,"failed":0,"total":32}, - {"name":"rank","passed":23,"failed":0,"total":23} + {"name":"rank","passed":23,"failed":0,"total":23}, + 
{"name":"integration","passed":21,"failed":0,"total":21} ], - "generated": "2026-06-06T19:56:08+00:00" + "generated": "2026-06-06T20:07:30+00:00" } diff --git a/lib/search/scoreboard.md b/lib/search/scoreboard.md index 747a4d04..03a1d66c 100644 --- a/lib/search/scoreboard.md +++ b/lib/search/scoreboard.md @@ -1,6 +1,6 @@ # search scoreboard -**101 / 101 passing** (0 failure(s)). +**122 / 122 passing** (0 failure(s)). | Suite | Passed | Total | Status | |-------|--------|-------|--------| @@ -8,3 +8,4 @@ | boolean | 28 | 28 | ok | | parse | 32 | 32 | ok | | rank | 23 | 23 | ok | +| integration | 21 | 21 | ok | diff --git a/lib/search/tests/integration.sx b/lib/search/tests/integration.sx new file mode 100644 index 00000000..8c10685e --- /dev/null +++ b/lib/search/tests/integration.sx @@ -0,0 +1,102 @@ +;; Phase 4 — federation (merge per-peer indices) + ACL post-filter. +;; Peers (global id = peer*1000 + local): +;; peer 1: 1 "alpha beta" 2 "alpha gamma" -> 1001 1002 +;; peer 2: 1 "alpha delta" 2 "beta gamma" -> 2001 2002 +;; ACL predicates are injected (viewer baked in by the caller), applied post-rank. 
+ +(define + fed-setup + "p1 = indexDoc 2 \"alpha gamma\" (indexDoc 1 \"alpha beta\" emptyIndex)\np2 = indexDoc 2 \"beta gamma\" (indexDoc 1 \"alpha delta\" emptyIndex)\nfed = fedIndex [(1, p1), (2, p2)]\npermitP1 g = g < 2000\npermitNone g = False\npermitList g = elem g [1002, 2001]\n") + +(define + fed-cases + (list + (list + "fed merges all docs" + "sort (allDocs fed)" + (list 1001 1002 2001 2002)) + (list + "fed docFreq across peers" + "[docFreq \"alpha\" fed]" + (list 3)) + (list "fed docFreq beta" "[docFreq \"beta\" fed]" (list 2)) + (list "fed numDocs" "[numDocs fed]" (list 4)) + (list + "fed term lookup spans peers" + "map fst (lookupTerm \"gamma\" fed)" + (list 1002 2002)) + (list + "fed preserves positions" + "lookupTerm \"beta\" fed" + (list + (list 1001 (list 1)) + (list 2002 (list 0)))) + (list + "fed rank alpha tie by gid" + "rankTfIdf [\"alpha\"] fed" + (list 1001 1002 2001)) + (list + "fed rank beta" + "rankTfIdf [\"beta\"] fed" + (list 1001 2002)) + (list + "fed boolean and" + "searchQuery \"alpha AND beta\" fed" + (list 1001)) + (list + "fed boolean or" + "searchQuery \"delta OR barks\" fed" + (list 2001)) + (list + "fed phrase within peer1" + "searchQuery \"\\\"alpha beta\\\"\" fed" + (list 1001)) + (list + "fed phrase within peer2" + "searchQuery \"\\\"beta gamma\\\"\" fed" + (list 2002)) + (list + "fed phrase peer2 alpha delta" + "searchQuery \"\\\"alpha delta\\\"\" fed" + (list 2001)) + (list "fed empty peer list" "allDocs (fedIndex [])" (list)) + (list + "fed single relabelled peer" + "rankTfIdf [\"alpha\"] (fedIndex [(5, p1)])" + (list 5001 5002)) + (list + "acl peer1 only" + "aclFilter permitP1 (rankTfIdf [\"alpha\"] fed)" + (list 1001 1002)) + (list + "acl allowlist preserves rank order" + "aclFilter permitList (rankTfIdf [\"alpha\"] fed)" + (list 1002 2001)) + (list + "acl topN after filter" + "topNTfIdfAcl 1 permitP1 [\"alpha\"] fed" + (list 1001)) + (list + "acl denies all" + "aclFilter permitNone (rankTfIdf [\"alpha\"] fed)" + (list)) 
+ (list + "acl on bm25" + "searchBm25Acl permitP1 1.5 0.75 [\"alpha\"] fed" + (list 1001 1002)) + (list + "acl end-to-end tfidf" + "searchTfIdfAcl permitP1 [\"alpha\"] fed" + (list 1001 1002)))) + +(define + fed-results + (search-batch fed-setup (map (fn (c) (nth c 1)) fed-cases))) + +(map-indexed + (fn + (i c) + (hk-test (nth c 0) (nth fed-results i) (nth c 2))) + fed-cases) + +{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails} diff --git a/plans/search-on-sx.md b/plans/search-on-sx.md index d1c0689b..7324db74 100644 --- a/plans/search-on-sx.md +++ b/plans/search-on-sx.md @@ -10,7 +10,7 @@ extension that merges per-peer indices. ## Status (rolling) -`bash lib/search/conformance.sh` → **101/101** (Phases 1–3 complete) +`bash lib/search/conformance.sh` → **122/122** (Phases 1–4 complete) ## Ground rules @@ -99,13 +99,24 @@ lib/search/index.sx lib/search/eval.sx ## Phase 4 — ACL filter + federation -- [ ] post-filter — each candidate result tested via `(acl/permit? viewer :read doc)` -- [ ] federated query — fan out to peer instances via fed-sx, merge results -- [ ] merge policy — interleave by rank, dedupe by `(peer, doc-id)` -- [ ] `lib/search/tests/integration.sx` — federated search with ACL filter +- [x] post-filter — `aclFilter`/`searchTfIdfAcl`/`topNTfIdfAcl`/`searchBm25Acl` take an + injected `permit :: DocId -> Bool` predicate, applied post-rank (never in the index) +- [x] federated query — `fedIndex :: [(PeerId, Index)] -> Index` merges per-peer + inverted indices (union posting lists per term); rank/search run once over the merge +- [x] merge policy — relabel local DocIds to global `gid = peer*1000 + local` + (injective for 0 ≤ local < 1000 ⇒ dedupe by (peer,doc-id) is automatic); ranking interleaves peers by score +- [x] `lib/search/tests/integration.sx` — 21 cases: index merge, cross-peer df/lookup, + position preservation, boolean/phrase over the merge, ACL filter + top-N + bm25 ## Progress log +- **Phase 4 complete — federation + ACL (122/122 total). 
Roadmap done.** `fedIndex` + merges per-peer inverted indices (union posting lists per term) after relabelling + local DocIds to global `gid = peer*1000 + local` — injective while 0 ≤ local < 1000, which makes (peer,doc-id) + dedupe automatic and keeps positions, so ranking runs once over the merge and + interleaves peers by score (rank-correct). ACL is a post-rank `filter` over an + injected `permit :: DocId -> Bool` (viewer baked in by the caller) — never in the + index; `searchTfIdfAcl`/`topNTfIdfAcl`/`searchBm25Acl`. 21 integration tests. - **Phase 3 complete — ranking (101/101 total).** TF-IDF (`rankTfIdf`) and BM25 (`rankBm25 k1 b`) over the candidate set (docs containing any query term), scores as floats with deterministic DocId-ascending tiebreak; `topNTfIdf`/`topNBm25` via