From cf4e613e433c70ace0cfa4f9858bb09383478ae0 Mon Sep 17 00:00:00 2001 From: giles Date: Sat, 6 Jun 2026 23:01:42 +0000 Subject: [PATCH] search: proximity/NEAR search + 9 tests nearDocs k t1 t2 returns docs where both terms occur within k positions (unordered); candidates from the posting intersection, filtered on positional postings. 205/205. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/search/api.sx | 7 ++++-- lib/search/conformance.conf | 2 ++ lib/search/near.sx | 8 ++++++ lib/search/scoreboard.json | 9 ++++--- lib/search/scoreboard.md | 3 ++- lib/search/tests/near.sx | 49 +++++++++++++++++++++++++++++++++++++ plans/search-on-sx.md | 4 +++ 7 files changed, 75 insertions(+), 7 deletions(-) create mode 100644 lib/search/near.sx create mode 100644 lib/search/tests/near.sx diff --git a/lib/search/api.sx b/lib/search/api.sx index 5ac85924..84f5e943 100644 --- a/lib/search/api.sx +++ b/lib/search/api.sx @@ -6,7 +6,8 @@ ;; rankTfIdf, rankBm25, topNTfIdf, topNBm25, fedIndex, aclFilter, searchTfIdfAcl, ;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf, ;; paginate, pageTfIdf, pageBm25, resultCount, editDist, fuzzyTerms, fuzzyDocs, -;; fuzzyRankTfIdf, highlight, snippet, stem, stemText, stemTokens, indexStemmed. +;; fuzzyRankTfIdf, highlight, snippet, stem, stemText, stemTokens, indexStemmed, +;; nearDocs. (define search/src @@ -31,4 +32,6 @@ "\n" search/highlight-src "\n" - search/stem-src)) + search/stem-src + "\n" + search/near-src)) diff --git a/lib/search/conformance.conf b/lib/search/conformance.conf index 8c5375b7..f92d61f5 100644 --- a/lib/search/conformance.conf +++ b/lib/search/conformance.conf @@ -29,6 +29,7 @@ PRELOADS=( lib/search/fuzzy.sx lib/search/highlight.sx lib/search/stem.sx + lib/search/near.sx lib/search/api.sx lib/search/testlib.sx ) @@ -44,4 +45,5 @@ SUITES=( "fuzzy:lib/search/tests/fuzzy.sx" "highlight:lib/search/tests/highlight.sx" "stem:lib/search/tests/stem.sx" + "near:lib/search/tests/near.sx" ) diff --git a/lib/search/near.sx b/lib/search/near.sx new file mode 100644 index 00000000..93893abc --- /dev/null +++ b/lib/search/near.sx @@ -0,0 +1,8 @@ +;; search proximity (NEAR) — Haskell source fragment. Depends on query (posIn, +;; docsWith, sortedInter). Finds docs where two terms occur within k positions of +;; each other (unordered), using the positional postings. +;; nearDocs :: Int -> Term -> Term -> Index -> [DocId] (sorted) + +(define + search/near-src + "nrAbsDiff a b = if a > b then a - b else b - a\nnrCloseTo k x [] = False\nnrCloseTo k x (y:ys) = if nrAbsDiff x y <= k then True else nrCloseTo k x ys\nnrAnyClose k [] ys = False\nnrAnyClose k (x:xs) ys = if nrCloseTo k x ys then True else nrAnyClose k xs ys\nnearInDoc k t1 t2 d idx = nrAnyClose k (posIn t1 d idx) (posIn t2 d idx)\nnearHere k t1 t2 idx d = nearInDoc k t1 t2 d idx\nnearDocs k t1 t2 idx = filter (nearHere k t1 t2 idx) (sortedInter (docsWith t1 idx) (docsWith t2 idx))\n") diff --git a/lib/search/scoreboard.json b/lib/search/scoreboard.json index 4c88e5e3..a7c01f7d 100644 --- a/lib/search/scoreboard.json +++ b/lib/search/scoreboard.json @@ -1,8 +1,8 @@ { "lang": "search", - "total_passed": 196, + "total_passed": 205, "total_failed": 0, - "total": 196, + "total": 205, "suites": [ {"name":"index","passed":18,"failed":0,"total":18}, {"name":"boolean","passed":28,"failed":0,"total":28}, @@ -13,7 +13,8 @@ {"name":"page","passed":12,"failed":0,"total":12}, {"name":"fuzzy","passed":18,"failed":0,"total":18}, {"name":"highlight","passed":12,"failed":0,"total":12}, - {"name":"stem","passed":18,"failed":0,"total":18} + {"name":"stem","passed":18,"failed":0,"total":18}, + {"name":"near","passed":9,"failed":0,"total":9} ], - "generated": "2026-06-06T22:49:33+00:00" + "generated": "2026-06-06T23:01:07+00:00" } diff --git a/lib/search/scoreboard.md b/lib/search/scoreboard.md index 7e20b449..985b7b97 100644 --- a/lib/search/scoreboard.md +++ b/lib/search/scoreboard.md @@ -1,6 +1,6 @@ # search scoreboard -**196 / 196 passing** (0 failure(s)). +**205 / 205 passing** (0 failure(s)). | Suite | Passed | Total | Status | |-------|--------|-------|--------| @@ -14,3 +14,4 @@ | fuzzy | 18 | 18 | ok | | highlight | 12 | 12 | ok | | stem | 18 | 18 | ok | +| near | 9 | 9 | ok | diff --git a/lib/search/tests/near.sx b/lib/search/tests/near.sx new file mode 100644 index 00000000..0caa32a8 --- /dev/null +++ b/lib/search/tests/near.sx @@ -0,0 +1,49 @@ +;; Extension — proximity (NEAR) search: terms within k positions, unordered. +;; Corpus: +;; 1 "the quick brown fox" the0 quick1 brown2 fox3 +;; 2 "quick the lazy fox dog" quick0 the1 lazy2 fox3 dog4 +;; 3 "fox runs quick" fox0 runs1 quick2 + +(define + near-setup + "idx = indexDoc 3 \"fox runs quick\" (indexDoc 2 \"quick the lazy fox dog\" (indexDoc 1 \"the quick brown fox\" emptyIndex))\n") + +(define + near-cases + (list + (list + "near adjacent one doc" + "nearDocs 1 \"quick\" \"brown\" idx" + (list 1)) + (list + "near adjacent both docs" + "nearDocs 1 \"quick\" \"the\" idx" + (list 1 2)) + (list + "near within 2" + "nearDocs 2 \"quick\" \"fox\" idx" + (list 1 3)) + (list "near too far at k1" "nearDocs 1 \"quick\" \"fox\" idx" (list)) + (list + "near unordered symmetric" + "nearDocs 2 \"fox\" \"quick\" idx" + (list 1 3)) + (list "near wider window" "nearDocs 5 \"the\" \"dog\" idx" (list 2)) + (list "near absent term" "nearDocs 1 \"quick\" \"zzz\" idx" (list)) + (list "near needs both terms" "nearDocs 3 \"brown\" \"dog\" idx" (list)) + (list + "near same docs only" + "nearDocs 3 \"fox\" \"runs\" idx" + (list 3)))) + +(define + near-results + (search-batch near-setup (map (fn (c) (nth c 1)) near-cases))) + +(map-indexed + (fn + (i c) + (hk-test (nth c 0) (nth near-results i) (nth c 2))) + near-cases) + +{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails} diff --git a/plans/search-on-sx.md b/plans/search-on-sx.md index c2c71b7b..775aa82a 100644 --- a/plans/search-on-sx.md +++ b/plans/search-on-sx.md @@ -118,9 +118,13 @@ lib/search/index.sx lib/search/eval.sx - [x] snippet / highlight generation (`highlight`, `snippet`) — 12 tests - [x] stemming (suffix stripping) — `stem`, `stemText`, `stemTokens`, `indexStemmed` — 18 tests +- [x] proximity / NEAR — `nearDocs k t1 t2` (unordered, within k positions) — 9 tests ## Progress log +- **Extension: proximity/NEAR (205/205 total).** `nearDocs k t1 t2 idx` returns docs + where both terms occur within k positions (unordered), candidates = posting + intersection, filtered on the positional postings. 9 tests. - **Extension: stemming (196/196 total).** Deterministic English suffix stripping (`stem`), `stemText`/`stemTokens`, `indexStemmed`. Two haskell-on-sx gotchas: take/drop over a String yield char CODES not char strings (rebuild via `joinChars . map chr`),