search: proximity/NEAR search + 9 tests
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 24s
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 24s
nearDocs k t1 t2 idx returns the docs where both terms occur within k positions of each other (unordered); candidates come from the posting intersection and are filtered on the positional postings. 205/205 tests passing. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -6,7 +6,8 @@
|
|||||||
;; rankTfIdf, rankBm25, topNTfIdf, topNBm25, fedIndex, aclFilter, searchTfIdfAcl,
|
;; rankTfIdf, rankBm25, topNTfIdf, topNBm25, fedIndex, aclFilter, searchTfIdfAcl,
|
||||||
;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf,
|
;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf,
|
||||||
;; paginate, pageTfIdf, pageBm25, resultCount, editDist, fuzzyTerms, fuzzyDocs,
|
;; paginate, pageTfIdf, pageBm25, resultCount, editDist, fuzzyTerms, fuzzyDocs,
|
||||||
;; fuzzyRankTfIdf, highlight, snippet, stem, stemText, stemTokens, indexStemmed.
|
;; fuzzyRankTfIdf, highlight, snippet, stem, stemText, stemTokens, indexStemmed,
|
||||||
|
;; nearDocs.
|
||||||
|
|
||||||
(define
|
(define
|
||||||
search/src
|
search/src
|
||||||
@@ -31,4 +32,6 @@
|
|||||||
"\n"
|
"\n"
|
||||||
search/highlight-src
|
search/highlight-src
|
||||||
"\n"
|
"\n"
|
||||||
search/stem-src))
|
search/stem-src
|
||||||
|
"\n"
|
||||||
|
search/near-src))
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ PRELOADS=(
|
|||||||
lib/search/fuzzy.sx
|
lib/search/fuzzy.sx
|
||||||
lib/search/highlight.sx
|
lib/search/highlight.sx
|
||||||
lib/search/stem.sx
|
lib/search/stem.sx
|
||||||
|
lib/search/near.sx
|
||||||
lib/search/api.sx
|
lib/search/api.sx
|
||||||
lib/search/testlib.sx
|
lib/search/testlib.sx
|
||||||
)
|
)
|
||||||
@@ -44,4 +45,5 @@ SUITES=(
|
|||||||
"fuzzy:lib/search/tests/fuzzy.sx"
|
"fuzzy:lib/search/tests/fuzzy.sx"
|
||||||
"highlight:lib/search/tests/highlight.sx"
|
"highlight:lib/search/tests/highlight.sx"
|
||||||
"stem:lib/search/tests/stem.sx"
|
"stem:lib/search/tests/stem.sx"
|
||||||
|
"near:lib/search/tests/near.sx"
|
||||||
)
|
)
|
||||||
|
|||||||
8
lib/search/near.sx
Normal file
8
lib/search/near.sx
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
;; search proximity (NEAR) — Haskell source fragment. Depends on query (posIn,
|
||||||
|
;; docsWith, sortedInter). Finds docs where two terms occur within k positions of
|
||||||
|
;; each other (unordered: |p1 - p2| <= k for some pair of positions), using the positional postings.
|
||||||
|
;; nearDocs :: Int -> Term -> Term -> Index -> [DocId] (sorted)
|
||||||
|
|
||||||
|
;; search/near-src: the Haskell source for proximity search, held as one string.
;; Definitions inside the string, in order:
;;   nrAbsDiff a b           - absolute difference of a and b.
;;   nrCloseTo k x ys        - True if some y in ys has nrAbsDiff x y <= k;
;;                             False for an empty ys.
;;   nrAnyClose k xs ys      - True if some x in xs satisfies nrCloseTo k x ys;
;;                             False for an empty xs.
;;   nearInDoc k t1 t2 d idx - nrAnyClose k over (posIn t1 d idx) and
;;                             (posIn t2 d idx). posIn is defined elsewhere;
;;                             presumably the term's positions in doc d.
;;   nearHere k t1 t2 idx d  - nearInDoc with the doc id moved to the last
;;                             argument so it can be partially applied as the
;;                             filter predicate.
;;   nearDocs k t1 t2 idx    - filters sortedInter (docsWith t1 idx)
;;                             (docsWith t2 idx) with nearHere.
(define
|
||||||
|
search/near-src
|
||||||
|
"nrAbsDiff a b = if a > b then a - b else b - a\nnrCloseTo k x [] = False\nnrCloseTo k x (y:ys) = if nrAbsDiff x y <= k then True else nrCloseTo k x ys\nnrAnyClose k [] ys = False\nnrAnyClose k (x:xs) ys = if nrCloseTo k x ys then True else nrAnyClose k xs ys\nnearInDoc k t1 t2 d idx = nrAnyClose k (posIn t1 d idx) (posIn t2 d idx)\nnearHere k t1 t2 idx d = nearInDoc k t1 t2 d idx\nnearDocs k t1 t2 idx = filter (nearHere k t1 t2 idx) (sortedInter (docsWith t1 idx) (docsWith t2 idx))\n")
|
||||||
@@ -1,8 +1,8 @@
|
|||||||
{
|
{
|
||||||
"lang": "search",
|
"lang": "search",
|
||||||
"total_passed": 196,
|
"total_passed": 205,
|
||||||
"total_failed": 0,
|
"total_failed": 0,
|
||||||
"total": 196,
|
"total": 205,
|
||||||
"suites": [
|
"suites": [
|
||||||
{"name":"index","passed":18,"failed":0,"total":18},
|
{"name":"index","passed":18,"failed":0,"total":18},
|
||||||
{"name":"boolean","passed":28,"failed":0,"total":28},
|
{"name":"boolean","passed":28,"failed":0,"total":28},
|
||||||
@@ -13,7 +13,8 @@
|
|||||||
{"name":"page","passed":12,"failed":0,"total":12},
|
{"name":"page","passed":12,"failed":0,"total":12},
|
||||||
{"name":"fuzzy","passed":18,"failed":0,"total":18},
|
{"name":"fuzzy","passed":18,"failed":0,"total":18},
|
||||||
{"name":"highlight","passed":12,"failed":0,"total":12},
|
{"name":"highlight","passed":12,"failed":0,"total":12},
|
||||||
{"name":"stem","passed":18,"failed":0,"total":18}
|
{"name":"stem","passed":18,"failed":0,"total":18},
|
||||||
|
{"name":"near","passed":9,"failed":0,"total":9}
|
||||||
],
|
],
|
||||||
"generated": "2026-06-06T22:49:33+00:00"
|
"generated": "2026-06-06T23:01:07+00:00"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# search scoreboard
|
# search scoreboard
|
||||||
|
|
||||||
**196 / 196 passing** (0 failure(s)).
|
**205 / 205 passing** (0 failure(s)).
|
||||||
|
|
||||||
| Suite | Passed | Total | Status |
|
| Suite | Passed | Total | Status |
|
||||||
|-------|--------|-------|--------|
|
|-------|--------|-------|--------|
|
||||||
@@ -14,3 +14,4 @@
|
|||||||
| fuzzy | 18 | 18 | ok |
|
| fuzzy | 18 | 18 | ok |
|
||||||
| highlight | 12 | 12 | ok |
|
| highlight | 12 | 12 | ok |
|
||||||
| stem | 18 | 18 | ok |
|
| stem | 18 | 18 | ok |
|
||||||
|
| near | 9 | 9 | ok |
|
||||||
|
|||||||
49
lib/search/tests/near.sx
Normal file
49
lib/search/tests/near.sx
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
;; Extension — proximity (NEAR) search: terms within k positions, unordered.
|
||||||
|
;; Corpus:
|
||||||
|
;; 1 "the quick brown fox" the0 quick1 brown2 fox3
|
||||||
|
;; 2 "quick the lazy fox dog" quick0 the1 lazy2 fox3 dog4
|
||||||
|
;; 3 "fox runs quick" fox0 runs1 quick2
|
||||||
|
|
||||||
|
;; near-setup: Haskell source that binds idx to the test index, built by
;; applying indexDoc for doc 1, then doc 2, then doc 3 on top of emptyIndex.
(define
|
||||||
|
near-setup
|
||||||
|
"idx = indexDoc 3 \"fox runs quick\" (indexDoc 2 \"quick the lazy fox dog\" (indexDoc 1 \"the quick brown fox\" emptyIndex))\n")
|
||||||
|
|
||||||
|
;; near-cases: one 3-element list per test:
;;   element 0 - test name,
;;   element 1 - Haskell expression string (each one references idx),
;;   element 2 - expected list of doc ids.
(define
|
||||||
|
near-cases
|
||||||
|
(list
|
||||||
|
(list
|
||||||
|
"near adjacent one doc"
|
||||||
|
"nearDocs 1 \"quick\" \"brown\" idx"
|
||||||
|
(list 1))
|
||||||
|
(list
|
||||||
|
"near adjacent both docs"
|
||||||
|
"nearDocs 1 \"quick\" \"the\" idx"
|
||||||
|
(list 1 2))
|
||||||
|
(list
|
||||||
|
"near within 2"
|
||||||
|
"nearDocs 2 \"quick\" \"fox\" idx"
|
||||||
|
(list 1 3))
|
||||||
|
(list "near too far at k1" "nearDocs 1 \"quick\" \"fox\" idx" (list))
|
||||||
|
(list
|
||||||
|
"near unordered symmetric"
|
||||||
|
"nearDocs 2 \"fox\" \"quick\" idx"
|
||||||
|
(list 1 3))
|
||||||
|
(list "near wider window" "nearDocs 5 \"the\" \"dog\" idx" (list 2))
|
||||||
|
(list "near absent term" "nearDocs 1 \"quick\" \"zzz\" idx" (list))
|
||||||
|
(list "near needs both terms" "nearDocs 3 \"brown\" \"dog\" idx" (list))
|
||||||
|
(list
|
||||||
|
"near same docs only"
|
||||||
|
"nearDocs 3 \"fox\" \"runs\" idx"
|
||||||
|
(list 3))))
|
||||||
|
|
||||||
|
;; near-results: the value returned by search-batch when given near-setup and
;; the expression string (element 1) of every entry in near-cases.
(define
|
||||||
|
near-results
|
||||||
|
(search-batch near-setup (map (fn (c) (nth c 1)) near-cases)))
|
||||||
|
|
||||||
|
;; For each case c at index i, call hk-test with the case name (element 0),
;; the i-th entry of near-results, and the expected value (element 2).
(map-indexed
|
||||||
|
(fn
|
||||||
|
(i c)
|
||||||
|
(hk-test (nth c 0) (nth near-results i) (nth c 2)))
|
||||||
|
near-cases)
|
||||||
|
|
||||||
|
;; Final expression of the file: a map of hk-test-fail, hk-test-pass and
;; hk-test-fails. NOTE(review): presumably the suite summary read by the test
;; runner - confirm against testlib.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||||
@@ -118,9 +118,13 @@ lib/search/index.sx lib/search/eval.sx
|
|||||||
- [x] snippet / highlight generation (`highlight`, `snippet`) — 12 tests
|
- [x] snippet / highlight generation (`highlight`, `snippet`) — 12 tests
|
||||||
- [x] stemming (suffix stripping) — `stem`, `stemText`, `stemTokens`, `indexStemmed`
|
- [x] stemming (suffix stripping) — `stem`, `stemText`, `stemTokens`, `indexStemmed`
|
||||||
— 18 tests
|
— 18 tests
|
||||||
|
- [x] proximity / NEAR — `nearDocs k t1 t2 idx` (unordered, within k positions) — 9 tests
|
||||||
|
|
||||||
## Progress log
|
## Progress log
|
||||||
|
|
||||||
|
- **Extension: proximity/NEAR (205/205 total).** `nearDocs k t1 t2 idx` returns docs
|
||||||
|
where both terms occur within k positions of each other (unordered); candidates = posting
|
||||||
|
intersection, filtered on the positional postings. 9 tests.
|
||||||
- **Extension: stemming (196/196 total).** Deterministic English suffix stripping
|
- **Extension: stemming (196/196 total).** Deterministic English suffix stripping
|
||||||
(`stem`), `stemText`/`stemTokens`, `indexStemmed`. Two haskell-on-sx gotchas: take/drop
|
(`stem`), `stemText`/`stemTokens`, `indexStemmed`. Two haskell-on-sx gotchas: take/drop
|
||||||
over a String yield char CODES not char strings (rebuild via `joinChars . map chr`),
|
over a String yield char CODES not char strings (rebuild via `joinChars . map chr`),
|
||||||
|
|||||||
Reference in New Issue
Block a user