search: highlight + snippet generation + 12 tests
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 26s

highlight marks query-matching (normalized) tokens with [..]; snippet extracts a
context window around the first match. 178/178.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-06 22:08:00 +00:00
parent 5945b51cfd
commit 7231cb651f
7 changed files with 93 additions and 8 deletions

View File

@@ -6,7 +6,7 @@
;; rankTfIdf, rankBm25, topNTfIdf, topNBm25, fedIndex, aclFilter, searchTfIdfAcl,
;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf,
;; paginate, pageTfIdf, pageBm25, resultCount, editDist, fuzzyTerms, fuzzyDocs,
;; fuzzyRankTfIdf.
;; fuzzyRankTfIdf, highlight, snippet.
(define
search/src
@@ -27,4 +27,6 @@
"\n"
search/page-src
"\n"
search/fuzzy-src))
search/fuzzy-src
"\n"
search/highlight-src))

View File

@@ -27,6 +27,7 @@ PRELOADS=(
lib/search/prefix.sx
lib/search/page.sx
lib/search/fuzzy.sx
lib/search/highlight.sx
lib/search/api.sx
lib/search/testlib.sx
)
@@ -40,4 +41,5 @@ SUITES=(
"prefix:lib/search/tests/prefix.sx"
"page:lib/search/tests/page.sx"
"fuzzy:lib/search/tests/fuzzy.sx"
"highlight:lib/search/tests/highlight.sx"
)

10
lib/search/highlight.sx Normal file
View File

@@ -0,0 +1,10 @@
;; search highlight / snippet — Haskell source fragment. Depends on tokenize.
;; Operates on document text (not the index): marks query-matching tokens with
;; [..] and extracts a context window around the first match. Tokens are
;; normalized (lowercase, punctuation-stripped) by `tokens`, matching index side.
;; Query terms are compared to those tokens with plain `elem`, so callers must
;; pass terms that are already normalized the same way.
;; The result is rebuilt with `unwords` over the normalized tokens, so the
;; original casing, punctuation and spacing of `text` are not preserved.
;;
;; highlight :: [Term] -> String -> String
;;   Every token that occurs in `terms` is wrapped as "[token]".
;; snippet :: Int -> [Term] -> String -> String
;;   Window of at most (2 * ctx + 1) tokens starting at max 0 (i - ctx), where
;;   i is the index of the first matching token. Consequences of that formula:
;;     - mid-text match: ctx tokens on each side of the first match;
;;     - match closer than ctx to the start: the window starts at token 0 and
;;       still takes 2 * ctx + 1 tokens, so more than ctx tokens follow it;
;;     - match near the end: `take` simply runs out of tokens;
;;     - no match (hlIdx yields -1): the window is the head of the text.
;;   Every matching token inside the window is marked, not only the first.
;;
;; Helpers defined in the fragment:
;;   hlMark    — bracket one token when it is in `terms`, else return it as is
;;   hlIdxFrom — index of the first matching token counting from i, or -1
;;               (written `0 - 1` in the source)
;;   hlIdx     — hlIdxFrom starting at index 0
;;   hlMax0    — clamp a negative number to 0
;;   snipStart — first token index of the window (0 when there is no match)
;;   snipToks  — snippet over an already tokenized list
(define
search/highlight-src
"hlMark terms t = if elem t terms then \"[\" ++ t ++ \"]\" else t\nhighlight terms text = unwords (map (hlMark terms) (tokens text))\nhlIdxFrom terms [] i = 0 - 1\nhlIdxFrom terms (t:ts) i = if elem t terms then i else hlIdxFrom terms ts (i + 1)\nhlIdx terms toks = hlIdxFrom terms toks 0\nhlMax0 x = if x < 0 then 0 else x\nsnipStart ctx i = if i < 0 then 0 else hlMax0 (i - ctx)\nsnipToks ctx terms toks = unwords (map (hlMark terms) (take (2 * ctx + 1) (drop (snipStart ctx (hlIdx terms toks)) toks)))\nsnippet ctx terms text = snipToks ctx terms (tokens text)\n")

View File

@@ -1,8 +1,8 @@
{
"lang": "search",
"total_passed": 166,
"total_passed": 178,
"total_failed": 0,
"total": 166,
"total": 178,
"suites": [
{"name":"index","passed":18,"failed":0,"total":18},
{"name":"boolean","passed":28,"failed":0,"total":28},
@@ -11,7 +11,8 @@
{"name":"integration","passed":21,"failed":0,"total":21},
{"name":"prefix","passed":14,"failed":0,"total":14},
{"name":"page","passed":12,"failed":0,"total":12},
{"name":"fuzzy","passed":18,"failed":0,"total":18}
{"name":"fuzzy","passed":18,"failed":0,"total":18},
{"name":"highlight","passed":12,"failed":0,"total":12}
],
"generated": "2026-06-06T21:47:28+00:00"
"generated": "2026-06-06T22:07:05+00:00"
}

View File

@@ -1,6 +1,6 @@
# search scoreboard
**166 / 166 passing** (0 failure(s)).
**178 / 178 passing** (0 failure(s)).
| Suite | Passed | Total | Status |
|-------|--------|-------|--------|
@@ -12,3 +12,4 @@
| prefix | 14 | 14 | ok |
| page | 12 | 12 | ok |
| fuzzy | 18 | 18 | ok |
| highlight | 12 | 12 | ok |

View File

@@ -0,0 +1,66 @@
;; Extension — highlight + snippet over document text.
;; Text: "the quick brown fox jumps" in every case except "highlight normalizes
;; text", which uses "The Quick, brown!" to exercise lowercasing and
;; punctuation stripping.
;;
;; hl-cases is a table of 12 cases. Each case is a 3-element list:
;;   0: test name
;;   1: Haskell expression to evaluate, as a string
;;   2: expected result string
;; The first five cases cover `highlight`; the remaining seven cover `snippet`
;; (middle, start, near end, ctx = 0, clamping at the end, no match, and a
;; window wider than the text).
(define
hl-cases
(list
(list
"highlight two terms"
"highlight [\"quick\", \"fox\"] \"the quick brown fox jumps\""
"the [quick] brown [fox] jumps")
(list
"highlight none"
"highlight [] \"the quick brown fox jumps\""
"the quick brown fox jumps")
(list
"highlight absent term"
"highlight [\"zzz\"] \"the quick brown fox jumps\""
"the quick brown fox jumps")
(list
"highlight first token"
"highlight [\"the\"] \"the quick brown fox jumps\""
"[the] quick brown fox jumps")
(list
"highlight normalizes text"
"highlight [\"quick\"] \"The Quick, brown!\""
"the [quick] brown")
(list
"snippet around middle"
"snippet 1 [\"brown\"] \"the quick brown fox jumps\""
"quick [brown] fox")
(list
"snippet at start"
"snippet 1 [\"the\"] \"the quick brown fox jumps\""
"[the] quick brown")
(list
"snippet near end"
"snippet 1 [\"fox\"] \"the quick brown fox jumps\""
"brown [fox] jumps")
(list
"snippet ctx zero"
"snippet 0 [\"brown\"] \"the quick brown fox jumps\""
"[brown]")
(list
"snippet clamps at end"
"snippet 2 [\"jumps\"] \"the quick brown fox jumps\""
"brown fox [jumps]")
(list
"snippet no match shows head"
"snippet 1 [\"zzz\"] \"the quick brown fox jumps\""
"the quick brown")
(list
"snippet wide window"
"snippet 5 [\"brown\"] \"the quick brown fox jumps\""
"the quick [brown] fox jumps")))
;; hl-results: the result of passing every case's expression (element 1) to
;; search-batch in a single call. The results are later read by position, so
;; they are expected to come back in the same order as hl-cases.
;; NOTE(review): search-batch is defined outside this file; the meaning of its
;; first argument (an empty string here) is not visible from here — confirm.
(define
hl-results
(search-batch "" (map (fn (c) (nth c 1)) hl-cases)))
;; Register one hk-test per case: the case name (element 0), the batch result at
;; the same index, and the expected string (element 2).
;; NOTE(review): hk-test is defined outside this file; the argument order is
;; presumably (name actual expected) — confirm against its definition.
(map-indexed
(fn
(i c)
(hk-test (nth c 0) (nth hl-results i) (nth c 2)))
hl-cases)
;; Final expression of the file: a map exposing hk-test-fail, hk-test-pass and
;; hk-test-fails. Presumably this is the suite summary read by the test runner —
;; TODO confirm.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}

View File

@@ -115,11 +115,14 @@ lib/search/index.sx lib/search/eval.sx
`fuzzyDocs`, `fuzzyRankTfIdf`) — 18 tests
- [x] result pagination (offset / limit) — `paginate`, `pageTfIdf`, `pageBm25`,
`resultCount` — 12 tests
- [ ] snippet / highlight generation
- [x] snippet / highlight generation (`highlight`, `snippet`) — 12 tests
- [ ] stemming (suffix stripping) — recall-improving normalizer
## Progress log
- **Extension: highlight/snippet (178/178 total).** `highlight terms text` marks
query-matching (normalized) tokens with [..]; `snippet ctx terms text` extracts a
context window around the first match. 12 tests.
- **Extension: fuzzy matching (166/166 total).** Levenshtein `editDist` as an O(m*n)
row-based DP (the naive recursive version is exponential and times out under load),
`fuzzyTerms`/`fuzzyDocs`/`fuzzyRankTfIdf` expand a term to indexed terms within a max