From 7231cb651f1d7be649adfe70ade87485352ed255 Mon Sep 17 00:00:00 2001 From: giles Date: Sat, 6 Jun 2026 22:08:00 +0000 Subject: [PATCH] search: highlight + snippet generation + 12 tests highlight marks query-matching (normalized) tokens with [..]; snippet extracts a context window around the first match. 178/178. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/search/api.sx | 6 ++-- lib/search/conformance.conf | 2 ++ lib/search/highlight.sx | 10 ++++++ lib/search/scoreboard.json | 9 ++--- lib/search/scoreboard.md | 3 +- lib/search/tests/highlight.sx | 66 +++++++++++++++++++++++++++++++++++ plans/search-on-sx.md | 5 ++- 7 files changed, 93 insertions(+), 8 deletions(-) create mode 100644 lib/search/highlight.sx create mode 100644 lib/search/tests/highlight.sx diff --git a/lib/search/api.sx b/lib/search/api.sx index c55c7f31..7abbe781 100644 --- a/lib/search/api.sx +++ b/lib/search/api.sx @@ -6,7 +6,7 @@ ;; rankTfIdf, rankBm25, topNTfIdf, topNBm25, fedIndex, aclFilter, searchTfIdfAcl, ;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf, ;; paginate, pageTfIdf, pageBm25, resultCount, editDist, fuzzyTerms, fuzzyDocs, -;; fuzzyRankTfIdf. +;; fuzzyRankTfIdf, highlight, snippet. 
(define search/src @@ -27,4 +27,6 @@ "\n" search/page-src "\n" - search/fuzzy-src)) + search/fuzzy-src + "\n" + search/highlight-src)) diff --git a/lib/search/conformance.conf b/lib/search/conformance.conf index 0fef2c39..28c7ddf6 100644 --- a/lib/search/conformance.conf +++ b/lib/search/conformance.conf @@ -27,6 +27,7 @@ PRELOADS=( lib/search/prefix.sx lib/search/page.sx lib/search/fuzzy.sx + lib/search/highlight.sx lib/search/api.sx lib/search/testlib.sx ) @@ -40,4 +41,5 @@ SUITES=( "prefix:lib/search/tests/prefix.sx" "page:lib/search/tests/page.sx" "fuzzy:lib/search/tests/fuzzy.sx" + "highlight:lib/search/tests/highlight.sx" ) diff --git a/lib/search/highlight.sx b/lib/search/highlight.sx new file mode 100644 index 00000000..4c5def99 --- /dev/null +++ b/lib/search/highlight.sx @@ -0,0 +1,10 @@ +;; search highlight / snippet — Haskell source fragment. Depends on tokenize. +;; Operates on document text (not the index): marks query-matching tokens with +;; [..] and extracts a context window around the first match. Tokens are +;; normalized (lowercase, punctuation-stripped) by `tokens`, matching index side. 
+;; highlight :: [Term] -> String -> String (wraps every token found in terms in [..]; result is the tokens of text re-joined with single spaces) +;; snippet :: Int -> [Term] -> String -> String (window of up to 2*ctx+1 tokens starting ctx tokens before the 1st match, clamped at the text start; head of the text when nothing matches) + +(define + search/highlight-src + "hlMark terms t = if elem t terms then \"[\" ++ t ++ \"]\" else t\nhighlight terms text = unwords (map (hlMark terms) (tokens text))\nhlIdxFrom terms [] i = 0 - 1\nhlIdxFrom terms (t:ts) i = if elem t terms then i else hlIdxFrom terms ts (i + 1)\nhlIdx terms toks = hlIdxFrom terms toks 0\nhlMax0 x = if x < 0 then 0 else x\nsnipStart ctx i = if i < 0 then 0 else hlMax0 (i - ctx)\nsnipToks ctx terms toks = unwords (map (hlMark terms) (take (2 * ctx + 1) (drop (snipStart ctx (hlIdx terms toks)) toks)))\nsnippet ctx terms text = snipToks ctx terms (tokens text)\n") diff --git a/lib/search/scoreboard.json b/lib/search/scoreboard.json index b0baf95a..a3ebb24c 100644 --- a/lib/search/scoreboard.json +++ b/lib/search/scoreboard.json @@ -1,8 +1,8 @@ { "lang": "search", - "total_passed": 166, + "total_passed": 178, "total_failed": 0, - "total": 166, + "total": 178, "suites": [ {"name":"index","passed":18,"failed":0,"total":18}, {"name":"boolean","passed":28,"failed":0,"total":28}, @@ -11,7 +11,8 @@ {"name":"integration","passed":21,"failed":0,"total":21}, {"name":"prefix","passed":14,"failed":0,"total":14}, {"name":"page","passed":12,"failed":0,"total":12}, - {"name":"fuzzy","passed":18,"failed":0,"total":18} + {"name":"fuzzy","passed":18,"failed":0,"total":18}, + {"name":"highlight","passed":12,"failed":0,"total":12} ], - "generated": "2026-06-06T21:47:28+00:00" + "generated": "2026-06-06T22:07:05+00:00" } diff --git a/lib/search/scoreboard.md b/lib/search/scoreboard.md index 74440558..767c5fc2 100644 --- a/lib/search/scoreboard.md +++ b/lib/search/scoreboard.md @@ -1,6 +1,6 @@ # search scoreboard -**166 / 166 passing** (0 failure(s)). +**178 / 178 passing** (0 failure(s)). 
| Suite | Passed | Total | Status | |-------|--------|-------|--------| @@ -12,3 +12,4 @@ | prefix | 14 | 14 | ok | | page | 12 | 12 | ok | | fuzzy | 18 | 18 | ok | +| highlight | 12 | 12 | ok | diff --git a/lib/search/tests/highlight.sx b/lib/search/tests/highlight.sx new file mode 100644 index 00000000..3a5001d5 --- /dev/null +++ b/lib/search/tests/highlight.sx @@ -0,0 +1,66 @@ +;; Extension — highlight + snippet over document text. Each case is (name, expression, expected output). +;; Text: "the quick brown fox jumps" (the normalization case uses "The Quick, brown!" instead) + +(define + hl-cases + (list + (list + "highlight two terms" + "highlight [\"quick\", \"fox\"] \"the quick brown fox jumps\"" + "the [quick] brown [fox] jumps") + (list + "highlight none" + "highlight [] \"the quick brown fox jumps\"" + "the quick brown fox jumps") + (list + "highlight absent term" + "highlight [\"zzz\"] \"the quick brown fox jumps\"" + "the quick brown fox jumps") + (list + "highlight first token" + "highlight [\"the\"] \"the quick brown fox jumps\"" + "[the] quick brown fox jumps") + (list + "highlight normalizes text" + "highlight [\"quick\"] \"The Quick, brown!\"" + "the [quick] brown") + (list + "snippet around middle" + "snippet 1 [\"brown\"] \"the quick brown fox jumps\"" + "quick [brown] fox") + (list + "snippet at start" + "snippet 1 [\"the\"] \"the quick brown fox jumps\"" + "[the] quick brown") + (list + "snippet near end" + "snippet 1 [\"fox\"] \"the quick brown fox jumps\"" + "brown [fox] jumps") + (list + "snippet ctx zero" + "snippet 0 [\"brown\"] \"the quick brown fox jumps\"" + "[brown]") + (list + "snippet clamps at end" + "snippet 2 [\"jumps\"] \"the quick brown fox jumps\"" + "brown fox [jumps]") + (list + "snippet no match shows head" + "snippet 1 [\"zzz\"] \"the quick brown fox jumps\"" + "the quick brown") + (list + "snippet wide window" + "snippet 5 [\"brown\"] \"the quick brown fox jumps\"" + "the quick [brown] fox jumps"))) + +(define + hl-results + (search-batch "" (map (fn (c) (nth c 1)) hl-cases))) + +(map-indexed + (fn + (i c) + (hk-test (nth c 0) 
(nth hl-results i) (nth c 2))) + hl-cases) + +{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails} diff --git a/plans/search-on-sx.md b/plans/search-on-sx.md index 791c04dc..b2702653 100644 --- a/plans/search-on-sx.md +++ b/plans/search-on-sx.md @@ -115,11 +115,14 @@ lib/search/index.sx lib/search/eval.sx `fuzzyDocs`, `fuzzyRankTfIdf`) — 18 tests - [x] result pagination (offset / limit) — `paginate`, `pageTfIdf`, `pageBm25`, `resultCount` — 12 tests -- [ ] snippet / highlight generation +- [x] snippet / highlight generation (`highlight`, `snippet`) — 12 tests - [ ] stemming (suffix stripping) — recall-improving normalizer ## Progress log +- **Extension: highlight/snippet (178/178 total).** `highlight terms text` marks + query-matching (normalized) tokens with [..]; `snippet ctx terms text` extracts a + context window around the first match. 12 tests. - **Extension: fuzzy matching (166/166 total).** Levenshtein `editDist` as an O(m*n) row-based DP (the naive recursive version is exponential and times out under load), `fuzzyTerms`/`fuzzyDocs`/`fuzzyRankTfIdf` expand a term to indexed terms within a max