From cfa68c3db34c2d1405ebaaadb3369d50493d98f0 Mon Sep 17 00:00:00 2001 From: giles Date: Sat, 6 Jun 2026 23:27:03 +0000 Subject: [PATCH] search: synonym / query expansion + 9 tests A synonym map [(Term,[Term])] expands a query term to itself + synonyms (expandTerm); synDocs unions and synRankTfIdf ranks the expanded set. 214/214. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/search/api.sx | 6 +++-- lib/search/conformance.conf | 2 ++ lib/search/scoreboard.json | 9 ++++--- lib/search/scoreboard.md | 3 ++- lib/search/syn.sx | 10 +++++++ lib/search/tests/syn.sx | 53 +++++++++++++++++++++++++++++++++++++ plans/search-on-sx.md | 4 +++ 7 files changed, 80 insertions(+), 7 deletions(-) create mode 100644 lib/search/syn.sx create mode 100644 lib/search/tests/syn.sx diff --git a/lib/search/api.sx b/lib/search/api.sx index 84f5e943..14ba1774 100644 --- a/lib/search/api.sx +++ b/lib/search/api.sx @@ -7,7 +7,7 @@ ;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf, ;; paginate, pageTfIdf, pageBm25, resultCount, editDist, fuzzyTerms, fuzzyDocs, ;; fuzzyRankTfIdf, highlight, snippet, stem, stemText, stemTokens, indexStemmed, -;; nearDocs. +;; nearDocs, expandTerm, synDocs, synRankTfIdf. 
(define search/src @@ -34,4 +34,6 @@ "\n" search/stem-src "\n" - search/near-src)) + search/near-src + "\n" + search/syn-src)) diff --git a/lib/search/conformance.conf b/lib/search/conformance.conf index f92d61f5..96d38540 100644 --- a/lib/search/conformance.conf +++ b/lib/search/conformance.conf @@ -30,6 +30,7 @@ PRELOADS=( lib/search/highlight.sx lib/search/stem.sx lib/search/near.sx + lib/search/syn.sx lib/search/api.sx lib/search/testlib.sx ) @@ -46,4 +47,5 @@ SUITES=( "highlight:lib/search/tests/highlight.sx" "stem:lib/search/tests/stem.sx" "near:lib/search/tests/near.sx" + "syn:lib/search/tests/syn.sx" ) diff --git a/lib/search/scoreboard.json b/lib/search/scoreboard.json index a7c01f7d..6f965309 100644 --- a/lib/search/scoreboard.json +++ b/lib/search/scoreboard.json @@ -1,8 +1,8 @@ { "lang": "search", - "total_passed": 205, + "total_passed": 214, "total_failed": 0, - "total": 205, + "total": 214, "suites": [ {"name":"index","passed":18,"failed":0,"total":18}, {"name":"boolean","passed":28,"failed":0,"total":28}, @@ -14,7 +14,8 @@ {"name":"fuzzy","passed":18,"failed":0,"total":18}, {"name":"highlight","passed":12,"failed":0,"total":12}, {"name":"stem","passed":18,"failed":0,"total":18}, - {"name":"near","passed":9,"failed":0,"total":9} + {"name":"near","passed":9,"failed":0,"total":9}, + {"name":"syn","passed":9,"failed":0,"total":9} ], - "generated": "2026-06-06T23:01:07+00:00" + "generated": "2026-06-06T23:25:35+00:00" } diff --git a/lib/search/scoreboard.md b/lib/search/scoreboard.md index 985b7b97..0f54edbb 100644 --- a/lib/search/scoreboard.md +++ b/lib/search/scoreboard.md @@ -1,6 +1,6 @@ # search scoreboard -**205 / 205 passing** (0 failure(s)). +**214 / 214 passing** (0 failure(s)). 
| Suite | Passed | Total | Status | |-------|--------|-------|--------| @@ -15,3 +15,4 @@ | highlight | 12 | 12 | ok | | stem | 18 | 18 | ok | | near | 9 | 9 | ok | +| syn | 9 | 9 | ok | diff --git a/lib/search/syn.sx b/lib/search/syn.sx new file mode 100644 index 00000000..6072cd65 --- /dev/null +++ b/lib/search/syn.sx @@ -0,0 +1,10 @@ +;; search synonym / query expansion — Haskell source fragment. Depends on index + +;; rank. A synonym map is an assoc list [(Term, [Term])]; a query term is expanded +;; to itself plus its synonyms, then the expanded set is unioned / ranked. +;; expandTerm :: [(Term,[Term])] -> Term -> [Term] +;; synDocs :: [(Term,[Term])] -> Term -> Index -> [DocId] +;; synRankTfIdf :: [(Term,[Term])] -> Term -> Index -> [DocId] + +(define + search/syn-src + "synLookup synmap t = case lookup t synmap of { Nothing -> [] ; Just ss -> ss }\nexpandTerm synmap t = t : synLookup synmap t\nsynDocs synmap t idx = foldl (candStep idx) [] (expandTerm synmap t)\nsynRankTfIdf synmap t idx = rankTfIdf (expandTerm synmap t) idx\n") diff --git a/lib/search/tests/syn.sx b/lib/search/tests/syn.sx new file mode 100644 index 00000000..aaeea7bd --- /dev/null +++ b/lib/search/tests/syn.sx @@ -0,0 +1,53 @@ +;; Extension — synonym / query expansion. 
+;; synmap: car -> automobile, vehicle ; big -> large +;; Corpus: 1 "fast car" 2 "shiny automobile" 3 "big truck" 4 "large house" 5 "vehicle review" + +(define + syn-setup + "synmap = [(\"car\", [\"automobile\", \"vehicle\"]), (\"big\", [\"large\"])]\nidx = indexDoc 5 \"vehicle review\" (indexDoc 4 \"large house\" (indexDoc 3 \"big truck\" (indexDoc 2 \"shiny automobile\" (indexDoc 1 \"fast car\" emptyIndex))))\n") + +(define + syn-cases + (list + (list + "expand term with synonyms" + "expandTerm synmap \"car\"" + (list "car" "automobile" "vehicle")) + (list + "expand single synonym" + "expandTerm synmap \"big\"" + (list "big" "large")) + (list "expand unknown term" "expandTerm synmap \"banana\"" (list "banana")) + (list + "syn docs union" + "synDocs synmap \"car\" idx" + (list 1 2 5)) + (list + "syn docs single synonym" + "synDocs synmap \"big\" idx" + (list 3 4)) + (list + "syn docs no synonyms" + "synDocs synmap \"house\" idx" + (list 4)) + (list "syn docs absent" "synDocs synmap \"plane\" idx" (list)) + (list + "syn rank expanded" + "synRankTfIdf synmap \"car\" idx" + (list 1 2 5)) + (list + "syn rank single" + "synRankTfIdf synmap \"big\" idx" + (list 3 4)))) + +(define + syn-results + (search-batch syn-setup (map (fn (c) (nth c 1)) syn-cases))) + +(map-indexed + (fn + (i c) + (hk-test (nth c 0) (nth syn-results i) (nth c 2))) + syn-cases) + +{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails} diff --git a/plans/search-on-sx.md b/plans/search-on-sx.md index 775aa82a..2e62c53b 100644 --- a/plans/search-on-sx.md +++ b/plans/search-on-sx.md @@ -119,9 +119,13 @@ lib/search/index.sx lib/search/eval.sx - [x] stemming (suffix stripping) — `stem`, `stemText`, `stemTokens`, `indexStemmed` — 18 tests - [x] proximity / NEAR — `nearDocs k t1 t2` (unordered, within k positions) — 9 tests +- [x] synonym / query expansion — `expandTerm`, `synDocs`, `synRankTfIdf` — 9 tests ## Progress log +- **Extension: synonyms/query expansion (214/214 total).** A synonym map + 
`[(Term,[Term])]` expands a query term to itself + synonyms (`expandTerm`); `synDocs` + unions the docs matching any expanded term, `synRankTfIdf` ranks over the expanded term list. 9 tests. - **Extension: proximity/NEAR (205/205 total).** `nearDocs k t1 t2 idx` returns docs where both terms occur within k positions (unordered), candidates = posting intersection, filtered on the positional postings. 9 tests.