From 0f0da0319c83285644f5b8299b8c09b3816a93c5 Mon Sep 17 00:00:00 2001 From: giles Date: Sat, 6 Jun 2026 18:47:42 +0000 Subject: [PATCH] search: Phase 2 query AST + boolean/phrase eval + 28 tests Query ADT (Term|And|Or|Not|Phrase) and evalQuery over docid-sorted posting lists: boolean ops as linear merges, Not over the allDocs universe, Phrase via positional adjacency. Batched both test suites into one program eval each (search-batch) so they finish under heavy CPU load. 46/46. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/search/api.sx | 6 +- lib/search/conformance.conf | 2 + lib/search/query.sx | 11 ++ lib/search/scoreboard.json | 9 +- lib/search/scoreboard.md | 3 +- lib/search/testlib.sx | 21 ++++ lib/search/tests/boolean.sx | 123 +++++++++++++++++++++++ lib/search/tests/index.sx | 193 +++++++++++++++--------------------- plans/search-on-sx.md | 21 ++-- 9 files changed, 264 insertions(+), 125 deletions(-) create mode 100644 lib/search/query.sx create mode 100644 lib/search/tests/boolean.sx diff --git a/lib/search/api.sx b/lib/search/api.sx index 8a06d444..e2da2bb6 100644 --- a/lib/search/api.sx +++ b/lib/search/api.sx @@ -2,6 +2,8 @@ ;; Tests and callers concatenate `search/src` with their own top-level bindings ;; (e.g. "result = lookupTerm \"cat\" idx\n") and evaluate via the haskell-on-sx ;; interpreter. Public Haskell entry points: indexDoc, lookupTerm, deleteDoc, -;; docFreq, allTerms, tokens, positioned. +;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery. -(define search/src (str search/tokenize-src "\n" search/index-src)) +(define + search/src + (str search/tokenize-src "\n" search/index-src "\n" search/query-src)) diff --git a/lib/search/conformance.conf b/lib/search/conformance.conf index cc75c6e0..4e418e9f 100644 --- a/lib/search/conformance.conf +++ b/lib/search/conformance.conf @@ -20,10 +20,12 @@ PRELOADS=( lib/haskell/testlib.sx lib/search/tokenize.sx lib/search/index.sx + lib/search/query.sx lib/search/api.sx lib/search/testlib.sx ) SUITES=( "index:lib/search/tests/index.sx" + "boolean:lib/search/tests/boolean.sx" ) diff --git a/lib/search/query.sx b/lib/search/query.sx new file mode 100644 index 00000000..23025908 --- /dev/null +++ b/lib/search/query.sx @@ -0,0 +1,11 @@ +;; search query AST + boolean/phrase evaluation — Haskell source fragment. +;; Depends on tokenize + index. +;; data Query = Term String | And Query Query | Or Query Query +;; | Not Query | Phrase [String] +;; evalQuery :: Index -> Query -> [DocId] (sorted, unique) +;; Boolean ops are linear merges over docid-sorted posting lists; Not uses +;; allDocs as the universe; Phrase checks positional adjacency. + +(define + search/query-src + "data Query = Term String | And Query Query | Or Query Query | Not Query | Phrase [String]\ndocsWith t idx = map fst (lookupTerm t idx)\nsortedUnion [] ys = ys\nsortedUnion xs [] = xs\nsortedUnion (x:xs) (y:ys) = if x < y then x : sortedUnion xs (y:ys) else if x > y then y : sortedUnion (x:xs) ys else x : sortedUnion xs ys\nsortedInter [] ys = []\nsortedInter xs [] = []\nsortedInter (x:xs) (y:ys) = if x < y then sortedInter xs (y:ys) else if x > y then sortedInter (x:xs) ys else x : sortedInter xs ys\nsortedDiff [] ys = []\nsortedDiff xs [] = xs\nsortedDiff (x:xs) (y:ys) = if x < y then x : sortedDiff xs (y:ys) else if x > y then sortedDiff (x:xs) ys else sortedDiff xs ys\nmergeDocs acc e = sortedUnion acc (map fst (snd e))\nallDocs idx = foldl mergeDocs [] idx\nposIn t d idx = case lookup d (lookupTerm t idx) of { Nothing -> []; Just ps -> ps }\nelemSorted x [] = False\nelemSorted x (y:ys) = if x == y then True else if x < y then False else elemSorted x ys\nphraseAtAll [] d idx p i = True\nphraseAtAll (t:ts) d idx p i = if elemSorted (p + i) (posIn t d idx) then phraseAtAll ts d idx p (i + 1) else False\nphraseStartsAt ts d idx p = phraseAtAll ts d idx p 0\nphraseInDoc [] d idx = True\nphraseInDoc (t0:rest) d idx = any (phraseStartsAt (t0:rest) d idx) (posIn t0 d idx)\nphraseHere ts idx d = phraseInDoc ts d idx\ninterStep idx acc tt = sortedInter acc (docsWith tt idx)\nphraseCands [] idx = allDocs idx\nphraseCands (t:ts) idx = foldl (interStep idx) (docsWith t idx) ts\nphraseDocs ts idx = filter (phraseHere ts idx) (phraseCands ts idx)\nevalQuery idx q = case q of { Term t -> docsWith t idx ; And a b -> sortedInter (evalQuery idx a) (evalQuery idx b) ; Or a b -> sortedUnion (evalQuery idx a) (evalQuery idx b) ; Not a -> sortedDiff (allDocs idx) (evalQuery idx a) ; Phrase ts -> phraseDocs ts idx }\n") diff --git a/lib/search/scoreboard.json b/lib/search/scoreboard.json index 4c5202b0..51e8a2ec 100644 --- a/lib/search/scoreboard.json +++ b/lib/search/scoreboard.json @@ -1,10 +1,11 @@ { "lang": "search", - "total_passed": 18, + "total_passed": 46, "total_failed": 0, - "total": 18, + "total": 46, "suites": [ - {"name":"index","passed":18,"failed":0,"total":18} + {"name":"index","passed":18,"failed":0,"total":18}, + {"name":"boolean","passed":28,"failed":0,"total":28} ], - "generated": "2026-06-06T18:12:50+00:00" + "generated": "2026-06-06T18:46:54+00:00" } diff --git a/lib/search/scoreboard.md b/lib/search/scoreboard.md index cf9cabce..a214ce29 100644 --- a/lib/search/scoreboard.md +++ b/lib/search/scoreboard.md @@ -1,7 +1,8 @@ # search scoreboard -**18 / 18 passing** (0 failure(s)). +**46 / 46 passing** (0 failure(s)). | Suite | Passed | Total | Status | |-------|--------|-------|--------| | index | 18 | 18 | ok | +| boolean | 28 | 28 | ok | diff --git a/lib/search/testlib.sx b/lib/search/testlib.sx index 9c965b05..1e2212d0 100644 --- a/lib/search/testlib.sx +++ b/lib/search/testlib.sx @@ -27,3 +27,24 @@ (search-hk->sx (hk-deep-force (get (hk-eval-program (hk-core (str search/src extra))) name))))) + +(define + search-join + (fn + (sep xs) + (cond + ((empty? xs) "") + ((empty? (rest xs)) (first xs)) + (:else (str (first xs) sep (search-join sep (rest xs))))))) + +;; Batch many haskell expressions into ONE program evaluation (amortizes the +;; cost of parsing/binding search/src — important under heavy CPU load). +;; `setup` is extra top-level Haskell; `exprs` is a list of expression strings +;; whose results form a single haskell list. Returns the SX list of results. +(define + search-batch + (fn + (setup exprs) + (search-eval + (str setup "\nresult = [" (search-join ", " exprs) "]\n") + "result"))) diff --git a/lib/search/tests/boolean.sx b/lib/search/tests/boolean.sx new file mode 100644 index 00000000..f6e48ea8 --- /dev/null +++ b/lib/search/tests/boolean.sx @@ -0,0 +1,123 @@ +;; Phase 2 — query AST + boolean/phrase evaluation (hand-built Query values). +;; Corpus: +;; doc 1 "the quick brown dog" -> the quick brown dog +;; doc 2 "a quick brown fox" -> a quick brown fox +;; doc 3 "the dog barks loudly" -> the dog barks loudly +;; All queries run in ONE program evaluation (search-batch) to stay fast. + +(define + search-corpus + "idx = indexDoc 3 \"the dog barks loudly\" (indexDoc 2 \"a quick brown fox\" (indexDoc 1 \"the quick brown dog\" emptyIndex))\n") + +(define + bool-cases + (list + (list + "term in two docs" + "evalQuery idx (Term \"quick\")" + (list 1 2)) + (list + "term in two docs (the)" + "evalQuery idx (Term \"the\")" + (list 1 3)) + (list "term in one doc" "evalQuery idx (Term \"fox\")" (list 2)) + (list "term absent" "evalQuery idx (Term \"zzz\")" (list)) + (list + "term case-sensitive at AST level" + "evalQuery idx (Term \"QUICK\")" + (list)) + (list "term on empty index" "evalQuery emptyIndex (Term \"cat\")" (list)) + (list + "and both terms" + "evalQuery idx (And (Term \"quick\") (Term \"brown\"))" + (list 1 2)) + (list + "and overlap subset" + "evalQuery idx (And (Term \"the\") (Term \"dog\"))" + (list 1 3)) + (list + "and disjoint is empty" + "evalQuery idx (And (Term \"the\") (Term \"fox\"))" + (list)) + (list + "and right-nested" + "evalQuery idx (And (Term \"the\") (And (Term \"dog\") (Term \"barks\")))" + (list 3)) + (list + "or two singletons" + "evalQuery idx (Or (Term \"fox\") (Term \"barks\"))" + (list 2 3)) + (list + "or all docs" + "evalQuery idx (Or (Term \"quick\") (Term \"the\"))" + (list 1 2 3)) + (list + "or with absent term" + "evalQuery idx (Or (Term \"fox\") (Term \"zzz\"))" + (list 2)) + (list "not term" "evalQuery idx (Not (Term \"the\"))" (list 2)) + (list "not term 2" "evalQuery idx (Not (Term \"quick\"))" (list 3)) + (list + "and with not" + "evalQuery idx (And (Term \"quick\") (Not (Term \"the\")))" + (list 2)) + (list + "double negation" + "evalQuery idx (Not (Not (Term \"fox\")))" + (list 2)) + (list + "or of and with term" + "evalQuery idx (Or (And (Term \"the\") (Term \"dog\")) (Term \"fox\"))" + (list 1 2 3)) + (list + "phrase adjacent both docs" + "evalQuery idx (Phrase [\"quick\", \"brown\"])" + (list 1 2)) + (list + "phrase adjacent one doc" + "evalQuery idx (Phrase [\"brown\", \"dog\"])" + (list 1)) + (list + "phrase the quick" + "evalQuery idx (Phrase [\"the\", \"quick\"])" + (list 1)) + (list + "phrase dog barks" + "evalQuery idx (Phrase [\"dog\", \"barks\"])" + (list 3)) + (list + "phrase non-adjacent empty" + "evalQuery idx (Phrase [\"quick\", \"dog\"])" + (list)) + (list + "phrase order matters" + "evalQuery idx (Phrase [\"brown\", \"quick\"])" + (list)) + (list + "phrase single term" + "evalQuery idx (Phrase [\"dog\"])" + (list 1 3)) + (list + "phrase three terms" + "evalQuery idx (Phrase [\"the\", \"dog\", \"barks\"])" + (list 3)) + (list + "and of phrase and term" + "evalQuery idx (And (Phrase [\"quick\", \"brown\"]) (Term \"dog\"))" + (list 1)) + (list + "not of phrase" + "evalQuery idx (Not (Phrase [\"quick\", \"brown\"]))" + (list 3)))) + +(define + bool-results + (search-batch search-corpus (map (fn (c) (nth c 1)) bool-cases))) + +(map-indexed + (fn + (i c) + (hk-test (nth c 0) (nth bool-results i) (nth c 2))) + bool-cases) + +{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails} diff --git a/lib/search/tests/index.sx b/lib/search/tests/index.sx index 2e9cb700..9415866f 100644 --- a/lib/search/tests/index.sx +++ b/lib/search/tests/index.sx @@ -1,119 +1,88 @@ ;; Phase 1 — tokenize + inverted index. +;; All cases run in ONE program evaluation (search-batch) to stay fast under load. +;; Scalar results (docFreq) are wrapped as singleton lists so the batch is a list +;; of lists. -(hk-test - "tokens basic lowercases" - (search-eval "\nresult = tokens \"The Cat sat\"\n" "result") - (list "the" "cat" "sat")) - -(hk-test - "tokens strips punctuation" - (search-eval "\nresult = tokens \"Hello, World!\"\n" "result") - (list "hello" "world")) - -(hk-test - "tokens collapses whitespace" - (search-eval "\nresult = tokens \" a b \"\n" "result") - (list "a" "b")) - -(hk-test - "tokens empty is empty" - (search-eval "\nresult = tokens \"\"\n" "result") - (list)) - -(hk-test - "tokens keeps digits" - (search-eval "\nresult = tokens \"abc123 x9\"\n" "result") - (list "abc123" "x9")) - -(hk-test - "positioned attaches ordinals" - (search-eval "\nresult = positioned \"a b a\"\n" "result") - (list (list "a" 0) (list "b" 1) (list "a" 2))) - -(hk-test - "index + lookup single doc" - (search-eval - "\nresult = lookupTerm \"cat\" (indexDoc 1 \"the cat sat\" emptyIndex)\n" - "result") - (list (list 1 (list 1)))) - -(hk-test - "lookup missing term is empty" - (search-eval - "\nresult = lookupTerm \"dog\" (indexDoc 1 \"the cat sat\" emptyIndex)\n" - "result") - (list)) - -(hk-test - "lookup records all positions" - (search-eval - "\nresult = lookupTerm \"the\" (indexDoc 1 \"the cat the dog the\" emptyIndex)\n" - "result") - (list (list 1 (list 0 2 4)))) - -(hk-test - "multi-doc posting list sorted by docid" - (search-eval - "\nresult = lookupTerm \"x\" (indexDoc 1 \"x y\" (indexDoc 2 \"x z\" emptyIndex))\n" - "result") +(define + index-cases (list - (list 1 (list 0)) - (list 2 (list 0)))) + (list + "tokens basic lowercases" + "tokens \"The Cat sat\"" + (list "the" "cat" "sat")) + (list + "tokens strips punctuation" + "tokens \"Hello, World!\"" + (list "hello" "world")) + (list "tokens collapses whitespace" "tokens \" a b \"" (list "a" "b")) + (list "tokens empty is empty" "tokens \"\"" (list)) + (list "tokens keeps digits" "tokens \"abc123 x9\"" (list "abc123" "x9")) + (list + "positioned attaches ordinals" + "positioned \"a b a\"" + (list + (list "a" 0) + (list "b" 1) + (list "a" 2))) + (list + "index + lookup single doc" + "lookupTerm \"cat\" (indexDoc 1 \"the cat sat\" emptyIndex)" + (list (list 1 (list 1)))) + (list + "lookup missing term is empty" + "lookupTerm \"dog\" (indexDoc 1 \"the cat sat\" emptyIndex)" + (list)) + (list + "lookup records all positions" + "lookupTerm \"the\" (indexDoc 1 \"the cat the dog the\" emptyIndex)" + (list (list 1 (list 0 2 4)))) + (list + "multi-doc posting list sorted by docid" + "lookupTerm \"x\" (indexDoc 1 \"x y\" (indexDoc 2 \"x z\" emptyIndex))" + (list + (list 1 (list 0)) + (list 2 (list 0)))) + (list + "index/query case symmetry" + "lookupTerm \"cat\" (indexDoc 1 \"CAT Cat cat\" emptyIndex)" + (list (list 1 (list 0 1 2)))) + (list + "re-index replaces a doc" + "lookupTerm \"a\" (indexDoc 1 \"a a a\" (indexDoc 1 \"a\" emptyIndex))" + (list (list 1 (list 0 1 2)))) + (list + "delete removes a doc" + "lookupTerm \"cat\" (deleteDoc 1 (indexDoc 1 \"the cat\" emptyIndex))" + (list)) + (list + "delete leaves other docs" + "lookupTerm \"cat\" (deleteDoc 2 (indexDoc 2 \"big cat\" (indexDoc 1 \"the cat\" emptyIndex)))" + (list (list 1 (list 1)))) + (list + "docFreq counts docs" + "[docFreq \"cat\" (indexDoc 2 \"a cat\" (indexDoc 1 \"the cat\" emptyIndex))]" + (list 2)) + (list + "docFreq zero for missing" + "[docFreq \"zzz\" (indexDoc 1 \"a b\" emptyIndex)]" + (list 0)) + (list + "allTerms sorted and unique" + "allTerms (indexDoc 1 \"banana apple cherry apple\" emptyIndex)" + (list "apple" "banana" "cherry")) + (list + "allTerms merged across docs" + "allTerms (indexDoc 2 \"d a\" (indexDoc 1 \"c b\" emptyIndex))" + (list "a" "b" "c" "d")))) -(hk-test - "index/query case symmetry" - (search-eval - "\nresult = lookupTerm \"cat\" (indexDoc 1 \"CAT Cat cat\" emptyIndex)\n" - "result") - (list (list 1 (list 0 1 2)))) +(define + index-results + (search-batch "" (map (fn (c) (nth c 1)) index-cases))) -(hk-test - "re-index replaces a doc" - (search-eval - "\nresult = lookupTerm \"a\" (indexDoc 1 \"a a a\" (indexDoc 1 \"a\" emptyIndex))\n" - "result") - (list (list 1 (list 0 1 2)))) - -(hk-test - "delete removes a doc" - (search-eval - "\nresult = lookupTerm \"cat\" (deleteDoc 1 (indexDoc 1 \"the cat\" emptyIndex))\n" - "result") - (list)) - -(hk-test - "delete leaves other docs" - (search-eval - "\nresult = lookupTerm \"cat\" (deleteDoc 2 (indexDoc 2 \"big cat\" (indexDoc 1 \"the cat\" emptyIndex)))\n" - "result") - (list (list 1 (list 1)))) - -(hk-test - "docFreq counts docs" - (search-eval - "\nresult = docFreq \"cat\" (indexDoc 2 \"a cat\" (indexDoc 1 \"the cat\" emptyIndex))\n" - "result") - 2) - -(hk-test - "docFreq zero for missing" - (search-eval - "\nresult = docFreq \"zzz\" (indexDoc 1 \"a b\" emptyIndex)\n" - "result") - 0) - -(hk-test - "allTerms sorted and unique" - (search-eval - "\nresult = allTerms (indexDoc 1 \"banana apple cherry apple\" emptyIndex)\n" - "result") - (list "apple" "banana" "cherry")) - -(hk-test - "allTerms merged across docs" - (search-eval - "\nresult = allTerms (indexDoc 2 \"d a\" (indexDoc 1 \"c b\" emptyIndex))\n" - "result") - (list "a" "b" "c" "d")) +(map-indexed + (fn + (i c) + (hk-test (nth c 0) (nth index-results i) (nth c 2))) + index-cases) {:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails} diff --git a/plans/search-on-sx.md b/plans/search-on-sx.md index 1baf6e9a..43c60240 100644 --- a/plans/search-on-sx.md +++ b/plans/search-on-sx.md @@ -76,13 +76,14 @@ lib/search/index.sx lib/search/eval.sx ## Phase 2 — Query AST + boolean evaluation -- [ ] Query ADT: `Term Text | And Query Query | Or Query Query | Not Query | - Phrase [Text]` +- [x] Query ADT: `Term String | And Query Query | Or Query Query | Not Query | + Phrase [String]` (in `lib/search/query.sx`) - [ ] `lib/search/parse.sx` — query syntax parser (boolean operators, quoted phrases) -- [ ] `lib/search/eval.sx` — boolean eval via set ops on posting lists -- [ ] phrase eval — adjacency check using positions -- [ ] `lib/search/tests/boolean.sx` — 25+ cases: term, and, or, not, phrase, - composition, parser edge cases +- [x] `lib/search/query.sx` — boolean eval via set ops on docid-sorted posting lists + (sortedUnion/Inter/Diff, Not over allDocs universe) +- [x] phrase eval — positional adjacency check (phraseInDoc / phraseStartsAt) +- [x] `lib/search/tests/boolean.sx` — 28 cases: term, and, or, not, phrase, + composition (parser edge cases move to the parse.sx suite) ## Phase 3 — Ranking @@ -102,6 +103,14 @@ lib/search/index.sx lib/search/eval.sx ## Progress log +- **Phase 2 boolean/phrase eval (46/46 total).** Query ADT + `Term|And|Or|Not|Phrase` + `evalQuery :: Index -> Query -> [DocId]` in query.sx. + Boolean ops are linear merges over docid-sorted posting lists; Not subtracts from + the allDocs universe; Phrase checks positional adjacency. 28 tests in boolean.sx. + Refactored both suites to **batch all cases into one program eval** (search-batch + in testlib) — under the heavy CPU load on this box (~11 on 2 cores), 18–28 separate + hk-eval-program calls timed out; one combined eval per suite is ~20× faster. + Parser (parse.sx) is the remaining Phase 2 box. - **Phase 1 complete (18/18).** Tokenizer (lowercase + strip punctuation + positions), inverted index as sorted assoc-list `[(Term,[(DocId,[Pos])])]`, indexDoc/deleteDoc/ lookupTerm/docFreq/allTerms. Search lib is Haskell source assembled into `search/src`