search: Phase 2 query AST + boolean/phrase eval + 28 tests
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 53s
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 53s
Query ADT (Term|And|Or|Not|Phrase) and evalQuery over docid-sorted posting lists: boolean ops as linear merges, Not over the allDocs universe, Phrase via positional adjacency. Batched both test suites into one program eval each (search-batch) so they finish under heavy CPU load. 46/46. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
123
lib/search/tests/boolean.sx
Normal file
123
lib/search/tests/boolean.sx
Normal file
@@ -0,0 +1,123 @@
|
||||
;; Phase 2 — query AST + boolean/phrase evaluation (hand-built Query values).
|
||||
;; Corpus:
|
||||
;; doc 1 "the quick brown dog" -> the quick brown dog
|
||||
;; doc 2 "a quick brown fox" -> a quick brown fox
|
||||
;; doc 3 "the dog barks loudly" -> the dog barks loudly
|
||||
;; All queries run in ONE program evaluation (search-batch) to stay fast.
|
||||
|
||||
(define
|
||||
search-corpus
|
||||
"idx = indexDoc 3 \"the dog barks loudly\" (indexDoc 2 \"a quick brown fox\" (indexDoc 1 \"the quick brown dog\" emptyIndex))\n")
|
||||
|
||||
(define
|
||||
bool-cases
|
||||
(list
|
||||
(list
|
||||
"term in two docs"
|
||||
"evalQuery idx (Term \"quick\")"
|
||||
(list 1 2))
|
||||
(list
|
||||
"term in two docs (the)"
|
||||
"evalQuery idx (Term \"the\")"
|
||||
(list 1 3))
|
||||
(list "term in one doc" "evalQuery idx (Term \"fox\")" (list 2))
|
||||
(list "term absent" "evalQuery idx (Term \"zzz\")" (list))
|
||||
(list
|
||||
"term case-sensitive at AST level"
|
||||
"evalQuery idx (Term \"QUICK\")"
|
||||
(list))
|
||||
(list "term on empty index" "evalQuery emptyIndex (Term \"cat\")" (list))
|
||||
(list
|
||||
"and both terms"
|
||||
"evalQuery idx (And (Term \"quick\") (Term \"brown\"))"
|
||||
(list 1 2))
|
||||
(list
|
||||
"and overlap subset"
|
||||
"evalQuery idx (And (Term \"the\") (Term \"dog\"))"
|
||||
(list 1 3))
|
||||
(list
|
||||
"and disjoint is empty"
|
||||
"evalQuery idx (And (Term \"the\") (Term \"fox\"))"
|
||||
(list))
|
||||
(list
|
||||
"and right-nested"
|
||||
"evalQuery idx (And (Term \"the\") (And (Term \"dog\") (Term \"barks\")))"
|
||||
(list 3))
|
||||
(list
|
||||
"or two singletons"
|
||||
"evalQuery idx (Or (Term \"fox\") (Term \"barks\"))"
|
||||
(list 2 3))
|
||||
(list
|
||||
"or all docs"
|
||||
"evalQuery idx (Or (Term \"quick\") (Term \"the\"))"
|
||||
(list 1 2 3))
|
||||
(list
|
||||
"or with absent term"
|
||||
"evalQuery idx (Or (Term \"fox\") (Term \"zzz\"))"
|
||||
(list 2))
|
||||
(list "not term" "evalQuery idx (Not (Term \"the\"))" (list 2))
|
||||
(list "not term 2" "evalQuery idx (Not (Term \"quick\"))" (list 3))
|
||||
(list
|
||||
"and with not"
|
||||
"evalQuery idx (And (Term \"quick\") (Not (Term \"the\")))"
|
||||
(list 2))
|
||||
(list
|
||||
"double negation"
|
||||
"evalQuery idx (Not (Not (Term \"fox\")))"
|
||||
(list 2))
|
||||
(list
|
||||
"or of and with term"
|
||||
"evalQuery idx (Or (And (Term \"the\") (Term \"dog\")) (Term \"fox\"))"
|
||||
(list 1 2 3))
|
||||
(list
|
||||
"phrase adjacent both docs"
|
||||
"evalQuery idx (Phrase [\"quick\", \"brown\"])"
|
||||
(list 1 2))
|
||||
(list
|
||||
"phrase adjacent one doc"
|
||||
"evalQuery idx (Phrase [\"brown\", \"dog\"])"
|
||||
(list 1))
|
||||
(list
|
||||
"phrase the quick"
|
||||
"evalQuery idx (Phrase [\"the\", \"quick\"])"
|
||||
(list 1))
|
||||
(list
|
||||
"phrase dog barks"
|
||||
"evalQuery idx (Phrase [\"dog\", \"barks\"])"
|
||||
(list 3))
|
||||
(list
|
||||
"phrase non-adjacent empty"
|
||||
"evalQuery idx (Phrase [\"quick\", \"dog\"])"
|
||||
(list))
|
||||
(list
|
||||
"phrase order matters"
|
||||
"evalQuery idx (Phrase [\"brown\", \"quick\"])"
|
||||
(list))
|
||||
(list
|
||||
"phrase single term"
|
||||
"evalQuery idx (Phrase [\"dog\"])"
|
||||
(list 1 3))
|
||||
(list
|
||||
"phrase three terms"
|
||||
"evalQuery idx (Phrase [\"the\", \"dog\", \"barks\"])"
|
||||
(list 3))
|
||||
(list
|
||||
"and of phrase and term"
|
||||
"evalQuery idx (And (Phrase [\"quick\", \"brown\"]) (Term \"dog\"))"
|
||||
(list 1))
|
||||
(list
|
||||
"not of phrase"
|
||||
"evalQuery idx (Not (Phrase [\"quick\", \"brown\"]))"
|
||||
(list 3))))
|
||||
|
||||
(define
|
||||
bool-results
|
||||
(search-batch search-corpus (map (fn (c) (nth c 1)) bool-cases)))
|
||||
|
||||
(map-indexed
|
||||
(fn
|
||||
(i c)
|
||||
(hk-test (nth c 0) (nth bool-results i) (nth c 2)))
|
||||
bool-cases)
|
||||
|
||||
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,119 +1,88 @@
|
||||
;; Phase 1 — tokenize + inverted index.
|
||||
;; All cases run in ONE program evaluation (search-batch) to stay fast under load.
|
||||
;; Scalar results (docFreq) are wrapped as singleton lists so the batch is a list
|
||||
;; of lists.
|
||||
|
||||
(hk-test
|
||||
"tokens basic lowercases"
|
||||
(search-eval "\nresult = tokens \"The Cat sat\"\n" "result")
|
||||
(list "the" "cat" "sat"))
|
||||
|
||||
(hk-test
|
||||
"tokens strips punctuation"
|
||||
(search-eval "\nresult = tokens \"Hello, World!\"\n" "result")
|
||||
(list "hello" "world"))
|
||||
|
||||
(hk-test
|
||||
"tokens collapses whitespace"
|
||||
(search-eval "\nresult = tokens \" a b \"\n" "result")
|
||||
(list "a" "b"))
|
||||
|
||||
(hk-test
|
||||
"tokens empty is empty"
|
||||
(search-eval "\nresult = tokens \"\"\n" "result")
|
||||
(list))
|
||||
|
||||
(hk-test
|
||||
"tokens keeps digits"
|
||||
(search-eval "\nresult = tokens \"abc123 x9\"\n" "result")
|
||||
(list "abc123" "x9"))
|
||||
|
||||
(hk-test
|
||||
"positioned attaches ordinals"
|
||||
(search-eval "\nresult = positioned \"a b a\"\n" "result")
|
||||
(list (list "a" 0) (list "b" 1) (list "a" 2)))
|
||||
|
||||
(hk-test
|
||||
"index + lookup single doc"
|
||||
(search-eval
|
||||
"\nresult = lookupTerm \"cat\" (indexDoc 1 \"the cat sat\" emptyIndex)\n"
|
||||
"result")
|
||||
(list (list 1 (list 1))))
|
||||
|
||||
(hk-test
|
||||
"lookup missing term is empty"
|
||||
(search-eval
|
||||
"\nresult = lookupTerm \"dog\" (indexDoc 1 \"the cat sat\" emptyIndex)\n"
|
||||
"result")
|
||||
(list))
|
||||
|
||||
(hk-test
|
||||
"lookup records all positions"
|
||||
(search-eval
|
||||
"\nresult = lookupTerm \"the\" (indexDoc 1 \"the cat the dog the\" emptyIndex)\n"
|
||||
"result")
|
||||
(list (list 1 (list 0 2 4))))
|
||||
|
||||
(hk-test
|
||||
"multi-doc posting list sorted by docid"
|
||||
(search-eval
|
||||
"\nresult = lookupTerm \"x\" (indexDoc 1 \"x y\" (indexDoc 2 \"x z\" emptyIndex))\n"
|
||||
"result")
|
||||
(define
|
||||
index-cases
|
||||
(list
|
||||
(list 1 (list 0))
|
||||
(list 2 (list 0))))
|
||||
(list
|
||||
"tokens basic lowercases"
|
||||
"tokens \"The Cat sat\""
|
||||
(list "the" "cat" "sat"))
|
||||
(list
|
||||
"tokens strips punctuation"
|
||||
"tokens \"Hello, World!\""
|
||||
(list "hello" "world"))
|
||||
(list "tokens collapses whitespace" "tokens \" a b \"" (list "a" "b"))
|
||||
(list "tokens empty is empty" "tokens \"\"" (list))
|
||||
(list "tokens keeps digits" "tokens \"abc123 x9\"" (list "abc123" "x9"))
|
||||
(list
|
||||
"positioned attaches ordinals"
|
||||
"positioned \"a b a\""
|
||||
(list
|
||||
(list "a" 0)
|
||||
(list "b" 1)
|
||||
(list "a" 2)))
|
||||
(list
|
||||
"index + lookup single doc"
|
||||
"lookupTerm \"cat\" (indexDoc 1 \"the cat sat\" emptyIndex)"
|
||||
(list (list 1 (list 1))))
|
||||
(list
|
||||
"lookup missing term is empty"
|
||||
"lookupTerm \"dog\" (indexDoc 1 \"the cat sat\" emptyIndex)"
|
||||
(list))
|
||||
(list
|
||||
"lookup records all positions"
|
||||
"lookupTerm \"the\" (indexDoc 1 \"the cat the dog the\" emptyIndex)"
|
||||
(list (list 1 (list 0 2 4))))
|
||||
(list
|
||||
"multi-doc posting list sorted by docid"
|
||||
"lookupTerm \"x\" (indexDoc 1 \"x y\" (indexDoc 2 \"x z\" emptyIndex))"
|
||||
(list
|
||||
(list 1 (list 0))
|
||||
(list 2 (list 0))))
|
||||
(list
|
||||
"index/query case symmetry"
|
||||
"lookupTerm \"cat\" (indexDoc 1 \"CAT Cat cat\" emptyIndex)"
|
||||
(list (list 1 (list 0 1 2))))
|
||||
(list
|
||||
"re-index replaces a doc"
|
||||
"lookupTerm \"a\" (indexDoc 1 \"a a a\" (indexDoc 1 \"a\" emptyIndex))"
|
||||
(list (list 1 (list 0 1 2))))
|
||||
(list
|
||||
"delete removes a doc"
|
||||
"lookupTerm \"cat\" (deleteDoc 1 (indexDoc 1 \"the cat\" emptyIndex))"
|
||||
(list))
|
||||
(list
|
||||
"delete leaves other docs"
|
||||
"lookupTerm \"cat\" (deleteDoc 2 (indexDoc 2 \"big cat\" (indexDoc 1 \"the cat\" emptyIndex)))"
|
||||
(list (list 1 (list 1))))
|
||||
(list
|
||||
"docFreq counts docs"
|
||||
"[docFreq \"cat\" (indexDoc 2 \"a cat\" (indexDoc 1 \"the cat\" emptyIndex))]"
|
||||
(list 2))
|
||||
(list
|
||||
"docFreq zero for missing"
|
||||
"[docFreq \"zzz\" (indexDoc 1 \"a b\" emptyIndex)]"
|
||||
(list 0))
|
||||
(list
|
||||
"allTerms sorted and unique"
|
||||
"allTerms (indexDoc 1 \"banana apple cherry apple\" emptyIndex)"
|
||||
(list "apple" "banana" "cherry"))
|
||||
(list
|
||||
"allTerms merged across docs"
|
||||
"allTerms (indexDoc 2 \"d a\" (indexDoc 1 \"c b\" emptyIndex))"
|
||||
(list "a" "b" "c" "d"))))
|
||||
|
||||
(hk-test
|
||||
"index/query case symmetry"
|
||||
(search-eval
|
||||
"\nresult = lookupTerm \"cat\" (indexDoc 1 \"CAT Cat cat\" emptyIndex)\n"
|
||||
"result")
|
||||
(list (list 1 (list 0 1 2))))
|
||||
(define
|
||||
index-results
|
||||
(search-batch "" (map (fn (c) (nth c 1)) index-cases)))
|
||||
|
||||
(hk-test
|
||||
"re-index replaces a doc"
|
||||
(search-eval
|
||||
"\nresult = lookupTerm \"a\" (indexDoc 1 \"a a a\" (indexDoc 1 \"a\" emptyIndex))\n"
|
||||
"result")
|
||||
(list (list 1 (list 0 1 2))))
|
||||
|
||||
(hk-test
|
||||
"delete removes a doc"
|
||||
(search-eval
|
||||
"\nresult = lookupTerm \"cat\" (deleteDoc 1 (indexDoc 1 \"the cat\" emptyIndex))\n"
|
||||
"result")
|
||||
(list))
|
||||
|
||||
(hk-test
|
||||
"delete leaves other docs"
|
||||
(search-eval
|
||||
"\nresult = lookupTerm \"cat\" (deleteDoc 2 (indexDoc 2 \"big cat\" (indexDoc 1 \"the cat\" emptyIndex)))\n"
|
||||
"result")
|
||||
(list (list 1 (list 1))))
|
||||
|
||||
(hk-test
|
||||
"docFreq counts docs"
|
||||
(search-eval
|
||||
"\nresult = docFreq \"cat\" (indexDoc 2 \"a cat\" (indexDoc 1 \"the cat\" emptyIndex))\n"
|
||||
"result")
|
||||
2)
|
||||
|
||||
(hk-test
|
||||
"docFreq zero for missing"
|
||||
(search-eval
|
||||
"\nresult = docFreq \"zzz\" (indexDoc 1 \"a b\" emptyIndex)\n"
|
||||
"result")
|
||||
0)
|
||||
|
||||
(hk-test
|
||||
"allTerms sorted and unique"
|
||||
(search-eval
|
||||
"\nresult = allTerms (indexDoc 1 \"banana apple cherry apple\" emptyIndex)\n"
|
||||
"result")
|
||||
(list "apple" "banana" "cherry"))
|
||||
|
||||
(hk-test
|
||||
"allTerms merged across docs"
|
||||
(search-eval
|
||||
"\nresult = allTerms (indexDoc 2 \"d a\" (indexDoc 1 \"c b\" emptyIndex))\n"
|
||||
"result")
|
||||
(list "a" "b" "c" "d"))
|
||||
(map-indexed
|
||||
(fn
|
||||
(i c)
|
||||
(hk-test (nth c 0) (nth index-results i) (nth c 2)))
|
||||
index-cases)
|
||||
|
||||
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
|
||||
Reference in New Issue
Block a user