diff --git a/lib/search/api.sx b/lib/search/api.sx index e2da2bb6..5a275f4d 100644 --- a/lib/search/api.sx +++ b/lib/search/api.sx @@ -2,8 +2,15 @@ ;; Tests and callers concatenate `search/src` with their own top-level bindings ;; (e.g. "result = lookupTerm \"cat\" idx\n") and evaluate via the haskell-on-sx ;; interpreter. Public Haskell entry points: indexDoc, lookupTerm, deleteDoc, -;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery. +;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery, searchQuery, showQ. (define search/src - (str search/tokenize-src "\n" search/index-src "\n" search/query-src)) + (str + search/tokenize-src + "\n" + search/index-src + "\n" + search/query-src + "\n" + search/parse-src)) diff --git a/lib/search/conformance.conf b/lib/search/conformance.conf index 4e418e9f..6e9e8309 100644 --- a/lib/search/conformance.conf +++ b/lib/search/conformance.conf @@ -21,6 +21,7 @@ PRELOADS=( lib/search/tokenize.sx lib/search/index.sx lib/search/query.sx + lib/search/parse.sx lib/search/api.sx lib/search/testlib.sx ) @@ -28,4 +29,5 @@ PRELOADS=( SUITES=( "index:lib/search/tests/index.sx" "boolean:lib/search/tests/boolean.sx" + "parse:lib/search/tests/parse.sx" ) diff --git a/lib/search/parse.sx b/lib/search/parse.sx new file mode 100644 index 00000000..a1dc4c8b --- /dev/null +++ b/lib/search/parse.sx @@ -0,0 +1,18 @@ +;; search query parser — Haskell source fragment. Depends on tokenize + query. +;; Grammar (precedence OR < AND < NOT): +;; expr = orExpr +;; orExpr = andExpr (OR andExpr)* +;; andExpr= notExpr (AND? notExpr)* -- AND keyword is optional: adjacency means AND +;; notExpr= NOT notExpr | atom +;; atom = '(' expr ')' | '"' word+ '"' | word +;; Keywords AND/OR/NOT are case-insensitive; bare words are normalized via tokens. 
+;; Gotchas: delimiters matched by ord (escaped char literals like '\"' break the +;; haskell-on-sx tokenizer); an [] *pattern* inside a `case` alt also breaks the +;; parser, so qFirstTok (used by qNormTerm)/qDropRP/showQ are written as multi-clause functions. A bare word that normalizes to "" (tokens returned nothing — presumably lone punctuation; confirm against tokenize.sx) is dropped by qKeepTok, so e.g. `cat - dog` no longer puts an empty Term into the implicit-AND chain. +;; parseQuery :: String -> Query +;; searchQuery :: String -> Index -> [DocId] +;; showQ :: Query -> String -- canonical render for tests/debug + +(define + search/parse-src + "data QTok = TAnd | TOr | TNot | TLP | TRP | TWord String | TPhrase [String]\nqIsSpace c = ord c == 32\nqIsLP c = ord c == 40\nqIsRP c = ord c == 41\nqIsQuote c = ord c == 34\nqDelim c = qIsSpace c || qIsLP c || qIsRP c || qIsQuote c\nqReadWord [] = ([], [])\nqReadWord (c:cs) = if qDelim c then ([], c:cs) else let (w, rest) = qReadWord cs in (c:w, rest)\nqReadPhrase [] = ([], [])\nqReadPhrase (c:cs) = if qIsQuote c then ([], cs) else let (w, rest) = qReadPhrase cs in (c:w, rest)\ntoUpperCh c = chr (toUpper (ord c))\nqUpper w = joinChars (map toUpperCh w)\nqFirstTok [] = \"\"\nqFirstTok (x:xs) = x\nqNormTerm w = qFirstTok (tokens w)\nqClassify w = if qUpper w == \"AND\" then TAnd else if qUpper w == \"OR\" then TOr else if qUpper w == \"NOT\" then TNot else TWord (qNormTerm w)\nqPhraseTok cs = let (p, rest) = qReadPhrase cs in TPhrase (tokens p) : qtokens rest\nqKeepTok (TWord w) ts = if w == \"\" then ts else TWord w : ts\nqKeepTok tok ts = tok : ts\nqWordTok cs = let (w, rest) = qReadWord cs in qKeepTok (qClassify w) (qtokens rest)\nqtokens [] = []\nqtokens (c:cs) = if qIsSpace c then qtokens cs else if qIsLP c then TLP : qtokens cs else if qIsRP c then TRP : qtokens cs else if qIsQuote c then qPhraseTok cs else qWordTok (c:cs)\nqDropRP (q, (TRP:rest)) = (q, rest)\nqDropRP (q, ts) = (q, ts)\nparseAtom [] = (Term \"\", [])\nparseAtom (TLP:ts) = qDropRP (parseExpr ts)\nparseAtom (TPhrase ps : ts) = (Phrase ps, ts)\nparseAtom (TWord w : ts) = (Term w, ts)\nparseAtom ts = (Term \"\", ts)\nqWrapNot (q, ts) = (Not q, ts)\nparseNot (TNot:ts) = qWrapNot (parseNot ts)\nparseNot ts = parseAtom ts\nqStartsAtom (TWord w : ts) = True\nqStartsAtom (TPhrase p : ts) = 
True\nqStartsAtom (TLP : ts) = True\nqStartsAtom (TNot : ts) = True\nqStartsAtom ts = False\nqAndStep left ts = let (r, rest) = parseNot ts in parseAndR (And left r) rest\nparseAndR left (TAnd:ts) = qAndStep left ts\nparseAndR left ts = if qStartsAtom ts then qAndStep left ts else (left, ts)\nparseAnd ts = let (l, rest) = parseNot ts in parseAndR l rest\nparseOrR left (TOr:ts) = let (r, rest) = parseAnd ts in parseOrR (Or left r) rest\nparseOrR left ts = (left, ts)\nparseExpr ts = let (l, rest) = parseAnd ts in parseOrR l rest\nparseQuery s = fst (parseExpr (qtokens s))\nsearchQuery s idx = evalQuery idx (parseQuery s)\njoinSp [] = \"\"\njoinSp [x] = x\njoinSp (x:xs) = x ++ \"-\" ++ joinSp xs\nshowQ (Term t) = \"T:\" ++ t\nshowQ (And a b) = \"(\" ++ showQ a ++ \" & \" ++ showQ b ++ \")\"\nshowQ (Or a b) = \"(\" ++ showQ a ++ \" | \" ++ showQ b ++ \")\"\nshowQ (Not a) = \"!\" ++ showQ a\nshowQ (Phrase ts) = \"P:\" ++ joinSp ts\n") diff --git a/lib/search/scoreboard.json b/lib/search/scoreboard.json index 51e8a2ec..4aab2a38 100644 --- a/lib/search/scoreboard.json +++ b/lib/search/scoreboard.json @@ -1,11 +1,12 @@ { "lang": "search", - "total_passed": 46, + "total_passed": 78, "total_failed": 0, - "total": 46, + "total": 78, "suites": [ {"name":"index","passed":18,"failed":0,"total":18}, - {"name":"boolean","passed":28,"failed":0,"total":28} + {"name":"boolean","passed":28,"failed":0,"total":28}, + {"name":"parse","passed":32,"failed":0,"total":32} ], - "generated": "2026-06-06T18:46:54+00:00" + "generated": "2026-06-06T19:42:39+00:00" } diff --git a/lib/search/scoreboard.md b/lib/search/scoreboard.md index a214ce29..0a71fd42 100644 --- a/lib/search/scoreboard.md +++ b/lib/search/scoreboard.md @@ -1,8 +1,9 @@ # search scoreboard -**46 / 46 passing** (0 failure(s)). +**78 / 78 passing** (0 failure(s)). 
| Suite | Passed | Total | Status | |-------|--------|-------|--------| | index | 18 | 18 | ok | | boolean | 28 | 28 | ok | +| parse | 32 | 32 | ok | diff --git a/lib/search/tests/parse.sx b/lib/search/tests/parse.sx new file mode 100644 index 00000000..8f7f0ebd --- /dev/null +++ b/lib/search/tests/parse.sx @@ -0,0 +1,139 @@ +;; Phase 2 — query parser (parseQuery / searchQuery). +;; AST cases assert showQ (parseQuery s), rendered as T:term, (a & b), (a | b), !a, P:w1-w2; search cases assert searchQuery s idx +;; against the standard corpus. Each group runs in one batched program eval; every case is a (label expr expected) triple and results are paired with cases by index. +;; doc 1 "the quick brown dog" doc 2 "a quick brown fox" doc 3 "the dog barks loudly" + +(define + parse-corpus + "idx = indexDoc 3 \"the dog barks loudly\" (indexDoc 2 \"a quick brown fox\" (indexDoc 1 \"the quick brown dog\" emptyIndex))\n") + +(define + ast-cases + (list + (list "single term" "showQ (parseQuery \"cat\")" "T:cat") + (list "term normalized" "showQ (parseQuery \"CAT\")" "T:cat") + (list "explicit and" "showQ (parseQuery \"cat AND dog\")" "(T:cat & T:dog)") + (list + "lowercase and keyword" + "showQ (parseQuery \"cat and dog\")" + "(T:cat & T:dog)") + (list "implicit and" "showQ (parseQuery \"cat dog\")" "(T:cat & T:dog)") + (list "or" "showQ (parseQuery \"cat OR dog\")" "(T:cat | T:dog)") + (list "not" "showQ (parseQuery \"NOT cat\")" "!T:cat") + (list + "and binds tighter than or" + "showQ (parseQuery \"cat AND dog OR bird\")" + "((T:cat & T:dog) | T:bird)") + (list + "or then and" + "showQ (parseQuery \"cat OR dog AND bird\")" + "(T:cat | (T:dog & T:bird))") + (list + "parens override precedence" + "showQ (parseQuery \"(cat OR dog) AND bird\")" + "((T:cat | T:dog) & T:bird)") + (list + "and with not" + "showQ (parseQuery \"cat AND NOT dog\")" + "(T:cat & !T:dog)") + (list + "two-word phrase" + "showQ (parseQuery \"\\\"quick brown\\\"\")" + "P:quick-brown") + (list + "three-word phrase" + "showQ (parseQuery \"\\\"quick brown fox\\\"\")" + "P:quick-brown-fox") + (list + "and left-assoc" + "showQ (parseQuery \"a AND b 
AND c\")" + "((T:a & T:b) & T:c)") + (list + "or left-assoc" + "showQ (parseQuery \"a OR b OR c\")" + "((T:a | T:b) | T:c)") + (list + "punctuation stripped" + "showQ (parseQuery \"cat, dog!\")" + "(T:cat & T:dog)"))) + +(define + search-cases + (list + (list "term" "searchQuery \"quick\" idx" (list 1 2)) + (list + "term normalized" + "searchQuery \"QUICK\" idx" + (list 1 2)) + (list + "explicit and" + "searchQuery \"quick AND brown\" idx" + (list 1 2)) + (list + "implicit and" + "searchQuery \"quick brown\" idx" + (list 1 2)) + (list "and disjoint" "searchQuery \"the AND fox\" idx" (list)) + (list "or" "searchQuery \"fox OR barks\" idx" (list 2 3)) + (list "not" "searchQuery \"NOT the\" idx" (list 2)) + (list "and not" "searchQuery \"quick AND NOT the\" idx" (list 2)) + (list + "precedence and-or" + "searchQuery \"the AND dog OR fox\" idx" + (list 1 2 3)) + (list + "precedence or-and" + "searchQuery \"fox OR the AND dog\" idx" + (list 1 2 3)) + (list + "parens" + "searchQuery \"the AND (dog OR fox)\" idx" + (list 1 3)) + (list + "phrase" + "searchQuery \"\\\"quick brown\\\"\" idx" + (list 1 2)) + (list + "phrase one doc" + "searchQuery \"\\\"brown dog\\\"\" idx" + (list 1)) + (list + "phrase and term" + "searchQuery \"\\\"quick brown\\\" AND dog\" idx" + (list 1)) + (list + "not phrase" + "searchQuery \"NOT \\\"quick brown\\\"\" idx" + (list 3)) + (list + "implicit and terms" + "searchQuery \"dog barks\" idx" + (list 3)))) + +(define + ast-results + (search-batch "" (map (fn (c) (nth c 1)) ast-cases))) +(define + search-results + (search-batch + parse-corpus + (map (fn (c) (nth c 1)) search-cases))) + +(map-indexed + (fn + (i c) + (hk-test + (str "ast: " (nth c 0)) + (nth ast-results i) + (nth c 2))) + ast-cases) +(map-indexed + (fn + (i c) + (hk-test + (str "search: " (nth c 0)) + (nth search-results i) + (nth c 2))) + search-cases) + +{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails} diff --git a/plans/search-on-sx.md b/plans/search-on-sx.md index 
43c60240..1ebb57b6 100644 --- a/plans/search-on-sx.md +++ b/plans/search-on-sx.md @@ -10,7 +10,7 @@ extension that merges per-peer indices. ## Status (rolling) -`bash lib/search/conformance.sh` → **18/18** (Phase 1 complete) +`bash lib/search/conformance.sh` → **78/78** (Phases 1–2 complete) ## Ground rules @@ -78,7 +78,9 @@ lib/search/index.sx lib/search/eval.sx - [x] Query ADT: `Term String | And Query Query | Or Query Query | Not Query | Phrase [String]` (in `lib/search/query.sx`) -- [ ] `lib/search/parse.sx` — query syntax parser (boolean operators, quoted phrases) +- [x] `lib/search/parse.sx` — query syntax parser: tokenizer + recursive-descent + (OR < AND < NOT precedence, implicit AND on adjacency, quoted phrases, parens, + case-insensitive keywords); `parseQuery`, `searchQuery`, `showQ` - [x] `lib/search/query.sx` — boolean eval via set ops on docid-sorted posting lists (sortedUnion/Inter/Diff, Not over allDocs universe) - [x] phrase eval — positional adjacency check (phraseInDoc / phraseStartsAt) @@ -103,6 +105,16 @@ lib/search/index.sx lib/search/eval.sx ## Progress log +- **Phase 2 complete — parser (78/78 total).** Query tokenizer (ord-based + delimiters, quoted phrases) + recursive-descent parser with OR Query -> [DocId]` in query.sx. Boolean ops are linear merges over docid-sorted posting lists; Not subtracts from