diff --git a/lib/search/api.sx b/lib/search/api.sx new file mode 100644 index 00000000..8a06d444 --- /dev/null +++ b/lib/search/api.sx @@ -0,0 +1,7 @@ +;; search public API — assembles the canonical Haskell source from all layers. +;; Tests and callers concatenate `search/src` with their own top-level bindings +;; (e.g. "result = lookupTerm \"cat\" idx\n") and evaluate via the haskell-on-sx +;; interpreter. Public Haskell entry points: emptyIndex, indexDoc, lookupTerm, deleteDoc, +;; docFreq, allTerms, tokens, positioned. + +(define search/src (str search/tokenize-src "\n" search/index-src)) diff --git a/lib/search/conformance.conf b/lib/search/conformance.conf new file mode 100644 index 00000000..cc75c6e0 --- /dev/null +++ b/lib/search/conformance.conf @@ -0,0 +1,29 @@ +# search-on-sx conformance config — sourced by lib/guest/conformance.sh. + +LANG_NAME=search +SCOREBOARD_DIR=lib/search +MODE=counters +COUNTERS_PASS=hk-test-pass +COUNTERS_FAIL=hk-test-fail +TIMEOUT_PER_SUITE=600 + +PRELOADS=( + lib/haskell/tokenizer.sx + lib/haskell/layout.sx + lib/haskell/parser.sx + lib/haskell/desugar.sx + lib/haskell/runtime.sx + lib/haskell/match.sx + lib/haskell/eval.sx + lib/haskell/map.sx + lib/haskell/set.sx + lib/haskell/testlib.sx + lib/search/tokenize.sx + lib/search/index.sx + lib/search/api.sx + lib/search/testlib.sx +) + +SUITES=( + "index:lib/search/tests/index.sx" +) diff --git a/lib/search/conformance.sh b/lib/search/conformance.sh new file mode 100755 index 00000000..e50befa3 --- /dev/null +++ b/lib/search/conformance.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +# Thin wrapper — see lib/guest/conformance.sh and lib/search/conformance.conf. +exec bash "$(dirname "$0")/../guest/conformance.sh" "$(dirname "$0")/conformance.conf" "$@" diff --git a/lib/search/index.sx b/lib/search/index.sx new file mode 100644 index 00000000..3d285ec9 --- /dev/null +++ b/lib/search/index.sx @@ -0,0 +1,15 @@ +;; search inverted index — Haskell source fragment (depends on tokenize). 
+;; Index = [(Term, [(DocId, [Pos])])], sorted by Term; postings sorted by DocId. +;; Data.Map's public API lacks toList/keys/map/filter, so a sorted assoc-list +;; index is used — it is the conceptual `Map Term [(DocId,[Pos])]` and exposes +;; term iteration (allTerms) and df naturally for ranking. +;; emptyIndex :: Index +;; indexDoc :: DocId -> String -> Index -> Index (re-index replaces: the doc's old postings are dropped via deleteDoc first, so terms absent from the new text do not linger) +;; lookupTerm :: Term -> Index -> [(DocId, [Pos])] +;; deleteDoc :: DocId -> Index -> Index +;; docFreq :: Term -> Index -> Int +;; allTerms :: Index -> [Term] + +(define + search/index-src + "emptyIndex = []\ngroupBump [] t p = [(t, [p])]\ngroupBump (g:gs) t p = if fst g == t then (t, snd g ++ [p]) : gs else g : groupBump gs t p\ngroupStep acc tp = groupBump acc (fst tp) (snd tp)\ngroupTok pairs = foldl groupStep [] pairs\ninsPosting d ps [] = [(d, ps)]\ninsPosting d ps (q:qs) = if d < fst q then (d, ps) : q : qs else if d == fst q then (d, ps) : qs else q : insPosting d ps qs\ninsTerm t d ps [] = [(t, [(d, ps)])]\ninsTerm t d ps (e:es) = if t < fst e then (t, [(d, ps)]) : e : es else if t == fst e then (fst e, insPosting d ps (snd e)) : es else e : insTerm t d ps es\npostingKeep d q = fst q /= d\ndropTermDoc d e = (fst e, filter (postingKeep d) (snd e))\nplKeep e = not (null (snd e))\ndeleteDoc d idx = filter plKeep (map (dropTermDoc d) idx)\nindexStep d ix tp = insTerm (fst tp) d (snd tp) ix\nindexDoc d text idx = foldl (indexStep d) (deleteDoc d idx) (groupTok (positioned text))\nlookupTerm t idx = case lookup t idx of { Nothing -> []; Just pl -> pl }\ndocFreq t idx = length (lookupTerm t idx)\nallTerms idx = map fst idx\n") diff --git a/lib/search/scoreboard.json b/lib/search/scoreboard.json new file mode 100644 index 00000000..4c5202b0 --- /dev/null +++ b/lib/search/scoreboard.json @@ -0,0 +1,10 @@ +{ + "lang": "search", + "total_passed": 18, + "total_failed": 0, + "total": 18, + "suites": [ + {"name":"index","passed":18,"failed":0,"total":18} + ], + "generated": 
"2026-06-06T18:12:50+00:00" +} diff --git a/lib/search/scoreboard.md b/lib/search/scoreboard.md new file mode 100644 index 00000000..cf9cabce --- /dev/null +++ b/lib/search/scoreboard.md @@ -0,0 +1,7 @@ +# search scoreboard + +**18 / 18 passing** (0 failure(s)). + +| Suite | Passed | Total | Status | +|-------|--------|-------|--------| +| index | 18 | 18 | ok | diff --git a/lib/search/testlib.sx b/lib/search/testlib.sx new file mode 100644 index 00000000..9c965b05 --- /dev/null +++ b/lib/search/testlib.sx @@ -0,0 +1,29 @@ +;; search test helpers — convert forced haskell values to plain SX and run +;; programs built on top of search/src. Reuses hk-test / counters from +;; lib/haskell/testlib.sx (preloaded by the conformance config). + +;; Recursively turn a forced HK value into plain SX: +;; cons-list -> SX list, Tuple -> SX list, leaves unchanged. +(define + search-hk->sx + (fn + (v) + (cond + ((and (list? v) (not (empty? v)) (= (first v) "[]")) (list)) + ((and (list? v) (not (empty? v)) (= (first v) ":")) + (cons + (search-hk->sx (nth v 1)) + (search-hk->sx (nth v 2)))) + ((and (list? v) (not (empty? v)) (= (first v) "Tuple")) + (map search-hk->sx (rest v))) + (:else v)))) + +;; Evaluate `extra` (extra top-level Haskell bindings) on top of search/src +;; and return binding `name` as plain SX. +(define + search-eval + (fn + (extra name) + (search-hk->sx + (hk-deep-force + (get (hk-eval-program (hk-core (str search/src extra))) name))))) diff --git a/lib/search/tests/index.sx b/lib/search/tests/index.sx new file mode 100644 index 00000000..2e9cb700 --- /dev/null +++ b/lib/search/tests/index.sx @@ -0,0 +1,119 @@ +;; Phase 1 — tokenize + inverted index. 
+ +(hk-test + "tokens basic lowercases" + (search-eval "\nresult = tokens \"The Cat sat\"\n" "result") + (list "the" "cat" "sat")) + +(hk-test + "tokens strips punctuation" + (search-eval "\nresult = tokens \"Hello, World!\"\n" "result") + (list "hello" "world")) + +(hk-test + "tokens collapses whitespace" + (search-eval "\nresult = tokens \" a b \"\n" "result") + (list "a" "b")) + +(hk-test + "tokens empty is empty" + (search-eval "\nresult = tokens \"\"\n" "result") + (list)) + +(hk-test + "tokens keeps digits" + (search-eval "\nresult = tokens \"abc123 x9\"\n" "result") + (list "abc123" "x9")) + +(hk-test + "positioned attaches ordinals" + (search-eval "\nresult = positioned \"a b a\"\n" "result") + (list (list "a" 0) (list "b" 1) (list "a" 2))) + +(hk-test + "index + lookup single doc" + (search-eval + "\nresult = lookupTerm \"cat\" (indexDoc 1 \"the cat sat\" emptyIndex)\n" + "result") + (list (list 1 (list 1)))) + +(hk-test + "lookup missing term is empty" + (search-eval + "\nresult = lookupTerm \"dog\" (indexDoc 1 \"the cat sat\" emptyIndex)\n" + "result") + (list)) + +(hk-test + "lookup records all positions" + (search-eval + "\nresult = lookupTerm \"the\" (indexDoc 1 \"the cat the dog the\" emptyIndex)\n" + "result") + (list (list 1 (list 0 2 4)))) + +(hk-test + "multi-doc posting list sorted by docid" + (search-eval + "\nresult = lookupTerm \"x\" (indexDoc 1 \"x y\" (indexDoc 2 \"x z\" emptyIndex))\n" + "result") + (list + (list 1 (list 0)) + (list 2 (list 0)))) + +(hk-test + "index/query case symmetry" + (search-eval + "\nresult = lookupTerm \"cat\" (indexDoc 1 \"CAT Cat cat\" emptyIndex)\n" + "result") + (list (list 1 (list 0 1 2)))) + +(hk-test + "re-index replaces a doc" + (search-eval + "\nresult = lookupTerm \"a\" (indexDoc 1 \"a a a\" (indexDoc 1 \"a\" emptyIndex))\n" + "result") + (list (list 1 (list 0 1 2)))) + +(hk-test + "delete removes a doc" + (search-eval + "\nresult = lookupTerm \"cat\" (deleteDoc 1 (indexDoc 1 \"the cat\" emptyIndex))\n" + 
"result") + (list)) + +(hk-test + "delete leaves other docs" + (search-eval + "\nresult = lookupTerm \"cat\" (deleteDoc 2 (indexDoc 2 \"big cat\" (indexDoc 1 \"the cat\" emptyIndex)))\n" + "result") + (list (list 1 (list 1)))) + +(hk-test + "docFreq counts docs" + (search-eval + "\nresult = docFreq \"cat\" (indexDoc 2 \"a cat\" (indexDoc 1 \"the cat\" emptyIndex))\n" + "result") + 2) + +(hk-test + "docFreq zero for missing" + (search-eval + "\nresult = docFreq \"zzz\" (indexDoc 1 \"a b\" emptyIndex)\n" + "result") + 0) + +(hk-test + "allTerms sorted and unique" + (search-eval + "\nresult = allTerms (indexDoc 1 \"banana apple cherry apple\" emptyIndex)\n" + "result") + (list "apple" "banana" "cherry")) + +(hk-test + "allTerms merged across docs" + (search-eval + "\nresult = allTerms (indexDoc 2 \"d a\" (indexDoc 1 \"c b\" emptyIndex))\n" + "result") + (list "a" "b" "c" "d")) + +{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails} diff --git a/lib/search/tokenize.sx b/lib/search/tokenize.sx new file mode 100644 index 00000000..7c1d74d3 --- /dev/null +++ b/lib/search/tokenize.sx @@ -0,0 +1,8 @@ +;; search tokenizer — Haskell source fragment. +;; normalize (lowercase + strip punctuation), split on whitespace, attach positions. 
+;; tokens :: String -> [String] -- lowercased alphanumeric runs; every other character acts as a word separator +;; positioned :: String -> [(String, Int)] -- 0-based ordinal positions + +(define + search/tokenize-src + "lowerChar c = chr (toLower (ord c))\nnormChar c = if isAlphaNum c then lowerChar c else ' '\nisBlankCh c = c == ' '\ndropBlanks [] = []\ndropBlanks (c:cs) = if isBlankCh c then dropBlanks cs else c:cs\ntakeWord [] = []\ntakeWord (c:cs) = if isBlankCh c then [] else c : takeWord cs\nafterWord [] = []\nafterWord (c:cs) = if isBlankCh c then c:cs else afterWord cs\nsplitWords s = let s2 = dropBlanks s in if null s2 then [] else takeWord s2 : splitWords (afterWord s2)\nappendStr a b = a ++ b\njoinChars cs = foldr appendStr \"\" cs\ntokens s = map joinChars (splitWords (map normChar s))\nposFrom i [] = []\nposFrom i (x:xs) = (x, i) : posFrom (i + 1) xs\npositioned s = posFrom 0 (tokens s)\n") diff --git a/plans/search-on-sx.md b/plans/search-on-sx.md index 9e0045d4..1baf6e9a 100644 --- a/plans/search-on-sx.md +++ b/plans/search-on-sx.md @@ -10,7 +10,7 @@ extension that merges per-peer indices. ## Status (rolling) -`bash lib/search/conformance.sh` → **0/0** (not yet started) +`bash lib/search/conformance.sh` → **18/18** (Phase 1 complete) ## Ground rules @@ -61,15 +61,18 @@ lib/search/index.sx lib/search/eval.sx ## Phase 1 — Tokenize + index -- [ ] `lib/search/tokenize.sx` — normalize (lowercase, strip punctuation), split on +- [x] `lib/search/tokenize.sx` — normalize (lowercase, strip punctuation), split on whitespace, return positions -- [ ] `lib/search/index.sx` — inverted index data structure (typed `Map` from - haskell lib); `insert`, `delete`, `lookup` -- [ ] `lib/search/api.sx` — `(search/index doc)`, `(search/lookup term)` -- [ ] `lib/search/tests/index.sx` — 15+ cases: tokenize, insert + lookup, update, - delete, multi-doc -- [ ] `lib/search/scoreboard.{json,md}` -- [ ] `lib/search/conformance.sh` +- [x] `lib/search/index.sx` — inverted index data structure; `indexDoc`, `deleteDoc`, + `lookupTerm`, `docFreq`, `allTerms`. 
(Data.Map's public API lacks + toList/keys/map/filter, so a sorted assoc-list `[(Term,[(DocId,[Pos])])]` is used — + the conceptual `Map Term [(DocId,[Pos])]` with free term iteration.) +- [x] `lib/search/api.sx` — assembles `search/src` (tokenize + index); Haskell entry + points `indexDoc` / `lookupTerm` +- [x] `lib/search/tests/index.sx` — 18 cases: tokenize, insert + lookup, update, + delete, multi-doc, positions, docFreq, allTerms +- [x] `lib/search/scoreboard.{json,md}` +- [x] `lib/search/conformance.sh` ## Phase 2 — Query AST + boolean evaluation @@ -99,8 +102,19 @@ lib/search/index.sx lib/search/eval.sx ## Progress log -(loop fills this in) +- **Phase 1 complete (18/18).** Tokenizer (lowercase + strip punctuation + positions), + inverted index as sorted assoc-list `[(Term,[(DocId,[Pos])])]`, indexDoc/deleteDoc/ + lookupTerm/docFreq/allTerms. Search lib is Haskell source assembled into `search/src` + and evaluated via the haskell-on-sx interpreter; tests reuse `hk-test` counters and a + `search-eval` helper that forces HK values to plain SX. conformance.sh models + lib/haskell (MODE=counters, COUNTERS_PASS/FAIL=hk-test-pass/fail). ## Blockers -(loop fills this in) +- **None.** Note: the box is heavily CPU-oversubscribed by sibling loop agents + (load ~11 on 2 cores); each program eval is ~10× slower than nominal, so suite + timeout is set to 600s. Runs are correct, just slow. +- **Data.Map public API gap (informational, not fixing):** the haskell-on-sx + `import Data.Map` binds only empty/singleton/insert/lookup/member/size/null/delete/ + insertWith/adjust/findWithDefault — no toList/keys/elems/map/filter/unionWith. Index + uses a pure assoc-list instead so term iteration and federation merge stay simple.