Compare commits
27 Commits
loops/sear
...
loops/rada
| Author | SHA1 | Date | |
|---|---|---|---|
| 7cf661d514 | |||
| 4bbc27c159 | |||
| 1dc4548cc9 | |||
| 8cb985a2f3 | |||
| 80a925018c | |||
| adad4f4436 | |||
| a752334cc0 | |||
| 2b77dc9537 | |||
| 453f244a97 | |||
| 05f3ef9104 | |||
| 4b9b15e7c8 | |||
| dbc2daf64d | |||
| b6c2995b19 | |||
| d05b49873b | |||
| 8f9b8d6f5d | |||
| 4ee15a7ddd | |||
| 3480100caa | |||
| 0bd0003550 | |||
| d9f18a635e | |||
| 3aac6aae98 | |||
| 0d06966808 | |||
| 98ef13ad2a | |||
| 20c4a48d3b | |||
| b3e1af96af | |||
| 919bd961d1 | |||
| 1902cce57f | |||
| ff537bfba2 |
63
lib/apl/conformance.conf
Normal file
63
lib/apl/conformance.conf
Normal file
@@ -0,0 +1,63 @@
|
||||
# APL conformance config — sourced by lib/guest/conformance.sh.
|
||||
|
||||
LANG_NAME=apl
|
||||
MODE=counters
|
||||
COUNTERS_PASS=apl-test-pass
|
||||
COUNTERS_FAIL=apl-test-fail
|
||||
TIMEOUT_PER_SUITE=300
|
||||
|
||||
PRELOADS=(
|
||||
spec/stdlib.sx
|
||||
lib/r7rs.sx
|
||||
lib/apl/runtime.sx
|
||||
lib/apl/tokenizer.sx
|
||||
lib/apl/parser.sx
|
||||
lib/apl/transpile.sx
|
||||
lib/apl/test-harness.sx
|
||||
)
|
||||
|
||||
SUITES=(
|
||||
"structural:lib/apl/tests/structural.sx"
|
||||
"operators:lib/apl/tests/operators.sx"
|
||||
"dfn:lib/apl/tests/dfn.sx"
|
||||
"tradfn:lib/apl/tests/tradfn.sx"
|
||||
"valence:lib/apl/tests/valence.sx"
|
||||
"programs:lib/apl/tests/programs.sx"
|
||||
"system:lib/apl/tests/system.sx"
|
||||
"idioms:lib/apl/tests/idioms.sx"
|
||||
"eval-ops:lib/apl/tests/eval-ops.sx"
|
||||
"pipeline:lib/apl/tests/pipeline.sx"
|
||||
)
|
||||
|
||||
# Print the conformance scoreboard as a JSON document on stdout.
# Globals (read):
#   GC_NAMES, GC_PASS, GC_FAIL  parallel indexed arrays, one entry per suite
#   GC_TOTAL_PASS, GC_TOTAL_FAIL, GC_TOTAL  aggregate counters
#   NOTE(review): presumably filled in by lib/guest/conformance.sh before this
#   hook is called — confirm against the driver.
# Outputs:
#   An object with a "suites" map (name -> {"pass", "fail"}) followed by
#   "total_pass", "total_fail" and "total".
emit_scoreboard_json() {
  local n=${#GC_NAMES[@]} i sep name
  printf '{\n'
  printf ' "suites": {\n'
  for ((i = 0; i < n; i++)); do
    # JSON forbids a trailing comma, so the last entry gets no separator.
    sep=","
    if ((i == n - 1)); then
      sep=""
    fi
    # Escape backslashes first, then double quotes, so the suite name is
    # always a valid JSON string key.
    name=${GC_NAMES[$i]}
    name=${name//\\/\\\\}
    name=${name//\"/\\\"}
    # A missing counter entry is reported as 0 rather than tripping `set -u`.
    printf ' "%s": {"pass": %d, "fail": %d}%s\n' \
      "$name" "${GC_PASS[$i]:-0}" "${GC_FAIL[$i]:-0}" "$sep"
  done
  printf ' },\n'
  printf ' "total_pass": %d,\n' "$GC_TOTAL_PASS"
  printf ' "total_fail": %d,\n' "$GC_TOTAL_FAIL"
  printf ' "total": %d\n' "$GC_TOTAL"
  printf '}\n'
}
|
||||
|
||||
# Print the conformance scoreboard as a Markdown document on stdout.
# Globals (read):
#   GC_NAMES, GC_PASS, GC_FAIL, GC_TOTAL_S  parallel indexed arrays, one entry
#                                           per suite (GC_TOTAL_S = suite total)
#   GC_TOTAL_PASS, GC_TOTAL_FAIL, GC_TOTAL  aggregate counters
#   NOTE(review): presumably filled in by lib/guest/conformance.sh before this
#   hook is called — confirm against the driver.
# Outputs:
#   A title, a per-suite table ending with a bold totals row, and a fixed
#   "Notes" section.
emit_scoreboard_md() {
  local n=${#GC_NAMES[@]} i name
  printf '# APL Conformance Scoreboard\n\n'
  printf '_Generated by `lib/apl/conformance.sh`_\n\n'
  printf '| Suite | Pass | Fail | Total |\n'
  printf '|-------|-----:|-----:|------:|\n'
  for ((i = 0; i < n; i++)); do
    # A literal pipe would end the table cell early; escape it.
    name=${GC_NAMES[$i]}
    name=${name//\|/\\|}
    # A missing counter entry is reported as 0 rather than tripping `set -u`.
    printf '| %s | %d | %d | %d |\n' \
      "$name" "${GC_PASS[$i]:-0}" "${GC_FAIL[$i]:-0}" "${GC_TOTAL_S[$i]:-0}"
  done
  printf '| **Total** | **%d** | **%d** | **%d** |\n' "$GC_TOTAL_PASS" "$GC_TOTAL_FAIL" "$GC_TOTAL"
  printf '\n'
  printf '## Notes\n\n'
  # Passed as %s arguments so the backticks and any backslashes stay literal.
  printf '%s\n' '- Suites use the standard `apl-test name got expected` framework loaded against `lib/apl/runtime.sx` + `lib/apl/transpile.sx`.'
  printf '%s\n' '- `lib/apl/tests/parse.sx` and `lib/apl/tests/scalar.sx` use their own self-contained frameworks and are excluded from this scoreboard.'
}
|
||||
@@ -1,116 +1,5 @@
|
||||
#!/usr/bin/env bash
|
||||
# lib/apl/conformance.sh — run APL test suites, emit scoreboard.json + scoreboard.md.
|
||||
|
||||
# NOTE(review): -e is not enabled here; the steps that matter are checked
# explicitly below.
set -uo pipefail

# Every path used below is relative to the repository root. Resolve it first
# and stop if that fails, instead of silently running from the wrong directory.
repo_root=$(git rev-parse --show-toplevel) || {
  echo "ERROR: not inside a git work tree." >&2
  exit 1
}
cd "$repo_root" || exit 1

# Locate the server binary: honour a caller-supplied SX_SERVER, otherwise try
# the default absolute path, then the build output inside this checkout.
SX_SERVER="${SX_SERVER:-/root/rose-ash/hosts/ocaml/_build/default/bin/sx_server.exe}"
if [ ! -x "$SX_SERVER" ]; then
  SX_SERVER="hosts/ocaml/_build/default/bin/sx_server.exe"
fi
if [ ! -x "$SX_SERVER" ]; then
  echo "ERROR: sx_server.exe not found." >&2
  exit 1
fi
|
||||
|
||||
SUITES=(structural operators dfn tradfn valence programs system idioms eval-ops pipeline)
|
||||
|
||||
OUT_JSON="lib/apl/scoreboard.json"
|
||||
OUT_MD="lib/apl/scoreboard.md"
|
||||
|
||||
# Run one APL test suite in a fresh sx_server session and report its counters.
# Globals:
#   SX_SERVER (read)  path to the server executable
# Arguments:
#   $1  suite name; the suite file is lib/apl/tests/<name>.sx
# Outputs:
#   "<pass> <fail>" on stdout. When no counter pair can be read from the
#   server output, prints "0 0" and a warning on stderr.
# Returns:
#   1 if the temporary command file cannot be created, 0 otherwise.
run_suite() {
  local suite=$1
  local file="lib/apl/tests/${suite}.sx"
  local script output line
  local pass=0 fail=0

  script=$(mktemp) || {
    echo "0 0"
    return 1
  }

  # Command script fed to the server on stdin: epoch 1 loads the runtime,
  # epoch 2 defines the counters and the apl-test assertion, epoch 3 loads the
  # suite, epoch 4 asks for the (pass fail) pair.
  cat > "$script" << EPOCHS
(epoch 1)
(load "spec/stdlib.sx")
(load "lib/r7rs.sx")
(load "lib/apl/runtime.sx")
(load "lib/apl/tokenizer.sx")
(load "lib/apl/parser.sx")
(load "lib/apl/transpile.sx")
(epoch 2)
(eval "(define apl-test-pass 0)")
(eval "(define apl-test-fail 0)")
(eval "(define apl-test (fn (name got expected) (if (= got expected) (set! apl-test-pass (+ apl-test-pass 1)) (set! apl-test-fail (+ apl-test-fail 1)))))")
(epoch 3)
(load "${file}")
(epoch 4)
(eval "(list apl-test-pass apl-test-fail)")
EPOCHS

  # Server stderr is discarded; only the replies on stdout are parsed.
  output=$(timeout 300 "$SX_SERVER" < "$script" 2>/dev/null)
  rm -f -- "$script"

  # The epoch-4 reply is either an "(ok-len 4 N)" header with the value on the
  # next line, or a single-line "(ok 4 (P F))".
  line=$(printf '%s\n' "$output" | awk '/^\(ok-len 4 / {getline; print; exit}')
  if [ -z "$line" ]; then
    line=$(printf '%s\n' "$output" | grep -E '^\(ok 4 \([0-9]+ [0-9]+\)\)' | tail -1 \
      | sed -E 's/^\(ok 4 //; s/\)$//')
  fi

  # Accept the reply only if it really is a "(P F" pair of integers; anything
  # else (error form, truncated output, timeout) leaves the counters at 0
  # instead of leaking the raw line to the caller.
  local pair_re='^\(([0-9]+) ([0-9]+)\)'
  if [[ $line =~ $pair_re ]]; then
    pass=${BASH_REMATCH[1]}
    fail=${BASH_REMATCH[2]}
  else
    printf 'WARN: suite %s: no pass/fail counters in server output\n' "$suite" >&2
  fi
  echo "${pass} ${fail}"
}
|
||||
|
||||
# Per-suite pass/fail counts keyed by suite name, plus running totals.
declare -A SUITE_PASS
declare -A SUITE_FAIL
TOTAL_PASS=0
TOTAL_FAIL=0

echo "Running APL conformance suite..." >&2
for s in "${SUITES[@]}"; do
  # run_suite prints "<pass> <fail>". Reset first so a suite that prints
  # nothing counts as 0/0; "_" absorbs any unexpected extra fields.
  p=0
  f=0
  read -r p f _ < <(run_suite "$s") || true
  # Only plain decimal integers may reach $(( )): any other text would be
  # evaluated as an arithmetic expression.
  [[ $p =~ ^[0-9]+$ ]] || p=0
  [[ $f =~ ^[0-9]+$ ]] || f=0
  # Force base 10 so a value with a leading zero is not read as octal.
  p=$((10#$p))
  f=$((10#$f))
  SUITE_PASS[$s]=$p
  SUITE_FAIL[$s]=$f
  TOTAL_PASS=$((TOTAL_PASS + p))
  TOTAL_FAIL=$((TOTAL_FAIL + f))
  printf " %-12s %d/%d\n" "$s" "$p" "$((p + f))" >&2
done
|
||||
|
||||
# scoreboard.json
|
||||
{
|
||||
printf '{\n'
|
||||
printf ' "suites": {\n'
|
||||
first=1
|
||||
for s in "${SUITES[@]}"; do
|
||||
if [ $first -eq 0 ]; then printf ',\n'; fi
|
||||
printf ' "%s": {"pass": %d, "fail": %d}' "$s" "${SUITE_PASS[$s]}" "${SUITE_FAIL[$s]}"
|
||||
first=0
|
||||
done
|
||||
printf '\n },\n'
|
||||
printf ' "total_pass": %d,\n' "$TOTAL_PASS"
|
||||
printf ' "total_fail": %d,\n' "$TOTAL_FAIL"
|
||||
printf ' "total": %d\n' "$((TOTAL_PASS + TOTAL_FAIL))"
|
||||
printf '}\n'
|
||||
} > "$OUT_JSON"
|
||||
|
||||
# scoreboard.md
|
||||
{
|
||||
printf '# APL Conformance Scoreboard\n\n'
|
||||
printf '_Generated by `lib/apl/conformance.sh`_\n\n'
|
||||
printf '| Suite | Pass | Fail | Total |\n'
|
||||
printf '|-------|-----:|-----:|------:|\n'
|
||||
for s in "${SUITES[@]}"; do
|
||||
p=${SUITE_PASS[$s]}
|
||||
f=${SUITE_FAIL[$s]}
|
||||
printf '| %s | %d | %d | %d |\n' "$s" "$p" "$f" "$((p+f))"
|
||||
done
|
||||
printf '| **Total** | **%d** | **%d** | **%d** |\n' "$TOTAL_PASS" "$TOTAL_FAIL" "$((TOTAL_PASS + TOTAL_FAIL))"
|
||||
printf '\n'
|
||||
printf '## Notes\n\n'
|
||||
printf '%s\n' '- Suites use the standard `apl-test name got expected` framework loaded against `lib/apl/runtime.sx` + `lib/apl/transpile.sx`.'
|
||||
printf '%s\n' '- `lib/apl/tests/parse.sx` and `lib/apl/tests/scalar.sx` use their own self-contained frameworks and are excluded from this scoreboard.'
|
||||
} > "$OUT_MD"
|
||||
|
||||
echo "Wrote $OUT_JSON and $OUT_MD" >&2
|
||||
echo "Total: $TOTAL_PASS pass, $TOTAL_FAIL fail" >&2
|
||||
|
||||
[ "$TOTAL_FAIL" -eq 0 ]
|
||||
# lib/apl/conformance.sh — APL conformance via the shared guest driver.
|
||||
# Config lives in lib/apl/conformance.conf (MODE=counters). Override the binary
|
||||
# with SX_SERVER=path/to/sx_server.exe bash lib/apl/conformance.sh
|
||||
exec bash "$(dirname "$0")/../guest/conformance.sh" "$(dirname "$0")/conformance.conf" "$@"
|
||||
|
||||
@@ -9,9 +9,9 @@
|
||||
"system": {"pass": 13, "fail": 0},
|
||||
"idioms": {"pass": 64, "fail": 0},
|
||||
"eval-ops": {"pass": 14, "fail": 0},
|
||||
"pipeline": {"pass": 40, "fail": 0}
|
||||
"pipeline": {"pass": 152, "fail": 0}
|
||||
},
|
||||
"total_pass": 450,
|
||||
"total_pass": 562,
|
||||
"total_fail": 0,
|
||||
"total": 450
|
||||
"total": 562
|
||||
}
|
||||
|
||||
@@ -13,8 +13,8 @@ _Generated by `lib/apl/conformance.sh`_
|
||||
| system | 13 | 0 | 13 |
|
||||
| idioms | 64 | 0 | 64 |
|
||||
| eval-ops | 14 | 0 | 14 |
|
||||
| pipeline | 40 | 0 | 40 |
|
||||
| **Total** | **450** | **0** | **450** |
|
||||
| pipeline | 152 | 0 | 152 |
|
||||
| **Total** | **562** | **0** | **562** |
|
||||
|
||||
## Notes
|
||||
|
||||
|
||||
15
lib/apl/test-harness.sx
Normal file
15
lib/apl/test-harness.sx
Normal file
@@ -0,0 +1,15 @@
|
||||
; lib/apl/test-harness.sx — counters + assertion fn for the shared conformance
|
||||
; driver (lib/guest/conformance.sh, MODE=counters). Loaded as a PRELOAD so each
|
||||
; suite starts from a fresh 0/0; suites call (apl-test name got expected).
|
||||
|
||||
(define apl-test-pass 0)
|
||||
(define apl-test-fail 0)
|
||||
|
||||
(define
|
||||
apl-test
|
||||
(fn
|
||||
(name got expected)
|
||||
(if
|
||||
(= got expected)
|
||||
(set! apl-test-pass (+ apl-test-pass 1))
|
||||
(set! apl-test-fail (+ apl-test-fail 1)))))
|
||||
@@ -1,44 +0,0 @@
|
||||
;; search public API — assembles the canonical Haskell source from all layers.
|
||||
;; Tests and callers concatenate `search/src` with their own top-level bindings
|
||||
;; (e.g. "result = lookupTerm \"cat\" idx\n") and evaluate via the haskell-on-sx
|
||||
;; interpreter. Public Haskell entry points: indexDoc, lookupTerm, deleteDoc,
|
||||
;; docFreq, allTerms, tokens, positioned, evalQuery, parseQuery, searchQuery,
|
||||
;; rankTfIdf, rankBm25, topNTfIdf, topNBm25, fedIndex, aclFilter, searchTfIdfAcl,
|
||||
;; topNTfIdfAcl, searchBm25Acl, prefixTerms, prefixDocs, prefixRankTfIdf,
|
||||
;; paginate, pageTfIdf, pageBm25, resultCount, editDist, fuzzyTerms, fuzzyDocs,
|
||||
;; fuzzyRankTfIdf, highlight, snippet, stem, stemText, stemTokens, indexStemmed,
|
||||
;; nearDocs, expandTerm, synDocs, synRankTfIdf, queryTerms, searchRankTfIdf,
|
||||
;; searchRankBm25, suggestN, suggest.
|
||||
|
||||
(define
|
||||
search/src
|
||||
(str
|
||||
search/tokenize-src
|
||||
"\n"
|
||||
search/index-src
|
||||
"\n"
|
||||
search/query-src
|
||||
"\n"
|
||||
search/parse-src
|
||||
"\n"
|
||||
search/rank-src
|
||||
"\n"
|
||||
search/fed-src
|
||||
"\n"
|
||||
search/prefix-src
|
||||
"\n"
|
||||
search/page-src
|
||||
"\n"
|
||||
search/fuzzy-src
|
||||
"\n"
|
||||
search/highlight-src
|
||||
"\n"
|
||||
search/stem-src
|
||||
"\n"
|
||||
search/near-src
|
||||
"\n"
|
||||
search/syn-src
|
||||
"\n"
|
||||
search/rankq-src
|
||||
"\n"
|
||||
search/suggest-src))
|
||||
@@ -1,55 +0,0 @@
|
||||
# search-on-sx conformance config — sourced by lib/guest/conformance.sh.
|
||||
|
||||
LANG_NAME=search
|
||||
SCOREBOARD_DIR=lib/search
|
||||
MODE=counters
|
||||
COUNTERS_PASS=hk-test-pass
|
||||
COUNTERS_FAIL=hk-test-fail
|
||||
TIMEOUT_PER_SUITE=600
|
||||
|
||||
PRELOADS=(
|
||||
lib/haskell/tokenizer.sx
|
||||
lib/haskell/layout.sx
|
||||
lib/haskell/parser.sx
|
||||
lib/haskell/desugar.sx
|
||||
lib/haskell/runtime.sx
|
||||
lib/haskell/match.sx
|
||||
lib/haskell/eval.sx
|
||||
lib/haskell/map.sx
|
||||
lib/haskell/set.sx
|
||||
lib/haskell/testlib.sx
|
||||
lib/search/tokenize.sx
|
||||
lib/search/index.sx
|
||||
lib/search/query.sx
|
||||
lib/search/parse.sx
|
||||
lib/search/rank.sx
|
||||
lib/search/fed.sx
|
||||
lib/search/prefix.sx
|
||||
lib/search/page.sx
|
||||
lib/search/fuzzy.sx
|
||||
lib/search/highlight.sx
|
||||
lib/search/stem.sx
|
||||
lib/search/near.sx
|
||||
lib/search/syn.sx
|
||||
lib/search/rankq.sx
|
||||
lib/search/suggest.sx
|
||||
lib/search/api.sx
|
||||
lib/search/testlib.sx
|
||||
)
|
||||
|
||||
SUITES=(
|
||||
"index:lib/search/tests/index.sx"
|
||||
"boolean:lib/search/tests/boolean.sx"
|
||||
"parse:lib/search/tests/parse.sx"
|
||||
"rank:lib/search/tests/rank.sx"
|
||||
"integration:lib/search/tests/integration.sx"
|
||||
"prefix:lib/search/tests/prefix.sx"
|
||||
"page:lib/search/tests/page.sx"
|
||||
"fuzzy:lib/search/tests/fuzzy.sx"
|
||||
"highlight:lib/search/tests/highlight.sx"
|
||||
"stem:lib/search/tests/stem.sx"
|
||||
"near:lib/search/tests/near.sx"
|
||||
"syn:lib/search/tests/syn.sx"
|
||||
"rankq:lib/search/tests/rankq.sx"
|
||||
"suggest:lib/search/tests/suggest.sx"
|
||||
)
|
||||
@@ -1,3 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# Thin wrapper — see lib/guest/conformance.sh and lib/search/conformance.conf.
|
||||
exec bash "$(dirname "$0")/../guest/conformance.sh" "$(dirname "$0")/conformance.conf" "$@"
|
||||
@@ -1,16 +0,0 @@
|
||||
;; search federation + ACL — Haskell source fragment. Depends on index + rank.
|
||||
;; Federation merges per-peer INDICES (not ranked results): each peer's local
|
||||
;; DocIds are relabelled to global ids `gid peer local = peer*1000 + local`
|
||||
;; (dedupe by (peer,doc-id) is automatic via the bijection), then posting lists
|
||||
;; are unioned per term. Ranking then runs once over the merged index, which is
|
||||
;; rank-correct. ACL is a post-rank filter: an injected `permit :: DocId -> Bool`
|
||||
;; predicate (viewer baked in by the caller) — never baked into the index.
|
||||
;; fedIndex :: [(PeerId, Index)] -> Index
|
||||
;; aclFilter :: (DocId -> Bool) -> [DocId] -> [DocId]
|
||||
;; searchTfIdfAcl :: (DocId -> Bool) -> [Term] -> Index -> [DocId]
|
||||
;; topNTfIdfAcl :: Int -> (DocId -> Bool) -> [Term] -> Index -> [DocId]
|
||||
;; searchBm25Acl :: (DocId -> Bool) -> Float -> Float -> [Term] -> Index -> [DocId]
|
||||
|
||||
(define
|
||||
search/fed-src
|
||||
"gid peer local = peer * 1000 + local\nfedRelabelPosting peer p = (gid peer (fst p), snd p)\nfedRelabelEntry peer e = (fst e, map (fedRelabelPosting peer) (snd e))\nfedRelabelIndex peer idx = map (fedRelabelEntry peer) idx\nfedInsP p [] = [p]\nfedInsP p (q:qs) = if fst p < fst q then p : q : qs else if fst p == fst q then p : qs else q : fedInsP p qs\nfedMergePL a b = foldr fedInsP b a\nfedInsTerm t pl [] = [(t, pl)]\nfedInsTerm t pl (x:xs) = if t < fst x then (t, pl) : x : xs else if t == fst x then (fst x, fedMergePL pl (snd x)) : xs else x : fedInsTerm t pl xs\nfedMergeEntry idx e = fedInsTerm (fst e) (snd e) idx\nfedMergeTwo a b = foldl fedMergeEntry a b\nfedAddPeer acc pair = fedMergeTwo acc (fedRelabelIndex (fst pair) (snd pair))\nfedIndex pairs = foldl fedAddPeer emptyIndex pairs\naclFilter permit docs = filter permit docs\nsearchTfIdfAcl permit ts idx = aclFilter permit (rankTfIdf ts idx)\ntopNTfIdfAcl n permit ts idx = take n (aclFilter permit (rankTfIdf ts idx))\nsearchBm25Acl permit k1 b ts idx = aclFilter permit (rankBm25 k1 b ts idx)\n")
|
||||
@@ -1,12 +0,0 @@
|
||||
;; search fuzzy matching — Haskell source fragment. Depends on index + rank.
|
||||
;; Levenshtein edit distance (O(m*n) row-based DP — the naive recursive version is
|
||||
;; exponential and far too slow under load) expands a query term to all indexed
|
||||
;; terms within a max distance, then unions / ranks their docs.
|
||||
;; editDist :: String -> String -> Int
|
||||
;; fuzzyTerms :: Int -> String -> Index -> [Term] (sorted)
|
||||
;; fuzzyDocs :: Int -> String -> Index -> [DocId] (sorted union)
|
||||
;; fuzzyRankTfIdf :: Int -> String -> Index -> [DocId]
|
||||
|
||||
(define
|
||||
search/fuzzy-src
|
||||
"edMin3 a b c = min a (min b c)\nedCost x y = if x == y then 0 else 1\nedUpto i n = if i > n then [] else i : edUpto (i + 1) n\nedLast [x] = x\nedLast (x:xs) = edLast xs\nedNrow x [] prev left = []\nedNrow x (y:ys) prev left = let v = edMin3 (head (tail prev) + 1) (left + 1) (head prev + edCost x y) in v : edNrow x ys (tail prev) v\nedRow x ys prev = let f = head prev + 1 in f : edNrow x ys prev f\nedRows [] ys prev = prev\nedRows (x:xs) ys prev = edRows xs ys (edRow x ys prev)\neditDist xs ys = edLast (edRows xs ys (edUpto 0 (length ys)))\nqWithinDist maxd term t = editDist term t <= maxd\nfuzzyTerms maxd term idx = filter (qWithinDist maxd term) (allTerms idx)\nfuzzyDocs maxd term idx = foldl (candStep idx) [] (fuzzyTerms maxd term idx)\nfuzzyRankTfIdf maxd term idx = rankTfIdf (fuzzyTerms maxd term idx) idx\n")
|
||||
@@ -1,10 +0,0 @@
|
||||
;; search highlight / snippet — Haskell source fragment. Depends on tokenize.
|
||||
;; Operates on document text (not the index): marks query-matching tokens with
|
||||
;; [..] and extracts a context window around the first match. Tokens are
|
||||
;; normalized (lowercase, punctuation-stripped) by `tokens`, matching index side.
|
||||
;; highlight :: [Term] -> String -> String
|
||||
;; snippet :: Int -> [Term] -> String -> String (ctx tokens each side of 1st match)
|
||||
|
||||
(define
|
||||
search/highlight-src
|
||||
"hlMark terms t = if elem t terms then \"[\" ++ t ++ \"]\" else t\nhighlight terms text = unwords (map (hlMark terms) (tokens text))\nhlIdxFrom terms [] i = 0 - 1\nhlIdxFrom terms (t:ts) i = if elem t terms then i else hlIdxFrom terms ts (i + 1)\nhlIdx terms toks = hlIdxFrom terms toks 0\nhlMax0 x = if x < 0 then 0 else x\nsnipStart ctx i = if i < 0 then 0 else hlMax0 (i - ctx)\nsnipToks ctx terms toks = unwords (map (hlMark terms) (take (2 * ctx + 1) (drop (snipStart ctx (hlIdx terms toks)) toks)))\nsnippet ctx terms text = snipToks ctx terms (tokens text)\n")
|
||||
@@ -1,15 +0,0 @@
|
||||
;; search inverted index — Haskell source fragment (depends on tokenize).
|
||||
;; Index = [(Term, [(DocId, [Pos])])], sorted by Term; postings sorted by DocId.
|
||||
;; Data.Map's public API lacks toList/keys/map/filter, so a sorted assoc-list
|
||||
;; index is used — it is the conceptual `Map Term [(DocId,[Pos])]` and exposes
|
||||
;; term iteration (allTerms) and df naturally for ranking.
|
||||
;; emptyIndex :: Index
|
||||
;; indexDoc :: DocId -> String -> Index -> Index (re-index replaces)
|
||||
;; lookupTerm :: Term -> Index -> [(DocId, [Pos])]
|
||||
;; deleteDoc :: DocId -> Index -> Index
|
||||
;; docFreq :: Term -> Index -> Int
|
||||
;; allTerms :: Index -> [Term]
|
||||
|
||||
(define
|
||||
search/index-src
|
||||
"emptyIndex = []\ngroupBump [] t p = [(t, [p])]\ngroupBump (g:gs) t p = if fst g == t then (t, snd g ++ [p]) : gs else g : groupBump gs t p\ngroupStep acc tp = groupBump acc (fst tp) (snd tp)\ngroupTok pairs = foldl groupStep [] pairs\ninsPosting d ps [] = [(d, ps)]\ninsPosting d ps (q:qs) = if d < fst q then (d, ps) : q : qs else if d == fst q then (d, ps) : qs else q : insPosting d ps qs\ninsTerm t d ps [] = [(t, [(d, ps)])]\ninsTerm t d ps (e:es) = if t < fst e then (t, [(d, ps)]) : e : es else if t == fst e then (fst e, insPosting d ps (snd e)) : es else e : insTerm t d ps es\nindexStep d ix tp = insTerm (fst tp) d (snd tp) ix\nindexDoc d text idx = foldl (indexStep d) idx (groupTok (positioned text))\nlookupTerm t idx = case lookup t idx of { Nothing -> []; Just pl -> pl }\ndocFreq t idx = length (lookupTerm t idx)\nallTerms idx = map fst idx\npostingKeep d q = fst q /= d\ndropTermDoc d e = (fst e, filter (postingKeep d) (snd e))\nplKeep e = not (null (snd e))\ndeleteDoc d idx = filter plKeep (map (dropTermDoc d) idx)\n")
|
||||
@@ -1,8 +0,0 @@
|
||||
;; search proximity (NEAR) — Haskell source fragment. Depends on query (posIn,
|
||||
;; docsWith, sortedInter). Finds docs where two terms occur within k positions of
|
||||
;; each other (unordered), using the positional postings.
|
||||
;; nearDocs :: Int -> Term -> Term -> Index -> [DocId] (sorted)
|
||||
|
||||
(define
|
||||
search/near-src
|
||||
"nrAbsDiff a b = if a > b then a - b else b - a\nnrCloseTo k x [] = False\nnrCloseTo k x (y:ys) = if nrAbsDiff x y <= k then True else nrCloseTo k x ys\nnrAnyClose k [] ys = False\nnrAnyClose k (x:xs) ys = if nrCloseTo k x ys then True else nrAnyClose k xs ys\nnearInDoc k t1 t2 d idx = nrAnyClose k (posIn t1 d idx) (posIn t2 d idx)\nnearHere k t1 t2 idx d = nearInDoc k t1 t2 d idx\nnearDocs k t1 t2 idx = filter (nearHere k t1 t2 idx) (sortedInter (docsWith t1 idx) (docsWith t2 idx))\n")
|
||||
@@ -1,11 +0,0 @@
|
||||
;; search pagination — Haskell source fragment. Depends on rank.
|
||||
;; Windows a ranked result list by offset/limit (offset >= length -> empty;
|
||||
;; limit clamps to what remains).
|
||||
;; paginate :: Int -> Int -> [DocId] -> [DocId] (offset, limit)
|
||||
;; pageTfIdf :: Int -> Int -> [Term] -> Index -> [DocId]
|
||||
;; pageBm25 :: Int -> Int -> Float -> Float -> [Term] -> Index -> [DocId]
|
||||
;; resultCount :: [Term] -> Index -> Int
|
||||
|
||||
(define
|
||||
search/page-src
|
||||
"paginate off lim docs = take lim (drop off docs)\npageTfIdf off lim ts idx = paginate off lim (rankTfIdf ts idx)\npageBm25 off lim k1 b ts idx = paginate off lim (rankBm25 k1 b ts idx)\nresultCount ts idx = length (rankTfIdf ts idx)\n")
|
||||
@@ -1,18 +0,0 @@
|
||||
;; search query parser — Haskell source fragment. Depends on tokenize + query.
|
||||
;; Grammar (precedence OR < AND < NOT):
|
||||
;; expr = orExpr
|
||||
;; orExpr = andExpr (OR andExpr)*
|
||||
;; andExpr= notExpr ((AND | <implicit>) notExpr)* -- adjacency means AND
|
||||
;; notExpr= NOT notExpr | atom
|
||||
;; atom = '(' expr ')' | '"' word+ '"' | word
|
||||
;; Keywords AND/OR/NOT are case-insensitive; bare words are normalized via tokens.
|
||||
;; Gotchas: delimiters matched by ord (escaped char literals like '\"' break the
|
||||
;; haskell-on-sx tokenizer); an [] *pattern* inside a `case` alt also breaks the
|
||||
;; parser, so qNormTerm/qDropRP/showQ are written as multi-clause functions.
|
||||
;; parseQuery :: String -> Query
|
||||
;; searchQuery :: String -> Index -> [DocId]
|
||||
;; showQ :: Query -> String -- canonical render for tests/debug
|
||||
|
||||
(define
|
||||
search/parse-src
|
||||
"data QTok = TAnd | TOr | TNot | TLP | TRP | TWord String | TPhrase [String]\nqIsSpace c = ord c == 32\nqIsLP c = ord c == 40\nqIsRP c = ord c == 41\nqIsQuote c = ord c == 34\nqDelim c = qIsSpace c || qIsLP c || qIsRP c || qIsQuote c\nqReadWord [] = ([], [])\nqReadWord (c:cs) = if qDelim c then ([], c:cs) else let (w, rest) = qReadWord cs in (c:w, rest)\nqReadPhrase [] = ([], [])\nqReadPhrase (c:cs) = if qIsQuote c then ([], cs) else let (w, rest) = qReadPhrase cs in (c:w, rest)\ntoUpperCh c = chr (toUpper (ord c))\nqUpper w = joinChars (map toUpperCh w)\nqFirstTok [] = \"\"\nqFirstTok (x:xs) = x\nqNormTerm w = qFirstTok (tokens w)\nqClassify w = if qUpper w == \"AND\" then TAnd else if qUpper w == \"OR\" then TOr else if qUpper w == \"NOT\" then TNot else TWord (qNormTerm w)\nqPhraseTok cs = let (p, rest) = qReadPhrase cs in TPhrase (tokens p) : qtokens rest\nqWordTok cs = let (w, rest) = qReadWord cs in qClassify w : qtokens rest\nqtokens [] = []\nqtokens (c:cs) = if qIsSpace c then qtokens cs else if qIsLP c then TLP : qtokens cs else if qIsRP c then TRP : qtokens cs else if qIsQuote c then qPhraseTok cs else qWordTok (c:cs)\nqDropRP (q, (TRP:rest)) = (q, rest)\nqDropRP (q, ts) = (q, ts)\nparseAtom [] = (Term \"\", [])\nparseAtom (TLP:ts) = qDropRP (parseExpr ts)\nparseAtom (TPhrase ps : ts) = (Phrase ps, ts)\nparseAtom (TWord w : ts) = (Term w, ts)\nparseAtom ts = (Term \"\", ts)\nqWrapNot (q, ts) = (Not q, ts)\nparseNot (TNot:ts) = qWrapNot (parseNot ts)\nparseNot ts = parseAtom ts\nqStartsAtom (TWord w : ts) = True\nqStartsAtom (TPhrase p : ts) = True\nqStartsAtom (TLP : ts) = True\nqStartsAtom (TNot : ts) = True\nqStartsAtom ts = False\nqAndStep left ts = let (r, rest) = parseNot ts in parseAndR (And left r) rest\nparseAndR left (TAnd:ts) = qAndStep left ts\nparseAndR left ts = if qStartsAtom ts then qAndStep left ts else (left, ts)\nparseAnd ts = let (l, rest) = parseNot ts in parseAndR l rest\nparseOrR left (TOr:ts) = let (r, rest) = parseAnd ts in 
parseOrR (Or left r) rest\nparseOrR left ts = (left, ts)\nparseExpr ts = let (l, rest) = parseAnd ts in parseOrR l rest\nparseQuery s = fst (parseExpr (qtokens s))\nsearchQuery s idx = evalQuery idx (parseQuery s)\njoinSp [] = \"\"\njoinSp [x] = x\njoinSp (x:xs) = x ++ \"-\" ++ joinSp xs\nshowQ (Term t) = \"T:\" ++ t\nshowQ (And a b) = \"(\" ++ showQ a ++ \" & \" ++ showQ b ++ \")\"\nshowQ (Or a b) = \"(\" ++ showQ a ++ \" | \" ++ showQ b ++ \")\"\nshowQ (Not a) = \"!\" ++ showQ a\nshowQ (Phrase ts) = \"P:\" ++ joinSp ts\n")
|
||||
@@ -1,10 +0,0 @@
|
||||
;; search prefix / wildcard queries — Haskell source fragment. Depends on index +
|
||||
;; rank (reuses candStep / rankTfIdf). A prefix matches every indexed term that
|
||||
;; starts with it; the matching terms are unioned (OR) into a docid set.
|
||||
;; prefixTerms :: String -> Index -> [Term] (sorted, from allTerms)
|
||||
;; prefixDocs :: String -> Index -> [DocId] (sorted union)
|
||||
;; prefixRankTfIdf :: String -> Index -> [DocId] (ranked by the matched terms)
|
||||
|
||||
(define
|
||||
search/prefix-src
|
||||
"prefixTerms pre idx = filter (isPrefixOf pre) (allTerms idx)\nprefixDocs pre idx = foldl (candStep idx) [] (prefixTerms pre idx)\nprefixRankTfIdf pre idx = rankTfIdf (prefixTerms pre idx) idx\n")
|
||||
@@ -1,11 +0,0 @@
|
||||
;; search query AST + boolean/phrase evaluation — Haskell source fragment.
|
||||
;; Depends on tokenize + index.
|
||||
;; data Query = Term String | And Query Query | Or Query Query
|
||||
;; | Not Query | Phrase [String]
|
||||
;; evalQuery :: Index -> Query -> [DocId] (sorted, unique)
|
||||
;; Boolean ops are linear merges over docid-sorted posting lists; Not uses
|
||||
;; allDocs as the universe; Phrase checks positional adjacency.
|
||||
|
||||
(define
|
||||
search/query-src
|
||||
"data Query = Term String | And Query Query | Or Query Query | Not Query | Phrase [String]\ndocsWith t idx = map fst (lookupTerm t idx)\nsortedUnion [] ys = ys\nsortedUnion xs [] = xs\nsortedUnion (x:xs) (y:ys) = if x < y then x : sortedUnion xs (y:ys) else if x > y then y : sortedUnion (x:xs) ys else x : sortedUnion xs ys\nsortedInter [] ys = []\nsortedInter xs [] = []\nsortedInter (x:xs) (y:ys) = if x < y then sortedInter xs (y:ys) else if x > y then sortedInter (x:xs) ys else x : sortedInter xs ys\nsortedDiff [] ys = []\nsortedDiff xs [] = xs\nsortedDiff (x:xs) (y:ys) = if x < y then x : sortedDiff xs (y:ys) else if x > y then sortedDiff (x:xs) ys else sortedDiff xs ys\nmergeDocs acc e = sortedUnion acc (map fst (snd e))\nallDocs idx = foldl mergeDocs [] idx\nposIn t d idx = case lookup d (lookupTerm t idx) of { Nothing -> []; Just ps -> ps }\nelemSorted x [] = False\nelemSorted x (y:ys) = if x == y then True else if x < y then False else elemSorted x ys\nphraseAtAll [] d idx p i = True\nphraseAtAll (t:ts) d idx p i = if elemSorted (p + i) (posIn t d idx) then phraseAtAll ts d idx p (i + 1) else False\nphraseStartsAt ts d idx p = phraseAtAll ts d idx p 0\nphraseInDoc [] d idx = True\nphraseInDoc (t0:rest) d idx = any (phraseStartsAt (t0:rest) d idx) (posIn t0 d idx)\nphraseHere ts idx d = phraseInDoc ts d idx\ninterStep idx acc tt = sortedInter acc (docsWith tt idx)\nphraseCands [] idx = allDocs idx\nphraseCands (t:ts) idx = foldl (interStep idx) (docsWith t idx) ts\nphraseDocs ts idx = filter (phraseHere ts idx) (phraseCands ts idx)\nevalQuery idx q = case q of { Term t -> docsWith t idx ; And a b -> sortedInter (evalQuery idx a) (evalQuery idx b) ; Or a b -> sortedUnion (evalQuery idx a) (evalQuery idx b) ; Not a -> sortedDiff (allDocs idx) (evalQuery idx a) ; Phrase ts -> phraseDocs ts idx }\n")
|
||||
@@ -1,14 +0,0 @@
|
||||
;; search ranking — Haskell source fragment. Depends on tokenize + index + query.
|
||||
;; Ranked retrieval over the candidate set (docs containing any query term).
|
||||
;; Scores are floats; ties broken by DocId ascending (deterministic).
|
||||
;; numDocs :: Index -> Int
|
||||
;; docFreq :: Term -> Index -> Int (from index)
|
||||
;; docLen :: DocId -> Index -> Int
|
||||
;; rankTfIdf :: [Term] -> Index -> [DocId]
|
||||
;; topNTfIdf :: Int -> [Term] -> Index -> [DocId]
|
||||
;; rankBm25 :: Float -> Float -> [Term] -> Index -> [DocId] (k1, b)
|
||||
;; topNBm25 :: Int -> Float -> Float -> [Term] -> Index -> [DocId]
|
||||
|
||||
(define
|
||||
search/rank-src
|
||||
"numDocs idx = length (allDocs idx)\ntfIn t d idx = length (posIn t d idx)\nqIdf n df = if df == 0 then 0 else log (n / df)\nidf t idx = qIdf (numDocs idx) (docFreq t idx)\ntermScoreTf idx d t = tfIn t d idx * idf t idx\ntfidfDoc ts idx d = sum (map (termScoreTf idx d) ts)\ncandStep idx acc t = sortedUnion acc (docsWith t idx)\ncandDocs ts idx = foldl (candStep idx) [] ts\ncmpScore p1 p2 = if fst p1 > fst p2 then LT else if fst p1 < fst p2 then GT else compare (snd p1) (snd p2)\nmkPair f ts idx d = (f ts idx d, d)\nrankWith f ts idx = map snd (sortBy cmpScore (map (mkPair f ts idx) (candDocs ts idx)))\nrankTfIdf ts idx = rankWith tfidfDoc ts idx\ntopNTfIdf n ts idx = take n (rankTfIdf ts idx)\ntfAt d idx t = tfIn t d idx\ndocLen d idx = sum (map (tfAt d idx) (allTerms idx))\nlenAt idx d = docLen d idx\navgDocLen idx = sum (map (lenAt idx) (allDocs idx)) / numDocs idx\nbm25idf t idx = log ((numDocs idx - docFreq t idx + 0.5) / (docFreq t idx + 0.5) + 1)\nbm25Term k1 b avgdl idx d t = bm25idf t idx * (tfIn t d idx * (k1 + 1)) / (tfIn t d idx + k1 * (1 - b + b * docLen d idx / avgdl))\nbm25Doc k1 b ts idx d = sum (map (bm25Term k1 b (avgDocLen idx) idx d) ts)\nrankBm25 k1 b ts idx = rankWith (bm25Doc k1 b) ts idx\ntopNBm25 n k1 b ts idx = take n (rankBm25 k1 b ts idx)\n")
|
||||
@@ -1,11 +0,0 @@
|
||||
;; search boolean-filtered ranked search — Haskell source fragment.
|
||||
;; Depends on parse (parseQuery/Query), query (evalQuery), rank (tfidfDoc/bm25Doc/
|
||||
;; cmpScore). Filters by the boolean query, then ranks the surviving docs by
|
||||
;; relevance over the query's leaf terms — the real-world filter-then-rank pattern.
|
||||
;; queryTerms :: Query -> [Term]
|
||||
;; searchRankTfIdf :: String -> Index -> [DocId]
|
||||
;; searchRankBm25 :: Float -> Float -> String -> Index -> [DocId]
|
||||
|
||||
(define
|
||||
search/rankq-src
|
||||
"queryTerms (Term t) = [t]\nqueryTerms (And a b) = queryTerms a ++ queryTerms b\nqueryTerms (Or a b) = queryTerms a ++ queryTerms b\nqueryTerms (Not a) = queryTerms a\nqueryTerms (Phrase ts) = ts\nmkSubPair f terms idx d = (f terms idx d, d)\nrankSubsetWith f terms docs idx = map snd (sortBy cmpScore (map (mkSubPair f terms idx) docs))\nsearchRankTfIdf s idx = let q = parseQuery s in rankSubsetWith tfidfDoc (queryTerms q) (evalQuery idx q) idx\nsearchRankBm25 k1 b s idx = let q = parseQuery s in rankSubsetWith (bm25Doc k1 b) (queryTerms q) (evalQuery idx q) idx\n")
|
||||
@@ -1,23 +0,0 @@
|
||||
{
|
||||
"lang": "search",
|
||||
"total_passed": 234,
|
||||
"total_failed": 0,
|
||||
"total": 234,
|
||||
"suites": [
|
||||
{"name":"index","passed":18,"failed":0,"total":18},
|
||||
{"name":"boolean","passed":28,"failed":0,"total":28},
|
||||
{"name":"parse","passed":32,"failed":0,"total":32},
|
||||
{"name":"rank","passed":23,"failed":0,"total":23},
|
||||
{"name":"integration","passed":21,"failed":0,"total":21},
|
||||
{"name":"prefix","passed":14,"failed":0,"total":14},
|
||||
{"name":"page","passed":12,"failed":0,"total":12},
|
||||
{"name":"fuzzy","passed":18,"failed":0,"total":18},
|
||||
{"name":"highlight","passed":12,"failed":0,"total":12},
|
||||
{"name":"stem","passed":18,"failed":0,"total":18},
|
||||
{"name":"near","passed":9,"failed":0,"total":9},
|
||||
{"name":"syn","passed":9,"failed":0,"total":9},
|
||||
{"name":"rankq","passed":11,"failed":0,"total":11},
|
||||
{"name":"suggest","passed":9,"failed":0,"total":9}
|
||||
],
|
||||
"generated": "2026-06-07T00:44:05+00:00"
|
||||
}
|
||||
@@ -1,20 +0,0 @@
|
||||
# search scoreboard
|
||||
|
||||
**234 / 234 passing** (0 failure(s)).
|
||||
|
||||
| Suite | Passed | Total | Status |
|
||||
|-------|--------|-------|--------|
|
||||
| index | 18 | 18 | ok |
|
||||
| boolean | 28 | 28 | ok |
|
||||
| parse | 32 | 32 | ok |
|
||||
| rank | 23 | 23 | ok |
|
||||
| integration | 21 | 21 | ok |
|
||||
| prefix | 14 | 14 | ok |
|
||||
| page | 12 | 12 | ok |
|
||||
| fuzzy | 18 | 18 | ok |
|
||||
| highlight | 12 | 12 | ok |
|
||||
| stem | 18 | 18 | ok |
|
||||
| near | 9 | 9 | ok |
|
||||
| syn | 9 | 9 | ok |
|
||||
| rankq | 11 | 11 | ok |
|
||||
| suggest | 9 | 9 | ok |
|
||||
@@ -1,15 +0,0 @@
|
||||
;; search stemming — Haskell source fragment. Depends on tokenize + index.
|
||||
;; Lightweight, deterministic English suffix stripping (recall-improving
|
||||
;; normalizer). Rules are checked most-specific first; conservative length guards
|
||||
;; avoid mangling short words. Not a full Porter stemmer.
|
||||
;; Gotcha: take/drop over a String yield char CODES (ints), not char strings, so
|
||||
;; rebuild strings with `stStr = joinChars . map chr`. (isSuffixOf's reverse also
|
||||
;; trips `++` on the String representation, hence the manual stEnds.)
|
||||
;; stem :: String -> String
|
||||
;; stemText :: String -> String (tokenize + stem + rejoin)
|
||||
;; stemTokens :: String -> [String]
|
||||
;; indexStemmed :: DocId -> String -> Index -> Index (index the stemmed text)
|
||||
|
||||
(define
|
||||
search/stem-src
|
||||
"stStr cs = joinChars (map chr cs)\nstEnds suf w = let n = length w in let m = length suf in if m > n then False else stStr (drop (n - m) w) == suf\nstDropEnd k w = stStr (take (length w - k) w)\nstem w = if stEnds \"ies\" w && length w >= 5 then stDropEnd 3 w ++ \"y\" else if stEnds \"ss\" w then w else if stEnds \"es\" w && length w >= 5 then stDropEnd 2 w else if stEnds \"s\" w && length w >= 4 then stDropEnd 1 w else if stEnds \"ing\" w && length w >= 6 then stDropEnd 3 w else if stEnds \"ed\" w && length w >= 5 then stDropEnd 2 w else w\nstemTokens s = map stem (tokens s)\nstemText s = unwords (stemTokens s)\nindexStemmed d text idx = indexDoc d (stemText text) idx\n")
|
||||
@@ -1,9 +0,0 @@
|
||||
;; search did-you-mean / spelling suggestion — Haskell source fragment.
|
||||
;; Depends on fuzzy (editDist) + index (allTerms). Ranks indexed terms by edit
|
||||
;; distance to a (possibly misspelled) query term; ties broken alphabetically.
|
||||
;; suggestN :: Int -> String -> Index -> [Term]
|
||||
;; suggest :: String -> Index -> Term ("" if the index has no terms)
|
||||
|
||||
(define
|
||||
search/suggest-src
|
||||
"sgMk term t = (editDist term t, t)\nsgPairs term idx = map (sgMk term) (allTerms idx)\nsgCmp p1 p2 = if fst p1 < fst p2 then LT else if fst p1 > fst p2 then GT else compare (snd p1) (snd p2)\nsuggestN n term idx = take n (map snd (sortBy sgCmp (sgPairs term idx)))\nsgHead [] = \"\"\nsgHead (x:xs) = x\nsuggest term idx = sgHead (suggestN 1 term idx)\n")
|
||||
@@ -1,10 +0,0 @@
|
||||
;; search synonym / query expansion — Haskell source fragment. Depends on index +
|
||||
;; rank. A synonym map is an assoc list [(Term, [Term])]; a query term is expanded
|
||||
;; to itself plus its synonyms, then the expanded set is unioned / ranked.
|
||||
;; expandTerm :: [(Term,[Term])] -> Term -> [Term]
|
||||
;; synDocs :: [(Term,[Term])] -> Term -> Index -> [DocId]
|
||||
;; synRankTfIdf :: [(Term,[Term])] -> Term -> Index -> [DocId]
|
||||
|
||||
(define
|
||||
search/syn-src
|
||||
"synLookup synmap t = case lookup t synmap of { Nothing -> [] ; Just ss -> ss }\nexpandTerm synmap t = t : synLookup synmap t\nsynDocs synmap t idx = foldl (candStep idx) [] (expandTerm synmap t)\nsynRankTfIdf synmap t idx = rankTfIdf (expandTerm synmap t) idx\n")
|
||||
@@ -1,50 +0,0 @@
|
||||
;; search test helpers — convert forced haskell values to plain SX and run
|
||||
;; programs built on top of search/src. Reuses hk-test / counters from
|
||||
;; lib/haskell/testlib.sx (preloaded by the conformance config).
|
||||
|
||||
;; Convert a forced HK value into plain SX data:
;;   ("[]")          -> empty SX list
;;   (":" hd tl)     -> converted hd consed onto converted tl
;;   ("Tuple" a ...) -> SX list of the converted components
;;   anything else   -> returned unchanged
;; search-hk-tagged? holds when v is a non-empty list whose first element
;; equals tag.
(define
  search-hk-tagged?
  (fn (v tag) (and (list? v) (not (empty? v)) (= (first v) tag))))

(define
  search-hk->sx
  (fn
    (v)
    (cond
      ((search-hk-tagged? v "[]") (list))
      ((search-hk-tagged? v "Tuple") (map search-hk->sx (rest v)))
      ((search-hk-tagged? v ":")
        (cons (search-hk->sx (nth v 1)) (search-hk->sx (nth v 2))))
      (:else v))))
|
||||
|
||||
;; Append `extra` (additional top-level Haskell bindings) to search/src, run
;; the combined program, and return the binding called `name` after passing it
;; through hk-deep-force and search-hk->sx.
(define
  search-eval-env
  (fn (extra) (hk-eval-program (hk-core (str search/src extra)))))

(define
  search-eval
  (fn
    (extra name)
    (search-hk->sx (hk-deep-force (get (search-eval-env extra) name)))))
|
||||
|
||||
;; Join the elements of xs into one string with sep between neighbours.
;; An empty xs gives ""; a single-element xs gives that element as-is.
;; search-join-acc walks the remaining elements, growing the text in acc.
(define
  search-join-acc
  (fn
    (acc sep remaining)
    (cond
      ((empty? remaining) acc)
      (:else
        (search-join-acc
          (str acc sep (first remaining))
          sep
          (rest remaining))))))

(define
  search-join
  (fn
    (sep xs)
    (cond
      ((empty? xs) "")
      (:else (search-join-acc (first xs) sep (rest xs))))))
|
||||
|
||||
;; Evaluate many Haskell expressions in ONE program run, so search/src is
;; parsed and bound only once for the whole batch.
;;   setup -- extra top-level Haskell source placed before the batch binding
;;   exprs -- list of expression strings; they become the elements of a single
;;            Haskell list bound to `result`
;; Returns the SX list that search-eval produces for `result`.
(define
  search-batch-program
  (fn
    (setup exprs)
    (str setup "\nresult = [" (search-join ", " exprs) "]\n")))

(define
  search-batch
  (fn
    (setup exprs)
    (search-eval (search-batch-program setup exprs) "result")))
|
||||
@@ -1,123 +0,0 @@
|
||||
;; Phase 2 — query AST + boolean/phrase evaluation (hand-built Query values).
|
||||
;; Corpus:
|
||||
;; doc 1 "the quick brown dog" -> the quick brown dog
|
||||
;; doc 2 "a quick brown fox" -> a quick brown fox
|
||||
;; doc 3 "the dog barks loudly" -> the dog barks loudly
|
||||
;; All queries run in ONE program evaluation (search-batch) to stay fast.
|
||||
|
||||
(define
|
||||
search-corpus
|
||||
"idx = indexDoc 3 \"the dog barks loudly\" (indexDoc 2 \"a quick brown fox\" (indexDoc 1 \"the quick brown dog\" emptyIndex))\n")
|
||||
|
||||
(define
|
||||
bool-cases
|
||||
(list
|
||||
(list
|
||||
"term in two docs"
|
||||
"evalQuery idx (Term \"quick\")"
|
||||
(list 1 2))
|
||||
(list
|
||||
"term in two docs (the)"
|
||||
"evalQuery idx (Term \"the\")"
|
||||
(list 1 3))
|
||||
(list "term in one doc" "evalQuery idx (Term \"fox\")" (list 2))
|
||||
(list "term absent" "evalQuery idx (Term \"zzz\")" (list))
|
||||
(list
|
||||
"term case-sensitive at AST level"
|
||||
"evalQuery idx (Term \"QUICK\")"
|
||||
(list))
|
||||
(list "term on empty index" "evalQuery emptyIndex (Term \"cat\")" (list))
|
||||
(list
|
||||
"and both terms"
|
||||
"evalQuery idx (And (Term \"quick\") (Term \"brown\"))"
|
||||
(list 1 2))
|
||||
(list
|
||||
"and overlap subset"
|
||||
"evalQuery idx (And (Term \"the\") (Term \"dog\"))"
|
||||
(list 1 3))
|
||||
(list
|
||||
"and disjoint is empty"
|
||||
"evalQuery idx (And (Term \"the\") (Term \"fox\"))"
|
||||
(list))
|
||||
(list
|
||||
"and right-nested"
|
||||
"evalQuery idx (And (Term \"the\") (And (Term \"dog\") (Term \"barks\")))"
|
||||
(list 3))
|
||||
(list
|
||||
"or two singletons"
|
||||
"evalQuery idx (Or (Term \"fox\") (Term \"barks\"))"
|
||||
(list 2 3))
|
||||
(list
|
||||
"or all docs"
|
||||
"evalQuery idx (Or (Term \"quick\") (Term \"the\"))"
|
||||
(list 1 2 3))
|
||||
(list
|
||||
"or with absent term"
|
||||
"evalQuery idx (Or (Term \"fox\") (Term \"zzz\"))"
|
||||
(list 2))
|
||||
(list "not term" "evalQuery idx (Not (Term \"the\"))" (list 2))
|
||||
(list "not term 2" "evalQuery idx (Not (Term \"quick\"))" (list 3))
|
||||
(list
|
||||
"and with not"
|
||||
"evalQuery idx (And (Term \"quick\") (Not (Term \"the\")))"
|
||||
(list 2))
|
||||
(list
|
||||
"double negation"
|
||||
"evalQuery idx (Not (Not (Term \"fox\")))"
|
||||
(list 2))
|
||||
(list
|
||||
"or of and with term"
|
||||
"evalQuery idx (Or (And (Term \"the\") (Term \"dog\")) (Term \"fox\"))"
|
||||
(list 1 2 3))
|
||||
(list
|
||||
"phrase adjacent both docs"
|
||||
"evalQuery idx (Phrase [\"quick\", \"brown\"])"
|
||||
(list 1 2))
|
||||
(list
|
||||
"phrase adjacent one doc"
|
||||
"evalQuery idx (Phrase [\"brown\", \"dog\"])"
|
||||
(list 1))
|
||||
(list
|
||||
"phrase the quick"
|
||||
"evalQuery idx (Phrase [\"the\", \"quick\"])"
|
||||
(list 1))
|
||||
(list
|
||||
"phrase dog barks"
|
||||
"evalQuery idx (Phrase [\"dog\", \"barks\"])"
|
||||
(list 3))
|
||||
(list
|
||||
"phrase non-adjacent empty"
|
||||
"evalQuery idx (Phrase [\"quick\", \"dog\"])"
|
||||
(list))
|
||||
(list
|
||||
"phrase order matters"
|
||||
"evalQuery idx (Phrase [\"brown\", \"quick\"])"
|
||||
(list))
|
||||
(list
|
||||
"phrase single term"
|
||||
"evalQuery idx (Phrase [\"dog\"])"
|
||||
(list 1 3))
|
||||
(list
|
||||
"phrase three terms"
|
||||
"evalQuery idx (Phrase [\"the\", \"dog\", \"barks\"])"
|
||||
(list 3))
|
||||
(list
|
||||
"and of phrase and term"
|
||||
"evalQuery idx (And (Phrase [\"quick\", \"brown\"]) (Term \"dog\"))"
|
||||
(list 1))
|
||||
(list
|
||||
"not of phrase"
|
||||
"evalQuery idx (Not (Phrase [\"quick\", \"brown\"]))"
|
||||
(list 3))))
|
||||
|
||||
;; Run every case expression (second element of each case) in one batched
;; program on top of search-corpus.
(define
  bool-results
  (search-batch search-corpus (map (fn (row) (nth row 1)) bool-cases)))

;; For each case, pass its label, the batched result at the same index, and
;; its third element to hk-test.
(map-indexed
  (fn (k row) (hk-test (nth row 0) (nth bool-results k) (nth row 2)))
  bool-cases)

;; Final value of the file: the hk-test counters.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,74 +0,0 @@
|
||||
;; Extension — fuzzy matching via Levenshtein edit distance.
|
||||
;; Corpus: 1 "color flavor" 2 "colour kitten" 3 "colored"
|
||||
;; allTerms: color colored colour flavor kitten
|
||||
|
||||
(define
|
||||
fuzzy-setup
|
||||
"idx = indexDoc 3 \"colored\" (indexDoc 2 \"colour kitten\" (indexDoc 1 \"color flavor\" emptyIndex))\n")
|
||||
|
||||
(define
|
||||
fuzzy-cases
|
||||
(list
|
||||
(list
|
||||
"editDist substitution"
|
||||
"[editDist \"kitten\" \"sitten\"]"
|
||||
(list 1))
|
||||
(list "editDist equal" "[editDist \"abc\" \"abc\"]" (list 0))
|
||||
(list "editDist deletion" "[editDist \"abc\" \"ab\"]" (list 1))
|
||||
(list "editDist insertion" "[editDist \"ab\" \"abc\"]" (list 1))
|
||||
(list "editDist from empty" "[editDist \"\" \"abc\"]" (list 3))
|
||||
(list "editDist both empty" "[editDist \"\" \"\"]" (list 0))
|
||||
(list
|
||||
"editDist classic"
|
||||
"[editDist \"kitten\" \"sitting\"]"
|
||||
(list 3))
|
||||
(list
|
||||
"editDist color colour"
|
||||
"[editDist \"color\" \"colour\"]"
|
||||
(list 1))
|
||||
(list
|
||||
"editDist color colored"
|
||||
"[editDist \"color\" \"colored\"]"
|
||||
(list 2))
|
||||
(list
|
||||
"fuzzy terms dist 1"
|
||||
"fuzzyTerms 1 \"color\" idx"
|
||||
(list "color" "colour"))
|
||||
(list
|
||||
"fuzzy terms dist 2"
|
||||
"fuzzyTerms 2 \"color\" idx"
|
||||
(list "color" "colored" "colour"))
|
||||
(list "fuzzy terms exact" "fuzzyTerms 0 \"color\" idx" (list "color"))
|
||||
(list
|
||||
"fuzzy terms other word"
|
||||
"fuzzyTerms 1 \"flavour\" idx"
|
||||
(list "flavor"))
|
||||
(list
|
||||
"fuzzy docs dist 1"
|
||||
"fuzzyDocs 1 \"color\" idx"
|
||||
(list 1 2))
|
||||
(list
|
||||
"fuzzy docs dist 2"
|
||||
"fuzzyDocs 2 \"color\" idx"
|
||||
(list 1 2 3))
|
||||
(list "fuzzy docs none" "fuzzyDocs 1 \"zzzzz\" idx" (list))
|
||||
(list
|
||||
"fuzzy rank dist 1"
|
||||
"fuzzyRankTfIdf 1 \"color\" idx"
|
||||
(list 1 2))
|
||||
(list
|
||||
"fuzzy rank dist 2"
|
||||
"fuzzyRankTfIdf 2 \"color\" idx"
|
||||
(list 1 2 3))))
|
||||
|
||||
;; Run every case expression (second element of each case) in one batched
;; program on top of fuzzy-setup.
(define
  fuzzy-results
  (search-batch fuzzy-setup (map (fn (row) (nth row 1)) fuzzy-cases)))

;; For each case, pass its label, the batched result at the same index, and
;; its third element to hk-test.
(map-indexed
  (fn (k row) (hk-test (nth row 0) (nth fuzzy-results k) (nth row 2)))
  fuzzy-cases)

;; Final value of the file: the hk-test counters.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,66 +0,0 @@
|
||||
;; Extension — highlight + snippet over document text.
|
||||
;; Text: "the quick brown fox jumps"
|
||||
|
||||
(define
|
||||
hl-cases
|
||||
(list
|
||||
(list
|
||||
"highlight two terms"
|
||||
"highlight [\"quick\", \"fox\"] \"the quick brown fox jumps\""
|
||||
"the [quick] brown [fox] jumps")
|
||||
(list
|
||||
"highlight none"
|
||||
"highlight [] \"the quick brown fox jumps\""
|
||||
"the quick brown fox jumps")
|
||||
(list
|
||||
"highlight absent term"
|
||||
"highlight [\"zzz\"] \"the quick brown fox jumps\""
|
||||
"the quick brown fox jumps")
|
||||
(list
|
||||
"highlight first token"
|
||||
"highlight [\"the\"] \"the quick brown fox jumps\""
|
||||
"[the] quick brown fox jumps")
|
||||
(list
|
||||
"highlight normalizes text"
|
||||
"highlight [\"quick\"] \"The Quick, brown!\""
|
||||
"the [quick] brown")
|
||||
(list
|
||||
"snippet around middle"
|
||||
"snippet 1 [\"brown\"] \"the quick brown fox jumps\""
|
||||
"quick [brown] fox")
|
||||
(list
|
||||
"snippet at start"
|
||||
"snippet 1 [\"the\"] \"the quick brown fox jumps\""
|
||||
"[the] quick brown")
|
||||
(list
|
||||
"snippet near end"
|
||||
"snippet 1 [\"fox\"] \"the quick brown fox jumps\""
|
||||
"brown [fox] jumps")
|
||||
(list
|
||||
"snippet ctx zero"
|
||||
"snippet 0 [\"brown\"] \"the quick brown fox jumps\""
|
||||
"[brown]")
|
||||
(list
|
||||
"snippet clamps at end"
|
||||
"snippet 2 [\"jumps\"] \"the quick brown fox jumps\""
|
||||
"brown fox [jumps]")
|
||||
(list
|
||||
"snippet no match shows head"
|
||||
"snippet 1 [\"zzz\"] \"the quick brown fox jumps\""
|
||||
"the quick brown")
|
||||
(list
|
||||
"snippet wide window"
|
||||
"snippet 5 [\"brown\"] \"the quick brown fox jumps\""
|
||||
"the quick [brown] fox jumps")))
|
||||
|
||||
;; Run every case expression (second element of each case) in one batched
;; program; no extra setup bindings are needed here.
(define
  hl-results
  (search-batch "" (map (fn (row) (nth row 1)) hl-cases)))

;; For each case, pass its label, the batched result at the same index, and
;; its third element to hk-test.
(map-indexed
  (fn (k row) (hk-test (nth row 0) (nth hl-results k) (nth row 2)))
  hl-cases)

;; Final value of the file: the hk-test counters.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,88 +0,0 @@
|
||||
;; Phase 1 — tokenize + inverted index.
|
||||
;; All cases run in ONE program evaluation (search-batch) to stay fast under load.
|
||||
;; Scalar results (docFreq) are wrapped as singleton lists so the batch is a list
|
||||
;; of lists.
|
||||
|
||||
(define
|
||||
index-cases
|
||||
(list
|
||||
(list
|
||||
"tokens basic lowercases"
|
||||
"tokens \"The Cat sat\""
|
||||
(list "the" "cat" "sat"))
|
||||
(list
|
||||
"tokens strips punctuation"
|
||||
"tokens \"Hello, World!\""
|
||||
(list "hello" "world"))
|
||||
(list "tokens collapses whitespace" "tokens \" a b \"" (list "a" "b"))
|
||||
(list "tokens empty is empty" "tokens \"\"" (list))
|
||||
(list "tokens keeps digits" "tokens \"abc123 x9\"" (list "abc123" "x9"))
|
||||
(list
|
||||
"positioned attaches ordinals"
|
||||
"positioned \"a b a\""
|
||||
(list
|
||||
(list "a" 0)
|
||||
(list "b" 1)
|
||||
(list "a" 2)))
|
||||
(list
|
||||
"index + lookup single doc"
|
||||
"lookupTerm \"cat\" (indexDoc 1 \"the cat sat\" emptyIndex)"
|
||||
(list (list 1 (list 1))))
|
||||
(list
|
||||
"lookup missing term is empty"
|
||||
"lookupTerm \"dog\" (indexDoc 1 \"the cat sat\" emptyIndex)"
|
||||
(list))
|
||||
(list
|
||||
"lookup records all positions"
|
||||
"lookupTerm \"the\" (indexDoc 1 \"the cat the dog the\" emptyIndex)"
|
||||
(list (list 1 (list 0 2 4))))
|
||||
(list
|
||||
"multi-doc posting list sorted by docid"
|
||||
"lookupTerm \"x\" (indexDoc 1 \"x y\" (indexDoc 2 \"x z\" emptyIndex))"
|
||||
(list
|
||||
(list 1 (list 0))
|
||||
(list 2 (list 0))))
|
||||
(list
|
||||
"index/query case symmetry"
|
||||
"lookupTerm \"cat\" (indexDoc 1 \"CAT Cat cat\" emptyIndex)"
|
||||
(list (list 1 (list 0 1 2))))
|
||||
(list
|
||||
"re-index replaces a doc"
|
||||
"lookupTerm \"a\" (indexDoc 1 \"a a a\" (indexDoc 1 \"a\" emptyIndex))"
|
||||
(list (list 1 (list 0 1 2))))
|
||||
(list
|
||||
"delete removes a doc"
|
||||
"lookupTerm \"cat\" (deleteDoc 1 (indexDoc 1 \"the cat\" emptyIndex))"
|
||||
(list))
|
||||
(list
|
||||
"delete leaves other docs"
|
||||
"lookupTerm \"cat\" (deleteDoc 2 (indexDoc 2 \"big cat\" (indexDoc 1 \"the cat\" emptyIndex)))"
|
||||
(list (list 1 (list 1))))
|
||||
(list
|
||||
"docFreq counts docs"
|
||||
"[docFreq \"cat\" (indexDoc 2 \"a cat\" (indexDoc 1 \"the cat\" emptyIndex))]"
|
||||
(list 2))
|
||||
(list
|
||||
"docFreq zero for missing"
|
||||
"[docFreq \"zzz\" (indexDoc 1 \"a b\" emptyIndex)]"
|
||||
(list 0))
|
||||
(list
|
||||
"allTerms sorted and unique"
|
||||
"allTerms (indexDoc 1 \"banana apple cherry apple\" emptyIndex)"
|
||||
(list "apple" "banana" "cherry"))
|
||||
(list
|
||||
"allTerms merged across docs"
|
||||
"allTerms (indexDoc 2 \"d a\" (indexDoc 1 \"c b\" emptyIndex))"
|
||||
(list "a" "b" "c" "d"))))
|
||||
|
||||
;; Run every case expression (second element of each case) in one batched
;; program; no extra setup bindings are needed here.
(define
  index-results
  (search-batch "" (map (fn (row) (nth row 1)) index-cases)))

;; For each case, pass its label, the batched result at the same index, and
;; its third element to hk-test.
(map-indexed
  (fn (k row) (hk-test (nth row 0) (nth index-results k) (nth row 2)))
  index-cases)

;; Final value of the file: the hk-test counters.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,102 +0,0 @@
|
||||
;; Phase 4 — federation (merge per-peer indices) + ACL post-filter.
|
||||
;; Peers (global id = peer*1000 + local):
|
||||
;; peer 1: 1 "alpha beta" 2 "alpha gamma" -> 1001 1002
|
||||
;; peer 2: 1 "alpha delta" 2 "beta gamma" -> 2001 2002
|
||||
;; ACL predicates are injected (viewer baked in by the caller), applied post-rank.
|
||||
|
||||
(define
|
||||
fed-setup
|
||||
"p1 = indexDoc 2 \"alpha gamma\" (indexDoc 1 \"alpha beta\" emptyIndex)\np2 = indexDoc 2 \"beta gamma\" (indexDoc 1 \"alpha delta\" emptyIndex)\nfed = fedIndex [(1, p1), (2, p2)]\npermitP1 g = g < 2000\npermitNone g = False\npermitList g = elem g [1002, 2001]\n")
|
||||
|
||||
(define
|
||||
fed-cases
|
||||
(list
|
||||
(list
|
||||
"fed merges all docs"
|
||||
"sort (allDocs fed)"
|
||||
(list 1001 1002 2001 2002))
|
||||
(list
|
||||
"fed docFreq across peers"
|
||||
"[docFreq \"alpha\" fed]"
|
||||
(list 3))
|
||||
(list "fed docFreq beta" "[docFreq \"beta\" fed]" (list 2))
|
||||
(list "fed numDocs" "[numDocs fed]" (list 4))
|
||||
(list
|
||||
"fed term lookup spans peers"
|
||||
"map fst (lookupTerm \"gamma\" fed)"
|
||||
(list 1002 2002))
|
||||
(list
|
||||
"fed preserves positions"
|
||||
"lookupTerm \"beta\" fed"
|
||||
(list
|
||||
(list 1001 (list 1))
|
||||
(list 2002 (list 0))))
|
||||
(list
|
||||
"fed rank alpha tie by gid"
|
||||
"rankTfIdf [\"alpha\"] fed"
|
||||
(list 1001 1002 2001))
|
||||
(list
|
||||
"fed rank beta"
|
||||
"rankTfIdf [\"beta\"] fed"
|
||||
(list 1001 2002))
|
||||
(list
|
||||
"fed boolean and"
|
||||
"searchQuery \"alpha AND beta\" fed"
|
||||
(list 1001))
|
||||
(list
|
||||
"fed boolean or"
|
||||
"searchQuery \"delta OR barks\" fed"
|
||||
(list 2001))
|
||||
(list
|
||||
"fed phrase within peer1"
|
||||
"searchQuery \"\\\"alpha beta\\\"\" fed"
|
||||
(list 1001))
|
||||
(list
|
||||
"fed phrase within peer2"
|
||||
"searchQuery \"\\\"beta gamma\\\"\" fed"
|
||||
(list 2002))
|
||||
(list
|
||||
"fed phrase peer2 alpha delta"
|
||||
"searchQuery \"\\\"alpha delta\\\"\" fed"
|
||||
(list 2001))
|
||||
(list "fed empty peer list" "allDocs (fedIndex [])" (list))
|
||||
(list
|
||||
"fed single relabelled peer"
|
||||
"rankTfIdf [\"alpha\"] (fedIndex [(5, p1)])"
|
||||
(list 5001 5002))
|
||||
(list
|
||||
"acl peer1 only"
|
||||
"aclFilter permitP1 (rankTfIdf [\"alpha\"] fed)"
|
||||
(list 1001 1002))
|
||||
(list
|
||||
"acl allowlist preserves rank order"
|
||||
"aclFilter permitList (rankTfIdf [\"alpha\"] fed)"
|
||||
(list 1002 2001))
|
||||
(list
|
||||
"acl topN after filter"
|
||||
"topNTfIdfAcl 1 permitP1 [\"alpha\"] fed"
|
||||
(list 1001))
|
||||
(list
|
||||
"acl denies all"
|
||||
"aclFilter permitNone (rankTfIdf [\"alpha\"] fed)"
|
||||
(list))
|
||||
(list
|
||||
"acl on bm25"
|
||||
"searchBm25Acl permitP1 1.5 0.75 [\"alpha\"] fed"
|
||||
(list 1001 1002))
|
||||
(list
|
||||
"acl end-to-end tfidf"
|
||||
"searchTfIdfAcl permitP1 [\"alpha\"] fed"
|
||||
(list 1001 1002))))
|
||||
|
||||
;; Run every case expression (second element of each case) in one batched
;; program on top of fed-setup.
(define
  fed-results
  (search-batch fed-setup (map (fn (row) (nth row 1)) fed-cases)))

;; For each case, pass its label, the batched result at the same index, and
;; its third element to hk-test.
(map-indexed
  (fn (k row) (hk-test (nth row 0) (nth fed-results k) (nth row 2)))
  fed-cases)

;; Final value of the file: the hk-test counters.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,49 +0,0 @@
|
||||
;; Extension — proximity (NEAR) search: terms within k positions, unordered.
|
||||
;; Corpus:
|
||||
;; 1 "the quick brown fox" the0 quick1 brown2 fox3
|
||||
;; 2 "quick the lazy fox dog" quick0 the1 lazy2 fox3 dog4
|
||||
;; 3 "fox runs quick" fox0 runs1 quick2
|
||||
|
||||
(define
|
||||
near-setup
|
||||
"idx = indexDoc 3 \"fox runs quick\" (indexDoc 2 \"quick the lazy fox dog\" (indexDoc 1 \"the quick brown fox\" emptyIndex))\n")
|
||||
|
||||
(define
|
||||
near-cases
|
||||
(list
|
||||
(list
|
||||
"near adjacent one doc"
|
||||
"nearDocs 1 \"quick\" \"brown\" idx"
|
||||
(list 1))
|
||||
(list
|
||||
"near adjacent both docs"
|
||||
"nearDocs 1 \"quick\" \"the\" idx"
|
||||
(list 1 2))
|
||||
(list
|
||||
"near within 2"
|
||||
"nearDocs 2 \"quick\" \"fox\" idx"
|
||||
(list 1 3))
|
||||
(list "near too far at k1" "nearDocs 1 \"quick\" \"fox\" idx" (list))
|
||||
(list
|
||||
"near unordered symmetric"
|
||||
"nearDocs 2 \"fox\" \"quick\" idx"
|
||||
(list 1 3))
|
||||
(list "near wider window" "nearDocs 5 \"the\" \"dog\" idx" (list 2))
|
||||
(list "near absent term" "nearDocs 1 \"quick\" \"zzz\" idx" (list))
|
||||
(list "near needs both terms" "nearDocs 3 \"brown\" \"dog\" idx" (list))
|
||||
(list
|
||||
"near same docs only"
|
||||
"nearDocs 3 \"fox\" \"runs\" idx"
|
||||
(list 3))))
|
||||
|
||||
;; Run every case expression (second element of each case) in one batched
;; program on top of near-setup.
(define
  near-results
  (search-batch near-setup (map (fn (row) (nth row 1)) near-cases)))

;; For each case, pass its label, the batched result at the same index, and
;; its third element to hk-test.
(map-indexed
  (fn (k row) (hk-test (nth row 0) (nth near-results k) (nth row 2)))
  near-cases)

;; Final value of the file: the hk-test counters.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,53 +0,0 @@
|
||||
;; Extension — result pagination (offset / limit) over ranked results.
|
||||
;; Corpus (tf of "x" descending): 1 x4 2 x3 3 x2 4 x1 5 y(no x)
|
||||
;; rankTfIdf ["x"] -> [1,2,3,4]
|
||||
|
||||
(define
|
||||
page-setup
|
||||
"idx = indexDoc 5 \"y\" (indexDoc 4 \"x\" (indexDoc 3 \"x x\" (indexDoc 2 \"x x x\" (indexDoc 1 \"x x x x other\" emptyIndex))))\n")
|
||||
|
||||
(define
|
||||
page-cases
|
||||
(list
|
||||
(list "first page" "pageTfIdf 0 2 [\"x\"] idx" (list 1 2))
|
||||
(list
|
||||
"second page"
|
||||
"pageTfIdf 2 2 [\"x\"] idx"
|
||||
(list 3 4))
|
||||
(list
|
||||
"sliding window"
|
||||
"pageTfIdf 1 2 [\"x\"] idx"
|
||||
(list 2 3))
|
||||
(list
|
||||
"limit exceeds remaining"
|
||||
"pageTfIdf 3 10 [\"x\"] idx"
|
||||
(list 4))
|
||||
(list "offset past end" "pageTfIdf 4 2 [\"x\"] idx" (list))
|
||||
(list "limit zero" "pageTfIdf 0 0 [\"x\"] idx" (list))
|
||||
(list
|
||||
"whole result"
|
||||
"pageTfIdf 0 10 [\"x\"] idx"
|
||||
(list 1 2 3 4))
|
||||
(list
|
||||
"paginate raw list"
|
||||
"paginate 1 2 [10, 20, 30, 40]"
|
||||
(list 20 30))
|
||||
(list "paginate raw past end" "paginate 9 2 [10, 20]" (list))
|
||||
(list
|
||||
"bm25 page window size"
|
||||
"[length (pageBm25 0 2 1.5 0.75 [\"x\"] idx)]"
|
||||
(list 2))
|
||||
(list "result count" "[resultCount [\"x\"] idx]" (list 4))
|
||||
(list "result count zero" "[resultCount [\"zzz\"] idx]" (list 0))))
|
||||
|
||||
;; Run every case expression (second element of each case) in one batched
;; program on top of page-setup.
(define
  page-results
  (search-batch page-setup (map (fn (row) (nth row 1)) page-cases)))

;; For each case, pass its label, the batched result at the same index, and
;; its third element to hk-test.
(map-indexed
  (fn (k row) (hk-test (nth row 0) (nth page-results k) (nth row 2)))
  page-cases)

;; Final value of the file: the hk-test counters.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,139 +0,0 @@
|
||||
;; Phase 2 — query parser (parseQuery / searchQuery).
|
||||
;; AST cases assert showQ (parseQuery s); search cases assert searchQuery s idx
|
||||
;; against the standard corpus. Each group runs in one batched program eval.
|
||||
;; doc 1 "the quick brown dog" doc 2 "a quick brown fox" doc 3 "the dog barks loudly"
|
||||
|
||||
(define
|
||||
parse-corpus
|
||||
"idx = indexDoc 3 \"the dog barks loudly\" (indexDoc 2 \"a quick brown fox\" (indexDoc 1 \"the quick brown dog\" emptyIndex))\n")
|
||||
|
||||
(define
|
||||
ast-cases
|
||||
(list
|
||||
(list "single term" "showQ (parseQuery \"cat\")" "T:cat")
|
||||
(list "term normalized" "showQ (parseQuery \"CAT\")" "T:cat")
|
||||
(list "explicit and" "showQ (parseQuery \"cat AND dog\")" "(T:cat & T:dog)")
|
||||
(list
|
||||
"lowercase and keyword"
|
||||
"showQ (parseQuery \"cat and dog\")"
|
||||
"(T:cat & T:dog)")
|
||||
(list "implicit and" "showQ (parseQuery \"cat dog\")" "(T:cat & T:dog)")
|
||||
(list "or" "showQ (parseQuery \"cat OR dog\")" "(T:cat | T:dog)")
|
||||
(list "not" "showQ (parseQuery \"NOT cat\")" "!T:cat")
|
||||
(list
|
||||
"and binds tighter than or"
|
||||
"showQ (parseQuery \"cat AND dog OR bird\")"
|
||||
"((T:cat & T:dog) | T:bird)")
|
||||
(list
|
||||
"or then and"
|
||||
"showQ (parseQuery \"cat OR dog AND bird\")"
|
||||
"(T:cat | (T:dog & T:bird))")
|
||||
(list
|
||||
"parens override precedence"
|
||||
"showQ (parseQuery \"(cat OR dog) AND bird\")"
|
||||
"((T:cat | T:dog) & T:bird)")
|
||||
(list
|
||||
"and with not"
|
||||
"showQ (parseQuery \"cat AND NOT dog\")"
|
||||
"(T:cat & !T:dog)")
|
||||
(list
|
||||
"two-word phrase"
|
||||
"showQ (parseQuery \"\\\"quick brown\\\"\")"
|
||||
"P:quick-brown")
|
||||
(list
|
||||
"three-word phrase"
|
||||
"showQ (parseQuery \"\\\"quick brown fox\\\"\")"
|
||||
"P:quick-brown-fox")
|
||||
(list
|
||||
"and left-assoc"
|
||||
"showQ (parseQuery \"a AND b AND c\")"
|
||||
"((T:a & T:b) & T:c)")
|
||||
(list
|
||||
"or left-assoc"
|
||||
"showQ (parseQuery \"a OR b OR c\")"
|
||||
"((T:a | T:b) | T:c)")
|
||||
(list
|
||||
"punctuation stripped"
|
||||
"showQ (parseQuery \"cat, dog!\")"
|
||||
"(T:cat & T:dog)")))
|
||||
|
||||
(define
|
||||
search-cases
|
||||
(list
|
||||
(list "term" "searchQuery \"quick\" idx" (list 1 2))
|
||||
(list
|
||||
"term normalized"
|
||||
"searchQuery \"QUICK\" idx"
|
||||
(list 1 2))
|
||||
(list
|
||||
"explicit and"
|
||||
"searchQuery \"quick AND brown\" idx"
|
||||
(list 1 2))
|
||||
(list
|
||||
"implicit and"
|
||||
"searchQuery \"quick brown\" idx"
|
||||
(list 1 2))
|
||||
(list "and disjoint" "searchQuery \"the AND fox\" idx" (list))
|
||||
(list "or" "searchQuery \"fox OR barks\" idx" (list 2 3))
|
||||
(list "not" "searchQuery \"NOT the\" idx" (list 2))
|
||||
(list "and not" "searchQuery \"quick AND NOT the\" idx" (list 2))
|
||||
(list
|
||||
"precedence and-or"
|
||||
"searchQuery \"the AND dog OR fox\" idx"
|
||||
(list 1 2 3))
|
||||
(list
|
||||
"precedence or-and"
|
||||
"searchQuery \"fox OR the AND dog\" idx"
|
||||
(list 1 2 3))
|
||||
(list
|
||||
"parens"
|
||||
"searchQuery \"the AND (dog OR fox)\" idx"
|
||||
(list 1 3))
|
||||
(list
|
||||
"phrase"
|
||||
"searchQuery \"\\\"quick brown\\\"\" idx"
|
||||
(list 1 2))
|
||||
(list
|
||||
"phrase one doc"
|
||||
"searchQuery \"\\\"brown dog\\\"\" idx"
|
||||
(list 1))
|
||||
(list
|
||||
"phrase and term"
|
||||
"searchQuery \"\\\"quick brown\\\" AND dog\" idx"
|
||||
(list 1))
|
||||
(list
|
||||
"not phrase"
|
||||
"searchQuery \"NOT \\\"quick brown\\\"\" idx"
|
||||
(list 3))
|
||||
(list
|
||||
"implicit and terms"
|
||||
"searchQuery \"dog barks\" idx"
|
||||
(list 3))))
|
||||
|
||||
;; Two batched evaluations: the AST cases need no extra bindings, while the
;; search cases run on top of parse-corpus.
(define
  ast-results
  (search-batch "" (map (fn (row) (nth row 1)) ast-cases)))

(define
  search-results
  (search-batch parse-corpus (map (fn (row) (nth row 1)) search-cases)))

;; For each AST case, pass its label (prefixed "ast: "), the batched result at
;; the same index, and its third element to hk-test.
(map-indexed
  (fn
    (k row)
    (hk-test (str "ast: " (nth row 0)) (nth ast-results k) (nth row 2)))
  ast-cases)

;; Same for the search cases, with labels prefixed "search: ".
(map-indexed
  (fn
    (k row)
    (hk-test (str "search: " (nth row 0)) (nth search-results k) (nth row 2)))
  search-cases)

;; Final value of the file: the hk-test counters.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,63 +0,0 @@
|
||||
;; Extension — prefix / wildcard queries.
|
||||
;; Corpus: 1 "alpha alpine" 2 "beta apple" 3 "banana alpha"
|
||||
;; allTerms sorted: alpha alpine apple banana beta
|
||||
|
||||
(define
|
||||
prefix-setup
|
||||
"idx = indexDoc 3 \"banana alpha\" (indexDoc 2 \"beta apple\" (indexDoc 1 \"alpha alpine\" emptyIndex))\n")
|
||||
|
||||
(define
|
||||
prefix-cases
|
||||
(list
|
||||
(list
|
||||
"prefix terms two matches"
|
||||
"prefixTerms \"al\" idx"
|
||||
(list "alpha" "alpine"))
|
||||
(list
|
||||
"prefix terms narrower"
|
||||
"prefixTerms \"alp\" idx"
|
||||
(list "alpha" "alpine"))
|
||||
(list
|
||||
"prefix terms wide"
|
||||
"prefixTerms \"a\" idx"
|
||||
(list "alpha" "alpine" "apple"))
|
||||
(list "prefix terms single" "prefixTerms \"ban\" idx" (list "banana"))
|
||||
(list "prefix terms exact term" "prefixTerms \"beta\" idx" (list "beta"))
|
||||
(list "prefix terms none" "prefixTerms \"z\" idx" (list))
|
||||
(list
|
||||
"prefix docs union"
|
||||
"prefixDocs \"al\" idx"
|
||||
(list 1 3))
|
||||
(list "prefix docs single term" "prefixDocs \"ban\" idx" (list 3))
|
||||
(list
|
||||
"prefix docs wide"
|
||||
"prefixDocs \"a\" idx"
|
||||
(list 1 2 3))
|
||||
(list "prefix docs none" "prefixDocs \"z\" idx" (list))
|
||||
(list
|
||||
"prefix docs exact"
|
||||
"prefixDocs \"alpha\" idx"
|
||||
(list 1 3))
|
||||
(list
|
||||
"prefix rank ranks by matched terms"
|
||||
"prefixRankTfIdf \"al\" idx"
|
||||
(list 1 3))
|
||||
(list
|
||||
"prefix rank single doc"
|
||||
"prefixRankTfIdf \"ban\" idx"
|
||||
(list 3))
|
||||
(list "prefix rank empty" "prefixRankTfIdf \"z\" idx" (list))))
|
||||
|
||||
;; Run every case expression (second element of each case) in one batched
;; program on top of prefix-setup.
(define
  prefix-results
  (search-batch prefix-setup (map (fn (row) (nth row 1)) prefix-cases)))

;; For each case, pass its label, the batched result at the same index, and
;; its third element to hk-test.
(map-indexed
  (fn (k row) (hk-test (nth row 0) (nth prefix-results k) (nth row 2)))
  prefix-cases)

;; Final value of the file: the hk-test counters.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,90 +0,0 @@
|
||||
;; Phase 3 — ranking (TF-IDF, BM25, top-N). Deterministic: ties broken by DocId.
|
||||
;; Corpora:
|
||||
;; idx1: 1 "alpha alpha alpha gamma" 2 "alpha" 3 "beta"
|
||||
;; idx2: 1 "cat" 2 "cat cat dog elephant frog grape" 3 "zzz"
|
||||
;; idx3: 1 "kite" 2 "kite" (identical docs -> tiebreak)
|
||||
|
||||
(define
|
||||
rank-setup
|
||||
"idx1 = indexDoc 3 \"beta\" (indexDoc 2 \"alpha\" (indexDoc 1 \"alpha alpha alpha gamma\" emptyIndex))\nidx2 = indexDoc 3 \"zzz\" (indexDoc 2 \"cat cat dog elephant frog grape\" (indexDoc 1 \"cat\" emptyIndex))\nidx3 = indexDoc 2 \"kite\" (indexDoc 1 \"kite\" emptyIndex)\n")
|
||||
|
||||
(define
|
||||
rank-cases
|
||||
(list
|
||||
(list
|
||||
"tfidf tf ordering"
|
||||
"rankTfIdf [\"alpha\"] idx1"
|
||||
(list 1 2))
|
||||
(list
|
||||
"tfidf rare term boosts"
|
||||
"rankTfIdf [\"alpha\", \"beta\"] idx1"
|
||||
(list 1 3 2))
|
||||
(list
|
||||
"tfidf single-doc term"
|
||||
"rankTfIdf [\"gamma\"] idx1"
|
||||
(list 1))
|
||||
(list "tfidf absent term empty" "rankTfIdf [\"nope\"] idx1" (list))
|
||||
(list "tfidf empty query empty" "rankTfIdf [] idx1" (list))
|
||||
(list
|
||||
"tfidf candidate union tie by docid"
|
||||
"rankTfIdf [\"beta\", \"gamma\"] idx1"
|
||||
(list 1 3))
|
||||
(list
|
||||
"tfidf tf ordering idx2"
|
||||
"rankTfIdf [\"cat\"] idx2"
|
||||
(list 2 1))
|
||||
(list "topN tfidf 1" "topNTfIdf 1 [\"alpha\"] idx1" (list 1))
|
||||
(list
|
||||
"topN tfidf 2"
|
||||
"topNTfIdf 2 [\"alpha\", \"beta\"] idx1"
|
||||
(list 1 3))
|
||||
(list
|
||||
"topN exceeds results"
|
||||
"topNTfIdf 10 [\"gamma\"] idx1"
|
||||
(list 1))
|
||||
(list "topN zero" "topNTfIdf 0 [\"alpha\"] idx1" (list))
|
||||
(list
|
||||
"bm25 tf+length flips tfidf"
|
||||
"rankBm25 1.5 0.75 [\"cat\"] idx2"
|
||||
(list 1 2))
|
||||
(list
|
||||
"bm25 b=0 ignores length"
|
||||
"rankBm25 1.5 0.0 [\"cat\"] idx2"
|
||||
(list 2 1))
|
||||
(list
|
||||
"bm25 alpha idx1"
|
||||
"rankBm25 1.5 0.75 [\"alpha\"] idx1"
|
||||
(list 1 2))
|
||||
(list "bm25 absent empty" "rankBm25 1.5 0.75 [\"nope\"] idx1" (list))
|
||||
(list
|
||||
"bm25 single-doc term"
|
||||
"rankBm25 1.5 0.75 [\"gamma\"] idx1"
|
||||
(list 1))
|
||||
(list "bm25 topN 1" "topNBm25 1 1.5 0.75 [\"cat\"] idx2" (list 1))
|
||||
(list
|
||||
"bm25 same candidate set"
|
||||
"sort (rankBm25 1.5 0.75 [\"alpha\", \"beta\"] idx1)"
|
||||
(list 1 2 3))
|
||||
(list
|
||||
"tfidf stable tiebreak"
|
||||
"rankTfIdf [\"kite\"] idx3"
|
||||
(list 1 2))
|
||||
(list
|
||||
"bm25 stable tiebreak"
|
||||
"rankBm25 1.5 0.75 [\"kite\"] idx3"
|
||||
(list 1 2))
|
||||
(list "numDocs" "[numDocs idx1]" (list 3))
|
||||
(list "docLen counts tokens" "[docLen 1 idx1]" (list 4))
|
||||
(list "docFreq via index" "[docFreq \"alpha\" idx1]" (list 2))))
|
||||
|
||||
;; Run every case expression (second element of each case) in one batched
;; program on top of rank-setup.
(define
  rank-results
  (search-batch rank-setup (map (fn (row) (nth row 1)) rank-cases)))

;; For each case, pass its label, the batched result at the same index, and
;; its third element to hk-test.
(map-indexed
  (fn (k row) (hk-test (nth row 0) (nth rank-results k) (nth row 2)))
  rank-cases)

;; Final value of the file: the hk-test counters.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,67 +0,0 @@
|
||||
;; Extension — boolean-filtered ranked search (filter then rank by relevance).
|
||||
;; Corpus:
|
||||
;; 1 "apple apple banana" apple2 banana1
|
||||
;; 2 "apple cherry" apple1 cherry1
|
||||
;; 3 "banana cherry" banana1 cherry1
|
||||
;; 4 "apple banana cherry" apple1 banana1 cherry1
|
||||
|
||||
;; Setup source string handed to search-batch for the rankq cases. It binds
;; `idx` to nested indexDoc calls over emptyIndex, adding docs 1 through 4 with
;; the texts listed in the corpus comment above.
(define
|
||||
rankq-setup
|
||||
"idx = indexDoc 4 \"apple banana cherry\" (indexDoc 3 \"banana cherry\" (indexDoc 2 \"apple cherry\" (indexDoc 1 \"apple apple banana\" emptyIndex)))\n")
|
||||
|
||||
;; Case table for boolean-filtered ranked search. Every entry is a three-element
;; list: a label, an expression source string, and a third element that the
;; assertion pass compares against (presumably the expected result). Entries are
;; consumed by position (nth 0 / 1 / 2), so the element order matters.
(define
|
||||
rankq-cases
|
||||
(list
|
||||
(list
|
||||
"queryTerms and"
|
||||
"queryTerms (parseQuery \"apple AND banana\")"
|
||||
(list "apple" "banana"))
|
||||
(list
|
||||
"queryTerms or not"
|
||||
"queryTerms (parseQuery \"a OR NOT b\")"
|
||||
(list "a" "b"))
|
||||
(list
|
||||
"queryTerms phrase"
|
||||
"queryTerms (parseQuery \"\\\"x y\\\" OR z\")"
|
||||
(list "x" "y" "z"))
|
||||
(list
|
||||
"and filter ranked by tf"
|
||||
"searchRankTfIdf \"apple AND banana\" idx"
|
||||
(list 1 4))
|
||||
(list
|
||||
"single term ranked tie"
|
||||
"searchRankTfIdf \"cherry\" idx"
|
||||
(list 2 3 4))
|
||||
(list
|
||||
"or filter ranked"
|
||||
"searchRankTfIdf \"apple OR banana\" idx"
|
||||
(list 1 4 2 3))
|
||||
(list
|
||||
"and-not narrows then ranks"
|
||||
"searchRankTfIdf \"apple AND NOT banana\" idx"
|
||||
(list 2))
|
||||
(list
|
||||
"phrase filter ranked"
|
||||
"searchRankTfIdf \"\\\"apple banana\\\"\" idx"
|
||||
(list 1 4))
|
||||
(list "no matches" "searchRankTfIdf \"zzz\" idx" (list))
|
||||
(list
|
||||
"bm25 boolean ranked subset"
|
||||
"sort (searchRankBm25 1.5 0.75 \"apple OR banana\" idx)"
|
||||
(list 1 2 3 4))
|
||||
(list
|
||||
"bm25 and filter"
|
||||
"searchRankBm25 1.5 0.75 \"apple AND NOT banana\" idx"
|
||||
(list 2))))
|
||||
|
||||
;; Batch results for rankq-cases: one search-batch call with rankq-setup and the
;; expression string (element 1) of every case. Read by position below.
(define
|
||||
rankq-results
|
||||
(search-batch rankq-setup (map (fn (c) (nth c 1)) rankq-cases)))
|
||||
|
||||
;; Assertion pass over rankq-cases: hk-test is given the label (element 0), the
;; i-th entry of rankq-results, and element 2 of the case (presumably the
;; expected value; hk-test is defined elsewhere).
(map-indexed
|
||||
(fn
|
||||
(i c)
|
||||
(hk-test (nth c 0) (nth rankq-results i) (nth c 2)))
|
||||
rankq-cases)
|
||||
|
||||
;; Closing expression: a dict exposing hk-test-fail, hk-test-pass and
;; hk-test-fails under the keys :fail, :pass and :fails.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,47 +0,0 @@
|
||||
;; Extension — stemming (suffix stripping). Scalar string results wrapped in [].
|
||||
|
||||
;; Case table for the stemming extension. Every entry is a three-element list:
;; label, expression source string, and the value the assertion pass compares
;; against. Scalar-string expressions are wrapped in [] (see the header comment),
;; which is why their third element is a one-item list. The "running" -> "runn"
;; entry is labelled "literal", i.e. the doubled consonant is intentionally kept.
(define
|
||||
stem-cases
|
||||
(list
|
||||
(list "stem plural s" "[stem \"cats\"]" (list "cat"))
|
||||
(list "stem plural dogs" "[stem \"dogs\"]" (list "dog"))
|
||||
(list "stem keeps ss" "[stem \"pass\"]" (list "pass"))
|
||||
(list "stem short s unchanged" "[stem \"is\"]" (list "is"))
|
||||
(list "stem es boxes" "[stem \"boxes\"]" (list "box"))
|
||||
(list "stem es wishes" "[stem \"wishes\"]" (list "wish"))
|
||||
(list "stem ies cities" "[stem \"cities\"]" (list "city"))
|
||||
(list "stem ies parties" "[stem \"parties\"]" (list "party"))
|
||||
(list "stem ing jumping" "[stem \"jumping\"]" (list "jump"))
|
||||
(list "stem ing running literal" "[stem \"running\"]" (list "runn"))
|
||||
(list "stem ed jumped" "[stem \"jumped\"]" (list "jump"))
|
||||
(list "stem ed wanted" "[stem \"wanted\"]" (list "want"))
|
||||
(list "stem short ed unchanged" "[stem \"red\"]" (list "red"))
|
||||
(list "stem no suffix" "[stem \"cat\"]" (list "cat"))
|
||||
(list
|
||||
"stemText normalizes and stems"
|
||||
"[stemText \"Cats Running!\"]"
|
||||
(list "cat runn"))
|
||||
(list
|
||||
"stemTokens list"
|
||||
"stemTokens \"boxes and cats\""
|
||||
(list "box" "and" "cat"))
|
||||
(list
|
||||
"indexStemmed unifies plural"
|
||||
"map fst (lookupTerm \"cat\" (indexStemmed 2 \"a cat\" (indexStemmed 1 \"the cats\" emptyIndex)))"
|
||||
(list 1 2))
|
||||
(list
|
||||
"indexStemmed stem query"
|
||||
"map fst (lookupTerm (stem \"boxes\") (indexStemmed 1 \"many boxes\" emptyIndex))"
|
||||
(list 1))))
|
||||
|
||||
;; Batch results for stem-cases. The setup argument is the empty string: the
;; stem cases build any index they need inline, so no shared bindings are passed.
(define
|
||||
stem-results
|
||||
(search-batch "" (map (fn (c) (nth c 1)) stem-cases)))
|
||||
|
||||
;; Assertion pass over stem-cases: hk-test is given the label (element 0), the
;; i-th entry of stem-results, and element 2 of the case (presumably the
;; expected value; hk-test is defined elsewhere).
(map-indexed
|
||||
(fn
|
||||
(i c)
|
||||
(hk-test (nth c 0) (nth stem-results i) (nth c 2)))
|
||||
stem-cases)
|
||||
|
||||
;; Closing expression: a dict exposing hk-test-fail, hk-test-pass and
;; hk-test-fails under the keys :fail, :pass and :fails.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,42 +0,0 @@
|
||||
;; Extension — did-you-mean / spelling suggestion.
|
||||
;; Corpus terms (sorted): ample apple apply banana orange
|
||||
|
||||
;; Setup source string for the suggestion cases: binds `idx` to a single-document
;; index (doc 1) whose text supplies the five corpus terms named in the header.
(define
|
||||
suggest-setup
|
||||
"idx = indexDoc 1 \"apple apply ample banana orange\" emptyIndex\n")
|
||||
|
||||
;; Case table for did-you-mean suggestions. Every entry is a three-element list:
;; label, expression source string, and the value the assertion pass compares
;; against. `suggest` expressions are wrapped in [] and paired with a one-item
;; list; `suggestN` expressions are unwrapped and paired with a plain list.
(define
|
||||
suggest-cases
|
||||
(list
|
||||
(list "suggest exact term" "[suggest \"apple\" idx]" (list "apple"))
|
||||
(list
|
||||
"suggest misspelled banana"
|
||||
"[suggest \"bananna\" idx]"
|
||||
(list "banana"))
|
||||
(list
|
||||
"suggest missing letter orange"
|
||||
"[suggest \"orang\" idx]"
|
||||
(list "orange"))
|
||||
(list "suggest closest apply" "[suggest \"aply\" idx]" (list "apply"))
|
||||
(list "suggestN 1 banana" "suggestN 1 \"bananna\" idx" (list "banana"))
|
||||
(list
|
||||
"suggestN 2 ties alpha"
|
||||
"suggestN 2 \"aple\" idx"
|
||||
(list "ample" "apple"))
|
||||
(list "suggest empty term shortest" "[suggest \"\" idx]" (list "ample"))
|
||||
(list "suggest empty index" "[suggest \"apple\" emptyIndex]" (list ""))
|
||||
(list "suggestN empty index" "suggestN 1 \"apple\" emptyIndex" (list))))
|
||||
|
||||
;; Batch results for suggest-cases: one search-batch call with suggest-setup and
;; the expression string (element 1) of every case. Read by position below.
(define
|
||||
suggest-results
|
||||
(search-batch
|
||||
suggest-setup
|
||||
(map (fn (c) (nth c 1)) suggest-cases)))
|
||||
|
||||
;; Assertion pass over suggest-cases: hk-test is given the label (element 0), the
;; i-th entry of suggest-results, and element 2 of the case (presumably the
;; expected value; hk-test is defined elsewhere).
(map-indexed
|
||||
(fn
|
||||
(i c)
|
||||
(hk-test (nth c 0) (nth suggest-results i) (nth c 2)))
|
||||
suggest-cases)
|
||||
|
||||
;; Closing expression: a dict exposing hk-test-fail, hk-test-pass and
;; hk-test-fails under the keys :fail, :pass and :fails.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,53 +0,0 @@
|
||||
;; Extension — synonym / query expansion.
|
||||
;; synmap: car -> automobile, vehicle ; big -> large
|
||||
;; Corpus: 1 "fast car" 2 "shiny automobile" 3 "big truck" 4 "large house" 5 "vehicle review"
|
||||
|
||||
;; Setup source string for the synonym cases. It contains two bindings separated
;; by "\n": `synmap` (car -> automobile, vehicle; big -> large) and `idx`, built
;; by nesting indexDoc calls for docs 1 through 5 over emptyIndex.
(define
|
||||
syn-setup
|
||||
"synmap = [(\"car\", [\"automobile\", \"vehicle\"]), (\"big\", [\"large\"])]\nidx = indexDoc 5 \"vehicle review\" (indexDoc 4 \"large house\" (indexDoc 3 \"big truck\" (indexDoc 2 \"shiny automobile\" (indexDoc 1 \"fast car\" emptyIndex))))\n")
|
||||
|
||||
;; Case table for synonym / query expansion. Every entry is a three-element list:
;; label, expression source string, and the value the assertion pass compares
;; against. The expressions refer to `synmap` and `idx`, both bound in syn-setup.
(define
|
||||
syn-cases
|
||||
(list
|
||||
(list
|
||||
"expand term with synonyms"
|
||||
"expandTerm synmap \"car\""
|
||||
(list "car" "automobile" "vehicle"))
|
||||
(list
|
||||
"expand single synonym"
|
||||
"expandTerm synmap \"big\""
|
||||
(list "big" "large"))
|
||||
(list "expand unknown term" "expandTerm synmap \"banana\"" (list "banana"))
|
||||
(list
|
||||
"syn docs union"
|
||||
"synDocs synmap \"car\" idx"
|
||||
(list 1 2 5))
|
||||
(list
|
||||
"syn docs single synonym"
|
||||
"synDocs synmap \"big\" idx"
|
||||
(list 3 4))
|
||||
(list
|
||||
"syn docs no synonyms"
|
||||
"synDocs synmap \"house\" idx"
|
||||
(list 4))
|
||||
(list "syn docs absent" "synDocs synmap \"plane\" idx" (list))
|
||||
(list
|
||||
"syn rank expanded"
|
||||
"synRankTfIdf synmap \"car\" idx"
|
||||
(list 1 2 5))
|
||||
(list
|
||||
"syn rank single"
|
||||
"synRankTfIdf synmap \"big\" idx"
|
||||
(list 3 4))))
|
||||
|
||||
;; Batch results for syn-cases: one search-batch call with syn-setup and the
;; expression string (element 1) of every case. Read by position below.
(define
|
||||
syn-results
|
||||
(search-batch syn-setup (map (fn (c) (nth c 1)) syn-cases)))
|
||||
|
||||
;; Assertion pass over syn-cases: hk-test is given the label (element 0), the
;; i-th entry of syn-results, and element 2 of the case (presumably the
;; expected value; hk-test is defined elsewhere).
(map-indexed
|
||||
(fn
|
||||
(i c)
|
||||
(hk-test (nth c 0) (nth syn-results i) (nth c 2)))
|
||||
syn-cases)
|
||||
|
||||
;; Closing expression: a dict exposing hk-test-fail, hk-test-pass and
;; hk-test-fails under the keys :fail, :pass and :fails.
{:fail hk-test-fail :pass hk-test-pass :fails hk-test-fails}
|
||||
@@ -1,8 +0,0 @@
|
||||
;; search tokenizer — Haskell source fragment.
|
||||
;; normalize (lowercase + strip punctuation), split on whitespace, attach positions.
|
||||
;; tokens :: String -> [String]
|
||||
;; positioned :: String -> [(String, Int)] -- 0-based ordinal positions
|
||||
|
||||
;; Binds the tokenizer source to a single string literal; nothing is evaluated
;; here. The string holds newline-separated definitions, in order: lowerChar,
;; normChar, isBlankCh, dropBlanks, takeWord, afterWord, splitWords, appendStr,
;; joinChars, tokens, posFrom, positioned. normChar maps every non-alphanumeric
;; character to a space, so punctuation acts as a word separator before
;; splitWords runs; posFrom starts counting at 0.
(define
|
||||
search/tokenize-src
|
||||
"lowerChar c = chr (toLower (ord c))\nnormChar c = if isAlphaNum c then lowerChar c else ' '\nisBlankCh c = c == ' '\ndropBlanks [] = []\ndropBlanks (c:cs) = if isBlankCh c then dropBlanks cs else c:cs\ntakeWord [] = []\ntakeWord (c:cs) = if isBlankCh c then [] else c : takeWord cs\nafterWord [] = []\nafterWord (c:cs) = if isBlankCh c then c:cs else afterWord cs\nsplitWords s = let s2 = dropBlanks s in if null s2 then [] else takeWord s2 : splitWords (afterWord s2)\nappendStr a b = a ++ b\njoinChars cs = foldr appendStr \"\" cs\ntokens s = map joinChars (splitWords (map normChar s))\nposFrom i [] = []\nposFrom i (x:xs) = (x, i) : posFrom (i + 1) xs\npositioned s = posFrom 0 (tokens s)\n")
|
||||
449
plans/abstractions.md
Normal file
449
plans/abstractions.md
Normal file
@@ -0,0 +1,449 @@
|
||||
# Abstraction Radar — backlog
|
||||
|
||||
Maintained by the read-only `radar` loop (see `plans/agent-briefings/radar-loop.md`).
|
||||
Detection only — implementation is a separate, coordinated step owned by the
|
||||
relevant subsystem loop, never by radar.
|
||||
|
||||
**AHA gate to reach _Proposed_:** ≥3 real consumers · all past Phase 2 & API-stable ·
|
||||
structurally identical (file:line evidence) · a natural home (usually NOT lib/guest).
|
||||
Anything short → _Watching_ (what's missing) or _Rejected_ (why).
|
||||
|
||||
---
|
||||
|
||||
## Last scan
|
||||
|
||||
- **Date:** 2026-06-07 (radar loop, pass 22)
|
||||
- **Empty-discovery streak: passes 19–22** (last verified pass 22). Fleet at steady state —
|
||||
active loops (content CvRDT, events recurrence/reschedule, identity grant-mgmt, fed-sx
|
||||
outbox internals) are building *inside* their domains, not cross-cutting infra. Census
|
||||
exhausted (p17); all gates re-tested (W1 p18, W2 p19). No new candidate clears any gate.
|
||||
- **Radar is now trigger-driven.** The next substantive pass needs one of: **(a)** a new
|
||||
subsystem worktree spawning (auto-joins scan), or **(b)** host-persist's durable adapter
|
||||
landing → unblocks the W4 acl/mod→persist/log migration, or **(c)** a quiescent
|
||||
subsystem (acl/mod/search/commerce, static ~9–16 passes) resuming. Polling ~hourly until
|
||||
one fires; will tighten cadence then.
|
||||
- **Date:** 2026-06-07 (radar loop, pass 20)
|
||||
- **Pass 20 — honest empty pass.** 3 new census recurrences since p17 (normalize/index ×2,
|
||||
query ×3) — all **name collisions** (same noun, domain-specific op), added to the table.
|
||||
Recorded the meta-pattern: the fleet shares vocabulary, not structure. Most subsystems
|
||||
quiescent (acl/mod/search/commerce static ~9-15 passes = API-stable); only events/
|
||||
identity/content/fed-sx still committing domain features. No new gate-clearer.
|
||||
- **Date:** 2026-06-07 (radar loop, pass 19)
|
||||
- **Pass 19 — honest empty pass.** Scanned 10 active subsystems. content/index.sx is a
|
||||
blog index/tag-cloud listing (presentation, not full-text search — no search reinvention)
|
||||
and content/multi-doc indexing adds no per-viewer filter. **W2 re-tested: still 2**
|
||||
(feed, search) — acl's `permit?`-like matches are its own authZ *engine* (the home),
|
||||
not a downstream read filter. No new candidate cleared any gate.
|
||||
- **Date:** 2026-06-07 (radar loop, pass 18)
|
||||
- **Pass 18 — W1 gate re-test.** events shipped Phase 4 federation (5th consumer): a 5th
|
||||
divergent merge (sorted agenda + `:origin` provenance), trust-gate = runtime list
|
||||
membership (shares mod's mechanism, not acl's). Reinforces W1's "theme not shape" — but
|
||||
the **inject-fed-sx-transport seam is now 5/5**, strengthening "all are fed-sx
|
||||
consumers-in-waiting." Trust sub-pattern refined: mod+events (runtime set) vs acl (rule).
|
||||
- **Date:** 2026-06-07 (radar loop, pass 17)
|
||||
- **Pass 17 — filename census declared EXHAUSTED** (see the Census-status table below).
|
||||
Examined the last unswept ≥2 recurrences (schema/engine = acl⇄mod substrate twins;
|
||||
catalog/batch = name collisions; store = divergent). No new candidate. Incremental churn
|
||||
elsewhere (content 621/621, identity PAR, events reminders). Future passes pivot from
|
||||
censusing to re-testing gates as consumers mature.
|
||||
- **Date:** 2026-06-07 (radar loop, pass 16)
|
||||
- **Pass 16:** events started Phase 3 — **durable notification delivery on `lib/flow`**
|
||||
(new W8: at-least-once + idempotency exemplar; fed-sx/mod roll their own outbox). The two
|
||||
`notify.sx` (feed vs events) are a name collision (read-side digest vs delivery), noted
|
||||
in W8. Substrate-adoption story deepening: app domains now consume persist (content/
|
||||
commerce/events), flow (events), commerce (events), acl-authZ (identity).
|
||||
- **Date:** 2026-06-07 (radar loop, pass 15)
|
||||
- **Pass 15:** added the **scanning-method note** (below) after `query.sx` again proved to
|
||||
be merged-lib copies (lib/prolog + lib/persist in every worktree). Corrected census
|
||||
surfaced `wire`×2 (content+mod) → Rejected (shared role, divergent structure: generic SX
|
||||
serializer vs bespoke pipe-format under a Prolog-env string-prim constraint). events↔
|
||||
commerce integration appeared (paid tickets); acl/mod/search quiescent ~7 passes (now
|
||||
API-stable). No new gate-clearer.
|
||||
- **Date:** 2026-06-07 (radar loop, pass 14)
|
||||
- **Pass 14:** filename census flagged `snapshot`×N — but the `*/lib/persist/snapshot.sx`
|
||||
copies are just the merged `lib/persist` in each worktree, NOT consumers (same artifact
|
||||
as `lib/feed/rank.sx` everywhere). The one distinct file, `content/snapshot.sx`,
|
||||
reimplements persist's projection-checkpoint on raw KV instead of using `persist/snapshot`
|
||||
→ new W7 (persist-adoption nudge). `audit`×3 = the W4 fakes (acl/mod/identity), known.
|
||||
- **Date:** 2026-06-07 (radar loop, pass 13)
|
||||
- **Pass 13 — honest re-test, no gate-clearer.** Re-tested the two longest-waiting gates
|
||||
against the maturing app-domain loops: **W2** (per-viewer visibility) still 2 consumers
|
||||
(feed, search) — commerce/content/events/identity add no per-viewer read filter; **W3**
|
||||
(pagination) still 2 (feed, search) — `content/page.sx` is an HTML wrapper, not
|
||||
pagination (filename collision, noted in W3). Incremental churn only elsewhere.
|
||||
- **Date:** 2026-06-07 (radar loop, pass 12)
|
||||
- **Pass 12:** `events` shipped **transactional booking on persist** (3rd live persist
|
||||
consumer) using `persist/append-expect` (optimistic-concurrency CAS, lock-free capacity
|
||||
safety). W4 ledger now shows a persist feature-ladder append → append-once → append-expect
|
||||
that the hand-rolled fakes can't match. No new candidate; W4 reinforced.
|
||||
- **Date:** 2026-06-07 (radar loop, pass 11)
|
||||
- **Pass 11 — W4 sharpened with a consumer ledger.** commerce built an **order ledger on
|
||||
persist** (2nd live exemplar; uses `persist/append-once` for webhook idempotency) and
|
||||
identity a **grant audit ledger** (in-memory Erlang fake, gated on an Erlang↔persist
|
||||
bridge). The append-only monotonic-seq event-log pattern is now validated across 4
|
||||
domains, 2 live on persist + 3 fakes flagged for adoption. See W4 table.
|
||||
- **Date:** 2026-06-07 (radar loop, pass 10)
|
||||
- **Pass 10:** commerce/content/events/identity advancing (content 238/238). Probed a
|
||||
shape outside the routing table — **guarded lifecycle state machines** (mod/lifecycle +
|
||||
identity/membership) → new W6: shared *design principle*, divergent *structure*
|
||||
(SX transition-table vs Erlang gen_server), NOT an extraction target. No gate-clearer.
|
||||
- **Date:** 2026-06-07 (radar loop, pass 9)
|
||||
- **Pass 9:** `commerce` + `content` reached Phase 2 (`content` 162/162). **Key find:
|
||||
`content` built its op log directly on `persist/log`** (backend-injected, append+replay-
|
||||
to-seq) — the live reference exemplar for W4 (see W4). `events` MONTHLY RRULE,
|
||||
`identity` OAuth2 auth-code + PKCE, search boolean-filtered ranked. A1 still 6 adopters.
|
||||
- **Date:** 2026-06-06 (radar loop, pass 8)
|
||||
- **Pass 8 — fleet expanded by 4 app-domain loops** (the briefing's anticipated
|
||||
`commerce`/`identity` arrivals, auto-picked up by dynamic discovery). All early-stage,
|
||||
**pre-Phase-2 → moving targets, none count toward any gate yet**:
|
||||
- `commerce` (Phase 1: `api/cart/catalog/price`). Its "per-line audit" is a cost
|
||||
*breakdown view* (`api.sx:44`), **not** an append-only decision log → NOT a W4
|
||||
consumer.
|
||||
- `events` (Phase 1: `calendar.sx`, RRULE expansion).
|
||||
- `identity` (early: `session/token`). Defers authZ to acl (`token.sx:15`) — reinforces
|
||||
W2's "delegate `permit?` to acl-on-sx" routing; identity = authN, acl = authZ.
|
||||
- `content` (just-started: `block.sx`).
|
||||
These are the future consumers W2/W3 are waiting on — re-check their per-viewer filters
|
||||
/ pagination once each clears Phase 2. No new gate-clearer this pass.
|
||||
- **Pass 7:** **A1 jumped 4→6 adopters** — `acl` + `mod` migrated to the shared
|
||||
conformance driver (first app-domain adopters; proves it generalizes past substrates).
|
||||
`host-persist` closed its blob-adapter blocker (durable storage adapter now landing →
|
||||
W4 migration path opening). search shipped proximity/NEAR; flow + persist quiescent.
|
||||
- **Pass 6:** new worktree **`host-persist`** (active — building persist's durable host
|
||||
adapter); `feed` went quiescent (left tmux). acl shipped hardening (+25), fed-sx-m1 at
|
||||
Step 6c. **mod loop independently wrote a shared-plumbing note** (`mod-on-sx.md`,
|
||||
538b8a53) corroborating W4/W5 — folded its claims + home disagreements into W1/W4/W5.
|
||||
No new gate-clearer (audit log still 2 consumers), but consumers are now API-stable.
|
||||
- **Pass 5:** search (+highlight/snippet) and fed-sx-m1 (+follower_graph) moved; rest
|
||||
unchanged. Filename census: `api`×6, `fed`×3, then `schema/rank/query/page/explain/
|
||||
engine/batch/audit`×2. Examined the ×6 `api.sx` → Rejected (shared name, divergent
|
||||
structure incl. implicit-vs-explicit-state contract). rank/batch/engine all ≤2 +
|
||||
substrate/domain-divergent → no new gate-clearer.
|
||||
- **Pass 4:** no churn vs pass 3 (same worktrees/tmux/HEADs/adopters). Swept audit+explain
|
||||
surfaces: acl/mod share an append-only-log shape (→ sharpened W4 with persist/log API
|
||||
evidence) and a proof-explain shape (→ new W5, substrate-bound). No new gate-clearer.
|
||||
- **Pass 3 (earlier today):** subsystem set + tmux + A1 adopters (4) all unchanged vs pass 2. Loops
|
||||
advanced: acl shipped Phase 4 federation; search shipped Phase 4 + pagination; feed
|
||||
shipped pagination/threading; mod at Ext 19 (capstone); persist did a worked acl-grants
|
||||
migration (W4). New shape found: offset/limit pagination → folded into W3.
|
||||
- **Subsystem set discovered:** loop worktrees `acl, erlang, fed-prims, fed-sx-m1,
|
||||
feed, flow, go, kernel, mod, ocaml, persist, radar, ruby, search,
|
||||
sx-vm-extensions`; main-repo `lib/*` incl. merged `feed` + substrates (`apl,
|
||||
common-lisp, datalog, erlang, forth, go, haskell, hyperscript, js, lua, minikanren,
|
||||
ocaml, prolog, scheme, smalltalk, tcl`) + `lib/guest`.
|
||||
Actively looping (tmux): `acl, fed-sx-m1, feed, flow, mod, persist, search`
|
||||
(+ radar).
|
||||
- **New since pass 1:** worktrees `kernel` (empty/unset — not yet a repo) and `ocaml`
|
||||
(`lib/ocaml/baseline` only). Both early-stage, pre–Phase 2 → out of proposal scope.
|
||||
- Re-enumerate every pass; new loops (e.g. a future `commerce`/`identity`) auto-join.
|
||||
|
||||
**Census status (pass 17): EXHAUSTED.** Every own-namespace filename recurring ≥2× has
|
||||
been examined and dispositioned — further filename-censusing is low-yield until new
|
||||
subsystems/modules appear. Map:
|
||||
| filename | owners | verdict |
|
||||
|---|---|---|
|
||||
| `api` ×10 | all | Rejected — shared role, divergent state contract |
|
||||
| `fed`/`federation` | feed/search/mod/acl(+content) | W1 — theme not shape |
|
||||
| `audit` ×3 | acl/mod/identity | W4 — append-only log → persist/log |
|
||||
| `page` ×3 | feed/search (pagination) + content (HTML wrapper) | W3 + collision noted |
|
||||
| `explain` ×2 | acl/mod | W5 — proof tree, substrate-bound |
|
||||
| `snapshot` ×2 | persist(facet) + content(reinvents) | W7 |
|
||||
| `wire` ×2 | content(SX serializer) / mod(pipe-format) | Rejected — divergent |
|
||||
| `schema`,`engine` ×2 | acl/mod | substrate-twin parallels (Datalog vs Prolog); only audit (W4) is liftable |
|
||||
| `catalog`,`batch` ×2 | commerce/persist, mod/persist | name collisions, unrelated |
|
||||
| `normalize` ×2 | content(tree-prune)/feed(record-coerce) | name collision (pass 20) |
|
||||
| `index` ×2 | content(listing)/search(inverted index) | name collision (pass 20) |
|
||||
| `query` ×3 | content(doc-block)/search(bool AST)/persist(stream-read) | 3-way name collision (pass 20) |
|
||||
| `store` ×2 | content(on persist) / flow(workflow records) | related concept, divergent |
|
||||
| `rank` ×2 | feed/search | different domains (activities vs docs), ≤2 |
|
||||
**acl⇄mod are structural twins** (decision engine over a logic substrate, Datalog vs
|
||||
Prolog) — they parallel across engine/schema/explain/audit/fed, but only the *audit log*
|
||||
is substrate-agnostic and liftable (→ W4); the rest are substrate-idiomatic. Next passes:
|
||||
re-test gates (W2/W3/W8) as consumers mature, watch new modules — not re-census.
|
||||
|
||||
**Meta-pattern (pass 20):** new module names keep *recurring* but the operations keep
|
||||
*colliding* — same noun, domain-specific op (normalize, index, query, catalog, batch,
|
||||
notify, page, store all proved to be collisions). This is *why* genuine extraction
|
||||
candidates are rare: the fleet shares vocabulary, not structure. The real shared assets
|
||||
are the **substrate subsystems** (persist, flow, acl, fed-sx) that app domains *adopt*
|
||||
(W1/W2/W4/W7/W8), not hand-rolled libs to extract.
|
||||
|
||||
**Scanning-method note (learned the hard way, passes 5/12/14/15):** a filename census
|
||||
for *cross-subsystem* recurrence MUST restrict to each subsystem's OWN namespace —
|
||||
`X/lib/X/*.sx` — never `X/lib/*/`. The merged substrate libs (`lib/prolog`, `lib/persist`,
|
||||
`lib/feed`, `lib/datalog`, …) are checked out inside *every* worktree, so a naive census
|
||||
reports e.g. `query.sx`/`snapshot.sx`/`rank.sx` ×N as phantom recurrences that are really
|
||||
one merged file copied N times. Correct one-liner:
|
||||
`for w in <subsystems>; do for f in $w/lib/$w/*.sx; do basename $f .sx; done; done | sort | uniq -c | sort -rn`.
|
||||
|
||||
---
|
||||
|
||||
## Proposed (cleared the gate)
|
||||
|
||||
### A1 · Adopt the shared conformance driver across subsystems
|
||||
- **Pattern:** every subsystem hand-rolls a near-identical `conformance.sh`
|
||||
(epoch-load → eval → scoreboard emit) and an inline `<x>-test name got expected`
|
||||
pass/fail counter.
|
||||
- **Consumers (≥3, overwhelming):** 15 `lib/*/conformance.sh` — `apl, feed, datalog,
|
||||
flow, mod, lua, erlang, forth, go, common-lisp, haskell, js, ocaml, prolog,
|
||||
smalltalk, tcl`.
|
||||
- **Home:** `lib/guest` — the one legitimate exception (the shared driver
|
||||
`lib/guest/conformance.sh` + `lib/guest/conformance.sx` already exist; modes
|
||||
`dict` and `counters`).
|
||||
- **Status: IN PROGRESS — 6 adopters (pass 7).** `prolog` (dict), `haskell` (counters),
|
||||
`apl` (counters), `datalog` (dict), and **`acl` (dict) + `mod` (dict), newly migrated this
|
||||
pass** — all 3-line exec shims into `lib/guest/conformance.sh` with a `conformance.conf`.
|
||||
**acl + mod are the first *app-domain* adopters** (not language substrates) — strong
|
||||
evidence the driver generalizes beyond the substrate layer, which was the open question.
|
||||
The `apl` migration earlier *surfaced a latent bug*: the old awk extractor
|
||||
under-counted `pipeline` (40 vs the real 152 assertions); true apl total is **562**,
|
||||
not 450 — evidence that adopting the driver also improves correctness.
|
||||
- **Not a target (different harness shape):** `lua/conformance.sh` is a Python runner
|
||||
(`lib/lua/conformance.py`) that walks real `*.lua` source files via `lua-eval-ast`
|
||||
and classifies pass/fail/timeout — it does not run SX `deftest` suites with a
|
||||
counter/dict scoreboard, so the shared driver does not fit. Excluded, not pending.
|
||||
- **Remaining hand-rolled candidates (~120–220 lines each):** `common-lisp, erlang,
|
||||
feed, forth, go, js, ocaml, smalltalk, tcl` — each its OWN loop's migration when
|
||||
quiescent. (`search` + `lua` excluded: different harness shapes — search assembles a
|
||||
Haskell source string, lua walks real `*.lua` files.)
|
||||
- **Action:** each remaining subsystem's OWN loop migrates when quiescent — add a
|
||||
`conformance.conf` (+ a `test-harness.sx` preload defining its counters) and
|
||||
replace `conformance.sh` with the 1-line exec shim
|
||||
(`exec bash …/guest/conformance.sh …/conformance.conf "$@"`). Recipe template:
|
||||
`lib/haskell/conformance.conf` (counters) or `lib/prolog/conformance.conf` (dict).
|
||||
Keep the `bash lib/X/conformance.sh` entry point so no loop is disrupted.
|
||||
- **Priority: HIGH** (15 consumers, low risk, interface-preserving, additive).
|
||||
|
||||
---
|
||||
|
||||
## Watching (real but not yet through the gate)
|
||||
|
||||
### W1 · Federation scaffold (merge / ingest / backfill / trust-gate)
|
||||
- **FAILS the structural-identity gate (deep-dived 2026-06-06, all 4 read).** Consumer
|
||||
count is met (4 at the time of the deep-dive; 5 since pass 18) but they are *superficially* similar, not structurally identical —
|
||||
the federated unit and merge op differ fundamentally:
|
||||
|
||||
| Subsystem (file) | Federated unit | Merge op | Trust gate | Injected transport |
|
||||
|---|---|---|---|---|
|
||||
| feed (`fed.sx:14,18,40`) | activity streams | dedupe by `(actor verb object)` | none (visibility via `permit?` separately) | `send-fn`, `fetch-fn` |
|
||||
| search (`fed.sx:8`) | inverted indices | relabel DocId `peer*1000+local` + union posting lists | none | none (pure merge fn) |
|
||||
| mod (`fed.sx:11-14,99`) | moderation decisions | advisory-list vs applied-list; bind iff `mod/trusted?` | **yes — runtime list** `mod/trusted? peer scope` | mock outbox / `fed-send!` |
|
||||
| acl (`federation.sx:43,56`) | Datalog delegate facts | pull facts, gate by `trust`/`level_covers` rule, re-saturate | **yes — Datalog rule** at query time | `transport` dict |
|
||||
| events (`federation.sx`) | calendar agendas | fold trusted peers' agendas into one sorted agenda + `:origin` provenance | **yes — runtime list** `ev/trusts?` (peer-id ∈ trust-set) | injected behind `ev/peer-agenda` |
|
||||
|
||||
- **The ONLY real commonality is the injection seam** (now 5/5, pass 18), not extractable
|
||||
code: every one says "the real transport is `fed-sx`'s job; inject `send-fn`/`fetch-fn`/
|
||||
`transport`/`peer-agenda` and mock it in tests." That is an architectural *convention the
|
||||
fleet already follows*. The merge op diverges 5 ways (dedupe / index-union / advisory /
|
||||
fact-saturation / agenda-sort). The trust gate, where present, splits: **mod + events use
|
||||
a runtime trust-set membership check; acl uses a declarative Datalog rule** — so even the
|
||||
trust sub-pattern is 2-of-3, and the membership check is a trivial one-liner (below the
|
||||
extraction threshold). No shared merge, no single shared trust mechanism.
|
||||
- **Disposition:** do NOT extract a shared "federation lib." When `fed-sx` ships its
|
||||
real transport, these 5 become its *consumers* (wiring `send-fn`/`fetch-fn`/`transport`
|
||||
to it) — that work belongs to each subsystem's loop + the `fed-sx` loop, not a
|
||||
cross-cutting extraction. Stop re-proposing on the shared name. Home: `fed-sx`.
|
||||
- **Narrower sub-claim (mod note, pass 6; refined pass 18):** mod asserts the *fed
|
||||
trust/outbox* shape shares between mod+acl. Radar evidence refines this: the trust gate
|
||||
splits by mechanism, not by subsystem pair — **mod + events** both use a runtime
|
||||
trust-set membership check (`mod/trusted?`, `ev/trusts?`), while **acl** uses a Datalog
|
||||
rule. So a "trust-set membership" helper has 2 consumers (mod, events) — but it's a
|
||||
one-line `member?` and the merge it gates diverges, so still not worth extracting.
|
||||
Resolve at the architecture-merge point if a heavier shared trust-set surface emerges.
|
||||
|
||||
### W2 · Per-viewer visibility / permission filter
|
||||
- **2 shipped consumers, same shape** — `filter <injected-permit> <ranked/candidate stream>`:
|
||||
- `feed/lib/feed/acl.sx:27` `feed/visible = (feed/filter stream (fn (a) (permit? viewer a)))`,
|
||||
capstone at `:34` (stream → ACL → rank → top-N). `permit?` injected, sig `(viewer activity)→bool`.
|
||||
- `search/lib/search/fed.sx:16` `aclFilter permit docs = filter permit docs`;
|
||||
`topNTfIdfAcl n permit ts idx = take n (aclFilter permit (rankTfIdf ts idx))`.
|
||||
`permit` injected, sig `DocId→Bool` (viewer baked in by caller).
|
||||
- **NOT a consumer:** `mod/lib/mod/policy.sx` is moderation policy (reviewer actions),
|
||||
no per-viewer read filter. So mod won't be the 3rd.
|
||||
- **Missing:** (a) only 2 consumers, need ≥3; (b) the two interfaces *diverge* —
|
||||
feed passes `(viewer, item)`, search bakes the viewer in — so any shared form must
|
||||
pick a convention; (c) both already **inject** the predicate, and the filter body is
|
||||
literally one line (`filter permit xs`). Leaning toward: the predicate's home is
|
||||
`acl-on-sx` (`permit?`), and the one-line filter is too thin to extract.
|
||||
- **Home when ripe:** delegate `permit?` to `acl-on-sx`; do NOT extract the filter.
|
||||
Re-check if a 3rd genuine per-viewer read filter ships (e.g. events/commerce).
|
||||
|
||||
### W3 · Collection helpers (group-by, dedupe-by-key, stable top-N, distinct-order, offset/limit page)
|
||||
- feed built all of these on APL primitives. search/commerce/events will want
|
||||
group-by / top-N.
|
||||
- **NEW (2026-06-06): offset/limit pagination shipped in 2 subsystems, identical shape**
|
||||
`take limit (drop offset xs)`:
|
||||
- `feed/lib/feed/page.sx:9` `feed/page` (offset/limit window over a stream).
|
||||
- `search/lib/search/page.sx:9` `paginate off lim docs = take lim (drop off docs)`.
|
||||
- NOT a 3rd: `persist/lib/persist/query.sx:5` has a *since-cursor* for incremental log
|
||||
consumption — resumable-stream semantics, not result windowing. Different shape.
|
||||
- feed *also* has cursor-by-`:at` recency pagination (`page.sx:21-44`); search has no
|
||||
cursor. So only the plain offset/limit window is shared, and it is a literal 1-liner.
|
||||
- **Missing:** ≥3 stable consumers; AND every item here is collection math that belongs
|
||||
in the **substrate** (APL/Haskell already expose grade/sort/unique/take/drop), not a
|
||||
shared lib. A 1-line `take/drop` window is far below the extraction threshold. Watch;
|
||||
revisit only if a non-substrate subsystem needs the same windowing without take/drop.
|
||||
- **Filename-collision caution (pass 13):** `content/lib/content/page.sx` is an **HTML
|
||||
page wrapper** (full HTML5 doc), NOT pagination — do not count it as a 3rd pagination
|
||||
consumer. `page.sx` now means two unrelated things across the fleet. Re-tested pass 13:
|
||||
pagination still only feed + search (2).
|
||||
|
||||
### W4 · In-memory store fakes → `persist-on-sx`
|
||||
- Not an abstraction to extract — a migration target. Every subsystem fakes its
|
||||
store with a mutable list (`feed/-log`, flow store, mod audit, …).
|
||||
- **Owner:** `persist-on-sx` (in progress). Tracked there, listed here for visibility.
|
||||
- **Concrete instance (file:line, found pass 4): the append-only decision/audit log.**
|
||||
`acl/lib/acl/audit.sx` and `mod/lib/mod/audit.sx` are the SAME hand-rolled shape, and
|
||||
`persist/lib/persist/log.sx` (the persist *log facet*) already implements it durably:
|
||||
|
||||
| role | acl/audit.sx | mod/audit.sx | persist/log.sx (target) |
|
||||
|---|---|---|---|
|
||||
| log var | `acl-audit-log` :9 | `mod/*audit-log*` :10 | backend stream |
|
||||
| monotonic seq | `acl-audit-seq` :10 | `mod/*audit-seq*` :11 | per-stream high-water :1 |
|
||||
| append (auto-seq) | `acl-audit-decide!` | commit :32 | `persist/append` :17 |
|
||||
| count | `acl-audit-count` :51 | `mod/audit-count` :44 | `persist/count` :12 |
|
||||
| read-all oldest-first | snapshot/tail :73 | `mod/audit-all` :43 | `persist/read` :29 |
|
||||
| read seq≥from | — | by-seq | `persist/read-from` :31 |
|
||||
|
||||
Both deliberately use a monotonic seq with **no wall-clock** (deterministic/testable) —
|
||||
identical to persist/log's design. Action when persist's host adapter lands: acl + mod
|
||||
loops swap their in-memory log for `persist/log`. 2 consumers today; not a new lib —
|
||||
the home already exists. Belongs to acl/mod loops × persist loop, not an extraction.
|
||||
- **Cross-loop corroboration (pass 6):** the mod loop independently reached the same
|
||||
conclusion — `mod/plans/mod-on-sx.md` (commit 538b8a53): *"mod-sx (Prolog) and acl-sx
|
||||
(Datalog) converged on the same module shape … only the audit log + fed trust/outbox
|
||||
shapes truly share; extract at the architecture-merge point, refactoring both consumers
|
||||
atomically, not unilaterally from a loop branch."* Confirms the shape AND the
|
||||
do-not-extract-unilaterally stance.
|
||||
- **Home disagreement to resolve at merge:** mod's note proposes lifting the audit-log
|
||||
primitives into **`lib/guest/`**. Radar routing disagrees: a durable append-only log is
|
||||
a **`persist-on-sx`** concern (the log facet already exists), not language-impl plumbing.
|
||||
Hold the line — `lib/guest` is lexer/parser/AST/HM/test-runner, not an event log.
|
||||
- **Migration is becoming concrete:** new `host-persist` loop (worktree + tmux, pass 6)
|
||||
is building the durable-storage host adapter persist was blocked on — once it lands,
|
||||
acl/mod can actually swap to `persist/log`.
|
||||
- **LIVE REFERENCE EXEMPLAR (pass 9): `content` already does it right.** `content`
|
||||
(Phase 2 complete, 162/162) built its op log directly on `persist/log` instead of
|
||||
faking it — `content/lib/content/store.sx`: backend injected via `(persist/open)`
|
||||
("content knows nothing about which backend", :10); append op as event
|
||||
`persist/append b (content/-stream doc-id) …` (:20); read `persist/read` (:36);
|
||||
`persist/last-seq` (:47); **version = replay op stream up to a seq**
|
||||
(filter `persist/event-seq ev <= seq`, :61). "The op log is the source of truth …
|
||||
the materialised doc is a cache, never primary state."
|
||||
This proves the W4 target is real, not hypothetical: acl + mod's hand-rolled
|
||||
monotonic-seq logs should adopt exactly content's `persist/log` pattern.
|
||||
- **Consumer ledger of the append-only monotonic-seq event log (pass 11):**
|
||||
|
||||
| consumer | what | backing | note |
|
||||
|---|---|---|---|
|
||||
| content (`store.sx`) | doc op log | **persist/log ✓ live** | plain append + replay-to-seq |
|
||||
| commerce (`ledger.sx`) | order ledger | **persist/log ✓ live** | `persist/append-once` — idempotent, webhook-replay-safe :40,58 |
|
||||
| events (`booking.sx`) | booking roster | **persist/log ✓ live** | `persist/append-expect` — optimistic-concurrency CAS, capacity-safe, lock-free |
|
||||
| acl (`audit.sx`) | decision log | in-memory fake (SX) | migrate directly when host adapter lands |
|
||||
| mod (`audit.sx`) | decision log | in-memory fake (SX) | migrate directly |
|
||||
| identity (`audit.sx`) | grant ledger | in-memory fake (**Erlang**) | `{Seq,Subject,Action}`; needs an **Erlang↔persist bridge** first — author scoped it out until persist lands ("queryable semantics identical") |
|
||||
|
||||
- **Two takeaways:** (1) the pattern is **validated across domains** — CRDT doc ops,
|
||||
financial orders, event bookings, rule decisions, OAuth grants all reduce to the same
|
||||
append-only monotonic-seq stream; (2) migrating to `persist/log` is strictly *better*
|
||||
than the fakes — persist exposes a **feature ladder the fakes don't have**:
|
||||
`append` (content) → `append-once`/idempotency (commerce) →
|
||||
`append-expect`/optimistic-concurrency (events). Every fake would have to reinvent a weaker version of these.
|
||||
This is an **adoption** item (the home already exists), NOT a new extraction — owned by
|
||||
persist/host-persist × each consumer loop. The SX fakes (acl, mod) migrate directly;
|
||||
the Erlang fake (identity) is gated on an Erlang↔persist bridge.
|
||||
|
||||
### W5 · Proof-tree explanation over a logic-program derivation
|
||||
- `acl/lib/acl/explain.sx` (reconstructs a canonical proof by goal-directed search over a
|
||||
saturated Datalog db) and `mod/lib/mod/explain.sx` (renders a Prolog-style proof tree
|
||||
goal-by-goal with proved/unproved marks + unification bindings) are the same *idea*.
|
||||
- **Missing / disposition:** only 2 consumers, and they sit on **different substrates**
|
||||
(acl→`lib/datalog`, mod→`lib/prolog`). Proof reconstruction/rendering is logic-engine
|
||||
machinery → it belongs in each **substrate** (datalog/prolog), not a shared app lib.
|
||||
Watch; revisit only if a 3rd logic-backed subsystem reimplements proof explanation.
|
||||
- **Cross-loop note (pass 6):** mod's note calls `mod/proof-goals` (re-query-each-goal)
|
||||
generic and proposes lifting it into **`lib/guest/`**. Radar caveat: proof-tree
|
||||
reconstruction *is* engine-agnostic logic machinery, but `lib/guest` is for
|
||||
lexer/parser/AST/HM/match/test-runner — a logic-engine proof helper is a poor fit there.
|
||||
If genuinely shared by ≥3 engines, a `lib/logic`-style substrate helper is the better
|
||||
home than `lib/guest`. Still 2 consumers → stays Watching either way.
|
||||
|
||||
---
|
||||
|
||||
### W8 · Durable outbound delivery (at-least-once + idempotency + retry)
|
||||
- **Live exemplar on `lib/flow`:** `events/lib/events/notify.sx` — reminders/digests are
|
||||
durable `flow`s: a flow `request`s delivery (suspend point), the **host** performs the
|
||||
send via an injected `dispatch` transport, then resumes with the outcome; flow's
|
||||
deterministic replay means a completed delivery never re-runs on recovery. At-least-once
|
||||
with an idempotency key per message. This is "reliable delivery" done right on the flow
|
||||
substrate.
|
||||
- **Others roll their own:** `fed-sx` built its own outbox + `delivery_worker` + retry
|
||||
bookkeeping (Steps 8a–d); `mod/fed.sx` has an in-memory outbox seam; `acl/federation`
|
||||
propagates facts. Same *goal* (reliable outbound delivery, retry/idempotency) on
|
||||
different machinery.
|
||||
- **Disposition:** durable delivery is exactly what `lib/flow` is *for* (events proves
|
||||
it). Watch whether fed-sx / mod converge their outbox onto flow, or stay bespoke for
|
||||
perf/substrate reasons. 1 clean flow-based consumer today → Watching, not a proposal.
|
||||
- **Name-collision caveat:** `notify.sx` means two unrelated things — `feed/notify.sx` is
|
||||
a *read-side digest* (group inbox by verb+object), NOT delivery. Do not pair them.
|
||||
|
||||
### W7 · Snapshot/projection-checkpoint reimplemented vs `persist/snapshot` (delegate)
|
||||
- `persist/lib/persist/snapshot.sx` already provides a **generic** projection checkpoint:
|
||||
store `{:value :seq}` in the kv facet under a namespaced key; the headline property is
|
||||
**snapshot + tail == full replay** (pure, clock-free).
|
||||
- `content/lib/content/snapshot.sx` **reimplements that same pattern on raw persist KV**
|
||||
rather than delegating: `persist/kv-put b (content/-snap-key doc-id) {:doc … :seq seq}`
|
||||
(:20), `persist/kv-has?`/`kv-get` (:27-28), and its own tail-replay (:53-59). It never
|
||||
calls `persist/snapshot-*`. content's doc-materialisation *is* a projection fold over
|
||||
its op stream — exactly what `persist/snapshot` checkpoints generically.
|
||||
- **Disposition:** persist-adoption nudge (like W4): content could delegate to
|
||||
`persist/snapshot` (its projection = "fold ops → doc"), dropping the duplicated
|
||||
KV+replay code. Home already exists → NOT an extraction; owned by content × persist
|
||||
loops. Only 1 reinventor today; watch whether commerce/events/identity also hand-roll a
|
||||
snapshot on raw KV instead of using the facet (would strengthen the nudge). NB timeline:
|
||||
unclear if `persist/snapshot` predated content's — flag, don't blame.
|
||||
|
||||
### W6 · Guarded lifecycle state machine (illegal transition = explicit error)
|
||||
- Recurs as a **design principle**, NOT a shared structure (found pass 10):
|
||||
- `mod/lib/mod/lifecycle.sx` — pure SX: immutable case `{:state :error :history …}`,
|
||||
explicit transition table `mod/lc-transitions` (:31), illegal transition returns the
|
||||
case unchanged with `:error` set. States open→triaged→decided→appealed→final.
|
||||
- `identity/lib/identity/membership.sx` — an **Erlang `gen_server`** fragment (identity
|
||||
runs on erlang-on-sx): a `receive` loop with `case find(...) of … {error, St}` guards.
|
||||
States none→pending→active→lapsed→revoked.
|
||||
- **Both share the guideline** ("invalid transitions are explicit errors, never silent
|
||||
no-ops") but **implement it substrate-idiomatically** — SX transition-table over
|
||||
immutable values vs an Erlang process loop with per-message case guards. Same W1/`api.sx`
|
||||
trap: shared *idea*, divergent *structure*.
|
||||
- **Disposition:** not an extraction target — the FSM mechanism is ~10 substrate-specific
|
||||
lines; the value is in each domain's state graph, not the plumbing. At most a **design
|
||||
guideline** ("model lifecycle as a guarded FSM with explicit-error transitions"). Watch
|
||||
whether commerce-checkout / events-booking add their own — if so it confirms the
|
||||
*guideline*, still not a lib. Do not propose extracting a shared state-machine lib.
|
||||
|
||||
## Rejected (considered, declined — do not re-propose)
|
||||
|
||||
- **"Continuous auto-implementing abstractor loop."** Rejected at design time: an
|
||||
agent writing across `lib/<x>/**` breaks the worktree isolation that makes the
|
||||
fleet safe, and is rewarded for manufacturing premature/wrong abstractions. The
|
||||
radar is read-only by design. (This file is the alternative.)
|
||||
- **Shared `api.sx` "public boundary" module (×6).** Rejected pass 4-5: every subsystem
|
||||
has an `api.sx` (acl, feed, flow, mod, persist, search — a 100% filename match), but it
|
||||
is a naming *convention for the public entry point*, not a shared structure. They
|
||||
disagree on the most basic contract: acl/feed use **implicit module state**
|
||||
(`acl/api.sx` "implicit current db", `feed/api.sx` "single mutable log") while
|
||||
`persist/api.sx` threads an **explicit backend as every call's first arg**; flow's api
|
||||
*builds a Scheme env*, search's api *concatenates a Haskell source string*, mod's is a
|
||||
*lifecycle state-machine façade* (17 defs vs persist's 1). Same role, no common shape —
|
||||
the W1 coincidental-resemblance trap. Do not re-propose on the filename.
|
||||
- **Shared `wire.sx` "serialization" module (×2).** Rejected pass 15: content + mod both
|
||||
have a `wire.sx`, but `content/wire.sx` uses the **generic SX serializer**
|
||||
(`serialize`/`parse`, full-fidelity round-trip) while `mod/wire.sx` is a **bespoke
|
||||
versioned pipe-delimited line** (subset of fields, `split` hand-built over slice/len
|
||||
because mod's Prolog-loaded env strips string prims). Shared role (wire format),
|
||||
divergent structure + substrate constraint → not a candidate; the SX serializer is
|
||||
already the shared tool for SX-substrate subsystems, and mod can't use it. (Same family
|
||||
as the `api.sx` rejection above.)
|
||||
- **Dumping app-domain plumbing into `lib/guest`.** Rejected: `lib/guest` is for
|
||||
language-implementation plumbing. App patterns route to
|
||||
acl/fed-sx/persist/substrate/host instead (see the routing rule in the briefing).
|
||||
117
plans/agent-briefings/radar-loop.md
Normal file
117
plans/agent-briefings/radar-loop.md
Normal file
@@ -0,0 +1,117 @@
|
||||
# abstraction-radar loop agent (read-only scout)
|
||||
|
||||
Role: continuously scan **all** rose-ash subsystems for genuine abstraction /
|
||||
deduplication opportunities and maintain a ranked, evidence-backed backlog at
|
||||
`plans/abstractions.md`. You are a **scout, not an implementer** — you detect and
|
||||
document; you never refactor across subsystems.
|
||||
|
||||
```
|
||||
description: abstraction-radar (read-only scout)
|
||||
subagent_type: general-purpose
|
||||
run_in_background: true
|
||||
isolation: worktree
|
||||
```
|
||||
|
||||
## Prompt
|
||||
|
||||
You are the sole background agent on branch `loops/radar`, worktree
|
||||
`/root/rose-ash-loops/radar`, forever. Self-paced. Your ONLY writes are to
|
||||
`plans/abstractions.md` (and, rarely, refining this briefing). Push to
|
||||
`origin/loops/radar` after each update. Never touch `main` or `architecture`.
|
||||
|
||||
## The one hard rule: you do NOT edit `lib/**` — ever
|
||||
|
||||
You read across every subsystem and write findings to `plans/abstractions.md`.
|
||||
You do **not** implement abstractions, migrate code, or edit any `lib/<x>/**`
|
||||
file in any worktree. Implementation is a separate, coordinated, human-triggered
|
||||
step — proposing well is your whole job. An abstractor that writes across
|
||||
subsystems would collide with the very isolation that keeps the other loops safe;
|
||||
that is exactly why you are read-only.
|
||||
|
||||
## Dynamic discovery — re-enumerate every iteration, never hardcode
|
||||
|
||||
The set of subsystems grows as new loops are spawned. Each iteration, rebuild the
|
||||
list from the filesystem + tmux so newly-added subsystems are automatically in
|
||||
scope:
|
||||
|
||||
1. `ls -d /root/rose-ash-loops/*/` — every loop worktree. For a worktree named `X`,
|
||||
its in-flight subsystem is `lib/X/` **inside that worktree**
|
||||
(`/root/rose-ash-loops/X/lib/X/`) — that's the current, possibly-uncommitted
|
||||
state. Read it there, not from your own worktree.
|
||||
2. `ls -d /root/rose-ash/lib/*/` — subsystems merged into / dormant on the main repo
|
||||
(e.g. `feed` once merged, the language substrates `apl`/`haskell`/`prolog`/…).
|
||||
3. `tmux ls` — which subsystems are actively looping right now (affects whether a
|
||||
candidate's consumers are "stable" — see the gate).
|
||||
|
||||
Treat the union as your scan surface. When a `commerce` or `identity` loop appears
|
||||
later, step 1 picks it up with no change to you. Note in `abstractions.md` the
|
||||
date and the subsystem set you scanned, so drift is visible.
|
||||
|
||||
## The AHA gate — before ANY candidate goes in the backlog as "proposed"
|
||||
|
||||
"Avoid Hasty Abstractions." A wrong shared abstraction is far costlier than the
|
||||
duplication it replaces. A candidate may be listed as **proposed** only if ALL hold:
|
||||
|
||||
- **≥3 real consumers** (not 2 — three independent uses). Fewer → log it under
|
||||
"Watching" with its consumer count, do not propose.
|
||||
- **All consumers past Phase 2 and API-stable.** If a consumer's loop is mid-flight
|
||||
and its interfaces are still moving (`tmux ls` shows it active + its plan has
|
||||
unchecked early-phase boxes), the pattern is a moving target → "Watching."
|
||||
- **Structurally identical, not superficially similar.** Show the shared shape with
|
||||
file:line evidence from each consumer. Coincidental resemblance is the #1 trap.
|
||||
- **It has a natural home.** And that home is usually **not** `lib/guest` — see the
|
||||
routing rule below.
|
||||
|
||||
Anything failing a gate goes under **Watching** (with what's missing) or
|
||||
**Rejected** (with why), never silently dropped — so it isn't re-proposed each pass.
|
||||
|
||||
## Routing rule — most patterns do NOT belong in lib/guest
|
||||
|
||||
`lib/guest` is for **language-implementation plumbing**
|
||||
(lexer/parser/AST/HM/match/test-runner), and it has its own consumer-gated roadmap. App-subsystem patterns
|
||||
almost always have a better home — route, don't dump:
|
||||
|
||||
| Pattern kind | Home (not lib/guest) |
|
||||
|---|---|
|
||||
| per-viewer visibility / permission filter | `acl-on-sx` (delegate to `permit?`) |
|
||||
| federation scaffold (merge/ingest/backfill/trust) | `fed-sx` |
|
||||
| durable store / event log / kv | `persist-on-sx` |
|
||||
| collection math (group-by, dedupe, stable top-N) | the substrate (APL/Haskell/…) |
|
||||
| HTTP/handler/middleware plumbing | `host-on-sx` |
|
||||
| conformance/test harness | `lib/guest` (the one real exception — `test-runner.sx` + the shared driver live there) |
|
||||
|
||||
If a pattern's home is one of the subsystems, the recommended **action** is "adopt
|
||||
/ delegate there," and the work belongs to that subsystem's own loop (in its
|
||||
scope), not to a cross-cutting change.
|
||||
|
||||
## Each iteration
|
||||
|
||||
1. Re-discover the subsystem set (above). Record it + the date in `abstractions.md`.
|
||||
2. Pick ONE thread: either deep-dive a "Watching" candidate to gather file:line
|
||||
evidence and re-test its gates, or sweep for a new recurring shape across the
|
||||
current set.
|
||||
3. Update `plans/abstractions.md`: move items between Watching / Proposed /
|
||||
In-progress (owned by a subsystem loop) / Done / Rejected, with evidence.
|
||||
4. Keep it ranked by (consumers × effort-saved ÷ risk). Short, factual.
|
||||
5. Commit (`radar: <one-line finding>`) and push to `origin/loops/radar`.
|
||||
|
||||
Do not invent work to look busy: if a pass finds nothing that clears the gate,
|
||||
record `scanned N subsystems on <date>, no new candidates cleared the gate` and
|
||||
stop until next iteration. Empty passes are a valid, honest result.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- SX files: `sx-tree` MCP tools take `file:` not `path:`. But you mostly READ —
|
||||
prefer `sx_find_across`, `sx_comp_usage`, `sx_comp_list`, `sx_summarise`, plus
|
||||
`Grep`/`Glob`/`Bash` for cross-worktree scanning.
|
||||
- `plans/abstractions.md` is a `.md` — edit it with normal Write/Edit, not sx-tree.
|
||||
- Never run `sx_build`. You don't build anything; you read.
|
||||
|
||||
## Style
|
||||
|
||||
- Evidence over assertion: every proposed candidate cites file:line from each of its ≥3 consumers.
|
||||
- Honest empty passes. Rejected items stay rejected with a reason.
|
||||
- One finding per commit. Update. Push. Next.
|
||||
|
||||
Go. Read `plans/abstractions.md` (seeded), re-discover the subsystem set, and
|
||||
advance the highest-value thread.
|
||||
@@ -1,110 +0,0 @@
|
||||
# search-on-sx loop agent (single agent, queue-driven)
|
||||
|
||||
Role: iterates `plans/search-on-sx.md` forever. **Full-text + structured search on
|
||||
Haskell** — tokenize, inverted index, query AST, boolean + phrase + ranked
|
||||
queries (TF-IDF / BM25), ACL-aware post-filter, federated index merge. Typed ADTs
|
||||
make query parsing clean; lazy lists make posting-list iteration efficient. Sits on
|
||||
`lib/haskell/` (1514/1514 already green); adds a search-shaped vocabulary on top.
|
||||
|
||||
```
|
||||
description: search-on-sx queue loop
|
||||
subagent_type: general-purpose
|
||||
run_in_background: true
|
||||
isolation: worktree
|
||||
```
|
||||
|
||||
## Prompt
|
||||
|
||||
You are the sole background agent working `plans/search-on-sx.md`. Isolated
|
||||
worktree `/root/rose-ash-loops/search` on branch `loops/search`, forever, one
|
||||
commit per feature. Push to `origin/loops/search` after every commit. Never touch
|
||||
`main` or `architecture`.
|
||||
|
||||
## Restart baseline — check before iterating
|
||||
|
||||
1. Read `plans/search-on-sx.md` — roadmap + Progress log.
|
||||
2. `ls lib/search/` — pick up from the most advanced file.
|
||||
3. If `lib/search/tests/*.sx` exist, run them via `bash lib/search/conformance.sh`.
|
||||
Green before new work.
|
||||
4. If `lib/search/scoreboard.md` exists, that's your baseline.
|
||||
5. Read the `lib/haskell/` public API once — that's your substrate.
|
||||
`lib/haskell/haskell.sx` exists; also study `runtime.sx`, `eval.sx`, `parser.sx`, `infer.sx`,
|
||||
`match.sx`, `map.sx`, `set.sx`, `testlib.sx`. Learn how to declare ADTs, pattern
|
||||
match, and use the `Map`/`Set` helpers before writing index code. Verify the real
|
||||
exported names with sx_find_all / grep — don't assume from the plan's sketch.
|
||||
|
||||
## The queue
|
||||
|
||||
Phase order per `plans/search-on-sx.md`:
|
||||
|
||||
- **Phase 1** — tokenize + inverted index + simple term lookup
|
||||
(`Map Term [(DocId,[Pos])]`, insert/lookup, `(search/index doc)`,
|
||||
`(search/query term)`).
|
||||
- **Phase 2** — query AST + boolean/phrase eval (Term | And | Or | Not | Phrase;
|
||||
posting-list set ops; positional phrase match).
|
||||
- **Phase 3** — ranking (TF-IDF, BM25), top-N.
|
||||
- **Phase 4** — ACL-aware post-filter + federation (merge per-peer indices).
|
||||
|
||||
Within a phase, pick the checkbox that unlocks the most tests per effort.
|
||||
|
||||
Every iteration: implement → test → commit → tick `[ ]` → Progress log → next.
|
||||
|
||||
## Ground rules (hard)
|
||||
|
||||
- **Scope:** only `lib/search/**` and `plans/search-on-sx.md`. Do **not** edit
|
||||
`spec/`, `hosts/`, `shared/`, other `lib/<lang>/` dirs, `lib/stdlib.sx`, or
|
||||
`lib/` root. May **import** from `lib/haskell/` only (its public API). Do **not**
|
||||
modify Haskell.
|
||||
- **NEVER call `sx_build`.** 600s watchdog. If the sx_server binary is broken →
|
||||
Blockers entry, stop. Run tests by invoking the sx_server binary directly from a
|
||||
conformance.sh (model it on `lib/haskell/conformance.sh`), pointing `SX_SERVER`
|
||||
at `/root/rose-ash/hosts/ocaml/_build/default/bin/sx_server.exe` — fresh
|
||||
worktrees have no `_build/`, so the relative path won't resolve.
|
||||
- **Shared-file issues** → plan's Blockers with minimal repro; don't fix here.
|
||||
- **SX files:** `sx-tree` MCP tools ONLY. **They take `file:` not `path:`** — a
|
||||
wrong key yields `Yojson Type_error("Expected string, got null")`, which looks
|
||||
like a broken binary but is just a param mismatch. `sx_validate` after edits.
|
||||
Path-based edits (`sx_replace_node`) count comment headers in their indices and
|
||||
can clobber the wrong node — re-read after, or prefer `sx_write_file` for small
|
||||
files.
|
||||
- **Unicode in `.sx`:** raw UTF-8 only, never `\uXXXX` escapes.
|
||||
- **Commit granularity:** one feature per commit. Short factual messages
|
||||
(`search: phrase query positional match + 7 tests`). Push to `origin/loops/search`.
|
||||
- **Plan file:** update Progress log (newest first) + tick boxes every commit.
|
||||
|
||||
## search-specific gotchas
|
||||
|
||||
- **Posting lists are the hot path.** Keep them sorted by DocId so boolean AND/OR
|
||||
are linear merges, not nested scans. Phrase match needs positions, so store
|
||||
`(DocId, [Pos])` — don't drop positions early to save space; you can't recover them.
|
||||
- **Tokenization decides recall.** Normalize consistently (lowercase, strip
|
||||
punctuation) on BOTH index and query side, or queries silently miss. Test the
|
||||
index/query symmetry explicitly.
|
||||
- **Ranking must be deterministic on ties.** TF-IDF/BM25 scores collide; always
|
||||
add a stable tiebreak (DocId ascending) or tests flake.
|
||||
- **ACL filter is per-viewer and post-ranking.** Filter the result list against the
|
||||
viewer, after scoring — never bake visibility into the index (the same index
|
||||
serves all viewers). Inject the permit predicate; don't hardwire an ACL module
|
||||
that doesn't exist yet.
|
||||
- **Federation merges indices, not results.** Merging per-peer inverted indices
|
||||
(union posting lists per term) is cleaner and rank-correct vs merging ranked
|
||||
result lists. Mock peer indices in tests.
|
||||
|
||||
## General gotchas (all loops)
|
||||
|
||||
- SX `do` = R7RS iteration. Use `begin` for multi-expr sequences.
|
||||
- `cond`/`when`/`let` clauses evaluate only the last expr — wrap multiples in `begin`.
|
||||
- `let` is parallel, not sequential — nest `let`s when a binding references an earlier one.
|
||||
- `env-bind!` creates a binding; `env-set!` mutates an existing one (walks scope chain).
|
||||
- `sx_validate` after every structural edit.
|
||||
- Namespace-prefix all guest helpers (`search/...`) — short/host-colliding names
|
||||
get silently shadowed or hang the runtime.
|
||||
|
||||
## Style
|
||||
|
||||
- No comments in `.sx` unless non-obvious.
|
||||
- No new planning docs — update `plans/search-on-sx.md` inline.
|
||||
- Short, factual commit messages.
|
||||
- One feature per iteration. Commit. Log. Push. Next.
|
||||
|
||||
Go. Start by reading the plan; find the first unchecked `[ ]`; implement it.
|
||||
82
plans/commerce-on-sx.md
Normal file
82
plans/commerce-on-sx.md
Normal file
@@ -0,0 +1,82 @@
|
||||
# commerce-on-sx: Catalog, cart, pricing & orders on miniKanren
|
||||
|
||||
> **DRAFT outline.** The revenue vertical. Depends on `persist-on-sx` (durable
|
||||
> orders) and `flow-on-sx` (checkout as a durable flow). Don't start before
|
||||
> persist-on-sx Phase 1 is green.
|
||||
|
||||
rose-ash's revenue engine — market (catalog), cart (checkout), orders (SumUp
|
||||
payment, reconciliation) — has no SX subsystem. The hard part of commerce isn't
|
||||
CRUD; it's **pricing**: discounts, bundles, tax, membership rates, promotions that
|
||||
stack (or don't). These are relations, and a relational engine can run them in
|
||||
multiple directions — forward ("what's the total?") and backward ("what promo code
|
||||
yields this total?", "which line item triggered the discount?").
|
||||
|
||||
That's a miniKanren fit. Pricing/promotion rules are relational; cart and order
|
||||
*lifecycle* (reserve → pay → fulfil → reconcile) is a durable `flow`; the order
|
||||
ledger is a `persist` stream. Commerce is the first real **composition** subsystem.
|
||||
|
||||
End-state: a catalog model, a relational pricing/promotion engine, a cart with
|
||||
deterministic totals, and an order lifecycle flow with payment-webhook
|
||||
reconciliation — all auditable via the event log.
|
||||
|
||||
## Status (rolling)
|
||||
|
||||
`bash lib/commerce/conformance.sh` → **0/0** (not yet started)
|
||||
|
||||
## Ground rules
|
||||
|
||||
- **Scope:** only `lib/commerce/**` and `plans/commerce-on-sx.md`. May **import**
|
||||
from `lib/minikanren/`, and (once they exist) `lib/persist/` + `lib/flow/`. Do not
|
||||
edit substrates.
|
||||
- **Architecture:** prices/promotions are miniKanren relations over catalog facts;
|
||||
a cart total is a *deterministic* query result (first solution under a fixed rule
|
||||
order). Order lifecycle is a `flow` that suspends at the payment IO boundary.
|
||||
Money is integer minor units — never floats.
|
||||
- **Determinism:** promotion stacking must have explicit, tested precedence;
|
||||
totals must be reproducible from the cart + catalog snapshot.
|
||||
- **Commits:** one feature per commit. Progress log + tick boxes.
|
||||
|
||||
## Architecture sketch
|
||||
|
||||
```
|
||||
Catalog + cart Total / order
|
||||
product(id,price,tags) {:subtotal :discounts :tax :total}
|
||||
│ ▲
|
||||
▼ │
|
||||
lib/commerce/catalog.sx lib/commerce/price.sx
|
||||
— product / variant / stock facts — miniKanren pricing relations
|
||||
│ — promo stacking, membership rates
|
||||
▼ ▲
|
||||
lib/commerce/cart.sx lib/commerce/order.sx (flow + store)
|
||||
— line items, quantities — reserve→pay→fulfil→reconcile
|
||||
│ — SumUp webhook = flow resume
|
||||
▼ │
|
||||
lib/commerce/api.sx ── (commerce/add) (commerce/total) (commerce/checkout) ──┘
|
||||
```
|
||||
|
||||
## Phase 1 — Catalog + cart + deterministic totals
|
||||
- [ ] `catalog.sx` — product/variant/stock as facts
|
||||
- [ ] `cart.sx` — line items, add/remove/qty
|
||||
- [ ] `price.sx` — base pricing relation, subtotal; tax
|
||||
- [ ] `api.sx` + tests + scoreboard + conformance.sh
|
||||
|
||||
## Phase 2 — Promotions (relational)
|
||||
- [ ] promo rules: percentage, fixed, bundle, member rate
|
||||
- [ ] explicit stacking precedence; "best price" backward query
|
||||
- [ ] tests: stacking order, mutually-exclusive promos, member vs guest
|
||||
|
||||
## Phase 3 — Order lifecycle (flow + store)
|
||||
- [ ] order flow: reserve stock → await payment → fulfil
|
||||
- [ ] payment webhook resumes the suspended flow
|
||||
- [ ] order ledger as a `persist` stream; idempotent reconciliation
|
||||
|
||||
## Phase 4 — Reconciliation + federation
|
||||
- [ ] mismatch detection (paid≠ordered) as queries over the ledger
|
||||
- [ ] cross-instance catalog (federated marketplace) — out-of-scope stub
|
||||
- [ ] tests: webhook replay, partial refund, double-charge guard
|
||||
|
||||
## Progress log
|
||||
(loop fills this in)
|
||||
|
||||
## Blockers
|
||||
(loop fills this in)
|
||||
82
plans/content-on-sx.md
Normal file
82
plans/content-on-sx.md
Normal file
@@ -0,0 +1,82 @@
|
||||
# content-on-sx: Documents, blocks & collaborative editing on Smalltalk
|
||||
|
||||
> **DRAFT outline.** The CMS vertical — blog, WYSIWYG editor, Ghost sync. Depends
|
||||
> on `persist-on-sx` (document history as an event log). Ghost/CMS sync stays a thin
|
||||
> external adapter (Python/FFI) until a native replacement exists.
|
||||
|
||||
rose-ash's `blog` domain is content management: a block-based WYSIWYG editor,
|
||||
navigation, Ghost CMS sync. A document is a tree of live blocks; editing is a
|
||||
stream of operations; collaboration needs conflict-free merge. That is an object
|
||||
model — blocks are objects, edits are messages, and a document is the object graph
|
||||
responding to them. Smalltalk's "everything is an object responding to messages"
|
||||
maps directly to a block/WYSIWYG model, and a semilattice (CRDT) merge keeps
|
||||
concurrent edits conflict-free.
|
||||
|
||||
End-state: a Smalltalk-on-SX document model (typed blocks, structural ops),
|
||||
operation log + CRDT merge for collaborative editing, versioning/history via the
|
||||
event store, and a render boundary to HTML/SX. External CMS (Ghost) sync is an
|
||||
injected adapter, not core.
|
||||
|
||||
## Status (rolling)
|
||||
|
||||
`bash lib/content/conformance.sh` → **0/0** (not yet started)
|
||||
|
||||
## Ground rules
|
||||
|
||||
- **Scope:** only `lib/content/**` and `plans/content-on-sx.md`. May **import**
|
||||
from `lib/smalltalk/`, and (once it exists) `lib/persist/`. Do not edit substrates.
|
||||
- **Architecture:** a document is an ordered tree of blocks (objects); an edit is a
|
||||
message (`insert`/`update`/`move`/`delete`); concurrent edits merge via a
|
||||
commutative (CRDT/semilattice) operation so order doesn't matter. History is the
|
||||
`persist` event stream; any version is a replay.
|
||||
- **Determinism:** merge must be commutative + idempotent (test: apply ops in any
|
||||
order / twice → same document).
|
||||
- **Commits:** one feature per commit. Progress log + tick boxes.
|
||||
|
||||
## Architecture sketch
|
||||
|
||||
```
|
||||
Edit op Rendered document
|
||||
(insert block after id) ... HTML / SX tree
|
||||
│ ▲
|
||||
▼ │
|
||||
lib/content/block.sx lib/content/render.sx
|
||||
— typed blocks as objects — block tree → HTML/SX
|
||||
— heading/text/image/embed — (reuses SX render boundary)
|
||||
│ ▲
|
||||
▼ │
|
||||
lib/content/doc.sx lib/content/merge.sx
|
||||
— ordered block tree — CRDT/semilattice op merge
|
||||
— apply op, structural moves — concurrent-edit reconciliation
|
||||
│ ▲
|
||||
▼ │
|
||||
lib/content/api.sx ── (content/edit) (content/render) (content/history) ──┐
|
||||
│ │
|
||||
├── op log + versions → persist │
|
||||
└── Ghost/CMS sync → injected external adapter (thin, non-core) ──┘
|
||||
```
|
||||
|
||||
## Phase 1 — Block document model
|
||||
- [ ] `block.sx` — typed block objects
|
||||
- [ ] `doc.sx` — ordered tree, apply edit op, structural moves
|
||||
- [ ] `render.sx` — block tree → HTML/SX
|
||||
- [ ] `api.sx` + tests + scoreboard + conformance.sh
|
||||
|
||||
## Phase 2 — Op log + versioning
|
||||
- [ ] edit ops as `persist` events; replay to any version
|
||||
- [ ] `(content/history doc)`, diff between versions
|
||||
|
||||
## Phase 3 — Collaborative merge (CRDT)
|
||||
- [ ] commutative/idempotent op merge
|
||||
- [ ] concurrent-edit tests (any order, double-apply → identical)
|
||||
|
||||
## Phase 4 — External sync + federation
|
||||
- [ ] Ghost/CMS sync via injected adapter (import/export)
|
||||
- [ ] federated documents (peer-authored blocks) — trust-gated stub
|
||||
- [ ] tests: round-trip import/export, conflict on concurrent external edit
|
||||
|
||||
## Progress log
|
||||
(loop fills this in)
|
||||
|
||||
## Blockers
|
||||
(loop fills this in)
|
||||
81
plans/events-on-sx.md
Normal file
81
plans/events-on-sx.md
Normal file
@@ -0,0 +1,81 @@
|
||||
# events-on-sx: Calendar, ticketing & notification delivery on Datalog
|
||||
|
||||
> **DRAFT outline.** The events vertical + the shared notification-delivery edge.
|
||||
> Depends on `persist-on-sx` (bookings ledger) and `flow-on-sx` (reminders, retrying
|
||||
> delivery). Pairs with `commerce-on-sx` for paid tickets.
|
||||
|
||||
rose-ash's `events` domain is calendar + ticketing: recurring events, availability,
|
||||
capacity, bookings. Scheduling is constraint reasoning — "is this slot free given
|
||||
recurrence, capacity, and the attendee's other bookings?" — which is rule
|
||||
evaluation over facts. Datalog expresses availability, recurrence expansion, and
|
||||
capacity as rules; a booking is a transaction; reminders and digests are durable
|
||||
`flow`s. Notification *delivery* (email/push) — needed here and by `feed/notify` —
|
||||
is folded in as an injected transport, extractable later.
|
||||
|
||||
End-state: a Datalog-on-SX events layer with recurrence expansion, availability +
|
||||
capacity rules, transactional booking, and a flow-driven notification dispatcher
|
||||
(reminders, digests, retries) over an injected transport.
|
||||
|
||||
## Status (rolling)
|
||||
|
||||
`bash lib/events/conformance.sh` → **0/0** (not yet started)
|
||||
|
||||
## Ground rules
|
||||
|
||||
- **Scope:** only `lib/events/**` and `plans/events-on-sx.md`. May **import** from
|
||||
`lib/datalog/`, and (once they exist) `lib/persist/` + `lib/flow/`. Do not edit
|
||||
substrates.
|
||||
- **Architecture:** events/availability/capacity are Datalog facts + rules;
|
||||
recurrence expands to occurrence facts within a window; a booking checks rules
|
||||
then appends a `persist` event (idempotent, capacity-safe). Notifications are flows
|
||||
that suspend on transport IO and retry on failure.
|
||||
- **Determinism:** recurrence expansion + availability must be reproducible for a
|
||||
fixed window + ruleset; capacity checks must be race-safe (no overbooking).
|
||||
- **Commits:** one feature per commit. Progress log + tick boxes.
|
||||
|
||||
## Architecture sketch
|
||||
|
||||
```
|
||||
Event + booking Result
|
||||
event(id,start,rrule,capacity) {:booked | :full | :conflict} + reminders
|
||||
│ ▲
|
||||
▼ │
|
||||
lib/events/calendar.sx lib/events/availability.sx
|
||||
— event facts, recurrence (RRULE) — free/busy + capacity rules (Datalog)
|
||||
— expand occurrences in window │
|
||||
│ ▲
|
||||
▼ │
|
||||
lib/events/booking.sx lib/events/notify.sx (flow)
|
||||
— transactional, capacity-safe — reminders / digests, retry on fail
|
||||
— bookings → persist ledger — injected transport (email/push)
|
||||
│ │
|
||||
▼ ▼
|
||||
lib/events/api.sx ── (events/schedule) (events/book) (events/agenda) ──────┘
|
||||
```
|
||||
|
||||
## Phase 1 — Calendar + recurrence
|
||||
- [ ] `calendar.sx` — event facts, RRULE expansion in a window
|
||||
- [ ] `availability.sx` — free/busy rules
|
||||
- [ ] `api.sx` + tests + scoreboard + conformance.sh
|
||||
|
||||
## Phase 2 — Ticketing + booking
|
||||
- [ ] capacity rules; transactional booking → `persist` (no overbooking)
|
||||
- [ ] paid tickets compose with `commerce` order flow
|
||||
- [ ] tests: capacity edge, double-book guard, conflict detection
|
||||
|
||||
## Phase 3 — Notification delivery (flow)
|
||||
- [ ] `notify.sx` — reminder/digest flows over injected transport
|
||||
- [ ] retry/backoff on transport failure (flow suspend/resume)
|
||||
- [ ] tests: delivery success, retry path, idempotent re-send
|
||||
- [ ] NOTE: shared with `feed/notify` — candidate for later extraction to a
|
||||
`delivery-on-sx` once a second consumer is real
|
||||
|
||||
## Phase 4 — Federation
|
||||
- [ ] cross-instance events (peer calendar) — trust-gated stub
|
||||
- [ ] tests: federated agenda merge
|
||||
|
||||
## Progress log
|
||||
(loop fills this in)
|
||||
|
||||
## Blockers
|
||||
(loop fills this in)
|
||||
100
plans/host-on-sx.md
Normal file
100
plans/host-on-sx.md
Normal file
@@ -0,0 +1,100 @@
|
||||
# host-on-sx: The SX web host — off Quart, onto the kernel (Dream-bound)
|
||||
|
||||
> **DRAFT outline.** The integration boundary that turns the subsystem libraries
|
||||
> into running services, and the strangler path off Python/Quart. This is the
|
||||
> dependency hub — it imports every subsystem. Decision recorded below: native
|
||||
> server + SXTP **now**, `dream-on-sx` framework layer **next**, Python only at the
|
||||
> external-integration edges.
|
||||
|
||||
The subsystems (`feed`, `search`, `acl`, `mod`, `flow`, `commerce`, `identity`,
|
||||
`content`, `events`) are libraries. Something has to receive an HTTP request, route
|
||||
it, call the right subsystem, and serialize the response. Today that's Python/Quart
|
||||
— the one large non-SX component in the stack: separate runtime, deploy, and
|
||||
failure mode. The goal is to move the web/host/domain layer onto the SX substrate
|
||||
and retire Quart, **incrementally (strangler-fig), never big-bang.**
|
||||
|
||||
This is already underway: a native OCaml HTTP server is live in prod on
|
||||
`sx.rose-ash.com` (~3ms cached, ~323 req/s, ~2MB RSS), `defhandler`/`defpage`
|
||||
exist, and a partial **SXTP** protocol is specced. That is the unblocked near-term
|
||||
host — no `ocaml-on-sx` dependency.
|
||||
|
||||
## Two layers, two timelines — plus one standing rule
|
||||
|
||||
1. **Now (unblocked): native server + SXTP adapter + SX handlers.** Route rose-ash
|
||||
endpoints onto the SX host one at a time. Each migrated endpoint is an SX
|
||||
handler dispatching to a subsystem; Quart proxies the rest until cut over.
|
||||
2. **Next: `dream-on-sx` as the framework layer.** Dream gives Quart-grade
|
||||
ergonomics — typed routing, middleware stacks, sessions, CSRF. It is gated on
|
||||
`ocaml-on-sx` Phases 1–5 + minimal stdlib. **This plan is the concrete target
|
||||
user that un-parks `dream-on-sx`** (see `plans/dream-on-sx.md`): "the subsystems
|
||||
need an HTTP front door" is the real feature pulling Dream. Until then, do not
|
||||
block migration on Dream — the native server is sufficient.
|
||||
3. **Always: Python only at the edges.** External integrations — SumUp payments,
|
||||
Ghost CMS, ActivityPub crypto, IPFS/Kubo — ride Python libraries today. They
|
||||
stay as thin injected adapters (Python/FFI) behind subsystem interfaces until
|
||||
native replacements exist. "Drop Quart" ≠ "drop every line of Python."
|
||||
|
||||
## Status (rolling)
|
||||
|
||||
`bash lib/host/conformance.sh` → **0/0** (not yet started)
|
||||
|
||||
## Ground rules
|
||||
|
||||
- **Scope:** `lib/host/**` and `plans/host-on-sx.md`. May **import** every subsystem
|
||||
+ the kernel's server/SXTP surface. Do **not** edit `spec/`, `hosts/`, `shared/`,
|
||||
or subsystem internals — wire to their public APIs only. Host-primitive / server
|
||||
changes belong in `hosts/` (out of scope) → Blockers.
|
||||
- **Architecture:** a route maps (method, path) → handler; a handler is an SX fn
|
||||
`request -> response` that calls subsystem APIs; middleware is composed handlers
|
||||
(auth via `identity`, permission via `acl`, mute via subsystem prefs). SXTP is the
|
||||
wire format between host and subsystem-as-service.
|
||||
- **Migration discipline:** each endpoint moved must be behavior-equivalent to its
|
||||
Quart original (golden-response test before flip). Keep a migration ledger.
|
||||
- **Commits:** one feature per commit. Progress log + tick boxes.
|
||||
|
||||
## Architecture sketch
|
||||
|
||||
```
|
||||
HTTP request HTTP response
|
||||
│ ▲
|
||||
▼ │
|
||||
native OCaml http server (prod) ──────► lib/host/router.sx
|
||||
(hosts/ — out of scope) — (method,path) → handler
|
||||
│ ▲
|
||||
▼ │
|
||||
lib/host/middleware.sx lib/host/handler.sx
|
||||
— auth(identity) ∘ acl ∘ mute ∘ ... — request → subsystem call → response
|
||||
│ ▲
|
||||
▼ │
|
||||
lib/host/sxtp.sx subsystem APIs (feed/search/commerce/…)
|
||||
— wire format, host↔service — called via public interfaces
|
||||
│
|
||||
└── external edges: SumUp / Ghost / AP / IPFS → injected Python/FFI adapters
|
||||
```
|
||||
|
||||
## Phase 1 — Router + handler + one real endpoint
|
||||
- [ ] `router.sx` — route table, (method,path) match
|
||||
- [ ] `handler.sx` — request/response model, subsystem dispatch
|
||||
- [ ] migrate ONE read endpoint (e.g. a feed timeline) end-to-end, golden test
|
||||
- [ ] `conformance.sh` + scoreboard
|
||||
|
||||
## Phase 2 — Middleware + SXTP
|
||||
- [ ] `middleware.sx` — composable auth/acl/mute/error layers
|
||||
- [ ] `sxtp.sx` — host↔subsystem wire format (align with existing spec)
|
||||
- [ ] migrate a write endpoint (auth + permission + action)
|
||||
|
||||
## Phase 3 — Strangler migration ledger
|
||||
- [ ] enumerate Quart endpoints; track migrated vs proxied
|
||||
- [ ] golden-response harness vs the live Quart responses
|
||||
- [ ] cut over a whole domain (smallest: `likes` or `relations`) as proof
|
||||
|
||||
## Phase 4 — Dream framework layer (gated)
|
||||
- [ ] gate: `ocaml-on-sx` Phases 1–5 + minimal stdlib green
|
||||
- [ ] adopt `dream-on-sx` routing/middleware/session ergonomics over the same handlers
|
||||
- [ ] re-home external adapters as native where replacements land
|
||||
|
||||
## Progress log
|
||||
(loop fills this in)
|
||||
|
||||
## Blockers
|
||||
(loop fills this in)
|
||||
84
plans/identity-on-sx.md
Normal file
84
plans/identity-on-sx.md
Normal file
@@ -0,0 +1,84 @@
|
||||
# identity-on-sx: OAuth2, sessions & membership on Erlang
|
||||
|
||||
> **DRAFT outline.** The identity core that `acl-on-sx` assumes already exists. `acl`
|
||||
> answers "may X do Y"; identity answers "who is X, and how did they prove it."
|
||||
> Depends on `persist-on-sx` (grant/audit ledger). Pairs with `acl-on-sx`.
|
||||
|
||||
rose-ash's `account` domain is the OAuth2 authorization server every other app is
|
||||
a client of: silent SSO, per-app first-party cookies, grant verification,
|
||||
membership. Sessions and grants are **long-lived, concurrent, individually
|
||||
addressable, and expire on their own** — that is the actor model. Erlang's
|
||||
processes + mailboxes map cleanly: a session is a process, token issue/refresh/
|
||||
revoke are messages, expiry is a process timeout, and SSO is one process answering
|
||||
many apps.
|
||||
|
||||
End-state: an Erlang-on-SX layer with the OAuth2 authorization-code + silent
|
||||
(`prompt=none`) flows as message protocols, a session/grant registry, token
|
||||
lifecycle (issue/refresh/revoke/introspect), and membership state — all auditable
|
||||
through the event log, all authorization questions delegated to `acl-on-sx`.
|
||||
|
||||
## Status (rolling)
|
||||
|
||||
`bash lib/identity/conformance.sh` → **0/0** (not yet started)
|
||||
|
||||
## Ground rules
|
||||
|
||||
- **Scope:** only `lib/identity/**` and `plans/identity-on-sx.md`. May **import**
|
||||
from `lib/erlang/`, and (once they exist) `lib/persist/` + `lib/acl/`. Do not edit
|
||||
substrates.
|
||||
- **Architecture:** a session/grant is a process holding its own state; the
|
||||
registry routes messages by subject/client id. Tokens are opaque + introspected,
|
||||
not self-validating (revocation must be real). Authorization decisions are NOT
|
||||
made here — `identity` proves identity, `acl` decides permission.
|
||||
- **Security:** revocation is immediate (kill the process / tombstone the grant);
|
||||
no decision relies on a token that outlived its grant. Negative answers are
|
||||
explicit, never "absence of a yes."
|
||||
- **Commits:** one feature per commit. Progress log + tick boxes.
|
||||
|
||||
## Architecture sketch
|
||||
|
||||
```
|
||||
Auth request Token / session
|
||||
(authorize client scope subject) {:access :refresh :expires :grant}
|
||||
│ ▲
|
||||
▼ │
|
||||
lib/identity/oauth.sx lib/identity/token.sx
|
||||
— authz-code + prompt=none flows — issue / refresh / revoke / introspect
|
||||
— as Erlang message protocols — opaque tokens, grant-backed
|
||||
│ ▲
|
||||
▼ │
|
||||
lib/identity/session.sx lib/identity/registry.sx
|
||||
— session = process, expiry=timeout — route by subject/client; SSO fan-out
|
||||
│ │
|
||||
▼ ▼
|
||||
lib/identity/api.sx ── (identity/login) (identity/grant?) (identity/revoke) ──┐
|
||||
│ │
|
||||
└──────── grant + audit events → persist ; permission? → acl ──────────┘
|
||||
```
|
||||
|
||||
## Phase 1 — Sessions + tokens
|
||||
- [ ] `session.sx` — session process, create/lookup/expire
|
||||
- [ ] `token.sx` — issue/introspect/revoke (opaque, grant-backed)
|
||||
- [ ] `registry.sx` — route by subject/client
|
||||
- [ ] `api.sx` + tests + scoreboard + conformance.sh
|
||||
|
||||
## Phase 2 — OAuth2 flows
|
||||
- [ ] authorization-code flow as a message protocol
|
||||
- [ ] refresh + rotation; revocation cascades to issued tokens
|
||||
- [ ] tests: full code exchange, refresh, revoke-then-use (must fail)
|
||||
|
||||
## Phase 3 — Silent SSO + membership
|
||||
- [ ] `prompt=none` cross-app login (one session, many clients)
|
||||
- [ ] membership state + per-app grant projection
|
||||
- [ ] delegated grant-verification cache (mirror the Redis-cache pattern)
|
||||
|
||||
## Phase 4 — Audit + federation
|
||||
- [ ] every issue/refresh/revoke is a `persist` event; `(identity/audit subject)`
|
||||
- [ ] federated identity (peer-asserted subject) — advisory, trust-gated stub
|
||||
- [ ] tests: audit completeness, cross-instance subject mapping
|
||||
|
||||
## Progress log
|
||||
(loop fills this in)
|
||||
|
||||
## Blockers
|
||||
(loop fills this in)
|
||||
119
plans/persist-on-sx.md
Normal file
119
plans/persist-on-sx.md
Normal file
@@ -0,0 +1,119 @@
|
||||
# persist-on-sx: Durable state on the SX kernel
|
||||
|
||||
> **DRAFT outline.** Foundation subsystem — the durable substrate the other five
|
||||
> currently fake with in-memory mutable lists. Build this first.
|
||||
>
|
||||
> **"persist" = persistence / data store, NOT the shop.** The shop/commerce vertical
|
||||
> is `commerce-on-sx`.
|
||||
|
||||
rose-ash needs durable state: every subsystem (feed log, flow store, mod audit,
|
||||
search index, acl grants, sessions) today hand-rolls an in-memory structure that
|
||||
vanishes on restart. `persist-on-sx` is the one durable substrate they share. It
|
||||
lives directly on the SX kernel's IO-suspension primitives (`perform`/`cek-resume`
|
||||
— the third CEK phase), so each read/write is a `perform` and the kernel persists at the
|
||||
boundary. Concrete storage backends are injected.
|
||||
|
||||
## Does it cover ALL persistence? No — and on purpose.
|
||||
|
||||
Event-sourcing-everything is a known trap (replay cost, event schema evolution,
|
||||
awkward ad-hoc queries, 5MB images in a log). So persist owns the **durable
|
||||
source-of-truth substrate**, exposed as **two facets over one backend protocol**,
|
||||
with two things explicitly delegated out:
|
||||
|
||||
| Shape | Owner | Notes |
|
||||
|-------|-------|-------|
|
||||
| **Event streams** (append-only, history matters) | persist — **log facet** | feed activities, mod audit, order ledger, flow state, content edits |
|
||||
| **Current-state values** (KV / document, no history) | persist — **kv facet** | profiles, stock counts, config, session blobs; also where projections materialize |
|
||||
| **Snapshots / read models** (derived, queryable) | persist — projections → kv/log | rebuildable from the log; persisted so you don't replay to answer a query |
|
||||
| **Blobs / large objects** (images, media) | **delegated** → content-addressed store (artdag/IPFS already) | persist stores the *reference/CID*, never the bytes |
|
||||
| **Cache** (ephemeral, evictable) | **out of scope** | not persistence — different lifecycle (Redis-shaped) |
|
||||
| **Ad-hoc relational query** | the subsystem, over a projected read model | the log is bad at "all orders by X in March"; project into a queryable kv/SQL backend |
|
||||
|
||||
So: persist is the **single durable substrate** for state that's either a stream of
|
||||
changes or a current value — but it does **not** force everything into an event
|
||||
log, it does **not** hold blobs (only their content-addressed refs), and it does
|
||||
**not** do caching. Those boundaries are the whole point of calling it a substrate
|
||||
rather than "the database."
|
||||
|
||||
End-state: `log` (append/read streams) + `kv` (get/put/delete by key) facets, an
|
||||
injectable backend protocol (mem → file → Postgres → IPFS-ref), pure projections
|
||||
with incremental snapshots, optimistic concurrency, and a subscription hook so
|
||||
read models (feeds, indices, audit logs) update incrementally.
|
||||
|
||||
## Status (rolling)
|
||||
|
||||
`bash lib/persist/conformance.sh` → **0/0** (not yet started)
|
||||
|
||||
## Ground rules
|
||||
|
||||
- **Scope:** only `lib/persist/**` and `plans/persist-on-sx.md`. May **import** the
|
||||
kernel's IO-suspension surface (`perform`, platform IO ops) — verify what's
|
||||
exported first. Do not add host primitives; a missing durable IO op is a Blockers
|
||||
entry (it belongs in `hosts/`, out of scope).
|
||||
- **Architecture:** an event is `{:stream :seq :type :at :data}`; the log is an
|
||||
ordered append-only vector; a projection is `(fold step seed events)`; a kv value
|
||||
is `(get/put/delete key)`. Both facets sit on one injected backend
|
||||
`{:append :read :kv-get :kv-put :kv-delete :snapshot-read :snapshot-write}`. The in-memory
|
||||
backend is the test default; real backends wire in unchanged.
|
||||
- **Determinism:** replay is pure — same log → same state, always. No clocks or
|
||||
randomness inside projections; time lives on the event.
|
||||
- **Blobs:** store the content-address/CID and metadata; never the bytes. The blob
|
||||
backend is a separate injected dependency.
|
||||
- **Commits:** one feature per commit. Progress log + tick boxes.
|
||||
|
||||
## Architecture sketch
|
||||
|
||||
```
|
||||
Command / write Read model / value
|
||||
(append stream type data) (project stream step seed)
|
||||
(kv-put key value) (kv-get key)
|
||||
│ ▲
|
||||
▼ │
|
||||
lib/persist/event.sx lib/persist/project.sx
|
||||
— {:stream :seq :type :at :data} — fold step seed; incremental from snapshot
|
||||
│ ▲
|
||||
▼ │
|
||||
lib/persist/log.sx lib/persist/kv.sx lib/persist/snapshot.sx
|
||||
— append/read — get/put/delete — checkpoint; replay = snapshot + tail
|
||||
— optimistic seq — current-state
|
||||
│ │ ▲
|
||||
└──────────────────┴── (perform → backend) ───┘
|
||||
│
|
||||
lib/persist/backend.sx lib/persist/api.sx
|
||||
— injected protocol — (persist/append) (persist/project)
|
||||
— mem | file | pg | ipfs-ref — (persist/kv-get/put) (persist/subscribe)
|
||||
│
|
||||
└── blobs → content-addressed store (artdag/IPFS), by reference only
|
||||
```
|
||||
|
||||
## Phase 1 — Log + kv + in-memory backend
|
||||
- [ ] `event.sx` — event record, stream/seq helpers
|
||||
- [ ] `backend.sx` — injectable protocol + in-memory impl (log + kv)
|
||||
- [ ] `log.sx` — `append` (optimistic seq), `read`, `read-from`
|
||||
- [ ] `kv.sx` — `get`/`put`/`delete` current-state
|
||||
- [ ] `api.sx` + tests + scoreboard + conformance.sh
|
||||
|
||||
## Phase 2 — Projections + subscriptions
|
||||
- [ ] `project.sx` — `(project stream step seed)`, incremental fold
|
||||
- [ ] subscription hook — projection / kv read model re-runs on append
|
||||
- [ ] concurrency conflict surfaced as a real result, not a crash
|
||||
|
||||
## Phase 3 — Snapshots + replay
|
||||
- [ ] `snapshot.sx` — checkpoint a projection; replay = snapshot + tail
|
||||
- [ ] compaction policy; replay-determinism tests
|
||||
|
||||
## Phase 4 — Durable backends via kernel IO
|
||||
- [ ] file/log backend driven through `perform` (IO-suspension boundary)
|
||||
- [ ] blob backend interface (store ref/CID; bytes live in artdag/IPFS)
|
||||
- [ ] crash/restart replay test (mock IO platform)
|
||||
- [ ] migration notes for swapping mem → durable under a live subsystem
|
||||
|
||||
## Consumers (post-foundation, not in scope here)
|
||||
feed/-log, flow store, mod/audit, search index, acl grants, identity sessions all
|
||||
become `persist` log or kv. Track each migration in that subsystem's plan.
|
||||
|
||||
## Progress log
|
||||
(loop fills this in)
|
||||
|
||||
## Blockers
|
||||
(loop fills this in)
|
||||
@@ -10,7 +10,7 @@ extension that merges per-peer indices.
|
||||
|
||||
## Status (rolling)
|
||||
|
||||
`bash lib/search/conformance.sh` → **122/122** (Phases 1–4 complete)
|
||||
`bash lib/search/conformance.sh` → **0/0** (not yet started)
|
||||
|
||||
## Ground rules
|
||||
|
||||
@@ -61,148 +61,46 @@ lib/search/index.sx lib/search/eval.sx
|
||||
|
||||
## Phase 1 — Tokenize + index
|
||||
|
||||
- [x] `lib/search/tokenize.sx` — normalize (lowercase, strip punctuation), split on
|
||||
- [ ] `lib/search/tokenize.sx` — normalize (lowercase, strip punctuation), split on
|
||||
whitespace, return positions
|
||||
- [x] `lib/search/index.sx` — inverted index data structure; `indexDoc`, `deleteDoc`,
|
||||
`lookupTerm`, `docFreq`, `allTerms`. (Data.Map's public API lacks
|
||||
toList/keys/map/filter, so a sorted assoc-list `[(Term,[(DocId,[Pos])])]` is used —
|
||||
the conceptual `Map Term [(DocId,[Pos])]` with free term iteration.)
|
||||
- [x] `lib/search/api.sx` — assembles `search/src` (tokenize + index); Haskell entry
|
||||
points `indexDoc` / `lookupTerm`
|
||||
- [x] `lib/search/tests/index.sx` — 18 cases: tokenize, insert + lookup, update,
|
||||
delete, multi-doc, positions, docFreq, allTerms
|
||||
- [x] `lib/search/scoreboard.{json,md}`
|
||||
- [x] `lib/search/conformance.sh`
|
||||
- [ ] `lib/search/index.sx` — inverted index data structure (typed `Map` from
|
||||
haskell lib); `insert`, `delete`, `lookup`
|
||||
- [ ] `lib/search/api.sx` — `(search/index doc)`, `(search/lookup term)`
|
||||
- [ ] `lib/search/tests/index.sx` — 15+ cases: tokenize, insert + lookup, update,
|
||||
delete, multi-doc
|
||||
- [ ] `lib/search/scoreboard.{json,md}`
|
||||
- [ ] `lib/search/conformance.sh`
|
||||
|
||||
## Phase 2 — Query AST + boolean evaluation
|
||||
|
||||
- [x] Query ADT: `Term String | And Query Query | Or Query Query | Not Query |
|
||||
Phrase [String]` (in `lib/search/query.sx`)
|
||||
- [x] `lib/search/parse.sx` — query syntax parser: tokenizer + recursive-descent
|
||||
(OR < AND < NOT precedence, implicit AND on adjacency, quoted phrases, parens,
|
||||
case-insensitive keywords); `parseQuery`, `searchQuery`, `showQ`
|
||||
- [x] `lib/search/query.sx` — boolean eval via set ops on docid-sorted posting lists
|
||||
(sortedUnion/Inter/Diff, Not over allDocs universe)
|
||||
- [x] phrase eval — positional adjacency check (phraseInDoc / phraseStartsAt)
|
||||
- [x] `lib/search/tests/boolean.sx` — 28 cases: term, and, or, not, phrase,
|
||||
composition (parser edge cases move to the parse.sx suite)
|
||||
- [ ] Query ADT: `Term Text | And Query Query | Or Query Query | Not Query |
|
||||
Phrase [Text]`
|
||||
- [ ] `lib/search/parse.sx` — query syntax parser (boolean operators, quoted phrases)
|
||||
- [ ] `lib/search/eval.sx` — boolean eval via set ops on posting lists
|
||||
- [ ] phrase eval — adjacency check using positions
|
||||
- [ ] `lib/search/tests/boolean.sx` — 25+ cases: term, and, or, not, phrase,
|
||||
composition, parser edge cases
|
||||
|
||||
## Phase 3 — Ranking
|
||||
|
||||
- [x] document frequency — `docFreq`/`idf`/`bm25idf` derived from the index
|
||||
(posting-list length); no separate df store needed
|
||||
- [x] TF-IDF scoring (`rankTfIdf`)
|
||||
- [x] BM25 scoring, configurable k1/b (`rankBm25 k1 b`)
|
||||
- [x] top-N retrieval (`topNTfIdf`/`topNBm25` — sortBy + take; stable DocId tiebreak)
|
||||
- [x] `lib/search/tests/rank.sx` — 23 cases: TF-IDF tf/idf behavior, BM25 length-norm
|
||||
+ tf-saturation flips vs TF-IDF, b-parameter effect, tiebreak stability, top-N
|
||||
- [ ] document frequency tracking — extend index with `df` per term
|
||||
- [ ] TF-IDF scoring
|
||||
- [ ] BM25 scoring (configurable k1, b)
|
||||
- [ ] top-N retrieval (heap-based)
|
||||
- [ ] `lib/search/tests/rank.sx` — 20+ cases: TF-IDF behavior, BM25 vs TF-IDF,
|
||||
ranking stability, top-N correctness
|
||||
|
||||
## Phase 4 — ACL filter + federation
|
||||
|
||||
- [x] post-filter — `aclFilter`/`searchTfIdfAcl`/`topNTfIdfAcl`/`searchBm25Acl` take an
|
||||
injected `permit :: DocId -> Bool` predicate, applied post-rank (never in the index)
|
||||
- [x] federated query — `fedIndex :: [(PeerId, Index)] -> Index` merges per-peer
|
||||
inverted indices (union posting lists per term); rank/search run once over the merge
|
||||
- [x] merge policy — relabel local DocIds to global `gid = peer*1000 + local`
|
||||
(bijection ⇒ dedupe by (peer,doc-id) is automatic); ranking interleaves peers by score
|
||||
- [x] `lib/search/tests/integration.sx` — 21 cases: index merge, cross-peer df/lookup,
|
||||
position preservation, boolean/phrase over the merge, ACL filter + top-N + bm25
|
||||
|
||||
## Extensions (post-roadmap, search-shaped vocabulary)
|
||||
|
||||
- [x] prefix / wildcard queries (`prefixTerms`, `prefixDocs`, `prefixRankTfIdf`) — 14 tests
|
||||
- [x] fuzzy matching — edit distance term expansion (`editDist`, `fuzzyTerms`,
|
||||
`fuzzyDocs`, `fuzzyRankTfIdf`) — 18 tests
|
||||
- [x] result pagination (offset / limit) — `paginate`, `pageTfIdf`, `pageBm25`,
|
||||
`resultCount` — 12 tests
|
||||
- [x] snippet / highlight generation (`highlight`, `snippet`) — 12 tests
|
||||
- [x] stemming (suffix stripping) — `stem`, `stemText`, `stemTokens`, `indexStemmed`
|
||||
— 18 tests
|
||||
- [x] proximity / NEAR — `nearDocs k t1 t2` (unordered, within k positions) — 9 tests
|
||||
- [x] synonym / query expansion — `expandTerm`, `synDocs`, `synRankTfIdf` — 9 tests
|
||||
- [x] boolean-filtered ranked search — `queryTerms`, `searchRankTfIdf`,
|
||||
`searchRankBm25` (filter by boolean query, rank survivors by relevance) — 11 tests
|
||||
- [x] did-you-mean / spelling suggestion — `suggest`, `suggestN` (closest indexed
|
||||
terms by edit distance, alphabetical tiebreak) — 9 tests
|
||||
- [ ] post-filter — each candidate result tested via `(acl/permit? viewer :read doc)`
|
||||
- [ ] federated query — fan out to peer instances via fed-sx, merge results
|
||||
- [ ] merge policy — interleave by rank, dedupe by `(peer, doc-id)`
|
||||
- [ ] `lib/search/tests/integration.sx` — federated search with ACL filter
|
||||
|
||||
## Progress log
|
||||
|
||||
- **Extension: did-you-mean / spelling suggestion (234/234 total).** `suggest`/`suggestN`
|
||||
rank indexed terms by edit distance to a (misspelled) query term, alphabetical
|
||||
tiebreak. 9 tests.
|
||||
- **Extension: boolean-filtered ranked search (225/225 total).** `searchRankTfIdf`/
|
||||
`searchRankBm25` parse a boolean query, filter docs via evalQuery, then rank the
|
||||
survivors by relevance over the query's leaf terms (`queryTerms`) — the real-world
|
||||
filter-then-rank pattern. 11 tests.
|
||||
- **Extension: synonyms/query expansion (214/214 total).** A synonym map
|
||||
`[(Term,[Term])]` expands a query term to itself + synonyms (`expandTerm`); `synDocs`
|
||||
unions, `synRankTfIdf` ranks the expanded set. 9 tests.
|
||||
- **Extension: proximity/NEAR (205/205 total).** `nearDocs k t1 t2 idx` returns docs
|
||||
where both terms occur within k positions (unordered), candidates = posting
|
||||
intersection, filtered on the positional postings. 9 tests.
|
||||
- **Extension: stemming (196/196 total).** Deterministic English suffix stripping
|
||||
(`stem`), `stemText`/`stemTokens`, `indexStemmed`. Two haskell-on-sx gotchas: take/drop
|
||||
over a String yield char CODES not char strings (rebuild via `joinChars . map chr`),
|
||||
and isSuffixOf's `reverse` trips `++` on the String repr (manual suffix compare). All
|
||||
five planned extensions now done; the loop can keep adding search vocabulary. 18 tests.
|
||||
- **Extension: highlight/snippet (178/178 total).** `highlight terms text` marks
|
||||
query-matching (normalized) tokens with [..]; `snippet ctx terms text` extracts a
|
||||
context window around the first match. 12 tests.
|
||||
- **Extension: fuzzy matching (166/166 total).** Levenshtein `editDist` as an O(m*n)
|
||||
row-based DP (the naive recursive version is exponential and times out under load),
|
||||
`fuzzyTerms`/`fuzzyDocs`/`fuzzyRankTfIdf` expand a term to indexed terms within a max
|
||||
edit distance. 18 tests.
|
||||
- **Extension: pagination (148/148 total).** `paginate off lim` windows a ranked list
|
||||
(take lim . drop off); `pageTfIdf`/`pageBm25` + `resultCount`. 12 tests. Note the
|
||||
full conformance now runs 8 suites sequentially and needs an overall timeout ~1900s
|
||||
under the heavy box load.
|
||||
- **Extension: prefix/wildcard queries (136/136 total).** `prefixTerms` matches every
|
||||
indexed term starting with a prefix (via allTerms + isPrefixOf); `prefixDocs` unions
|
||||
their docs; `prefixRankTfIdf` ranks treating the matched terms as the query. 14 tests.
|
||||
- **Phase 4 complete — federation + ACL (122/122 total). Roadmap done.** `fedIndex`
|
||||
merges per-peer inverted indices (union posting lists per term) after relabelling
|
||||
local DocIds to global `gid = peer*1000 + local` — the bijection makes (peer,doc-id)
|
||||
dedupe automatic and keeps positions, so ranking runs once over the merge and
|
||||
interleaves peers by score (rank-correct). ACL is a post-rank `filter` over an
|
||||
injected `permit :: DocId -> Bool` (viewer baked in by the caller) — never in the
|
||||
index; `searchTfIdfAcl`/`topNTfIdfAcl`/`searchBm25Acl`. 21 integration tests.
|
||||
- **Phase 3 complete — ranking (101/101 total).** TF-IDF (`rankTfIdf`) and BM25
|
||||
(`rankBm25 k1 b`) over the candidate set (docs containing any query term), scores
|
||||
as floats with deterministic DocId-ascending tiebreak; `topNTfIdf`/`topNBm25` via
|
||||
sortBy+take. df/idf derived from posting-list length (no separate df store). 23
|
||||
tests incl. a BM25-vs-TF-IDF flip (length-norm + tf-saturation) and the b-parameter
|
||||
effect. Float division/`log`/float literals all work in haskell-on-sx.
|
||||
- **Phase 2 complete — parser (78/78 total).** Query tokenizer (ord-based
|
||||
delimiters, quoted phrases) + recursive-descent parser with OR<AND<NOT precedence,
|
||||
implicit AND on adjacency, parens, case-insensitive keywords. `parseQuery`,
|
||||
`searchQuery`, `showQ` (canonical render for AST tests). 32 tests in parse.sx.
|
||||
**haskell-on-sx parser gotchas hit while writing this (see parse.sx header):**
|
||||
(1) escaped char literals like `'\"'` break the tokenizer — match delimiters by
|
||||
`ord c == 34`; (2) an `[]` *pattern* inside a `case` alt breaks the parser — use
|
||||
multi-clause functions instead; (3) `case`/constructor patterns and `let (a,b)=..`
|
||||
are fine. Embedded Haskell string literals in a `.sx` source string need single
|
||||
`\"`, not `\\\"`.
|
||||
- **Phase 2 boolean/phrase eval (46/46 total).** Query ADT
|
||||
`Term|And|Or|Not|Phrase` + `evalQuery :: Index -> Query -> [DocId]` in query.sx.
|
||||
Boolean ops are linear merges over docid-sorted posting lists; Not subtracts from
|
||||
the allDocs universe; Phrase checks positional adjacency. 28 tests in boolean.sx.
|
||||
Refactored both suites to **batch all cases into one program eval** (search-batch
|
||||
in testlib) — under the heavy CPU load on this box (load ~11 on 2 cores), 18–28 separate
|
||||
hk-eval-program calls timed out; one combined eval per suite is ~20× faster.
|
||||
Parser (parse.sx) is the remaining Phase 2 box.
|
||||
- **Phase 1 complete (18/18).** Tokenizer (lowercase + strip punctuation + positions),
|
||||
inverted index as sorted assoc-list `[(Term,[(DocId,[Pos])])]`, indexDoc/deleteDoc/
|
||||
lookupTerm/docFreq/allTerms. Search lib is Haskell source assembled into `search/src`
|
||||
and evaluated via the haskell-on-sx interpreter; tests reuse `hk-test` counters and a
|
||||
`search-eval` helper that forces HK values to plain SX. conformance.sh is modelled on
|
||||
lib/haskell (MODE=counters, COUNTERS_PASS/FAIL=hk-test-pass/fail).
|
||||
(loop fills this in)
|
||||
|
||||
## Blockers
|
||||
|
||||
- **None.** Note: the box is heavily CPU-oversubscribed by sibling loop agents
|
||||
(load ~11 on 2 cores); each program eval is ~10× slower than nominal, so suite
|
||||
timeout is set to 600s. Runs are correct, just slow.
|
||||
- **Data.Map public API gap (informational, not fixing):** the haskell-on-sx
|
||||
`import Data.Map` binds only empty/singleton/insert/lookup/member/size/null/delete/
|
||||
insertWith/adjust/findWithDefault — no toList/keys/elems/map/filter/unionWith. Index
|
||||
uses a pure assoc-list instead so term iteration and federation merge stay simple.
|
||||
(loop fills this in)
|
||||
|
||||
Reference in New Issue
Block a user