Merge loops/sx-ws-w14 into architecture: W14 test gate & conformance infra
17 test-only commits delivering the full W14 workstream (PLAN.md §W14 —
the enabler every other sx-review fix verifies against):
- spec/tests/test-gate-pins.sx: 7 pin suites (29 tests) for dc7aa709's
landed fixes — K18, K20, K09/K11/K39, K49 (spec side), crit-2
(non-vacuous via side-effect sentinel), plus C21/C22 harness pins
- 6 gate scripts, all bidirectional ledgers (a healed KNOWN entry also
fails): test-protocol-gate (C1/C1b/S4 + C3-C7 quirk ledger + seeded
fuzz-liveness, 11), test-env-parity (runner-only bindings, 7),
test-harness-parity (mcp_tree vs sx_server, 12), test-wasm-corpus
(shipped kernel: 80/83 files green, 5192 passes), test-suite-baseline
(273-failure band pinned in spec/tests/known-failures.txt),
test-differential (49 probes native vs WASM, 3 ledgered)
- spec/harness.sx: C22 fix (IO logged before the mock runs) + C21
harness-run-perform (real CEK suspend/resume mode); W14-assigned per
PLAN approach item 4 — see merge note in the briefing re: the forge
briefing's stricter wording
- C9: empty suite labels eliminated across 6 test files
- web/tests/test-adapter-dom-render.sx: first render-output coverage of
the DOM adapter (the browser-only exclusion was false)
Confirmed handoffs recorded in the briefing: bare-server apply does not
spread args (F-3, runner masks it); both runners' sha3-256 are fake
stubs (test CIDs != production CIDs); generated sx_render.ml is regen-
stale (misses dc7aa709's HTML_TAGS fix); canonical-serialize broken on
bare server for any number.
Verified post-merge in this checkout: gate pins 275/0, protocol-gate
11/0, env-parity 7/0, harness-parity 12/0, differential 49/0.
Briefing conflict (add/add) resolved: kept the loop's completed version
with a merge note preserving the forge briefing's context (8181421c
landed after the worktree branched).
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
94
scripts/test-differential.sh
Executable file
94
scripts/test-differential.sh
Executable file
@@ -0,0 +1,94 @@
|
||||
#!/bin/bash
|
||||
# test-differential.sh — W14/F8: cross-host differential battery.
|
||||
#
|
||||
# Evaluates every expression in spec/tests/differential-probes.txt on:
|
||||
# A) the native server (sx_server.exe, epoch protocol) — its printer
|
||||
# B) the SHIPPED browser kernel (eval_wasm_probes.js, guest sx-serialize)
|
||||
# and diffs the outputs. The review's original 130-probe corpus was
|
||||
# ephemeral (F-8); this is the committed replacement.
|
||||
#
|
||||
# KNOWN_DIVERGENT is the ledger of confirmed, still-open divergences —
|
||||
# keyed by the probe EXPRESSION. Red on a NEW divergence (host drift) and
|
||||
# red on a HEALED one (fix landed: delete the entry, locking in parity).
|
||||
#
|
||||
# Method note (finding refinement, 2026-07-04): comparing raw K.eval
|
||||
# JS-boundary values shows float-display divergences (0.3 vs
|
||||
# 0.30000000000000004) that DISAPPEAR under guest-level (sx-serialize …) —
|
||||
# the F-1 float-display class is a JS-boundary artifact, not a kernel
|
||||
# serialization divergence. This battery compares guest serialization.
|
||||
set -uo pipefail
|
||||
cd "$(dirname "$0")/.."
|
||||
|
||||
SERVER=hosts/ocaml/_build/default/bin/sx_server.exe
|
||||
WASM=shared/static/wasm/sx_browser.bc.wasm.js
|
||||
PROBES=spec/tests/differential-probes.txt
|
||||
[[ -x "$SERVER" ]] || { echo "SKIP: $SERVER not built" >&2; exit 2; }
|
||||
[[ -f "$WASM" ]] || { echo "SKIP: $WASM missing" >&2; exit 2; }
|
||||
|
||||
# --- KNOWN_DIVERGENT ledger (verified live 2026-07-04) -------------------
|
||||
# F-3/K53: bare sx_server's `apply` does NOT spread its argument list —
|
||||
# (apply + (list 1 2 3)) errors "Expected number, got list"; (apply str l)
|
||||
# returns the serialized list as one string. The WASM kernel spreads
|
||||
# correctly. The test runner masks this with its own apply (F-7 class).
|
||||
declare -A KNOWN_DIVERGENT
|
||||
KNOWN_DIVERGENT['(apply + (list 1 2 3))']="F-3: native apply does not spread"
|
||||
KNOWN_DIVERGENT['(apply max (list 1 5 2))']="F-3: native apply does not spread"
|
||||
KNOWN_DIVERGENT['(apply str (list "a" "b"))']="F-3: native apply does not spread"
|
||||
|
||||
native=$(mktemp); wasm=$(mktemp)
|
||||
|
||||
python3 - "$SERVER" "$PROBES" > "$native" <<'PY'
|
||||
import json, subprocess, sys
|
||||
server, probefile = sys.argv[1], sys.argv[2]
|
||||
probes = [l.strip() for l in open(probefile) if l.strip() and not l.startswith('#')]
|
||||
inp = []
|
||||
for i, p in enumerate(probes):
|
||||
inp.append(f"(epoch {i+1})")
|
||||
inp.append(f"(eval {json.dumps(p)})")
|
||||
out = subprocess.run(["timeout", "120", server], input="\n".join(inp) + "\n",
|
||||
capture_output=True, text=True).stdout
|
||||
res, cur = {}, None
|
||||
for l in out.splitlines():
|
||||
if l.startswith("(ok-len "):
|
||||
cur = int(l.split()[1]); res[cur] = None
|
||||
elif l.startswith("(error "):
|
||||
idx = int(l.split()[1]); res[idx] = "ERROR"; cur = None
|
||||
elif cur is not None and res.get(cur) is None:
|
||||
res[cur] = l; cur = None
|
||||
for i, p in enumerate(probes):
|
||||
print(f"PROBE {i+1} {res.get(i+1, '<none>')}")
|
||||
PY
|
||||
|
||||
timeout 300 node hosts/ocaml/browser/eval_wasm_probes.js "$PROBES" > "$wasm" 2>/dev/null
|
||||
|
||||
pass=0; fail=0; i=0
|
||||
while IFS= read -r expr; do
|
||||
[[ -z "$expr" || "$expr" == \#* ]] && continue
|
||||
i=$((i+1))
|
||||
a=$(sed -n "s/^PROBE $i //p" "$native")
|
||||
b=$(sed -n "s/^PROBE $i //p" "$wasm")
|
||||
known="${KNOWN_DIVERGENT[$expr]:-}"
|
||||
if [[ "$a" == "$b" ]]; then
|
||||
if [[ -n "$known" ]]; then
|
||||
echo "RED: $expr — KNOWN_DIVERGENT now AGREES ($known); delete from ledger"
|
||||
fail=$((fail+1))
|
||||
else
|
||||
pass=$((pass+1))
|
||||
fi
|
||||
else
|
||||
if [[ -n "$known" ]]; then
|
||||
echo "KNOWN-DIVERGENT: $expr ($known)"
|
||||
pass=$((pass+1))
|
||||
else
|
||||
echo "RED: $expr"
|
||||
echo " native: $a"
|
||||
echo " wasm: $b"
|
||||
fail=$((fail+1))
|
||||
fi
|
||||
fi
|
||||
done < <(grep -v '^\s*#' "$PROBES" | grep -v '^\s*$')
|
||||
|
||||
rm -f "$native" "$wasm"
|
||||
echo
|
||||
echo "differential: $i probes, $pass in agreement/ledgered, $fail red"
|
||||
[[ $fail -eq 0 ]]
|
||||
100
scripts/test-env-parity.sh
Executable file
100
scripts/test-env-parity.sh
Executable file
@@ -0,0 +1,100 @@
|
||||
#!/bin/bash
|
||||
# test-env-parity.sh — W14 section-B ledger: runner env vs production env.
|
||||
#
|
||||
# The review (F7, K42, JS5, core.md "canonical.sx depends on test-runner-only
|
||||
# helpers") found bindings that exist ONLY in the test runners, so suites
|
||||
# pass against an environment production never provides. Rule (PLAN.md W14):
|
||||
# "if the spec needs it, it's a kernel primitive; if not, the test can't
|
||||
# have it."
|
||||
#
|
||||
# This script is a LEDGER, not a wish: it asserts today's confirmed drift
|
||||
# stays exactly as recorded. Both directions fail loudly:
|
||||
# - a MUST_HAVE going missing on the server -> regression, fix the kernel
|
||||
# - a KNOWN_DRIFT binding appearing on the server -> the fix landed;
|
||||
# move it to MUST_HAVE and update the consequence pins below.
|
||||
#
|
||||
# Confirmed inventory (2026-07-04, all verified live over the epoch protocol):
|
||||
#
|
||||
# binding OCaml runner JS runner fresh sx_server
|
||||
# values real (rt.ml:1131) ? ABSENT
|
||||
# call-with-values real (rt.ml:1140) ? ABSENT
|
||||
# contains-char? real (rt.ml:728) real (:85) ABSENT
|
||||
# trim-right ABSENT real (:87) ABSENT
|
||||
# sha3-256 FAKE Hashtbl.hash FAKE stub ABSENT (real = crypto-sha3-256)
|
||||
#
|
||||
# Consequences (pinned in section 3):
|
||||
# - (canonical-serialize 42) on a fresh server errors "Undefined symbol:
|
||||
# contains-char?" -> content addressing broken for ANY number outside
|
||||
# the test runners.
|
||||
# - every CID computed inside run_tests uses a FAKE hash, so test CIDs
|
||||
# never equal production CIDs (crypto-sha3-256 is real SHA3).
|
||||
#
|
||||
# Each probe spawns its OWN timeout-bounded sx_server.exe. No shared process.
|
||||
set -uo pipefail
|
||||
|
||||
cd "$(dirname "$0")/.."
|
||||
SERVER=hosts/ocaml/_build/default/bin/sx_server.exe
|
||||
|
||||
if [[ ! -x "$SERVER" ]]; then
|
||||
echo "SKIP: $SERVER not built (run sx_build target=ocaml first)" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
pass=0
|
||||
fail=0
|
||||
|
||||
# deps_unresolved EXPR -> prints the (unresolved ...) list for EXPR on a fresh server
|
||||
deps_unresolved() {
|
||||
printf '(epoch 1)\n(deps-check "%s")\n' "$1" \
|
||||
| timeout 60 "$SERVER" 2>/dev/null \
|
||||
| grep -o ':unresolved ([^)]*)' || true
|
||||
}
|
||||
|
||||
# --- Section 1: MUST_HAVE — spec-needed bindings production must provide ---
|
||||
MUST_HAVE_EXPR='(list (equal? 1 1) (apply + (list 1 2)) (contains? {:a 1} :a) (crypto-sha3-256 \"x\") (split \"a-b\" \"-\"))'
|
||||
unres=$(deps_unresolved "$MUST_HAVE_EXPR")
|
||||
if [[ -z "$unres" || "$unres" == ':unresolved ()' ]]; then
|
||||
echo "PASS: MUST_HAVE core bindings all resolve on fresh sx_server"
|
||||
pass=$((pass+1))
|
||||
else
|
||||
echo "FAIL: MUST_HAVE binding missing on fresh sx_server: $unres"
|
||||
fail=$((fail+1))
|
||||
fi
|
||||
|
||||
# --- Section 2: KNOWN_DRIFT — runner-only bindings, asserted ABSENT -------
|
||||
# If one of these starts resolving, its kernel fix landed: move it to
|
||||
# MUST_HAVE above and update the consequence pin in section 3.
|
||||
for name in values call-with-values contains-char? trim-right sha3-256; do
|
||||
unres=$(deps_unresolved "($name)")
|
||||
if grep -q -- "$name" <<<"$unres"; then
|
||||
echo "PASS: KNOWN_DRIFT '$name' still absent on fresh sx_server (ledger accurate)"
|
||||
pass=$((pass+1))
|
||||
else
|
||||
echo "FAIL: KNOWN_DRIFT '$name' now RESOLVES on fresh sx_server — fix landed?"
|
||||
echo " Update this ledger: move '$name' to MUST_HAVE and revisit section 3."
|
||||
fail=$((fail+1))
|
||||
fi
|
||||
done
|
||||
|
||||
# --- Section 3: consequence pin — canonical.sx on the production server ---
|
||||
# Current reality: canonical-serialize of ANY number errors on a fresh
|
||||
# server because canonical-number calls runner-only contains-char?.
|
||||
out=$(printf '(epoch 1)\n(load "spec/canonical.sx")\n(epoch 2)\n(eval "(canonical-serialize 42)")\n' \
|
||||
| timeout 60 "$SERVER" 2>&1)
|
||||
if grep -q 'error 2 .*contains-char?' <<<"$out"; then
|
||||
echo "PASS: consequence pin — canonical-serialize on numbers still broken on server (as recorded)"
|
||||
pass=$((pass+1))
|
||||
elif grep -q '^(ok 2 ' <<<"$out"; then
|
||||
echo "FAIL: consequence pin — canonical-serialize 42 now WORKS on the server."
|
||||
echo " The canonical-helpers fix landed: flip this pin to assert success"
|
||||
echo " and pin the exact canonical form + CID stability."
|
||||
fail=$((fail+1))
|
||||
else
|
||||
echo "FAIL: consequence pin — unexpected server output:"
|
||||
sed 's/^/ /' <<<"$out"
|
||||
fail=$((fail+1))
|
||||
fi
|
||||
|
||||
echo
|
||||
echo "env-parity: $pass passed, $fail failed"
|
||||
[[ $fail -eq 0 ]]
|
||||
107
scripts/test-harness-parity.sh
Executable file
107
scripts/test-harness-parity.sh
Executable file
@@ -0,0 +1,107 @@
|
||||
#!/bin/bash
|
||||
# test-harness-parity.sh — W14 section-C pin for K19 (harness honesty).
|
||||
#
|
||||
# K19 (review, core.md): the MCP tree server (mcp_tree.ml) carries a
|
||||
# PARALLEL primitive table, and it drifted from the real runtime
|
||||
# (sx_primitives.ml) — e.g. (get {:a 1} :a 99) returned nil in the harness
|
||||
# but 1 in production, (split "a--b" "--") was char-class vs substring.
|
||||
# CLAUDE.md mandates harness verification, so drift silently produces
|
||||
# false findings/passes. dc7aa709 aligned 8 entries as a stopgap; the real
|
||||
# fix (mcp_tree links sx_primitives directly) is hosts-lane work.
|
||||
#
|
||||
# This pin runs the finding's exact probe battery through BOTH environments
|
||||
# — mcp_tree.exe sx_eval (JSON-RPC over stdio) and a fresh sx_server.exe
|
||||
# (epoch protocol) — and fails on ANY divergence. Errors are compared by
|
||||
# message, values by serialized form. Both subprocesses are fresh and
|
||||
# timeout-bounded; no shared process is touched.
|
||||
#
|
||||
# Exit: 0 = full parity; 1 = drift (harness lies about the runtime again).
|
||||
set -uo pipefail
|
||||
cd "$(dirname "$0")/.."
|
||||
|
||||
MCP=hosts/ocaml/_build/default/bin/mcp_tree.exe
|
||||
SERVER=hosts/ocaml/_build/default/bin/sx_server.exe
|
||||
for bin in "$MCP" "$SERVER"; do
|
||||
if [[ ! -x "$bin" ]]; then
|
||||
echo "SKIP: $bin not built (run sx_build target=ocaml first)" >&2
|
||||
exit 2
|
||||
fi
|
||||
done
|
||||
|
||||
python3 - "$MCP" "$SERVER" <<'PYEOF'
|
||||
import json, re, subprocess, sys
|
||||
|
||||
MCP, SERVER = sys.argv[1], sys.argv[2]
|
||||
|
||||
# K19 probe battery — the finding's confirmed drift cases + stopgap entries.
|
||||
PROBES = [
|
||||
'(empty? "")', '(empty? {})',
|
||||
'(get {:a 1} :a 99)', '(get {:a 1} :zz 99)', '(get (list 10 20) 1)',
|
||||
'(split "a--b" "--")', '(split "abc" "")',
|
||||
'(equal? (list 1 2) (list 1 2))',
|
||||
'(contains? {:a 1} :a)', '(keyword-name :kw)',
|
||||
'(char-code "A")', '(parse-number "42")',
|
||||
]
|
||||
|
||||
def norm_error(msg):
|
||||
"""Extract the quoted inner error message so harness/server error
|
||||
envelopes compare equal when the underlying failure is the same."""
|
||||
m = re.search(r'Unhandled exception: \\?"(.*?)\\?"', msg)
|
||||
if m:
|
||||
return "<ERROR> " + m.group(1)
|
||||
return "<ERROR> " + msg.strip()[:80]
|
||||
|
||||
# --- harness side: mcp_tree sx_eval over JSON-RPC ---
|
||||
lines = [
|
||||
json.dumps({"jsonrpc": "2.0", "id": 1, "method": "initialize",
|
||||
"params": {"protocolVersion": "2024-11-05", "capabilities": {},
|
||||
"clientInfo": {"name": "parity", "version": "0"}}}),
|
||||
json.dumps({"jsonrpc": "2.0", "method": "notifications/initialized"}),
|
||||
]
|
||||
for i, p in enumerate(PROBES):
|
||||
lines.append(json.dumps({"jsonrpc": "2.0", "id": 100 + i, "method": "tools/call",
|
||||
"params": {"name": "sx_eval", "arguments": {"expr": p}}}))
|
||||
out = subprocess.run(["timeout", "60", MCP], input="\n".join(lines) + "\n",
|
||||
capture_output=True, text=True).stdout
|
||||
harness = {}
|
||||
for l in out.splitlines():
|
||||
try:
|
||||
j = json.loads(l)
|
||||
except ValueError:
|
||||
continue
|
||||
if isinstance(j.get("id"), int) and j["id"] >= 100:
|
||||
txt = j.get("result", {}).get("content", [{}])[0].get("text", "<none>").strip()
|
||||
if txt.startswith("Error:") or j.get("result", {}).get("isError"):
|
||||
txt = norm_error(txt)
|
||||
harness[j["id"] - 100] = txt
|
||||
|
||||
# --- server side: fresh sx_server over the epoch protocol ---
|
||||
inp = []
|
||||
for i, p in enumerate(PROBES):
|
||||
inp.append(f"(epoch {i + 1})")
|
||||
inp.append(f"(eval {json.dumps(p)})")
|
||||
sout = subprocess.run(["timeout", "60", SERVER], input="\n".join(inp) + "\n",
|
||||
capture_output=True, text=True).stdout
|
||||
server, cur = {}, None
|
||||
for l in sout.splitlines():
|
||||
if l.startswith("(ok-len "):
|
||||
cur = int(l.split()[1]); server[cur - 1] = None
|
||||
elif l.startswith("(error "):
|
||||
idx = int(l.split()[1]); server[idx - 1] = norm_error(l); cur = None
|
||||
elif cur is not None and server.get(cur - 1) is None:
|
||||
server[cur - 1] = l.strip(); cur = None
|
||||
|
||||
fails = 0
|
||||
for i, p in enumerate(PROBES):
|
||||
h = harness.get(i, "<missing>")
|
||||
s = server.get(i, "<missing>")
|
||||
if h == s:
|
||||
print(f"PASS: {p:40s} both -> {h!r}")
|
||||
else:
|
||||
print(f"FAIL: {p:40s} harness={h!r} server={s!r}")
|
||||
fails += 1
|
||||
|
||||
print()
|
||||
print(f"harness-parity: {len(PROBES) - fails} passed, {fails} failed")
|
||||
sys.exit(1 if fails else 0)
|
||||
PYEOF
|
||||
233
scripts/test-protocol-gate.sh
Executable file
233
scripts/test-protocol-gate.sh
Executable file
@@ -0,0 +1,233 @@
|
||||
#!/bin/bash
|
||||
# test-protocol-gate.sh — W14 pins for the epoch/command-channel protocol.
|
||||
#
|
||||
# Pins C1/C1b (review, plans/sx-review/hosts.md): a malformed or non-ASCII
|
||||
# line on the top-level command channel used to raise an uncaught
|
||||
# Sx_types.Parse_error and KILL the whole sx_server process (the shared
|
||||
# channel used by bridges and conformance runners). Fixed in dc7aa709:
|
||||
# the server now answers `(error N "Malformed command line: ...")` and
|
||||
# keeps serving.
|
||||
#
|
||||
# Each case spawns its OWN timeout-bounded sx_server.exe subprocess —
|
||||
# no shared/sibling process is ever touched. Designed to grow into the
|
||||
# W14 section-E protocol fuzz suite (C3-C7).
|
||||
#
|
||||
# Usage: bash scripts/test-protocol-gate.sh
|
||||
# Exit: 0 = all pins green; 1 = a pin failed (fix regressed).
|
||||
set -uo pipefail
|
||||
|
||||
cd "$(dirname "$0")/.."
|
||||
SERVER=hosts/ocaml/_build/default/bin/sx_server.exe
|
||||
|
||||
if [[ ! -x "$SERVER" ]]; then
|
||||
echo "SKIP: $SERVER not built (run sx_build target=ocaml first)" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
pass=0
|
||||
fail=0
|
||||
|
||||
# run_case NAME INPUT EXPECT_SENTINEL
|
||||
# Feeds INPUT to a fresh server. Asserts:
|
||||
# 1. an (error ... "Malformed command line: ...") response is emitted
|
||||
# 2. the follow-up epoch still evaluates (EXPECT_SENTINEL in output)
|
||||
# 3. the process exits cleanly (no Fatal error, exit 0 on stdin EOF)
|
||||
run_case() {
|
||||
local name="$1" input="$2" sentinel="$3"
|
||||
local out rc
|
||||
out=$(printf '%b' "$input" | timeout 60 "$SERVER" 2>&1)
|
||||
rc=$?
|
||||
local ok=1
|
||||
if ! grep -q 'Malformed command line' <<<"$out"; then
|
||||
echo "FAIL: $name — no malformed-line error response"; ok=0
|
||||
fi
|
||||
if ! grep -q "^${sentinel}\$" <<<"$out"; then
|
||||
echo "FAIL: $name — follow-up epoch did not run (process died?)"; ok=0
|
||||
fi
|
||||
if grep -q 'Fatal error' <<<"$out"; then
|
||||
echo "FAIL: $name — Fatal error escaped to the top level"; ok=0
|
||||
fi
|
||||
if [[ $rc -ne 0 ]]; then
|
||||
echo "FAIL: $name — nonzero exit ($rc)"; ok=0
|
||||
fi
|
||||
if [[ $ok -eq 1 ]]; then
|
||||
echo "PASS: $name"
|
||||
pass=$((pass+1))
|
||||
else
|
||||
echo " --- output ---"; sed 's/^/ /' <<<"$out"; echo " --------------"
|
||||
fail=$((fail+1))
|
||||
fi
|
||||
}
|
||||
|
||||
# C1: unterminated list on the command channel (exact review repro)
|
||||
run_case "C1 unterminated list survives" \
|
||||
'(epoch 2)\n(eval "(+ 1 2"\n(epoch 3)\n(eval "99")\n' \
|
||||
'99'
|
||||
|
||||
# C1: plain-garbage line (second C1 repro shape)
|
||||
run_case "C1 garbage line survives" \
|
||||
'(epoch 1)\nnot an s-expr ]]] {{{\n(epoch 2)\n(eval "42")\n' \
|
||||
'42'
|
||||
|
||||
# C1b: non-ASCII byte on the command channel (exact review repro; \xc3\xa9 = é)
|
||||
run_case "C1b non-ASCII line survives" \
|
||||
'(epoch 1)\n(eval (quote caf\xc3\xa9))\n(epoch 2)\n(eval "99")\n' \
|
||||
'99'
|
||||
|
||||
# Control: a well-formed session still works end to end
|
||||
ctrl=$(printf '(epoch 1)\n(eval "(+ 40 2)")\n' | timeout 60 "$SERVER" 2>&1)
|
||||
if grep -q '^42$' <<<"$ctrl"; then
|
||||
echo "PASS: control well-formed session"
|
||||
pass=$((pass+1))
|
||||
else
|
||||
echo "FAIL: control well-formed session"; sed 's/^/ /' <<<"$ctrl"
|
||||
fail=$((fail+1))
|
||||
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# C3–C7 protocol-quirk LEDGER (hosts.md, all OPEN server-side). These pin
|
||||
# CURRENT behavior, verified live 2026-07-04 — they are documentation, not
|
||||
# endorsement. When a server fix lands and a pin fails, update the ledger
|
||||
# to assert the corrected behavior (bidirectional, like test-env-parity.sh).
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# ledger_case NAME INPUT GREP_MUST GREP_MUST2
|
||||
ledger_case() {
|
||||
local name="$1" input="$2" must="$3" must2="${4:-}"
|
||||
local out
|
||||
out=$(printf '%b' "$input" | timeout 60 "$SERVER" 2>&1)
|
||||
local ok=1
|
||||
grep -q -- "$must" <<<"$out" || { echo "FAIL: $name — expected: $must"; ok=0; }
|
||||
if [[ -n "$must2" ]]; then
|
||||
grep -q -- "$must2" <<<"$out" || { echo "FAIL: $name — expected: $must2"; ok=0; }
|
||||
fi
|
||||
if grep -q 'Fatal error' <<<"$out"; then
|
||||
echo "FAIL: $name — process died"; ok=0
|
||||
fi
|
||||
if [[ $ok -eq 1 ]]; then echo "PASS: $name"; pass=$((pass+1));
|
||||
else echo " --- output ---"; sed 's/^/ /' <<<"$out"; fail=$((fail+1)); fi
|
||||
}
|
||||
|
||||
# C3: stray (io-response ...) is answered as Unknown command (dead guard) —
|
||||
# an EXTRA response the client didn't ask for; process keeps serving.
|
||||
ledger_case "C3 ledger: stray io-response gets an extra error reply" \
|
||||
'(epoch 1)\n(io-response 1 42)\n(eval "5")\n' \
|
||||
'Unknown command: (io-response 1 42)' '^5$'
|
||||
|
||||
# C4: malformed (epoch) doesn't update the epoch — next reply tagged with
|
||||
# the OLD epoch (0 here), i.e. stale from the client's viewpoint.
|
||||
ledger_case "C4 ledger: malformed epoch marker leaves epoch stale" \
|
||||
'(epoch)\n(eval "2")\n' \
|
||||
'(ok-len 0 1)' '^2$'
|
||||
|
||||
# C5: no monotonic-epoch enforcement — a decreasing epoch is accepted.
|
||||
ledger_case "C5 ledger: decreasing epoch accepted silently" \
|
||||
'(epoch 9)\n(epoch 3)\n(eval "42")\n' \
|
||||
'(ok-len 3 2)' '^42$'
|
||||
|
||||
# C6: two commands on one line -> one error, NEITHER executed.
|
||||
ledger_case "C6 ledger: two commands on one line both dropped" \
|
||||
'(epoch 1)\n(eval "1") (eval "2")\n(eval "3")\n' \
|
||||
'Expected single command, got 2' '^3$'
|
||||
|
||||
# C7: vm-trace without the compiler loaded errors opaquely.
|
||||
ledger_case "C7 ledger: vm-trace sans compiler is opaque Not-callable-nil" \
|
||||
'(epoch 1)\n(vm-trace "(+ 1 2)")\n' \
|
||||
'Not callable: nil'
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fuzz-liveness property: after 60 deterministic hostile lines (unbalanced
|
||||
# parens, control chars, unicode, long lines, stray io-responses, epoch
|
||||
# mutations), the server must still answer a well-formed command and exit
|
||||
# cleanly. Seeded PRNG — reproducible corpus.
|
||||
# ---------------------------------------------------------------------------
|
||||
fuzz=$(python3 - <<'PY'
|
||||
import random
|
||||
r = random.Random(1404)
|
||||
lines = []
|
||||
frag = ['(', ')', '((', '))', '(eval', '(epoch', 'io-response', '"', '\\',
|
||||
'café', '\x01', '\x1b[2J', ':kw', '{', '}', '(+ 1', 'nil)', '#|', '|#']
|
||||
for i in range(60):
|
||||
kind = r.randrange(5)
|
||||
if kind == 0:
|
||||
lines.append(''.join(r.choice(frag) for _ in range(r.randrange(1, 8))))
|
||||
elif kind == 1:
|
||||
lines.append('(epoch ' + r.choice(['', 'foo', '-1', '999999999999999999999', ')']) + ')')
|
||||
elif kind == 2:
|
||||
lines.append('(io-response %d %s' % (r.randrange(99), r.choice([')', '', '42']) ))
|
||||
elif kind == 3:
|
||||
lines.append('x' * r.randrange(200, 2000))
|
||||
else:
|
||||
lines.append('(eval "' + r.choice(['(+ 1', '(list', '\\\\', '((((']) + '")')
|
||||
print('\n'.join(lines))
|
||||
PY
|
||||
)
|
||||
out=$(printf '%s\n(epoch 777)\n(eval "\\"alive\\"")\n' "$fuzz" | timeout 90 "$SERVER" 2>&1)
|
||||
rc=$?
|
||||
if grep -q '^"alive"$' <<<"$out" && ! grep -q 'Fatal error' <<<"$out" && [[ $rc -eq 0 ]]; then
|
||||
echo "PASS: fuzz-liveness — server survives 60 hostile lines and still answers"
|
||||
pass=$((pass+1))
|
||||
else
|
||||
echo "FAIL: fuzz-liveness (rc=$rc)"; tail -6 <<<"$out" | sed 's/^/ /'
|
||||
fail=$((fail+1))
|
||||
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# S4 (review, hosts.md): soft error pages must NOT be stored in the HTTP
|
||||
# response cache. Pre-fix, a routing-failure page was cached as HTTP 200 and
|
||||
# served byte-identically from cache to every later visitor (cold 2s → warm
|
||||
# 0.0005s, ONE render line). Post-fix (dc7aa709), http_render_page returns
|
||||
# (html, is_error) and cache insertion is gated on `not is_err` (the skip is
|
||||
# logged as "[cache] <path> → error page, not cached").
|
||||
#
|
||||
# Pin: GET the same nonexistent path twice against a fresh --http server and
|
||||
# assert BOTH requests re-render (two [sx-http] render lines) plus the
|
||||
# is_err gate line appearing in the log. NB: in a standalone worktree all
|
||||
# docs pages render as soft error pages (no content), so a positive
|
||||
# "real page IS cached" control is not assertable here.
|
||||
# ---------------------------------------------------------------------------
|
||||
s4_case() {
|
||||
local port=$((18000 + RANDOM % 2000))
|
||||
local log; log=$(mktemp)
|
||||
timeout 90 "$SERVER" --http "$port" >"$log" 2>&1 &
|
||||
local srv=$!
|
||||
local up=0
|
||||
for _ in $(seq 1 40); do
|
||||
if curl -s -o /dev/null "http://localhost:$port/" 2>/dev/null; then up=1; break; fi
|
||||
sleep 1
|
||||
done
|
||||
if [[ $up -ne 1 ]]; then
|
||||
echo "FAIL: S4 — http server did not come up on :$port"
|
||||
kill "$srv" 2>/dev/null; rm -f "$log"
|
||||
fail=$((fail+1)); return
|
||||
fi
|
||||
local miss="/sx/gate-pin-missing-$$-$RANDOM"
|
||||
curl -s -o /dev/null "http://localhost:$port$miss"
|
||||
curl -s -o /dev/null "http://localhost:$port$miss"
|
||||
sleep 1
|
||||
local renders
|
||||
renders=$(grep -c "sx-http\] $miss " "$log")
|
||||
local ok=1
|
||||
if [[ "$renders" -ne 2 ]]; then
|
||||
echo "FAIL: S4 — expected 2 renders of $miss (not cache-served), got $renders"
|
||||
ok=0
|
||||
fi
|
||||
if ! grep -q 'error page, not cached' "$log"; then
|
||||
echo "FAIL: S4 — is_err cache gate line absent from server log"
|
||||
ok=0
|
||||
fi
|
||||
if [[ $ok -eq 1 ]]; then
|
||||
echo "PASS: S4 soft error page not cached (both GETs re-rendered)"
|
||||
pass=$((pass+1))
|
||||
else
|
||||
echo " --- log tail ---"; tail -12 "$log" | sed 's/^/ /'; echo " ---------------"
|
||||
fail=$((fail+1))
|
||||
fi
|
||||
kill "$srv" 2>/dev/null
|
||||
rm -f "$log"
|
||||
}
|
||||
s4_case
|
||||
|
||||
echo
|
||||
echo "protocol-gate: $pass passed, $fail failed"
|
||||
[[ $fail -eq 0 ]]
|
||||
61
scripts/test-suite-baseline.sh
Executable file
61
scripts/test-suite-baseline.sh
Executable file
@@ -0,0 +1,61 @@
|
||||
#!/bin/bash
|
||||
# test-suite-baseline.sh — W14/F10: make FAIL mean something again.
|
||||
#
|
||||
# The review (conformance.md F-10): the OCaml suite is not green — a
|
||||
# permanent ~274-failure band (in-progress hs-* + r7rs radix shadow) is
|
||||
# normalized, so real regressions hide inside the red noise and nobody can
|
||||
# tell a new failure from the band.
|
||||
#
|
||||
# This gate pins the band instead of ignoring it: the full suite's FAIL
|
||||
# set is diffed against the checked-in baseline
|
||||
# (spec/tests/known-failures.txt). Two red conditions, both loud:
|
||||
# NEW failure -> a real regression: fix it (or, if intentional,
|
||||
# justify + add to the baseline in the same commit)
|
||||
# VANISHED failure -> something got fixed: delete it from the baseline
|
||||
# so the win is locked in
|
||||
# Neither touches the runner or the hs loops' scoreboards — the band still
|
||||
# prints as FAIL lines for the teams working through it.
|
||||
#
|
||||
# Usage: bash scripts/test-suite-baseline.sh
|
||||
# Runtime: full suite, ~5–15 min. Exit 0 = fail set identical to baseline.
|
||||
set -uo pipefail
|
||||
cd "$(dirname "$0")/.."
|
||||
|
||||
RUNNER=hosts/ocaml/_build/default/bin/run_tests.exe
|
||||
BASELINE=spec/tests/known-failures.txt
|
||||
[[ -x "$RUNNER" ]] || { echo "SKIP: $RUNNER not built" >&2; exit 2; }
|
||||
[[ -f "$BASELINE" ]] || { echo "SKIP: $BASELINE missing" >&2; exit 2; }
|
||||
|
||||
log=$(mktemp)
|
||||
timeout 3000 "$RUNNER" > "$log" 2>&1
|
||||
rc=$?
|
||||
if [[ $rc -ne 0 && $rc -ne 1 ]]; then
|
||||
echo "RED: runner exited $rc (timeout/crash)"; tail -5 "$log"; rm -f "$log"; exit 1
|
||||
fi
|
||||
|
||||
# Normalize: keep the stable test identity (suite > name), drop messages
|
||||
# (error text may contain addresses/timings that churn).
|
||||
current=$(mktemp)
|
||||
grep '^ FAIL: ' "$log" | sed 's/^ FAIL: //; s/: .*$//' | sort -u > "$current"
|
||||
|
||||
new_failures=$(comm -13 <(sort -u "$BASELINE") "$current")
|
||||
vanished=$(comm -23 <(sort -u "$BASELINE") "$current")
|
||||
|
||||
summary=$(grep '^Results:' "$log" | tail -1)
|
||||
red=0
|
||||
if [[ -n "$new_failures" ]]; then
|
||||
echo "RED: NEW failures not in baseline:"
|
||||
sed 's/^/ + /' <<<"$new_failures"
|
||||
red=1
|
||||
fi
|
||||
if [[ -n "$vanished" ]]; then
|
||||
echo "RED: baseline entries now PASSING (delete them from $BASELINE):"
|
||||
sed 's/^/ - /' <<<"$vanished"
|
||||
red=1
|
||||
fi
|
||||
if [[ $red -eq 0 ]]; then
|
||||
echo "GREEN: fail set identical to baseline ($(wc -l < "$BASELINE") known failures)"
|
||||
fi
|
||||
echo "$summary"
|
||||
rm -f "$log" "$current"
|
||||
exit $red
|
||||
82
scripts/test-wasm-corpus.sh
Executable file
82
scripts/test-wasm-corpus.sh
Executable file
@@ -0,0 +1,82 @@
|
||||
#!/bin/bash
|
||||
# test-wasm-corpus.sh — W14/F2: sweep the spec test corpus through the
|
||||
# SHIPPED browser kernel (sx_browser.bc.wasm.js) headless in Node.
|
||||
#
|
||||
# The review (conformance.md F-2) found the shipped browser artifact never
|
||||
# runs the corpus — F-1/F-3 native/WASM divergences existed undetected.
|
||||
# Each file runs in its OWN node process via run_wasm_corpus.js (a hang is
|
||||
# killed by per-file timeout without ending the sweep).
|
||||
#
|
||||
# The SKIP list documents files that structurally cannot run on the browser
|
||||
# kernel (runner-only bindings, native-only machinery) — the F-5/F-6/F-10
|
||||
# "one-host-gated" theme, recorded honestly per file with the reason.
|
||||
# KNOWN_FAIL documents files that RUN but currently have failing tests on
|
||||
# the shipped kernel (host divergence, F-1/F-3 class): they execute and
|
||||
# report, but don't gate. Everything else must be GREEN — exit 1 otherwise;
|
||||
# a KNOWN_FAIL going green also fails (ledger must be updated).
|
||||
#
|
||||
# Usage: bash scripts/test-wasm-corpus.sh [file.sx ...]
|
||||
set -uo pipefail
|
||||
cd "$(dirname "$0")/.."
|
||||
|
||||
RUNNER=hosts/ocaml/browser/run_wasm_corpus.js
|
||||
KERNEL=shared/static/wasm/sx_browser.bc.wasm.js
|
||||
[[ -f "$KERNEL" ]] || { echo "SKIP: $KERNEL missing (run sx-build-all first)" >&2; exit 2; }
|
||||
|
||||
# --- classification (empirical sweep 2026-07-04; see sx-gate-loop.md) ---
|
||||
# Sweep baseline: 83 files, 80 fully green, 5192 passes, 0 test failures.
|
||||
# The shipped kernel even provides the CEK driver bindings (make-env,
|
||||
# cek-step-loop, ...) — broader than a bare sx_server.
|
||||
declare -A SKIP KNOWN_FAIL
|
||||
skip() { SKIP[$1]=$2; }
|
||||
known() { KNOWN_FAIL[$1]=$2; }
|
||||
# Partial load-errors: the kernel throws mid-file (opaque jsoo exception,
|
||||
# message "undefined"); tests before the failing form pass and report.
|
||||
known test-hash-table.sx "partial: 22 pass then load-error mid-file"
|
||||
known test-r7rs.sx "partial: 87 pass then load-error mid-file"
|
||||
known test-sets.sx "partial: 30 pass then load-error mid-file"
|
||||
|
||||
pass_total=0; fail_total=0; red=0; files=0
|
||||
declare -a targets
|
||||
if [[ $# -gt 0 ]]; then targets=("$@");
|
||||
else for f in spec/tests/test-*.sx; do
|
||||
[[ "$(basename "$f")" == "test-framework.sx" ]] && continue
|
||||
targets+=("$f")
|
||||
done; fi
|
||||
|
||||
for f in "${targets[@]}"; do
|
||||
base=$(basename "$f")
|
||||
if [[ -n "${SKIP[$base]:-}" ]]; then
|
||||
echo "SKIP: $base — ${SKIP[$base]}"
|
||||
continue
|
||||
fi
|
||||
files=$((files+1))
|
||||
line=$(timeout 120 node "$RUNNER" "$f" 2>/dev/null | grep '^CORPUS-RESULT' || true)
|
||||
if [[ -z "$line" ]]; then
|
||||
echo "RED: $base — timeout or crash (no CORPUS-RESULT)"
|
||||
red=$((red+1)); continue
|
||||
fi
|
||||
p=$(sed -n 's/.*pass=\([0-9]*\).*/\1/p' <<<"$line")
|
||||
fl=$(sed -n 's/.*fail=\([0-9]*\).*/\1/p' <<<"$line")
|
||||
st=$(sed -n 's/.*status=\([a-z-]*\).*/\1/p' <<<"$line")
|
||||
pass_total=$((pass_total+p)); fail_total=$((fail_total+fl))
|
||||
if [[ -n "${KNOWN_FAIL[$base]:-}" ]]; then
|
||||
if [[ "$fl" -eq 0 && "$st" == "ok" ]]; then
|
||||
echo "RED: $base — KNOWN_FAIL is now GREEN (${KNOWN_FAIL[$base]}); update the ledger"
|
||||
red=$((red+1))
|
||||
else
|
||||
echo "KNOWN-FAIL: $base pass=$p fail=$fl ($( echo "${KNOWN_FAIL[$base]}" ))"
|
||||
fi
|
||||
continue
|
||||
fi
|
||||
if [[ "$st" != "ok" || "$fl" -ne 0 ]]; then
|
||||
echo "RED: $base pass=$p fail=$fl status=$st"
|
||||
red=$((red+1))
|
||||
else
|
||||
echo "OK: $base pass=$p"
|
||||
fi
|
||||
done
|
||||
|
||||
echo
|
||||
echo "wasm-corpus: $files files run, $pass_total passed, $fail_total failed, $red red"
|
||||
[[ $red -eq 0 ]]
|
||||
Reference in New Issue
Block a user