Merge loops/sx-ws-w14 into architecture: W14 test gate & conformance infra

17 test-only commits delivering the full W14 workstream (PLAN.md §W14 —
the enabler every other sx-review fix verifies against):

- spec/tests/test-gate-pins.sx: 7 pin suites (29 tests) for dc7aa709's
  landed fixes — K18, K20, K09/K11/K39, K49 (spec side), crit-2
  (non-vacuous via side-effect sentinel), plus C21/C22 harness pins
- 6 gate scripts, all bidirectional ledgers (a healed KNOWN entry also
  fails): test-protocol-gate (C1/C1b/S4 + C3-C7 quirk ledger + seeded
  fuzz-liveness, 11), test-env-parity (runner-only bindings, 7),
  test-harness-parity (mcp_tree vs sx_server, 12), test-wasm-corpus
  (shipped kernel: 80/83 files green, 5192 passes), test-suite-baseline
  (273-failure band pinned in spec/tests/known-failures.txt),
  test-differential (49 probes native vs WASM, 3 ledgered)
- spec/harness.sx: C22 fix (IO logged before the mock runs) + C21
  harness-run-perform (real CEK suspend/resume mode); W14-assigned per
  PLAN approach item 4 — see merge note in the briefing re: the forge
  briefing's stricter wording
- C9: empty suite labels eliminated across 6 test files
- web/tests/test-adapter-dom-render.sx: first render-output coverage of
  the DOM adapter (the browser-only exclusion was false)

Confirmed handoffs recorded in the briefing: bare-server apply does not
spread args (F-3, runner masks it); both runners' sha3-256 are fake
stubs (test CIDs != production CIDs); generated sx_render.ml is regen-
stale (misses dc7aa709's HTML_TAGS fix); canonical-serialize broken on
bare server for any number.

Verified post-merge in this checkout: gate pins 275/0, protocol-gate
11/0, env-parity 7/0, harness-parity 12/0, differential 49/0.

Briefing conflict (add/add) resolved: kept the loop's completed version
with a merge note preserving the forge briefing's context (8181421c
landed after the worktree branched).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-07-04 08:14:48 +00:00
21 changed files with 2266 additions and 124 deletions

94
scripts/test-differential.sh Executable file
View File

@@ -0,0 +1,94 @@
#!/bin/bash
# test-differential.sh — W14/F8: cross-host differential battery.
#
# Evaluates every expression in spec/tests/differential-probes.txt on:
# A) the native server (sx_server.exe, epoch protocol) — its printer
# B) the SHIPPED browser kernel (eval_wasm_probes.js, guest sx-serialize)
# and diffs the outputs. The review's original 130-probe corpus was
# ephemeral (F-8); this is the committed replacement.
#
# KNOWN_DIVERGENT is the ledger of confirmed, still-open divergences —
# keyed by the probe EXPRESSION. Red on a NEW divergence (host drift) and
# red on a HEALED one (fix landed: delete the entry, locking in parity).
#
# Method note (finding refinement, 2026-07-04): comparing raw K.eval
# JS-boundary values shows float-display divergences (0.3 vs
# 0.30000000000000004) that DISAPPEAR under guest-level (sx-serialize …) —
# the F-1 float-display class is a JS-boundary artifact, not a kernel
# serialization divergence. This battery compares guest serialization.
# Unset variables are errors and a pipeline fails if any stage fails. `-e` is
# not set: the script tallies red probes itself and exits on that tally.
set -uo pipefail
# Work from the parent of this script's directory so the relative paths below
# resolve no matter where the script is invoked from.
cd "$(dirname "$0")/.." || { echo "SKIP: cannot cd to repository root" >&2; exit 2; }
SERVER=hosts/ocaml/_build/default/bin/sx_server.exe
WASM=shared/static/wasm/sx_browser.bc.wasm.js
PROBES=spec/tests/differential-probes.txt
# Exit 2 when a prerequisite is absent. The probe corpus is checked as well:
# without it both collectors fail, the comparison loop reads zero probes, and
# the script would otherwise report "0 red" and exit 0.
[[ -x "$SERVER" ]] || { echo "SKIP: $SERVER not built" >&2; exit 2; }
[[ -f "$WASM" ]] || { echo "SKIP: $WASM missing" >&2; exit 2; }
[[ -f "$PROBES" ]] || { echo "SKIP: $PROBES missing" >&2; exit 2; }
# --- KNOWN_DIVERGENT ledger (verified live 2026-07-04) -------------------
# F-3/K53: bare sx_server's `apply` does NOT spread its argument list —
# (apply + (list 1 2 3)) errors "Expected number, got list"; (apply str l)
# returns the serialized list as one string. The WASM kernel spreads
# correctly. The test runner masks this with its own apply (F-7 class).
# Ledger map: probe expression -> reason the two hosts are known to disagree.
# Every current entry has the same root cause, so the reason is written once
# and attached to each ledgered probe.
declare -A KNOWN_DIVERGENT
_f3_reason="F-3: native apply does not spread"
for _ledgered_probe in \
  '(apply + (list 1 2 3))' \
  '(apply max (list 1 5 2))' \
  '(apply str (list "a" "b"))'; do
  KNOWN_DIVERGENT["$_ledgered_probe"]=$_f3_reason
done
unset _f3_reason _ledgered_probe
# Scratch files holding each host's "PROBE <n> <result>" listing. The EXIT trap
# removes them on every exit path, including an interrupted run.
native=$(mktemp); wasm=$(mktemp)
trap 'rm -f "$native" "$wasm"' EXIT
# --- Host A: native server over the epoch protocol -------------------------
python3 - "$SERVER" "$PROBES" > "$native" <<'PY'
import json, subprocess, sys
server, probefile = sys.argv[1], sys.argv[2]
# Strip first, THEN drop blanks and comments. The shell comparison loop skips
# any line matching ^\s*# ; testing the unstripped line here would keep an
# indented comment as a probe and shift every later probe number.
with open(probefile) as fh:
    stripped = [l.strip() for l in fh]
probes = [p for p in stripped if p and not p.startswith('#')]
# One (epoch N) marker per probe so each reply can be matched back by number.
inp = []
for i, p in enumerate(probes):
    inp.append(f"(epoch {i+1})")
    inp.append(f"(eval {json.dumps(p)})")
out = subprocess.run(["timeout", "120", server], input="\n".join(inp) + "\n",
                     capture_output=True, text=True).stdout
# Reply parsing: "(ok-len N ...)" announces a value whose first following line
# is recorded as the result; "(error N ...)" is recorded as the literal ERROR.
res, cur = {}, None
for l in out.splitlines():
    if l.startswith("(ok-len "):
        cur = int(l.split()[1]); res[cur] = None
    elif l.startswith("(error "):
        idx = int(l.split()[1]); res[idx] = "ERROR"; cur = None
    elif cur is not None and res.get(cur) is None:
        res[cur] = l; cur = None
# A probe the server never answered is printed as <none>.
for i, p in enumerate(probes):
    print(f"PROBE {i+1} {res.get(i+1, '<none>')}")
PY
# --- Host B: shipped browser kernel, driven headless by Node ---------------
timeout 300 node hosts/ocaml/browser/eval_wasm_probes.js "$PROBES" > "$wasm" 2>/dev/null
# Walk the probe corpus in file order and compare the two hosts' results for
# each probe number. Outcomes per probe:
#   - no result line on either host (or native <none>) -> red: no evidence
#   - equal results, not ledgered                      -> pass
#   - equal results, ledgered                          -> red: ledger is stale
#   - different results, ledgered                      -> pass (known)
#   - different results, not ledgered                  -> red: new divergence
pass=0; fail=0; i=0
while IFS= read -r expr; do
  [[ -z "$expr" || "$expr" == \#* ]] && continue
  i=$((i+1))
  a=$(sed -n "s/^PROBE $i //p" "$native")
  b=$(sed -n "s/^PROBE $i //p" "$wasm")
  known="${KNOWN_DIVERGENT[$expr]:-}"
  # A host that produced no line for this probe must never count as agreement:
  # two empty strings compare equal, which would turn a crashed collector into
  # a green run. <none> is what the native collector prints for an unanswered
  # probe.
  if ! grep -q "^PROBE $i " "$native" || ! grep -q "^PROBE $i " "$wasm" \
      || [[ "$a" == "<none>" ]]; then
    echo "RED: $expr — no result from one or both hosts"
    echo " native: $a"
    echo " wasm: $b"
    fail=$((fail+1))
  elif [[ "$a" == "$b" ]]; then
    if [[ -n "$known" ]]; then
      echo "RED: $expr — KNOWN_DIVERGENT now AGREES ($known); delete from ledger"
      fail=$((fail+1))
    else
      pass=$((pass+1))
    fi
  else
    if [[ -n "$known" ]]; then
      echo "KNOWN-DIVERGENT: $expr ($known)"
      pass=$((pass+1))
    else
      echo "RED: $expr"
      echo " native: $a"
      echo " wasm: $b"
      fail=$((fail+1))
    fi
  fi
done < <(grep -v '^\s*#' "$PROBES" | grep -v '^\s*$')
rm -f "$native" "$wasm"
# An empty or unreadable corpus proves nothing; do not let it finish green.
if [[ $i -eq 0 ]]; then
  echo "RED: no probes read from $PROBES"
  fail=$((fail+1))
fi
echo
echo "differential: $i probes, $pass in agreement/ledgered, $fail red"
[[ $fail -eq 0 ]]

100
scripts/test-env-parity.sh Executable file
View File

@@ -0,0 +1,100 @@
#!/bin/bash
# test-env-parity.sh — W14 section-B ledger: runner env vs production env.
#
# The review (F7, K42, JS5, core.md "canonical.sx depends on test-runner-only
# helpers") found bindings that exist ONLY in the test runners, so suites
# pass against an environment production never provides. Rule (PLAN.md W14):
# "if the spec needs it, it's a kernel primitive; if not, the test can't
# have it."
#
# This script is a LEDGER, not a wish: it asserts today's confirmed drift
# stays exactly as recorded. Both directions fail loudly:
# - a MUST_HAVE going missing on the server -> regression, fix the kernel
# - a KNOWN_DRIFT binding appearing on the server -> the fix landed;
# move it to MUST_HAVE and update the consequence pins below.
#
# Confirmed inventory (2026-07-04, all verified live over the epoch protocol):
#
# binding OCaml runner JS runner fresh sx_server
# values real (rt.ml:1131) ? ABSENT
# call-with-values real (rt.ml:1140) ? ABSENT
# contains-char? real (rt.ml:728) real (:85) ABSENT
# trim-right ABSENT real (:87) ABSENT
# sha3-256 FAKE Hashtbl.hash FAKE stub ABSENT (real = crypto-sha3-256)
#
# Consequences (pinned in section 3):
# - (canonical-serialize 42) on a fresh server errors "Undefined symbol:
# contains-char?" -> content addressing broken for ANY number outside
# the test runners.
# - every CID computed inside run_tests uses a FAKE hash, so test CIDs
# never equal production CIDs (crypto-sha3-256 is real SHA3).
#
# Each probe spawns its OWN timeout-bounded sx_server.exe. No shared process.
# Unset variables are errors; a pipeline fails if any stage fails.
set -uo pipefail
# Relative paths below are resolved from the parent of this script's directory.
cd "$(dirname "$0")/.."
SERVER=hosts/ocaml/_build/default/bin/sx_server.exe
# Without a built server there is nothing to probe: report and exit 2.
[[ -x "$SERVER" ]] || {
  echo "SKIP: $SERVER not built (run sx_build target=ocaml first)" >&2
  exit 2
}
# Running tallies, updated by every section below.
pass=0
fail=0
# deps_unresolved EXPR -> prints the (unresolved ...) list for EXPR on a fresh server
#######################################
# Ask a fresh server which symbols in an expression do not resolve.
# Globals:   SERVER (read) - command started under `timeout 60`
# Arguments: $1 - expression text, embedded in a (deps-check "...") command
# Outputs:   every ":unresolved (...)" fragment found in the server's stdout,
#            one per line; nothing when the reply contains none
# Returns:   always 0, so "nothing found" is not an error under pipefail
#######################################
deps_unresolved() {
  local expr=$1 reply
  # Server stderr is discarded; only the protocol reply on stdout matters.
  reply=$(printf '(epoch 1)\n(deps-check "%s")\n' "$expr" \
    | timeout 60 "$SERVER" 2>/dev/null)
  grep -o ':unresolved ([^)]*)' <<<"$reply"
  return 0
}
# --- Section 1: MUST_HAVE — spec-needed bindings production must provide ---
# One expression touching every binding production is required to provide.
# The \" sequences are literal: the expression is embedded inside the
# double-quoted string argument of the (deps-check "...") command.
MUST_HAVE_EXPR='(list (equal? 1 1) (apply + (list 1 2)) (contains? {:a 1} :a) (crypto-sha3-256 \"x\") (split \"a-b\" \"-\"))'
# Capture the raw reply before filtering it. An empty reply means the server
# crashed or timed out; filtering first would make that indistinguishable from
# "nothing unresolved" and the section would pass without having checked
# anything.
must_have_reply=$(printf '(epoch 1)\n(deps-check "%s")\n' "$MUST_HAVE_EXPR" \
  | timeout 60 "$SERVER" 2>/dev/null)
unres=$(grep -o ':unresolved ([^)]*)' <<<"$must_have_reply" || true)
if [[ -z "$must_have_reply" ]]; then
  echo "FAIL: MUST_HAVE probe got no reply from fresh sx_server (crash or timeout?)"
  fail=$((fail+1))
elif [[ -z "$unres" || "$unres" == ':unresolved ()' ]]; then
  echo "PASS: MUST_HAVE core bindings all resolve on fresh sx_server"
  pass=$((pass+1))
else
  echo "FAIL: MUST_HAVE binding missing on fresh sx_server: $unres"
  fail=$((fail+1))
fi
# --- Section 2: KNOWN_DRIFT — runner-only bindings, asserted ABSENT -------
# If one of these starts resolving, its kernel fix landed: move it to
# MUST_HAVE above and update the consequence pin in section 3.
# Each name is probed alone as "(name)"; the ledger is accurate while the
# server still reports it unresolved. The names are quoted because
# `contains-char?` holds a glob character: unquoted, a matching file in the
# working directory would replace it. They are matched as fixed strings (-F)
# so no character in a binding name is read as a regex operator.
for name in 'values' 'call-with-values' 'contains-char?' 'trim-right' 'sha3-256'; do
  unres=$(deps_unresolved "($name)")
  if grep -qF -- "$name" <<<"$unres"; then
    echo "PASS: KNOWN_DRIFT '$name' still absent on fresh sx_server (ledger accurate)"
    pass=$((pass+1))
  else
    echo "FAIL: KNOWN_DRIFT '$name' now RESOLVES on fresh sx_server — fix landed?"
    echo " Update this ledger: move '$name' to MUST_HAVE and revisit section 3."
    fail=$((fail+1))
  fi
done
# --- Section 3: consequence pin — canonical.sx on the production server ---
# Current reality: canonical-serialize of ANY number errors on a fresh
# server because canonical-number calls runner-only contains-char?.
# Load canonical.sx into a fresh server, then serialize a number. stderr is
# merged into the capture so an error reported there is still seen.
out=$(
  printf '(epoch 1)\n(load "spec/canonical.sx")\n(epoch 2)\n(eval "(canonical-serialize 42)")\n' \
    | timeout 60 "$SERVER" 2>&1
)
# Classify the reply first, then report: "pinned" is the recorded breakage,
# "healed" means epoch 2 answered ok, anything else is unexpected.
if grep -q 'error 2 .*contains-char?' <<<"$out"; then
  verdict=pinned
elif grep -q '^(ok 2 ' <<<"$out"; then
  verdict=healed
else
  verdict=unexpected
fi
case "$verdict" in
  pinned)
    echo "PASS: consequence pin — canonical-serialize on numbers still broken on server (as recorded)"
    pass=$((pass+1))
    ;;
  healed)
    echo "FAIL: consequence pin — canonical-serialize 42 now WORKS on the server."
    echo " The canonical-helpers fix landed: flip this pin to assert success"
    echo " and pin the exact canonical form + CID stability."
    fail=$((fail+1))
    ;;
  *)
    echo "FAIL: consequence pin — unexpected server output:"
    sed 's/^/ /' <<<"$out"
    fail=$((fail+1))
    ;;
esac
# Final tally; the script's exit status is that of the last test below.
echo
echo "env-parity: $pass passed, $fail failed"
(( fail == 0 ))

107
scripts/test-harness-parity.sh Executable file
View File

@@ -0,0 +1,107 @@
#!/bin/bash
# test-harness-parity.sh — W14 section-C pin for K19 (harness honesty).
#
# K19 (review, core.md): the MCP tree server (mcp_tree.ml) carries a
# PARALLEL primitive table, and it drifted from the real runtime
# (sx_primitives.ml) — e.g. (get {:a 1} :a 99) returned nil in the harness
# but 1 in production, (split "a--b" "--") was char-class vs substring.
# CLAUDE.md mandates harness verification, so drift silently produces
# false findings/passes. dc7aa709 aligned 8 entries as a stopgap; the real
# fix (mcp_tree links sx_primitives directly) is hosts-lane work.
#
# This pin runs the finding's exact probe battery through BOTH environments
# — mcp_tree.exe sx_eval (JSON-RPC over stdio) and a fresh sx_server.exe
# (epoch protocol) — and fails on ANY divergence. Errors are compared by
# message, values by serialized form. Both subprocesses are fresh and
# timeout-bounded; no shared process is touched.
#
# Exit: 0 = full parity; 1 = drift (harness lies about the runtime again).
# Unset variables are errors; a pipeline fails if any stage fails.
set -uo pipefail
# Relative paths below are resolved from the parent of this script's directory.
cd "$(dirname "$0")/.."
MCP=hosts/ocaml/_build/default/bin/mcp_tree.exe
SERVER=hosts/ocaml/_build/default/bin/sx_server.exe
# Both binaries are needed; the first one that is missing ends the run with 2.
for bin in "$MCP" "$SERVER"; do
  [[ -x "$bin" ]] && continue
  echo "SKIP: $bin not built (run sx_build target=ocaml first)" >&2
  exit 2
done
# The whole battery runs in one Python process; its exit status (0 = parity,
# 1 = drift or missing reply) becomes the script's exit status.
python3 - "$MCP" "$SERVER" <<'PYEOF'
import json, re, subprocess, sys
MCP, SERVER = sys.argv[1], sys.argv[2]
# Placeholder for a probe that one side never answered.
MISSING = "<missing>"
# K19 probe battery — the finding's confirmed drift cases + stopgap entries.
PROBES = [
    '(empty? "")', '(empty? {})',
    '(get {:a 1} :a 99)', '(get {:a 1} :zz 99)', '(get (list 10 20) 1)',
    '(split "a--b" "--")', '(split "abc" "")',
    '(equal? (list 1 2) (list 1 2))',
    '(contains? {:a 1} :a)', '(keyword-name :kw)',
    '(char-code "A")', '(parse-number "42")',
]


def norm_error(msg):
    """Reduce an error reply to a comparable form.

    If the text contains 'Unhandled exception: "<inner>"' (quotes optionally
    backslash-escaped), return "<ERROR> <inner>" so the harness and server
    envelopes compare equal when the underlying failure is the same.
    Otherwise return "<ERROR> " plus the first 80 characters of the stripped
    text.
    """
    m = re.search(r'Unhandled exception: \\?"(.*?)\\?"', msg)
    if m:
        return "<ERROR> " + m.group(1)
    return "<ERROR> " + msg.strip()[:80]


# --- harness side: mcp_tree sx_eval over JSON-RPC ---
lines = [
    json.dumps({"jsonrpc": "2.0", "id": 1, "method": "initialize",
                "params": {"protocolVersion": "2024-11-05", "capabilities": {},
                           "clientInfo": {"name": "parity", "version": "0"}}}),
    json.dumps({"jsonrpc": "2.0", "method": "notifications/initialized"}),
]
# Probe i is sent with request id 100 + i so replies map back by id.
for i, p in enumerate(PROBES):
    lines.append(json.dumps({"jsonrpc": "2.0", "id": 100 + i, "method": "tools/call",
                             "params": {"name": "sx_eval", "arguments": {"expr": p}}}))
out = subprocess.run(["timeout", "60", MCP], input="\n".join(lines) + "\n",
                     capture_output=True, text=True).stdout
harness = {}
for l in out.splitlines():
    try:
        j = json.loads(l)
    except ValueError:
        continue  # non-JSON output line
    if not isinstance(j, dict):
        continue
    if isinstance(j.get("id"), int) and j["id"] >= 100:
        result = j.get("result") or {}
        # Fall back to one empty item so a reply with no content yields
        # "<none>" instead of raising IndexError.
        content = result.get("content") or [{}]
        txt = content[0].get("text", "<none>").strip()
        if txt.startswith("Error:") or result.get("isError"):
            txt = norm_error(txt)
        harness[j["id"] - 100] = txt

# --- server side: fresh sx_server over the epoch protocol ---
inp = []
for i, p in enumerate(PROBES):
    inp.append(f"(epoch {i + 1})")
    inp.append(f"(eval {json.dumps(p)})")
sout = subprocess.run(["timeout", "60", SERVER], input="\n".join(inp) + "\n",
                      capture_output=True, text=True).stdout
# "(ok-len N ...)" announces a value whose first following line is the result;
# "(error N ...)" is normalized like a harness error. Epochs are 1-based, the
# result dict is keyed 0-based to line up with PROBES.
server, cur = {}, None
for l in sout.splitlines():
    if l.startswith("(ok-len "):
        cur = int(l.split()[1]); server[cur - 1] = None
    elif l.startswith("(error "):
        idx = int(l.split()[1]); server[idx - 1] = norm_error(l); cur = None
    elif cur is not None and server.get(cur - 1) is None:
        server[cur - 1] = l.strip(); cur = None

fails = 0
for i, p in enumerate(PROBES):
    h = harness.get(i, MISSING)
    s = server.get(i, MISSING)
    if h == MISSING or s == MISSING or s is None:
        # No reply is not parity: if both processes die, both sides would be
        # "<missing>" and compare equal, passing with no evidence at all.
        print(f"FAIL: {p:40s} no reply (harness={h!r} server={s!r})")
        fails += 1
    elif h == s:
        print(f"PASS: {p:40s} both -> {h!r}")
    else:
        print(f"FAIL: {p:40s} harness={h!r} server={s!r}")
        fails += 1
print()
print(f"harness-parity: {len(PROBES) - fails} passed, {fails} failed")
sys.exit(1 if fails else 0)
PYEOF

233
scripts/test-protocol-gate.sh Executable file
View File

@@ -0,0 +1,233 @@
#!/bin/bash
# test-protocol-gate.sh — W14 pins for the epoch/command-channel protocol.
#
# Pins C1/C1b (review, plans/sx-review/hosts.md): a malformed or non-ASCII
# line on the top-level command channel used to raise an uncaught
# Sx_types.Parse_error and KILL the whole sx_server process (the shared
# channel used by bridges and conformance runners). Fixed in dc7aa709:
# the server now answers `(error N "Malformed command line: ...")` and
# keeps serving.
#
# Each case spawns its OWN timeout-bounded sx_server.exe subprocess —
# no shared/sibling process is ever touched. Designed to grow into the
# W14 section-E protocol fuzz suite (C3-C7).
#
# Usage: bash scripts/test-protocol-gate.sh
# Exit: 0 = all pins green; 1 = a pin failed (fix regressed).
# Unset variables are errors; a pipeline fails if any stage fails.
set -uo pipefail
# Relative paths below are resolved from the parent of this script's directory.
cd "$(dirname "$0")/.."
SERVER=hosts/ocaml/_build/default/bin/sx_server.exe
# Without a built server there is nothing to pin: report and exit 2.
[[ -x "$SERVER" ]] || {
  echo "SKIP: $SERVER not built (run sx_build target=ocaml first)" >&2
  exit 2
}
# Running tallies, updated by every case below.
pass=0
fail=0
# run_case NAME INPUT EXPECT_SENTINEL
# Feeds INPUT to a fresh server. Asserts:
# 1. an (error ... "Malformed command line: ...") response is emitted
# 2. the follow-up epoch still evaluates (EXPECT_SENTINEL in output)
# 3. the process exits cleanly (no Fatal error, exit 0 on stdin EOF)
#######################################
# Feed one hostile session to a fresh server and check that it survived.
# Globals:   SERVER (read); pass, fail (incremented)
# Arguments: $1 - case name used in the report lines
#            $2 - session text, expanded by printf '%b' (so \n, \xNN work)
#            $3 - sentinel: a whole output line the follow-up epoch must print
#                 (interpolated into a grep pattern, anchored both ends)
# Outputs:   "PASS: <name>", or one "FAIL: <name> — <reason>" line per failed
#            check followed by the captured server output
# Returns:   0
#######################################
run_case() {
  local name="$1" input="$2" sentinel="$3"
  local out rc why
  local -a problems=()
  out=$(printf '%b' "$input" | timeout 60 "$SERVER" 2>&1)
  rc=$?
  # The four checks are independent; collect every reason before reporting.
  grep -q 'Malformed command line' <<<"$out" \
    || problems+=("no malformed-line error response")
  grep -q "^${sentinel}\$" <<<"$out" \
    || problems+=("follow-up epoch did not run (process died?)")
  if grep -q 'Fatal error' <<<"$out"; then
    problems+=("Fatal error escaped to the top level")
  fi
  [[ $rc -eq 0 ]] || problems+=("nonzero exit ($rc)")
  if (( ${#problems[@]} == 0 )); then
    echo "PASS: $name"
    pass=$((pass+1))
    return 0
  fi
  for why in "${problems[@]}"; do
    echo "FAIL: $name — $why"
  done
  echo " --- output ---"; sed 's/^/ /' <<<"$out"; echo " --------------"
  fail=$((fail+1))
}
# C1: unterminated list on the command channel (exact review repro)
# Each run_case call is: NAME, session text (printf %b escapes), sentinel line.
run_case "C1 unterminated list survives" '(epoch 2)\n(eval "(+ 1 2"\n(epoch 3)\n(eval "99")\n' '99'
# C1: plain-garbage line (second C1 repro shape)
run_case "C1 garbage line survives" '(epoch 1)\nnot an s-expr ]]] {{{\n(epoch 2)\n(eval "42")\n' '42'
# C1b: non-ASCII byte on the command channel (exact review repro; \xc3\xa9 = é)
run_case "C1b non-ASCII line survives" '(epoch 1)\n(eval (quote caf\xc3\xa9))\n(epoch 2)\n(eval "99")\n' '99'
# Control: a well-formed session must print the line 42. It bypasses run_case
# because run_case insists on a malformed-line error reply.
ctrl=$(printf '(epoch 1)\n(eval "(+ 40 2)")\n' | timeout 60 "$SERVER" 2>&1)
if ! grep -q '^42$' <<<"$ctrl"; then
  echo "FAIL: control well-formed session"; sed 's/^/ /' <<<"$ctrl"
  fail=$((fail+1))
else
  echo "PASS: control well-formed session"
  pass=$((pass+1))
fi
# ---------------------------------------------------------------------------
# C3-C7 protocol-quirk LEDGER (hosts.md, all OPEN server-side). These pin
# CURRENT behavior, verified live 2026-07-04 — they are documentation, not
# endorsement. When a server fix lands and a pin fails, update the ledger
# to assert the corrected behavior (bidirectional, like test-env-parity.sh).
# ---------------------------------------------------------------------------
# ledger_case NAME INPUT GREP_MUST GREP_MUST2
#######################################
# Pin the CURRENT reply to one protocol quirk on a fresh server.
# Globals:   SERVER (read); pass, fail (incremented)
# Arguments: $1 - case name used in the report lines
#            $2 - session text, expanded by printf '%b'
#            $3 - grep pattern that must match the captured output
#            $4 - optional second grep pattern that must also match
# Outputs:   "PASS: <name>", or one "FAIL: <name> — <reason>" line per failed
#            check followed by the captured server output
# Returns:   0
# Unlike run_case, the server's exit status is not examined here.
#######################################
ledger_case() {
  local name="$1" input="$2" must="$3" must2="${4:-}"
  local out pattern miss
  local -a misses=()
  out=$(printf '%b' "$input" | timeout 60 "$SERVER" 2>&1)
  for pattern in "$must" "$must2"; do
    # An empty pattern matches any output, so skipping it changes nothing.
    [[ -n "$pattern" ]] || continue
    grep -q -- "$pattern" <<<"$out" || misses+=("expected: $pattern")
  done
  if grep -q 'Fatal error' <<<"$out"; then
    misses+=("process died")
  fi
  if (( ${#misses[@]} == 0 )); then
    echo "PASS: $name"
    pass=$((pass+1))
    return 0
  fi
  for miss in "${misses[@]}"; do
    echo "FAIL: $name — $miss"
  done
  echo " --- output ---"; sed 's/^/ /' <<<"$out"
  fail=$((fail+1))
}
# Ledger entries. Each call is: ledger_case NAME INPUT GREP_MUST [GREP_MUST2].
# INPUT is expanded by printf '%b' (so \n is a newline); both patterns are
# grep basic regular expressions matched against the server's combined
# stdout+stderr. In the (ok-len E ...) patterns, E is the epoch the reply is
# tagged with.
#
# C3: stray (io-response ...) is answered as Unknown command (dead guard) —
# an EXTRA response the client didn't ask for; process keeps serving.
# The second pattern (^5$) shows the following eval still ran.
ledger_case "C3 ledger: stray io-response gets an extra error reply" \
'(epoch 1)\n(io-response 1 42)\n(eval "5")\n' \
'Unknown command: (io-response 1 42)' '^5$'
# C4: malformed (epoch) doesn't update the epoch — next reply tagged with
# the OLD epoch (0 here), i.e. stale from the client's viewpoint.
ledger_case "C4 ledger: malformed epoch marker leaves epoch stale" \
'(epoch)\n(eval "2")\n' \
'(ok-len 0 1)' '^2$'
# C5: no monotonic-epoch enforcement — a decreasing epoch is accepted.
# The reply is tagged with epoch 3, the later (lower) marker.
ledger_case "C5 ledger: decreasing epoch accepted silently" \
'(epoch 9)\n(epoch 3)\n(eval "42")\n' \
'(ok-len 3 2)' '^42$'
# C6: two commands on one line -> one error, NEITHER executed.
# The next line's eval (^3$) still runs.
ledger_case "C6 ledger: two commands on one line both dropped" \
'(epoch 1)\n(eval "1") (eval "2")\n(eval "3")\n' \
'Expected single command, got 2' '^3$'
# C7: vm-trace without the compiler loaded errors opaquely.
# Only one pattern is pinned; no follow-up command is sent.
ledger_case "C7 ledger: vm-trace sans compiler is opaque Not-callable-nil" \
'(epoch 1)\n(vm-trace "(+ 1 2)")\n' \
'Not callable: nil'
# ---------------------------------------------------------------------------
# Fuzz-liveness property: after 60 deterministic hostile lines (unbalanced
# parens, control chars, unicode, long lines, stray io-responses, epoch
# mutations), the server must still answer a well-formed command and exit
# cleanly. Seeded PRNG — reproducible corpus.
# ---------------------------------------------------------------------------
# Build the hostile corpus in Python so it is reproducible: the PRNG is seeded
# with a constant, so every run feeds the server the same 60 lines. The order
# of the r.* calls determines the corpus — do not reorder them.
fuzz=$(python3 - <<'PY'
import random
# Fixed seed: same corpus on every run.
r = random.Random(1404)
lines = []
# Raw fragments glued together at random for the kind-0 lines: paren and brace
# imbalance, quote/backslash, non-ASCII, control bytes, block-comment markers.
frag = ['(', ')', '((', '))', '(eval', '(epoch', 'io-response', '"', '\\',
'café', '\x01', '\x1b[2J', ':kw', '{', '}', '(+ 1', 'nil)', '#|', '|#']
for i in range(60):
# One of five line shapes per iteration.
kind = r.randrange(5)
if kind == 0:
# 1 to 7 random fragments concatenated with no separator.
lines.append(''.join(r.choice(frag) for _ in range(r.randrange(1, 8))))
elif kind == 1:
# An epoch marker with a missing, non-numeric, negative, huge, or broken argument.
lines.append('(epoch ' + r.choice(['', 'foo', '-1', '999999999999999999999', ')']) + ')')
elif kind == 2:
# A stray io-response with index 0-98 and a closed, truncated, or unclosed tail.
lines.append('(io-response %d %s' % (r.randrange(99), r.choice([')', '', '42']) ))
elif kind == 3:
# A long line of 200 to 1999 'x' characters.
lines.append('x' * r.randrange(200, 2000))
else:
# A well-formed (eval "...") command whose string payload is malformed.
lines.append('(eval "' + r.choice(['(+ 1', '(list', '\\\\', '((((']) + '")')
print('\n'.join(lines))
PY
)
# Send the corpus followed by one well-formed command that evaluates the
# string "alive"; stderr is merged into the capture.
out=$(printf '%s\n(epoch 777)\n(eval "\\"alive\\"")\n' "$fuzz" | timeout 90 "$SERVER" 2>&1)
rc=$?
# Green only if the line "alive" (with its quotes) was printed, no 'Fatal error'
# appears anywhere in the output, and the pipeline exited 0.
if grep -q '^"alive"$' <<<"$out" && ! grep -q 'Fatal error' <<<"$out" && [[ $rc -eq 0 ]]; then
echo "PASS: fuzz-liveness — server survives 60 hostile lines and still answers"
pass=$((pass+1))
else
# Show only the last six output lines; the corpus itself is long.
echo "FAIL: fuzz-liveness (rc=$rc)"; tail -6 <<<"$out" | sed 's/^/ /'
fail=$((fail+1))
fi
# ---------------------------------------------------------------------------
# S4 (review, hosts.md): soft error pages must NOT be stored in the HTTP
# response cache. Pre-fix, a routing-failure page was cached as HTTP 200 and
# served byte-identically from cache to every later visitor (cold 2s → warm
# 0.0005s, ONE render line). Post-fix (dc7aa709), http_render_page returns
# (html, is_error) and cache insertion is gated on `not is_err` (the skip is
# logged as "[cache] <path> → error page, not cached").
#
# Pin: GET the same nonexistent path twice against a fresh --http server and
# assert BOTH requests re-render (two [sx-http] render lines) plus the
# is_err gate line appearing in the log. NB: in a standalone worktree all
# docs pages render as soft error pages (no content), so a positive
# "real page IS cached" control is not assertable here.
# ---------------------------------------------------------------------------
#######################################
# S4 pin: a soft error page must be re-rendered on every request, not cached.
# Starts a fresh --http server on a random port in 18000-19999, requests the
# same nonexistent path twice, and inspects the server log.
# Globals:   SERVER (read); pass, fail (incremented)
# Arguments: none
# Outputs:   PASS/FAIL lines; on failure, the last 12 log lines
# Returns:   0
#######################################
s4_case() {
  local port=$((18000 + RANDOM % 2000))
  local log; log=$(mktemp)
  timeout 90 "$SERVER" --http "$port" >"$log" 2>&1 &
  local srv=$!
  local up=0 attempt
  # Poll for readiness, up to 40 tries one second apart.
  for ((attempt = 0; attempt < 40; attempt++)); do
    # If the server has already exited (for example it could not bind the
    # port), stop polling: a reply on that port would come from some other
    # process, and nothing more will be written to our log.
    kill -0 "$srv" 2>/dev/null || break
    if curl -s -o /dev/null "http://localhost:$port/" 2>/dev/null; then up=1; break; fi
    sleep 1
  done
  if [[ $up -ne 1 ]]; then
    echo "FAIL: S4 — http server did not come up on :$port"
    # Reap the job after killing it so it is gone before the log is removed.
    kill "$srv" 2>/dev/null; wait "$srv" 2>/dev/null; rm -f "$log"
    fail=$((fail+1)); return
  fi
  # A path unique to this run, so no earlier request can have cached it.
  local miss="/sx/gate-pin-missing-$$-$RANDOM"
  curl -s -o /dev/null "http://localhost:$port$miss"
  curl -s -o /dev/null "http://localhost:$port$miss"
  sleep 1   # give the server a moment to write its log lines
  local renders
  renders=$(grep -c "sx-http\] $miss " "$log")
  local ok=1
  if [[ "$renders" -ne 2 ]]; then
    echo "FAIL: S4 — expected 2 renders of $miss (not cache-served), got $renders"
    ok=0
  fi
  if ! grep -q 'error page, not cached' "$log"; then
    echo "FAIL: S4 — is_err cache gate line absent from server log"
    ok=0
  fi
  if [[ $ok -eq 1 ]]; then
    echo "PASS: S4 soft error page not cached (both GETs re-rendered)"
    pass=$((pass+1))
  else
    echo " --- log tail ---"; tail -12 "$log" | sed 's/^/ /'; echo " ---------------"
    fail=$((fail+1))
  fi
  # Stop and reap the server before deleting the log it writes to.
  kill "$srv" 2>/dev/null
  wait "$srv" 2>/dev/null
  rm -f "$log"
}
s4_case
# Final tally; the script's exit status is that of the last test below.
echo
echo "protocol-gate: $pass passed, $fail failed"
[[ $fail -eq 0 ]]

61
scripts/test-suite-baseline.sh Executable file
View File

@@ -0,0 +1,61 @@
#!/bin/bash
# test-suite-baseline.sh — W14/F10: make FAIL mean something again.
#
# The review (conformance.md F-10): the OCaml suite is not green — a
# permanent ~274-failure band (in-progress hs-* + r7rs radix shadow) is
# normalized, so real regressions hide inside the red noise and nobody can
# tell a new failure from the band.
#
# This gate pins the band instead of ignoring it: the full suite's FAIL
# set is diffed against the checked-in baseline
# (spec/tests/known-failures.txt). Two red conditions, both loud:
# NEW failure -> a real regression: fix it (or, if intentional,
# justify + add to the baseline in the same commit)
# VANISHED failure -> something got fixed: delete it from the baseline
# so the win is locked in
# Neither touches the runner or the hs loops' scoreboards — the band still
# prints as FAIL lines for the teams working through it.
#
# Usage: bash scripts/test-suite-baseline.sh
# Runtime: full suite, ~5-15 min. Exit 0 = fail set identical to baseline.
# Unset variables are errors; a pipeline fails if any stage fails.
set -uo pipefail
# Relative paths below are resolved from the parent of this script's directory.
cd "$(dirname "$0")/.."
RUNNER=hosts/ocaml/_build/default/bin/run_tests.exe
BASELINE=spec/tests/known-failures.txt
if [[ ! -x "$RUNNER" ]]; then
  echo "SKIP: $RUNNER not built" >&2
  exit 2
fi
if [[ ! -f "$BASELINE" ]]; then
  echo "SKIP: $BASELINE missing" >&2
  exit 2
fi
# Run the full suite once, capturing stdout and stderr together.
log=$(mktemp)
timeout 3000 "$RUNNER" > "$log" 2>&1
rc=$?
# Exit codes 0 and 1 are accepted as a completed run; anything else is
# reported as a timeout or crash and ends the gate red immediately.
case "$rc" in
  0|1) ;;
  *)
    echo "RED: runner exited $rc (timeout/crash)"
    tail -5 "$log"
    rm -f "$log"
    exit 1
    ;;
esac
# Normalize: keep the stable test identity (suite > name), drop messages
# (error text may contain addresses/timings that churn).
current=$(mktemp)
sed -n '/^ FAIL: /{s/^ FAIL: //; s/: .*$//; p;}' "$log" | sort -u > "$current"
# comm -13: lines only in the current fail set; comm -23: only in the baseline.
new_failures=$(comm -13 <(sort -u "$BASELINE") "$current")
vanished=$(comm -23 <(sort -u "$BASELINE") "$current")
summary=$(grep '^Results:' "$log" | tail -1)
red=0
[[ -z "$new_failures" ]] || {
  echo "RED: NEW failures not in baseline:"
  sed 's/^/ + /' <<<"$new_failures"
  red=1
}
[[ -z "$vanished" ]] || {
  echo "RED: baseline entries now PASSING (delete them from $BASELINE):"
  sed 's/^/ - /' <<<"$vanished"
  red=1
}
if (( red == 0 )); then
  echo "GREEN: fail set identical to baseline ($(wc -l < "$BASELINE") known failures)"
fi
# Echo the runner's own last "Results:" line for context, then clean up.
echo "$summary"
rm -f "$log" "$current"
exit $red

82
scripts/test-wasm-corpus.sh Executable file
View File

@@ -0,0 +1,82 @@
#!/bin/bash
# test-wasm-corpus.sh — W14/F2: sweep the spec test corpus through the
# SHIPPED browser kernel (sx_browser.bc.wasm.js) headless in Node.
#
# The review (conformance.md F-2) found the shipped browser artifact never
# runs the corpus — F-1/F-3 native/WASM divergences existed undetected.
# Each file runs in its OWN node process via run_wasm_corpus.js (a hang is
# killed by per-file timeout without ending the sweep).
#
# The SKIP list documents files that structurally cannot run on the browser
# kernel (runner-only bindings, native-only machinery) — the F-5/F-6/F-10
# "one-host-gated" theme, recorded honestly per file with the reason.
# KNOWN_FAIL documents files that RUN but currently have failing tests on
# the shipped kernel (host divergence, F-1/F-3 class): they execute and
# report, but don't gate. Everything else must be GREEN — exit 1 otherwise;
# a KNOWN_FAIL going green also fails (ledger must be updated).
#
# Usage: bash scripts/test-wasm-corpus.sh [file.sx ...]
# Unset variables are errors; a pipeline fails if any stage fails.
set -uo pipefail
# Relative paths below are resolved from the parent of this script's directory.
cd "$(dirname "$0")/.."
RUNNER=hosts/ocaml/browser/run_wasm_corpus.js
KERNEL=shared/static/wasm/sx_browser.bc.wasm.js
# Without the shipped kernel artifact there is nothing to sweep: exit 2.
if [[ ! -f "$KERNEL" ]]; then
  echo "SKIP: $KERNEL missing (run sx-build-all first)" >&2
  exit 2
fi
# --- classification (empirical sweep 2026-07-04; see sx-gate-loop.md) ---
# Sweep baseline: 83 files, 80 fully green, 5192 passes, 0 test failures.
# The shipped kernel even provides the CEK driver bindings (make-env,
# cek-step-loop, ...) — broader than a bare sx_server.
# Two ledgers keyed by test-file basename; the value is the recorded reason.
declare -A SKIP KNOWN_FAIL
# skip FILE REASON — record a file that is not run at all.
skip() {
  local file=$1 reason=$2
  SKIP["$file"]=$reason
}
# known FILE REASON — record a file that runs but is expected to fail.
known() {
  local file=$1 reason=$2
  KNOWN_FAIL["$file"]=$reason
}
# Partial load-errors: the kernel throws mid-file (opaque jsoo exception,
# message "undefined"); tests before the failing form pass and report.
# Table format: <file>|<reason>, one entry per line.
while IFS='|' read -r _kf_file _kf_reason; do
  known "$_kf_file" "$_kf_reason"
done <<'KNOWN_TABLE'
test-hash-table.sx|partial: 22 pass then load-error mid-file
test-r7rs.sx|partial: 87 pass then load-error mid-file
test-sets.sx|partial: 30 pass then load-error mid-file
KNOWN_TABLE
unset _kf_file _kf_reason
# Sweep: run every target file in its own Node process and classify it.
pass_total=0; fail_total=0; red=0; files=0
# Targets are the command-line arguments if any were given; otherwise every
# spec/tests/test-*.sx except test-framework.sx.
declare -a targets=()
if [[ $# -gt 0 ]]; then
  targets=("$@")
else
  for f in spec/tests/test-*.sx; do
    [[ "$(basename "$f")" == "test-framework.sx" ]] && continue
    targets+=("$f")
  done
fi
for f in "${targets[@]}"; do
  base=$(basename "$f")
  if [[ -n "${SKIP[$base]:-}" ]]; then
    # Separator added: without it the file name and the reason ran together.
    echo "SKIP: $base — ${SKIP[$base]}"
    continue
  fi
  files=$((files+1))
  # Only the CORPUS-RESULT line is kept; runner stderr is discarded.
  line=$(timeout 120 node "$RUNNER" "$f" 2>/dev/null | grep '^CORPUS-RESULT' || true)
  if [[ -z "$line" ]]; then
    echo "RED: $base — timeout or crash (no CORPUS-RESULT)"
    red=$((red+1)); continue
  fi
  p=$(sed -n 's/.*pass=\([0-9]*\).*/\1/p' <<<"$line")
  fl=$(sed -n 's/.*fail=\([0-9]*\).*/\1/p' <<<"$line")
  st=$(sed -n 's/.*status=\([a-z-]*\).*/\1/p' <<<"$line")
  # A result line without a pass= or fail= number leaves these empty, and
  # $((pass_total+)) is an arithmetic syntax error that would break the sweep.
  # Count a missing number as 0; a missing status still fails the "ok" check.
  p=${p:-0}
  fl=${fl:-0}
  pass_total=$((pass_total+p)); fail_total=$((fail_total+fl))
  if [[ -n "${KNOWN_FAIL[$base]:-}" ]]; then
    # Bidirectional ledger: a ledgered file that comes back fully green is red
    # until its entry is removed.
    if [[ "$fl" -eq 0 && "$st" == "ok" ]]; then
      echo "RED: $base — KNOWN_FAIL is now GREEN (${KNOWN_FAIL[$base]}); update the ledger"
      red=$((red+1))
    else
      echo "KNOWN-FAIL: $base pass=$p fail=$fl (${KNOWN_FAIL[$base]})"
    fi
    continue
  fi
  if [[ "$st" != "ok" || "$fl" -ne 0 ]]; then
    echo "RED: $base pass=$p fail=$fl status=$st"
    red=$((red+1))
  else
    echo "OK: $base pass=$p"
  fi
done
# Final tally; the script's exit status is that of the last test below.
echo
echo "wasm-corpus: $files files run, $pass_total passed, $fail_total failed, $red red"
[[ $red -eq 0 ]]