Merge loops/sx-ws-w14 into architecture: W14 test gate & conformance infra

17 test-only commits delivering the full W14 workstream (PLAN.md §W14 — the enabler every other sx-review fix verifies against): - spec/tests/test-gate-pins.sx: 7 pin suites (29 tests) for dc7aa709's landed fixes — K18, K20, K09/K11/K39, K49 (spec side), crit-2 (non-vacuous via side-effect sentinel), plus C21/C22 harness pins - 6 gate scripts, all bidirectional ledgers (a healed KNOWN entry also fails): test-protocol-gate (C1/C1b/S4 + C3-C7 quirk ledger + seeded fuzz-liveness, 11), test-env-parity (runner-only bindings, 7), test-harness-parity (mcp_tree vs sx_server, 12), test-wasm-corpus (shipped kernel: 80/83 files green, 5192 passes), test-suite-baseline (273-failure band pinned in spec/tests/known-failures.txt), test-differential (49 probes native vs WASM, 3 ledgered) - spec/harness.sx: C22 fix (IO logged before the mock runs) + C21 harness-run-perform (real CEK suspend/resume mode); W14-assigned per PLAN approach item 4 — see merge note in the briefing re: the forge briefing's stricter wording - C9: empty suite labels eliminated across 6 test files - web/tests/test-adapter-dom-render.sx: first render-output coverage of the DOM adapter (the browser-only exclusion was false) Confirmed handoffs recorded in the briefing: bare-server apply does not spread args (F-3, runner masks it); both runners' sha3-256 are fake stubs (test CIDs != production CIDs); generated sx_render.ml is regen- stale (misses dc7aa709's HTML_TAGS fix); canonical-serialize broken on bare server for any number. Verified post-merge in this checkout: gate pins 275/0, protocol-gate 11/0, env-parity 7/0, harness-parity 12/0, differential 49/0. Briefing conflict (add/add) resolved: kept the loop's completed version with a merge note preserving the forge briefing's context (8181421c landed after the worktree branched). Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-04 08:14:48 +00:00
parent 8ed44f7770 5e0abced32
commit 047fccc948
21 changed files with 2266 additions and 124 deletions
--- a/scripts/test-differential.sh
+++ b/scripts/test-differential.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+# test-differential.sh — W14/F8: cross-host differential battery.
+#
+# Evaluates every expression in spec/tests/differential-probes.txt on:
+#   A) the native server (sx_server.exe, epoch protocol) — its printer
+#   B) the SHIPPED browser kernel (eval_wasm_probes.js, guest sx-serialize)
+# and diffs the outputs. The review's original 130-probe corpus was
+# ephemeral (F-8); this is the committed replacement.
+#
+# KNOWN_DIVERGENT is the ledger of confirmed, still-open divergences —
+# keyed by the probe EXPRESSION. Red on a NEW divergence (host drift) and
+# red on a HEALED one (fix landed: delete the entry, locking in parity).
+#
+# Method note (finding refinement, 2026-07-04): comparing raw K.eval
+# JS-boundary values shows float-display divergences (0.3 vs
+# 0.30000000000000004) that DISAPPEAR under guest-level (sx-serialize …) —
+# the F-1 float-display class is a JS-boundary artifact, not a kernel
+# serialization divergence. This battery compares guest serialization.
+set -uo pipefail
+cd "$(dirname "$0")/.."
+
+SERVER=hosts/ocaml/_build/default/bin/sx_server.exe
+WASM=shared/static/wasm/sx_browser.bc.wasm.js
+PROBES=spec/tests/differential-probes.txt
+[[ -x "$SERVER" ]] || { echo "SKIP: $SERVER not built" >&2; exit 2; }
+[[ -f "$WASM" ]] || { echo "SKIP: $WASM missing" >&2; exit 2; }
+
+# --- KNOWN_DIVERGENT ledger (verified live 2026-07-04) -------------------
+# F-3/K53: bare sx_server's `apply` does NOT spread its argument list —
+# (apply + (list 1 2 3)) errors "Expected number, got list"; (apply str l)
+# returns the serialized list as one string. The WASM kernel spreads
+# correctly. The test runner masks this with its own apply (F-7 class).
+declare -A KNOWN_DIVERGENT
+KNOWN_DIVERGENT['(apply + (list 1 2 3))']="F-3: native apply does not spread"
+KNOWN_DIVERGENT['(apply max (list 1 5 2))']="F-3: native apply does not spread"
+KNOWN_DIVERGENT['(apply str (list "a" "b"))']="F-3: native apply does not spread"
+
+native=$(mktemp); wasm=$(mktemp)
+
+python3 - "$SERVER" "$PROBES" > "$native" <<'PY'
+import json, subprocess, sys
+server, probefile = sys.argv[1], sys.argv[2]
+probes = [s for s in map(str.strip, open(probefile)) if s and not s.startswith('#')]  # strip BEFORE the '#' test so indented comments are skipped, exactly like the shell loop's grep
+inp = []
+for i, p in enumerate(probes):
+    inp.append(f"(epoch {i+1})")  # probe N runs under epoch N; replies carry that number
+    inp.append(f"(eval {json.dumps(p)})")
+out = subprocess.run(["timeout", "120", server], input="\n".join(inp) + "\n",
+                     capture_output=True, text=True).stdout
+res, cur = {}, None
+for l in out.splitlines():
+    if l.startswith("(ok-len "):
+        cur = int(l.split()[1]); res[cur] = None  # payload is the next line
+    elif l.startswith("(error "):
+        idx = int(l.split()[1]); res[idx] = "ERROR"; cur = None
+    elif cur is not None and res.get(cur) is None:
+        res[cur] = l; cur = None
+for i, p in enumerate(probes):
+    print(f"PROBE {i+1} {res.get(i+1, '<none>')}")  # one line per probe, answered or not
+PY
+
+timeout 300 node hosts/ocaml/browser/eval_wasm_probes.js "$PROBES" > "$wasm" 2>/dev/null
+
+pass=0; fail=0; i=0
+while IFS= read -r expr; do
+  [[ -z "$expr" || "$expr" == \#* ]] && continue
+  i=$((i+1))
+  a=$(sed -n "s/^PROBE $i //p" "$native")   # native result for probe i
+  b=$(sed -n "s/^PROBE $i //p" "$wasm")     # wasm result for probe i
+  known="${KNOWN_DIVERGENT[$expr]:-}"
+  if [[ -n "$a" && "$a" == "$b" ]]; then   # empty == empty means no host answered, not parity
+    if [[ -n "$known" ]]; then
+      echo "RED:  $expr — KNOWN_DIVERGENT now AGREES ($known); delete from ledger"
+      fail=$((fail+1))
+    else
+      pass=$((pass+1))
+    fi
+  else
+    if [[ -n "$known" ]]; then
+      echo "KNOWN-DIVERGENT: $expr ($known)"
+      pass=$((pass+1))
+    else
+      echo "RED:  $expr"
+      echo "        native: $a"
+      echo "        wasm:   $b"
+      fail=$((fail+1))
+    fi
+  fi
+done < <(grep -v '^\s*#' "$PROBES" | grep -v '^\s*$')
+
+rm -f "$native" "$wasm"
+echo
+echo "differential: $i probes, $pass in agreement/ledgered, $fail red"
+[[ $fail -eq 0 ]]
--- a/scripts/test-env-parity.sh
+++ b/scripts/test-env-parity.sh
@@ -0,0 +1,100 @@
+#!/bin/bash
+# test-env-parity.sh — W14 section-B ledger: runner env vs production env.
+#
+# The review (F7, K42, JS5, core.md "canonical.sx depends on test-runner-only
+# helpers") found bindings that exist ONLY in the test runners, so suites
+# pass against an environment production never provides. Rule (PLAN.md W14):
+# "if the spec needs it, it's a kernel primitive; if not, the test can't
+# have it."
+#
+# This script is a LEDGER, not a wish: it asserts today's confirmed drift
+# stays exactly as recorded. Both directions fail loudly:
+#   - a MUST_HAVE going missing on the server  -> regression, fix the kernel
+#   - a KNOWN_DRIFT binding appearing on the server -> the fix landed;
+#     move it to MUST_HAVE and update the consequence pins below.
+#
+# Confirmed inventory (2026-07-04, all verified live over the epoch protocol):
+#
+#   binding            OCaml runner        JS runner    fresh sx_server
+#   values             real (rt.ml:1131)   ?            ABSENT
+#   call-with-values   real (rt.ml:1140)   ?            ABSENT
+#   contains-char?     real (rt.ml:728)    real (:85)   ABSENT
+#   trim-right         ABSENT              real (:87)   ABSENT
+#   sha3-256           FAKE Hashtbl.hash   FAKE stub    ABSENT (real = crypto-sha3-256)
+#
+# Consequences (pinned in section 3):
+#   - (canonical-serialize 42) on a fresh server errors "Undefined symbol:
+#     contains-char?" -> content addressing broken for ANY number outside
+#     the test runners.
+#   - every CID computed inside run_tests uses a FAKE hash, so test CIDs
+#     never equal production CIDs (crypto-sha3-256 is real SHA3).
+#
+# Each probe spawns its OWN timeout-bounded sx_server.exe. No shared process.
+set -uo pipefail
+
+cd "$(dirname "$0")/.."
+SERVER=hosts/ocaml/_build/default/bin/sx_server.exe
+
+if [[ ! -x "$SERVER" ]]; then
+  echo "SKIP: $SERVER not built (run sx_build target=ocaml first)" >&2
+  exit 2
+fi
+
+pass=0
+fail=0
+
+# deps_unresolved EXPR -> runs (deps-check "EXPR") on a fresh 60s-bounded server and prints each ":unresolved (...)" fragment of the reply; prints nothing (and still returns 0) when the reply has none
+deps_unresolved() {
+  printf '(epoch 1)\n(deps-check "%s")\n' "$1" \
+    | timeout 60 "$SERVER" 2>/dev/null \
+    | grep -o ':unresolved ([^)]*)' || true
+}
+
+# --- Section 1: MUST_HAVE — spec-needed bindings production must provide ---
+MUST_HAVE_EXPR='(list (equal? 1 1) (apply + (list 1 2)) (contains? {:a 1} :a) (crypto-sha3-256 \"x\") (split \"a-b\" \"-\"))'
+unres=$(deps_unresolved "$MUST_HAVE_EXPR")
+if [[ -z "$unres" || "$unres" == ':unresolved ()' ]]; then
+  echo "PASS: MUST_HAVE core bindings all resolve on fresh sx_server"
+  pass=$((pass+1))
+else
+  echo "FAIL: MUST_HAVE binding missing on fresh sx_server: $unres"
+  fail=$((fail+1))
+fi
+
+# --- Section 2: KNOWN_DRIFT — runner-only bindings, asserted ABSENT -------
+# If one of these starts resolving, its kernel fix landed: move it to
+# MUST_HAVE above and update the consequence pin in section 3.
+for name in values call-with-values 'contains-char?' trim-right sha3-256; do   # quoted: a bare '?' is a glob
+  unres=$(deps_unresolved "($name)")
+  if grep -qF -- "$name" <<<"$unres"; then   # -F: binding names are literals, not regexes
+    echo "PASS: KNOWN_DRIFT '$name' still absent on fresh sx_server (ledger accurate)"
+    pass=$((pass+1))
+  else
+    echo "FAIL: KNOWN_DRIFT '$name' now RESOLVES on fresh sx_server — fix landed?"
+    echo "      Update this ledger: move '$name' to MUST_HAVE and revisit section 3."
+    fail=$((fail+1))
+  fi
+done
+
+# --- Section 3: consequence pin — canonical.sx on the production server ---
+# Current reality: canonical-serialize of ANY number errors on a fresh
+# server because canonical-number calls runner-only contains-char?.
+out=$(printf '(epoch 1)\n(load "spec/canonical.sx")\n(epoch 2)\n(eval "(canonical-serialize 42)")\n' \
+  | timeout 60 "$SERVER" 2>&1)
+if grep -q 'error 2 .*contains-char?' <<<"$out"; then
+  echo "PASS: consequence pin — canonical-serialize on numbers still broken on server (as recorded)"
+  pass=$((pass+1))
+elif grep -q '^(ok 2 ' <<<"$out"; then
+  echo "FAIL: consequence pin — canonical-serialize 42 now WORKS on the server."
+  echo "      The canonical-helpers fix landed: flip this pin to assert success"
+  echo "      and pin the exact canonical form + CID stability."
+  fail=$((fail+1))
+else
+  echo "FAIL: consequence pin — unexpected server output:"
+  sed 's/^/  /' <<<"$out"
+  fail=$((fail+1))
+fi
+
+echo
+echo "env-parity: $pass passed, $fail failed"
+[[ $fail -eq 0 ]]
--- a/scripts/test-harness-parity.sh
+++ b/scripts/test-harness-parity.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+# test-harness-parity.sh — W14 section-C pin for K19 (harness honesty).
+#
+# K19 (review, core.md): the MCP tree server (mcp_tree.ml) carries a
+# PARALLEL primitive table, and it drifted from the real runtime
+# (sx_primitives.ml) — e.g. (get {:a 1} :a 99) returned nil in the harness
+# but 1 in production, (split "a--b" "--") was char-class vs substring.
+# CLAUDE.md mandates harness verification, so drift silently produces
+# false findings/passes. dc7aa709 aligned 8 entries as a stopgap; the real
+# fix (mcp_tree links sx_primitives directly) is hosts-lane work.
+#
+# This pin runs the finding's exact probe battery through BOTH environments
+# — mcp_tree.exe sx_eval (JSON-RPC over stdio) and a fresh sx_server.exe
+# (epoch protocol) — and fails on ANY divergence. Errors are compared by
+# message, values by serialized form. Both subprocesses are fresh and
+# timeout-bounded; no shared process is touched.
+#
+# Exit: 0 = full parity; 1 = drift (harness lies about the runtime again); 2 = skipped (a binary is not built).
+set -uo pipefail
+cd "$(dirname "$0")/.."
+
+MCP=hosts/ocaml/_build/default/bin/mcp_tree.exe
+SERVER=hosts/ocaml/_build/default/bin/sx_server.exe
+for bin in "$MCP" "$SERVER"; do
+  if [[ ! -x "$bin" ]]; then
+    echo "SKIP: $bin not built (run sx_build target=ocaml first)" >&2
+    exit 2
+  fi
+done
+
+python3 - "$MCP" "$SERVER" <<'PYEOF'
+import json, re, subprocess, sys
+
+MCP, SERVER = sys.argv[1], sys.argv[2]
+
+# K19 probe battery — the finding's confirmed drift cases + stopgap entries.
+PROBES = [
+    '(empty? "")', '(empty? {})',
+    '(get {:a 1} :a 99)', '(get {:a 1} :zz 99)', '(get (list 10 20) 1)',
+    '(split "a--b" "--")', '(split "abc" "")',
+    '(equal? (list 1 2) (list 1 2))',
+    '(contains? {:a 1} :a)', '(keyword-name :kw)',
+    '(char-code "A")', '(parse-number "42")',
+]
+
+def norm_error(msg):
+    """Reduce an error envelope to '<ERROR> ' + the quoted 'Unhandled exception:' text (or, failing
+    that, the first 80 chars of the stripped message) so harness and server errors compare equal."""
+    m = re.search(r'Unhandled exception: \\?"(.*?)\\?"', msg)
+    if m:
+        return "<ERROR> " + m.group(1)
+    return "<ERROR> " + msg.strip()[:80]
+
+# --- harness side: mcp_tree sx_eval over JSON-RPC ---
+lines = [
+    json.dumps({"jsonrpc": "2.0", "id": 1, "method": "initialize",
+                "params": {"protocolVersion": "2024-11-05", "capabilities": {},
+                           "clientInfo": {"name": "parity", "version": "0"}}}),
+    json.dumps({"jsonrpc": "2.0", "method": "notifications/initialized"}),
+]
+for i, p in enumerate(PROBES):
+    lines.append(json.dumps({"jsonrpc": "2.0", "id": 100 + i, "method": "tools/call",
+                             "params": {"name": "sx_eval", "arguments": {"expr": p}}}))
+out = subprocess.run(["timeout", "60", MCP], input="\n".join(lines) + "\n",
+                     capture_output=True, text=True).stdout
+harness = {}
+for l in out.splitlines():
+    try:
+        j = json.loads(l)
+    except ValueError:
+        continue
+    if isinstance(j.get("id"), int) and j["id"] >= 100:
+        txt = j.get("result", {}).get("content", [{}])[0].get("text", "<none>").strip()
+        if txt.startswith("Error:") or j.get("result", {}).get("isError"):
+            txt = norm_error(txt)
+        harness[j["id"] - 100] = txt
+
+# --- server side: fresh sx_server over the epoch protocol ---
+inp = []
+for i, p in enumerate(PROBES):
+    inp.append(f"(epoch {i + 1})")
+    inp.append(f"(eval {json.dumps(p)})")
+sout = subprocess.run(["timeout", "60", SERVER], input="\n".join(inp) + "\n",
+                      capture_output=True, text=True).stdout
+server, cur = {}, None
+for l in sout.splitlines():
+    if l.startswith("(ok-len "):
+        cur = int(l.split()[1]); server[cur - 1] = None
+    elif l.startswith("(error "):
+        idx = int(l.split()[1]); server[idx - 1] = norm_error(l); cur = None
+    elif cur is not None and server.get(cur - 1) is None:
+        server[cur - 1] = l.strip(); cur = None
+
+fails = 0
+for i, p in enumerate(PROBES):
+    h = harness.get(i, "<missing>")
+    s = server.get(i, "<missing>")
+    if h == s:
+        print(f"PASS: {p:40s} both -> {h!r}")
+    else:
+        print(f"FAIL: {p:40s} harness={h!r} server={s!r}")
+        fails += 1
+
+print()
+print(f"harness-parity: {len(PROBES) - fails} passed, {fails} failed")
+sys.exit(1 if fails else 0)
+PYEOF
--- a/scripts/test-protocol-gate.sh
+++ b/scripts/test-protocol-gate.sh
@@ -0,0 +1,233 @@
+#!/bin/bash
+# test-protocol-gate.sh — W14 pins for the epoch/command-channel protocol.
+#
+# Pins C1/C1b (review, plans/sx-review/hosts.md): a malformed or non-ASCII
+# line on the top-level command channel used to raise an uncaught
+# Sx_types.Parse_error and KILL the whole sx_server process (the shared
+# channel used by bridges and conformance runners). Fixed in dc7aa709:
+# the server now answers `(error N "Malformed command line: ...")` and
+# keeps serving.
+#
+# Each case spawns its OWN timeout-bounded sx_server.exe subprocess —
+# no shared/sibling process is ever touched. Designed to grow into the
+# W14 section-E protocol fuzz suite (C3-C7).
+#
+# Usage: bash scripts/test-protocol-gate.sh
+# Exit:  0 = all pins green; 1 = a pin failed (fix regressed); 2 = skipped (server not built).
+set -uo pipefail
+
+cd "$(dirname "$0")/.."
+SERVER=hosts/ocaml/_build/default/bin/sx_server.exe
+
+if [[ ! -x "$SERVER" ]]; then
+  echo "SKIP: $SERVER not built (run sx_build target=ocaml first)" >&2
+  exit 2
+fi
+
+pass=0
+fail=0
+
+# run_case NAME INPUT EXPECT_SENTINEL
+#   Feeds INPUT to a fresh server. Asserts:
+#   1. an (error ... "Malformed command line: ...") response is emitted
+#   2. the follow-up epoch still evaluates (EXPECT_SENTINEL in output)
+#   3. the process exits cleanly (no Fatal error, exit 0 on stdin EOF)
+run_case() {
+  local label="$1" payload="$2" marker="$3"
+  local transcript status healthy=1
+  transcript=$(printf '%b' "$payload" | timeout 60 "$SERVER" 2>&1)
+  status=$?
+  # (1) the malformed line must be answered with an error, not silence
+  grep -q 'Malformed command line' <<<"$transcript" \
+    || { echo "FAIL: $label — no malformed-line error response"; healthy=0; }
+  # (2) the follow-up epoch's value must appear alone on a line
+  grep -q "^${marker}\$" <<<"$transcript" \
+    || { echo "FAIL: $label — follow-up epoch did not run (process died?)"; healthy=0; }
+  # (3) nothing fatal may reach the top level ...
+  ! grep -q 'Fatal error' <<<"$transcript" \
+    || { echo "FAIL: $label — Fatal error escaped to the top level"; healthy=0; }
+  # ... and the pipeline must finish with status 0
+  [[ $status -eq 0 ]] \
+    || { echo "FAIL: $label — nonzero exit ($status)"; healthy=0; }
+  # Dump the whole transcript only when at least one check failed.
+  if [[ $healthy -ne 1 ]]; then
+    echo "  --- output ---"; sed 's/^/  /' <<<"$transcript"; echo "  --------------"
+    fail=$((fail+1))
+    return
+  fi
+  echo "PASS: $label"
+  pass=$((pass+1))
+}
+
+# C1: unterminated list on the command channel (exact review repro)
+run_case "C1 unterminated list survives" \
+  '(epoch 2)\n(eval "(+ 1 2"\n(epoch 3)\n(eval "99")\n' \
+  '99'
+
+# C1: plain-garbage line (second C1 repro shape)
+run_case "C1 garbage line survives" \
+  '(epoch 1)\nnot an s-expr ]]] {{{\n(epoch 2)\n(eval "42")\n' \
+  '42'
+
+# C1b: non-ASCII byte on the command channel (exact review repro; \xc3\xa9 = é)
+run_case "C1b non-ASCII line survives" \
+  '(epoch 1)\n(eval (quote caf\xc3\xa9))\n(epoch 2)\n(eval "99")\n' \
+  '99'
+
+# Control: a well-formed session still works end to end
+ctrl=$(printf '(epoch 1)\n(eval "(+ 40 2)")\n' | timeout 60 "$SERVER" 2>&1)
+if grep -q '^42$' <<<"$ctrl"; then
+  echo "PASS: control well-formed session"
+  pass=$((pass+1))
+else
+  echo "FAIL: control well-formed session"; sed 's/^/  /' <<<"$ctrl"
+  fail=$((fail+1))
+fi
+
+# ---------------------------------------------------------------------------
+# C3–C7 protocol-quirk LEDGER (hosts.md, all OPEN server-side). These pin
+# CURRENT behavior, verified live 2026-07-04 — they are documentation, not
+# endorsement. When a server fix lands and a pin fails, update the ledger
+# to assert the corrected behavior (bidirectional, like test-env-parity.sh).
+# ---------------------------------------------------------------------------
+
+# ledger_case NAME INPUT GREP_MUST GREP_MUST2
+ledger_case() {
+  local name="$1" input="$2" must="$3" must2="${4:-}"
+  local out
+  out=$(printf '%b' "$input" | timeout 60 "$SERVER" 2>&1)
+  local ok=1
+  grep -q -- "$must" <<<"$out" || { echo "FAIL: $name — expected: $must"; ok=0; }
+  if [[ -n "$must2" ]]; then
+    grep -q -- "$must2" <<<"$out" || { echo "FAIL: $name — expected: $must2"; ok=0; }
+  fi
+  if grep -q 'Fatal error' <<<"$out"; then
+    echo "FAIL: $name — process died"; ok=0
+  fi
+  if [[ $ok -eq 1 ]]; then echo "PASS: $name"; pass=$((pass+1));
+  else echo "  --- output ---"; sed 's/^/  /' <<<"$out"; fail=$((fail+1)); fi
+}
+
+# C3: stray (io-response ...) is answered as Unknown command (dead guard) —
+# an EXTRA response the client didn't ask for; process keeps serving.
+ledger_case "C3 ledger: stray io-response gets an extra error reply" \
+  '(epoch 1)\n(io-response 1 42)\n(eval "5")\n' \
+  'Unknown command: (io-response 1 42)' '^5$'
+
+# C4: malformed (epoch) doesn't update the epoch — next reply tagged with
+# the OLD epoch (0 here), i.e. stale from the client's viewpoint.
+ledger_case "C4 ledger: malformed epoch marker leaves epoch stale" \
+  '(epoch)\n(eval "2")\n' \
+  '(ok-len 0 1)' '^2$'
+
+# C5: no monotonic-epoch enforcement — a decreasing epoch is accepted.
+ledger_case "C5 ledger: decreasing epoch accepted silently" \
+  '(epoch 9)\n(epoch 3)\n(eval "42")\n' \
+  '(ok-len 3 2)' '^42$'
+
+# C6: two commands on one line -> one error, NEITHER executed.
+ledger_case "C6 ledger: two commands on one line both dropped" \
+  '(epoch 1)\n(eval "1") (eval "2")\n(eval "3")\n' \
+  'Expected single command, got 2' '^3$'
+
+# C7: vm-trace without the compiler loaded errors opaquely.
+ledger_case "C7 ledger: vm-trace sans compiler is opaque Not-callable-nil" \
+  '(epoch 1)\n(vm-trace "(+ 1 2)")\n' \
+  'Not callable: nil'
+
+# ---------------------------------------------------------------------------
+# Fuzz-liveness property: after 60 deterministic hostile lines (unbalanced
+# parens, control chars, unicode, long lines, stray io-responses, epoch
+# mutations), the server must still answer a well-formed command and exit
+# cleanly. Seeded PRNG — reproducible corpus.
+# ---------------------------------------------------------------------------
+fuzz=$(python3 - <<'PY'
+import random
+r = random.Random(1404)
+lines = []
+frag = ['(', ')', '((', '))', '(eval', '(epoch', 'io-response', '"', '\\',
+        'café', '\x01', '\x1b[2J', ':kw', '{', '}', '(+ 1', 'nil)', '#|', '|#']
+for i in range(60):
+    kind = r.randrange(5)
+    if kind == 0:
+        lines.append(''.join(r.choice(frag) for _ in range(r.randrange(1, 8))))
+    elif kind == 1:
+        lines.append('(epoch ' + r.choice(['', 'foo', '-1', '999999999999999999999', ')']) + ')')
+    elif kind == 2:
+        lines.append('(io-response %d %s' % (r.randrange(99), r.choice([')', '', '42']) ))
+    elif kind == 3:
+        lines.append('x' * r.randrange(200, 2000))
+    else:
+        lines.append('(eval "' + r.choice(['(+ 1', '(list', '\\\\', '((((']) + '")')
+print('\n'.join(lines))
+PY
+)
+out=$(printf '%s\n(epoch 777)\n(eval "\\"alive\\"")\n' "$fuzz" | timeout 90 "$SERVER" 2>&1)
+rc=$?
+if grep -q '^"alive"$' <<<"$out" && ! grep -q 'Fatal error' <<<"$out" && [[ $rc -eq 0 ]]; then
+  echo "PASS: fuzz-liveness — server survives 60 hostile lines and still answers"
+  pass=$((pass+1))
+else
+  echo "FAIL: fuzz-liveness (rc=$rc)"; tail -6 <<<"$out" | sed 's/^/  /'
+  fail=$((fail+1))
+fi
+
+# ---------------------------------------------------------------------------
+# S4 (review, hosts.md): soft error pages must NOT be stored in the HTTP
+# response cache. Pre-fix, a routing-failure page was cached as HTTP 200 and
+# served byte-identically from cache to every later visitor (cold 2s → warm
+# 0.0005s, ONE render line). Post-fix (dc7aa709), http_render_page returns
+# (html, is_error) and cache insertion is gated on `not is_err` (the skip is
+# logged as "[cache] <path> → error page, not cached").
+#
+# Pin: GET the same nonexistent path twice against a fresh --http server and
+# assert BOTH requests re-render (two [sx-http] render lines) plus the
+# is_err gate line appearing in the log. NB: in a standalone worktree all
+# docs pages render as soft error pages (no content), so a positive
+# "real page IS cached" control is not assertable here.
+# ---------------------------------------------------------------------------
+s4_case() {
+  local port=$((18000 + RANDOM % 2000))
+  local log; log=$(mktemp)
+  timeout 90 "$SERVER" --http "$port" >"$log" 2>&1 &
+  local srv=$!
+  local up=0
+  for _ in $(seq 1 40); do
+    if curl -s -o /dev/null "http://localhost:$port/" 2>/dev/null; then up=1; break; fi
+    sleep 1
+  done
+  if [[ $up -ne 1 ]]; then
+    echo "FAIL: S4 — http server did not come up on :$port"
+    kill "$srv" 2>/dev/null; rm -f "$log"
+    fail=$((fail+1)); return
+  fi
+  local miss="/sx/gate-pin-missing-$$-$RANDOM"
+  curl -s -o /dev/null "http://localhost:$port$miss"
+  curl -s -o /dev/null "http://localhost:$port$miss"
+  sleep 1
+  local renders
+  renders=$(grep -c "sx-http\] $miss " "$log")
+  local ok=1
+  if [[ "$renders" -ne 2 ]]; then
+    echo "FAIL: S4 — expected 2 renders of $miss (not cache-served), got $renders"
+    ok=0
+  fi
+  if ! grep -q 'error page, not cached' "$log"; then
+    echo "FAIL: S4 — is_err cache gate line absent from server log"
+    ok=0
+  fi
+  if [[ $ok -eq 1 ]]; then
+    echo "PASS: S4 soft error page not cached (both GETs re-rendered)"
+    pass=$((pass+1))
+  else
+    echo "  --- log tail ---"; tail -12 "$log" | sed 's/^/  /'; echo "  ---------------"
+    fail=$((fail+1))
+  fi
+  kill "$srv" 2>/dev/null
+  rm -f "$log"
+}
+s4_case
+
+echo
+echo "protocol-gate: $pass passed, $fail failed"
+[[ $fail -eq 0 ]]
--- a/scripts/test-suite-baseline.sh
+++ b/scripts/test-suite-baseline.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# test-suite-baseline.sh — W14/F10: make FAIL mean something again.
+#
+# The review (conformance.md F-10): the OCaml suite is not green — a
+# permanent ~274-failure band (in-progress hs-* + r7rs radix shadow) is
+# normalized, so real regressions hide inside the red noise and nobody can
+# tell a new failure from the band.
+#
+# This gate pins the band instead of ignoring it: the full suite's FAIL
+# set is diffed against the checked-in baseline
+# (spec/tests/known-failures.txt). Two red conditions, both loud:
+#   NEW failure      -> a real regression: fix it (or, if intentional,
+#                       justify + add to the baseline in the same commit)
+#   VANISHED failure -> something got fixed: delete it from the baseline
+#                       so the win is locked in
+# Neither touches the runner or the hs loops' scoreboards — the band still
+# prints as FAIL lines for the teams working through it.
+#
+# Usage: bash scripts/test-suite-baseline.sh
+# Runtime: full suite, ~5–15 min. Exit 0 = fail set identical to baseline.
+set -uo pipefail
+cd "$(dirname "$0")/.."
+
+RUNNER=hosts/ocaml/_build/default/bin/run_tests.exe
+BASELINE=spec/tests/known-failures.txt
+[[ -x "$RUNNER" ]] || { echo "SKIP: $RUNNER not built" >&2; exit 2; }
+[[ -f "$BASELINE" ]] || { echo "SKIP: $BASELINE missing" >&2; exit 2; }
+
+log=$(mktemp)
+timeout 3000 "$RUNNER" > "$log" 2>&1
+rc=$?
+if [[ $rc -ne 0 && $rc -ne 1 ]]; then
+  echo "RED: runner exited $rc (timeout/crash)"; tail -5 "$log"; rm -f "$log"; exit 1
+fi
+
+# Normalize: keep the stable test identity (suite > name), drop messages
+# (error text may contain addresses/timings that churn).
+current=$(mktemp)
+grep '^  FAIL: ' "$log" | sed 's/^  FAIL: //; s/: .*$//' | sort -u > "$current"
+
+new_failures=$(comm -13 <(sort -u "$BASELINE") "$current")
+vanished=$(comm -23 <(sort -u "$BASELINE") "$current")
+
+summary=$(grep '^Results:' "$log" | tail -1)
+red=0
+if [[ -n "$new_failures" ]]; then
+  echo "RED: NEW failures not in baseline:"
+  sed 's/^/  + /' <<<"$new_failures"
+  red=1
+fi
+if [[ -n "$vanished" ]]; then
+  echo "RED: baseline entries now PASSING (delete them from $BASELINE):"
+  sed 's/^/  - /' <<<"$vanished"
+  red=1
+fi
+if [[ $red -eq 0 ]]; then
+  echo "GREEN: fail set identical to baseline ($(wc -l < "$BASELINE") known failures)"
+fi
+echo "$summary"
+rm -f "$log" "$current"
+exit $red
--- a/scripts/test-wasm-corpus.sh
+++ b/scripts/test-wasm-corpus.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# test-wasm-corpus.sh — W14/F2: sweep the spec test corpus through the
+# SHIPPED browser kernel (sx_browser.bc.wasm.js) headless in Node.
+#
+# The review (conformance.md F-2) found the shipped browser artifact never
+# runs the corpus — F-1/F-3 native/WASM divergences existed undetected.
+# Each file runs in its OWN node process via run_wasm_corpus.js (a hang is
+# killed by per-file timeout without ending the sweep).
+#
+# The SKIP list documents files that structurally cannot run on the browser
+# kernel (runner-only bindings, native-only machinery) — the F-5/F-6/F-10
+# "one-host-gated" theme, recorded honestly per file with the reason.
+# KNOWN_FAIL documents files that RUN but currently have failing tests on
+# the shipped kernel (host divergence, F-1/F-3 class): they execute and
+# report, but don't gate. Everything else must be GREEN — exit 1 otherwise;
+# a KNOWN_FAIL going green also fails (ledger must be updated).
+#
+# Usage: bash scripts/test-wasm-corpus.sh [file.sx ...]
+set -uo pipefail
+cd "$(dirname "$0")/.."
+
+RUNNER=hosts/ocaml/browser/run_wasm_corpus.js
+KERNEL=shared/static/wasm/sx_browser.bc.wasm.js
+[[ -f "$KERNEL" ]] || { echo "SKIP: $KERNEL missing (run sx-build-all first)" >&2; exit 2; }
+
+# --- classification (empirical sweep 2026-07-04; see sx-gate-loop.md) ---
+# Sweep baseline: 83 files, 80 fully green, 5192 passes, 0 test failures.
+# The shipped kernel even provides the CEK driver bindings (make-env,
+# cek-step-loop, ...) — broader than a bare sx_server.
+declare -A SKIP KNOWN_FAIL
+skip()  { local file=$1 why=$2; SKIP["$file"]=$why; }         # record FILE -> reason in SKIP
+known() { local file=$1 why=$2; KNOWN_FAIL["$file"]=$why; }   # record FILE -> reason in KNOWN_FAIL
+# Partial load-errors: the kernel throws mid-file (opaque jsoo exception,
+# message "undefined"); tests before the failing form pass and report.
+known test-hash-table.sx "partial: 22 pass then load-error mid-file"
+known test-r7rs.sx       "partial: 87 pass then load-error mid-file"
+known test-sets.sx       "partial: 30 pass then load-error mid-file"
+
+pass_total=0; fail_total=0; red=0; files=0
+declare -a targets
+if [[ $# -gt 0 ]]; then targets=("$@");
+else for f in spec/tests/test-*.sx; do
+  [[ "$(basename "$f")" == "test-framework.sx" ]] && continue
+  targets+=("$f")
+done; fi
+
+for f in "${targets[@]}"; do
+  base=$(basename "$f")
+  if [[ -n "${SKIP[$base]:-}" ]]; then
+    echo "SKIP: $base — ${SKIP[$base]}"
+    continue
+  fi
+  files=$((files+1))
+  line=$(timeout 120 node "$RUNNER" "$f" 2>/dev/null | grep '^CORPUS-RESULT' || true)   # one node process per file
+  if [[ -z "$line" ]]; then
+    echo "RED:  $base — timeout or crash (no CORPUS-RESULT)"
+    red=$((red+1)); continue
+  fi
+  p=$(sed -n 's/.*pass=\([0-9]*\).*/\1/p' <<<"$line")
+  fl=$(sed -n 's/.*fail=\([0-9]*\).*/\1/p' <<<"$line")
+  st=$(sed -n 's/.*status=\([a-z-]*\).*/\1/p' <<<"$line")
+  p=${p:-0}; fl=${fl:-0}; pass_total=$((pass_total+p)); fail_total=$((fail_total+fl))   # unparsable counts -> 0, keeps the arithmetic valid
+  if [[ -n "${KNOWN_FAIL[$base]:-}" ]]; then
+    if [[ "$fl" -eq 0 && "$st" == "ok" ]]; then   # healed ledger entry is red too
+      echo "RED:  $base — KNOWN_FAIL is now GREEN (${KNOWN_FAIL[$base]}); update the ledger"
+      red=$((red+1))
+    else
+      echo "KNOWN-FAIL: $base pass=$p fail=$fl (${KNOWN_FAIL[$base]})"
+    fi
+    continue
+  fi
+  if [[ "$st" != "ok" || "$fl" -ne 0 ]]; then
+    echo "RED:  $base pass=$p fail=$fl status=$st"
+    red=$((red+1))
+  else
+    echo "OK:   $base pass=$p"
+  fi
+done
+
+echo
+echo "wasm-corpus: $files files run, $pass_total passed, $fail_total failed, $red red"
+[[ $red -eq 0 ]]