From 01e5f876bc614d9355c6f1971460c64fd0137653 Mon Sep 17 00:00:00 2001 From: giles Date: Sat, 4 Jul 2026 01:41:07 +0000 Subject: [PATCH] W14: pin K19 MCP-harness/runtime primitive parity (test-only) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mcp_tree.ml's parallel primitive table drifted from sx_primitives.ml — the spec-mandated harness verification path silently produced false findings ((get {:a 1} :a 99) -> nil vs 1, char-class vs substring split, etc.). dc7aa709 aligned 8 entries as a stopgap; the real fix (linking sx_primitives) is hosts-lane. Add scripts/test-harness-parity.sh: drives mcp_tree.exe sx_eval via raw JSON-RPC and a fresh sx_server.exe via the epoch protocol, runs the finding's 12-probe battery through both, fails on any divergence (errors compared by inner message). 12/12 parity today — the stopgap holds and can no longer rot silently. Test-only: no semantics edits, no push. Co-Authored-By: Claude Fable 5 --- plans/agent-briefings/sx-gate-loop.md | 16 +++- scripts/test-harness-parity.sh | 107 ++++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 2 deletions(-) create mode 100755 scripts/test-harness-parity.sh diff --git a/plans/agent-briefings/sx-gate-loop.md b/plans/agent-briefings/sx-gate-loop.md index 3ce4f562..d14be3b6 100644 --- a/plans/agent-briefings/sx-gate-loop.md +++ b/plans/agent-briefings/sx-gate-loop.md @@ -66,8 +66,9 @@ Pin each confirmed-and-fixed finding with a minimal repro. Add suites to stubs → test CIDs ≠ production CIDs) ### C. 
Harness honesty -- [ ] K19 — MCP `mcp_tree.ml` harness primitive table drift vs `sx_primitives` - (parity test) +- [x] K19 — harness/runtime parity pinned (`scripts/test-harness-parity.sh`: + drives mcp_tree sx_eval over JSON-RPC vs fresh sx_server over epoch, + 12-probe battery from the finding, errors compared by message) - [ ] C22/K104 — harness logs IO *before* invoking the mock (throwing-mock pin) - [ ] C21 — real perform/suspend mode in harness - [ ] C23 — adapter-dom render-output tests @@ -85,6 +86,17 @@ Pin each confirmed-and-fixed finding with a minimal repro. Add suites to ## Progress log (newest first) +- 2026-07-04 — **K19 harness-parity pin (item C.1)**. Authored + `scripts/test-harness-parity.sh`: drives `mcp_tree.exe` `sx_eval` with + raw JSON-RPC over stdio and a fresh `sx_server.exe` over the epoch + protocol, running the finding's exact 12-probe battery (empty?/get/ + split/equal?/contains?/keyword-name/char-code/parse-number) through both + and failing on ANY divergence. Errors normalized to their inner message + so identical failures compare equal (`keyword-name :kw` errors the same + way on both — keywords evaluate to strings before the call). Result: + 12/12 parity — dc7aa709's 8-entry stopgap alignment holds; this pin keeps + it honest until the real fix (mcp_tree links sx_primitives) lands in the + hosts lane. Test-only. - 2026-07-04 — **Section B: env-parity audit + ledger**. Probed a fresh `sx_server` over the epoch protocol (`deps-check` + live eval). Confirmed runner-only drift: `values`/`call-with-values` (run_tests.ml:1131/1140), diff --git a/scripts/test-harness-parity.sh b/scripts/test-harness-parity.sh new file mode 100755 index 00000000..1192ac3c --- /dev/null +++ b/scripts/test-harness-parity.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# test-harness-parity.sh — W14 section-C pin for K19 (harness honesty). 
+#
+# K19 (review, core.md): the MCP tree server (mcp_tree.ml) carries a
+# PARALLEL primitive table, and it drifted from the real runtime
+# (sx_primitives.ml) — e.g. (get {:a 1} :a 99) returned nil in the harness
+# but 1 in production, (split "a--b" "--") was char-class vs substring.
+# CLAUDE.md mandates harness verification, so drift silently produces
+# false findings/passes. dc7aa709 aligned 8 entries as a stopgap; the real
+# fix (mcp_tree links sx_primitives directly) is hosts-lane work.
+#
+# This pin runs the finding's exact probe battery through BOTH environments
+# — mcp_tree.exe sx_eval (JSON-RPC over stdio) and a fresh sx_server.exe
+# (epoch protocol) — and fails on ANY divergence. Errors are compared by
+# message, values by serialized form. Both subprocesses are fresh and
+# timeout-bounded; no shared process is touched.
+#
+# Exit: 0 = full parity; 1 = drift (harness lies about the runtime again); 2 = SKIP (a binary is not built).
+set -uo pipefail  # no -e: the script's status is that of its last command, the python3 heredoc
+cd "$(dirname "$0")/.."  # parent of scripts/, so the relative binary paths below resolve
+
+MCP=hosts/ocaml/_build/default/bin/mcp_tree.exe
+SERVER=hosts/ocaml/_build/default/bin/sx_server.exe
+for bin in "$MCP" "$SERVER"; do
+  if [[ ! -x "$bin" ]]; then
+    echo "SKIP: $bin not built (run sx_build target=ocaml first)" >&2
+    exit 2  # skipped, not drift: status 1 is reserved for divergence
+  fi
+done
+
+python3 - "$MCP" "$SERVER" <<'PYEOF'
+import json, re, subprocess, sys
+
+MCP, SERVER = sys.argv[1], sys.argv[2]
+
+# K19 probe battery — the finding's confirmed drift cases + stopgap entries.
+PROBES = [
+    '(empty? "")', '(empty? {})',
+    '(get {:a 1} :a 99)', '(get {:a 1} :zz 99)', '(get (list 10 20) 1)',
+    '(split "a--b" "--")', '(split "abc" "")',
+    '(equal? (list 1 2) (list 1 2))',
+    '(contains? {:a 1} :a)', '(keyword-name :kw)',
+    '(char-code "A")', '(parse-number "42")',
+]
+
+def norm_error(msg):
+    """Extract the quoted inner error message so harness/server error
+    envelopes compare equal when the underlying failure is the same."""
+    m = re.search(r'Unhandled exception: \\?"(.*?)\\?"', msg)
+    if m:
+        return " " + m.group(1)
+    return " " + msg.strip()[:80]  # no quoted message found: fall back to the first 80 chars
+
+# --- harness side: mcp_tree sx_eval over JSON-RPC ---
+lines = [
+    json.dumps({"jsonrpc": "2.0", "id": 1, "method": "initialize",
+                "params": {"protocolVersion": "2024-11-05", "capabilities": {},
+                           "clientInfo": {"name": "parity", "version": "0"}}}),
+    json.dumps({"jsonrpc": "2.0", "method": "notifications/initialized"}),
+]
+for i, p in enumerate(PROBES):  # probe i is sent with request id 100 + i
+    lines.append(json.dumps({"jsonrpc": "2.0", "id": 100 + i, "method": "tools/call",
+                             "params": {"name": "sx_eval", "arguments": {"expr": p}}}))
+out = subprocess.run(["timeout", "60", MCP], input="\n".join(lines) + "\n",
+                     capture_output=True, text=True).stdout
+harness = {}  # probe index -> result text (or normalized error)
+for l in out.splitlines():
+    try:
+        j = json.loads(l)
+    except ValueError:
+        continue  # skip any stdout line that is not JSON
+    if isinstance(j.get("id"), int) and j["id"] >= 100:  # probe replies only; skips the initialize reply (id 1)
+        txt = j.get("result", {}).get("content", [{}])[0].get("text", "").strip()
+        if txt.startswith("Error:") or j.get("result", {}).get("isError"):
+            txt = norm_error(txt)
+        harness[j["id"] - 100] = txt  # key by probe index
+
+# --- server side: fresh sx_server over the epoch protocol ---
+inp = []
+for i, p in enumerate(PROBES):
+    inp.append(f"(epoch {i + 1})")  # epochs are 1-based: probe i runs in epoch i + 1
+    inp.append(f"(eval {json.dumps(p)})")  # json.dumps quotes/escapes the probe as a string literal
+sout = subprocess.run(["timeout", "60", SERVER], input="\n".join(inp) + "\n",
+                      capture_output=True, text=True).stdout
+server, cur = {}, None  # cur = epoch whose value line is still awaited
+for l in sout.splitlines():
+    if l.startswith("(ok-len "):
+        cur = int(l.split()[1]); server[cur - 1] = None  # assumes the value is on the next line — TODO confirm against the epoch protocol
+    elif l.startswith("(error "):
+        idx = int(l.split()[1]); server[idx - 1] = norm_error(l); cur = None
+    elif cur is not None and server.get(cur - 1) is None:
+        server[cur - 1] = l.strip(); cur = None  # first line after ok-len is taken as the value
+
+fails = 0
+# Parity needs evidence: a probe passes only if BOTH sides answered and agree.
+for i, p in enumerate(PROBES):
+    h, s = harness.get(i), server.get(i)  # None = that side produced no result
+    if h and h == s:  # missing/empty on both sides must not count as a match
+        print(f"PASS: {p:40s} both -> {h!r}")
+    else:
+        print(f"FAIL: {p:40s} harness={h!r} server={s!r}")
+        fails += 1
+
+print()
+print(f"harness-parity: {len(PROBES) - fails} passed, {fails} failed")
+sys.exit(1 if fails else 0)  # becomes the script's exit status (last command)
+PYEOF