forth: Hayes conformance runner + baseline scoreboard (165/590, 28%)

2026-04-24 19:13:45 +00:00
parent a47b3e5420
commit 0e509af0a2
5 changed files with 370 additions and 2 deletions
--- a/lib/forth/conformance.sh
+++ b/lib/forth/conformance.sh
@@ -0,0 +1,169 @@
+#!/usr/bin/env bash
+# Run the Hayes/Gerry-Jackson Core conformance suite against our Forth
+# interpreter and emit scoreboard.json + scoreboard.md.
+#
+# Environment:
+#   SX_SERVER   path to the sx_server executable (a default path is used
+#               when unset)
+#   MAX_CHUNKS  how many leading chunks are fed to the runner (default 590)
+#
+# Method:
+#   1. Preprocess lib/forth/ans-tests/core.fr — strip \ comments, ( ... )
+#      comments, and TESTING … metadata lines.
+#   2. Split into chunks ending at each `}T` so an error in one test
+#      chunk doesn't abort the run.
+#   3. Emit an SX file that exposes those chunks as a list.
+#   4. Run our Forth + hayes-runner under sx_server; record pass/fail/error.
+
+set -e
+
+# Directory holding this script (lib/forth) and the repository root two
+# levels above it.
+FORTH_DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT="$(cd "$FORTH_DIR/../.." && pwd)"
+SX_SERVER="${SX_SERVER:-/root/rose-ash/hosts/ocaml/_build/default/bin/sx_server.exe}"
+
+# Input suite and the two scoreboard files written next to this script.
+SOURCE="$FORTH_DIR/ans-tests/core.fr"
+OUT_JSON="$FORTH_DIR/scoreboard.json"
+OUT_MD="$FORTH_DIR/scoreboard.md"
+
+# Scratch directory for the intermediate files. The EXIT trap removes it on
+# every exit path (normal end, `exit 1`, or an abort triggered by `set -e`),
+# so repeated runs do not accumulate temp directories. `${TMP:?}` refuses to
+# run `rm -rf` with an empty path.
+TMP="$(mktemp -d)"
+trap 'rm -rf -- "${TMP:?}"' EXIT
+PREPROC="$TMP/preproc.forth"
+CHUNKS_SX="$TMP/chunks.sx"
+
+# The runner is later told to load lib/forth/*.sx by relative path, so work
+# from the repository root.
+cd "$ROOT"
+
+# 1. preprocess
+#
+# Copy $SOURCE to $PREPROC line by line, blanking out everything that is not
+# Forth code to execute. Each removed span is replaced by a single space so
+# neighbouring words stay separated.
+awk '
+{
+  # "\" line comment: a backslash at line start or after blank space, which is
+  # itself followed by blank space (and the rest of the line) or by EOL
+  gsub(/(^|[ \t])\\([ \t].*|$)/, " ")
+  # "( ... )" comment that opens and closes on the same line
+  gsub(/\([^)]*\)/, " ")
+  # TESTING banner: the keyword and whatever follows it, or a bare TESTING
+  sub(/TESTING([ \t].*)?$/, " ")
+  print
+}' "$SOURCE" > "$PREPROC"
+
+# 2 + 3: cut the preprocessed text into chunks and write them out as SX
+#
+# MAX_CHUNKS (default 590) limits how many leading chunks are emitted. A few
+# later tests spin forever at runtime (e.g. COUNT-BITS, which depends on
+# unsigned wrap-around that our bignum-based interpreter never produces), so
+# they are kept out of the run. Raise the cap as those tests unblock.
+MAX_CHUNKS="${MAX_CHUNKS:-590}"
+
+MAX_CHUNKS="$MAX_CHUNKS" python3 - "$PREPROC" "$CHUNKS_SX" <<'PY'
+import os, re, sys
+
+src_path, dst_path = sys.argv[1], sys.argv[2]
+limit = int(os.environ.get("MAX_CHUNKS", "590"))
+
+with open(src_path) as src:
+    text = src.read()
+
+# The capturing group makes re.split hand back every `}T` delimiter as its own
+# item, which lets each delimiter be glued onto the text that precedes it.
+chunks = []
+pending = []
+for piece in re.split(r'(\}T)', text):
+    pending.append(piece)
+    if piece != "}T":
+        continue
+    body = "".join(pending).strip()
+    if body:
+        chunks.append(body)
+    pending = []
+
+# Whatever follows the final `}T` becomes one last chunk, if non-blank.
+tail = "".join(pending).strip()
+if tail:
+    chunks.append(tail)
+
+chunks = chunks[:limit]
+
+
+def esc(s):
+    """Turn a chunk into the body of a one-line double-quoted SX string."""
+    quoted = s.replace('\\', '\\\\').replace('"', '\\"')
+    flat = quoted.replace('\r', ' ').replace('\n', ' ')
+    return re.sub(r'\s+', ' ', flat).strip()
+
+
+# Driver appended after the chunk list: reset, boot once, run every chunk
+# against the same booted state, then return the summary.
+RUN_ALL = (
+    "(define\n"
+    "  hayes-run-all\n"
+    "  (fn\n"
+    "    ()\n"
+    "    (hayes-reset!)\n"
+    "    (let ((s (hayes-boot)))\n"
+    "      (for-each (fn (c) (hayes-run-chunk s c)) hayes-chunks))\n"
+    "    (hayes-summary)))\n"
+)
+
+with open(dst_path, "w") as out:
+    out.write("(define hayes-chunks (list\n")
+    out.writelines('  "' + esc(chunk) + '"\n' for chunk in chunks)
+    out.write("))\n\n")
+    out.write(RUN_ALL)
+PY
+
+# 4. run it
+OUT=$(printf '(epoch 1)\n(load "lib/forth/runtime.sx")\n(epoch 2)\n(load "lib/forth/reader.sx")\n(epoch 3)\n(load "lib/forth/interpreter.sx")\n(epoch 4)\n(load "lib/forth/compiler.sx")\n(epoch 5)\n(load "lib/forth/hayes-runner.sx")\n(epoch 6)\n(load "%s")\n(epoch 7)\n(eval "(hayes-run-all)")\n' "$CHUNKS_SX" \
+  | timeout 180 "$SX_SERVER" 2>&1)
+STATUS=$?
+
+SUMMARY=$(printf '%s\n' "$OUT" | awk '/^\{:pass / {print; exit}')
+PASS=$(printf '%s' "$SUMMARY" | sed -n 's/.*:pass \([0-9-]*\).*/\1/p')
+FAIL=$(printf '%s' "$SUMMARY" | sed -n 's/.*:fail \([0-9-]*\).*/\1/p')
+ERR=$(printf '%s' "$SUMMARY" | sed -n 's/.*:error \([0-9-]*\).*/\1/p')
+TOTAL=$(printf '%s' "$SUMMARY" | sed -n 's/.*:total \([0-9-]*\).*/\1/p')
+CHUNK_COUNT=$(grep -c '^  "' "$CHUNKS_SX" || echo 0)
+TOTAL_AVAILABLE=$(grep -c '}T' "$PREPROC" || echo 0)
+
+NOW="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+
+if [ -z "$PASS" ]; then
+  PASS=0; FAIL=0; ERR=0; TOTAL=0
+  NOTE="runner halted before completing (timeout or SX error)"
+else
+  NOTE="completed"
+fi
+
+PCT=0
+if [ "$TOTAL" -gt 0 ]; then
+  PCT=$((PASS * 100 / TOTAL))
+fi
+
+cat > "$OUT_JSON" <<JSON
+{
+  "source": "gerryjackson/forth2012-test-suite src/core.fr",
+  "generated_at": "$NOW",
+  "chunks_available": $TOTAL_AVAILABLE,
+  "chunks_fed": $CHUNK_COUNT,
+  "total": $TOTAL,
+  "pass": $PASS,
+  "fail": $FAIL,
+  "error": $ERR,
+  "percent": $PCT,
+  "note": "$NOTE"
+}
+JSON
+
+cat > "$OUT_MD" <<MD
+# Forth Hayes Core scoreboard
+
+| metric            | value |
+| ----------------- | ----: |
+| chunks available  | $TOTAL_AVAILABLE |
+| chunks fed        | $CHUNK_COUNT |
+| total             | $TOTAL |
+| pass              | $PASS |
+| fail              | $FAIL |
+| error             | $ERR |
+| percent           | ${PCT}% |
+
+- **Source**: \`gerryjackson/forth2012-test-suite\` \`src/core.fr\`
+- **Generated**: $NOW
+- **Note**: $NOTE
+
+A "chunk" is any preprocessed segment ending at a \`}T\` (every Hayes test
+is one chunk, plus the small declaration blocks between tests).
+The runner catches raised errors at chunk boundaries so one bad chunk
+does not abort the rest. \`error\` covers chunks that raised; \`fail\`
+covers tests whose \`->\` / \`}T\` comparison mismatched.
+
+### Chunk cap
+
+\`conformance.sh\` processes the first \`\$MAX_CHUNKS\` chunks (default
+**590**). Past that, \`core.fr\` ships tests that rely on unsigned
+integer wrap-around (e.g. \`COUNT-BITS\` using \`BEGIN DUP WHILE … 2*
+REPEAT\`), which never terminates on our bignum-based interpreter. The
+cap should rise as those tests unblock — run with \`MAX_CHUNKS=639
+./conformance.sh\` once they do.
+MD
+
+echo "$SUMMARY"
+echo "Scoreboard: $OUT_JSON"
+echo "           $OUT_MD"
+
+if [ "$STATUS" -ne 0 ] && [ "$TOTAL" -eq 0 ]; then
+  exit 1
+fi