forth: Hayes conformance runner + baseline scoreboard (165/590, 28%)

2026-04-24 19:13:45 +00:00
parent a47b3e5420
commit 0e509af0a2
5 changed files with 370 additions and 2 deletions
--- a/lib/forth/conformance.sh
+++ b/lib/forth/conformance.sh
@@ -0,0 +1,169 @@
 #!/usr/bin/env bash
 # Run the Hayes/Gerry-Jackson Core conformance suite against our Forth
 # interpreter and emit scoreboard.json + scoreboard.md.
 #
 # Method:
 #   1. Preprocess lib/forth/ans-tests/core.fr — strip \ comments, ( ... )
 #      comments, and TESTING … metadata lines.
 #   2. Split into chunks ending at each `}T` so an error in one test
 #      chunk doesn't abort the run.
 #   3. Emit an SX file that exposes those chunks as a list.
 #   4. Run our Forth + hayes-runner under sx_server; record pass/fail/error.
 # Abort on the first unhandled command failure.
 set -e
 # FORTH_DIR is the directory holding this script; ROOT is two levels above
 # it. The script later changes into ROOT, and the `load` paths handed to
 # the SX server are written relative to it.
 FORTH_DIR="$(cd "$(dirname "$0")" && pwd)"
 ROOT="$(cd "$FORTH_DIR/../.." && pwd)"
 # Interpreter binary; export SX_SERVER to point at a different build.
 SX_SERVER="${SX_SERVER:-/root/rose-ash/hosts/ocaml/_build/default/bin/sx_server.exe}"
 SOURCE="$FORTH_DIR/ans-tests/core.fr"
 OUT_JSON="$FORTH_DIR/scoreboard.json"
 OUT_MD="$FORTH_DIR/scoreboard.md"
 # Scratch directory for the two intermediate files. The EXIT trap removes
 # it on every exit path (success, `set -e` abort, explicit exit) so that
 # repeated runs do not leave one stale directory each under the temp root.
 TMP="$(mktemp -d)"
 trap 'rm -rf -- "$TMP"' EXIT
 PREPROC="$TMP/preproc.forth"
 CHUNKS_SX="$TMP/chunks.sx"
 cd "$ROOT"
 # 1. preprocess: blank out non-test text in each input line, in this order
 awk '
 {
  # (a) \ line comments: a backslash at line start or after a blank, followed
  #     by a blank plus the rest of the line, or by end of line
  gsub(/(^|[ \t])\\([ \t].*|$)/, " ")
  # (b) ( ... ) comments that open and close on the same line
  gsub(/\([^)]*\)/, " ")
  # (c) TESTING metadata: the word and everything after it, or a bare TESTING
  sub(/TESTING([ \t].*)?$/, " ")
  print
 }' "$SOURCE" > "$PREPROC"
 # 2 + 3: cut the preprocessed text after every `}T` and write the pieces
 # out as an SX list, together with the `hayes-run-all` driver function.
 #
 # MAX_CHUNKS (env, default 590) caps how many chunks are emitted — a small
 # number of later tests enter infinite runtime loops (e.g. COUNT-BITS with
 # unsigned wrap) that our bignum-based interpreter can't terminate. Raise
 # the cap as those tests unblock.
 MAX_CHUNKS="${MAX_CHUNKS:-590}"
 MAX_CHUNKS="$MAX_CHUNKS" python3 - "$PREPROC" "$CHUNKS_SX" <<'PY'
 import os, re, sys
 src_path, dst_path = sys.argv[1], sys.argv[2]
 limit = int(os.environ.get("MAX_CHUNKS", "590"))
 with open(src_path) as fh:
    source_text = fh.read()
 # Every piece except the last was followed by a `}T`; glue it back on so
 # the terminator stays with the chunk it closes.
 *closed, tail = source_text.split("}T")
 chunks = [(piece + "}T").strip() for piece in closed]
 if tail.strip():
    chunks.append(tail.strip())
 chunks = chunks[:limit]
 def as_sx_literal(raw):
    # escape for an SX string literal, then flatten to a single line
    flat = raw.replace('\\', '\\\\').replace('"', '\\"')
    flat = flat.replace('\r', ' ').replace('\n', ' ')
    return re.sub(r'\s+', ' ', flat).strip()
 DRIVER = (
    "(define\n",
    "  hayes-run-all\n",
    "  (fn\n",
    "    ()\n",
    "    (hayes-reset!)\n",
    "    (let ((s (hayes-boot)))\n",
    "      (for-each (fn (c) (hayes-run-chunk s c)) hayes-chunks))\n",
    "    (hayes-summary)))\n",
 )
 with open(dst_path, "w") as out:
    out.write("(define hayes-chunks (list\n")
    out.writelines('  "' + as_sx_literal(c) + '"\n' for c in chunks)
    out.write("))\n\n")
    out.writelines(DRIVER)
 PY
 # 4. run it
 #
 # summary_field NAME SUMMARY
 #   Print the integer that follows ":NAME " in the runner's summary line
 #   (e.g. `{:pass 165 :fail 0 ...}`); prints nothing when it is absent.
 summary_field() {
  printf '%s' "$2" | sed -n "s/.*:$1 \([0-9-]*\).*/\1/p"
 }
 # count_matching_lines PATTERN FILE
 #   Print the number of lines in FILE matching basic regex PATTERN, always
 #   as one integer. `grep -c` already prints 0 when nothing matches but
 #   exits 1, so its status is ignored rather than answered with a second 0.
 count_matching_lines() {
  local n
  n=$(grep -c -- "$1" "$2") || true
  printf '%s\n' "${n:-0}"
 }
 # Capture the runner's exit status without letting `set -e` abort the
 # script: a timeout or crash must still reach the scoreboard code below.
 STATUS=0
 OUT=$(printf '(epoch 1)\n(load "lib/forth/runtime.sx")\n(epoch 2)\n(load "lib/forth/reader.sx")\n(epoch 3)\n(load "lib/forth/interpreter.sx")\n(epoch 4)\n(load "lib/forth/compiler.sx")\n(epoch 5)\n(load "lib/forth/hayes-runner.sx")\n(epoch 6)\n(load "%s")\n(epoch 7)\n(eval "(hayes-run-all)")\n' "$CHUNKS_SX" \
  | timeout 180 "$SX_SERVER" 2>&1) || STATUS=$?
 # The summary is the first output line that starts with `{:pass `.
 SUMMARY=$(printf '%s\n' "$OUT" | awk '/^\{:pass / {print; exit}')
 PASS=$(summary_field pass "$SUMMARY")
 FAIL=$(summary_field fail "$SUMMARY")
 ERR=$(summary_field error "$SUMMARY")
 TOTAL=$(summary_field total "$SUMMARY")
 # chunks fed = string lines written to the SX file;
 # chunks available = preprocessed lines that contain a `}T`.
 CHUNK_COUNT=$(count_matching_lines '^  "' "$CHUNKS_SX")
 TOTAL_AVAILABLE=$(count_matching_lines '}T' "$PREPROC")
 NOW="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
 if [ -z "$PASS" ]; then
  PASS=0; FAIL=0; ERR=0; TOTAL=0
  NOTE="runner halted before completing (timeout or SX error)"
 else
  # A summary line was found; any counter it lacks counts as zero so the
  # arithmetic and the generated JSON below stay well-formed.
  FAIL=${FAIL:-0}; ERR=${ERR:-0}; TOTAL=${TOTAL:-0}
  NOTE="completed"
 fi
 # Integer percentage, rounded down.
 PCT=0
 if [ "$TOTAL" -gt 0 ]; then
  PCT=$((PASS * 100 / TOTAL))
 fi
 # Write the machine-readable scoreboard. The heredoc delimiter is unquoted,
 # so every $VAR below is expanded by the shell; the numeric fields are
 # emitted bare (unquoted), the timestamp and note as JSON strings.
 cat > "$OUT_JSON" <<JSON
 {
  "source": "gerryjackson/forth2012-test-suite src/core.fr",
  "generated_at": "$NOW",
  "chunks_available": $TOTAL_AVAILABLE,
  "chunks_fed": $CHUNK_COUNT,
  "total": $TOTAL,
  "pass": $PASS,
  "fail": $FAIL,
  "error": $ERR,
  "percent": $PCT,
  "note": "$NOTE"
 }
 JSON
 # Write the human-readable scoreboard. The heredoc delimiter is unquoted,
 # so $VARs are expanded; backticks and the literal $MAX_CHUNKS are
 # backslash-escaped so they reach the file verbatim. The "default 590" and
 # "MAX_CHUNKS=639" figures in the closing section are fixed text, not
 # derived from the MAX_CHUNKS value used for this run.
 cat > "$OUT_MD" <<MD
 # Forth Hayes Core scoreboard
 | metric            | value |
 | ----------------- | ----: |
 | chunks available  | $TOTAL_AVAILABLE |
 | chunks fed        | $CHUNK_COUNT |
 | total             | $TOTAL |
 | pass              | $PASS |
 | fail              | $FAIL |
 | error             | $ERR |
 | percent           | ${PCT}% |
 - **Source**: \`gerryjackson/forth2012-test-suite\` \`src/core.fr\`
 - **Generated**: $NOW
 - **Note**: $NOTE
 A "chunk" is any preprocessed segment ending at a \`}T\` (every Hayes test
 is one chunk, plus the small declaration blocks between tests).
 The runner catches raised errors at chunk boundaries so one bad chunk
 does not abort the rest. \`error\` covers chunks that raised; \`fail\`
 covers tests whose \`->\` / \`}T\` comparison mismatched.
 ### Chunk cap
 \`conformance.sh\` processes the first \`\$MAX_CHUNKS\` chunks (default
 **590**). Past that, \`core.fr\` ships tests that rely on unsigned
 integer wrap-around (e.g. \`COUNT-BITS\` using \`BEGIN DUP WHILE … 2*
 REPEAT\`), which never terminates on our bignum-based interpreter. The
 cap should rise as those tests unblock — run with \`MAX_CHUNKS=639
 ./conformance.sh\` once they do.
 MD
 # Show the raw summary line, then where the two scoreboard files landed.
 printf '%s\n' "$SUMMARY" "Scoreboard: $OUT_JSON" "           $OUT_MD"
 # Exit non-zero only when the runner itself failed AND nothing was tallied.
 if [ "$STATUS" -ne 0 ] && [ "$TOTAL" -eq 0 ]; then exit 1; fi
--- a/lib/forth/hayes-runner.sx
+++ b/lib/forth/hayes-runner.sx
@@ -0,0 +1,135 @@
 ;; Hayes conformance test runner.
 ;; Installs T{ -> }T as Forth primitives that snapshot and compare dstack,
 ;; plus stub TESTING / HEX / DECIMAL so the Hayes Core file can stream
 ;; through the interpreter without halting on unsupported metadata words.
 ;; Module-level tallies and scratch state shared by the T{ / -> / }T
 ;; primitives and by hayes-run-chunk.
 ;; hayes-pass / hayes-fail: bumped by }T on a matching / mismatching
 ;; comparison. hayes-error: bumped by hayes-run-chunk when a chunk raises.
 (define hayes-pass 0)
 (define hayes-fail 0)
 (define hayes-error 0)
 ;; Value of (forth-depth s) recorded by the most recent T{.
 (define hayes-start-depth 0)
 ;; Stack items captured by ->, and whether -> has run since the last T{.
 (define hayes-actual (list))
 (define hayes-actual-set false)
 ;; One dict per failed comparison, appended by }T.
 (define hayes-failures (list))
 ;; String form of the first error caught by hayes-run-chunk; "" until then.
 (define hayes-first-error "")
 ;; Reset every tally and scratch variable of the runner to its initial
 ;; value (zero counters, empty lists, false flag, empty first-error string).
 (define
  hayes-reset!
  (fn
    ()
    (set! hayes-pass 0)
    (set! hayes-fail 0)
    (set! hayes-error 0)
    (set! hayes-start-depth 0)
    (set! hayes-actual (list))
    (set! hayes-actual-set false)
    (set! hayes-failures (list))
    (set! hayes-first-error "")))
 ;; Return the first n items of the state's "dstack" list, where
 ;; n = (forth-depth state) - base; returns an empty list when n <= 0.
 ;; NOTE(review): presumably the head of "dstack" is the top of the Forth
 ;; stack, so this yields the items pushed above `base` — confirm against
 ;; the interpreter.
 (define
  hayes-slice
  (fn
    (state base)
    (let
      ((n (- (forth-depth state) base)))
      (if (<= n 0) (list) (take (get state "dstack") n)))))
 ;; Remove the first n items from the state's "dstack" in place, where
 ;; n = (forth-depth state) - base; does nothing when n <= 0.
 ;; Counterpart of hayes-slice: it discards the same items hayes-slice returns.
 (define
  hayes-truncate!
  (fn
    (state base)
    (let
      ((n (- (forth-depth state) base)))
      (when (> n 0) (dict-set! state "dstack" (drop (get state "dstack") n))))))
 ;; Register the test-harness words on `state` via forth-def-prim! and
 ;; return `state`.
 ;;   T{       record the current depth and clear any previously captured result
 ;;   ->       capture the items above the recorded depth as "actual", then
 ;;            remove them from the stack
 ;;   }T       capture the items above the recorded depth as "expected",
 ;;            remove them, and tally a pass or a fail
 ;;   TESTING  no-op stub
 ;;   HEX / DECIMAL  set the state's "base" entry to 16 / 10
 (define
  hayes-install!
  (fn
    (state)
    (forth-def-prim!
      state
      "T{"
      (fn
        (s)
        (set! hayes-start-depth (forth-depth s))
        (set! hayes-actual-set false)
        (set! hayes-actual (list))))
    (forth-def-prim!
      state
      "->"
      (fn
        (s)
        (set! hayes-actual (hayes-slice s hayes-start-depth))
        (set! hayes-actual-set true)
        (hayes-truncate! s hayes-start-depth)))
    (forth-def-prim!
      state
      "}T"
      (fn
        (s)
        (let
          ((expected (hayes-slice s hayes-start-depth)))
          (hayes-truncate! s hayes-start-depth)
          ;; A test passes only if -> ran since T{ and both captures are equal;
          ;; a test with no -> therefore counts as a failure.
          (if
            (and hayes-actual-set (= expected hayes-actual))
            (set! hayes-pass (+ hayes-pass 1))
            (begin
              (set! hayes-fail (+ hayes-fail 1))
              ;; Keep a record of the mismatch (both sides stringified).
              (set!
                hayes-failures
                (concat
                  hayes-failures
                  (list
                    (dict
                      "kind"
                      "fail"
                      "expected"
                      (str expected)
                      "actual"
                      (str hayes-actual))))))))))
    (forth-def-prim! state "TESTING" (fn (s) nil))
    (forth-def-prim! state "HEX" (fn (s) (dict-set! s "base" 16)))
    (forth-def-prim! state "DECIMAL" (fn (s) (dict-set! s "base" 10)))
    state))
 ;; Build a state with forth-boot, install the harness words on it, reset
 ;; the runner tallies, and return the state.
 (define
  hayes-boot
  (fn () (let ((s (forth-boot))) (hayes-install! s) (hayes-reset!) s)))
 ;; Run a single preprocessed chunk (string of Forth source) on the shared
 ;; state. Catch any raised error and move on — the chunk boundary is a
 ;; safe resume point.
 ;;
 ;; On an error the handler bumps hayes-error, remembers the stringified
 ;; error if it is the first one seen, and resets the state's "dstack",
 ;; "rstack", "compiling", "current-def", "cstack" and "input" entries so
 ;; the next chunk starts clean.
 (define
  hayes-run-chunk
  (fn
    (state src)
    (guard
      (err
        ;; The clause test (= 1 1) is always true, so presumably every
        ;; raised value is handled by this single clause.
        ((= 1 1)
          (begin
            (set! hayes-error (+ hayes-error 1))
            (when
              (= (len hayes-first-error) 0)
              (set! hayes-first-error (str err)))
            (dict-set! state "dstack" (list))
            (dict-set! state "rstack" (list))
            (dict-set! state "compiling" false)
            (dict-set! state "current-def" nil)
            (dict-set! state "cstack" (list))
            (dict-set! state "input" (list)))))
      (forth-interpret state src))))
 ;; Return a dict of the current tallies: "pass", "fail", "error",
 ;; "total" (the sum of the three counters) and "first-error".
 ;; hayes-failures is collected by }T but is not included here.
 (define
  hayes-summary
  (fn
    ()
    (dict
      "pass"
      hayes-pass
      "fail"
      hayes-fail
      "error"
      hayes-error
      "total"
      (+ (+ hayes-pass hayes-fail) hayes-error)
      "first-error"
      hayes-first-error)))
--- a/lib/forth/scoreboard.json
+++ b/lib/forth/scoreboard.json
@@ -0,0 +1,12 @@
 {
  "source": "gerryjackson/forth2012-test-suite src/core.fr",
  "generated_at": "2026-04-24T19:13:12Z",
  "chunks_available": 638,
  "chunks_fed": 590,
  "total": 590,
  "pass": 165,
  "fail": 0,
  "error": 425,
  "percent": 27,
  "note": "completed"
 }
--- a/lib/forth/scoreboard.md
+++ b/lib/forth/scoreboard.md
@@ -0,0 +1,30 @@
 # Forth Hayes Core scoreboard
 | metric            | value |
 | ----------------- | ----: |
 | chunks available  | 638 |
 | chunks fed        | 590 |
 | total             | 590 |
 | pass              | 165 |
 | fail              | 0 |
 | error             | 425 |
 | percent           | 27% |
 - **Source**: `gerryjackson/forth2012-test-suite` `src/core.fr`
 - **Generated**: 2026-04-24T19:13:12Z
 - **Note**: completed
 A "chunk" is any preprocessed segment ending at a `}T` (every Hayes test
 is one chunk, plus the small declaration blocks between tests).
 The runner catches raised errors at chunk boundaries so one bad chunk
 does not abort the rest. `error` covers chunks that raised; `fail`
 covers tests whose `->` / `}T` comparison mismatched.
 ### Chunk cap
 `conformance.sh` processes the first `$MAX_CHUNKS` chunks (default
 **590**). Past that, `core.fr` ships tests that rely on unsigned
 integer wrap-around (e.g. `COUNT-BITS` using `BEGIN DUP WHILE … 2*
 REPEAT`), which never terminates on our bignum-based interpreter. The
 cap should rise as those tests unblock — run with `MAX_CHUNKS=639
 ./conformance.sh` once they do.
--- a/plans/forth-on-sx.md
+++ b/plans/forth-on-sx.md
@@ -74,8 +74,8 @@ Representation:
 - [x] `DO`, `LOOP`, `+LOOP`, `I`, `J`, `LEAVE` — counted loops (needs a return stack)
 - [x] Return stack: `>R`, `R>`, `R@`, `2>R`, `2R>`, `2R@`
 - [x] Vendor John Hayes' test suite to `lib/forth/ans-tests/`
- [ ] `lib/forth/conformance.sh` + runner; `scoreboard.json` + `scoreboard.md`
+- [x] `lib/forth/conformance.sh` + runner; `scoreboard.json` + `scoreboard.md`
- [ ] Baseline: probably 30-50% Core passing after phase 3
+- [x] Baseline: probably 30-50% Core passing after phase 3
 ### Phase 4 — strings + more Core
 - [ ] `S"`, `C"`, `."`, `TYPE`, `COUNT`, `CMOVE`, `FILL`, `BLANK`
@@ -99,6 +99,28 @@ Representation:
 _Newest first._
 - **Phase 3 — Hayes conformance runner + baseline scoreboard (165/590, 28%).**
  `lib/forth/conformance.sh` preprocesses `ans-tests/core.fr` (strips `\`
  and `( ... )` comments + `TESTING` lines), splits the source on every
  `}T` so each Hayes test plus the small declaration blocks between
  them are one safe-resume chunk, and emits an SX driver that feeds
  the chunks through `lib/forth/hayes-runner.sx`. The runner registers
  `T{`/`->`/`}T` as Forth primitives that snapshot the dstack depth on
  `T{`, record actual on `->`, compare on `}T`, and install stub
  `HEX`/`DECIMAL`/`TESTING` so metadata doesn't halt the stream. Errors
  raised inside a chunk are caught by `guard` and the state is reset,
  so one bad test does not break the rest. Outputs
  `scoreboard.json` + `scoreboard.md`.
  First-run baseline: 165 pass / 425 error / 0 fail on the first 590
  chunks. The default cap sits at 590 because `core.fr` chunks beyond
  that rely on unsigned-integer wrap-around (e.g. `COUNT-BITS` with
  `BEGIN DUP WHILE … 2* REPEAT`) which never terminates on our
  bignum-based Forth; raise `MAX_CHUNKS` once those tests unblock.
  Most of the errors come from missing Phase-4 words (`RSHIFT`, `LSHIFT`,
  `CELLS`, `S"`, `CHAR`, `SOURCE`, etc.) — implementing each one should
  convert a cluster of errors to passes.
 - **Phase 3 — vendor Gerry Jackson's forth2012-test-suite.** Added
  `lib/forth/ans-tests/{tester.fr, core.fr, coreexttest.fth}` from
  https://github.com/gerryjackson/forth2012-test-suite (master, fetched