diff --git a/lib/forth/conformance.sh b/lib/forth/conformance.sh
new file mode 100755
index 00000000..3ddf257b
--- /dev/null
+++ b/lib/forth/conformance.sh
@@ -0,0 +1,169 @@
+#!/usr/bin/env bash
+# Run the Hayes/Gerry-Jackson Core conformance suite against our Forth
+# interpreter and emit scoreboard.json + scoreboard.md.
+#
+# Method:
+#   1. Preprocess lib/forth/ans-tests/core.fr — strip \ comments, ( ... )
+#      comments, and TESTING … metadata lines.
+#   2. Split into chunks ending at each `}T` so an error in one test
+#      chunk doesn't abort the run.
+#   3. Emit an SX file that exposes those chunks as a list.
+#   4. Run our Forth + hayes-runner under sx_server; record pass/fail/error.
+#
+# Environment:
+#   SX_SERVER   path (or command name) of the sx_server binary to drive.
+#   MAX_CHUNKS  number of leading chunks to feed to the runner (default 590).
+
+set -e
+
+# Print a diagnostic on stderr and abort the run.
+die() {
+  printf 'conformance.sh: %s\n' "$*" >&2
+  exit 1
+}
+
+FORTH_DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT="$(cd "$FORTH_DIR/../.." && pwd)"
+SX_SERVER="${SX_SERVER:-/root/rose-ash/hosts/ocaml/_build/default/bin/sx_server.exe}"
+SOURCE="$FORTH_DIR/ans-tests/core.fr"
+OUT_JSON="$FORTH_DIR/scoreboard.json"
+OUT_MD="$FORTH_DIR/scoreboard.md"
+TMP="$(mktemp -d)"
+# Remove the scratch directory whenever the script exits, so repeated runs
+# do not pile up temp dirs. `:?` refuses to expand an empty path.
+trap 'rm -rf -- "${TMP:?}"' EXIT
+PREPROC="$TMP/preproc.forth"
+CHUNKS_SX="$TMP/chunks.sx"
+
+# Fail early, with a readable message, when a prerequisite is missing
+# instead of dying later on an unexplained `set -e` abort.
+[ -r "$SOURCE" ] || die "test source not readable: $SOURCE"
+command -v "$SX_SERVER" >/dev/null 2>&1 \
+  || die "sx_server not found or not executable: $SX_SERVER (set SX_SERVER)"
+
+# The (load "lib/forth/...") paths sent to the server are relative to the
+# repository root, so run from there.
+cd "$ROOT"
+
+# 1. preprocess
+awk '
+{
+  line = $0
+  # strip leading/embedded \ line comments (must be \ followed by space or EOL)
+  gsub(/(^|[ \t])\\([ \t].*|$)/, " ", line)
+  # strip ( ... ) block comments that sit on one line
+  gsub(/\([^)]*\)/, " ", line)
+  # strip TESTING … metadata lines (rest of line, incl. bare TESTING)
+  sub(/TESTING([ \t].*)?$/, " ", line)
+  print line
+}' "$SOURCE" > "$PREPROC"
+
+# 2 + 3: split into chunks at each `}T` and emit as a SX file
+#
+# Cap chunks via MAX_CHUNKS env (default 590) — a small number of later
+# tests enter infinite runtime loops (e.g. COUNT-BITS with unsigned wrap)
+# that our bignum-based interpreter can't terminate. Raise the cap as
+# those tests unblock.
+MAX_CHUNKS="${MAX_CHUNKS:-590}"
+
+# The Python helper reads the preprocessed Forth text (argv[1]) and writes
+# an SX file (argv[2]) that defines `hayes-chunks` (a list of strings) and
+# `hayes-run-all` (feeds every chunk to the runner, returns the summary).
+# MAX_CHUNKS is handed over through the environment.
+MAX_CHUNKS="$MAX_CHUNKS" python3 - "$PREPROC" "$CHUNKS_SX" <<'PY'
+import os, re, sys
+preproc_path, out_path = sys.argv[1], sys.argv[2]
+max_chunks = int(os.environ.get("MAX_CHUNKS", "590"))
+with open(preproc_path) as src:
+    text = src.read()
+# keep the `}T` attached to the preceding chunk
+parts = re.split(r'(\}T)', text)
+chunks = []
+buf = ""
+for p in parts:
+    buf += p
+    if p == "}T":
+        s = buf.strip()
+        if s:
+            chunks.append(s)
+        buf = ""
+# whatever follows the final `}T` becomes one last chunk
+if buf.strip():
+    chunks.append(buf.strip())
+chunks = chunks[:max_chunks]
+
+def esc(s):
+    """Turn a chunk into the body of a one-line SX string literal."""
+    s = s.replace('\\', '\\\\').replace('"', '\\"')
+    s = s.replace('\r', ' ').replace('\n', ' ')
+    s = re.sub(r'\s+', ' ', s).strip()
+    return s
+
+with open(out_path, "w") as f:
+    f.write("(define hayes-chunks (list\n")
+    for c in chunks:
+        f.write(' "' + esc(c) + '"\n')
+    f.write("))\n\n")
+    f.write("(define\n")
+    f.write(" hayes-run-all\n")
+    f.write(" (fn\n")
+    f.write(" ()\n")
+    f.write(" (hayes-reset!)\n")
+    f.write(" (let ((s (hayes-boot)))\n")
+    f.write(" (for-each (fn (c) (hayes-run-chunk s c)) hayes-chunks))\n")
+    f.write(" (hayes-summary)))\n")
+PY
+
+# 4. run it
+#
+# Capture the exit status with `|| STATUS=$?`. The script runs under
+# `set -e`, so a plain assignment from a failing command substitution
+# (timeout's 124, a crashed server, ...) would abort right here and the
+# scoreboard with its "runner halted" note would never be written.
+STATUS=0
+OUT=$(printf '(epoch 1)\n(load "lib/forth/runtime.sx")\n(epoch 2)\n(load "lib/forth/reader.sx")\n(epoch 3)\n(load "lib/forth/interpreter.sx")\n(epoch 4)\n(load "lib/forth/compiler.sx")\n(epoch 5)\n(load "lib/forth/hayes-runner.sx")\n(epoch 6)\n(load "%s")\n(epoch 7)\n(eval "(hayes-run-all)")\n' "$CHUNKS_SX" \
+  | timeout 180 "$SX_SERVER" 2>&1) || STATUS=$?
+
+# Pull the first summary dict the runner printed (a line starting with
+# `{:pass `) out of the captured server output. Empty when the run never
+# got as far as printing one.
+SUMMARY=$(printf '%s\n' "$OUT" | awk '/^\{:pass / {print; exit}')
+PASS=$(printf '%s' "$SUMMARY" | sed -n 's/.*:pass \([0-9-]*\).*/\1/p')
+FAIL=$(printf '%s' "$SUMMARY" | sed -n 's/.*:fail \([0-9-]*\).*/\1/p')
+ERR=$(printf '%s' "$SUMMARY" | sed -n 's/.*:error \([0-9-]*\).*/\1/p')
+TOTAL=$(printf '%s' "$SUMMARY" | sed -n 's/.*:total \([0-9-]*\).*/\1/p')
+
+# `grep -c` prints 0 by itself when nothing matches but exits 1 in that
+# case, so `|| echo 0` would capture "0" twice. `|| true` only neutralises
+# the exit status for `set -e`; the :-0 default covers an unreadable file.
+CHUNK_COUNT=$(grep -c '^ "' "$CHUNKS_SX" || true)
+CHUNK_COUNT=${CHUNK_COUNT:-0}
+TOTAL_AVAILABLE=$(grep -c '}T' "$PREPROC" || true)
+TOTAL_AVAILABLE=${TOTAL_AVAILABLE:-0}
+
+NOW="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+
+if [ -z "$PASS" ]; then
+  PASS=0; FAIL=0; ERR=0; TOTAL=0
+  NOTE="runner halted before completing (timeout or SX error)"
+  # Show the end of the server output so the cause is visible to the caller.
+  printf '%s\n' "$OUT" | tail -n 20 >&2
+else
+  NOTE="completed"
+  # Guard the arithmetic below against a partially parsed summary.
+  FAIL=${FAIL:-0}
+  ERR=${ERR:-0}
+  TOTAL=${TOTAL:-$((PASS + FAIL + ERR))}
+fi
+
+# Integer percentage, truncated (165/590 reports 27).
+PCT=0
+if [ "$TOTAL" -gt 0 ]; then
+  PCT=$((PASS * 100 / TOTAL))
+fi
+
+cat > "$OUT_JSON" <<JSON
+{
+  "source": "gerryjackson/forth2012-test-suite src/core.fr",
+  "generated_at": "$NOW",
+  "chunks_available": $TOTAL_AVAILABLE,
+  "chunks_fed": $CHUNK_COUNT,
+  "total": $TOTAL,
+  "pass": $PASS,
+  "fail": $FAIL,
+  "error": $ERR,
+  "percent": $PCT,
+  "note": "$NOTE"
+}
+JSON
+
+# Unquoted heredoc delimiter: variables expand, so literal backticks and
+# the literal `$MAX_CHUNKS` are backslash-escaped.
+cat > "$OUT_MD" <<MD
+# Forth Hayes Core scoreboard
+
+| metric | value |
+| ----------------- | ----: |
+| chunks available | $TOTAL_AVAILABLE |
+| chunks fed | $CHUNK_COUNT |
+| total | $TOTAL |
+| pass | $PASS |
+| fail | $FAIL |
+| error | $ERR |
+| percent | ${PCT}% |
+
+- **Source**: \`gerryjackson/forth2012-test-suite\` \`src/core.fr\`
+- **Generated**: $NOW
+- **Note**: $NOTE
+
+A "chunk" is any preprocessed segment ending at a \`}T\` (every Hayes test
+is one chunk, plus the small declaration blocks between tests).
+The runner catches raised errors at chunk boundaries so one bad chunk
+does not abort the rest. \`error\` covers chunks that raised; \`fail\`
+covers tests whose \`->\` / \`}T\` comparison mismatched.
+
+### Chunk cap
+
+\`conformance.sh\` processes the first \`\$MAX_CHUNKS\` chunks (default
+**590**). Past that, \`core.fr\` ships tests that rely on unsigned
+integer wrap-around (e.g. \`COUNT-BITS\` using \`BEGIN DUP WHILE … 2*
+REPEAT\`), which never terminates on our bignum-based interpreter. The
+cap should rise as those tests unblock — run with \`MAX_CHUNKS=639
+./conformance.sh\` once they do.
+MD
+
+printf '%s\n' "$SUMMARY"
+printf '%s\n' "Scoreboard: $OUT_JSON"
+printf '%s\n' " $OUT_MD"
+
+# Report failure only when the server exited non-zero AND produced no
+# usable totals; a run that printed a summary counts as success.
+if [ "$STATUS" -ne 0 ] && [ "$TOTAL" -eq 0 ]; then
+  exit 1
+fi
diff --git a/lib/forth/hayes-runner.sx b/lib/forth/hayes-runner.sx
new file mode 100644
index 00000000..c7515e67
--- /dev/null
+++ b/lib/forth/hayes-runner.sx
@@ -0,0 +1,135 @@
+;; Hayes conformance test runner.
+;; Installs T{ -> }T as Forth primitives that snapshot and compare dstack,
+;; plus stub TESTING / HEX / DECIMAL so the Hayes Core file can stream
+;; through the interpreter without halting on unsupported metadata words.
+
+;; Runner-wide counters and scratch state. They are module-level so the
+;; primitives installed below and hayes-summary can share them.
+;;   hayes-pass / hayes-fail / hayes-error  running tallies
+;;   hayes-start-depth   dstack depth recorded by the most recent T{
+;;   hayes-actual        stack items captured by the most recent ->
+;;   hayes-actual-set    true once -> has run for the current test
+;;   hayes-failures      list of dicts describing each failed comparison
+;;   hayes-first-error   text of the first error raised by a chunk ("" if none)
+(define hayes-pass 0)
+(define hayes-fail 0)
+(define hayes-error 0)
+(define hayes-start-depth 0)
+(define hayes-actual (list))
+(define hayes-actual-set false)
+(define hayes-failures (list))
+(define hayes-first-error "")
+
+;; Restore every counter and scratch variable above to its initial value.
+(define
+  hayes-reset!
+  (fn
+    ()
+    (set! hayes-pass 0)
+    (set! hayes-fail 0)
+    (set! hayes-error 0)
+    (set! hayes-start-depth 0)
+    (set! hayes-actual (list))
+    (set! hayes-actual-set false)
+    (set! hayes-failures (list))
+    (set! hayes-first-error "")))
+
+;; Return the items pushed since the stack depth was `base`: `take`s
+;; (depth - base) elements from the state's "dstack" list, or an empty
+;; list when the stack is not deeper than `base`.
+;; NOTE(review): presumably the newest items sit at the front of "dstack";
+;; confirm against the interpreter's stack representation.
+(define
+  hayes-slice
+  (fn
+    (state base)
+    (let
+      ((n (- (forth-depth state) base)))
+      (if (<= n 0) (list) (take (get state "dstack") n)))))
+
+;; Shrink the data stack back to depth `base` by `drop`ping the
+;; (depth - base) elements that hayes-slice would return. Does nothing
+;; when the stack is not deeper than `base`.
+(define
+  hayes-truncate!
+  (fn
+    (state base)
+    (let
+      ((n (- (forth-depth state) base)))
+      (when (> n 0) (dict-set! state "dstack" (drop (get state "dstack") n))))))
+
+;; Register the test-harness words on `state` and return `state`.
+;;   T{       remember the current depth and clear the captured "actual"
+;;   ->       capture everything pushed since T{ as "actual", then pop it
+;;   }T       capture everything pushed since T{ as "expected", pop it, and
+;;            count a pass when -> ran and both captures are equal;
+;;            otherwise count a fail and append a record to hayes-failures
+;;   TESTING  no-op stub
+;;   HEX / DECIMAL  set the state's "base" entry to 16 / 10
+(define
+  hayes-install!
+  (fn
+    (state)
+    (forth-def-prim!
+      state
+      "T{"
+      (fn
+        (s)
+        (set! hayes-start-depth (forth-depth s))
+        (set! hayes-actual-set false)
+        (set! hayes-actual (list))))
+    (forth-def-prim!
+      state
+      "->"
+      (fn
+        (s)
+        (set! hayes-actual (hayes-slice s hayes-start-depth))
+        (set! hayes-actual-set true)
+        (hayes-truncate! s hayes-start-depth)))
+    (forth-def-prim!
+      state
+      "}T"
+      (fn
+        (s)
+        (let
+          ((expected (hayes-slice s hayes-start-depth)))
+          (hayes-truncate! s hayes-start-depth)
+          (if
+            ;; a }T with no preceding -> is a failure even if both are empty
+            (and hayes-actual-set (= expected hayes-actual))
+            (set! hayes-pass (+ hayes-pass 1))
+            (begin
+              (set! hayes-fail (+ hayes-fail 1))
+              (set!
+                hayes-failures
+                (concat
+                  hayes-failures
+                  (list
+                    (dict
+                      "kind"
+                      "fail"
+                      "expected"
+                      (str expected)
+                      "actual"
+                      (str hayes-actual))))))))))
+    (forth-def-prim! state "TESTING" (fn (s) nil))
+    (forth-def-prim! state "HEX" (fn (s) (dict-set! s "base" 16)))
+    (forth-def-prim! state "DECIMAL" (fn (s) (dict-set! s "base" 10)))
+    state))
+
+;; Build a fresh interpreter state via forth-boot, install the harness
+;; words on it, reset the counters, and return the state.
+(define
+  hayes-boot
+  (fn () (let ((s (forth-boot))) (hayes-install! s) (hayes-reset!) s)))
+
+;; Run a single preprocessed chunk (string of Forth source) on the shared
+;; state.
Catch any raised error and move on — the chunk boundary is a
+;; safe resume point.
+;;
+;; On an error: bump hayes-error, remember the error text if it is the
+;; first one seen, and clear the state's "dstack", "rstack", "cstack" and
+;; "input" entries plus the "compiling" / "current-def" flags so the next
+;; chunk does not inherit a half-finished definition or stale stack items.
+(define
+  hayes-run-chunk
+  (fn
+    (state src)
+    (guard
+      (err
+        ;; (= 1 1) is always true, so this single clause handles every error
+        ((= 1 1)
+          (begin
+            (set! hayes-error (+ hayes-error 1))
+            (when
+              (= (len hayes-first-error) 0)
+              (set! hayes-first-error (str err)))
+            (dict-set! state "dstack" (list))
+            (dict-set! state "rstack" (list))
+            (dict-set! state "compiling" false)
+            (dict-set! state "current-def" nil)
+            (dict-set! state "cstack" (list))
+            (dict-set! state "input" (list)))))
+      (forth-interpret state src))))
+
+;; Return the tallies as a dict with keys "pass", "fail", "error",
+;; "total" (the sum of the three) and "first-error".
+(define
+  hayes-summary
+  (fn
+    ()
+    (dict
+      "pass"
+      hayes-pass
+      "fail"
+      hayes-fail
+      "error"
+      hayes-error
+      "total"
+      (+ (+ hayes-pass hayes-fail) hayes-error)
+      "first-error"
+      hayes-first-error)))
diff --git a/lib/forth/scoreboard.json b/lib/forth/scoreboard.json
new file mode 100644
index 00000000..e1f2d9b2
--- /dev/null
+++ b/lib/forth/scoreboard.json
@@ -0,0 +1,12 @@
+{
+  "source": "gerryjackson/forth2012-test-suite src/core.fr",
+  "generated_at": "2026-04-24T19:13:12Z",
+  "chunks_available": 638,
+  "chunks_fed": 590,
+  "total": 590,
+  "pass": 165,
+  "fail": 0,
+  "error": 425,
+  "percent": 27,
+  "note": "completed"
+}
diff --git a/lib/forth/scoreboard.md b/lib/forth/scoreboard.md
new file mode 100644
index 00000000..e4d5a468
--- /dev/null
+++ b/lib/forth/scoreboard.md
@@ -0,0 +1,30 @@
+# Forth Hayes Core scoreboard
+
+| metric | value |
+| ----------------- | ----: |
+| chunks available | 638 |
+| chunks fed | 590 |
+| total | 590 |
+| pass | 165 |
+| fail | 0 |
+| error | 425 |
+| percent | 27% |
+
+- **Source**: `gerryjackson/forth2012-test-suite` `src/core.fr`
+- **Generated**: 2026-04-24T19:13:12Z
+- **Note**: completed
+
+A "chunk" is any preprocessed segment ending at a `}T` (every Hayes test
+is one chunk, plus the small declaration blocks between tests).
+The runner catches raised errors at chunk boundaries so one bad chunk
+does not abort the rest.
`error` covers chunks that raised; `fail` +covers tests whose `->` / `}T` comparison mismatched. + +### Chunk cap + +`conformance.sh` processes the first `$MAX_CHUNKS` chunks (default +**590**). Past that, `core.fr` ships tests that rely on unsigned +integer wrap-around (e.g. `COUNT-BITS` using `BEGIN DUP WHILE … 2* +REPEAT`), which never terminates on our bignum-based interpreter. The +cap should rise as those tests unblock — run with `MAX_CHUNKS=639 +./conformance.sh` once they do. diff --git a/plans/forth-on-sx.md b/plans/forth-on-sx.md index 97da76e4..73ebbca9 100644 --- a/plans/forth-on-sx.md +++ b/plans/forth-on-sx.md @@ -74,8 +74,8 @@ Representation: - [x] `DO`, `LOOP`, `+LOOP`, `I`, `J`, `LEAVE` — counted loops (needs a return stack) - [x] Return stack: `>R`, `R>`, `R@`, `2>R`, `2R>`, `2R@` - [x] Vendor John Hayes' test suite to `lib/forth/ans-tests/` -- [ ] `lib/forth/conformance.sh` + runner; `scoreboard.json` + `scoreboard.md` -- [ ] Baseline: probably 30-50% Core passing after phase 3 +- [x] `lib/forth/conformance.sh` + runner; `scoreboard.json` + `scoreboard.md` +- [x] Baseline: probably 30-50% Core passing after phase 3 ### Phase 4 — strings + more Core - [ ] `S"`, `C"`, `."`, `TYPE`, `COUNT`, `CMOVE`, `FILL`, `BLANK` @@ -99,6 +99,28 @@ Representation: _Newest first._ +- **Phase 3 — Hayes conformance runner + baseline scoreboard (165/590, 28%).** + `lib/forth/conformance.sh` preprocesses `ans-tests/core.fr` (strips `\` + and `( ... )` comments + `TESTING` lines), splits the source on every + `}T` so each Hayes test plus the small declaration blocks between + them are one safe-resume chunk, and emits an SX driver that feeds + the chunks through `lib/forth/hayes-runner.sx`. The runner registers + `T{`/`->`/`}T` as Forth primitives that snapshot the dstack depth on + `T{`, record actual on `->`, compare on `}T`, and install stub + `HEX`/`DECIMAL`/`TESTING` so metadata doesn't halt the stream. 
Errors + raised inside a chunk are caught by `guard` and the state is reset, + so one bad test does not break the rest. Outputs + `scoreboard.json` + `scoreboard.md`. + + First-run baseline: 165 pass / 425 error / 0 fail on the first 590 + chunks. The default cap sits at 590 because `core.fr` chunks beyond + that rely on unsigned-integer wrap-around (e.g. `COUNT-BITS` with + `BEGIN DUP WHILE … 2* REPEAT`) which never terminates on our + bignum-based Forth; raise `MAX_CHUNKS` once those tests unblock. + Majority of errors are missing Phase-4 words (`RSHIFT`, `LSHIFT`, + `CELLS`, `S"`, `CHAR`, `SOURCE`, etc.) — each one implemented should + convert a cluster of errors to passes. + - **Phase 3 — vendor Gerry Jackson's forth2012-test-suite.** Added `lib/forth/ans-tests/{tester.fr, core.fr, coreexttest.fth}` from https://github.com/gerryjackson/forth2012-test-suite (master, fetched