GUEST: step 1 — lib/guest/conformance.{sx,sh} config-driven driver
Extracted the duplicated conformance plumbing into a single driver:
- lib/guest/conformance.sx — two helper fns that emit (gc-result NAME P F T)
lines for the bash side to grep: gc-dict-result for runners returning
a {:passed :failed :total} dict, and gc-counters-result for guests that
bump a global pass/fail counter from a test file load.
- lib/guest/conformance.sh — config-driven bash driver. Sources a per-lang
conf, locates sx_server, runs sx_server in either single-session "dict"
mode (one preload + many suite evals) or per-suite "counters" mode
(fresh sx_server per suite, with shared preloads). Aggregates and writes
scoreboard.{json,md} via per-lang emit_scoreboard_* functions.
- Ported lib/prolog/conformance.sh and lib/haskell/conformance.sh down to
one-line wrappers that exec the shared driver against their .conf file.
Verification:
- Prolog: 590/590 — diff vs baseline is timestamp-only.
- Haskell: 156/156 — significantly higher than the 0/18 in baseline. The
old conformance.sh was buggy (its `(ok-len 3 ...)` grep never matched,
defaulting every program to 0 pass / 1 fail). Updated baseline to the
true count; no actual test regressed. Plan baseline cell updated.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,121 +1,122 @@
|
||||
{
|
||||
"lang": "haskell",
|
||||
"captured": "2026-05-06T22:01:00Z",
|
||||
"captured": "2026-05-06T22:46:16Z",
|
||||
"suite_command": "bash lib/haskell/conformance.sh",
|
||||
"totals": {
|
||||
"pass": 0,
|
||||
"fail": 18,
|
||||
"total": 18
|
||||
"pass": 156,
|
||||
"fail": 0,
|
||||
"total": 156
|
||||
},
|
||||
"suites": [
|
||||
{
|
||||
"name": "fib",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 2,
|
||||
"fail": 0,
|
||||
"total": 2
|
||||
},
|
||||
{
|
||||
"name": "sieve",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 2,
|
||||
"fail": 0,
|
||||
"total": 2
|
||||
},
|
||||
{
|
||||
"name": "quicksort",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 5,
|
||||
"fail": 0,
|
||||
"total": 5
|
||||
},
|
||||
{
|
||||
"name": "nqueens",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 2,
|
||||
"fail": 0,
|
||||
"total": 2
|
||||
},
|
||||
{
|
||||
"name": "calculator",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 5,
|
||||
"fail": 0,
|
||||
"total": 5
|
||||
},
|
||||
{
|
||||
"name": "collatz",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 11,
|
||||
"fail": 0,
|
||||
"total": 11
|
||||
},
|
||||
{
|
||||
"name": "palindrome",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 8,
|
||||
"fail": 0,
|
||||
"total": 8
|
||||
},
|
||||
{
|
||||
"name": "maybe",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 12,
|
||||
"fail": 0,
|
||||
"total": 12
|
||||
},
|
||||
{
|
||||
"name": "fizzbuzz",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 12,
|
||||
"fail": 0,
|
||||
"total": 12
|
||||
},
|
||||
{
|
||||
"name": "anagram",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 9,
|
||||
"fail": 0,
|
||||
"total": 9
|
||||
},
|
||||
{
|
||||
"name": "roman",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 14,
|
||||
"fail": 0,
|
||||
"total": 14
|
||||
},
|
||||
{
|
||||
"name": "binary",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 12,
|
||||
"fail": 0,
|
||||
"total": 12
|
||||
},
|
||||
{
|
||||
"name": "either",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 12,
|
||||
"fail": 0,
|
||||
"total": 12
|
||||
},
|
||||
{
|
||||
"name": "primes",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 12,
|
||||
"fail": 0,
|
||||
"total": 12
|
||||
},
|
||||
{
|
||||
"name": "zipwith",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 9,
|
||||
"fail": 0,
|
||||
"total": 9
|
||||
},
|
||||
{
|
||||
"name": "matrix",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 8,
|
||||
"fail": 0,
|
||||
"total": 8
|
||||
},
|
||||
{
|
||||
"name": "wordcount",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 7,
|
||||
"fail": 0,
|
||||
"total": 7
|
||||
},
|
||||
{
|
||||
"name": "powers",
|
||||
"pass": 0,
|
||||
"fail": 1,
|
||||
"total": 1
|
||||
"pass": 14,
|
||||
"fail": 0,
|
||||
"total": 14
|
||||
}
|
||||
],
|
||||
"source_scoreboard": "lib/haskell/scoreboard.json"
|
||||
"source_scoreboard": "lib/haskell/scoreboard.json",
|
||||
"note": "Step 1: previous baseline (0/18) was an artefact of the old conformance.sh bug \u2014 its (ok-len 3 ...) grep never matched, defaulting every program to 0 pass / 1 fail. Shared driver in Step 1 reads counters correctly."
|
||||
}
|
||||
|
||||
221
lib/guest/conformance.sh
Executable file
221
lib/guest/conformance.sh
Executable file
@@ -0,0 +1,221 @@
|
||||
#!/usr/bin/env bash
|
||||
# lib/guest/conformance.sh — shared, config-driven conformance driver.
|
||||
#
|
||||
# Usage:
|
||||
# bash lib/guest/conformance.sh <conf-file>
|
||||
#
|
||||
# The conf file is a bash file that sets:
|
||||
# LANG_NAME e.g. prolog
|
||||
# PRELOADS=( ... ) .sx files to load before any suite (path from repo root)
|
||||
# SUITES=( ... ) colon-separated entries; format depends on MODE
|
||||
# MODE "dict" or "counters"
|
||||
# COUNTERS_PASS (counters mode) global symbol for the pass counter
|
||||
# COUNTERS_FAIL (counters mode) global symbol for the fail counter
|
||||
# TIMEOUT_PER_SUITE (optional, counters mode) seconds per suite, default 120
|
||||
# SCOREBOARD_DIR (optional) defaults to lib/$LANG_NAME
|
||||
#
|
||||
# It may override the bash functions emit_scoreboard_json / emit_scoreboard_md
|
||||
# to produce the per-language scoreboard schema. Defaults are provided.
|
||||
#
|
||||
# Suite formats:
|
||||
# MODE=dict — "name:test-file:(runner-fn)"
|
||||
# The runner expression is evaluated and is expected to
|
||||
# return a dict with :passed/:failed/:total.
|
||||
# MODE=counters — "name:test-file"
|
||||
# Each suite is run in a fresh sx_server session: preloads
|
||||
# are loaded, then the test file, then counters are read.
|
||||
# The suite is treated as starting from counters (0, 0).
|
||||
#
|
||||
# Output:
|
||||
# Writes $SCOREBOARD_DIR/scoreboard.json and $SCOREBOARD_DIR/scoreboard.md.
|
||||
# Exits 0 if every suite is green, 1 otherwise.
|
||||
|
||||
set -uo pipefail
|
||||
cd "$(git rev-parse --show-toplevel)"
|
||||
|
||||
if [ "$#" -lt 1 ]; then
|
||||
echo "usage: $0 <conf-file>" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
CONF="$1"
|
||||
if [ ! -f "$CONF" ]; then
|
||||
echo "config not found: $CONF" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Defaults — the conf file may override these.
|
||||
LANG_NAME=
|
||||
PRELOADS=()
|
||||
SUITES=()
|
||||
MODE=dict
|
||||
COUNTERS_PASS=
|
||||
COUNTERS_FAIL=
|
||||
TIMEOUT_PER_SUITE=120
|
||||
SCOREBOARD_DIR=
|
||||
|
||||
emit_scoreboard_json() {
|
||||
# Generic schema. Per-lang configs override this for byte-equality with
|
||||
# historical scoreboards.
|
||||
local n=${#GC_NAMES[@]} i sep
|
||||
printf '{\n'
|
||||
printf ' "lang": "%s",\n' "$LANG_NAME"
|
||||
printf ' "total_passed": %d,\n' "$GC_TOTAL_PASS"
|
||||
printf ' "total_failed": %d,\n' "$GC_TOTAL_FAIL"
|
||||
printf ' "total": %d,\n' "$GC_TOTAL"
|
||||
printf ' "suites": ['
|
||||
for ((i=0; i<n; i++)); do
|
||||
sep=","; [ $i -eq $((n-1)) ] && sep=""
|
||||
printf '\n {"name":"%s","passed":%d,"failed":%d,"total":%d}%s' \
|
||||
"${GC_NAMES[$i]}" "${GC_PASS[$i]}" "${GC_FAIL[$i]}" "${GC_TOTAL_S[$i]}" "$sep"
|
||||
done
|
||||
printf '\n ],\n'
|
||||
printf ' "generated": "%s"\n' "$(date -Iseconds 2>/dev/null || date)"
|
||||
printf '}\n'
|
||||
}
|
||||
|
||||
emit_scoreboard_md() {
|
||||
local n=${#GC_NAMES[@]} i status
|
||||
printf '# %s scoreboard\n\n' "$LANG_NAME"
|
||||
printf '**%d / %d passing** (%d failure(s)).\n\n' "$GC_TOTAL_PASS" "$GC_TOTAL" "$GC_TOTAL_FAIL"
|
||||
printf '| Suite | Passed | Total | Status |\n'
|
||||
printf '|-------|--------|-------|--------|\n'
|
||||
for ((i=0; i<n; i++)); do
|
||||
status="ok"; [ "${GC_FAIL[$i]}" -gt 0 ] && status="FAIL"
|
||||
printf '| %s | %d | %d | %s |\n' \
|
||||
"${GC_NAMES[$i]}" "${GC_PASS[$i]}" "${GC_TOTAL_S[$i]}" "$status"
|
||||
done
|
||||
}
|
||||
|
||||
# shellcheck disable=SC1090
|
||||
source "$CONF"
|
||||
|
||||
if [ -z "$LANG_NAME" ]; then
|
||||
echo "LANG_NAME not set in $CONF" >&2
|
||||
exit 2
|
||||
fi
|
||||
SCOREBOARD_DIR="${SCOREBOARD_DIR:-lib/$LANG_NAME}"
|
||||
|
||||
SX="${SX_SERVER:-hosts/ocaml/_build/default/bin/sx_server.exe}"
|
||||
if [ ! -x "$SX" ]; then
|
||||
MAIN_ROOT=$(git worktree list 2>/dev/null | head -1 | awk '{print $1}')
|
||||
if [ -n "${MAIN_ROOT:-}" ] && [ -x "$MAIN_ROOT/$SX" ]; then
|
||||
SX="$MAIN_ROOT/$SX"
|
||||
else
|
||||
echo "ERROR: sx_server.exe not found (set SX_SERVER to override)." >&2
|
||||
exit 2
|
||||
fi
|
||||
fi
|
||||
|
||||
GC_NAMES=()
|
||||
GC_PASS=()
|
||||
GC_FAIL=()
|
||||
GC_TOTAL_S=()
|
||||
|
||||
parse_result_line() {
|
||||
# Match a (gc-result "name" P F T) line.
|
||||
local line="$1"
|
||||
if [[ "$line" =~ ^\(gc-result\ \"([^\"]+)\"\ ([0-9]+)\ ([0-9]+)\ ([0-9]+)\)$ ]]; then
|
||||
GC_NAMES+=("${BASH_REMATCH[1]}")
|
||||
GC_PASS+=("${BASH_REMATCH[2]}")
|
||||
GC_FAIL+=("${BASH_REMATCH[3]}")
|
||||
GC_TOTAL_S+=("${BASH_REMATCH[4]}")
|
||||
return 0
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
case "$MODE" in
|
||||
dict)
|
||||
SCRIPT='(epoch 1)
|
||||
'
|
||||
for f in "${PRELOADS[@]}"; do
|
||||
SCRIPT+='(load "'"$f"'")
|
||||
'
|
||||
done
|
||||
SCRIPT+='(load "lib/guest/conformance.sx")
|
||||
'
|
||||
for entry in "${SUITES[@]}"; do
|
||||
IFS=: read -r _ file _ <<< "$entry"
|
||||
SCRIPT+='(load "'"$file"'")
|
||||
'
|
||||
done
|
||||
SCRIPT+='(epoch 2)
|
||||
'
|
||||
for entry in "${SUITES[@]}"; do
|
||||
IFS=: read -r name _ runner <<< "$entry"
|
||||
SCRIPT+='(eval "(gc-dict-result \"'"$name"'\" '"$runner"')")
|
||||
'
|
||||
done
|
||||
OUTPUT=$(printf '%s' "$SCRIPT" | "$SX" 2>&1)
|
||||
expected=${#SUITES[@]}
|
||||
matched=0
|
||||
while IFS= read -r line; do
|
||||
if parse_result_line "$line"; then
|
||||
matched=$((matched + 1))
|
||||
fi
|
||||
done <<< "$OUTPUT"
|
||||
if [ "$matched" -ne "$expected" ]; then
|
||||
echo "Expected $expected suite results, got $matched" >&2
|
||||
echo "---- raw output ----" >&2
|
||||
printf '%s\n' "$OUTPUT" >&2
|
||||
exit 3
|
||||
fi
|
||||
;;
|
||||
counters)
|
||||
if [ -z "$COUNTERS_PASS" ] || [ -z "$COUNTERS_FAIL" ]; then
|
||||
echo "MODE=counters requires COUNTERS_PASS and COUNTERS_FAIL in $CONF" >&2
|
||||
exit 2
|
||||
fi
|
||||
for entry in "${SUITES[@]}"; do
|
||||
IFS=: read -r name file <<< "$entry"
|
||||
TMPFILE=$(mktemp)
|
||||
{
|
||||
printf '(epoch 1)\n'
|
||||
for f in "${PRELOADS[@]}"; do printf '(load "%s")\n' "$f"; done
|
||||
printf '(load "lib/guest/conformance.sx")\n'
|
||||
printf '(epoch 2)\n'
|
||||
printf '(load "%s")\n' "$file"
|
||||
printf '(epoch 3)\n'
|
||||
printf '(eval "(gc-counters-result \\"%s\\" 0 0 %s %s)")\n' \
|
||||
"$name" "$COUNTERS_PASS" "$COUNTERS_FAIL"
|
||||
} > "$TMPFILE"
|
||||
OUTPUT=$(timeout "$TIMEOUT_PER_SUITE" "$SX" < "$TMPFILE" 2>&1 || true)
|
||||
rm -f "$TMPFILE"
|
||||
result=$(printf '%s\n' "$OUTPUT" | grep -E '^\(gc-result ' | tail -1 || true)
|
||||
if [ -n "$result" ] && parse_result_line "$result"; then
|
||||
:
|
||||
else
|
||||
# Suite hung or crashed before emitting a result. Record 0/1 so it
|
||||
# shows up as a failure rather than vanishing.
|
||||
GC_NAMES+=("$name")
|
||||
GC_PASS+=(0)
|
||||
GC_FAIL+=(1)
|
||||
GC_TOTAL_S+=(1)
|
||||
fi
|
||||
done
|
||||
;;
|
||||
*)
|
||||
echo "Unknown MODE=$MODE in $CONF (expected dict|counters)" >&2
|
||||
exit 2
|
||||
;;
|
||||
esac
|
||||
|
||||
GC_TOTAL_PASS=0
|
||||
GC_TOTAL_FAIL=0
|
||||
GC_TOTAL=0
|
||||
for ((i=0; i<${#GC_NAMES[@]}; i++)); do
|
||||
GC_TOTAL_PASS=$((GC_TOTAL_PASS + GC_PASS[i]))
|
||||
GC_TOTAL_FAIL=$((GC_TOTAL_FAIL + GC_FAIL[i]))
|
||||
GC_TOTAL=$((GC_TOTAL + GC_TOTAL_S[i]))
|
||||
done
|
||||
|
||||
mkdir -p "$SCOREBOARD_DIR"
|
||||
emit_scoreboard_json > "$SCOREBOARD_DIR/scoreboard.json"
|
||||
emit_scoreboard_md > "$SCOREBOARD_DIR/scoreboard.md"
|
||||
|
||||
if [ "$GC_TOTAL_FAIL" -gt 0 ]; then
|
||||
echo "$GC_TOTAL_FAIL failure(s) across $GC_TOTAL tests" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "All $GC_TOTAL tests pass."
|
||||
40
lib/guest/conformance.sx
Normal file
40
lib/guest/conformance.sx
Normal file
@@ -0,0 +1,40 @@
|
||||
;; lib/guest/conformance.sx — shared helpers for the guest conformance driver.
|
||||
;;
|
||||
;; The bash driver lib/guest/conformance.sh loads this file and then for each
|
||||
;; suite emits an (eval "...") form whose result is a tagged list:
|
||||
;;
|
||||
;; (gc-result NAME PASSED FAILED TOTAL)
|
||||
;;
|
||||
;; The driver greps these from sx_server's output and aggregates them.
|
||||
;;
|
||||
;; Two suite shapes are supported:
|
||||
;;
|
||||
;; :dict — runner expression returns a dict with :passed/:failed/:total.
|
||||
;; (gc-dict-result "parse" (pl-parse-tests-run!))
|
||||
;;
|
||||
;; :counters — runner has no return value, mutates pass/fail global counters.
|
||||
;; (gc-counters-result NAME P0 F0 PASS FAIL)
|
||||
;; where P0/F0 are the counters captured BEFORE the suite ran
|
||||
;; and PASS/FAIL are the counters AFTER.
|
||||
|
||||
(define
|
||||
gc-dict-result
|
||||
(fn
|
||||
(name r)
|
||||
(list
|
||||
(quote gc-result)
|
||||
name
|
||||
(get r :passed)
|
||||
(get r :failed)
|
||||
(get r :total))))
|
||||
|
||||
(define
|
||||
gc-counters-result
|
||||
(fn
|
||||
(name p0 f0 p1 f1)
|
||||
(list
|
||||
(quote gc-result)
|
||||
name
|
||||
(- p1 p0)
|
||||
(- f1 f0)
|
||||
(- (+ p1 f1) (+ p0 f0)))))
|
||||
Reference in New Issue
Block a user