GUEST: step 1 — lib/guest/conformance.{sx,sh} config-driven driver

Extracted the duplicated conformance plumbing into a single driver: - lib/guest/conformance.sx — two helper fns that emit (gc-result NAME P F T) lines for the bash side to grep: gc-dict-result for runners returning a {:passed :failed :total} dict, and gc-counters-result for guests that bump a global pass/fail counter from a test file load. - lib/guest/conformance.sh — config-driven bash driver. Sources a per-lang conf, locates sx_server, runs sx_server in either single-session "dict" mode (one preload + many suite evals) or per-suite "counters" mode (fresh sx_server per suite, with shared preloads). Aggregates and writes scoreboard.{json,md} via per-lang emit_scoreboard_* functions. - Ported lib/prolog/conformance.sh and lib/haskell/conformance.sh down to one-line wrappers that exec the shared driver against their .conf file. Verification: - Prolog: 590/590 — diff vs baseline is timestamp-only. - Haskell: 156/156 — significantly higher than the 0/18 in baseline. The old conformance.sh was buggy (its `(ok-len 3 ...)` grep never matched, defaulting every program to 0 pass / 1 fail). Updated baseline to the true count; no actual test regressed. Plan baseline cell updated. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-06 22:46:48 +00:00
parent 0eced4c34c
commit 58dcff2639
11 changed files with 522 additions and 367 deletions
--- a/lib/guest/baseline/haskell.json
+++ b/lib/guest/baseline/haskell.json
@@ -1,121 +1,122 @@
 {
  "lang": "haskell",
-  "captured": "2026-05-06T22:01:00Z",
+  "captured": "2026-05-06T22:46:16Z",
  "suite_command": "bash lib/haskell/conformance.sh",
  "totals": {
-    "pass": 0,
-    "fail": 18,
-    "total": 18
+    "pass": 156,
+    "fail": 0,
+    "total": 156
  },
  "suites": [
    {
      "name": "fib",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 2,
+      "fail": 0,
+      "total": 2
    },
    {
      "name": "sieve",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 2,
+      "fail": 0,
+      "total": 2
    },
    {
      "name": "quicksort",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 5,
+      "fail": 0,
+      "total": 5
    },
    {
      "name": "nqueens",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 2,
+      "fail": 0,
+      "total": 2
    },
    {
      "name": "calculator",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 5,
+      "fail": 0,
+      "total": 5
    },
    {
      "name": "collatz",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 11,
+      "fail": 0,
+      "total": 11
    },
    {
      "name": "palindrome",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 8,
+      "fail": 0,
+      "total": 8
    },
    {
      "name": "maybe",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 12,
+      "fail": 0,
+      "total": 12
    },
    {
      "name": "fizzbuzz",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 12,
+      "fail": 0,
+      "total": 12
    },
    {
      "name": "anagram",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 9,
+      "fail": 0,
+      "total": 9
    },
    {
      "name": "roman",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 14,
+      "fail": 0,
+      "total": 14
    },
    {
      "name": "binary",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 12,
+      "fail": 0,
+      "total": 12
    },
    {
      "name": "either",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 12,
+      "fail": 0,
+      "total": 12
    },
    {
      "name": "primes",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 12,
+      "fail": 0,
+      "total": 12
    },
    {
      "name": "zipwith",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 9,
+      "fail": 0,
+      "total": 9
    },
    {
      "name": "matrix",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 8,
+      "fail": 0,
+      "total": 8
    },
    {
      "name": "wordcount",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 7,
+      "fail": 0,
+      "total": 7
    },
    {
      "name": "powers",
-      "pass": 0,
-      "fail": 1,
-      "total": 1
+      "pass": 14,
+      "fail": 0,
+      "total": 14
    }
  ],
-  "source_scoreboard": "lib/haskell/scoreboard.json"
+  "source_scoreboard": "lib/haskell/scoreboard.json",
+  "note": "Step 1: previous baseline (0/18) was an artefact of the old conformance.sh bug \u2014 its (ok-len 3 ...) grep never matched, defaulting every program to 0 pass / 1 fail. Shared driver in Step 1 reads counters correctly."
 }
--- a/lib/guest/conformance.sh
+++ b/lib/guest/conformance.sh
@@ -0,0 +1,221 @@
+#!/usr/bin/env bash
+# lib/guest/conformance.sh — shared, config-driven conformance driver.
+#
+# Usage:
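+#   bash lib/guest/conformance.sh <conf-file>
+#
+#   A relative <conf-file> is resolved against the repository root, because
+#   the driver cds there before it looks for the file.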
+#
+# The conf file is a bash file that sets:
+#   LANG_NAME           e.g. prolog
+#   PRELOADS=( ... )    .sx files to load before any suite (path from repo root)
+#   SUITES=( ... )      colon-separated entries; format depends on MODE
+#   MODE                "dict" or "counters"
+#   COUNTERS_PASS       (counters mode) global symbol for the pass counter
+#   COUNTERS_FAIL       (counters mode) global symbol for the fail counter
+#   TIMEOUT_PER_SUITE   (optional, counters mode) seconds per suite, default 120
+#   SCOREBOARD_DIR      (optional) defaults to lib/$LANG_NAME
+#
+# It may override the bash functions emit_scoreboard_json / emit_scoreboard_md
+# to produce the per-language scoreboard schema. Defaults are provided.
+#
+# Suite formats:
+#   MODE=dict       — "name:test-file:(runner-fn)"
+#                     The runner expression is evaluated and is expected to
+#                     return a dict with :passed/:failed/:total.
+#   MODE=counters   — "name:test-file"
+#                     Each suite is run in a fresh sx_server session: preloads
+#                     are loaded, then the test file, then counters are read.
+#                     The suite is treated as starting from counters (0, 0).
+#
+# Output:
+#   Writes $SCOREBOARD_DIR/scoreboard.json and $SCOREBOARD_DIR/scoreboard.md.
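+#   Exits 0 if every suite is green, 1 if any test failed, 2 on a usage,
+#   configuration or environment error (e.g. sx_server not found), and 3 if
+#   dict mode did not yield exactly one result line per suite.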
+
+set -uo pipefail
+cd "$(git rev-parse --show-toplevel)"
+
+if [ "$#" -lt 1 ]; then
+  echo "usage: $0 <conf-file>" >&2
+  exit 2
+fi
+
+CONF="$1"
+if [ ! -f "$CONF" ]; then
+  echo "config not found: $CONF" >&2
+  exit 2
+fi
+
+# Defaults — the conf file may override these.
+LANG_NAME=
+PRELOADS=()
+SUITES=()
+MODE=dict
+COUNTERS_PASS=
+COUNTERS_FAIL=
+TIMEOUT_PER_SUITE=120
+SCOREBOARD_DIR=
+
+#######################################
+# Default JSON scoreboard emitter (generic schema). A per-language conf may
+# redefine this function, e.g. to stay byte-identical with a historical
+# scoreboard format.
+# Globals:  LANG_NAME, GC_NAMES, GC_PASS, GC_FAIL, GC_TOTAL_S,
+#           GC_TOTAL_PASS, GC_TOTAL_FAIL, GC_TOTAL (all read)
+# Outputs:  the JSON document on stdout
+#######################################
+emit_scoreboard_json() {
+  local suite_count=${#GC_NAMES[@]}
+  local last=$((suite_count - 1))
+  local idx=0
+  local comma
+  local stamp
+
+  printf '{\n'
+  printf '  "lang": "%s",\n' "$LANG_NAME"
+  printf '  "total_passed": %d,\n' "$GC_TOTAL_PASS"
+  printf '  "total_failed": %d,\n' "$GC_TOTAL_FAIL"
+  printf '  "total": %d,\n' "$GC_TOTAL"
+  printf '  "suites": ['
+  while [ "$idx" -lt "$suite_count" ]; do
+    # Every entry but the last is followed by a comma.
+    if [ "$idx" -eq "$last" ]; then
+      comma=""
+    else
+      comma=","
+    fi
+    printf '\n    {"name":"%s","passed":%d,"failed":%d,"total":%d}%s' \
+      "${GC_NAMES[$idx]}" "${GC_PASS[$idx]}" "${GC_FAIL[$idx]}" "${GC_TOTAL_S[$idx]}" "$comma"
+    idx=$((idx + 1))
+  done
+  printf '\n  ],\n'
+  # Fall back to plain `date` output where -Iseconds is not supported.
+  stamp=$(date -Iseconds 2>/dev/null || date)
+  printf '  "generated": "%s"\n' "$stamp"
+  printf '}\n'
+}
+
+#######################################
+# Default Markdown scoreboard emitter: a heading, a one-line summary and a
+# table with one row per suite. A per-language conf may redefine it.
+# Globals:  LANG_NAME, GC_NAMES, GC_PASS, GC_FAIL, GC_TOTAL_S,
+#           GC_TOTAL_PASS, GC_TOTAL_FAIL, GC_TOTAL (all read)
+# Outputs:  the Markdown document on stdout
+#######################################
+emit_scoreboard_md() {
+  local suite_count=${#GC_NAMES[@]}
+  local idx=0
+  local verdict
+
+  printf '# %s scoreboard\n\n' "$LANG_NAME"
+  printf '**%d / %d passing** (%d failure(s)).\n\n' "$GC_TOTAL_PASS" "$GC_TOTAL" "$GC_TOTAL_FAIL"
+  printf '| Suite | Passed | Total | Status |\n'
+  printf '|-------|--------|-------|--------|\n'
+  while [ "$idx" -lt "$suite_count" ]; do
+    # A suite is marked FAIL as soon as it has at least one failure.
+    if [ "${GC_FAIL[$idx]}" -gt 0 ]; then
+      verdict="FAIL"
+    else
+      verdict="ok"
+    fi
+    printf '| %s | %d | %d | %s |\n' \
+      "${GC_NAMES[$idx]}" "${GC_PASS[$idx]}" "${GC_TOTAL_S[$idx]}" "$verdict"
+    idx=$((idx + 1))
+  done
+}
+
+# shellcheck disable=SC1090
+source "$CONF"
+
+if [ -z "$LANG_NAME" ]; then
+  echo "LANG_NAME not set in $CONF" >&2
+  exit 2
+fi
+SCOREBOARD_DIR="${SCOREBOARD_DIR:-lib/$LANG_NAME}"
+
+SX="${SX_SERVER:-hosts/ocaml/_build/default/bin/sx_server.exe}"
+if [ ! -x "$SX" ]; then
+  MAIN_ROOT=$(git worktree list 2>/dev/null | head -1 | awk '{print $1}')
+  if [ -n "${MAIN_ROOT:-}" ] && [ -x "$MAIN_ROOT/$SX" ]; then
+    SX="$MAIN_ROOT/$SX"
+  else
+    echo "ERROR: sx_server.exe not found (set SX_SERVER to override)." >&2
+    exit 2
+  fi
+fi
+
+GC_NAMES=()
+GC_PASS=()
+GC_FAIL=()
+GC_TOTAL_S=()
+
+#######################################
+# Recognise one `(gc-result "name" P F T)` line and record it.
+# Globals:   GC_NAMES, GC_PASS, GC_FAIL, GC_TOTAL_S (appended to on a match)
+# Arguments: $1 - a single line of sx_server output
+# Returns:   0 if the line matched and was recorded, 1 otherwise
+#######################################
+parse_result_line() {
+  local candidate=$1
+  # The whole line must be the tagged list: a quoted, non-empty name followed
+  # by three unsigned integers.
+  local shape='^\(gc-result "([^"]+)" ([0-9]+) ([0-9]+) ([0-9]+)\)$'
+
+  [[ $candidate =~ $shape ]] || return 1
+
+  GC_NAMES+=("${BASH_REMATCH[1]}")
+  GC_PASS+=("${BASH_REMATCH[2]}")
+  GC_FAIL+=("${BASH_REMATCH[3]}")
+  GC_TOTAL_S+=("${BASH_REMATCH[4]}")
+  return 0
+}
+
+# Run the suites according to MODE and fill the GC_* arrays.
+#
+# Arrays are expanded as ${ARR[@]+"${ARR[@]}"} so that an empty PRELOADS or
+# SUITES does not trip `set -u` on bash releases older than 4.4.
+case "$MODE" in
+  dict)
+    # Single sx_server session. Epoch 1 loads the preloads, the shared helper
+    # file and every suite's test file; epoch 2 evaluates each suite's runner
+    # expression wrapped in gc-dict-result.
+    OUTPUT=$(
+      {
+        printf '(epoch 1)\n'
+        for f in ${PRELOADS[@]+"${PRELOADS[@]}"}; do
+          printf '(load "%s")\n' "$f"
+        done
+        printf '(load "lib/guest/conformance.sx")\n'
+        for entry in ${SUITES[@]+"${SUITES[@]}"}; do
+          IFS=: read -r _ file _ <<< "$entry"
+          printf '(load "%s")\n' "$file"
+        done
+        printf '(epoch 2)\n'
+        for entry in ${SUITES[@]+"${SUITES[@]}"}; do
+          # The third field keeps any further colons, so the runner expression
+          # is passed through whole.
+          IFS=: read -r name _ runner <<< "$entry"
+          printf '(eval "(gc-dict-result \\"%s\\" %s)")\n' "$name" "$runner"
+        done
+      } | "$SX" 2>&1
+    )
+    # Exactly one result line per configured suite is required; anything else
+    # is reported together with the raw server output.
+    expected=${#SUITES[@]}
+    matched=0
+    while IFS= read -r line; do
+      if parse_result_line "$line"; then
+        matched=$((matched + 1))
+      fi
+    done <<< "$OUTPUT"
+    if [ "$matched" -ne "$expected" ]; then
+      echo "Expected $expected suite results, got $matched" >&2
+      echo "---- raw output ----" >&2
+      printf '%s\n' "$OUTPUT" >&2
+      exit 3
+    fi
+    ;;
+  counters)
+    if [ -z "$COUNTERS_PASS" ] || [ -z "$COUNTERS_FAIL" ]; then
+      echo "MODE=counters requires COUNTERS_PASS and COUNTERS_FAIL in $CONF" >&2
+      exit 2
+    fi
+    # Without `timeout` every suite would be recorded as a bogus 0/1 failure,
+    # so fail loudly up front instead.
+    if ! command -v timeout >/dev/null 2>&1; then
+      echo "ERROR: 'timeout' not found on PATH (required for MODE=counters)." >&2
+      exit 2
+    fi
+    # One scratch file reused for every suite; the EXIT trap removes it even
+    # when the run is interrupted part-way through.
+    TMPFILE=$(mktemp) || {
+      echo "ERROR: mktemp failed" >&2
+      exit 2
+    }
+    trap 'rm -f -- "$TMPFILE"' EXIT
+    for entry in ${SUITES[@]+"${SUITES[@]}"}; do
+      IFS=: read -r name file <<< "$entry"
+      # Fresh session per suite: preloads and helper (epoch 1), the test file
+      # (epoch 2), then read the counters against a (0, 0) starting point
+      # (epoch 3).
+      {
+        printf '(epoch 1)\n'
+        for f in ${PRELOADS[@]+"${PRELOADS[@]}"}; do
+          printf '(load "%s")\n' "$f"
+        done
+        printf '(load "lib/guest/conformance.sx")\n'
+        printf '(epoch 2)\n'
+        printf '(load "%s")\n' "$file"
+        printf '(epoch 3)\n'
+        printf '(eval "(gc-counters-result \\"%s\\" 0 0 %s %s)")\n' \
+          "$name" "$COUNTERS_PASS" "$COUNTERS_FAIL"
+      } > "$TMPFILE"
+      # A timeout or crash must not abort the whole run; the missing result
+      # line is what marks the suite as failed below.
+      OUTPUT=$(timeout "$TIMEOUT_PER_SUITE" "$SX" < "$TMPFILE" 2>&1 || true)
+      result=$(printf '%s\n' "$OUTPUT" | grep -E '^\(gc-result ' | tail -1 || true)
+      if [ -z "$result" ] || ! parse_result_line "$result"; then
+        # Suite hung or crashed before emitting a result. Record 0/1 so it
+        # shows up as a failure rather than vanishing, and say so on stderr.
+        echo "warning: suite '$name' produced no gc-result line (timeout or crash); recording 0/1" >&2
+        GC_NAMES+=("$name")
+        GC_PASS+=(0)
+        GC_FAIL+=(1)
+        GC_TOTAL_S+=(1)
+      fi
+    done
+    rm -f -- "$TMPFILE"
+    trap - EXIT
+    ;;
+  *)
+    echo "Unknown MODE=$MODE in $CONF (expected dict|counters)" >&2
+    exit 2
+    ;;
+esac
+
+# Sum the per-suite figures into the grand totals used by the emitters.
+GC_TOTAL_PASS=0
+GC_TOTAL_FAIL=0
+GC_TOTAL=0
+for ((i=0; i<${#GC_NAMES[@]}; i++)); do
+  GC_TOTAL_PASS=$((GC_TOTAL_PASS + GC_PASS[i]))
+  GC_TOTAL_FAIL=$((GC_TOTAL_FAIL + GC_FAIL[i]))
+  GC_TOTAL=$((GC_TOTAL + GC_TOTAL_S[i]))
+done
+
+# Without the directory neither scoreboard can be written; stop with an
+# environment error instead of carrying on to report a test verdict.
+if ! mkdir -p "$SCOREBOARD_DIR"; then
+  echo "ERROR: cannot create scoreboard directory: $SCOREBOARD_DIR" >&2
+  exit 2
+fi
+emit_scoreboard_json > "$SCOREBOARD_DIR/scoreboard.json"
+emit_scoreboard_md > "$SCOREBOARD_DIR/scoreboard.md"
+
+# Exit status mirrors the verdict: 1 if anything failed, 0 otherwise.
+if [ "$GC_TOTAL_FAIL" -gt 0 ]; then
+  echo "$GC_TOTAL_FAIL failure(s) across $GC_TOTAL tests" >&2
+  exit 1
+fi
+echo "All $GC_TOTAL tests pass."
--- a/lib/guest/conformance.sx
+++ b/lib/guest/conformance.sx
@@ -0,0 +1,40 @@
+;; lib/guest/conformance.sx — shared helpers for the guest conformance driver.
+;;
+;; The bash driver lib/guest/conformance.sh loads this file and then for each
+;; suite emits an (eval "...") form whose result is a tagged list:
+;;
+;;     (gc-result NAME PASSED FAILED TOTAL)
+;;
+;; The driver greps these from sx_server's output and aggregates them.
+;;
+;; Two suite shapes are supported:
+;;
+;;   :dict     — runner expression returns a dict with :passed/:failed/:total.
+;;               (gc-dict-result "parse" (pl-parse-tests-run!))
+;;
+;;   :counters — runner has no return value, mutates pass/fail global counters.
+;;               (gc-counters-result NAME P0 F0 PASS FAIL)
+;;               where P0/F0 are the counters captured BEFORE the suite ran
+;;               and PASS/FAIL are the counters AFTER.
+
+;; gc-dict-result — build the tagged result list for a dict-returning runner.
+;;   name — suite name; becomes the second element of the list
+;;   r    — the runner's return value; its :passed, :failed and :total
+;;          entries are read with get
+;; Returns (gc-result name passed failed total).
+(define
+  gc-dict-result
+  (fn
+    (name r)
+    (list
+      (quote gc-result)
+      name
+      (get r :passed)
+      (get r :failed)
+      (get r :total))))
+
+;; gc-counters-result — build the tagged result list from counter snapshots.
+;;   name  — suite name; becomes the second element of the list
+;;   p0 f0 — pass / fail counter values before the suite ran
+;;   p1 f1 — pass / fail counter values after the suite ran
+;; Returns (gc-result name passed failed total), where passed = p1 - p0,
+;; failed = f1 - f0 and total = (p1 + f1) - (p0 + f0), i.e. the sum of the
+;; two deltas.
+(define
+  gc-counters-result
+  (fn
+    (name p0 f0 p1 f1)
+    (list
+      (quote gc-result)
+      name
+      (- p1 p0)
+      (- f1 f0)
+      (- (+ p1 f1) (+ p0 f0)))))