rose-ash/scripts/test-suite-baseline.sh

#!/bin/bash
# test-suite-baseline.sh — W14/F10: make FAIL mean something again.
#
# The review (conformance.md F-10): the OCaml suite is not green — a
# permanent ~274-failure band (in-progress hs-* + r7rs radix shadow) is
# normalized, so real regressions hide inside the red noise and nobody can
# tell a new failure from the band.
#
# This gate pins the band instead of ignoring it: the full suite's FAIL
# set is diffed against the checked-in baseline
# (spec/tests/known-failures.txt). Two red conditions, both loud:
#   NEW failure      -> a real regression: fix it (or, if intentional,
#                       justify + add to the baseline in the same commit)
#   VANISHED failure -> something got fixed: delete it from the baseline
#                       so the win is locked in
# Neither touches the runner or the hs loops' scoreboards — the band still
# prints as FAIL lines for the teams working through it.
#
# Usage: bash scripts/test-suite-baseline.sh
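# Runtime: full suite, ~5–15 min (hard-capped by a 3000 s timeout).
# Exit: 0 = fail set identical to baseline
#       1 = NEW and/or VANISHED failures, or the runner crashed/timed out
#       2 = skipped: a prerequisite is missing (e.g. runner not built,
#           baseline file absent)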
# No -e on purpose: the runner's exit status is inspected by hand further
# down, so a non-zero status there must not abort the script.
set -uo pipefail

# All paths below are relative to the repo root (the parent of scripts/).
# If the cd fails we must stop: carrying on would resolve RUNNER/BASELINE
# against whatever directory the caller happened to be in.
cd "$(dirname "$0")/.." || { echo "SKIP: cannot cd to repo root from $0" >&2; exit 2; }

readonly RUNNER=hosts/ocaml/_build/default/bin/run_tests.exe
readonly BASELINE=spec/tests/known-failures.txt

# Missing prerequisites are a SKIP (exit 2), distinct from a RED gate (exit 1).
[[ -x "$RUNNER" ]] || { echo "SKIP: $RUNNER not built" >&2; exit 2; }
[[ -f "$BASELINE" ]] || { echo "SKIP: $BASELINE missing" >&2; exit 2; }

# Capture the runner's combined stdout+stderr in a temp log.
log=$(mktemp) || { echo "RED: mktemp failed (runner log)"; exit 1; }

# Remove the temp files on every exit path. $current is created later in
# the script, so it is referenced defensively (it may not exist yet, and
# set -u is active).
cleanup_tmp() {
  local tmp
  for tmp in "${log:-}" "${current:-}"; do
    if [[ -n "$tmp" ]]; then
      rm -f -- "$tmp"
    fi
  done
}
trap cleanup_tmp EXIT

# Hard cap of 3000 s on the whole suite; timeout reports its own non-zero
# status when the cap is hit, which lands in the RED branch below.
timeout 3000 "$RUNNER" > "$log" 2>&1
rc=$?

# Only 0 and 1 are accepted as a completed run; anything else means the
# FAIL set in the log cannot be trusted.
if [[ $rc -ne 0 && $rc -ne 1 ]]; then
  echo "RED: runner exited $rc (timeout/crash)"
  tail -5 "$log"
  exit 1
fi

# Normalize: keep the stable test identity (suite > name), drop messages
# (error text may contain addresses/timings that churn).
#
# The text tools are pinned to LC_ALL=C and grep gets -a so that stray
# NUL/invalid bytes in the runner output cannot make grep swallow matches
# behind a "binary file matches" notice, or stop sed's `.*` short of the
# end of the line. The same locale is used for both sorts and for comm,
# which requires its inputs to be ordered in the collation it runs under.
current=$(mktemp) || { echo "RED: mktemp failed (fail set)"; rm -f "$log"; exit 1; }

LC_ALL=C grep -a '^  FAIL: ' "$log" \
  | LC_ALL=C sed 's/^  FAIL: //; s/: .*$//' \
  | LC_ALL=C sort -u > "$current"
extract_rc=("${PIPESTATUS[@]}")

# grep exits 1 when there is no FAIL line at all; that is a legitimate
# (empty) fail set. Only grep >1, or any sed/sort failure, is an error.
if (( extract_rc[0] > 1 || extract_rc[1] != 0 || extract_rc[2] != 0 )); then
  echo "RED: could not extract the FAIL set from the runner log"
  rm -f "$log" "$current"
  exit 1
fi

# A failed comm would leave both variables empty, which downstream reads
# as "identical to baseline" — so its status has to be checked explicitly.
#   comm -13: lines only in $current  -> failures missing from the baseline
#   comm -23: lines only in baseline  -> baseline entries no longer failing
if ! new_failures=$(LC_ALL=C comm -13 <(LC_ALL=C sort -u "$BASELINE") "$current") \
  || ! vanished=$(LC_ALL=C comm -23 <(LC_ALL=C sort -u "$BASELINE") "$current"); then
  echo "RED: could not diff the fail set against $BASELINE"
  rm -f "$log" "$current"
  exit 1
fi

# Last "Results:" line of the runner log, echoed at the end for context.
# -a keeps grep from replacing the match with a "binary file matches"
# notice if the log contains non-text bytes.
summary=$(grep -a '^Results:' "$log" | tail -1)

# Count the baseline the same way it is compared (sort -u), so duplicate
# lines or a missing final newline cannot skew the number. The arithmetic
# expansion strips the padding some wc implementations emit.
known_count=$(sort -u "$BASELINE" | wc -l)
known_count=$((known_count))

# Either kind of drift turns the gate red; both are listed when both occur.
red=0
if [[ -n "$new_failures" ]]; then
  echo "RED: NEW failures not in baseline:"
  sed 's/^/  + /' <<<"$new_failures"
  red=1
fi
if [[ -n "$vanished" ]]; then
  echo "RED: baseline entries now PASSING (delete them from $BASELINE):"
  sed 's/^/  - /' <<<"$vanished"
  red=1
fi
if [[ $red -eq 0 ]]; then
  echo "GREEN: fail set identical to baseline ($known_count known failures)"
fi

printf '%s\n' "$summary"
rm -f "$log" "$current"
exit "$red"