Initial commit of the lib/js/ tree and plans/ directory. A previous session left template-string work in progress — 278/280 unit tests pass (two failures: tpl part-count off-by-one, escaped-backtick ident lookup). test262-runner.py and the scoreboard are placeholders (0/8, 7 timeouts); fixing the runner is the next item in the queue.

#!/usr/bin/env python3
"""
test262-runner — run the official TC39 test262 suite against our JS-on-SX runtime.

Walks lib/js/test262-upstream/test/**/*.js, parses the YAML-ish frontmatter,
batches tests through sx_server.exe, and emits a JSON + Markdown scoreboard.

Usage:
    python3 lib/js/test262-runner.py                      # full run
    python3 lib/js/test262-runner.py --limit 2000         # first 2000 tests only
    python3 lib/js/test262-runner.py --filter built-ins/Math
    python3 lib/js/test262-runner.py --batch-size 200     # tests per sx_server boot

Outputs:
    lib/js/test262-scoreboard.json — per-category stats + top failure modes
    lib/js/test262-scoreboard.md   — human-readable summary (worst first)

Pinned to a commit (see test262-upstream/.git/HEAD after clone). To update:
    rm -rf lib/js/test262-upstream
    git -C lib/js clone --depth 1 https://github.com/tc39/test262.git test262-upstream

Timeouts:
    per-test wallclock:  5s
    per-batch wallclock: 120s
"""

from __future__ import annotations

import argparse
import dataclasses
import json
import re
import subprocess
import sys
import time
from collections import Counter, defaultdict
from pathlib import Path

REPO = Path(__file__).resolve().parents[2]
SX_SERVER = REPO / "hosts" / "ocaml" / "_build" / "default" / "bin" / "sx_server.exe"
UPSTREAM = REPO / "lib" / "js" / "test262-upstream"
TEST_ROOT = UPSTREAM / "test"
HARNESS_DIR = UPSTREAM / "harness"

# Default harness files every test implicitly gets (per INTERPRETING.md).
DEFAULT_HARNESS = ["assert.js", "sta.js"]

# Per-batch timeout (seconds). Each batch runs N tests; if sx_server hangs on
# one, we kill the whole batch and mark the unreached tests as timeouts.
BATCH_TIMEOUT_S = 120

# Per-test wallclock budget (seconds). It feeds the batch budget: a batch of
# N tests is killed after min(BATCH_TIMEOUT_S, max(30, N * PER_TEST_S)).
PER_TEST_S = 5

# Target batch size — tune to balance sx_server startup cost (~500ms) against
# memory use and the risk of one bad test killing many.
DEFAULT_BATCH_SIZE = 200
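
# Worked example of the batch budget (computed in main()): a full batch of
# 200 tests gets min(BATCH_TIMEOUT_S, max(30, 200 * PER_TEST_S)) =
# min(120, 1000) = 120 s, i.e. the cap dominates at ~0.6 s average per test;
# a 4-test tail batch gets max(30, 4 * 5) = 30 s.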


# ---------------------------------------------------------------------------
# Frontmatter parsing
# ---------------------------------------------------------------------------

FRONTMATTER_RE = re.compile(r"/\*---(.*?)---\*/", re.DOTALL)
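
# The shape this regex (and parse_frontmatter below) expects, as an
# illustrative, abridged test262 header:
#
#   /*---
#   esid: sec-math.abs
#   description: >
#       Math.abs returns the absolute value of its argument
#   flags: [onlyStrict]
#   includes: [propertyHelper.js]
#   negative:
#     phase: parse
#     type: SyntaxError
#   ---*/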


@dataclasses.dataclass
class Frontmatter:
    description: str = ""
    flags: list[str] = dataclasses.field(default_factory=list)
    includes: list[str] = dataclasses.field(default_factory=list)
    features: list[str] = dataclasses.field(default_factory=list)
    negative_phase: str | None = None
    negative_type: str | None = None
    esid: str | None = None
    es5id: str | None = None
    es6id: str | None = None


def _parse_yaml_list(s: str) -> list[str]:
    """Parse a `[a, b, c]` style list. Loose — test262 YAML uses this form almost exclusively."""
    s = s.strip()
    if s.startswith("[") and s.endswith("]"):
        s = s[1:-1]
    return [item.strip().strip('"').strip("'") for item in s.split(",") if item.strip()]
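
# For instance (illustrative): _parse_yaml_list('[generators, "async-functions"]')
# returns ['generators', 'async-functions']; brackets, quotes, and stray
# whitespace are all stripped.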


def parse_frontmatter(src: str) -> Frontmatter:
    """Parse test262 YAML-ish frontmatter. Lenient — handles the subset actually in use."""
    fm = Frontmatter()
    m = FRONTMATTER_RE.search(src)
    if not m:
        return fm
    body = m.group(1)

    # Walk lines, tracking state for the nested `negative:` phase/type form.
    lines = body.split("\n")
    i = 0
    current_key = None
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            i += 1
            continue
        # Top-level `key: value`
        m2 = re.match(r"^([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*(.*)$", line)
        if m2 and not line.startswith(" ") and not line.startswith("\t"):
            key, value = m2.group(1), m2.group(2).strip()
            if key == "description":
                # Multi-line descriptions arrive via `>` or `|`.
                if value in (">", "|"):
                    desc_lines: list[str] = []
                    j = i + 1
                    while j < len(lines):
                        nxt = lines[j]
                        if nxt.startswith(" ") or nxt.startswith("\t") or not nxt.strip():
                            desc_lines.append(nxt.strip())
                            j += 1
                        else:
                            break
                    fm.description = " ".join(d for d in desc_lines if d)
                    i = j
                    continue
                fm.description = value
            elif key == "flags":
                fm.flags = _parse_yaml_list(value)
            elif key == "includes":
                fm.includes = _parse_yaml_list(value)
            elif key == "features":
                fm.features = _parse_yaml_list(value)
            elif key == "negative":
                # Either `negative: {phase: parse, type: SyntaxError}` (inline)
                # or spanning two indented lines below the key.
                if value.startswith("{"):
                    inner = value.strip("{}")
                    for part in inner.split(","):
                        if ":" in part:
                            pk, pv = part.split(":", 1)
                            pk = pk.strip()
                            pv = pv.strip().strip('"').strip("'")
                            if pk == "phase":
                                fm.negative_phase = pv
                            elif pk == "type":
                                fm.negative_type = pv
                else:
                    current_key = "negative"
            elif key == "esid":
                fm.esid = value
            elif key == "es5id":
                fm.es5id = value
            elif key == "es6id":
                fm.es6id = value
            i += 1
            continue
        # Indented continuation — the phase/type lines under `negative:`.
        if current_key == "negative":
            m3 = re.match(r"^\s+([a-zA-Z_]+)\s*:\s*(.*)$", line)
            if m3:
                pk, pv = m3.group(1), m3.group(2).strip().strip('"').strip("'")
                if pk == "phase":
                    fm.negative_phase = pv
                elif pk == "type":
                    fm.negative_type = pv
            else:
                current_key = None
        i += 1

    return fm


# ---------------------------------------------------------------------------
# Harness loading
# ---------------------------------------------------------------------------

_HARNESS_CACHE: dict[str, str] = {}


def load_harness(name: str) -> str:
    """Read a harness file, caching the result. Missing files cache as ""."""
    if name not in _HARNESS_CACHE:
        p = HARNESS_DIR / name
        if p.exists():
            _HARNESS_CACHE[name] = p.read_text(encoding="utf-8")
        else:
            _HARNESS_CACHE[name] = ""
    return _HARNESS_CACHE[name]


# ---------------------------------------------------------------------------
# Categories
# ---------------------------------------------------------------------------


def test_category(test_path: Path) -> str:
    """Derive a category like 'built-ins/Math' from the test path."""
    rel = test_path.relative_to(TEST_ROOT).as_posix()
    parts = rel.split("/")
    # Use at most 2 levels; e.g. built-ins/Math/abs/foo.js → built-ins/Math
    if len(parts) >= 2:
        return "/".join(parts[:2])
    return parts[0]


# ---------------------------------------------------------------------------
# SX escaping
# ---------------------------------------------------------------------------


def sx_escape_double(s: str) -> str:
    """Escape for a single SX string literal: turn bytes that break SX parsing into escapes."""
    return (
        s.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )


def sx_double_escape(s: str) -> str:
    """Escape a JS source string for the nested `(eval "(js-eval \"...\")")` form.

    Two levels of SX string-literal escaping. Matches conformance.sh.
    """
    inner = sx_escape_double(s)
    # The inner string gets consumed by the outer `(eval "...")`, so escape
    # backslashes and quotes a second time.
    outer = inner.replace("\\", "\\\\").replace('"', '\\"')
    return outer
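
# To make the two levels concrete (illustrative): the 3-character JS source
# a"b becomes a\"b after the first level and a\\\"b after the second. The
# outer (eval "...") strips one layer and js-eval strips the other, so the
# runtime sees the original source.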


# ---------------------------------------------------------------------------
# Test assembly
# ---------------------------------------------------------------------------

# We deliberately don't prepend helpers or patch the harness. The real
# test262 assert.js does `assert.sameValue = function(...){}`, which needs
# function-property support our runtime doesn't have yet, so many tests will
# fail — that's the point of the scoreboard. Running the real harness as-is
# keeps the numbers honest.


def assemble_source(test_src: str, includes: list[str]) -> str:
    """Assemble the full JS source for a test: harness preludes + test."""
    chunks: list[str] = []
    for h in DEFAULT_HARNESS:
        chunks.append(load_harness(h))
    for inc in includes:
        chunks.append(load_harness(inc))
    chunks.append(test_src)
    return "\n".join(chunks)


# ---------------------------------------------------------------------------
# Output parsing
# ---------------------------------------------------------------------------

# Output from sx_server looks like:
#   (ready)
#   (ok 1 2)           -- short value: (ok EPOCH VALUE)
#   (ok-len 100 42)    -- long value: the next line holds the value
#   NEXT_LINE_WITH_VALUE
#   (error 101 "msg")  -- epoch errored
#
# Each test runs in its own epoch; we look up the ok/error result per epoch
# and classify it as pass/fail.


def parse_output(output: str) -> dict[int, tuple[str, str]]:
    """Return {epoch: (kind, payload)} where kind is 'ok' | 'error'."""
    results: dict[int, tuple[str, str]] = {}
    lines = output.split("\n")
    i = 0
    while i < len(lines):
        line = lines[i]
        m_ok = re.match(r"^\(ok (\d+) (.*)\)$", line)
        m_oklen = re.match(r"^\(ok-len (\d+) \d+\)$", line)
        m_err = re.match(r"^\(error (\d+) (.*)\)$", line)
        if m_ok:
            results[int(m_ok.group(1))] = ("ok", m_ok.group(2))
        elif m_oklen:
            epoch = int(m_oklen.group(1))
            val = lines[i + 1] if i + 1 < len(lines) else ""
            results[epoch] = ("ok", val)
            i += 1  # skip the value line we just consumed
        elif m_err:
            results[int(m_err.group(1))] = ("error", m_err.group(2))
        i += 1
    return results
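
# Illustrative call: parse_output('(ready)\n(ok 100 undefined)\n(error 101 "boom")')
# returns {100: ("ok", "undefined"), 101: ("error", '"boom"')}; the error
# payload keeps its surrounding quotes, and unmatched lines like (ready) are
# ignored.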


# ---------------------------------------------------------------------------
# Classification
# ---------------------------------------------------------------------------


def classify_error(msg: str) -> str:
    """Bucket an error message into a failure mode."""
    m = msg.lower()
    if "syntaxerror" in m or "parse" in m or ("expected" in m and "got" in m):
        return "SyntaxError (parse/unsupported syntax)"
    if "referenceerror" in m or "undefined symbol" in m or "unbound" in m:
        return "ReferenceError (undefined symbol)"
    if "typeerror" in m and "not a function" in m:
        return "TypeError: not a function"
    if "typeerror" in m:
        return "TypeError (other)"
    if "rangeerror" in m:
        return "RangeError"
    if "test262error" in m:
        return "Test262Error (assertion failed)"
    if "timeout" in m:
        return "Timeout"
    if "killed" in m or "crash" in m:
        return "Crash"
    if "unhandled exception" in m:
        # Could be almost anything — extract the inner message.
        inner = re.search(r"Unhandled exception:\s*\\?\"([^\"]{0,80})", msg)
        if inner:
            return f"Unhandled: {inner.group(1)[:60]}"
        return "Unhandled exception"
    return f"Other: {msg[:80]}"


def classify_negative_result(
    fm: Frontmatter, kind: str, payload: str
) -> tuple[bool, str]:
    """For negative tests: pass iff the expected error was thrown."""
    expected_type = fm.negative_type or ""
    if kind == "error":
        # We threw; check whether it matches. Our error messages look like:
        #   Unhandled exception: "...TypeError..."
        if expected_type and expected_type.lower() in payload.lower():
            return True, f"negative: threw {expected_type} as expected"
        # A Test262Error here means an assertion fired instead of the expected
        # throw (some negative tests assert more than just the throw); that
        # counts as a failure.
        return False, f"negative: expected {expected_type}, got: {payload[:100]}"
    # ok → the test ran without throwing; that's a fail for negative tests.
    return False, f"negative: expected {expected_type}, but test completed normally"


def classify_positive_result(kind: str, payload: str) -> tuple[bool, str]:
    """For positive tests: pass iff no error was thrown."""
    if kind == "ok":
        return True, "passed"
    return False, classify_error(payload)


# ---------------------------------------------------------------------------
# Batch execution
# ---------------------------------------------------------------------------


@dataclasses.dataclass
class TestCase:
    path: Path
    rel: str
    category: str
    fm: Frontmatter
    src: str  # Test source (pre-harness); the full source is assembled at run time.


@dataclasses.dataclass
class TestResult:
    rel: str
    category: str
    status: str  # pass | fail | skip | timeout
    reason: str
    elapsed_ms: int = 0  # reserved for per-test timing; not yet populated


def build_batch_script(tests: list[TestCase], start_epoch: int) -> tuple[str, list[int]]:
    """Build one big SX script that loads the kernel once, then runs each test
    in its own epoch. Returns (script, [epoch_per_test])."""
    lines = []
    lines.append("(epoch 1)")
    lines.append('(load "lib/r7rs.sx")')
    lines.append("(epoch 2)")
    lines.append('(load "lib/js/lexer.sx")')
    lines.append("(epoch 3)")
    lines.append('(load "lib/js/parser.sx")')
    lines.append("(epoch 4)")
    lines.append('(load "lib/js/transpile.sx")')
    lines.append("(epoch 5)")
    lines.append('(load "lib/js/runtime.sx")')

    epochs: list[int] = []
    epoch = start_epoch
    for t in tests:
        full_src = assemble_source(t.src, t.fm.includes)
        escaped = sx_double_escape(full_src)
        lines.append(f"(epoch {epoch})")
        lines.append(f'(eval "(js-eval \\"{escaped}\\")")')
        epochs.append(epoch)
        epoch += 1
    return "\n".join(lines) + "\n", epochs


def run_batch(
    tests: list[TestCase], start_epoch: int, timeout_s: int
) -> tuple[dict[int, tuple[str, str]], bool, float]:
    """Run a batch; return (results, timed_out, elapsed_s)."""
    script, _epochs = build_batch_script(tests, start_epoch)
    start = time.monotonic()
    try:
        proc = subprocess.run(
            [str(SX_SERVER)],
            input=script,
            capture_output=True,
            text=True,
            timeout=timeout_s,
            cwd=str(REPO),
        )
        elapsed = time.monotonic() - start
        return parse_output(proc.stdout), False, elapsed
    except subprocess.TimeoutExpired as e:
        elapsed = time.monotonic() - start
        # Partial output may still be parseable. Depending on the Python
        # version, e.stdout can be bytes or str here.
        stdout = (
            e.stdout.decode("utf-8", errors="replace")
            if isinstance(e.stdout, bytes)
            else (e.stdout or "")
        )
        return parse_output(stdout), True, elapsed


# ---------------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------------


def discover_tests(filter_prefix: str | None) -> list[Path]:
    """Walk test262/test/**/*.js, skipping _FIXTURE files and _FIXTURE dirs."""
    tests: list[Path] = []
    for p in TEST_ROOT.rglob("*.js"):
        if p.name.endswith("_FIXTURE.js"):
            continue
        if "_FIXTURE" in p.parts:
            continue
        if filter_prefix:
            rel = p.relative_to(TEST_ROOT).as_posix()
            if not rel.startswith(filter_prefix):
                continue
        tests.append(p)
    tests.sort()
    return tests


def load_test(path: Path) -> TestCase | None:
    """Load a test file and parse its frontmatter. Returns None on read error."""
    try:
        src = path.read_text(encoding="utf-8")
    except Exception:
        return None
    fm = parse_frontmatter(src)
    return TestCase(
        path=path,
        rel=path.relative_to(TEST_ROOT).as_posix(),
        category=test_category(path),
        fm=fm,
        src=src,
    )


def should_skip(t: TestCase) -> tuple[bool, str]:
    """Skip tests we know we can't run or have explicitly excluded."""
    # Strict-mode tests — we don't support strict mode, so these are noise.
    if "onlyStrict" in t.fm.flags:
        return True, "strict-mode only (not supported)"
    # module flag — ESM tests not supported.
    if "module" in t.fm.flags:
        return True, "ESM module (not supported)"
    # async tests time out easily without a proper event loop; we let them
    # run anyway — the per-batch timeout contains the damage.
    # raw tests don't load the harness, so assert.* is unavailable; we still
    # run them — some raw tests just check syntax via parse.
    return False, ""


def aggregate(results: list[TestResult]) -> dict:
    """Build the scoreboard dict."""
    by_cat: dict[str, dict] = defaultdict(
        lambda: {"pass": 0, "fail": 0, "skip": 0, "timeout": 0, "total": 0, "failures": Counter()}
    )
    totals = {"pass": 0, "fail": 0, "skip": 0, "timeout": 0, "total": 0}
    failure_modes: Counter[str] = Counter()

    for r in results:
        cat = by_cat[r.category]
        cat[r.status] += 1
        cat["total"] += 1
        totals[r.status] += 1
        totals["total"] += 1
        if r.status == "fail":
            cat["failures"][r.reason] += 1
            failure_modes[r.reason] += 1

    # Build the per-category rows.
    categories = []
    for name, stats in sorted(by_cat.items()):
        total = stats["total"]
        passed = stats["pass"]
        runnable = total - stats["skip"]
        pass_rate = (passed / runnable * 100.0) if runnable else 0.0
        categories.append(
            {
                "category": name,
                "total": total,
                "pass": passed,
                "fail": stats["fail"],
                "skip": stats["skip"],
                "timeout": stats["timeout"],
                "pass_rate": round(pass_rate, 1),
                "top_failures": stats["failures"].most_common(5),
            }
        )

    runnable_total = totals["total"] - totals["skip"]
    pass_rate = (totals["pass"] / runnable_total * 100.0) if runnable_total else 0.0
    return {
        "totals": {**totals, "pass_rate": round(pass_rate, 1)},
        "categories": categories,
        "top_failure_modes": failure_modes.most_common(20),
    }
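
# The resulting dict has this shape (illustrative, abridged):
#   {"totals": {"pass": 0, "fail": 1, "skip": 0, "timeout": 7, "total": 8,
#               "pass_rate": 0.0},
#    "categories": [{"category": "built-ins/Math", "total": 8, ...,
#                    "top_failures": [("Timeout", 7), ...]}],
#    "top_failure_modes": [("Timeout", 7), ...]}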


def write_markdown(scoreboard: dict, path: Path, pinned_commit: str) -> None:
    t = scoreboard["totals"]
    lines = [
        "# test262 scoreboard",
        "",
        f"Pinned commit: `{pinned_commit}`",
        "",
        f"**Total:** {t['pass']}/{t['total']} passed ({t['pass_rate']}%), "
        f"{t['fail']} failed, {t['skip']} skipped, {t['timeout']} timeouts.",
        "",
        "## Top failure modes",
        "",
    ]
    for mode, count in scoreboard["top_failure_modes"]:
        lines.append(f"- **{count}x** {mode}")
    lines.extend(["", "## Categories (worst pass-rate first)", ""])
    lines.append("| Category | Pass | Fail | Skip | Timeout | Total | Pass % |")
    lines.append("|---|---:|---:|---:|---:|---:|---:|")
    # Sort worst pass rate first, breaking ties by total desc.
    cats = sorted(scoreboard["categories"], key=lambda c: (c["pass_rate"], -c["total"]))
    for c in cats:
        lines.append(
            f"| {c['category']} | {c['pass']} | {c['fail']} | {c['skip']} | "
            f"{c['timeout']} | {c['total']} | {c['pass_rate']}% |"
        )
    lines.append("")
    lines.append("## Per-category top failures")
    lines.append("")
    for c in cats:
        if not c["top_failures"]:
            continue
        lines.append(f"### {c['category']}")
        lines.append("")
        for reason, count in c["top_failures"]:
            lines.append(f"- **{count}x** {reason}")
        lines.append("")
    path.write_text("\n".join(lines), encoding="utf-8")


def main(argv: list[str]) -> int:
    ap = argparse.ArgumentParser()
    ap.add_argument("--limit", type=int, default=0, help="max tests to run (0 = all)")
    ap.add_argument("--filter", type=str, default=None, help="path prefix filter")
    ap.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    ap.add_argument(
        "--output-json",
        type=str,
        default=str(REPO / "lib" / "js" / "test262-scoreboard.json"),
    )
    ap.add_argument(
        "--output-md",
        type=str,
        default=str(REPO / "lib" / "js" / "test262-scoreboard.md"),
    )
    ap.add_argument("--progress", action="store_true", help="print per-batch progress")
    args = ap.parse_args(argv)

    if not SX_SERVER.exists():
        print(f"ERROR: sx_server.exe not found at {SX_SERVER}", file=sys.stderr)
        print("Build with: cd hosts/ocaml && dune build", file=sys.stderr)
        return 1
    if not UPSTREAM.exists():
        print(f"ERROR: test262-upstream not found at {UPSTREAM}", file=sys.stderr)
        print(
            "Clone with: cd lib/js && git clone --depth 1 "
            "https://github.com/tc39/test262.git test262-upstream",
            file=sys.stderr,
        )
        return 1

    pinned_commit = ""
    try:
        pinned_commit = subprocess.check_output(
            ["git", "-C", str(UPSTREAM), "rev-parse", "HEAD"], text=True
        ).strip()
    except Exception:
        pass

    all_paths = discover_tests(args.filter)
    if args.limit:
        all_paths = all_paths[: args.limit]
    print(f"Discovered {len(all_paths)} test files.", file=sys.stderr)

    # Load everything up front: parse frontmatter and decide skips before running.
    tests: list[TestCase] = []
    skipped: list[TestResult] = []
    for p in all_paths:
        t = load_test(p)
        if not t:
            continue
        skip, why = should_skip(t)
        if skip:
            skipped.append(
                TestResult(rel=t.rel, category=t.category, status="skip", reason=why)
            )
            continue
        tests.append(t)

    print(
        f"Will run {len(tests)} tests ({len(skipped)} skipped up front).",
        file=sys.stderr,
    )

    results: list[TestResult] = list(skipped)
    batch_size = args.batch_size
    epoch_start = 100
    n_batches = (len(tests) + batch_size - 1) // batch_size
    t_run_start = time.monotonic()

    for bi in range(n_batches):
        batch = tests[bi * batch_size : (bi + 1) * batch_size]
        timeout_s = min(BATCH_TIMEOUT_S, max(30, len(batch) * PER_TEST_S))
        epoch_map, timed_out, elapsed = run_batch(batch, epoch_start, timeout_s)
        for idx, t in enumerate(batch):
            epoch = epoch_start + idx
            res = epoch_map.get(epoch)
            if res is None:
                # No result for this epoch — the batch probably timed out
                # before reaching it, or sx_server died.
                status = "timeout" if timed_out else "fail"
                reason = "batch timeout before epoch" if timed_out else "no result from sx_server"
                results.append(
                    TestResult(
                        rel=t.rel, category=t.category, status=status, reason=reason
                    )
                )
                continue
            kind, payload = res
            if t.fm.negative_phase:
                ok, why = classify_negative_result(t.fm, kind, payload)
            else:
                ok, why = classify_positive_result(kind, payload)
            results.append(
                TestResult(
                    rel=t.rel,
                    category=t.category,
                    status="pass" if ok else "fail",
                    reason=why,
                )
            )
        epoch_start += batch_size

        if args.progress or bi % 10 == 0:
            done_n = min((bi + 1) * batch_size, len(tests))
            pass_so_far = sum(1 for r in results if r.status == "pass")
            print(
                f"  [batch {bi + 1}/{n_batches}] {done_n}/{len(tests)} tests "
                f"{elapsed:.1f}s{' TIMEOUT' if timed_out else ''} "
                f"running-pass={pass_so_far}",
                file=sys.stderr,
            )

    t_run_elapsed = time.monotonic() - t_run_start
    print(f"\nFinished run in {t_run_elapsed:.1f}s", file=sys.stderr)

    scoreboard = aggregate(results)
    scoreboard["pinned_commit"] = pinned_commit
    scoreboard["elapsed_seconds"] = round(t_run_elapsed, 1)

    # Per-test detail would be too large — omit it from the JSON by default;
    # the aggregated scoreboard is what's useful.
    out_json = Path(args.output_json)
    out_json.parent.mkdir(parents=True, exist_ok=True)
    out_json.write_text(json.dumps(scoreboard, indent=2), encoding="utf-8")

    out_md = Path(args.output_md)
    write_markdown(scoreboard, out_md, pinned_commit)

    t = scoreboard["totals"]
    print(
        f"\nScoreboard: {t['pass']}/{t['total']} passed ({t['pass_rate']}%) "
        f"fail={t['fail']} skip={t['skip']} timeout={t['timeout']}",
        file=sys.stderr,
    )
    print(f"JSON: {out_json}", file=sys.stderr)
    print(f"MD:   {out_md}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))