js-on-sx: parallel test262 runner with raw-fd line buffer
Rework test262-runner.py to support --workers N parallel shards, each running a long-lived sx_server session. Replace thread-per-readline with a select-based raw-fd line buffer. On 2-core machines, 1 worker still beats 2 (OCaml eval is CPU-bound and starves when shared). Auto-defaults n_workers=1 on <=2 CPU, nproc-1 (up to 8) otherwise. Throughput baseline: ~1.1 Math tests/s serial on 2-core (unchanged; the evaluator dominates). The runner framework is now ready to scale on bigger machines without further code changes. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3,14 +3,16 @@
|
|||||||
test262-runner — run the official TC39 test262 suite against our JS-on-SX runtime.
|
test262-runner — run the official TC39 test262 suite against our JS-on-SX runtime.
|
||||||
|
|
||||||
Walks lib/js/test262-upstream/test/**/*.js, parses YAML-ish frontmatter, runs
|
Walks lib/js/test262-upstream/test/**/*.js, parses YAML-ish frontmatter, runs
|
||||||
tests via a long-lived sx_server.exe subprocess (one harness load, one `js-eval`
|
tests via a pool of long-lived sx_server.exe subprocesses (each worker loads
|
||||||
call per test), and emits JSON + Markdown scoreboards.
|
the harness once, then runs `js-eval` per test on a persistent stdin channel),
|
||||||
|
and emits JSON + Markdown scoreboards.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python3 lib/js/test262-runner.py # full run (skips strict/module/etc)
|
python3 lib/js/test262-runner.py # full run (skips strict/module/etc)
|
||||||
python3 lib/js/test262-runner.py --limit 2000
|
python3 lib/js/test262-runner.py --limit 2000
|
||||||
python3 lib/js/test262-runner.py --filter built-ins/Math
|
python3 lib/js/test262-runner.py --filter built-ins/Math
|
||||||
python3 lib/js/test262-runner.py --per-test-timeout 3
|
python3 lib/js/test262-runner.py --per-test-timeout 3
|
||||||
|
python3 lib/js/test262-runner.py --workers 4 # parallel workers (default: auto — 1 on <=2 CPUs, else nproc-1 capped at 8)
|
||||||
|
|
||||||
Outputs:
|
Outputs:
|
||||||
lib/js/test262-scoreboard.json
|
lib/js/test262-scoreboard.json
|
||||||
@@ -34,11 +36,12 @@ from __future__ import annotations
|
|||||||
import argparse
|
import argparse
|
||||||
import dataclasses
|
import dataclasses
|
||||||
import json
|
import json
|
||||||
|
import multiprocessing as mp
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import select
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import threading
|
|
||||||
import time
|
import time
|
||||||
from collections import Counter, defaultdict
|
from collections import Counter, defaultdict
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -152,15 +155,15 @@ FRONTMATTER_RE = re.compile(r"/\*---(.*?)---\*/", re.DOTALL)
|
|||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
class Frontmatter:
|
class Frontmatter:
|
||||||
description: str = ""
|
description: str = ""
|
||||||
flags: list[str] = dataclasses.field(default_factory=list)
|
flags: list = dataclasses.field(default_factory=list)
|
||||||
includes: list[str] = dataclasses.field(default_factory=list)
|
includes: list = dataclasses.field(default_factory=list)
|
||||||
features: list[str] = dataclasses.field(default_factory=list)
|
features: list = dataclasses.field(default_factory=list)
|
||||||
negative_phase: str | None = None
|
negative_phase: "str | None" = None
|
||||||
negative_type: str | None = None
|
negative_type: "str | None" = None
|
||||||
esid: str | None = None
|
esid: "str | None" = None
|
||||||
|
|
||||||
|
|
||||||
def _parse_yaml_list(s: str) -> list[str]:
|
def _parse_yaml_list(s: str) -> list:
|
||||||
s = s.strip()
|
s = s.strip()
|
||||||
if s.startswith("[") and s.endswith("]"):
|
if s.startswith("[") and s.endswith("]"):
|
||||||
s = s[1:-1]
|
s = s[1:-1]
|
||||||
@@ -187,7 +190,7 @@ def parse_frontmatter(src: str) -> Frontmatter:
|
|||||||
key, value = m2.group(1), m2.group(2).strip()
|
key, value = m2.group(1), m2.group(2).strip()
|
||||||
if key == "description":
|
if key == "description":
|
||||||
if value in (">", "|"):
|
if value in (">", "|"):
|
||||||
desc_lines: list[str] = []
|
desc_lines = []
|
||||||
j = i + 1
|
j = i + 1
|
||||||
while j < len(lines):
|
while j < len(lines):
|
||||||
nxt = lines[j]
|
nxt = lines[j]
|
||||||
@@ -328,7 +331,7 @@ def classify_error(msg: str) -> str:
|
|||||||
return f"Other: {msg[:80]}"
|
return f"Other: {msg[:80]}"
|
||||||
|
|
||||||
|
|
||||||
def classify_negative_result(fm: Frontmatter, kind: str, payload: str) -> tuple[bool, str]:
|
def classify_negative_result(fm: Frontmatter, kind: str, payload: str):
|
||||||
expected_type = fm.negative_type or ""
|
expected_type = fm.negative_type or ""
|
||||||
if kind == "error":
|
if kind == "error":
|
||||||
if expected_type and expected_type.lower() in payload.lower():
|
if expected_type and expected_type.lower() in payload.lower():
|
||||||
@@ -337,7 +340,7 @@ def classify_negative_result(fm: Frontmatter, kind: str, payload: str) -> tuple[
|
|||||||
return False, f"negative: expected {expected_type}, but test completed normally"
|
return False, f"negative: expected {expected_type}, but test completed normally"
|
||||||
|
|
||||||
|
|
||||||
def classify_positive_result(kind: str, payload: str) -> tuple[bool, str]:
|
def classify_positive_result(kind: str, payload: str):
|
||||||
if kind == "ok":
|
if kind == "ok":
|
||||||
return True, "passed"
|
return True, "passed"
|
||||||
return False, classify_error(payload)
|
return False, classify_error(payload)
|
||||||
@@ -435,7 +438,7 @@ UNSUPPORTED_FEATURES = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def should_skip(t: "TestCase") -> tuple[bool, str]:
|
def should_skip(t):
|
||||||
if "onlyStrict" in t.fm.flags:
|
if "onlyStrict" in t.fm.flags:
|
||||||
return True, "strict-mode only"
|
return True, "strict-mode only"
|
||||||
if "module" in t.fm.flags:
|
if "module" in t.fm.flags:
|
||||||
@@ -527,8 +530,8 @@ class TestResult:
|
|||||||
elapsed_ms: int = 0
|
elapsed_ms: int = 0
|
||||||
|
|
||||||
|
|
||||||
def discover_tests(filter_prefixes: list[str] | None) -> list[Path]:
|
def discover_tests(filter_prefixes):
|
||||||
tests: list[Path] = []
|
tests = []
|
||||||
for p in TEST_ROOT.rglob("*.js"):
|
for p in TEST_ROOT.rglob("*.js"):
|
||||||
if p.name.endswith("_FIXTURE.js"):
|
if p.name.endswith("_FIXTURE.js"):
|
||||||
continue
|
continue
|
||||||
@@ -543,7 +546,7 @@ def discover_tests(filter_prefixes: list[str] | None) -> list[Path]:
|
|||||||
return tests
|
return tests
|
||||||
|
|
||||||
|
|
||||||
def load_test(path: Path) -> TestCase | None:
|
def load_test(path: Path):
|
||||||
try:
|
try:
|
||||||
src = path.read_text(encoding="utf-8")
|
src = path.read_text(encoding="utf-8")
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -566,12 +569,15 @@ def load_test(path: Path) -> TestCase | None:
|
|||||||
class ServerSession:
|
class ServerSession:
|
||||||
"""Wrap a long-lived sx_server.exe subprocess; feed it one-liner commands,
|
"""Wrap a long-lived sx_server.exe subprocess; feed it one-liner commands,
|
||||||
collect results per-epoch. Restart on hang/crash.
|
collect results per-epoch. Restart on hang/crash.
|
||||||
|
|
||||||
|
Uses a raw-fd line buffer + select() to avoid spawning a thread per read.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, per_test_timeout: float):
|
def __init__(self, per_test_timeout: float):
|
||||||
self.per_test_timeout = per_test_timeout
|
self.per_test_timeout = per_test_timeout
|
||||||
self.proc: subprocess.Popen | None = None
|
self.proc = None
|
||||||
self.lock = threading.Lock()
|
self._buf = b""
|
||||||
|
self._fd = -1
|
||||||
|
|
||||||
def start(self) -> None:
|
def start(self) -> None:
|
||||||
self.proc = subprocess.Popen(
|
self.proc = subprocess.Popen(
|
||||||
@@ -580,22 +586,24 @@ class ServerSession:
|
|||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.DEVNULL,
|
stderr=subprocess.DEVNULL,
|
||||||
cwd=str(REPO),
|
cwd=str(REPO),
|
||||||
text=True,
|
bufsize=0, # binary, unbuffered — we do our own line parsing
|
||||||
bufsize=1,
|
|
||||||
)
|
)
|
||||||
self._wait_for("(ready)", timeout=10.0)
|
self._fd = self.proc.stdout.fileno()
|
||||||
|
self._buf = b""
|
||||||
|
os.set_blocking(self._fd, False)
|
||||||
|
self._wait_for("(ready)", timeout=15.0)
|
||||||
# Load kernel libraries
|
# Load kernel libraries
|
||||||
self._run_and_collect(1, '(load "lib/r7rs.sx")', timeout=30.0)
|
self._run_and_collect(1, '(load "lib/r7rs.sx")', timeout=60.0)
|
||||||
self._run_and_collect(2, '(load "lib/js/lexer.sx")', timeout=30.0)
|
self._run_and_collect(2, '(load "lib/js/lexer.sx")', timeout=60.0)
|
||||||
self._run_and_collect(3, '(load "lib/js/parser.sx")', timeout=30.0)
|
self._run_and_collect(3, '(load "lib/js/parser.sx")', timeout=60.0)
|
||||||
self._run_and_collect(4, '(load "lib/js/transpile.sx")', timeout=30.0)
|
self._run_and_collect(4, '(load "lib/js/transpile.sx")', timeout=60.0)
|
||||||
self._run_and_collect(5, '(load "lib/js/runtime.sx")', timeout=30.0)
|
self._run_and_collect(5, '(load "lib/js/runtime.sx")', timeout=60.0)
|
||||||
# Preload the stub harness as one big js-eval
|
# Preload the stub harness as one big js-eval
|
||||||
stub_escaped = sx_escape_for_nested_eval(HARNESS_STUB)
|
stub_escaped = sx_escape_for_nested_eval(HARNESS_STUB)
|
||||||
self._run_and_collect(
|
self._run_and_collect(
|
||||||
6,
|
6,
|
||||||
f'(eval "(js-eval \\"{stub_escaped}\\")")',
|
f'(eval "(js-eval \\"{stub_escaped}\\")")',
|
||||||
timeout=30.0,
|
timeout=60.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
def stop(self) -> None:
|
def stop(self) -> None:
|
||||||
@@ -614,31 +622,77 @@ class ServerSession:
|
|||||||
pass
|
pass
|
||||||
self.proc = None
|
self.proc = None
|
||||||
|
|
||||||
|
def _readline_raw(self, timeout: float):
    """Pull the next newline-terminated line out of the subprocess's stdout.

    Bytes already sitting in `self._buf` are consumed first; the fd is only
    polled (via select) when no complete line is buffered yet.

    Returns the line as bytes including its trailing b"\\n". If the stream
    ends mid-line the unterminated remainder is returned once; after that,
    or when select()/read() fails with an OS-level error, the result is None.
    Raises TimeoutError when no full line shows up within `timeout` seconds.
    """
    give_up_at = time.monotonic() + timeout
    while True:
        head, sep, tail = self._buf.partition(b"\n")
        if sep:
            self._buf = tail
            return head + sep

        time_left = give_up_at - time.monotonic()
        if time_left <= 0:
            raise TimeoutError("readline timeout")

        try:
            ready = select.select([self._fd], [], [], time_left)[0]
        except (OSError, ValueError):
            # select() rejected the fd — reported to the caller as end of stream
            return None
        if not ready:
            raise TimeoutError("readline timeout")

        try:
            data = os.read(self._fd, 65536)
        except (BlockingIOError, InterruptedError):
            # nothing readable after all; go around and re-check the deadline
            continue
        except OSError:
            return None

        if data:
            self._buf += data
            continue

        # Zero-length read means EOF: flush whatever partial line is left.
        leftover, self._buf = self._buf, b""
        return leftover or None
|
||||||
|
|
||||||
|
def _readline(self, timeout: float):
    """Text-mode wrapper around `_readline_raw`.

    Returns the next line decoded as UTF-8 (undecodable bytes are replaced
    with U+FFFD), or None when `_readline_raw` returns None. A TimeoutError
    raised by the raw read propagates unchanged.
    """
    raw = self._readline_raw(timeout)
    if raw is None:
        return None
    # errors="replace" cannot raise on bytes input, so no fallback branch is
    # needed; an unexpected failure should surface instead of becoming "".
    return raw.decode("utf-8", errors="replace")
|
||||||
|
|
||||||
def _wait_for(self, token: str, timeout: float) -> None:
|
def _wait_for(self, token: str, timeout: float) -> None:
|
||||||
assert self.proc and self.proc.stdout
|
|
||||||
start = time.monotonic()
|
start = time.monotonic()
|
||||||
while time.monotonic() - start < timeout:
|
while time.monotonic() - start < timeout:
|
||||||
line = self.proc.stdout.readline()
|
line = self._readline(timeout - (time.monotonic() - start))
|
||||||
if not line:
|
if line is None:
|
||||||
raise RuntimeError("sx_server closed stdout before ready")
|
raise RuntimeError("sx_server closed stdout before ready")
|
||||||
if token in line:
|
if token in line:
|
||||||
return
|
return
|
||||||
raise TimeoutError(f"timeout waiting for {token}")
|
raise TimeoutError(f"timeout waiting for {token}")
|
||||||
|
|
||||||
def _run_and_collect(self, epoch: int, cmd: str, timeout: float) -> tuple[str, str]:
|
def _run_and_collect(self, epoch: int, cmd: str, timeout: float):
|
||||||
"""Write `(epoch N)\n<cmd>\n` and read until we see ok/ok-len/error for that epoch.
|
"""Write `(epoch N)\\n<cmd>\\n` and read until we see ok/ok-len/error for that epoch.
|
||||||
Returns (kind, payload). Raises TimeoutError if the server hangs.
|
Returns (kind, payload). Raises TimeoutError if the server hangs.
|
||||||
"""
|
"""
|
||||||
assert self.proc and self.proc.stdin and self.proc.stdout
|
payload = f"(epoch {epoch})\n{cmd}\n".encode("utf-8")
|
||||||
self.proc.stdin.write(f"(epoch {epoch})\n{cmd}\n")
|
try:
|
||||||
self.proc.stdin.flush()
|
self.proc.stdin.write(payload)
|
||||||
|
self.proc.stdin.flush()
|
||||||
|
except (BrokenPipeError, OSError):
|
||||||
|
raise RuntimeError("sx_server stdin closed")
|
||||||
deadline = time.monotonic() + timeout
|
deadline = time.monotonic() + timeout
|
||||||
while time.monotonic() < deadline:
|
while time.monotonic() < deadline:
|
||||||
remaining = deadline - time.monotonic()
|
remaining = deadline - time.monotonic()
|
||||||
if remaining <= 0:
|
if remaining <= 0:
|
||||||
raise TimeoutError(f"epoch {epoch} exceeded timeout {timeout}")
|
raise TimeoutError(f"epoch {epoch} exceeded timeout {timeout}")
|
||||||
line = self._readline_with_timeout(remaining)
|
line = self._readline(remaining)
|
||||||
if not line:
|
if line is None:
|
||||||
raise RuntimeError("sx_server closed stdout mid-epoch")
|
raise RuntimeError("sx_server closed stdout mid-epoch")
|
||||||
m = RX_OK_INLINE.match(line)
|
m = RX_OK_INLINE.match(line)
|
||||||
if m:
|
if m:
|
||||||
@@ -649,7 +703,10 @@ class ServerSession:
|
|||||||
m = RX_OK_LEN.match(line)
|
m = RX_OK_LEN.match(line)
|
||||||
if m:
|
if m:
|
||||||
e = int(m.group(1))
|
e = int(m.group(1))
|
||||||
val = self._readline_with_timeout(remaining)
|
remaining2 = deadline - time.monotonic()
|
||||||
|
if remaining2 <= 0:
|
||||||
|
raise TimeoutError(f"epoch {epoch} exceeded timeout {timeout}")
|
||||||
|
val = self._readline(remaining2)
|
||||||
if val is None:
|
if val is None:
|
||||||
val = ""
|
val = ""
|
||||||
val = val.rstrip("\n")
|
val = val.rstrip("\n")
|
||||||
@@ -665,58 +722,107 @@ class ServerSession:
|
|||||||
# Other output — (ready), comment, noise — ignore
|
# Other output — (ready), comment, noise — ignore
|
||||||
raise TimeoutError(f"epoch {epoch} exceeded timeout {timeout}")
|
raise TimeoutError(f"epoch {epoch} exceeded timeout {timeout}")
|
||||||
|
|
||||||
def _readline_with_timeout(self, timeout: float) -> str | None:
|
def run_test(self, epoch: int, js_source: str):
|
||||||
"""Read one line with a timeout. On Linux we use a thread-wrapped read
|
|
||||||
since there's no portable non-blocking readline on a subprocess pipe.
|
|
||||||
"""
|
|
||||||
assert self.proc and self.proc.stdout
|
|
||||||
result: list[str | None] = [None]
|
|
||||||
done = threading.Event()
|
|
||||||
|
|
||||||
def reader() -> None:
|
|
||||||
try:
|
|
||||||
result[0] = self.proc.stdout.readline() # type: ignore[union-attr]
|
|
||||||
except Exception:
|
|
||||||
result[0] = None
|
|
||||||
finally:
|
|
||||||
done.set()
|
|
||||||
|
|
||||||
th = threading.Thread(target=reader, daemon=True)
|
|
||||||
th.start()
|
|
||||||
done.wait(timeout=timeout)
|
|
||||||
if not done.is_set():
|
|
||||||
# Hang — kill the process; caller will restart
|
|
||||||
try:
|
|
||||||
self.proc.kill()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
raise TimeoutError("readline timeout")
|
|
||||||
return result[0]
|
|
||||||
|
|
||||||
def run_test(self, epoch: int, js_source: str) -> tuple[str, str]:
|
|
||||||
escaped = sx_escape_for_nested_eval(js_source)
|
escaped = sx_escape_for_nested_eval(js_source)
|
||||||
cmd = f'(eval "(js-eval \\"{escaped}\\")")'
|
cmd = f'(eval "(js-eval \\"{escaped}\\")")'
|
||||||
return self._run_and_collect(epoch, cmd, timeout=self.per_test_timeout)
|
return self._run_and_collect(epoch, cmd, timeout=self.per_test_timeout)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Parallel workers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _worker_run(args):
    """Execute one shard of tests inside the current process.

    `args` is the 4-tuple `(shard_tests, per_test_timeout, restart_every,
    worker_id)`; every shard entry is `(rel, category, src, negative_phase,
    negative_type)`. One ServerSession is created lazily and reused across
    tests. It is torn down after a timeout or any other exception, and after
    every `restart_every` tests when that value is positive.

    Returns a list of `(rel, category, status, reason)` tuples, one per test,
    in shard order.
    """
    shard_tests, per_test_timeout, restart_every, worker_id = args

    live = None  # current ServerSession; None means "start one on demand"
    outcomes = []

    def drop_session():
        # Best-effort shutdown; the next test will lazily spawn a new server.
        nonlocal live
        if live is None:
            return
        try:
            live.stop()
        except Exception:
            pass
        live = None

    def verdict(kind, payload, negative_phase, negative_type):
        # Map the server's (kind, payload) reply onto (status, reason).
        if not negative_phase:
            if kind == "ok":
                return "pass", "passed"
            return "fail", classify_error(payload)
        expected_type = negative_type or ""
        if kind != "error":
            return "fail", f"negative: expected {expected_type}, but test completed normally"
        if expected_type and expected_type.lower() in payload.lower():
            return "pass", f"negative: threw {expected_type} as expected"
        return "fail", f"negative: expected {expected_type}, got: {payload[:100]}"

    try:
        # Epoch numbering starts at a per-worker base derived from worker_id.
        epoch = 100 + worker_id * 10000
        for done_n, entry in enumerate(shard_tests, 1):
            rel, category, src, negative_phase, negative_type = entry
            epoch += 1
            try:
                if live is None:
                    live = ServerSession(per_test_timeout=per_test_timeout)
                    live.start()
                kind, payload = live.run_test(epoch, src)
                status, reason = verdict(kind, payload, negative_phase, negative_type)
                outcomes.append((rel, category, status, reason))
            except TimeoutError:
                outcomes.append((rel, category, "timeout", "per-test timeout"))
                drop_session()
            except Exception as e:
                outcomes.append((rel, category, "fail", f"runner-error: {e}"))
                drop_session()

            # Periodic restart to keep server healthy (memory bounded)
            if restart_every > 0 and done_n % restart_every == 0:
                drop_session()
    finally:
        drop_session()

    return outcomes
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Run driver
|
# Run driver
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def assemble_source(t: TestCase) -> str:
|
def assemble_source(t):
|
||||||
"""Return JS source to feed to js-eval. Harness is preloaded, so we only
|
"""Return JS source to feed to js-eval. Harness is preloaded, so we only
|
||||||
append the test source (plus negative-test prep if needed).
|
append the test source (plus negative-test prep if needed).
|
||||||
"""
|
"""
|
||||||
return t.src
|
return t.src
|
||||||
|
|
||||||
|
|
||||||
def aggregate(results: list[TestResult]) -> dict:
|
def aggregate(results):
|
||||||
by_cat: dict[str, dict] = defaultdict(
|
by_cat = defaultdict(
|
||||||
lambda: {"pass": 0, "fail": 0, "skip": 0, "timeout": 0, "total": 0, "failures": Counter()}
|
lambda: {"pass": 0, "fail": 0, "skip": 0, "timeout": 0, "total": 0, "failures": Counter()}
|
||||||
)
|
)
|
||||||
totals = {"pass": 0, "fail": 0, "skip": 0, "timeout": 0, "total": 0}
|
totals = {"pass": 0, "fail": 0, "skip": 0, "timeout": 0, "total": 0}
|
||||||
failure_modes: Counter[str] = Counter()
|
failure_modes = Counter()
|
||||||
for r in results:
|
for r in results:
|
||||||
cat = by_cat[r.category]
|
cat = by_cat[r.category]
|
||||||
cat[r.status] += 1
|
cat[r.status] += 1
|
||||||
@@ -756,7 +862,7 @@ def aggregate(results: list[TestResult]) -> dict:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def write_markdown(scoreboard: dict, path: Path, pinned_commit: str, elapsed_s: float) -> None:
|
def write_markdown(scoreboard, path: Path, pinned_commit: str, elapsed_s: float) -> None:
|
||||||
t = scoreboard["totals"]
|
t = scoreboard["totals"]
|
||||||
lines = [
|
lines = [
|
||||||
"# test262 scoreboard",
|
"# test262 scoreboard",
|
||||||
@@ -796,16 +902,18 @@ def write_markdown(scoreboard: dict, path: Path, pinned_commit: str, elapsed_s:
|
|||||||
path.write_text("\n".join(lines), encoding="utf-8")
|
path.write_text("\n".join(lines), encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
def main(argv: list[str]) -> int:
|
def main(argv):
|
||||||
ap = argparse.ArgumentParser()
|
ap = argparse.ArgumentParser()
|
||||||
ap.add_argument("--limit", type=int, default=0, help="max tests to run (0 = all)")
|
ap.add_argument("--limit", type=int, default=0, help="max tests to run (0 = all)")
|
||||||
ap.add_argument("--filter", type=str, action="append", default=None,
|
ap.add_argument("--filter", type=str, action="append", default=None,
|
||||||
help="path prefix filter (repeatable; OR'd together)")
|
help="path prefix filter (repeatable; OR'd together)")
|
||||||
ap.add_argument("--per-test-timeout", type=float, default=DEFAULT_PER_TEST_TIMEOUT_S)
|
ap.add_argument("--per-test-timeout", type=float, default=DEFAULT_PER_TEST_TIMEOUT_S)
|
||||||
ap.add_argument("--restart-every", type=int, default=500,
|
ap.add_argument("--restart-every", type=int, default=500,
|
||||||
help="restart server every N tests to keep memory bounded")
|
help="restart worker server every N tests (keeps memory bounded)")
|
||||||
ap.add_argument("--max-per-category", type=int, default=0,
|
ap.add_argument("--max-per-category", type=int, default=0,
|
||||||
help="cap runnable tests per category (0 = no cap)")
|
help="cap runnable tests per category (0 = no cap)")
|
||||||
|
ap.add_argument("--workers", type=int, default=0,
|
||||||
|
help="number of parallel workers (0 = auto; min(nproc, 4))")
|
||||||
ap.add_argument("--output-json", type=str,
|
ap.add_argument("--output-json", type=str,
|
||||||
default=str(REPO / "lib" / "js" / "test262-scoreboard.json"))
|
default=str(REPO / "lib" / "js" / "test262-scoreboard.json"))
|
||||||
ap.add_argument("--output-md", type=str,
|
ap.add_argument("--output-md", type=str,
|
||||||
@@ -833,9 +941,9 @@ def main(argv: list[str]) -> int:
|
|||||||
all_paths = all_paths[: args.limit]
|
all_paths = all_paths[: args.limit]
|
||||||
print(f"Discovered {len(all_paths)} test files.", file=sys.stderr)
|
print(f"Discovered {len(all_paths)} test files.", file=sys.stderr)
|
||||||
|
|
||||||
tests: list[TestCase] = []
|
tests = []
|
||||||
results: list[TestResult] = []
|
results = []
|
||||||
per_cat_count: dict[str, int] = defaultdict(int)
|
per_cat_count = defaultdict(int)
|
||||||
for p in all_paths:
|
for p in all_paths:
|
||||||
t = load_test(p)
|
t = load_test(p)
|
||||||
if not t:
|
if not t:
|
||||||
@@ -853,74 +961,62 @@ def main(argv: list[str]) -> int:
|
|||||||
|
|
||||||
print(f"Will run {len(tests)} tests ({len(results)} skipped up front).", file=sys.stderr)
|
print(f"Will run {len(tests)} tests ({len(results)} skipped up front).", file=sys.stderr)
|
||||||
|
|
||||||
|
# Worker count
|
||||||
|
# Auto-default: on <=2-core machines, 1 worker beats 2 because OCaml eval is
|
||||||
|
# CPU-bound and two processes starve each other. On 4+ cores, use nproc-1
|
||||||
|
# (leave one core for OS/Python). Cap at 8 to avoid resource thrash.
|
||||||
|
n_workers = args.workers
|
||||||
|
if n_workers <= 0:
|
||||||
|
try:
|
||||||
|
cpu = os.cpu_count() or 2
|
||||||
|
except Exception:
|
||||||
|
cpu = 2
|
||||||
|
if cpu <= 2:
|
||||||
|
n_workers = 1
|
||||||
|
else:
|
||||||
|
n_workers = max(1, min(cpu - 1, 8))
|
||||||
|
n_workers = max(1, min(n_workers, len(tests))) if tests else 1
|
||||||
|
print(f"Using {n_workers} parallel worker(s).", file=sys.stderr)
|
||||||
|
|
||||||
|
# Shard tests across workers (round-robin so categories spread evenly)
|
||||||
|
shards = [[] for _ in range(n_workers)]
|
||||||
|
for i, t in enumerate(tests):
|
||||||
|
shards[i % n_workers].append(
|
||||||
|
(t.rel, t.category, t.src, t.fm.negative_phase, t.fm.negative_type)
|
||||||
|
)
|
||||||
|
|
||||||
t_run_start = time.monotonic()
|
t_run_start = time.monotonic()
|
||||||
|
|
||||||
session: ServerSession | None = None
|
if n_workers == 1:
|
||||||
|
# Serial path — avoids multiprocessing overhead
|
||||||
def ensure_session() -> ServerSession:
|
worker_results = [_worker_run((shards[0], args.per_test_timeout, args.restart_every, 0))]
|
||||||
nonlocal session
|
else:
|
||||||
if session is None:
|
with mp.Pool(n_workers) as pool:
|
||||||
session = ServerSession(per_test_timeout=args.per_test_timeout)
|
worker_args = [
|
||||||
session.start()
|
(shards[i], args.per_test_timeout, args.restart_every, i)
|
||||||
return session
|
for i in range(n_workers)
|
||||||
|
]
|
||||||
def restart_session() -> None:
|
# imap_unordered so progress prints show up sooner
|
||||||
nonlocal session
|
collected = []
|
||||||
if session is not None:
|
total_tests = len(tests)
|
||||||
session.stop()
|
last_print = time.monotonic()
|
||||||
session = None
|
for shard_out in pool.imap_unordered(_worker_run, worker_args):
|
||||||
|
collected.append(shard_out)
|
||||||
epoch = 100
|
now = time.monotonic()
|
||||||
done_n = 0
|
if now - last_print >= 5.0:
|
||||||
try:
|
done_so_far = sum(len(s) for s in collected)
|
||||||
for t in tests:
|
el = now - t_run_start
|
||||||
epoch += 1
|
print(
|
||||||
done_n += 1
|
f" worker returned: {done_so_far}/{total_tests} tests "
|
||||||
source = assemble_source(t)
|
f"elapsed={el:.1f}s rate={done_so_far/max(el,0.001):.1f}/s",
|
||||||
try:
|
file=sys.stderr,
|
||||||
sess = ensure_session()
|
|
||||||
kind, payload = sess.run_test(epoch, source)
|
|
||||||
if t.fm.negative_phase:
|
|
||||||
ok, why = classify_negative_result(t.fm, kind, payload)
|
|
||||||
else:
|
|
||||||
ok, why = classify_positive_result(kind, payload)
|
|
||||||
results.append(
|
|
||||||
TestResult(
|
|
||||||
rel=t.rel,
|
|
||||||
category=t.category,
|
|
||||||
status="pass" if ok else "fail",
|
|
||||||
reason=why,
|
|
||||||
)
|
)
|
||||||
)
|
last_print = now
|
||||||
except TimeoutError:
|
worker_results = collected
|
||||||
results.append(
|
|
||||||
TestResult(rel=t.rel, category=t.category, status="timeout", reason="per-test timeout")
|
|
||||||
)
|
|
||||||
restart_session()
|
|
||||||
except Exception as e:
|
|
||||||
results.append(
|
|
||||||
TestResult(rel=t.rel, category=t.category, status="fail", reason=f"runner-error: {e}")
|
|
||||||
)
|
|
||||||
restart_session()
|
|
||||||
|
|
||||||
# Periodic restart to keep server healthy
|
for shard_out in worker_results:
|
||||||
if args.restart_every > 0 and done_n % args.restart_every == 0:
|
for rel, category, status, reason in shard_out:
|
||||||
restart_session()
|
results.append(TestResult(rel=rel, category=category, status=status, reason=reason))
|
||||||
|
|
||||||
if done_n % args.progress_every == 0:
|
|
||||||
pass_so_far = sum(1 for r in results if r.status == "pass")
|
|
||||||
fail_so_far = sum(1 for r in results if r.status == "fail")
|
|
||||||
to_so_far = sum(1 for r in results if r.status == "timeout")
|
|
||||||
el = time.monotonic() - t_run_start
|
|
||||||
print(
|
|
||||||
f" [{done_n}/{len(tests)}] pass={pass_so_far} fail={fail_so_far} "
|
|
||||||
f"timeout={to_so_far} elapsed={el:.1f}s "
|
|
||||||
f"rate={done_n/max(el,0.001):.1f}/s",
|
|
||||||
file=sys.stderr,
|
|
||||||
)
|
|
||||||
finally:
|
|
||||||
if session is not None:
|
|
||||||
session.stop()
|
|
||||||
|
|
||||||
t_run_elapsed = time.monotonic() - t_run_start
|
t_run_elapsed = time.monotonic() - t_run_start
|
||||||
print(f"\nFinished run in {t_run_elapsed:.1f}s", file=sys.stderr)
|
print(f"\nFinished run in {t_run_elapsed:.1f}s", file=sys.stderr)
|
||||||
@@ -928,6 +1024,7 @@ def main(argv: list[str]) -> int:
|
|||||||
scoreboard = aggregate(results)
|
scoreboard = aggregate(results)
|
||||||
scoreboard["pinned_commit"] = pinned_commit
|
scoreboard["pinned_commit"] = pinned_commit
|
||||||
scoreboard["elapsed_seconds"] = round(t_run_elapsed, 1)
|
scoreboard["elapsed_seconds"] = round(t_run_elapsed, 1)
|
||||||
|
scoreboard["workers"] = n_workers
|
||||||
|
|
||||||
out_json = Path(args.output_json)
|
out_json = Path(args.output_json)
|
||||||
out_json.parent.mkdir(parents=True, exist_ok=True)
|
out_json.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|||||||
@@ -1,22 +1,22 @@
|
|||||||
{
|
{
|
||||||
"totals": {
|
"totals": {
|
||||||
"pass": 66,
|
"pass": 67,
|
||||||
"fail": 206,
|
"fail": 204,
|
||||||
"skip": 39,
|
"skip": 39,
|
||||||
"timeout": 16,
|
"timeout": 17,
|
||||||
"total": 327,
|
"total": 327,
|
||||||
"runnable": 288,
|
"runnable": 288,
|
||||||
"pass_rate": 22.9
|
"pass_rate": 23.3
|
||||||
},
|
},
|
||||||
"categories": [
|
"categories": [
|
||||||
{
|
{
|
||||||
"category": "built-ins/Math",
|
"category": "built-ins/Math",
|
||||||
"total": 327,
|
"total": 327,
|
||||||
"pass": 66,
|
"pass": 67,
|
||||||
"fail": 206,
|
"fail": 204,
|
||||||
"skip": 39,
|
"skip": 39,
|
||||||
"timeout": 16,
|
"timeout": 17,
|
||||||
"pass_rate": 22.9,
|
"pass_rate": 23.3,
|
||||||
"top_failures": [
|
"top_failures": [
|
||||||
[
|
[
|
||||||
"ReferenceError (undefined symbol)",
|
"ReferenceError (undefined symbol)",
|
||||||
@@ -28,11 +28,11 @@
|
|||||||
],
|
],
|
||||||
[
|
[
|
||||||
"TypeError: not a function",
|
"TypeError: not a function",
|
||||||
31
|
30
|
||||||
],
|
],
|
||||||
[
|
[
|
||||||
"Timeout",
|
"Timeout",
|
||||||
16
|
17
|
||||||
],
|
],
|
||||||
[
|
[
|
||||||
"Unhandled: Not callable: {:random <js-math-random()> :trunc <js-math-tr",
|
"Unhandled: Not callable: {:random <js-math-random()> :trunc <js-math-tr",
|
||||||
@@ -52,21 +52,18 @@
|
|||||||
],
|
],
|
||||||
[
|
[
|
||||||
"TypeError: not a function",
|
"TypeError: not a function",
|
||||||
31
|
30
|
||||||
],
|
],
|
||||||
[
|
[
|
||||||
"Timeout",
|
"Timeout",
|
||||||
16
|
17
|
||||||
],
|
],
|
||||||
[
|
[
|
||||||
"Unhandled: Not callable: {:random <js-math-random()> :trunc <js-math-tr",
|
"Unhandled: Not callable: {:random <js-math-random()> :trunc <js-math-tr",
|
||||||
1
|
1
|
||||||
],
|
|
||||||
[
|
|
||||||
"SyntaxError (parse/unsupported syntax)",
|
|
||||||
1
|
|
||||||
]
|
]
|
||||||
],
|
],
|
||||||
"pinned_commit": "d5e73fc8d2c663554fb72e2380a8c2bc1a318a33",
|
"pinned_commit": "d5e73fc8d2c663554fb72e2380a8c2bc1a318a33",
|
||||||
"elapsed_seconds": 275.0
|
"elapsed_seconds": 426.2,
|
||||||
|
"workers": 2
|
||||||
}
|
}
|
||||||
@@ -1,31 +1,30 @@
|
|||||||
# test262 scoreboard
|
# test262 scoreboard
|
||||||
|
|
||||||
Pinned commit: `d5e73fc8d2c663554fb72e2380a8c2bc1a318a33`
|
Pinned commit: `d5e73fc8d2c663554fb72e2380a8c2bc1a318a33`
|
||||||
Wall time: 275.0s
|
Wall time: 426.2s
|
||||||
|
|
||||||
**Total:** 66/288 runnable passed (22.9%). Raw: pass=66 fail=206 skip=39 timeout=16 total=327.
|
**Total:** 67/288 runnable passed (23.3%). Raw: pass=67 fail=204 skip=39 timeout=17 total=327.
|
||||||
|
|
||||||
## Top failure modes
|
## Top failure modes
|
||||||
|
|
||||||
- **94x** ReferenceError (undefined symbol)
|
- **94x** ReferenceError (undefined symbol)
|
||||||
- **79x** Test262Error (assertion failed)
|
- **79x** Test262Error (assertion failed)
|
||||||
- **31x** TypeError: not a function
|
- **30x** TypeError: not a function
|
||||||
- **16x** Timeout
|
- **17x** Timeout
|
||||||
- **1x** Unhandled: Not callable: {:random <js-math-random()> :trunc <js-math-tr
|
- **1x** Unhandled: Not callable: {:random <js-math-random()> :trunc <js-math-tr
|
||||||
- **1x** SyntaxError (parse/unsupported syntax)
|
|
||||||
|
|
||||||
## Categories (worst pass-rate first, min 10 runnable)
|
## Categories (worst pass-rate first, min 10 runnable)
|
||||||
|
|
||||||
| Category | Pass | Fail | Skip | Timeout | Total | Pass % |
|
| Category | Pass | Fail | Skip | Timeout | Total | Pass % |
|
||||||
|---|---:|---:|---:|---:|---:|---:|
|
|---|---:|---:|---:|---:|---:|---:|
|
||||||
| built-ins/Math | 66 | 206 | 39 | 16 | 327 | 22.9% |
|
| built-ins/Math | 67 | 204 | 39 | 17 | 327 | 23.3% |
|
||||||
|
|
||||||
## Per-category top failures (min 10 runnable, worst first)
|
## Per-category top failures (min 10 runnable, worst first)
|
||||||
|
|
||||||
### built-ins/Math (66/288 — 22.9%)
|
### built-ins/Math (67/288 — 23.3%)
|
||||||
|
|
||||||
- **94x** ReferenceError (undefined symbol)
|
- **94x** ReferenceError (undefined symbol)
|
||||||
- **79x** Test262Error (assertion failed)
|
- **79x** Test262Error (assertion failed)
|
||||||
- **31x** TypeError: not a function
|
- **30x** TypeError: not a function
|
||||||
- **16x** Timeout
|
- **17x** Timeout
|
||||||
- **1x** Unhandled: Not callable: {:random <js-math-random()> :trunc <js-math-tr
|
- **1x** Unhandled: Not callable: {:random <js-math-random()> :trunc <js-math-tr
|
||||||
|
|||||||
Reference in New Issue
Block a user