Files
rose-ash/tests/playwright/scrape-hs-upstream.py
giles fd1dfea9b3 HS tests: scrape v0.9.90 upstream in full, flip silent stubs to loud SKIPs
- scrape-hs-upstream.py: new scraper walks /tmp/hs-upstream/test/**/*.js
  and emits body-style records for all 1,496 v0.9.90 tests (up from 831).
  Widens coverage into 66 previously-missing categories — templates,
  reactivity, behavior, worker, classRef, make, throw, htmx, tailwind,
  viewTransition, and more.

- build-hs-manifest.py + hyperscript-upstream-manifest.{json,md}:
  coverage manifest tagging each upstream test with a status
  (runnable / skip-listed / untranslated / missing) and block reason.

- generate-sx-tests.py: emit (error "SKIP (...)") instead of silent
  (hs-cleanup!) no-op for both skip-listed tests and generator-
  untranslatable bodies. Stub counter now reports both buckets.

- hyperscript-feature-audit-0.9.90.md: gap audit against the 0.9.90
  spec; pre-0.9.90.json backs up prior 831-test snapshot.

New honest baseline (ocaml runner, test-hyperscript-behavioral):
  831 -> 1,496 tests; 645 -> 1,013 passing (67.7% conformance).
  483 failures split: 45 skip-list, 151 untranslated, 287 real.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 20:27:22 +00:00

298 lines
9.9 KiB
Python

#!/usr/bin/env python3
"""Scrape every test from _hyperscript v0.9.90 upstream into our JSON format.
Walks /tmp/hs-upstream/test/**/*.js, parses `test.describe(...)` and `test(...)`
calls with balanced-paren scanning, extracts the arrow function body, and the
first html(...) argument. Emits /root/rose-ash/spec/tests/hyperscript-upstream-tests.json
in body-style Playwright format (matching existing body entries).
"""
import json, os, re, sys
from collections import Counter
from pathlib import Path
# Upstream _hyperscript checkout being scraped.
HS_ROOT = Path('/tmp/hs-upstream')
# The upstream test tree we walk recursively for *.js test files.
TEST_ROOT = HS_ROOT / 'test'
# Output: body-style Playwright-format test records.
OUT_JSON = Path('/root/rose-ash/spec/tests/hyperscript-upstream-tests.json')
# One-time snapshot of the pre-0.9.90 JSON, written before the first overwrite.
BACKUP = Path('/root/rose-ash/spec/tests/hyperscript-upstream-tests.pre-0.9.90.json')
# Infrastructure files that contain no scrapeable test() calls.
SKIP_FILES = {'fixtures.js', 'global-setup.js', 'global-teardown.js',
              'entry.js', 'htmx-fixtures.js', 'playwright.config.js'}
# --- tokeniser-ish balanced-paren scanner -----------------------------------
def parse_string_literal(src, i):
    """Parse the JS string literal starting at src[i].

    src[i] must be the opening quote ('"', "'", or '`').  Returns
    (decoded_value, index_after_closing_quote).  Common backslash escapes
    (\\n, \\t, \\r, \\\\, and the active quote) are decoded; any other
    escaped character is kept bare, matching JS identity-escape semantics
    ('\\d' -> 'd').  Template literals keep `${...}` interpolations
    verbatim — including any string literals nested inside them, which the
    previous version silently dropped.  Raises ValueError when the literal
    is unterminated.
    """
    q = src[i]
    i += 1
    out = []
    while i < len(src):
        c = src[i]
        if c == '\\':
            nxt = src[i + 1] if i + 1 < len(src) else ''
            if nxt == 'n':
                out.append('\n')
            elif nxt == 't':
                out.append('\t')
            elif nxt == 'r':
                out.append('\r')
            elif nxt == '\\':
                out.append('\\')
            elif nxt == q:
                out.append(q)
            else:
                # Identity escape: JS treats '\d' as plain 'd'.
                out.append(nxt)
            i += 2
        elif c == q:
            return ''.join(out), i + 1
        elif q == '`' and c == '$' and i + 1 < len(src) and src[i + 1] == '{':
            # Template interpolation — copy the balanced ${...} span verbatim.
            out.append('${')
            i += 2
            depth = 1
            while i < len(src) and depth > 0:
                cc = src[i]
                if cc in ('"', "'", '`'):
                    # Bug fix: keep the nested literal's raw source (quotes
                    # included) so the interpolation text survives intact.
                    lit_start = i
                    _, i = parse_string_literal(src, i)
                    out.append(src[lit_start:i])
                    continue
                if cc == '{':
                    depth += 1
                elif cc == '}':
                    depth -= 1
                out.append(cc)  # closing '}' at depth 0 is included, as before
                i += 1
        else:
            out.append(c)
            i += 1
    raise ValueError("unterminated string")
def skip_comment_or_regex(src, i):
    """If src[i:] starts a // comment, /* block */ comment, or a regex
    literal, return the index just past it; otherwise return None.

    Regex detection is heuristic: a '/' opens a regex only when the
    previous non-space character looks like an operator or an opening
    bracket (a value cannot have just ended, so the '/' cannot be
    division).  Character classes and backslash escapes inside the regex
    body are honoured; a bare newline aborts (not a regex after all).
    """
    if src[i] != '/' or i + 1 >= len(src):
        return None
    nxt = src[i + 1]
    if nxt == '/':
        # Line comment — consume through the newline (or to end of input).
        j = src.find('\n', i)
        return len(src) if j == -1 else j + 1
    if nxt == '*':
        # Block comment — consume through '*/' (or to end of input).
        j = src.find('*/', i)
        return len(src) if j == -1 else j + 2
    # Regex heuristic: inspect the preceding non-space character.
    # (Dead clause `and prev not in ''` removed — it was always True.)
    k = i - 1
    while k >= 0 and src[k].isspace():
        k -= 1
    prev = src[k] if k >= 0 else ''
    if prev and prev not in '(,;=!?&|:+-*/<>%^~{[\n':
        # Previous token ended a value — this '/' is division, not a regex.
        return None
    j = i + 1
    while j < len(src):
        cc = src[j]
        if cc == '\\':
            # Escaped character inside the regex body.
            j += 2
            continue
        if cc == '[':
            # Character class: '/' inside it is literal, skip to ']'.
            j += 1
            while j < len(src) and src[j] != ']':
                j += 2 if src[j] == '\\' else 1
            if j < len(src):
                j += 1
            continue
        if cc == '/':
            # Closing delimiter — also swallow trailing flags like /g, /im.
            j += 1
            while j < len(src) and src[j].isalpha():
                j += 1
            return j
        if cc == '\n':
            # Regex literals cannot span lines — bail out.
            return None
        j += 1
    return None
def find_matching(src, start, open_c='(', close_c=')'):
    """src[start] is expected to be open_c; return the index of the
    balancing close_c, or -1 when the input ends first or contains an
    unterminated string.  String literals, comments, and regex literals
    are stepped over wholesale so their contents never affect depth."""
    depth = 0
    i = start
    n = len(src)
    while i < n:
        ch = src[i]
        # String literal: hand off to the string parser, resume after it.
        if ch in ('"', "'", '`'):
            try:
                _, i = parse_string_literal(src, i)
            except ValueError:
                return -1
            continue
        # Comment or regex literal: jump past it in one step.
        skipped = skip_comment_or_regex(src, i)
        if skipped is not None:
            i = skipped
            continue
        if ch == open_c:
            depth += 1
        elif ch == close_c:
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1
# --- test extraction --------------------------------------------------------
def extract_arrow_body(call_src):
    """Given the full '(...)' argument source of a test(name, fn) call,
    return the text between the braces of fn's arrow-function body, or
    None when no `=> { ... }` body can be located.  The body is returned
    exactly as written (no reindentation)."""
    arrow_at = call_src.find('=>')
    if arrow_at == -1:
        return None
    # Skip whitespace after '=>' to reach the opening brace.
    pos = arrow_at + 2
    while pos < len(call_src) and call_src[pos].isspace():
        pos += 1
    if pos >= len(call_src) or call_src[pos] != '{':
        return None  # expression-bodied arrow — nothing to extract
    close = find_matching(call_src, pos, '{', '}')
    if close == -1:
        return None
    return call_src[pos + 1:close]
def extract_first_html(body):
    """Locate the first html(...) call in body and return its argument as
    one plain string.  Supports a single literal or a '+' concatenation of
    literals — html("x"), html(`x`), html("x" + 'y').  Returns '' when the
    call is absent or its argument is not purely string literals."""
    call = re.search(r'\bhtml\s*\(', body)
    if call is None:
        return ''
    open_paren = call.end() - 1
    close_paren = find_matching(body, open_paren, '(', ')')
    if close_paren == -1:
        return ''
    args = body[open_paren + 1:close_paren].strip()
    pieces = []
    pos = 0
    while pos < len(args):
        ch = args[pos]
        if ch.isspace() or ch == '+':
            # '+' concatenation operators and surrounding whitespace.
            pos += 1
            continue
        if ch not in ('"', "'", '`'):
            # Not a pure string concatenation — give up.
            return ''
        try:
            text, pos = parse_string_literal(args, pos)
        except ValueError:
            return ''
        pieces.append(text)
    return ''.join(pieces)
def extract_tests_from_file(path, rel_category):
    """Scrape every plain test("name", ... => { ... }) call from one JS
    file and return a list of record dicts in our body-style JSON format.

    rel_category is the category label attached to each record.  The
    lookbehind rules out test.describe / test.only / test.skip and any
    identifier merely ending in 'test'.  Calls whose first argument is not
    a string literal, or that have no `=> { ... }` body, are skipped.
    """
    # Upstream sources are UTF-8; pin the encoding rather than trusting locale.
    src = path.read_text(encoding='utf-8')
    # Compile once and search with pos= instead of re-slicing src[i:] each
    # iteration (the old approach copied the tail every time — O(n^2)).
    # Searching in place also lets the lookbehind see the true preceding
    # character rather than a slice boundary.
    test_call = re.compile(r'(?<![a-zA-Z0-9_$.])test\s*\(')
    tests = []
    i = 0
    while i < len(src):
        m = test_call.search(src, i)
        if not m:
            break
        paren = m.end() - 1  # index of the '(' opening the call
        # First argument must be a string literal: the test name.
        j = paren + 1
        while j < len(src) and src[j].isspace():
            j += 1
        if j >= len(src) or src[j] not in ('"', "'", '`'):
            i = paren + 1
            continue
        try:
            tname, _ = parse_string_literal(src, j)
        except ValueError:
            i = paren + 1
            continue
        endp = find_matching(src, paren, '(', ')')
        if endp == -1:
            i = paren + 1
            continue
        call_src = src[paren:endp + 1]
        body = extract_arrow_body(call_src)
        if body is None:
            i = endp + 1
            continue
        tests.append({
            'category': rel_category,
            'name': tname,
            'html': extract_first_html(body),
            'body': body,
            'async': True,  # every scraped test is recorded as async
            'complexity': classify_complexity(body),
        })
        i = endp + 1
    return tests
def classify_complexity(body):
    """Bucket a scraped test body by the kind of machinery it relies on.

    Checks run in priority order: sinon usage, inline hyperscript /
    hypertemplate script tags, <dialog>/showModal, raw Promises, then —
    only for bodies with no html(...) fixture — evaluate()/run() calls.
    Everything else is 'simple'."""
    if 'sinon.' in body:
        return 'sinon'
    script_markers = (
        '<script type="text/hyperscript"',
        "<script type='text/hyperscript'",
        '<script type="text/hypertemplate"',
        "<script type='text/hypertemplate'",
    )
    if any(marker in body for marker in script_markers):
        return 'script-tag'
    if 'showModal' in body or '<dialog' in body.lower():
        return 'dialog'
    if 'new Promise' in body or '.resolves' in body or 'Promise.' in body:
        return 'promise'
    if 'html(' not in body:
        if '_hyperscript.evaluate' in body or re.search(r'\bevaluate\s*\(', body):
            return 'eval-only'
        if re.search(r'\brun\s*\(', body):
            return 'run-eval'
    return 'simple'
# --- main -------------------------------------------------------------------
def rel_category(path):
    """Map a test file path to its category label.

    test/commands/foo.js and test/features/foo.js collapse to plain 'foo';
    other trees keep their directory prefix: test/core/api.js ->
    'core/api', and deeper nesting joins every directory component with
    the file stem."""
    parts = path.relative_to(TEST_ROOT).parts
    stem = path.stem
    if len(parts) == 1:
        # File directly under test/ — unexpected, but label by stem alone.
        return stem
    if parts[0] in ('commands', 'features'):
        return stem
    if len(parts) == 2:
        return f'{parts[0]}/{stem}'
    # Deeper nesting — join all directory parts plus the stem.
    return '/'.join(parts[:-1] + (stem,))
def main():
    """Walk TEST_ROOT, scrape every test file, and write the deduplicated,
    sorted records to OUT_JSON (backing up the previous JSON once)."""
    # One-time backup so the pre-0.9.90 snapshot survives the overwrite.
    if OUT_JSON.exists() and not BACKUP.exists():
        import shutil
        shutil.copy2(OUT_JSON, BACKUP)
        print(f'Backed up existing JSON to {BACKUP}', file=sys.stderr)
    all_tests = []
    file_count = 0
    for path in sorted(TEST_ROOT.rglob('*.js')):
        if path.name in SKIP_FILES:
            continue
        if any(p in ('vendor', 'node_modules', 'manual') for p in path.parts):
            continue
        all_tests.extend(extract_tests_from_file(path, rel_category(path)))
        file_count += 1
    # Deduplicate on (category, name), keeping the first occurrence (stable).
    seen = {}
    for t in all_tests:
        seen.setdefault((t['category'], t['name']), t)
    deduped = sorted(seen.values(), key=lambda t: (t['category'], t['name']))
    # Per-category summary on stdout.
    cat_counts = Counter(t['category'] for t in deduped)
    print(f'Scanned {file_count} files, extracted {len(all_tests)} tests ({len(deduped)} unique)')
    print(f'Categories: {len(cat_counts)}')
    for cat, n in cat_counts.most_common():
        print(f' {cat:40s} {n:4d}')
    with OUT_JSON.open('w') as f:
        json.dump(deduped, f, indent=2, ensure_ascii=False)
        f.write('\n')
    print(f'\nWrote {OUT_JSON} ({len(deduped)} tests)')
# Script entry point: scrape upstream tests and write the JSON when run directly.
if __name__ == '__main__':
    main()