rose-ash/scripts/extract-upstream-tests.py

#!/usr/bin/env python3
"""Extract _hyperscript upstream tests into spec/tests/hyperscript-upstream-tests.json.

Walks /tmp/hs-upstream/test/**/*.js, finds every test('name', ...) call, extracts:
  - category   from file path (test/core/tokenizer.js → "core/tokenizer")
  - name       from first arg
  - body       from arrow function body (between outer { and })
  - html       from preceding test.use({html: '...'}) if any
  - async      from whether the arrow function is async
  - complexity heuristic — eval-only / event-driven / dom

Output: spec/tests/hyperscript-upstream-tests.json (overwrites)

Run after: cd /tmp && git clone --depth 1 https://github.com/bigskysoftware/_hyperscript hs-upstream
"""
import json
import os
import re
from pathlib import Path

UPSTREAM = Path('/tmp/hs-upstream/test')
OUT = Path(__file__).parent.parent / 'spec/tests/hyperscript-upstream-tests.json'


def find_matching_brace(src, open_idx):
    """Return index of matching close brace for { at open_idx. Handles strings/comments."""
    assert src[open_idx] == '{'
    depth = 0
    i = open_idx
    n = len(src)
    while i < n:
        c = src[i]
        if c == '{':
            depth += 1
        elif c == '}':
            depth -= 1
            if depth == 0:
                return i
        elif c == '"' or c == "'" or c == '`':
            # skip string
            quote = c
            i += 1
            while i < n and src[i] != quote:
                if src[i] == '\\':
                    i += 2
                    continue
                if quote == '`' and src[i] == '$' and i + 1 < n and src[i+1] == '{':
                    # template literal interpolation — skip nested braces
                    nested = find_matching_brace(src, i + 1)
                    i = nested + 1
                    continue
                i += 1
        elif c == '/' and i + 1 < n:
            nxt = src[i+1]
            if nxt == '/':
                # line comment
                while i < n and src[i] != '\n':
                    i += 1
                continue
            elif nxt == '*':
                # block comment
                i += 2
                while i < n - 1 and not (src[i] == '*' and src[i+1] == '/'):
                    i += 1
                i += 1
        i += 1
    raise ValueError(f"unbalanced brace at {open_idx}")


def extract_tests(src, category):
    """Find test('name', async/non-async ({...}) => { body }) patterns."""
    tests = []
    i = 0
    n = len(src)
    test_re = re.compile(r"\btest\s*\(\s*(['\"])((?:[^\\]|\\.)*?)\1\s*,\s*(async\s+)?(\([^)]*\))\s*=>\s*\{")
    for m in test_re.finditer(src):
        name = m.group(2)
        # Unescape quotes
        name = name.replace("\\'", "'").replace('\\"', '"').replace('\\\\', '\\')
        is_async = m.group(3) is not None
        body_open = src.index('{', m.end() - 1)
        try:
            body_close = find_matching_brace(src, body_open)
        except ValueError:
            continue
        body = src[body_open + 1:body_close]
        # Heuristic complexity classification
        complexity = 'eval-only'
        if 'html(' in body or 'find(' in body:
            complexity = 'dom'
        if 'click(' in body or 'dispatch' in body:
            complexity = 'event-driven'
        tests.append({
            'category': category,
            'name': name,
            'html': '',
            'body': body,
            'async': is_async,
            'complexity': complexity,
        })
    return tests


def main():
    import sys
    if not UPSTREAM.exists():
        print(f"ERROR: {UPSTREAM} not found. Clone first:")
        print("  git clone --depth 1 https://github.com/bigskysoftware/_hyperscript /tmp/hs-upstream")
        return 1

    merge_mode = '--replace' not in sys.argv

    all_tests = []
    skipped_files = []

    for path in sorted(UPSTREAM.rglob('*.js')):
        if path.name in {'fixtures.js', 'entry.js', 'global-setup.js', 'global-teardown.js',
                         'htmx-fixtures.js', 'playwright.config.js'}:
            continue

        rel = path.relative_to(UPSTREAM)
        category = str(rel.with_suffix('')).replace('\\', '/')
        for prefix in ('commands/', 'features/'):
            if category.startswith(prefix):
                category = category[len(prefix):]
                break

        try:
            src = path.read_text()
        except Exception as e:
            skipped_files.append((path, str(e)))
            continue

        all_tests.extend(extract_tests(src, category))

    print(f"Extracted {len(all_tests)} tests from {len(list(UPSTREAM.rglob('*.js')))} files")
    if skipped_files:
        print(f"Skipped {len(skipped_files)} files due to errors")

    if not OUT.exists():
        OUT.write_text(json.dumps(all_tests, indent=2))
        print(f"\nWrote {OUT} (no existing snapshot)")
        return 0

    old = json.loads(OUT.read_text())
    old_by_key = {(t['category'], t['name']): t for t in old}
    new_keys = set((t['category'], t['name']) for t in all_tests)
    old_keys = set(old_by_key)
    added_keys = new_keys - old_keys
    removed_keys = old_keys - new_keys

    print(f"\nDelta vs existing snapshot ({len(old)} tests):")
    print(f"  +{len(added_keys)} new")
    print(f"  -{len(removed_keys)} removed/renamed")
    if added_keys:
        print("\nNew tests:")
        for cat, name in sorted(added_keys):
            print(f"  [{cat}] {name}")
    if removed_keys:
        print("\nRemoved/renamed tests (first 20):")
        for cat, name in sorted(removed_keys)[:20]:
            print(f"  [{cat}] {name}")

    if merge_mode:
        # Merge mode (default): preserve existing test bodies, only add new tests.
        # The old snapshot's bodies were curated/cleaned — re-extracting from raw
        # upstream JS produces slightly different bodies that may not auto-translate.
        # New tests get the raw extracted body; existing tests keep theirs.
        new_by_key = {(t['category'], t['name']): t for t in all_tests}
        merged = list(old)  # preserves original order
        for k in sorted(added_keys):
            merged.append(new_by_key[k])
        OUT.write_text(json.dumps(merged, indent=2))
        print(f"\nMerged: {len(merged)} tests ({len(old)} existing + {len(added_keys)} new) → {OUT}")
        print("       (rerun with --replace to discard old bodies and use raw upstream)")
    else:
        OUT.write_text(json.dumps(all_tests, indent=2))
        print(f"\nReplaced: {len(all_tests)} tests → {OUT}")
    return 0


if __name__ == '__main__':
    raise SystemExit(main())