Files
rose-ash/tests/playwright/scrape-hs-upstream.py
giles fd1dfea9b3 HS tests: scrape v0.9.90 upstream in full, flip silent stubs to loud SKIPs
- scrape-hs-upstream.py: new scraper walks /tmp/hs-upstream/test/**/*.js
  and emits body-style records for all 1,496 v0.9.90 tests (up from 831).
  Widens coverage into 66 previously-missing categories — templates,
  reactivity, behavior, worker, classRef, make, throw, htmx, tailwind,
  viewTransition, and more.

- build-hs-manifest.py + hyperscript-upstream-manifest.{json,md}:
  coverage manifest tagging each upstream test with a status
  (runnable / skip-listed / untranslated / missing) and block reason.

- generate-sx-tests.py: emit (error "SKIP (...)") instead of silent
  (hs-cleanup!) no-op for both skip-listed tests and generator-
  untranslatable bodies. Stub counter now reports both buckets.

- hyperscript-feature-audit-0.9.90.md: gap audit against the 0.9.90
  spec; pre-0.9.90.json backs up prior 831-test snapshot.

New honest baseline (ocaml runner, test-hyperscript-behavioral):
  831 -> 1,496 tests; 645 -> 1,013 passing (67.7% conformance).
  483 failures split: 45 skip-list, 151 untranslated, 287 real.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 20:27:22 +00:00

298 lines
9.9 KiB
Python

#!/usr/bin/env python3
"""Scrape every test from _hyperscript v0.9.90 upstream into our JSON format.
Walks /tmp/hs-upstream/test/**/*.js, parses `test.describe(...)` and `test(...)`
calls with balanced-paren scanning, extracts the arrow function body, and the
first html(...) argument. Emits /root/rose-ash/spec/tests/hyperscript-upstream-tests.json
in body-style Playwright format (matching existing body entries).
"""
import json, os, re, sys
from collections import Counter
from pathlib import Path
# Upstream _hyperscript checkout being scraped.
HS_ROOT = Path('/tmp/hs-upstream')
# The upstream test tree we walk recursively for *.js test files.
TEST_ROOT = HS_ROOT / 'test'
# Output: body-style Playwright-format test records.
OUT_JSON = Path('/root/rose-ash/spec/tests/hyperscript-upstream-tests.json')
# One-time snapshot of the pre-0.9.90 JSON, written before the first overwrite.
BACKUP = Path('/root/rose-ash/spec/tests/hyperscript-upstream-tests.pre-0.9.90.json')
# Infrastructure files that contain no scrapeable test() calls.
SKIP_FILES = {'fixtures.js', 'global-setup.js', 'global-teardown.js',
              'entry.js', 'htmx-fixtures.js', 'playwright.config.js'}
# --- tokeniser-ish balanced-paren scanner -----------------------------------
def parse_string_literal(src, i):
    """Parse the JS string literal starting at src[i].

    src[i] must be the opening quote ('"', "'", or '`').  Returns
    (decoded_value, index_after_closing_quote).  Common backslash escapes
    (\\n, \\t, \\r, \\\\, and the active quote) are decoded; any other
    escaped character is kept bare, matching JS identity-escape semantics
    ('\\d' -> 'd').  Template literals keep `${...}` interpolations
    verbatim — including any string literals nested inside them, which the
    previous version silently dropped.  Raises ValueError when the literal
    is unterminated.
    """
    q = src[i]
    i += 1
    out = []
    while i < len(src):
        c = src[i]
        if c == '\\':
            nxt = src[i + 1] if i + 1 < len(src) else ''
            if nxt == 'n':
                out.append('\n')
            elif nxt == 't':
                out.append('\t')
            elif nxt == 'r':
                out.append('\r')
            elif nxt == '\\':
                out.append('\\')
            elif nxt == q:
                out.append(q)
            else:
                # Identity escape: JS treats '\d' as plain 'd'.
                out.append(nxt)
            i += 2
        elif c == q:
            return ''.join(out), i + 1
        elif q == '`' and c == '$' and i + 1 < len(src) and src[i + 1] == '{':
            # Template interpolation — copy the balanced ${...} span verbatim.
            out.append('${')
            i += 2
            depth = 1
            while i < len(src) and depth > 0:
                cc = src[i]
                if cc in ('"', "'", '`'):
                    # Bug fix: keep the nested literal's raw source (quotes
                    # included) so the interpolation text survives intact.
                    lit_start = i
                    _, i = parse_string_literal(src, i)
                    out.append(src[lit_start:i])
                    continue
                if cc == '{':
                    depth += 1
                elif cc == '}':
                    depth -= 1
                out.append(cc)  # closing '}' at depth 0 is included, as before
                i += 1
        else:
            out.append(c)
            i += 1
    raise ValueError("unterminated string")
def skip_comment_or_regex(src, i):
    """If src[i:] starts a // comment, /* block */ comment, or a regex
    literal, return the index just past it; otherwise return None.

    Regex detection is heuristic: a '/' opens a regex only when the
    previous non-space character looks like an operator or an opening
    bracket (a value cannot have just ended, so the '/' cannot be
    division).  Character classes and backslash escapes inside the regex
    body are honoured; a bare newline aborts (not a regex after all).
    """
    if src[i] != '/' or i + 1 >= len(src):
        return None
    nxt = src[i + 1]
    if nxt == '/':
        # Line comment — consume through the newline (or to end of input).
        j = src.find('\n', i)
        return len(src) if j == -1 else j + 1
    if nxt == '*':
        # Block comment — consume through '*/' (or to end of input).
        j = src.find('*/', i)
        return len(src) if j == -1 else j + 2
    # Regex heuristic: inspect the preceding non-space character.
    # (Dead clause `and prev not in ''` removed — it was always True.)
    k = i - 1
    while k >= 0 and src[k].isspace():
        k -= 1
    prev = src[k] if k >= 0 else ''
    if prev and prev not in '(,;=!?&|:+-*/<>%^~{[\n':
        # Previous token ended a value — this '/' is division, not a regex.
        return None
    j = i + 1
    while j < len(src):
        cc = src[j]
        if cc == '\\':
            # Escaped character inside the regex body.
            j += 2
            continue
        if cc == '[':
            # Character class: '/' inside it is literal, skip to ']'.
            j += 1
            while j < len(src) and src[j] != ']':
                j += 2 if src[j] == '\\' else 1
            if j < len(src):
                j += 1
            continue
        if cc == '/':
            # Closing delimiter — also swallow trailing flags like /g, /im.
            j += 1
            while j < len(src) and src[j].isalpha():
                j += 1
            return j
        if cc == '\n':
            # Regex literals cannot span lines — bail out.
            return None
        j += 1
    return None
def find_matching(src, start, open_c='(', close_c=')'):
    """src[start] is expected to be open_c; return the index of the
    balancing close_c, or -1 when the input ends first or contains an
    unterminated string.  String literals, comments, and regex literals
    are stepped over wholesale so their contents never affect depth."""
    depth = 0
    i = start
    n = len(src)
    while i < n:
        ch = src[i]
        # String literal: hand off to the string parser, resume after it.
        if ch in ('"', "'", '`'):
            try:
                _, i = parse_string_literal(src, i)
            except ValueError:
                return -1
            continue
        # Comment or regex literal: jump past it in one step.
        skipped = skip_comment_or_regex(src, i)
        if skipped is not None:
            i = skipped
            continue
        if ch == open_c:
            depth += 1
        elif ch == close_c:
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1
# --- test extraction --------------------------------------------------------
def extract_arrow_body(call_src):
    """Given the full '(...)' argument source of a test(name, fn) call,
    return the text between the braces of fn's arrow-function body, or
    None when no `=> { ... }` body can be located.  The body is returned
    exactly as written (no reindentation)."""
    arrow_at = call_src.find('=>')
    if arrow_at == -1:
        return None
    # Skip whitespace after '=>' to reach the opening brace.
    pos = arrow_at + 2
    while pos < len(call_src) and call_src[pos].isspace():
        pos += 1
    if pos >= len(call_src) or call_src[pos] != '{':
        return None  # expression-bodied arrow — nothing to extract
    close = find_matching(call_src, pos, '{', '}')
    if close == -1:
        return None
    return call_src[pos + 1:close]
def extract_first_html(body):
    """Locate the first html(...) call in body and return its argument as
    one plain string.  Supports a single literal or a '+' concatenation of
    literals — html("x"), html(`x`), html("x" + 'y').  Returns '' when the
    call is absent or its argument is not purely string literals."""
    call = re.search(r'\bhtml\s*\(', body)
    if call is None:
        return ''
    open_paren = call.end() - 1
    close_paren = find_matching(body, open_paren, '(', ')')
    if close_paren == -1:
        return ''
    args = body[open_paren + 1:close_paren].strip()
    pieces = []
    pos = 0
    while pos < len(args):
        ch = args[pos]
        if ch.isspace() or ch == '+':
            # '+' concatenation operators and surrounding whitespace.
            pos += 1
            continue
        if ch not in ('"', "'", '`'):
            # Not a pure string concatenation — give up.
            return ''
        try:
            text, pos = parse_string_literal(args, pos)
        except ValueError:
            return ''
        pieces.append(text)
    return ''.join(pieces)
def extract_tests_from_file(path, rel_category):
    """Scrape every plain test("name", ... => { ... }) call from one JS
    file and return a list of record dicts in our body-style JSON format.

    rel_category is the category label attached to each record.  The
    lookbehind rules out test.describe / test.only / test.skip and any
    identifier merely ending in 'test'.  Calls whose first argument is not
    a string literal, or that have no `=> { ... }` body, are skipped.
    """
    # Upstream sources are UTF-8; pin the encoding rather than trusting locale.
    src = path.read_text(encoding='utf-8')
    # Compile once and search with pos= instead of re-slicing src[i:] each
    # iteration (the old approach copied the tail every time — O(n^2)).
    # Searching in place also lets the lookbehind see the true preceding
    # character rather than a slice boundary.
    test_call = re.compile(r'(?<![a-zA-Z0-9_$.])test\s*\(')
    tests = []
    i = 0
    while i < len(src):
        m = test_call.search(src, i)
        if not m:
            break
        paren = m.end() - 1  # index of the '(' opening the call
        # First argument must be a string literal: the test name.
        j = paren + 1
        while j < len(src) and src[j].isspace():
            j += 1
        if j >= len(src) or src[j] not in ('"', "'", '`'):
            i = paren + 1
            continue
        try:
            tname, _ = parse_string_literal(src, j)
        except ValueError:
            i = paren + 1
            continue
        endp = find_matching(src, paren, '(', ')')
        if endp == -1:
            i = paren + 1
            continue
        call_src = src[paren:endp + 1]
        body = extract_arrow_body(call_src)
        if body is None:
            i = endp + 1
            continue
        tests.append({
            'category': rel_category,
            'name': tname,
            'html': extract_first_html(body),
            'body': body,
            'async': True,  # every scraped test is recorded as async
            'complexity': classify_complexity(body),
        })
        i = endp + 1
    return tests
def classify_complexity(body):
    """Bucket a scraped test body by the kind of machinery it relies on.

    Checks run in priority order: sinon usage, inline hyperscript /
    hypertemplate script tags, <dialog>/showModal, raw Promises, then —
    only for bodies with no html(...) fixture — evaluate()/run() calls.
    Everything else is 'simple'."""
    if 'sinon.' in body:
        return 'sinon'
    script_markers = (
        '<script type="text/hyperscript"',
        "<script type='text/hyperscript'",
        '<script type="text/hypertemplate"',
        "<script type='text/hypertemplate'",
    )
    if any(marker in body for marker in script_markers):
        return 'script-tag'
    if 'showModal' in body or '<dialog' in body.lower():
        return 'dialog'
    if 'new Promise' in body or '.resolves' in body or 'Promise.' in body:
        return 'promise'
    if 'html(' not in body:
        if '_hyperscript.evaluate' in body or re.search(r'\bevaluate\s*\(', body):
            return 'eval-only'
        if re.search(r'\brun\s*\(', body):
            return 'run-eval'
    return 'simple'
# --- main -------------------------------------------------------------------
def rel_category(path):
    """Map a test file path to its category label.

    test/commands/foo.js and test/features/foo.js collapse to plain 'foo';
    other trees keep their directory prefix: test/core/api.js ->
    'core/api', and deeper nesting joins every directory component with
    the file stem."""
    parts = path.relative_to(TEST_ROOT).parts
    stem = path.stem
    if len(parts) == 1:
        # File directly under test/ — unexpected, but label by stem alone.
        return stem
    if parts[0] in ('commands', 'features'):
        return stem
    if len(parts) == 2:
        return f'{parts[0]}/{stem}'
    # Deeper nesting — join all directory parts plus the stem.
    return '/'.join(parts[:-1] + (stem,))
def main():
    """Walk TEST_ROOT, scrape every test file, and write the deduplicated,
    sorted records to OUT_JSON (backing up the previous JSON once)."""
    # One-time backup so the pre-0.9.90 snapshot survives the overwrite.
    if OUT_JSON.exists() and not BACKUP.exists():
        import shutil
        shutil.copy2(OUT_JSON, BACKUP)
        print(f'Backed up existing JSON to {BACKUP}', file=sys.stderr)
    all_tests = []
    file_count = 0
    for path in sorted(TEST_ROOT.rglob('*.js')):
        if path.name in SKIP_FILES:
            continue
        if any(p in ('vendor', 'node_modules', 'manual') for p in path.parts):
            continue
        all_tests.extend(extract_tests_from_file(path, rel_category(path)))
        file_count += 1
    # Deduplicate on (category, name), keeping the first occurrence (stable).
    seen = {}
    for t in all_tests:
        seen.setdefault((t['category'], t['name']), t)
    deduped = sorted(seen.values(), key=lambda t: (t['category'], t['name']))
    # Per-category summary on stdout.
    cat_counts = Counter(t['category'] for t in deduped)
    print(f'Scanned {file_count} files, extracted {len(all_tests)} tests ({len(deduped)} unique)')
    print(f'Categories: {len(cat_counts)}')
    for cat, n in cat_counts.most_common():
        print(f' {cat:40s} {n:4d}')
    with OUT_JSON.open('w') as f:
        json.dump(deduped, f, indent=2, ensure_ascii=False)
        f.write('\n')
    print(f'\nWrote {OUT_JSON} ({len(deduped)} tests)')
# Script entry point: scrape upstream tests and write the JSON when run directly.
if __name__ == '__main__':
    main()