#!/usr/bin/env python3
"""Scrape every test from _hyperscript v0.9.90 upstream into our JSON format.

Walks /tmp/hs-upstream/test/**/*.js, parses `test.describe(...)` and `test(...)`
calls with balanced-paren scanning, extracts the arrow function body, and the
first html(...) argument.  Emits
/root/rose-ash/spec/tests/hyperscript-upstream-tests.json in body-style
Playwright format (matching existing body entries).
"""
import json
import os
import re
import sys
from collections import Counter
from pathlib import Path

HS_ROOT = Path('/tmp/hs-upstream')
TEST_ROOT = HS_ROOT / 'test'
OUT_JSON = Path('/root/rose-ash/spec/tests/hyperscript-upstream-tests.json')
BACKUP = Path('/root/rose-ash/spec/tests/hyperscript-upstream-tests.pre-0.9.90.json')
SKIP_FILES = {'fixtures.js', 'global-setup.js', 'global-teardown.js',
              'entry.js', 'htmx-fixtures.js', 'playwright.config.js'}


# --- tokeniser-ish balanced-paren scanner -----------------------------------

def parse_string_literal(src, i):
    """src[i] must be quote; return (value, next_i).

    Handles the three JS quote styles and template literals with ${...}.
    The value has common backslash escapes decoded; a ${...} interpolation
    is copied through verbatim (nested string literals inside it are skipped
    for brace balancing but their characters are NOT copied into the value).

    Raises ValueError on an unterminated string.
    """
    q = src[i]
    i += 1
    out = []
    while i < len(src):
        c = src[i]
        if c == '\\':
            nxt = src[i + 1] if i + 1 < len(src) else ''
            # Decode the escapes we care about; any other escaped char is
            # kept literally (e.g. \" -> ", \d -> d).
            if nxt == 'n':
                out.append('\n'); i += 2
            elif nxt == 't':
                out.append('\t'); i += 2
            elif nxt == 'r':
                out.append('\r'); i += 2
            elif nxt == '\\':
                out.append('\\'); i += 2
            elif nxt == q:
                out.append(q); i += 2
            else:
                out.append(nxt); i += 2
        elif c == q:
            return ''.join(out), i + 1
        elif q == '`' and c == '$' and i + 1 < len(src) and src[i + 1] == '{':
            # template interpolation — skip balanced braces
            out.append('${'); i += 2
            depth = 1
            while i < len(src) and depth > 0:
                cc = src[i]
                if cc in ('"', "'", '`'):
                    # Skip a nested string so its braces don't affect depth.
                    _, i = parse_string_literal(src, i)
                    continue
                if cc == '{':
                    depth += 1
                elif cc == '}':
                    depth -= 1
                out.append(cc); i += 1
        else:
            out.append(c); i += 1
    raise ValueError("unterminated string")


def skip_comment_or_regex(src, i):
    """If src[i:] starts a // comment, /* block */, or regex literal, return next index.
    Else None."""
    if src[i] != '/' or i + 1 >= len(src):
        return None
    nxt = src[i + 1]
    if nxt == '/':
        # Line comment: consume through end of line.
        j = src.find('\n', i)
        return len(src) if j == -1 else j + 1
    if nxt == '*':
        # Block comment: consume through closing */.
        j = src.find('*/', i)
        return len(src) if j == -1 else j + 2
    # regex heuristic: preceding non-space char is operator-ish
    k = i - 1
    while k >= 0 and src[k].isspace():
        k -= 1
    prev = src[k] if k >= 0 else ''
    if prev and prev not in '(,;=!?&|:+-*/<>%^~{[\n':
        # not regex context — looks like division
        return None
    # Scan the regex body: escapes pass through, [...] classes may contain
    # an unescaped '/', the closing '/' may be followed by flag letters.
    j = i + 1
    while j < len(src):
        cc = src[j]
        if cc == '\\':
            j += 2
            continue
        if cc == '[':
            j += 1
            while j < len(src) and src[j] != ']':
                if src[j] == '\\':
                    j += 2
                else:
                    j += 1
            if j < len(src):
                j += 1
            continue
        if cc == '/':
            j += 1
            while j < len(src) and src[j].isalpha():
                j += 1
            return j
        if cc == '\n':
            # Regex literals can't span lines — this wasn't a regex.
            return None
        j += 1
    return None


def find_matching(src, start, open_c='(', close_c=')'):
    """start is index of open_c; return index of matching close_c.

    Skips string literals, comments, and regex literals so brackets inside
    them don't affect the depth count.  Returns -1 when unbalanced or when a
    string is unterminated.
    """
    depth = 0
    i = start
    while i < len(src):
        c = src[i]
        if c in ('"', "'", '`'):
            try:
                _, i = parse_string_literal(src, i)
            except ValueError:
                return -1
            continue
        j = skip_comment_or_regex(src, i)
        if j is not None:
            i = j
            continue
        if c == open_c:
            depth += 1
            i += 1
        elif c == close_c:
            depth -= 1
            if depth == 0:
                return i
            i += 1
        else:
            i += 1
    return -1


# --- test extraction --------------------------------------------------------

def extract_arrow_body(call_src):
    """Given the full `(...args...)` source of test(name, fn), extract the fn body.
    Returns the content between { and } of the arrow function body, or None."""
    # Find the arrow
    arrow = call_src.find('=>')
    if arrow == -1:
        return None
    # Find the first { after =>
    j = arrow + 2
    while j < len(call_src) and call_src[j].isspace():
        j += 1
    if j >= len(call_src) or call_src[j] != '{':
        return None
    end = find_matching(call_src, j, '{', '}')
    if end == -1:
        return None
    body = call_src[j + 1:end]
    return body


def extract_first_html(body):
    """Find the first html(...) call in body and extract its literal string argument.
    Supports html("x" + "y"), html(`x`), html("x"). Returns '' if not findable."""
    m = re.search(r'\bhtml\s*\(', body)
    if not m:
        return ''
    lp = m.end() - 1
    rp = find_matching(body, lp, '(', ')')
    if rp == -1:
        return ''
    args = body[lp + 1:rp].strip()
    # Args should be a string or concatenation of strings.
    parts = []
    i = 0
    while i < len(args):
        c = args[i]
        if c.isspace() or c == '+':
            i += 1
            continue
        if c in ('"', "'", '`'):
            try:
                val, i = parse_string_literal(args, i)
                parts.append(val)
            except ValueError:
                return ''
        else:
            # not a pure string concatenation — bail
            return ''
    return ''.join(parts)


def extract_tests_from_file(path, rel_category):
    """Parse one upstream .js file and return a list of test dicts.

    Each dict carries category, name, html fixture, arrow-function body,
    an async flag (always True), and a complexity classification.
    """
    src = path.read_text()
    # Find every test( call (not test.describe, not test.skip.)
    tests = []
    i = 0
    while i < len(src):
        # NOTE(review): the original pattern was corrupted in the source under
        # review; this reconstruction matches a bare `test(` while rejecting
        # `xxx.test(` / `mytest(` via the lookbehind — confirm against upstream.
        m = re.search(r'(?<![\w.])test\s*\(', src[i:])
        if not m:
            break
        abs_paren = i + m.end() - 1
        # First argument must be the test-name string literal.
        j = abs_paren + 1
        while j < len(src) and src[j].isspace():
            j += 1
        if j >= len(src) or src[j] not in ('"', "'", '`'):
            i = abs_paren + 1
            continue
        try:
            tname, _ = parse_string_literal(src, j)
        except ValueError:
            i = abs_paren + 1
            continue
        endp = find_matching(src, abs_paren, '(', ')')
        if endp == -1:
            i = abs_paren + 1
            continue
        call_src = src[abs_paren:endp + 1]
        body = extract_arrow_body(call_src)
        if body is None:
            i = endp + 1
            continue
        html = extract_first_html(body)
        tests.append({
            'category': rel_category,
            'name': tname,
            'html': html,
            'body': body,
            'async': True,
            'complexity': classify_complexity(body),
        })
        i = endp + 1
    return tests


def classify_complexity(body):
    """Classify a test body so downstream tooling can bucket tests.

    TODO(review): this function was truncated in the source under review after
    the sinon branch; restore the remaining classification branches from the
    original file before relying on any value other than 'sinon'.
    """
    if 'sinon.' in body:
        return 'sinon'
    return 'simple'