Files
rose-ash/scripts/migrate_one_per_file.py
giles 6528ce78b9 Scripts: page migration helpers for one-per-file layout
Python + shell tooling used to split grouped index.sx files into
one-directory-per-page layout (see the hyperscript gallery migration).
name-mapping.json records the rename table; strip_names.py is a helper
for extracting component names from .sx sources.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 09:09:15 +00:00

447 lines
14 KiB
Python

#!/usr/bin/env python3
"""Migrate sx_docs components to one-definition-per-file convention.
Reads all .sx files under sx/sx/ and sx/sxc/, splits multi-definition
files into one file per definition.
Usage:
python3 scripts/migrate_one_per_file.py --dry-run # preview
python3 scripts/migrate_one_per_file.py # execute
"""
import os
import sys
import json
import argparse
from pathlib import Path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from shared.sx.parser import parse_all, serialize
from shared.sx.types import Symbol, Keyword
# Definition heads whose second element is the definition's name (a Symbol).
# get_def_info() strips a leading "~" from these names.
NAMED_DEFS = {"defcomp", "defisland", "defmacro", "defpage",
              "defhandler", "defstyle", "deftype", "defeffect",
              "defrelation", "deftest"}
# Files that are never split, even when they hold multiple definitions.
SKIP_FILES = {"boundary.sx"}
def get_def_info(expr):
    """Classify a top-level form.

    Returns a ``(keyword, name)`` pair when *expr* is a recognized
    definition form, otherwise ``None``.
    """
    if not isinstance(expr, list) or not expr:
        return None
    head = expr[0]
    if not isinstance(head, Symbol):
        return None
    kw = head.name
    if kw in NAMED_DEFS:
        # (defcomp name ...) — the name slot must hold a Symbol.
        if len(expr) >= 2 and isinstance(expr[1], Symbol):
            return (kw, expr[1].name.lstrip("~"))
        return None
    if kw == "define":
        if len(expr) < 2:
            return None
        target = expr[1]
        if isinstance(target, Symbol):
            # (define name value)
            return ("define", target.name)
        if isinstance(target, list) and target and isinstance(target[0], Symbol):
            # (define (name args...) body) — function-style define.
            return ("define", target[0].name)
        return None
    return None
def derive_local_name(def_name, file_rel_path):
    """Derive the short filename for a definition within its file's directory.

    Strategy:
      1. If the name contains '/', split on the LAST '/' into namespace +
         local, then drop a redundant "<namespace>-" prefix from the local
         part.
      2. Otherwise, try stripping the file's full path (with separators
         turned into hyphens) as a prefix, then just the file stem.
      3. Fall back to the full name.

    Examples:
        examples/card                          + examples.sx        -> card
        layouts/doc                            + layouts.sx         -> doc
        reactive-islands-demo/example-counter  + reactive-islands/demo.sx
                                                                    -> example-counter
        geography-cek/geography-cek-cek-content + geography/cek.sx  -> cek-content
        docs-nav-items                          + nav-data.sx       -> docs-nav-items
    """
    if '/' in def_name:
        namespace, local = def_name.rsplit('/', 1)
        # e.g. geography-cek/geography-cek-cek-content -> cek-content
        redundant = namespace + '-'
        trimmed = local[len(redundant):] if local.startswith(redundant) else ''
        return trimmed or local

    # No '/' in the name: try successively shorter file-derived prefixes.
    stem = os.path.splitext(file_rel_path)[0]
    candidates = (
        stem.replace('/', '-').replace('\\', '-') + '-',  # full path as hyphens
        Path(file_rel_path).stem + '-',                   # bare file stem
    )
    for prefix in candidates:
        if def_name.startswith(prefix) and len(def_name) > len(prefix):
            return def_name[len(prefix):]
    return def_name
def extract_form_sources(source, exprs):
    """Extract the original source text of each top-level form.

    Walks *source* tracking paren depth (honoring string literals and
    backslash escapes inside them) to find form boundaries. Full-line
    ';' comments immediately preceding a parenthesized form are attached
    to it; comments before a bare atom are dropped, matching the
    original extraction behavior.

    Returns a list of strings, one per extracted form, aligned
    positionally with *exprs*. It may be shorter than *exprs* if the
    text runs out early; callers fall back to re-serialization then.

    Fixes vs. previous revision: the docstring wrongly claimed tuples of
    ``(source_text, is_comment_block)`` were returned; unused locals
    (``form_start``, the loop index) removed.
    """
    results = []
    pos = 0
    n = len(source)
    for _ in exprs:  # one extraction attempt per parsed expression
        # Collect comment lines immediately preceding the form.
        comment_lines = []
        while pos < n:
            # Skip inter-form whitespace.
            while pos < n and source[pos] in ' \t\r\n':
                pos += 1
            if pos >= n:
                break
            if source[pos] == ';':
                # Comment line: capture through (and including) the newline,
                # then strip trailing whitespace.
                line_start = pos
                while pos < n and source[pos] != '\n':
                    pos += 1
                if pos < n:
                    pos += 1  # consume the newline
                comment_lines.append(source[line_start:pos].rstrip())
                continue
            break  # found the start of an actual form
        if pos >= n:
            break
        if source[pos] == '(':
            # Parenthesized form: scan to the matching ')'.
            depth = 0
            in_string = False
            escape = False
            form_body_start = pos
            while pos < n:
                c = source[pos]
                if escape:
                    escape = False
                elif c == '\\' and in_string:
                    escape = True
                elif c == '"':
                    in_string = not in_string
                elif not in_string:
                    if c == '(':
                        depth += 1
                    elif c == ')':
                        depth -= 1
                        if depth == 0:
                            pos += 1
                            break
                pos += 1
            form_text = source[form_body_start:pos]
            # Re-attach any leading comments to the form's text.
            if comment_lines:
                form_text = '\n'.join(comment_lines) + '\n' + form_text
            results.append(form_text)
        else:
            # Bare atom (symbol, number, ...): read to the next whitespace.
            start = pos
            while pos < n and source[pos] not in ' \t\r\n':
                pos += 1
            results.append(source[start:pos])
    return results
def process_directory(base_dir, dry_run=True):
    """Scan *base_dir* for .sx files and build a split plan.

    Returns a 4-tuple ``(splits, single, no_defs, errors)``:
      splits  -- (source_file, target_file, content, kw, old_name) per new file
      single  -- (source_file, kw, old_name) for files already one-per-file
      no_defs -- relative paths of files containing no definitions
      errors  -- (relative_path, message) for read/parse failures

    All paths in the results are relative to *base_dir*.

    NOTE: *dry_run* is currently unused; this function only plans, and
    main() performs all writes/deletes. Kept for interface compatibility.

    Fixes vs. previous revision: dead ``non_def_idx`` counter removed;
    form classification list built with a comprehension.
    """
    splits = []
    single = []
    no_defs = []
    errors = []
    for root, dirs, files in os.walk(base_dir):
        # Prune cache directories so we never descend into them.
        dirs[:] = [d for d in dirs if d not in ('__pycache__', '.cache', '.pytest_cache')]
        for filename in sorted(files):
            if not filename.endswith('.sx') or filename in SKIP_FILES:
                continue
            filepath = os.path.join(root, filename)
            rel_path = os.path.relpath(filepath, base_dir)
            try:
                with open(filepath, encoding='utf-8') as f:
                    source = f.read()
            except Exception as e:
                errors.append((rel_path, str(e)))
                continue
            try:
                exprs = parse_all(source)
            except Exception as e:
                errors.append((rel_path, f"Parse: {e}"))
                continue
            # Classify every top-level form: definition vs. other.
            all_exprs = [(expr, get_def_info(expr)) for expr in exprs]
            defs = [(expr, info) for expr, info in all_exprs if info]
            non_defs = [expr for expr, info in all_exprs if not info]
            if not defs:
                no_defs.append(rel_path)
                continue
            if len(defs) == 1 and not non_defs:
                # Already one definition per file — keep as-is.
                _, (kw, name) = defs[0]
                single.append((rel_path, kw, name))
                continue
            # Multiple definitions — split into <dir>/<stem>/<local>.sx
            file_stem = Path(filename).stem
            file_dir = os.path.dirname(rel_path)
            target_dir = os.path.join(file_dir, file_stem)
            # Original source text per form (preserves comments/layout).
            form_sources = extract_form_sources(source, exprs)
            # Deduplicate: keep only the LAST definition of each name.
            seen_names = {}
            for idx, (expr, info) in enumerate(all_exprs):
                if info:
                    seen_names[info[1]] = idx
            last_idx_for_name = set(seen_names.values())
            for idx, (expr, info) in enumerate(all_exprs):
                if info is None:
                    continue  # non-definition forms are handled below
                if idx not in last_idx_for_name:
                    continue  # shadowed earlier duplicate — drop it
                kw, name = info
                local = derive_local_name(name, rel_path)
                safe_local = local.replace('/', '-')
                target_file = os.path.join(target_dir, safe_local + '.sx')
                # Prefer original source text; fall back to re-serialization
                # if the extractor came up short for this form.
                if idx < len(form_sources):
                    content = form_sources[idx]
                else:
                    content = serialize(expr, pretty=True)
                splits.append((rel_path, target_file, content, kw, name))
            # Non-definition forms are collected into a single _init.sx.
            if non_defs:
                init_parts = []
                for idx, (expr, info) in enumerate(all_exprs):
                    if info is None:
                        if idx < len(form_sources):
                            init_parts.append(form_sources[idx])
                        else:
                            init_parts.append(serialize(expr, pretty=True))
                if init_parts:
                    init_content = '\n\n'.join(init_parts)
                    init_file = os.path.join(target_dir, '_init.sx')
                    splits.append((rel_path, init_file, init_content, 'init', '_init'))
    return splits, single, no_defs, errors
def main() -> None:
    """CLI entry point: plan (and optionally execute) the one-per-file split.

    With --dry-run, prints the full split plan and exits. Without it,
    creates the new per-definition files, deletes the original grouped
    files, and writes scripts/name-mapping.json (old name -> new path name).
    """
    parser = argparse.ArgumentParser(description="Migrate SX to one-per-file")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--dir", default=None)
    args = parser.parse_args()
    # Run from the project root so the relative dirs below resolve.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.chdir(project_root)
    dirs = [args.dir] if args.dir else ["sx/sx", "sx/sxc"]
    all_splits = []
    all_single = []
    all_no_defs = []
    all_errors = []
    for d in dirs:
        if not os.path.isdir(d):
            print(f"Skip {d}")
            continue
        print(f"\n{'='*60}")
        print(f" {d}")
        print(f"{'='*60}")
        splits, single, no_defs, errors = process_directory(d, args.dry_run)
        # Prefix with base dir for full paths
        for s in splits:
            src, tgt, content, kw, name = s
            all_splits.append((d, os.path.join(d, src), os.path.join(d, tgt),
                               content, kw, name))
        for s in single:
            path, kw, name = s
            all_single.append((d, os.path.join(d, path), kw, name))
        all_no_defs.extend(os.path.join(d, p) for p in no_defs)
        all_errors.extend((os.path.join(d, p), e) for p, e in errors)
    # Check conflicts: two definitions mapping to the same target file.
    # Only the first claimant stays in target_map; execution aborts on any.
    target_map = {}
    conflicts = []
    for _, src, tgt, content, kw, name in all_splits:
        if tgt in target_map:
            conflicts.append((tgt, target_map[tgt], (kw, name, src)))
        else:
            target_map[tgt] = (kw, name, src)
    # Group splits by source file (used for reporting and for deletion).
    by_source = {}
    for _, src, tgt, content, kw, name in all_splits:
        by_source.setdefault(src, []).append((tgt, kw, name))
    # Report
    if all_errors:
        print(f"\n--- Errors ({len(all_errors)}) ---")
        for p, e in all_errors:
            print(f" {p}: {e}")
    if conflicts:
        print(f"\n--- {len(conflicts)} Conflicts ---")
        for tgt, existing, new in conflicts:
            print(f" {tgt}")
            print(f" existing: {existing[1]} from {existing[2]}")
            print(f" new: {new[1]} from {new[2]}")
    total_new = len(all_splits)
    print(f"\n{'='*60}")
    print(f" Summary")
    print(f"{'='*60}")
    print(f" Files to split: {len(by_source)}")
    print(f" New files: {total_new}")
    print(f" Single-def (keep): {len(all_single)}")
    print(f" No-defs (skip): {len(all_no_defs)}")
    print(f" Conflicts: {len(conflicts)}")
    if args.dry_run:
        print(f"\n--- Split plan ---")
        for src in sorted(by_source.keys()):
            targets = by_source[src]
            # NOTE(review): no separator between {src} and the count —
            # output reads "path5 files:"; possibly a lost arrow. Confirm
            # intent before changing the format string.
            print(f"\n {src}{len(targets)} files:")
            for tgt, kw, name in sorted(targets):
                print(f" {tgt} ({kw})")
        print(f"\n--- Single-def files ---")
        for _, path, kw, name in sorted(all_single)[:15]:
            print(f" {path} ({kw} {name})")
        if len(all_single) > 15:
            print(f" ... and {len(all_single) - 15} more")
        # Show a sample of the first new file's content (first 15 lines).
        if all_splits:
            _, src, tgt, content, kw, name = all_splits[0]
            print(f"\n--- Sample: {tgt} ---")
            lines = content.split('\n')
            for line in lines[:15]:
                print(f" {line}")
            if len(lines) > 15:
                print(f" ... ({len(lines)} lines total)")
        print(f"\nDry run. Run without --dry-run to execute.")
        return
    # Execute
    if conflicts:
        print("Aborting due to conflicts.")
        sys.exit(1)
    created = 0
    for _, src, tgt, content, kw, name in all_splits:
        os.makedirs(os.path.dirname(tgt), exist_ok=True)
        # Never clobber an existing file — skip and report instead.
        if os.path.exists(tgt):
            print(f" SKIP (exists): {tgt}")
            continue
        with open(tgt, 'w', encoding='utf-8') as f:
            # Normalize to exactly one trailing newline.
            f.write(content.rstrip() + '\n')
        created += 1
    # Delete source files (only those that were actually split).
    deleted = 0
    for src in by_source:
        if os.path.exists(src):
            os.remove(src)
            deleted += 1
    print(f"\n Created {created}, deleted {deleted} source files.")
    # Build name mapping: old_name -> new_path_name (for callers to rewrite
    # references). _init/preamble pseudo-entries are excluded.
    mapping = {}
    for _, src, tgt, _, kw, old_name in all_splits:
        if kw in ('init', 'preamble'):
            continue
        # Determine which base_dir this target is in to relativize the name.
        for d in dirs:
            if tgt.startswith(d + '/'):
                new_name = os.path.splitext(os.path.relpath(tgt, d))[0]
                if old_name != new_name:
                    mapping[old_name] = new_name
                break
    mapping_file = "scripts/name-mapping.json"
    with open(mapping_file, 'w') as f:
        json.dump(mapping, f, indent=2, sort_keys=True)
    print(f" Name mapping: {mapping_file} ({len(mapping)} entries)")
if __name__ == "__main__":
main()