Add analysis caching and segment looping for short videos

- Add _cache_analysis_tracks() to cache each analysis track individually
  with content-hash IDs, replacing inline data with cache-id refs
- Add _resolve_analysis_refs() to resolve cache-id refs back to full data
- Add extract_segment_with_loop() helper that detects when output is
  shorter than requested duration and re-runs with -stream_loop -1
- Update COMPOUND handler's FFmpeg and Python paths to use looping
- This fixes videos shorter than audio duration being truncated

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gilesb
2026-01-28 18:51:28 +00:00
parent 45f9171701
commit 17e3e23f06
2 changed files with 1210 additions and 158 deletions

1148
execute.py

File diff suppressed because it is too large. [Load Diff]

View File

@@ -18,6 +18,7 @@ The script:
3. Produce final output
"""
import os
import sys
import json
import tempfile
@@ -30,9 +31,37 @@ from typing import Dict, List, Optional, Any
sys.path.insert(0, str(Path(__file__).parent.parent / "artdag"))
from artdag.sexp import compile_string, parse
from artdag.sexp.parser import Symbol, Keyword
from artdag.sexp.parser import Symbol, Keyword, serialize
from artdag.sexp.planner import create_plan
# Import unified cache
import cache as unified_cache
import hashlib
def _cache_analysis_tracks(plan):
"""Cache each analysis track individually, replace data with cache-id refs."""
import json as _json
for name, data in plan.analysis.items():
json_str = _json.dumps(data, sort_keys=True)
content_cid = hashlib.sha256(json_str.encode()).hexdigest()
unified_cache.cache_store_json(content_cid, data)
plan.analysis[name] = {"_cache_id": content_cid}
def _resolve_analysis_refs(analysis_dict):
"""Resolve cache-id refs back to full analysis data."""
resolved = {}
for name, data in analysis_dict.items():
if isinstance(data, dict) and "_cache_id" in data:
loaded = unified_cache.cache_get_json(data["_cache_id"])
if loaded:
resolved[name] = loaded
else:
resolved[name] = data
return resolved
def run_staged_recipe(
recipe_path: Path,
@@ -40,6 +69,7 @@ def run_staged_recipe(
cache_dir: Optional[Path] = None,
params: Optional[Dict[str, Any]] = None,
verbose: bool = True,
force_replan: bool = False,
) -> Path:
"""
Run a staged recipe with stage-level caching.
@@ -57,21 +87,56 @@ def run_staged_recipe(
recipe_text = recipe_path.read_text()
recipe_dir = recipe_path.parent
# Set up cache directory
if cache_dir is None:
cache_dir = recipe_dir / ".stage_cache"
cache_dir.mkdir(parents=True, exist_ok=True)
# Use unified cache
content_cache_dir = unified_cache.get_content_dir()
def log(msg: str):
if verbose:
print(msg, file=sys.stderr)
# Store recipe source by CID
recipe_cid, _ = unified_cache.content_store_string(recipe_text)
log(f"Recipe CID: {recipe_cid[:16]}...")
# Compile recipe
log(f"Compiling: {recipe_path}")
compiled = compile_string(recipe_text, params)
compiled = compile_string(recipe_text, params, recipe_dir=recipe_dir)
log(f"Recipe: {compiled.name} v{compiled.version}")
log(f"Nodes: {len(compiled.nodes)}")
# Store effects by CID
for effect_name, effect_info in compiled.registry.get("effects", {}).items():
effect_path = effect_info.get("path")
effect_cid = effect_info.get("cid")
if effect_path and effect_cid:
effect_file = Path(effect_path)
if effect_file.exists():
stored_cid, _ = unified_cache.content_store_file(effect_file)
if stored_cid == effect_cid:
log(f"Effect '{effect_name}' CID: {effect_cid[:16]}...")
else:
log(f"Warning: Effect '{effect_name}' CID mismatch")
# Store analyzers by CID
for analyzer_name, analyzer_info in compiled.registry.get("analyzers", {}).items():
analyzer_path = analyzer_info.get("path")
analyzer_cid = analyzer_info.get("cid")
if analyzer_path:
analyzer_file = Path(analyzer_path) if Path(analyzer_path).is_absolute() else recipe_dir / analyzer_path
if analyzer_file.exists():
stored_cid, _ = unified_cache.content_store_file(analyzer_file)
log(f"Analyzer '{analyzer_name}' CID: {stored_cid[:16]}...")
# Store included files by CID
for include_path, include_cid in compiled.registry.get("includes", {}).items():
include_file = Path(include_path)
if include_file.exists():
stored_cid, _ = unified_cache.content_store_file(include_file)
if stored_cid == include_cid:
log(f"Include '{include_file.name}' CID: {include_cid[:16]}...")
else:
log(f"Warning: Include '{include_file.name}' CID mismatch")
# Check for stages
if not compiled.stages:
log("No stages found - running as regular recipe")
@@ -96,6 +161,53 @@ def run_staged_recipe(
times = results.get("times", [])
log(f" Analysis complete: {node_id[:16]}... ({len(times)} times)")
# Check for cached plan using unified cache
plan_cid = unified_cache.plan_exists(recipe_cid, params)
if plan_cid and not force_replan:
plan_cache_path = unified_cache.plan_get_path(recipe_cid, params)
log(f"\nFound cached plan: {plan_cid[:16]}...")
plan_sexp_str = unified_cache.plan_load(recipe_cid, params)
# Parse the cached plan
from execute import parse_plan_input
plan_dict = parse_plan_input(plan_sexp_str)
# Resolve cache-id refs in plan's embedded analysis
if "analysis" in plan_dict:
plan_dict["analysis"] = _resolve_analysis_refs(plan_dict["analysis"])
# Load analysis data from unified cache
analysis_data = {}
for step in plan_dict.get("steps", []):
if step.get("node_type") == "ANALYZE":
step_id = step.get("step_id")
cached_analysis = unified_cache.cache_get_json(step_id)
if cached_analysis:
analysis_data[step_id] = cached_analysis
log(f" Loaded analysis: {step_id[:16]}...")
log(f"Plan ID: {plan_dict.get('plan_id', 'unknown')[:16]}...")
log(f"Steps: {len(plan_dict.get('steps', []))}")
log(f"Analysis tracks: {list(analysis_data.keys())}")
# Execute directly from cached plan
log("\n--- Execution (from cached plan) ---")
from execute import execute_plan
result_path = execute_plan(
plan_path=plan_cache_path,
output_path=output_path,
recipe_dir=recipe_dir,
external_analysis=analysis_data,
cache_dir=content_cache_dir,
)
log(f"\n--- Complete ---")
log(f"Output: {result_path}")
return result_path
# No cached plan - create new one
plan = create_plan(
compiled,
inputs={},
@@ -105,18 +217,29 @@ def run_staged_recipe(
log(f"\nPlan ID: {plan.plan_id[:16]}...")
log(f"Steps: {len(plan.steps)}")
log(f"Analysis tracks: {list(analysis_data.keys())}")
# Cache analysis tracks individually and replace with cache-id refs
_cache_analysis_tracks(plan)
# Save plan to unified cache
plan_sexp_str = plan.to_string(pretty=True)
plan_cache_id, plan_cid, plan_cache_path = unified_cache.plan_store(recipe_cid, params, plan_sexp_str)
log(f"Saved plan: {plan_cache_id[:16]}... → {plan_cid[:16]}...")
# Execute the plan using execute.py logic
log("\n--- Execution ---")
from execute import execute_plan
# Resolve cache-id refs back to full data for execution
resolved_analysis = _resolve_analysis_refs(plan.analysis)
plan_dict = {
"plan_id": plan.plan_id,
"recipe_id": compiled.name,
"recipe_hash": plan.recipe_hash,
"source_hash": plan.source_hash,
"encoding": compiled.encoding,
"output_step_id": plan.output_step_id,
"analysis": analysis_data,
"analysis": {**resolved_analysis, **analysis_data},
"effects_registry": plan.effects_registry,
"minimal_primitives": plan.minimal_primitives,
"steps": [],
@@ -134,16 +257,16 @@ def run_staged_recipe(
# Tag with stage info if present
if step.stage:
step_dict["stage"] = step.stage
step_dict["stage_cache_id"] = step.stage_cache_id
plan_dict["steps"].append(step_dict)
# Execute
# Execute using unified cache
result_path = execute_plan(
plan_path=None,
output_path=output_path,
recipe_dir=recipe_dir,
plan_data=plan_dict,
external_analysis=analysis_data,
cache_dir=content_cache_dir,
)
log(f"\n--- Complete ---")
@@ -162,6 +285,11 @@ def _run_non_staged(compiled, recipe_dir: Path, output_path: Optional[Path], ver
raise NotImplementedError("Non-staged recipes should use plan.py | execute.py")
def list_cache(verbose: bool = False):
    """List all cached items using the unified cache.

    Thin wrapper delegating to the unified cache module's listing
    routine.

    Args:
        verbose: Forwarded positionally to the listing routine —
            presumably enables per-item detail; confirm against
            ``cache.print_cache_listing``.
    """
    unified_cache.print_cache_listing(verbose)
def list_params(recipe_path: Path):
"""List available parameters for a recipe and its effects."""
from artdag.sexp import parse
@@ -283,16 +411,41 @@ Examples:
python3 run_staged.py recipe.sexp -p color_mode=lime -p char_jitter=5
"""
)
parser.add_argument("recipe", type=Path, help="Recipe file (.sexp)")
parser.add_argument("recipe", type=Path, nargs="?", help="Recipe file (.sexp)")
parser.add_argument("-o", "--output", type=Path, help="Output file path")
parser.add_argument("-c", "--cache", type=Path, help="Stage cache directory")
parser.add_argument("-p", "--param", action="append", dest="params",
metavar="KEY=VALUE", help="Set recipe parameter")
parser.add_argument("-q", "--quiet", action="store_true", help="Suppress progress output")
parser.add_argument("--list-params", action="store_true", help="List available parameters and exit")
parser.add_argument("--list-cache", action="store_true", help="List cached items and exit")
parser.add_argument("--no-cache", action="store_true", help="Ignore cached plan, force re-planning")
parser.add_argument("--show-plan", action="store_true", help="Show the plan S-expression and exit (don't execute)")
parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
parser.add_argument("-j", "--jobs", type=int, default=None,
help="Max parallel workers (default: 4, or ARTDAG_WORKERS env)")
parser.add_argument("--pipelines", type=int, default=None,
help="Max concurrent video pipelines (default: 1, or ARTDAG_VIDEO_PIPELINES env)")
args = parser.parse_args()
# Apply concurrency limits before any execution
if args.jobs is not None:
os.environ["ARTDAG_WORKERS"] = str(args.jobs)
if args.pipelines is not None:
os.environ["ARTDAG_VIDEO_PIPELINES"] = str(args.pipelines)
from execute import set_max_video_pipelines
set_max_video_pipelines(args.pipelines)
# List cache mode - doesn't require recipe
if args.list_cache:
list_cache(verbose=args.verbose)
sys.exit(0)
# All other modes require a recipe
if not args.recipe:
print("Error: recipe file required", file=sys.stderr)
sys.exit(1)
if not args.recipe.exists():
print(f"Recipe not found: {args.recipe}", file=sys.stderr)
sys.exit(1)
@@ -320,12 +473,51 @@ Examples:
pass # Keep as string
params[key] = value
# Show plan mode - generate plan and display without executing
if args.show_plan:
recipe_text = args.recipe.read_text()
recipe_dir = args.recipe.parent
# Compute recipe CID (content hash)
recipe_cid, _ = unified_cache.content_store_string(recipe_text)
compiled = compile_string(recipe_text, params if params else None, recipe_dir=recipe_dir)
# Check for cached plan using unified cache (keyed by source CID + params)
plan_cid = unified_cache.plan_exists(recipe_cid, params if params else None)
if plan_cid and not args.no_cache:
print(f";; Cached plan CID: {plan_cid}", file=sys.stderr)
plan_sexp_str = unified_cache.plan_load(recipe_cid, params if params else None)
print(plan_sexp_str)
else:
print(f";; Generating new plan...", file=sys.stderr)
analysis_data = {}
def on_analysis(node_id: str, results: dict):
analysis_data[node_id] = results
plan = create_plan(
compiled,
inputs={},
recipe_dir=recipe_dir,
on_analysis=on_analysis,
)
# Cache analysis tracks individually before serialization
_cache_analysis_tracks(plan)
plan_sexp_str = plan.to_string(pretty=True)
# Save to unified cache
cache_id, plan_cid, plan_path = unified_cache.plan_store(recipe_cid, params if params else None, plan_sexp_str)
print(f";; Saved: {cache_id[:16]}... → {plan_cid}", file=sys.stderr)
print(plan_sexp_str)
sys.exit(0)
result = run_staged_recipe(
recipe_path=args.recipe,
output_path=args.output,
cache_dir=args.cache,
params=params if params else None,
verbose=not args.quiet,
force_replan=args.no_cache,
)
# Print final output path