Add 3-phase execution with IPFS cache and hash-based task claiming
New files:
- claiming.py - Redis Lua scripts for atomic task claiming
- tasks/analyze.py - Analysis Celery task
- tasks/execute.py - Step execution with IPFS-backed cache
- tasks/orchestrate.py - Plan orchestration (run_plan, run_recipe)
New API endpoints (/api/v2/):
- POST /api/v2/plan - Generate execution plan
- POST /api/v2/execute - Execute a plan
- POST /api/v2/run-recipe - Full 3-phase pipeline
- GET /api/v2/run/{run_id} - Get run status
Features:
- Hash-based task claiming prevents duplicate work
- Parallel execution within dependency levels
- IPFS-backed cache for durability
- Integration with artdag planning module
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
373 lines added — tasks/orchestrate.py (new file)
@@ -0,0 +1,373 @@
|
||||
"""
|
||||
Plan orchestration tasks.
|
||||
|
||||
Coordinates the full 3-phase execution:
|
||||
1. Analyze inputs
|
||||
2. Generate plan
|
||||
3. Execute steps level by level
|
||||
|
||||
Uses IPFS-backed cache for durability.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from celery import current_task, group, chain
|
||||
|
||||
# Import from the Celery app
|
||||
import sys
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from celery_app import app
|
||||
from claiming import get_claimer
|
||||
from cache_manager import get_cache_manager
|
||||
|
||||
# Import artdag modules
|
||||
try:
|
||||
from artdag import Cache
|
||||
from artdag.analysis import Analyzer, AnalysisResult
|
||||
from artdag.planning import RecipePlanner, ExecutionPlan, Recipe
|
||||
except ImportError:
|
||||
Cache = None
|
||||
Analyzer = None
|
||||
AnalysisResult = None
|
||||
RecipePlanner = None
|
||||
ExecutionPlan = None
|
||||
Recipe = None
|
||||
|
||||
from .execute import execute_step
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Cache directories
|
||||
CACHE_DIR = Path(os.environ.get('CACHE_DIR', '/data/cache'))
|
||||
ANALYSIS_CACHE_DIR = CACHE_DIR / 'analysis'
|
||||
PLAN_CACHE_DIR = CACHE_DIR / 'plans'
|
||||
|
||||
|
||||
@app.task(bind=True, name='tasks.run_plan')
def run_plan(
    self,
    plan_json: str,
    run_id: Optional[str] = None,
) -> dict:
    """
    Execute a complete execution plan.

    Runs steps level by level, with parallel execution (a Celery group)
    within each level. A level only starts once every step of the previous
    level has finished, because later steps consume earlier steps' cache
    ids. Results are stored in the IPFS-backed cache.

    Args:
        plan_json: JSON-serialized ExecutionPlan
        run_id: Optional run ID for tracking

    Returns:
        Dict with execution results: overall status, output cache id /
        path / IPFS CID, cached vs executed counts, and per-step results.
    """
    if ExecutionPlan is None:
        raise ImportError("artdag.planning not available")

    plan = ExecutionPlan.from_json(plan_json)
    cache_mgr = get_cache_manager()

    logger.info(f"Executing plan {plan.plan_id[:16]}... ({len(plan.steps)} steps)")

    # Build initial cache_ids mapping (step_id -> cache_id)
    cache_ids = {step.step_id: step.cache_id for step in plan.steps}

    # Also map input names to their content hashes so steps can resolve
    # raw inputs the same way they resolve upstream step outputs.
    for name, content_hash in plan.input_hashes.items():
        cache_ids[name] = content_hash

    # Group steps by dependency level; steps within a level are independent.
    steps_by_level = plan.get_steps_by_level()
    max_level = max(steps_by_level.keys()) if steps_by_level else 0

    results_by_step = {}
    total_cached = 0
    total_executed = 0

    for level in range(max_level + 1):
        level_steps = steps_by_level.get(level, [])
        if not level_steps:
            continue

        logger.info(f"Executing level {level}: {len(level_steps)} steps")

        # Skip steps whose output already exists in the cache.
        steps_to_run = []
        for step in level_steps:
            cached_path = cache_mgr.get_by_content_hash(step.cache_id)
            if cached_path:
                results_by_step[step.step_id] = {
                    "status": "cached",
                    "cache_id": step.cache_id,
                    "output_path": str(cached_path),
                }
                total_cached += 1
            else:
                steps_to_run.append(step)

        if not steps_to_run:
            logger.info(f"Level {level}: all steps cached")
            continue

        # Snapshot of cache ids visible to this level (inputs + all
        # completed earlier levels); passed by value to each subtask.
        level_cache_ids = dict(cache_ids)

        # Execute the remaining steps of this level in parallel.
        tasks = [
            execute_step.s(step.to_json(), plan.plan_id, level_cache_ids)
            for step in steps_to_run
        ]

        job = group(tasks)
        async_results = job.apply_async()

        # Wait for the whole level before starting the next one.
        # BUGFIX: Celery (>=4.2) raises "Never call result.get() within a
        # task!" unless disable_sync_subtasks=False is passed, so without
        # it every level with uncached steps fails immediately. We opt in
        # deliberately here — this orchestrator must block on its level.
        # NOTE(review): blocking on subtasks can deadlock a fixed-size
        # worker pool; route tasks.run_plan to a dedicated queue/worker,
        # or restructure with chords, to be safe under load.
        try:
            step_results = async_results.get(
                timeout=3600, disable_sync_subtasks=False
            )
        except Exception as e:
            logger.error(f"Level {level} execution failed: {e}")
            return {
                "status": "failed",
                "error": str(e),
                "level": level,
                "results": results_by_step,
                "run_id": run_id,
            }

        # Record results and publish new cache ids to later levels.
        for result in step_results:
            step_id = result.get("step_id")
            cache_id = result.get("cache_id")

            results_by_step[step_id] = result
            cache_ids[step_id] = cache_id

            # "completed_by_other" means another worker claimed and
            # finished the step (hash-based claiming); "cached" means the
            # executor found a cache hit after dispatch. Both count as
            # successfully executed from this run's perspective.
            if result.get("status") in ("completed", "cached", "completed_by_other"):
                total_executed += 1
            elif result.get("status") == "failed":
                logger.error(f"Step {step_id} failed: {result.get('error')}")
                return {
                    "status": "failed",
                    "error": f"Step {step_id} failed: {result.get('error')}",
                    "level": level,
                    "results": results_by_step,
                    "run_id": run_id,
                }

    # Resolve the plan's final output artifact from the cache.
    output_step = plan.get_step(plan.output_step)
    output_cache_id = output_step.cache_id if output_step else None
    output_path = None
    output_ipfs_cid = None

    if output_cache_id:
        output_path = cache_mgr.get_by_content_hash(output_cache_id)
        output_ipfs_cid = cache_mgr.get_ipfs_cid(output_cache_id)

    return {
        "status": "completed",
        "run_id": run_id,
        "plan_id": plan.plan_id,
        "output_cache_id": output_cache_id,
        "output_path": str(output_path) if output_path else None,
        "output_ipfs_cid": output_ipfs_cid,
        "total_steps": len(plan.steps),
        "cached": total_cached,
        "executed": total_executed,
        "results": results_by_step,
    }
|
||||
|
||||
|
||||
@app.task(bind=True, name='tasks.run_recipe')
def run_recipe(
    self,
    recipe_yaml: str,
    input_hashes: Dict[str, str],
    features: List[str] = None,
    run_id: Optional[str] = None,
) -> dict:
    """
    Run a complete recipe through all 3 phases.

    1. Analyze: Extract features from inputs
    2. Plan: Generate execution plan
    3. Execute: Run the plan

    Args:
        recipe_yaml: Recipe YAML content
        input_hashes: Mapping from input name to content hash
        features: Features to extract (default: ["beats", "energy"])
        run_id: Optional run ID for tracking

    Returns:
        Dict with final results
    """
    if RecipePlanner is None or Analyzer is None:
        raise ImportError("artdag modules not available")

    feature_list = features if features is not None else ["beats", "energy"]
    cache_mgr = get_cache_manager()

    logger.info(f"Running recipe with {len(input_hashes)} inputs")

    # --- Phase 1: feature extraction over every resolvable input ---
    logger.info("Phase 1: Analyzing inputs...")

    ANALYSIS_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    analyzer = Analyzer(cache_dir=ANALYSIS_CACHE_DIR)

    analysis_results = {}
    for input_name, content_hash in input_hashes.items():
        # Inputs must already be present in the local cache.
        local_path = cache_mgr.get_by_content_hash(content_hash)
        if not local_path:
            logger.warning(f"Input {input_name} ({content_hash[:16]}...) not in cache")
            continue
        try:
            analysis = analyzer.analyze(
                input_hash=content_hash,
                features=feature_list,
                input_path=Path(local_path),
            )
        except Exception as e:
            # Analysis is best-effort; planning proceeds without it.
            logger.warning(f"Analysis failed for {input_name}: {e}")
        else:
            analysis_results[content_hash] = analysis
            logger.info(f"Analyzed {input_name}: tempo={analysis.tempo}, beats={len(analysis.beat_times or [])}")

    logger.info(f"Analyzed {len(analysis_results)} inputs")

    # --- Phase 2: turn recipe + analysis into an execution plan ---
    logger.info("Phase 2: Generating execution plan...")

    recipe = Recipe.from_yaml(recipe_yaml)
    planner = RecipePlanner(use_tree_reduction=True)

    plan = planner.plan(
        recipe=recipe,
        input_hashes=input_hashes,
        analysis=analysis_results,
    )

    logger.info(f"Generated plan with {len(plan.steps)} steps")

    # Persist the plan so it can be inspected after the fact.
    PLAN_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    (PLAN_CACHE_DIR / f"{plan.plan_id}.json").write_text(plan.to_json())

    # --- Phase 3: execute the plan level by level ---
    logger.info("Phase 3: Executing plan...")

    outcome = run_plan(plan.to_json(), run_id=run_id)

    return {
        "status": outcome.get("status"),
        "run_id": run_id,
        "recipe": recipe.name,
        "plan_id": plan.plan_id,
        "output_path": outcome.get("output_path"),
        "output_cache_id": outcome.get("output_cache_id"),
        "output_ipfs_cid": outcome.get("output_ipfs_cid"),
        "analysis_count": len(analysis_results),
        "total_steps": len(plan.steps),
        "cached": outcome.get("cached", 0),
        "executed": outcome.get("executed", 0),
        "error": outcome.get("error"),
    }
|
||||
|
||||
|
||||
@app.task(bind=True, name='tasks.generate_plan')
def generate_plan(
    self,
    recipe_yaml: str,
    input_hashes: Dict[str, str],
    features: List[str] = None,
) -> dict:
    """
    Generate an execution plan without executing it.

    Useful for:
    - Previewing what will be executed
    - Checking cache status
    - Debugging recipe issues

    Args:
        recipe_yaml: Recipe YAML content
        input_hashes: Mapping from input name to content hash
        features: Features to extract for analysis

    Returns:
        Dict with plan details
    """
    if RecipePlanner is None or Analyzer is None:
        raise ImportError("artdag modules not available")

    feature_list = ["beats", "energy"] if features is None else features
    cache_mgr = get_cache_manager()

    # Analyze whichever inputs are present in the cache; the planner can
    # work with a partial analysis mapping.
    ANALYSIS_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    analyzer = Analyzer(cache_dir=ANALYSIS_CACHE_DIR)

    analysis_results = {}
    for input_name, content_hash in input_hashes.items():
        local_path = cache_mgr.get_by_content_hash(content_hash)
        if not local_path:
            continue
        try:
            analysis_results[content_hash] = analyzer.analyze(
                input_hash=content_hash,
                features=feature_list,
                input_path=Path(local_path),
            )
        except Exception as e:
            logger.warning(f"Analysis failed for {input_name}: {e}")

    # Build the plan from recipe + analysis.
    recipe = Recipe.from_yaml(recipe_yaml)
    planner = RecipePlanner(use_tree_reduction=True)

    plan = planner.plan(
        recipe=recipe,
        input_hashes=input_hashes,
        analysis=analysis_results,
    )

    # Report per-step cache status so callers can preview the work left.
    steps_status = [
        {
            "step_id": step.step_id,
            "node_type": step.node_type,
            "cache_id": step.cache_id,
            "level": step.level,
            "cached": cache_mgr.has_content(step.cache_id),
        }
        for step in plan.steps
    ]

    cached_count = sum(1 for entry in steps_status if entry["cached"])

    return {
        "status": "planned",
        "recipe": recipe.name,
        "plan_id": plan.plan_id,
        "total_steps": len(plan.steps),
        "cached_steps": cached_count,
        "pending_steps": len(plan.steps) - cached_count,
        "steps": steps_status,
        "plan_json": plan.to_json(),
    }
|
||||
Reference in New Issue
Block a user