Add 3-phase execution with IPFS cache and hash-based task claiming

New files: - claiming.py - Redis Lua scripts for atomic task claiming - tasks/analyze.py - Analysis Celery task - tasks/execute.py - Step execution with IPFS-backed cache - tasks/orchestrate.py - Plan orchestration (run_plan, run_recipe) New API endpoints (/api/v2/): - POST /api/v2/plan - Generate execution plan - POST /api/v2/execute - Execute a plan - POST /api/v2/run-recipe - Full 3-phase pipeline - GET /api/v2/run/{run_id} - Get run status Features: - Hash-based task claiming prevents duplicate work - Parallel execution within dependency levels - IPFS-backed cache for durability - Integration with artdag planning module Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 11:44:00 +00:00
parent 7d05011daa
commit f7890dd1ad
7 changed files with 1468 additions and 1 deletions
--- a/celery_app.py
+++ b/celery_app.py
@@ -14,7 +14,7 @@ app = Celery(
    'art_celery',
    broker=REDIS_URL,
    backend=REDIS_URL,
-    include=['tasks']
+    include=['tasks', 'tasks.analyze', 'tasks.execute', 'tasks.orchestrate']
 )
 app.conf.update(
--- a/claiming.py
+++ b/claiming.py
@@ -0,0 +1,421 @@
 """
 Hash-based task claiming for distributed execution.
 Prevents duplicate work when multiple workers process the same plan.
 Uses Redis Lua scripts for atomic claim operations.
 """
 import json
 import logging
 import os
 import time
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from enum import Enum
 from typing import Optional
 import redis
 logger = logging.getLogger(__name__)
 REDIS_URL = os.environ.get('REDIS_URL', 'redis://localhost:6379/5')
 # Key prefix for task claims
 CLAIM_PREFIX = "artdag:claim:"
 # Default TTL for claims (5 minutes)
 DEFAULT_CLAIM_TTL = 300
 # TTL for completed results (1 hour)
 COMPLETED_TTL = 3600
 class ClaimStatus(Enum):
    """Status of a task claim."""
    PENDING = "pending"
    CLAIMED = "claimed"
    RUNNING = "running"
    COMPLETED = "completed"
    CACHED = "cached"
    FAILED = "failed"
@dataclass
 class ClaimInfo:
    """Information about a task claim."""
    cache_id: str
    status: ClaimStatus
    worker_id: Optional[str] = None
    task_id: Optional[str] = None
    claimed_at: Optional[str] = None
    completed_at: Optional[str] = None
    output_path: Optional[str] = None
    error: Optional[str] = None
    def to_dict(self) -> dict:
        return {
            "cache_id": self.cache_id,
            "status": self.status.value,
            "worker_id": self.worker_id,
            "task_id": self.task_id,
            "claimed_at": self.claimed_at,
            "completed_at": self.completed_at,
            "output_path": self.output_path,
            "error": self.error,
        }
    @classmethod
    def from_dict(cls, data: dict) -> "ClaimInfo":
        return cls(
            cache_id=data["cache_id"],
            status=ClaimStatus(data["status"]),
            worker_id=data.get("worker_id"),
            task_id=data.get("task_id"),
            claimed_at=data.get("claimed_at"),
            completed_at=data.get("completed_at"),
            output_path=data.get("output_path"),
            error=data.get("error"),
        )
 # Lua script for atomic task claiming
 # Returns 1 if claim successful, 0 if already claimed/completed
 CLAIM_TASK_SCRIPT = """
 local key = KEYS[1]
 local data = redis.call('GET', key)
 if data then
    local status = cjson.decode(data)
    local s = status['status']
    -- Already claimed, running, completed, or cached - don't claim
    if s == 'claimed' or s == 'running' or s == 'completed' or s == 'cached' then
        return 0
    end
 end
 -- Claim the task
 local claim_data = ARGV[1]
 local ttl = tonumber(ARGV[2])
 redis.call('SETEX', key, ttl, claim_data)
 return 1
 """
 # Lua script for releasing a claim (e.g., on failure)
 RELEASE_CLAIM_SCRIPT = """
 local key = KEYS[1]
 local worker_id = ARGV[1]
 local data = redis.call('GET', key)
 if data then
    local status = cjson.decode(data)
    -- Only release if we own the claim
    if status['worker_id'] == worker_id then
        redis.call('DEL', key)
        return 1
    end
 end
 return 0
 """
 # Lua script for updating claim status (claimed -> running -> completed)
 UPDATE_STATUS_SCRIPT = """
 local key = KEYS[1]
 local worker_id = ARGV[1]
 local new_status = ARGV[2]
 local new_data = ARGV[3]
 local ttl = tonumber(ARGV[4])
 local data = redis.call('GET', key)
 if not data then
    return 0
 end
 local status = cjson.decode(data)
 -- Only update if we own the claim
 if status['worker_id'] ~= worker_id then
    return 0
 end
 redis.call('SETEX', key, ttl, new_data)
 return 1
 """
 class TaskClaimer:
    """
    Manages hash-based task claiming for distributed execution.
    Uses Redis for coordination between workers.
    Each task is identified by its cache_id (content-addressed).
    """
    def __init__(self, redis_url: str = None):
        """
        Initialize the claimer.
        Args:
            redis_url: Redis connection URL
        """
        self.redis_url = redis_url or REDIS_URL
        self._redis: Optional[redis.Redis] = None
        self._claim_script = None
        self._release_script = None
        self._update_script = None
    @property
    def redis(self) -> redis.Redis:
        """Get Redis connection (lazy initialization)."""
        if self._redis is None:
            self._redis = redis.from_url(self.redis_url, decode_responses=True)
            # Register Lua scripts
            self._claim_script = self._redis.register_script(CLAIM_TASK_SCRIPT)
            self._release_script = self._redis.register_script(RELEASE_CLAIM_SCRIPT)
            self._update_script = self._redis.register_script(UPDATE_STATUS_SCRIPT)
        return self._redis
    def _key(self, cache_id: str) -> str:
        """Get Redis key for a cache_id."""
        return f"{CLAIM_PREFIX}{cache_id}"
    def claim(
        self,
        cache_id: str,
        worker_id: str,
        task_id: Optional[str] = None,
        ttl: int = DEFAULT_CLAIM_TTL,
    ) -> bool:
        """
        Attempt to claim a task.
        Args:
            cache_id: The cache ID of the task to claim
            worker_id: Identifier for the claiming worker
            task_id: Optional Celery task ID
            ttl: Time-to-live for the claim in seconds
        Returns:
            True if claim successful, False if already claimed
        """
        claim_info = ClaimInfo(
            cache_id=cache_id,
            status=ClaimStatus.CLAIMED,
            worker_id=worker_id,
            task_id=task_id,
            claimed_at=datetime.now(timezone.utc).isoformat(),
        )
        result = self._claim_script(
            keys=[self._key(cache_id)],
            args=[json.dumps(claim_info.to_dict()), ttl],
            client=self.redis,
        )
        if result == 1:
            logger.debug(f"Claimed task {cache_id[:16]}... for worker {worker_id}")
            return True
        else:
            logger.debug(f"Task {cache_id[:16]}... already claimed")
            return False
    def update_status(
        self,
        cache_id: str,
        worker_id: str,
        status: ClaimStatus,
        output_path: Optional[str] = None,
        error: Optional[str] = None,
        ttl: Optional[int] = None,
    ) -> bool:
        """
        Update the status of a claimed task.
        Args:
            cache_id: The cache ID of the task
            worker_id: Worker ID that owns the claim
            status: New status
            output_path: Path to output (for completed)
            error: Error message (for failed)
            ttl: New TTL (defaults based on status)
        Returns:
            True if update successful
        """
        if ttl is None:
            if status in (ClaimStatus.COMPLETED, ClaimStatus.CACHED):
                ttl = COMPLETED_TTL
            else:
                ttl = DEFAULT_CLAIM_TTL
        # Get existing claim info
        existing = self.get_status(cache_id)
        if not existing:
            logger.warning(f"No claim found for {cache_id[:16]}...")
            return False
        claim_info = ClaimInfo(
            cache_id=cache_id,
            status=status,
            worker_id=worker_id,
            task_id=existing.task_id,
            claimed_at=existing.claimed_at,
            completed_at=datetime.now(timezone.utc).isoformat() if status in (
                ClaimStatus.COMPLETED, ClaimStatus.CACHED, ClaimStatus.FAILED
            ) else None,
            output_path=output_path,
            error=error,
        )
        result = self._update_script(
            keys=[self._key(cache_id)],
            args=[worker_id, status.value, json.dumps(claim_info.to_dict()), ttl],
            client=self.redis,
        )
        if result == 1:
            logger.debug(f"Updated task {cache_id[:16]}... to {status.value}")
            return True
        else:
            logger.warning(f"Failed to update task {cache_id[:16]}... (not owner?)")
            return False
    def release(self, cache_id: str, worker_id: str) -> bool:
        """
        Release a claim (e.g., on task failure before completion).
        Args:
            cache_id: The cache ID of the task
            worker_id: Worker ID that owns the claim
        Returns:
            True if release successful
        """
        result = self._release_script(
            keys=[self._key(cache_id)],
            args=[worker_id],
            client=self.redis,
        )
        if result == 1:
            logger.debug(f"Released claim on {cache_id[:16]}...")
            return True
        return False
    def get_status(self, cache_id: str) -> Optional[ClaimInfo]:
        """
        Get the current status of a task.
        Args:
            cache_id: The cache ID of the task
        Returns:
            ClaimInfo if task has been claimed, None otherwise
        """
        data = self.redis.get(self._key(cache_id))
        if data:
            return ClaimInfo.from_dict(json.loads(data))
        return None
    def is_completed(self, cache_id: str) -> bool:
        """Check if a task is completed or cached."""
        info = self.get_status(cache_id)
        return info is not None and info.status in (
            ClaimStatus.COMPLETED, ClaimStatus.CACHED
        )
    def wait_for_completion(
        self,
        cache_id: str,
        timeout: float = 300,
        poll_interval: float = 0.5,
    ) -> Optional[ClaimInfo]:
        """
        Wait for a task to complete.
        Args:
            cache_id: The cache ID of the task
            timeout: Maximum time to wait in seconds
            poll_interval: How often to check status
        Returns:
            ClaimInfo if completed, None if timeout
        """
        start_time = time.time()
        while time.time() - start_time < timeout:
            info = self.get_status(cache_id)
            if info and info.status in (
                ClaimStatus.COMPLETED, ClaimStatus.CACHED, ClaimStatus.FAILED
            ):
                return info
            time.sleep(poll_interval)
        logger.warning(f"Timeout waiting for {cache_id[:16]}...")
        return None
    def mark_cached(self, cache_id: str, output_path: str) -> None:
        """
        Mark a task as already cached (no processing needed).
        This is used when we discover the result already exists
        before attempting to claim.
        Args:
            cache_id: The cache ID of the task
            output_path: Path to the cached output
        """
        claim_info = ClaimInfo(
            cache_id=cache_id,
            status=ClaimStatus.CACHED,
            output_path=output_path,
            completed_at=datetime.now(timezone.utc).isoformat(),
        )
        self.redis.setex(
            self._key(cache_id),
            COMPLETED_TTL,
            json.dumps(claim_info.to_dict()),
        )
    def clear_all(self) -> int:
        """
        Clear all claims (for testing/reset).
        Returns:
            Number of claims cleared
        """
        pattern = f"{CLAIM_PREFIX}*"
        keys = list(self.redis.scan_iter(match=pattern))
        if keys:
            return self.redis.delete(*keys)
        return 0
 # Global claimer instance
 _claimer: Optional[TaskClaimer] = None
 def get_claimer() -> TaskClaimer:
    """Get the global TaskClaimer instance."""
    global _claimer
    if _claimer is None:
        _claimer = TaskClaimer()
    return _claimer
 def claim_task(cache_id: str, worker_id: str, task_id: str = None) -> bool:
    """Convenience function to claim a task."""
    return get_claimer().claim(cache_id, worker_id, task_id)
 def complete_task(cache_id: str, worker_id: str, output_path: str) -> bool:
    """Convenience function to mark a task as completed."""
    return get_claimer().update_status(
        cache_id, worker_id, ClaimStatus.COMPLETED, output_path=output_path
    )
 def fail_task(cache_id: str, worker_id: str, error: str) -> bool:
    """Convenience function to mark a task as failed."""
    return get_claimer().update_status(
        cache_id, worker_id, ClaimStatus.FAILED, error=error
    )
--- a/server.py
+++ b/server.py
@@ -4964,6 +4964,231 @@ async def download_client():
    )
 # ============================================================================
 # 3-Phase Execution API (Analyze → Plan → Execute)
 # ============================================================================
 class RecipeRunRequest(BaseModel):
    """Request to run a recipe with the 3-phase execution model."""
    recipe_yaml: str  # Recipe YAML content
    input_hashes: dict  # Mapping from input name to content hash
    features: Optional[list[str]] = None  # Features to extract (default: beats, energy)
 class PlanRequest(BaseModel):
    """Request to generate an execution plan."""
    recipe_yaml: str
    input_hashes: dict
    features: Optional[list[str]] = None
 class ExecutePlanRequest(BaseModel):
    """Request to execute a pre-generated plan."""
    plan_json: str  # JSON-serialized ExecutionPlan
@app.post("/api/v2/plan")
 async def generate_plan_endpoint(
    request: PlanRequest,
    ctx: UserContext = Depends(get_required_user_context)
 ):
    """
    Generate an execution plan without executing it.
    Phase 1 (Analyze) + Phase 2 (Plan) of the 3-phase model.
    Returns the plan with cache status for each step.
    """
    from tasks.orchestrate import generate_plan
    try:
        # Submit to Celery
        task = generate_plan.delay(
            recipe_yaml=request.recipe_yaml,
            input_hashes=request.input_hashes,
            features=request.features,
        )
        # Wait for result (plan generation is usually fast)
        result = task.get(timeout=60)
        return {
            "status": result.get("status"),
            "recipe": result.get("recipe"),
            "plan_id": result.get("plan_id"),
            "total_steps": result.get("total_steps"),
            "cached_steps": result.get("cached_steps"),
            "pending_steps": result.get("pending_steps"),
            "steps": result.get("steps"),
        }
    except Exception as e:
        logger.error(f"Plan generation failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/api/v2/execute")
 async def execute_plan_endpoint(
    request: ExecutePlanRequest,
    ctx: UserContext = Depends(get_required_user_context)
 ):
    """
    Execute a pre-generated execution plan.
    Phase 3 (Execute) of the 3-phase model.
    Submits the plan to Celery for parallel execution.
    """
    from tasks.orchestrate import run_plan
    run_id = str(uuid.uuid4())
    try:
        # Submit to Celery (async)
        task = run_plan.delay(
            plan_json=request.plan_json,
            run_id=run_id,
        )
        return {
            "status": "submitted",
            "run_id": run_id,
            "celery_task_id": task.id,
        }
    except Exception as e:
        logger.error(f"Plan execution failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/api/v2/run-recipe")
 async def run_recipe_endpoint(
    request: RecipeRunRequest,
    ctx: UserContext = Depends(get_required_user_context)
 ):
    """
    Run a complete recipe through all 3 phases.
    1. Analyze: Extract features from inputs
    2. Plan: Generate execution plan with cache IDs
    3. Execute: Run steps with parallel execution
    Returns immediately with run_id. Poll /api/v2/run/{run_id} for status.
    """
    from tasks.orchestrate import run_recipe
    # Compute run_id from inputs and recipe
    try:
        recipe_data = yaml.safe_load(request.recipe_yaml)
        recipe_name = recipe_data.get("name", "unknown")
    except Exception:
        recipe_name = "unknown"
    run_id = compute_run_id(
        list(request.input_hashes.values()),
        recipe_name,
        hashlib.sha3_256(request.recipe_yaml.encode()).hexdigest()
    )
    # Check if already completed
    cached = await database.get_run_cache(run_id)
    if cached:
        output_hash = cached.get("output_hash")
        if cache_manager.has_content(output_hash):
            return {
                "status": "completed",
                "run_id": run_id,
                "output_hash": output_hash,
                "output_ipfs_cid": cache_manager.get_ipfs_cid(output_hash),
                "cached": True,
            }
    # Submit to Celery
    try:
        task = run_recipe.delay(
            recipe_yaml=request.recipe_yaml,
            input_hashes=request.input_hashes,
            features=request.features,
            run_id=run_id,
        )
        # Store run status in Redis
        run_data = {
            "run_id": run_id,
            "status": "pending",
            "recipe": recipe_name,
            "inputs": list(request.input_hashes.values()),
            "celery_task_id": task.id,
            "created_at": datetime.now(timezone.utc).isoformat(),
            "username": ctx.actor_id,
        }
        redis_client.setex(
            f"{RUNS_KEY_PREFIX}{run_id}",
            86400,  # 24 hour expiry
            json.dumps(run_data)
        )
        return {
            "status": "submitted",
            "run_id": run_id,
            "celery_task_id": task.id,
            "recipe": recipe_name,
        }
    except Exception as e:
        logger.error(f"Recipe run failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/api/v2/run/{run_id}")
 async def get_run_v2(run_id: str, ctx: UserContext = Depends(get_required_user_context)):
    """
    Get status of a 3-phase execution run.
    """
    # Check Redis for run status
    run_data = redis_client.get(f"{RUNS_KEY_PREFIX}{run_id}")
    if run_data:
        data = json.loads(run_data)
        # If pending, check Celery task status
        if data.get("status") == "pending" and data.get("celery_task_id"):
            from celery.result import AsyncResult
            result = AsyncResult(data["celery_task_id"])
            if result.ready():
                if result.successful():
                    task_result = result.get()
                    data["status"] = task_result.get("status", "completed")
                    data["output_hash"] = task_result.get("output_cache_id")
                    data["output_ipfs_cid"] = task_result.get("output_ipfs_cid")
                    data["total_steps"] = task_result.get("total_steps")
                    data["cached"] = task_result.get("cached")
                    data["executed"] = task_result.get("executed")
                    # Update Redis
                    redis_client.setex(
                        f"{RUNS_KEY_PREFIX}{run_id}",
                        86400,
                        json.dumps(data)
                    )
                else:
                    data["status"] = "failed"
                    data["error"] = str(result.result)
            else:
                data["celery_status"] = result.status
        return data
    # Check database cache
    cached = await database.get_run_cache(run_id)
    if cached:
        return {
            "run_id": run_id,
            "status": "completed",
            "output_hash": cached.get("output_hash"),
            "cached": True,
        }
    raise HTTPException(status_code=404, detail="Run not found")
 if __name__ == "__main__":
    import uvicorn
    # Workers enabled - cache indexes shared via Redis
--- a/tasks/init.py
+++ b/tasks/init.py
@@ -0,0 +1,18 @@
 # art-celery/tasks - Celery tasks for 3-phase execution
 #
 # Tasks for the Art DAG distributed execution system:
 # 1. analyze_input - Extract features from input media
 # 2. execute_step - Execute a single step from the plan
 # 3. run_plan - Orchestrate execution of a full plan
 from .analyze import analyze_input, analyze_inputs
 from .execute import execute_step
 from .orchestrate import run_plan, run_recipe
 __all__ = [
    "analyze_input",
    "analyze_inputs",
    "execute_step",
    "run_plan",
    "run_recipe",
 ]
--- a/tasks/analyze.py
+++ b/tasks/analyze.py
@@ -0,0 +1,132 @@
 """
 Analysis tasks for extracting features from input media.
 Phase 1 of the 3-phase execution model.
 """
 import json
 import logging
 import os
 from pathlib import Path
 from typing import Dict, List, Optional
 from celery import current_task
 # Import from the Celery app
 import sys
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from celery_app import app
 # Import artdag analysis module
 try:
    from artdag.analysis import Analyzer, AnalysisResult
 except ImportError:
    # artdag not installed, will fail at runtime
    Analyzer = None
    AnalysisResult = None
 logger = logging.getLogger(__name__)
 # Cache directory for analysis results
 CACHE_DIR = Path(os.environ.get('CACHE_DIR', '/data/cache'))
 ANALYSIS_CACHE_DIR = CACHE_DIR / 'analysis'
@app.task(bind=True, name='tasks.analyze_input')
 def analyze_input(
    self,
    input_hash: str,
    input_path: str,
    features: List[str],
 ) -> dict:
    """
    Analyze a single input file.
    Args:
        input_hash: Content hash of the input
        input_path: Path to the input file
        features: List of features to extract
    Returns:
        Dict with analysis results
    """
    if Analyzer is None:
        raise ImportError("artdag.analysis not available")
    logger.info(f"Analyzing {input_hash[:16]}... for features: {features}")
    # Create analyzer with caching
    ANALYSIS_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    analyzer = Analyzer(cache_dir=ANALYSIS_CACHE_DIR)
    try:
        result = analyzer.analyze(
            input_hash=input_hash,
            features=features,
            input_path=Path(input_path),
        )
        return {
            "status": "completed",
            "input_hash": input_hash,
            "cache_id": result.cache_id,
            "features": features,
            "result": result.to_dict(),
        }
    except Exception as e:
        logger.error(f"Analysis failed for {input_hash}: {e}")
        return {
            "status": "failed",
            "input_hash": input_hash,
            "error": str(e),
        }
@app.task(bind=True, name='tasks.analyze_inputs')
 def analyze_inputs(
    self,
    inputs: Dict[str, str],
    features: List[str],
 ) -> dict:
    """
    Analyze multiple inputs in parallel.
    Args:
        inputs: Dict mapping input_hash to file path
        features: List of features to extract from all inputs
    Returns:
        Dict with all analysis results
    """
    if Analyzer is None:
        raise ImportError("artdag.analysis not available")
    logger.info(f"Analyzing {len(inputs)} inputs for features: {features}")
    ANALYSIS_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    analyzer = Analyzer(cache_dir=ANALYSIS_CACHE_DIR)
    results = {}
    errors = []
    for input_hash, input_path in inputs.items():
        try:
            result = analyzer.analyze(
                input_hash=input_hash,
                features=features,
                input_path=Path(input_path),
            )
            results[input_hash] = result.to_dict()
        except Exception as e:
            logger.error(f"Analysis failed for {input_hash}: {e}")
            errors.append({"input_hash": input_hash, "error": str(e)})
    return {
        "status": "completed" if not errors else "partial",
        "results": results,
        "errors": errors,
        "total": len(inputs),
        "successful": len(results),
    }
--- a/tasks/execute.py
+++ b/tasks/execute.py
@@ -0,0 +1,298 @@
 """
 Step execution task.
 Phase 3 of the 3-phase execution model.
 Executes individual steps from an execution plan with IPFS-backed caching.
 """
 import json
 import logging
 import os
 import socket
 from pathlib import Path
 from typing import Dict, List, Optional
 from celery import current_task
 # Import from the Celery app
 import sys
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from celery_app import app
 from claiming import (
    get_claimer,
    claim_task,
    complete_task,
    fail_task,
    ClaimStatus,
 )
 from cache_manager import get_cache_manager, L1CacheManager
 # Import artdag
 try:
    from artdag import Cache, NodeType
    from artdag.executor import get_executor
    from artdag.planning import ExecutionStep
 except ImportError:
    Cache = None
    NodeType = None
    get_executor = None
    ExecutionStep = None
 logger = logging.getLogger(__name__)
 def get_worker_id() -> str:
    """Get a unique identifier for this worker."""
    hostname = socket.gethostname()
    pid = os.getpid()
    return f"{hostname}:{pid}"
@app.task(bind=True, name='tasks.execute_step')
 def execute_step(
    self,
    step_json: str,
    plan_id: str,
    input_cache_ids: Dict[str, str],
 ) -> dict:
    """
    Execute a single step from an execution plan.
    Uses hash-based claiming to prevent duplicate work.
    Results are stored in IPFS-backed cache.
    Args:
        step_json: JSON-serialized ExecutionStep
        plan_id: ID of the parent execution plan
        input_cache_ids: Mapping from input step_id to their cache_id
    Returns:
        Dict with execution result
    """
    if ExecutionStep is None:
        raise ImportError("artdag.planning not available")
    step = ExecutionStep.from_json(step_json)
    worker_id = get_worker_id()
    task_id = self.request.id
    logger.info(f"Executing step {step.step_id} ({step.node_type}) cache_id={step.cache_id[:16]}...")
    # Get L1 cache manager (IPFS-backed)
    cache_mgr = get_cache_manager()
    # Check if already cached (by cache_id as content_hash)
    cached_path = cache_mgr.get_by_content_hash(step.cache_id)
    if cached_path:
        logger.info(f"Step {step.step_id} already cached at {cached_path}")
        # Mark as cached in claiming system
        claimer = get_claimer()
        claimer.mark_cached(step.cache_id, str(cached_path))
        return {
            "status": "cached",
            "step_id": step.step_id,
            "cache_id": step.cache_id,
            "output_path": str(cached_path),
        }
    # Try to claim the task
    if not claim_task(step.cache_id, worker_id, task_id):
        # Another worker is handling it
        logger.info(f"Step {step.step_id} claimed by another worker, waiting...")
        claimer = get_claimer()
        result = claimer.wait_for_completion(step.cache_id, timeout=600)
        if result and result.status == ClaimStatus.COMPLETED:
            return {
                "status": "completed_by_other",
                "step_id": step.step_id,
                "cache_id": step.cache_id,
                "output_path": result.output_path,
            }
        elif result and result.status == ClaimStatus.CACHED:
            return {
                "status": "cached",
                "step_id": step.step_id,
                "cache_id": step.cache_id,
                "output_path": result.output_path,
            }
        elif result and result.status == ClaimStatus.FAILED:
            return {
                "status": "failed",
                "step_id": step.step_id,
                "cache_id": step.cache_id,
                "error": result.error,
            }
        else:
            return {
                "status": "timeout",
                "step_id": step.step_id,
                "cache_id": step.cache_id,
                "error": "Timeout waiting for other worker",
            }
    # We have the claim, update to running
    claimer = get_claimer()
    claimer.update_status(step.cache_id, worker_id, ClaimStatus.RUNNING)
    try:
        # Handle SOURCE nodes
        if step.node_type == "SOURCE":
            content_hash = step.config.get("content_hash")
            if not content_hash:
                raise ValueError(f"SOURCE step missing content_hash")
            # Look up in cache
            path = cache_mgr.get_by_content_hash(content_hash)
            if not path:
                raise ValueError(f"SOURCE input not found in cache: {content_hash[:16]}...")
            output_path = str(path)
            complete_task(step.cache_id, worker_id, output_path)
            return {
                "status": "completed",
                "step_id": step.step_id,
                "cache_id": step.cache_id,
                "output_path": output_path,
            }
        # Handle _LIST virtual nodes
        if step.node_type == "_LIST":
            item_paths = []
            for item_id in step.config.get("items", []):
                item_cache_id = input_cache_ids.get(item_id)
                if item_cache_id:
                    path = cache_mgr.get_by_content_hash(item_cache_id)
                    if path:
                        item_paths.append(str(path))
            complete_task(step.cache_id, worker_id, json.dumps(item_paths))
            return {
                "status": "completed",
                "step_id": step.step_id,
                "cache_id": step.cache_id,
                "output_path": None,
                "item_paths": item_paths,
            }
        # Get executor for this node type
        try:
            node_type = NodeType[step.node_type]
        except KeyError:
            node_type = step.node_type
        executor = get_executor(node_type)
        if executor is None:
            raise ValueError(f"No executor for node type: {step.node_type}")
        # Resolve input paths from cache
        input_paths = []
        for input_step_id in step.input_steps:
            input_cache_id = input_cache_ids.get(input_step_id)
            if not input_cache_id:
                raise ValueError(f"No cache_id for input step: {input_step_id}")
            path = cache_mgr.get_by_content_hash(input_cache_id)
            if not path:
                raise ValueError(f"Input not in cache: {input_cache_id[:16]}...")
            input_paths.append(Path(path))
        # Create temp output path
        import tempfile
        output_dir = Path(tempfile.mkdtemp())
        output_path = output_dir / f"output_{step.cache_id[:16]}.mp4"
        # Execute
        logger.info(f"Running executor for {step.node_type} with {len(input_paths)} inputs")
        result_path = executor.execute(step.config, input_paths, output_path)
        # Store in IPFS-backed cache
        cached_file, ipfs_cid = cache_mgr.put(
            source_path=result_path,
            node_type=step.node_type,
            node_id=step.cache_id,
        )
        logger.info(f"Step {step.step_id} completed, IPFS CID: {ipfs_cid}")
        # Mark completed
        complete_task(step.cache_id, worker_id, str(cached_file.path))
        # Cleanup temp
        if output_dir.exists():
            import shutil
            shutil.rmtree(output_dir, ignore_errors=True)
        return {
            "status": "completed",
            "step_id": step.step_id,
            "cache_id": step.cache_id,
            "output_path": str(cached_file.path),
            "content_hash": cached_file.content_hash,
            "ipfs_cid": ipfs_cid,
        }
    except Exception as e:
        logger.error(f"Step {step.step_id} failed: {e}")
        fail_task(step.cache_id, worker_id, str(e))
        return {
            "status": "failed",
            "step_id": step.step_id,
            "cache_id": step.cache_id,
            "error": str(e),
        }
@app.task(bind=True, name='tasks.execute_level')
 def execute_level(
    self,
    steps_json: List[str],
    plan_id: str,
    cache_ids: Dict[str, str],
 ) -> dict:
    """
    Execute all steps at a given dependency level.
    Steps at the same level can run in parallel.
    Args:
        steps_json: List of JSON-serialized ExecutionSteps
        plan_id: ID of the parent execution plan
        cache_ids: Mapping from step_id to cache_id
    Returns:
        Dict with results for all steps
    """
    from celery import group
    # Dispatch all steps in parallel
    tasks = [
        execute_step.s(step_json, plan_id, cache_ids)
        for step_json in steps_json
    ]
    # Execute in parallel and collect results
    job = group(tasks)
    results = job.apply_async()
    # Wait for completion
    step_results = results.get(timeout=3600)  # 1 hour timeout
    # Build cache_ids from results
    new_cache_ids = dict(cache_ids)
    for result in step_results:
        step_id = result.get("step_id")
        cache_id = result.get("cache_id")
        if step_id and cache_id:
            new_cache_ids[step_id] = cache_id
    return {
        "status": "completed",
        "results": step_results,
        "cache_ids": new_cache_ids,
    }
--- a/tasks/orchestrate.py
+++ b/tasks/orchestrate.py
@@ -0,0 +1,373 @@
 """
 Plan orchestration tasks.
 Coordinates the full 3-phase execution:
 1. Analyze inputs
 2. Generate plan
 3. Execute steps level by level
 Uses IPFS-backed cache for durability.
 """
 import json
 import logging
 import os
 from pathlib import Path
 from typing import Dict, List, Optional
 from celery import current_task, group, chain
 # Import from the Celery app
 import sys
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from celery_app import app
 from claiming import get_claimer
 from cache_manager import get_cache_manager
 # Import artdag modules
 try:
    from artdag import Cache
    from artdag.analysis import Analyzer, AnalysisResult
    from artdag.planning import RecipePlanner, ExecutionPlan, Recipe
 except ImportError:
    Cache = None
    Analyzer = None
    AnalysisResult = None
    RecipePlanner = None
    ExecutionPlan = None
    Recipe = None
 from .execute import execute_step
 logger = logging.getLogger(__name__)
 # Cache directories
 CACHE_DIR = Path(os.environ.get('CACHE_DIR', '/data/cache'))
 ANALYSIS_CACHE_DIR = CACHE_DIR / 'analysis'
 PLAN_CACHE_DIR = CACHE_DIR / 'plans'
@app.task(bind=True, name='tasks.run_plan')
 def run_plan(
    self,
    plan_json: str,
    run_id: Optional[str] = None,
 ) -> dict:
    """
    Execute a complete execution plan.
    Runs steps level by level, with parallel execution within each level.
    Results are stored in IPFS-backed cache.
    Args:
        plan_json: JSON-serialized ExecutionPlan
        run_id: Optional run ID for tracking
    Returns:
        Dict with execution results
    """
    if ExecutionPlan is None:
        raise ImportError("artdag.planning not available")
    plan = ExecutionPlan.from_json(plan_json)
    cache_mgr = get_cache_manager()
    logger.info(f"Executing plan {plan.plan_id[:16]}... ({len(plan.steps)} steps)")
    # Build initial cache_ids mapping (step_id -> cache_id)
    cache_ids = {}
    for step in plan.steps:
        cache_ids[step.step_id] = step.cache_id
    # Also map input hashes
    for name, content_hash in plan.input_hashes.items():
        cache_ids[name] = content_hash
    # Group steps by level
    steps_by_level = plan.get_steps_by_level()
    max_level = max(steps_by_level.keys()) if steps_by_level else 0
    results_by_step = {}
    total_cached = 0
    total_executed = 0
    for level in range(max_level + 1):
        level_steps = steps_by_level.get(level, [])
        if not level_steps:
            continue
        logger.info(f"Executing level {level}: {len(level_steps)} steps")
        # Check which steps need execution
        steps_to_run = []
        for step in level_steps:
            # Check if cached
            cached_path = cache_mgr.get_by_content_hash(step.cache_id)
            if cached_path:
                results_by_step[step.step_id] = {
                    "status": "cached",
                    "cache_id": step.cache_id,
                    "output_path": str(cached_path),
                }
                total_cached += 1
            else:
                steps_to_run.append(step)
        if not steps_to_run:
            logger.info(f"Level {level}: all steps cached")
            continue
        # Build input cache_ids for this level
        level_cache_ids = dict(cache_ids)
        # Execute steps in parallel
        tasks = [
            execute_step.s(step.to_json(), plan.plan_id, level_cache_ids)
            for step in steps_to_run
        ]
        job = group(tasks)
        async_results = job.apply_async()
        # Wait for completion
        try:
            step_results = async_results.get(timeout=3600)
        except Exception as e:
            logger.error(f"Level {level} execution failed: {e}")
            return {
                "status": "failed",
                "error": str(e),
                "level": level,
                "results": results_by_step,
                "run_id": run_id,
            }
        # Process results
        for result in step_results:
            step_id = result.get("step_id")
            cache_id = result.get("cache_id")
            results_by_step[step_id] = result
            cache_ids[step_id] = cache_id
            if result.get("status") in ("completed", "cached", "completed_by_other"):
                total_executed += 1
            elif result.get("status") == "failed":
                logger.error(f"Step {step_id} failed: {result.get('error')}")
                return {
                    "status": "failed",
                    "error": f"Step {step_id} failed: {result.get('error')}",
                    "level": level,
                    "results": results_by_step,
                    "run_id": run_id,
                }
    # Get final output
    output_step = plan.get_step(plan.output_step)
    output_cache_id = output_step.cache_id if output_step else None
    output_path = None
    output_ipfs_cid = None
    if output_cache_id:
        output_path = cache_mgr.get_by_content_hash(output_cache_id)
        output_ipfs_cid = cache_mgr.get_ipfs_cid(output_cache_id)
    return {
        "status": "completed",
        "run_id": run_id,
        "plan_id": plan.plan_id,
        "output_cache_id": output_cache_id,
        "output_path": str(output_path) if output_path else None,
        "output_ipfs_cid": output_ipfs_cid,
        "total_steps": len(plan.steps),
        "cached": total_cached,
        "executed": total_executed,
        "results": results_by_step,
    }
@app.task(bind=True, name='tasks.run_recipe')
 def run_recipe(
    self,
    recipe_yaml: str,
    input_hashes: Dict[str, str],
    features: List[str] = None,
    run_id: Optional[str] = None,
 ) -> dict:
    """
    Run a complete recipe through all 3 phases.
    1. Analyze: Extract features from inputs
    2. Plan: Generate execution plan
    3. Execute: Run the plan
    Args:
        recipe_yaml: Recipe YAML content
        input_hashes: Mapping from input name to content hash
        features: Features to extract (default: ["beats", "energy"])
        run_id: Optional run ID for tracking
    Returns:
        Dict with final results
    """
    if RecipePlanner is None or Analyzer is None:
        raise ImportError("artdag modules not available")
    if features is None:
        features = ["beats", "energy"]
    cache_mgr = get_cache_manager()
    logger.info(f"Running recipe with {len(input_hashes)} inputs")
    # Phase 1: Analyze
    logger.info("Phase 1: Analyzing inputs...")
    ANALYSIS_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    analyzer = Analyzer(cache_dir=ANALYSIS_CACHE_DIR)
    analysis_results = {}
    for name, content_hash in input_hashes.items():
        # Get path from cache
        path = cache_mgr.get_by_content_hash(content_hash)
        if path:
            try:
                result = analyzer.analyze(
                    input_hash=content_hash,
                    features=features,
                    input_path=Path(path),
                )
                analysis_results[content_hash] = result
                logger.info(f"Analyzed {name}: tempo={result.tempo}, beats={len(result.beat_times or [])}")
            except Exception as e:
                logger.warning(f"Analysis failed for {name}: {e}")
        else:
            logger.warning(f"Input {name} ({content_hash[:16]}...) not in cache")
    logger.info(f"Analyzed {len(analysis_results)} inputs")
    # Phase 2: Plan
    logger.info("Phase 2: Generating execution plan...")
    recipe = Recipe.from_yaml(recipe_yaml)
    planner = RecipePlanner(use_tree_reduction=True)
    plan = planner.plan(
        recipe=recipe,
        input_hashes=input_hashes,
        analysis=analysis_results,
    )
    logger.info(f"Generated plan with {len(plan.steps)} steps")
    # Save plan for debugging
    PLAN_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    plan_path = PLAN_CACHE_DIR / f"{plan.plan_id}.json"
    with open(plan_path, "w") as f:
        f.write(plan.to_json())
    # Phase 3: Execute
    logger.info("Phase 3: Executing plan...")
    result = run_plan(plan.to_json(), run_id=run_id)
    return {
        "status": result.get("status"),
        "run_id": run_id,
        "recipe": recipe.name,
        "plan_id": plan.plan_id,
        "output_path": result.get("output_path"),
        "output_cache_id": result.get("output_cache_id"),
        "output_ipfs_cid": result.get("output_ipfs_cid"),
        "analysis_count": len(analysis_results),
        "total_steps": len(plan.steps),
        "cached": result.get("cached", 0),
        "executed": result.get("executed", 0),
        "error": result.get("error"),
    }
@app.task(bind=True, name='tasks.generate_plan')
 def generate_plan(
    self,
    recipe_yaml: str,
    input_hashes: Dict[str, str],
    features: List[str] = None,
 ) -> dict:
    """
    Generate an execution plan without executing it.
    Useful for:
    - Previewing what will be executed
    - Checking cache status
    - Debugging recipe issues
    Args:
        recipe_yaml: Recipe YAML content
        input_hashes: Mapping from input name to content hash
        features: Features to extract for analysis
    Returns:
        Dict with plan details
    """
    if RecipePlanner is None or Analyzer is None:
        raise ImportError("artdag modules not available")
    if features is None:
        features = ["beats", "energy"]
    cache_mgr = get_cache_manager()
    # Analyze inputs
    ANALYSIS_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    analyzer = Analyzer(cache_dir=ANALYSIS_CACHE_DIR)
    analysis_results = {}
    for name, content_hash in input_hashes.items():
        path = cache_mgr.get_by_content_hash(content_hash)
        if path:
            try:
                result = analyzer.analyze(
                    input_hash=content_hash,
                    features=features,
                    input_path=Path(path),
                )
                analysis_results[content_hash] = result
            except Exception as e:
                logger.warning(f"Analysis failed for {name}: {e}")
    # Generate plan
    recipe = Recipe.from_yaml(recipe_yaml)
    planner = RecipePlanner(use_tree_reduction=True)
    plan = planner.plan(
        recipe=recipe,
        input_hashes=input_hashes,
        analysis=analysis_results,
    )
    # Check cache status for each step
    steps_status = []
    for step in plan.steps:
        cached = cache_mgr.has_content(step.cache_id)
        steps_status.append({
            "step_id": step.step_id,
            "node_type": step.node_type,
            "cache_id": step.cache_id,
            "level": step.level,
            "cached": cached,
        })
    cached_count = sum(1 for s in steps_status if s["cached"])
    return {
        "status": "planned",
        "recipe": recipe.name,
        "plan_id": plan.plan_id,
        "total_steps": len(plan.steps),
        "cached_steps": cached_count,
        "pending_steps": len(plan.steps) - cached_count,
        "steps": steps_status,
        "plan_json": plan.to_json(),
    }