Add durable pending runs and recipe list debugging
- Store pending runs in PostgreSQL for durability across restarts
- Add recovery method for orphaned runs
- Increase Celery result_expires to 7 days
- Add task_reject_on_worker_lost for automatic re-queuing
- Add logging to recipe list to debug filter issues

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -46,23 +46,30 @@ class RecipeService:
|
||||
|
||||
async def list_recipes(self, actor_id: str = None, offset: int = 0, limit: int = 20) -> list:
|
||||
"""
|
||||
List available recipes.
|
||||
List available recipes for a user.
|
||||
|
||||
L1 data is isolated per-user - only shows recipes owned by actor_id.
|
||||
|
||||
Note: This scans the cache for recipe files. For production,
|
||||
you might want a database index of recipes by owner.
|
||||
"""
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Get all cached items and filter for recipes
|
||||
# This is a simplified implementation - production would use a proper index
|
||||
recipes = []
|
||||
|
||||
# Check if cache has a list method for recipes
|
||||
if hasattr(self.cache, 'list_by_type'):
|
||||
items = self.cache.list_by_type('recipe')
|
||||
logger.info(f"Found {len(items)} recipe items in cache")
|
||||
for content_hash in items:
|
||||
recipe = await self.get_recipe(content_hash)
|
||||
if recipe:
|
||||
# Filter by actor if specified
|
||||
if actor_id is None or recipe.get("uploader") == actor_id:
|
||||
uploader = recipe.get("uploader")
|
||||
logger.info(f"Recipe {content_hash[:12]}: uploader={uploader}, actor_id={actor_id}")
|
||||
# Filter by actor - L1 is per-user
|
||||
if actor_id is None or uploader == actor_id:
|
||||
recipes.append(recipe)
|
||||
|
||||
# Sort by name
|
||||
|
||||
@@ -101,7 +101,67 @@ class RunService:
|
||||
"completed_at": cached.get("created_at"),
|
||||
}
|
||||
|
||||
# Check if there's a running task
|
||||
# Check database for pending run
|
||||
pending = await self.db.get_pending_run(run_id)
|
||||
if pending:
|
||||
task_id = pending.get("celery_task_id")
|
||||
if task_id:
|
||||
# Check actual Celery task state
|
||||
from celery.result import AsyncResult
|
||||
from celery_app import app as celery_app
|
||||
|
||||
result = AsyncResult(task_id, app=celery_app)
|
||||
status = result.status.lower()
|
||||
|
||||
# Normalize status
|
||||
status_map = {
|
||||
"pending": "pending",
|
||||
"started": "running",
|
||||
"success": "completed",
|
||||
"failure": "failed",
|
||||
"retry": "running",
|
||||
"revoked": "failed",
|
||||
}
|
||||
normalized_status = status_map.get(status, status)
|
||||
|
||||
run_data = {
|
||||
"run_id": run_id,
|
||||
"status": normalized_status,
|
||||
"celery_task_id": task_id,
|
||||
"actor_id": pending.get("actor_id"),
|
||||
"recipe": pending.get("recipe"),
|
||||
"inputs": pending.get("inputs"),
|
||||
"output_name": pending.get("output_name"),
|
||||
"created_at": pending.get("created_at"),
|
||||
"error": pending.get("error"),
|
||||
}
|
||||
|
||||
# If task completed, get result
|
||||
if result.ready():
|
||||
if result.successful():
|
||||
run_data["status"] = "completed"
|
||||
task_result = result.result
|
||||
if isinstance(task_result, dict):
|
||||
run_data["output_hash"] = task_result.get("output_hash")
|
||||
else:
|
||||
run_data["status"] = "failed"
|
||||
run_data["error"] = str(result.result)
|
||||
|
||||
return run_data
|
||||
|
||||
# No task_id but have pending record - return from DB
|
||||
return {
|
||||
"run_id": run_id,
|
||||
"status": pending.get("status", "pending"),
|
||||
"recipe": pending.get("recipe"),
|
||||
"inputs": pending.get("inputs"),
|
||||
"output_name": pending.get("output_name"),
|
||||
"actor_id": pending.get("actor_id"),
|
||||
"created_at": pending.get("created_at"),
|
||||
"error": pending.get("error"),
|
||||
}
|
||||
|
||||
# Fallback: Check Redis for backwards compatibility
|
||||
task_data = self.redis.get(f"{self.task_key_prefix}{run_id}")
|
||||
if task_data:
|
||||
if isinstance(task_data, bytes):
|
||||
@@ -176,33 +236,28 @@ class RunService:
|
||||
return None
|
||||
|
||||
async def list_runs(self, actor_id: str, offset: int = 0, limit: int = 20) -> list:
    """List runs for a user. Returns completed and pending runs from database.

    Completed runs come from the run-cache table; pending/running runs come
    from the pending-runs table and are refreshed with a live status check
    via get_run() so records whose tasks have since finished are dropped.

    Args:
        actor_id: Owner whose runs are listed.
        offset: Pagination offset into the combined, sorted list.
        limit: Maximum number of runs to return.

    Returns:
        Up to ``limit`` run dicts, newest first.
    """
    # Get completed runs from database.
    # Fetch enough rows to cover the requested page: the final slice is
    # [offset : offset + limit], so we need at least offset + limit
    # completed rows (plus headroom for runs displaced by pending ones).
    # Fetching only limit + 50 rows made pages beyond offset ~50 come back
    # empty even when more runs existed.
    completed_runs = await self.db.list_runs_by_actor(
        actor_id, offset=0, limit=offset + limit + 50
    )

    # Get pending runs from database
    pending_db = await self.db.list_pending_runs(actor_id=actor_id)

    # Convert pending runs to run format with live status check.
    # Build the dedup index once instead of scanning completed_runs per
    # pending run.
    completed_ids = {r.get("run_id") for r in completed_runs}
    pending = []
    for pr in pending_db:
        run_id = pr.get("run_id")
        # Skip if already in completed
        if run_id in completed_ids:
            continue

        # Get live status
        run = await self.get_run(run_id)
        if run and run.get("status") in ("pending", "running"):
            pending.append(run)

    # Combine and sort newest-first by creation time
    all_runs = pending + completed_runs
    all_runs.sort(key=lambda r: r.get("created_at", ""), reverse=True)

    return all_runs[offset:offset + limit]
|
||||
@@ -318,7 +373,18 @@ class RunService:
|
||||
return None, "Legacy mode only supports single-input recipes. Use use_dag=true for multi-input."
|
||||
task = render_effect.delay(input_list[0], recipe, output_name)
|
||||
|
||||
# Store task mapping in Redis (ephemeral) - includes metadata for list display
|
||||
# Store pending run in database for durability
|
||||
await self.db.create_pending_run(
|
||||
run_id=run_id,
|
||||
celery_task_id=task.id,
|
||||
recipe=recipe,
|
||||
inputs=input_list,
|
||||
actor_id=actor_id,
|
||||
dag_json=dag_json,
|
||||
output_name=output_name,
|
||||
)
|
||||
|
||||
# Also store in Redis for backwards compatibility (shorter TTL)
|
||||
task_data = json.dumps({
|
||||
"task_id": task.id,
|
||||
"actor_id": actor_id,
|
||||
@@ -329,7 +395,7 @@ class RunService:
|
||||
})
|
||||
self.redis.setex(
|
||||
f"{self.task_key_prefix}{run_id}",
|
||||
3600 * 24, # 24 hour TTL
|
||||
3600 * 4, # 4 hour TTL (database is primary now)
|
||||
task_data
|
||||
)
|
||||
|
||||
@@ -459,3 +525,123 @@ class RunService:
|
||||
def detect_media_type(self, path: Path) -> str:
    """Detect media type for a file path.

    Thin convenience wrapper that delegates to the detect_media_type
    function from the enclosing module's scope (imported/defined
    elsewhere in the file), so callers holding a service instance do
    not need a separate import.
    """
    return detect_media_type(path)
|
||||
|
||||
async def recover_pending_runs(self) -> Dict[str, int]:
    """
    Recover pending runs after restart.

    Checks all pending runs in the database and:
    - Updates status for completed tasks
    - Re-queues orphaned tasks that can be retried
    - Marks as failed if unrecoverable

    Returns counts of recovered, completed, failed, and still-running runs
    (or {"error": ...} when the Celery task modules are unavailable).
    """
    from celery.result import AsyncResult
    from celery_app import app as celery_app

    try:
        from legacy_tasks import execute_dag
    except ImportError:
        return {"error": "Celery tasks not available"}

    stats = {"recovered": 0, "completed": 0, "failed": 0, "still_running": 0}

    # Get all pending/running runs from database
    pending_runs = await self.db.list_pending_runs()

    for run in pending_runs:
        run_id = run.get("run_id")
        task_id = run.get("celery_task_id")

        if not task_id:
            # Orphaned before a Celery task was ever recorded:
            # re-queue from the stored DAG, or fail if none exists.
            await self._requeue_or_fail(
                run, run_id, execute_dag, stats,
                no_dag_reason="No DAG data for recovery",
            )
            continue

        # Check Celery task state
        result = AsyncResult(task_id, app=celery_app)
        celery_status = result.status.lower()

        if result.ready():
            # Task reached a terminal state - persist its outcome.
            await self._record_finished_task(run, run_id, result, stats)
        elif celery_status in ("pending", "started", "retry"):
            # Still running - leave the pending record alone.
            stats["still_running"] += 1
        else:
            # Unknown state - try to re-queue if we have dag_json.
            await self._requeue_or_fail(
                run, run_id, execute_dag, stats,
                no_dag_reason=f"Task in unknown state: {celery_status}",
            )

    return stats

async def _requeue_or_fail(self, run: dict, run_id, execute_dag, stats: Dict[str, int], no_dag_reason: str) -> None:
    """Re-queue a run from its stored DAG, or mark it failed if impossible.

    Mutates ``stats`` in place: increments "recovered" on a successful
    re-queue and "failed" otherwise (no DAG data, or the re-queue raised).
    """
    dag_json = run.get("dag_json")
    if not dag_json:
        await self.db.update_pending_run_status(
            run_id, "failed", no_dag_reason
        )
        stats["failed"] += 1
        return

    try:
        new_task = execute_dag.delay(dag_json, run_id)
        # Upsert the pending record with the fresh Celery task id so the
        # next status check tracks the re-queued task.
        await self.db.create_pending_run(
            run_id=run_id,
            celery_task_id=new_task.id,
            recipe=run.get("recipe", "unknown"),
            inputs=run.get("inputs", []),
            actor_id=run.get("actor_id"),
            dag_json=dag_json,
            output_name=run.get("output_name"),
        )
        stats["recovered"] += 1
    except Exception as e:
        await self.db.update_pending_run_status(
            run_id, "failed", f"Recovery failed: {e}"
        )
        stats["failed"] += 1

async def _record_finished_task(self, run: dict, run_id, result, stats: Dict[str, int]) -> None:
    """Persist the outcome of a Celery task that has reached a terminal state.

    On success with an output hash, moves the run to the run cache and
    completes the pending record; otherwise marks it failed. Mutates
    ``stats`` in place ("completed" or "failed").
    """
    if result.successful():
        # Task completed - move to run_cache
        task_result = result.result
        if isinstance(task_result, dict) and task_result.get("output_hash"):
            await self.db.save_run_cache(
                run_id=run_id,
                output_hash=task_result["output_hash"],
                recipe=run.get("recipe", "unknown"),
                inputs=run.get("inputs", []),
                ipfs_cid=task_result.get("ipfs_cid"),
                provenance_cid=task_result.get("provenance_cid"),
                actor_id=run.get("actor_id"),
            )
            await self.db.complete_pending_run(run_id)
            stats["completed"] += 1
        else:
            await self.db.update_pending_run_status(
                run_id, "failed", "Task completed but no output hash"
            )
            stats["failed"] += 1
    else:
        # Task failed
        await self.db.update_pending_run_status(
            run_id, "failed", str(result.result)
        )
        stats["failed"] += 1
|
||||
|
||||
Reference in New Issue
Block a user