Add durable pending runs and recipe list debugging

- Store pending runs in PostgreSQL for durability across restarts
- Add recovery method for orphaned runs
- Increase Celery result_expires to 7 days
- Add task_reject_on_worker_lost for automatic re-queuing
- Add logging to recipe list to debug filter issues

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gilesb
2026-01-11 20:35:00 +00:00
parent a6dd470623
commit 8ab0f05a7d
4 changed files with 412 additions and 31 deletions

View File

@@ -46,23 +46,30 @@ class RecipeService:
async def list_recipes(self, actor_id: str = None, offset: int = 0, limit: int = 20) -> list:
"""
List available recipes.
List available recipes for a user.
L1 data is isolated per-user - only shows recipes owned by actor_id.
Note: This scans the cache for recipe files. For production,
you might want a database index of recipes by owner.
"""
import logging
logger = logging.getLogger(__name__)
# Get all cached items and filter for recipes
# This is a simplified implementation - production would use a proper index
recipes = []
# Check if cache has a list method for recipes
if hasattr(self.cache, 'list_by_type'):
items = self.cache.list_by_type('recipe')
logger.info(f"Found {len(items)} recipe items in cache")
for content_hash in items:
recipe = await self.get_recipe(content_hash)
if recipe:
# Filter by actor if specified
if actor_id is None or recipe.get("uploader") == actor_id:
uploader = recipe.get("uploader")
logger.info(f"Recipe {content_hash[:12]}: uploader={uploader}, actor_id={actor_id}")
# Filter by actor - L1 is per-user
if actor_id is None or uploader == actor_id:
recipes.append(recipe)
# Sort by name

View File

@@ -101,7 +101,67 @@ class RunService:
"completed_at": cached.get("created_at"),
}
# Check if there's a running task
# Check database for pending run
pending = await self.db.get_pending_run(run_id)
if pending:
task_id = pending.get("celery_task_id")
if task_id:
# Check actual Celery task state
from celery.result import AsyncResult
from celery_app import app as celery_app
result = AsyncResult(task_id, app=celery_app)
status = result.status.lower()
# Normalize status
status_map = {
"pending": "pending",
"started": "running",
"success": "completed",
"failure": "failed",
"retry": "running",
"revoked": "failed",
}
normalized_status = status_map.get(status, status)
run_data = {
"run_id": run_id,
"status": normalized_status,
"celery_task_id": task_id,
"actor_id": pending.get("actor_id"),
"recipe": pending.get("recipe"),
"inputs": pending.get("inputs"),
"output_name": pending.get("output_name"),
"created_at": pending.get("created_at"),
"error": pending.get("error"),
}
# If task completed, get result
if result.ready():
if result.successful():
run_data["status"] = "completed"
task_result = result.result
if isinstance(task_result, dict):
run_data["output_hash"] = task_result.get("output_hash")
else:
run_data["status"] = "failed"
run_data["error"] = str(result.result)
return run_data
# No task_id but have pending record - return from DB
return {
"run_id": run_id,
"status": pending.get("status", "pending"),
"recipe": pending.get("recipe"),
"inputs": pending.get("inputs"),
"output_name": pending.get("output_name"),
"actor_id": pending.get("actor_id"),
"created_at": pending.get("created_at"),
"error": pending.get("error"),
}
# Fallback: Check Redis for backwards compatibility
task_data = self.redis.get(f"{self.task_key_prefix}{run_id}")
if task_data:
if isinstance(task_data, bytes):
@@ -176,33 +236,28 @@ class RunService:
return None
async def list_runs(self, actor_id: str, offset: int = 0, limit: int = 20) -> list:
    """List runs for a user. Returns completed and pending runs from database.

    Completed runs come from the durable run cache; pending/running runs
    come from the pending_runs table, with their status refreshed via
    get_run() so stale rows are not reported as live.

    Args:
        actor_id: Owner whose runs are listed.
        offset: Pagination offset, applied after merging both sources.
        limit: Maximum number of runs to return.

    Returns:
        Run dicts sorted by ``created_at`` descending, paginated.
    """
    # Get completed runs from database. Over-fetch (limit + 50) so that
    # pagination applied after merging with pending runs still has enough
    # rows to fill a page.
    completed_runs = await self.db.list_runs_by_actor(actor_id, offset=0, limit=limit + 50)

    # Get pending runs from database (durable across restarts, unlike the
    # legacy Redis-only task keys).
    pending_db = await self.db.list_pending_runs(actor_id=actor_id)

    # Convert pending runs to run format with live status check.
    pending = []
    for pr in pending_db:
        run_id = pr.get("run_id")
        # Skip if already in completed results (task finished and was
        # moved into the run cache).
        if any(r.get("run_id") == run_id for r in completed_runs):
            continue
        # Get live status; only surface runs that are genuinely in flight.
        run = await self.get_run(run_id)
        if run and run.get("status") in ("pending", "running"):
            pending.append(run)

    # Combine and sort newest-first, then apply pagination.
    all_runs = pending + completed_runs
    all_runs.sort(key=lambda r: r.get("created_at", ""), reverse=True)
    return all_runs[offset:offset + limit]
@@ -318,7 +373,18 @@ class RunService:
return None, "Legacy mode only supports single-input recipes. Use use_dag=true for multi-input."
task = render_effect.delay(input_list[0], recipe, output_name)
# Store task mapping in Redis (ephemeral) - includes metadata for list display
# Store pending run in database for durability
await self.db.create_pending_run(
run_id=run_id,
celery_task_id=task.id,
recipe=recipe,
inputs=input_list,
actor_id=actor_id,
dag_json=dag_json,
output_name=output_name,
)
# Also store in Redis for backwards compatibility (shorter TTL)
task_data = json.dumps({
"task_id": task.id,
"actor_id": actor_id,
@@ -329,7 +395,7 @@ class RunService:
})
self.redis.setex(
f"{self.task_key_prefix}{run_id}",
3600 * 24, # 24 hour TTL
3600 * 4, # 4 hour TTL (database is primary now)
task_data
)
@@ -459,3 +525,123 @@ class RunService:
def detect_media_type(self, path: Path) -> str:
    """Detect media type for a file path.

    Thin wrapper that delegates to the module-level ``detect_media_type``
    helper (defined elsewhere in this file) so callers can go through the
    service API.

    Args:
        path: Filesystem path of the media file to inspect.

    Returns:
        The detected media type string, as produced by the helper.
    """
    return detect_media_type(path)
async def recover_pending_runs(self) -> Dict[str, int]:
    """
    Recover pending runs after restart.

    Checks all pending runs in the database and:
    - Updates status for completed tasks (moving results to run_cache)
    - Re-queues orphaned tasks that can be retried (have dag_json)
    - Marks as failed if unrecoverable

    Returns:
        Counts of recovered, completed, failed, and still_running runs,
        or ``{"error": ...}`` if the Celery task module is unavailable.
    """
    from celery.result import AsyncResult
    from celery_app import app as celery_app
    try:
        from legacy_tasks import execute_dag
    except ImportError:
        # Without the task module nothing can be re-queued; report and bail.
        return {"error": "Celery tasks not available"}
    stats = {"recovered": 0, "completed": 0, "failed": 0, "still_running": 0}
    # Get all pending/running runs from database. No actor filter: this is
    # a service-wide recovery sweep run at startup.
    pending_runs = await self.db.list_pending_runs()
    for run in pending_runs:
        run_id = run.get("run_id")
        task_id = run.get("celery_task_id")
        # NOTE(review): `status` is read but never used below — candidate
        # for removal, or for skipping rows already marked terminal.
        status = run.get("status")
        if not task_id:
            # No task ID - try to re-queue if we have dag_json
            dag_json = run.get("dag_json")
            if dag_json:
                try:
                    new_task = execute_dag.delay(dag_json, run_id)
                    # NOTE(review): re-creating a pending run for an
                    # existing run_id assumes create_pending_run upserts —
                    # confirm the DB layer tolerates the duplicate key.
                    await self.db.create_pending_run(
                        run_id=run_id,
                        celery_task_id=new_task.id,
                        recipe=run.get("recipe", "unknown"),
                        inputs=run.get("inputs", []),
                        actor_id=run.get("actor_id"),
                        dag_json=dag_json,
                        output_name=run.get("output_name"),
                    )
                    stats["recovered"] += 1
                except Exception as e:
                    # Re-queue itself failed; record the reason durably.
                    await self.db.update_pending_run_status(
                        run_id, "failed", f"Recovery failed: {e}"
                    )
                    stats["failed"] += 1
            else:
                # No task and no DAG snapshot: nothing to re-run from.
                await self.db.update_pending_run_status(
                    run_id, "failed", "No DAG data for recovery"
                )
                stats["failed"] += 1
            continue
        # Check Celery task state
        result = AsyncResult(task_id, app=celery_app)
        celery_status = result.status.lower()
        if result.ready():
            if result.successful():
                # Task completed - move to run_cache
                task_result = result.result
                if isinstance(task_result, dict) and task_result.get("output_hash"):
                    await self.db.save_run_cache(
                        run_id=run_id,
                        output_hash=task_result["output_hash"],
                        recipe=run.get("recipe", "unknown"),
                        inputs=run.get("inputs", []),
                        ipfs_cid=task_result.get("ipfs_cid"),
                        provenance_cid=task_result.get("provenance_cid"),
                        actor_id=run.get("actor_id"),
                    )
                    await self.db.complete_pending_run(run_id)
                    stats["completed"] += 1
                else:
                    # Succeeded per Celery, but the payload is unusable.
                    await self.db.update_pending_run_status(
                        run_id, "failed", "Task completed but no output hash"
                    )
                    stats["failed"] += 1
            else:
                # Task failed — result.result holds the exception.
                await self.db.update_pending_run_status(
                    run_id, "failed", str(result.result)
                )
                stats["failed"] += 1
        elif celery_status in ("pending", "started", "retry"):
            # Still running — leave it alone; the worker will finish it.
            # NOTE(review): Celery reports "pending" for unknown task ids
            # too, so a lost task may linger here indefinitely — confirm.
            stats["still_running"] += 1
        else:
            # Unknown state - try to re-queue if we have dag_json
            # (same recovery path as the missing-task_id branch above).
            dag_json = run.get("dag_json")
            if dag_json:
                try:
                    new_task = execute_dag.delay(dag_json, run_id)
                    await self.db.create_pending_run(
                        run_id=run_id,
                        celery_task_id=new_task.id,
                        recipe=run.get("recipe", "unknown"),
                        inputs=run.get("inputs", []),
                        actor_id=run.get("actor_id"),
                        dag_json=dag_json,
                        output_name=run.get("output_name"),
                    )
                    stats["recovered"] += 1
                except Exception as e:
                    await self.db.update_pending_run_status(
                        run_id, "failed", f"Recovery failed: {e}"
                    )
                    stats["failed"] += 1
            else:
                await self.db.update_pending_run_status(
                    run_id, "failed", f"Task in unknown state: {celery_status}"
                )
                stats["failed"] += 1
    return stats