Add durable pending runs and recipe list debugging
- Store pending runs in PostgreSQL for durability across restarts
- Add recovery method for orphaned runs
- Increase Celery result_expires to 7 days
- Add task_reject_on_worker_lost for automatic re-queuing
- Add logging to recipe list to debug filter issues

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -46,23 +46,30 @@ class RecipeService:
|
||||
|
||||
async def list_recipes(self, actor_id: str = None, offset: int = 0, limit: int = 20) -> list:
|
||||
"""
|
||||
List available recipes.
|
||||
List available recipes for a user.
|
||||
|
||||
L1 data is isolated per-user - only shows recipes owned by actor_id.
|
||||
|
||||
Note: This scans the cache for recipe files. For production,
|
||||
you might want a database index of recipes by owner.
|
||||
"""
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Get all cached items and filter for recipes
|
||||
# This is a simplified implementation - production would use a proper index
|
||||
recipes = []
|
||||
|
||||
# Check if cache has a list method for recipes
|
||||
if hasattr(self.cache, 'list_by_type'):
|
||||
items = self.cache.list_by_type('recipe')
|
||||
logger.info(f"Found {len(items)} recipe items in cache")
|
||||
for content_hash in items:
|
||||
recipe = await self.get_recipe(content_hash)
|
||||
if recipe:
|
||||
# Filter by actor if specified
|
||||
if actor_id is None or recipe.get("uploader") == actor_id:
|
||||
uploader = recipe.get("uploader")
|
||||
logger.info(f"Recipe {content_hash[:12]}: uploader={uploader}, actor_id={actor_id}")
|
||||
# Filter by actor - L1 is per-user
|
||||
if actor_id is None or uploader == actor_id:
|
||||
recipes.append(recipe)
|
||||
|
||||
# Sort by name
|
||||
|
||||
@@ -101,7 +101,67 @@ class RunService:
|
||||
"completed_at": cached.get("created_at"),
|
||||
}
|
||||
|
||||
# Check if there's a running task
|
||||
# Check database for pending run
|
||||
pending = await self.db.get_pending_run(run_id)
|
||||
if pending:
|
||||
task_id = pending.get("celery_task_id")
|
||||
if task_id:
|
||||
# Check actual Celery task state
|
||||
from celery.result import AsyncResult
|
||||
from celery_app import app as celery_app
|
||||
|
||||
result = AsyncResult(task_id, app=celery_app)
|
||||
status = result.status.lower()
|
||||
|
||||
# Normalize status
|
||||
status_map = {
|
||||
"pending": "pending",
|
||||
"started": "running",
|
||||
"success": "completed",
|
||||
"failure": "failed",
|
||||
"retry": "running",
|
||||
"revoked": "failed",
|
||||
}
|
||||
normalized_status = status_map.get(status, status)
|
||||
|
||||
run_data = {
|
||||
"run_id": run_id,
|
||||
"status": normalized_status,
|
||||
"celery_task_id": task_id,
|
||||
"actor_id": pending.get("actor_id"),
|
||||
"recipe": pending.get("recipe"),
|
||||
"inputs": pending.get("inputs"),
|
||||
"output_name": pending.get("output_name"),
|
||||
"created_at": pending.get("created_at"),
|
||||
"error": pending.get("error"),
|
||||
}
|
||||
|
||||
# If task completed, get result
|
||||
if result.ready():
|
||||
if result.successful():
|
||||
run_data["status"] = "completed"
|
||||
task_result = result.result
|
||||
if isinstance(task_result, dict):
|
||||
run_data["output_hash"] = task_result.get("output_hash")
|
||||
else:
|
||||
run_data["status"] = "failed"
|
||||
run_data["error"] = str(result.result)
|
||||
|
||||
return run_data
|
||||
|
||||
# No task_id but have pending record - return from DB
|
||||
return {
|
||||
"run_id": run_id,
|
||||
"status": pending.get("status", "pending"),
|
||||
"recipe": pending.get("recipe"),
|
||||
"inputs": pending.get("inputs"),
|
||||
"output_name": pending.get("output_name"),
|
||||
"actor_id": pending.get("actor_id"),
|
||||
"created_at": pending.get("created_at"),
|
||||
"error": pending.get("error"),
|
||||
}
|
||||
|
||||
# Fallback: Check Redis for backwards compatibility
|
||||
task_data = self.redis.get(f"{self.task_key_prefix}{run_id}")
|
||||
if task_data:
|
||||
if isinstance(task_data, bytes):
|
||||
@@ -176,33 +236,28 @@ class RunService:
|
||||
return None
|
||||
|
||||
async def list_runs(self, actor_id: str, offset: int = 0, limit: int = 20) -> list:
    """List runs for a user. Returns completed and pending runs from database.

    Completed runs come from the run-cache table; pending/running runs come
    from the pending-runs table and are refreshed with a live status check
    via get_run() so records whose tasks have since finished are dropped.

    Args:
        actor_id: Owner whose runs are listed.
        offset: Pagination offset into the combined, sorted list.
        limit: Maximum number of runs to return.

    Returns:
        Up to ``limit`` run dicts, newest first.
    """
    # Get completed runs from database.
    # Fetch enough rows to cover the requested page: the final slice is
    # [offset : offset + limit], so we need at least offset + limit
    # completed rows (plus headroom for runs displaced by pending ones).
    # Fetching only limit + 50 rows made pages beyond offset ~50 come back
    # empty even when more runs existed.
    completed_runs = await self.db.list_runs_by_actor(
        actor_id, offset=0, limit=offset + limit + 50
    )

    # Get pending runs from database
    pending_db = await self.db.list_pending_runs(actor_id=actor_id)

    # Convert pending runs to run format with live status check.
    # Build the dedup index once instead of scanning completed_runs per
    # pending run.
    completed_ids = {r.get("run_id") for r in completed_runs}
    pending = []
    for pr in pending_db:
        run_id = pr.get("run_id")
        # Skip if already in completed
        if run_id in completed_ids:
            continue

        # Get live status
        run = await self.get_run(run_id)
        if run and run.get("status") in ("pending", "running"):
            pending.append(run)

    # Combine and sort newest-first by creation time
    all_runs = pending + completed_runs
    all_runs.sort(key=lambda r: r.get("created_at", ""), reverse=True)

    return all_runs[offset:offset + limit]
|
||||
@@ -318,7 +373,18 @@ class RunService:
|
||||
return None, "Legacy mode only supports single-input recipes. Use use_dag=true for multi-input."
|
||||
task = render_effect.delay(input_list[0], recipe, output_name)
|
||||
|
||||
# Store task mapping in Redis (ephemeral) - includes metadata for list display
|
||||
# Store pending run in database for durability
|
||||
await self.db.create_pending_run(
|
||||
run_id=run_id,
|
||||
celery_task_id=task.id,
|
||||
recipe=recipe,
|
||||
inputs=input_list,
|
||||
actor_id=actor_id,
|
||||
dag_json=dag_json,
|
||||
output_name=output_name,
|
||||
)
|
||||
|
||||
# Also store in Redis for backwards compatibility (shorter TTL)
|
||||
task_data = json.dumps({
|
||||
"task_id": task.id,
|
||||
"actor_id": actor_id,
|
||||
@@ -329,7 +395,7 @@ class RunService:
|
||||
})
|
||||
self.redis.setex(
|
||||
f"{self.task_key_prefix}{run_id}",
|
||||
3600 * 24, # 24 hour TTL
|
||||
3600 * 4, # 4 hour TTL (database is primary now)
|
||||
task_data
|
||||
)
|
||||
|
||||
@@ -459,3 +525,123 @@ class RunService:
|
||||
def detect_media_type(self, path: Path) -> str:
    """Detect media type for a file path.

    Thin convenience wrapper that delegates to the detect_media_type
    function from the enclosing module's scope (imported/defined
    elsewhere in the file), so callers holding a service instance do
    not need a separate import.
    """
    return detect_media_type(path)
|
||||
|
||||
async def recover_pending_runs(self) -> Dict[str, int]:
    """
    Recover pending runs after restart.

    Checks all pending runs in the database and:
    - Updates status for completed tasks
    - Re-queues orphaned tasks that can be retried
    - Marks as failed if unrecoverable

    Returns counts of recovered, completed, failed, and still-running runs
    (or {"error": ...} when the Celery task modules are unavailable).
    """
    from celery.result import AsyncResult
    from celery_app import app as celery_app

    try:
        from legacy_tasks import execute_dag
    except ImportError:
        return {"error": "Celery tasks not available"}

    stats = {"recovered": 0, "completed": 0, "failed": 0, "still_running": 0}

    # Get all pending/running runs from database
    pending_runs = await self.db.list_pending_runs()

    for run in pending_runs:
        run_id = run.get("run_id")
        task_id = run.get("celery_task_id")

        if not task_id:
            # Orphaned before a Celery task was ever recorded:
            # re-queue from the stored DAG, or fail if none exists.
            await self._requeue_or_fail(
                run, run_id, execute_dag, stats,
                no_dag_reason="No DAG data for recovery",
            )
            continue

        # Check Celery task state
        result = AsyncResult(task_id, app=celery_app)
        celery_status = result.status.lower()

        if result.ready():
            # Task reached a terminal state - persist its outcome.
            await self._record_finished_task(run, run_id, result, stats)
        elif celery_status in ("pending", "started", "retry"):
            # Still running - leave the pending record alone.
            stats["still_running"] += 1
        else:
            # Unknown state - try to re-queue if we have dag_json.
            await self._requeue_or_fail(
                run, run_id, execute_dag, stats,
                no_dag_reason=f"Task in unknown state: {celery_status}",
            )

    return stats

async def _requeue_or_fail(self, run: dict, run_id, execute_dag, stats: Dict[str, int], no_dag_reason: str) -> None:
    """Re-queue a run from its stored DAG, or mark it failed if impossible.

    Mutates ``stats`` in place: increments "recovered" on a successful
    re-queue and "failed" otherwise (no DAG data, or the re-queue raised).
    """
    dag_json = run.get("dag_json")
    if not dag_json:
        await self.db.update_pending_run_status(
            run_id, "failed", no_dag_reason
        )
        stats["failed"] += 1
        return

    try:
        new_task = execute_dag.delay(dag_json, run_id)
        # Upsert the pending record with the fresh Celery task id so the
        # next status check tracks the re-queued task.
        await self.db.create_pending_run(
            run_id=run_id,
            celery_task_id=new_task.id,
            recipe=run.get("recipe", "unknown"),
            inputs=run.get("inputs", []),
            actor_id=run.get("actor_id"),
            dag_json=dag_json,
            output_name=run.get("output_name"),
        )
        stats["recovered"] += 1
    except Exception as e:
        await self.db.update_pending_run_status(
            run_id, "failed", f"Recovery failed: {e}"
        )
        stats["failed"] += 1

async def _record_finished_task(self, run: dict, run_id, result, stats: Dict[str, int]) -> None:
    """Persist the outcome of a Celery task that has reached a terminal state.

    On success with an output hash, moves the run to the run cache and
    completes the pending record; otherwise marks it failed. Mutates
    ``stats`` in place ("completed" or "failed").
    """
    if result.successful():
        # Task completed - move to run_cache
        task_result = result.result
        if isinstance(task_result, dict) and task_result.get("output_hash"):
            await self.db.save_run_cache(
                run_id=run_id,
                output_hash=task_result["output_hash"],
                recipe=run.get("recipe", "unknown"),
                inputs=run.get("inputs", []),
                ipfs_cid=task_result.get("ipfs_cid"),
                provenance_cid=task_result.get("provenance_cid"),
                actor_id=run.get("actor_id"),
            )
            await self.db.complete_pending_run(run_id)
            stats["completed"] += 1
        else:
            await self.db.update_pending_run_status(
                run_id, "failed", "Task completed but no output hash"
            )
            stats["failed"] += 1
    else:
        # Task failed
        await self.db.update_pending_run_status(
            run_id, "failed", str(result.result)
        )
        stats["failed"] += 1
|
||||
|
||||
Reference in New Issue
Block a user