Add durable pending runs and recipe list debugging

- Store pending runs in PostgreSQL for durability across restarts
- Add recovery method for orphaned runs
- Increase Celery result_expires to 7 days
- Add task_reject_on_worker_lost for automatic re-queuing
- Add logging to recipe list to debug filter issues

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gilesb
2026-01-11 20:35:00 +00:00
parent a6dd470623
commit 8ab0f05a7d
4 changed files with 412 additions and 31 deletions

View File

@@ -91,6 +91,25 @@ CREATE TABLE IF NOT EXISTS run_cache (
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);
-- Pending/running runs: tracks in-progress work for durability
-- Allows runs to survive restarts and be recovered
-- NOTE(review): rows are deleted on completion (see complete_pending_run),
-- so this table only ever holds in-flight or failed work.
CREATE TABLE IF NOT EXISTS pending_runs (
run_id VARCHAR(64) PRIMARY KEY, -- caller-supplied run identifier
celery_task_id VARCHAR(128), -- Celery task currently executing this run (nullable)
status VARCHAR(20) NOT NULL DEFAULT 'pending', -- pending, running, failed
recipe VARCHAR(255) NOT NULL, -- recipe name to (re-)execute
inputs JSONB NOT NULL, -- JSON array of input identifiers
dag_json TEXT, -- serialized DAG, if precomputed by the submitter
output_name VARCHAR(255), -- optional name for the run's output
actor_id VARCHAR(255), -- submitting user/actor, used for filtering
error TEXT, -- failure detail when status = 'failed'
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() -- bumped on every status change; drives staleness recovery
);
-- Indexes support the list/recovery queries that filter on status or actor.
CREATE INDEX IF NOT EXISTS idx_pending_runs_status ON pending_runs(status);
CREATE INDEX IF NOT EXISTS idx_pending_runs_actor ON pending_runs(actor_id);
-- User storage backends (synced from L2 or configured locally)
CREATE TABLE IF NOT EXISTS storage_backends (
id SERIAL PRIMARY KEY,
@@ -1357,3 +1376,170 @@ async def get_pins_for_content(content_hash: str) -> List[dict]:
content_hash
)
return [dict(row) for row in rows]
# ============ Pending Runs ============
async def create_pending_run(
    run_id: str,
    celery_task_id: str,
    recipe: str,
    inputs: List[str],
    actor_id: str,
    dag_json: Optional[str] = None,
    output_name: Optional[str] = None,
) -> dict:
    """Insert (or refresh) the durable record for an in-flight run.

    The row is written with status 'running'; re-submitting an existing
    run_id updates the Celery task id and bumps updated_at instead of
    failing on the primary key.

    Returns the stored row as a plain dict with ISO-8601 timestamp strings.
    """
    upsert_sql = """
        INSERT INTO pending_runs (run_id, celery_task_id, status, recipe, inputs, dag_json, output_name, actor_id)
        VALUES ($1, $2, 'running', $3, $4, $5, $6, $7)
        ON CONFLICT (run_id) DO UPDATE SET
            celery_task_id = EXCLUDED.celery_task_id,
            status = 'running',
            updated_at = NOW()
        RETURNING run_id, celery_task_id, status, recipe, inputs, dag_json, output_name, actor_id, created_at, updated_at
    """
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            upsert_sql,
            run_id, celery_task_id, recipe, _json.dumps(inputs), dag_json, output_name, actor_id,
        )
    # Copy scalar columns straight through, then serialize the timestamps.
    record = {
        key: row[key]
        for key in (
            "run_id", "celery_task_id", "status", "recipe",
            "inputs", "dag_json", "output_name", "actor_id",
        )
    }
    for ts_key in ("created_at", "updated_at"):
        ts = row[ts_key]
        record[ts_key] = ts.isoformat() if ts else None
    return record
async def get_pending_run(run_id: str) -> Optional[dict]:
    """Fetch one pending run by its id, or None when no such row exists.

    Timestamps are rendered as ISO-8601 strings; all other columns are
    returned as stored.
    """
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """
            SELECT run_id, celery_task_id, status, recipe, inputs, dag_json, output_name, actor_id, error, created_at, updated_at
            FROM pending_runs WHERE run_id = $1
            """,
            run_id
        )
    # Guard clause: no row means the run is unknown (or already completed).
    if row is None:
        return None
    record = {
        key: row[key]
        for key in (
            "run_id", "celery_task_id", "status", "recipe", "inputs",
            "dag_json", "output_name", "actor_id", "error",
        )
    }
    for ts_key in ("created_at", "updated_at"):
        ts = row[ts_key]
        record[ts_key] = ts.isoformat() if ts else None
    return record
async def list_pending_runs(actor_id: Optional[str] = None, status: Optional[str] = None) -> List[dict]:
    """List pending runs, newest first, optionally filtered by actor and/or status.

    A falsy filter value (None or empty string) means "no filter" for that
    column. Timestamps are rendered as ISO-8601 strings.
    """
    # Build the WHERE clause data-driven: each supplied filter becomes the
    # next $N placeholder, keeping the SQL identical to a hand-built clause.
    params: list = []
    conditions: list = []
    for column, value in (("actor_id", actor_id), ("status", status)):
        if value:
            params.append(value)
            conditions.append(f"{column} = ${len(params)}")
    where_clause = " AND ".join(conditions) if conditions else "TRUE"
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            f"""
            SELECT run_id, celery_task_id, status, recipe, inputs, output_name, actor_id, error, created_at, updated_at
            FROM pending_runs
            WHERE {where_clause}
            ORDER BY created_at DESC
            """,
            *params
        )
    results = []
    for row in rows:
        record = {
            key: row[key]
            for key in (
                "run_id", "celery_task_id", "status", "recipe",
                "inputs", "output_name", "actor_id", "error",
            )
        }
        for ts_key in ("created_at", "updated_at"):
            ts = row[ts_key]
            record[ts_key] = ts.isoformat() if ts else None
        results.append(record)
    return results
async def update_pending_run_status(run_id: str, status: str, error: Optional[str] = None) -> bool:
    """Update the status (and optionally the error text) of a pending run.

    Args:
        run_id: Primary key of the pending run to update.
        status: New status value ('pending', 'running', 'failed').
        error: Failure detail to record; when None the stored error is
            left untouched.

    Returns:
        True if a row was updated, False if no run with that id exists.
    """
    async with pool.acquire() as conn:
        # COALESCE keeps the existing error when no new one is supplied,
        # collapsing the previous two duplicated UPDATE branches into one.
        # This also fixes the old truthiness check (`if error:`), which
        # silently discarded an explicitly passed empty-string error.
        result = await conn.execute(
            """
            UPDATE pending_runs
            SET status = $2,
                error = COALESCE($3, error),
                updated_at = NOW()
            WHERE run_id = $1
            """,
            run_id, status, error
        )
    # run_id is the primary key, so the command tag is exactly
    # 'UPDATE 0' or 'UPDATE 1'; compare precisely instead of substring-matching.
    return result == "UPDATE 1"
async def complete_pending_run(run_id: str) -> bool:
    """Drop a run's durability record once it has finished.

    Completed results live in run_cache, so the pending_runs row is no
    longer needed. Returns True when a row was actually deleted.
    """
    async with pool.acquire() as conn:
        command_tag = await conn.execute(
            "DELETE FROM pending_runs WHERE run_id = $1",
            run_id,
        )
    return "DELETE 1" in command_tag
async def get_stale_pending_runs(older_than_hours: int = 24) -> List[dict]:
    """Get pending runs that haven't been updated recently (for recovery).

    Args:
        older_than_hours: Staleness cutoff; rows whose updated_at is older
            than NOW() minus this many hours are returned.

    Returns:
        Stale 'pending'/'running' rows, oldest first, with ISO-8601
        timestamp strings.

    Bug fix: the previous query embedded the cutoff via a ``'%s'`` inside
    the SQL string literal while passing ``older_than_hours`` as a query
    argument. asyncpg only supports ``$N`` placeholders, so the argument
    was never bound and the call raised a parameter-count error. The
    interval is now computed as ``$1 * INTERVAL '1 hour'``.
    """
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """
            SELECT run_id, celery_task_id, status, recipe, inputs, dag_json, output_name, actor_id, created_at, updated_at
            FROM pending_runs
            WHERE status IN ('pending', 'running')
            AND updated_at < NOW() - ($1 * INTERVAL '1 hour')
            ORDER BY created_at
            """,
            older_than_hours
        )
    results = []
    for row in rows:
        record = {
            key: row[key]
            for key in (
                "run_id", "celery_task_id", "status", "recipe",
                "inputs", "dag_json", "output_name", "actor_id",
            )
        }
        for ts_key in ("created_at", "updated_at"):
            ts = row[ts_key]
            record[ts_key] = ts.isoformat() if ts else None
        results.append(record)
    return results