Add durable pending runs and recipe list debugging
- Store pending runs in PostgreSQL for durability across restarts
- Add recovery method for orphaned runs
- Increase Celery result_expires to 7 days
- Add task_reject_on_worker_lost for automatic re-queuing
- Add logging to recipe list to debug filter issues

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
186
database.py
186
database.py
@@ -91,6 +91,25 @@ CREATE TABLE IF NOT EXISTS run_cache (
|
||||
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
|
||||
);
|
||||
|
||||
-- Pending/running runs: tracks in-progress work for durability
-- Allows runs to survive restarts and be recovered
CREATE TABLE IF NOT EXISTS pending_runs (
    run_id VARCHAR(64) PRIMARY KEY,
    celery_task_id VARCHAR(128),                   -- Celery task backing this run; nullable
    status VARCHAR(20) NOT NULL DEFAULT 'pending', -- pending, running, failed
    recipe VARCHAR(255) NOT NULL,
    inputs JSONB NOT NULL,                         -- JSON-encoded list of input identifiers
    dag_json TEXT,                                 -- optional serialized DAG for the run
    output_name VARCHAR(255),
    actor_id VARCHAR(255),                         -- presumably the initiating user/service — confirm with callers
    error TEXT,                                    -- set alongside failure status updates
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);

-- Indexes matching the filters used by list_pending_runs()
CREATE INDEX IF NOT EXISTS idx_pending_runs_status ON pending_runs(status);
CREATE INDEX IF NOT EXISTS idx_pending_runs_actor ON pending_runs(actor_id);
|
||||
|
||||
-- User storage backends (synced from L2 or configured locally)
|
||||
CREATE TABLE IF NOT EXISTS storage_backends (
|
||||
id SERIAL PRIMARY KEY,
|
||||
@@ -1357,3 +1376,170 @@ async def get_pins_for_content(content_hash: str) -> List[dict]:
|
||||
content_hash
|
||||
)
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
|
||||
# ============ Pending Runs ============
|
||||
|
||||
async def create_pending_run(
    run_id: str,
    celery_task_id: str,
    recipe: str,
    inputs: List[str],
    actor_id: str,
    dag_json: Optional[str] = None,
    output_name: Optional[str] = None,
) -> dict:
    """Insert (or refresh) a durable record for an in-flight run.

    When a row for ``run_id`` already exists, the Celery task id is
    replaced, the status is forced back to 'running', and ``updated_at``
    is bumped; all other columns keep their original values.

    Returns the stored row as a plain dict with ISO-formatted timestamps.
    """
    upsert_sql = """
            INSERT INTO pending_runs (run_id, celery_task_id, status, recipe, inputs, dag_json, output_name, actor_id)
            VALUES ($1, $2, 'running', $3, $4, $5, $6, $7)
            ON CONFLICT (run_id) DO UPDATE SET
                celery_task_id = EXCLUDED.celery_task_id,
                status = 'running',
                updated_at = NOW()
            RETURNING run_id, celery_task_id, status, recipe, inputs, dag_json, output_name, actor_id, created_at, updated_at
            """
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            upsert_sql,
            run_id,
            celery_task_id,
            recipe,
            _json.dumps(inputs),  # asyncpg expects JSONB values as JSON text
            dag_json,
            output_name,
            actor_id,
        )

    record = {
        key: row[key]
        for key in (
            "run_id", "celery_task_id", "status", "recipe",
            "inputs", "dag_json", "output_name", "actor_id",
        )
    }
    for ts_key in ("created_at", "updated_at"):
        stamp = row[ts_key]
        record[ts_key] = stamp.isoformat() if stamp else None
    return record
|
||||
|
||||
|
||||
async def get_pending_run(run_id: str) -> Optional[dict]:
    """Fetch one pending run by id; None when no such row exists.

    Timestamps are returned as ISO-formatted strings (or None).
    """
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """
            SELECT run_id, celery_task_id, status, recipe, inputs, dag_json, output_name, actor_id, error, created_at, updated_at
            FROM pending_runs WHERE run_id = $1
            """,
            run_id,
        )

    if row is None:
        return None

    # Record -> dict keeps the SELECT column order; only timestamps need reshaping.
    result = dict(row)
    for ts_key in ("created_at", "updated_at"):
        stamp = row[ts_key]
        result[ts_key] = stamp.isoformat() if stamp else None
    return result
||||
|
||||
|
||||
async def list_pending_runs(actor_id: Optional[str] = None, status: Optional[str] = None) -> List[dict]:
    """List pending runs as dicts, newest first.

    ``actor_id`` and ``status`` are optional equality filters; falsy
    values (None, "") disable the corresponding filter. Timestamps come
    back as ISO-formatted strings (or None).
    """
    params: List[str] = []
    conditions: List[str] = []
    # Build numbered placeholders in the same order the values are appended.
    for column, value in (("actor_id", actor_id), ("status", status)):
        if value:
            params.append(value)
            conditions.append(f"{column} = ${len(params)}")
    where_sql = " AND ".join(conditions) if conditions else "TRUE"

    async with pool.acquire() as conn:
        rows = await conn.fetch(
            f"""
            SELECT run_id, celery_task_id, status, recipe, inputs, output_name, actor_id, error, created_at, updated_at
            FROM pending_runs
            WHERE {where_sql}
            ORDER BY created_at DESC
            """,
            *params,
        )

    listed: List[dict] = []
    for row in rows:
        entry = dict(row)
        for ts_key in ("created_at", "updated_at"):
            stamp = row[ts_key]
            entry[ts_key] = stamp.isoformat() if stamp else None
        listed.append(entry)
    return listed
|
||||
|
||||
|
||||
async def update_pending_run_status(run_id: str, status: str, error: Optional[str] = None) -> bool:
    """Set a pending run's status (and optionally its error text).

    Returns True when exactly one row was updated, i.e. the run exists.
    """
    if error:
        sql = "UPDATE pending_runs SET status = $2, error = $3, updated_at = NOW() WHERE run_id = $1"
        args = (run_id, status, error)
    else:
        sql = "UPDATE pending_runs SET status = $2, updated_at = NOW() WHERE run_id = $1"
        args = (run_id, status)

    async with pool.acquire() as conn:
        # asyncpg returns a command tag like "UPDATE 1".
        tag = await conn.execute(sql, *args)
    return "UPDATE 1" in tag
|
||||
|
||||
|
||||
async def complete_pending_run(run_id: str) -> bool:
    """Drop the durability record for a finished run (result moves to run_cache).

    Returns True when a row was actually deleted.
    """
    async with pool.acquire() as conn:
        # asyncpg returns a command tag like "DELETE 1".
        tag = await conn.execute("DELETE FROM pending_runs WHERE run_id = $1", run_id)
    return "DELETE 1" in tag
|
||||
|
||||
|
||||
async def get_stale_pending_runs(older_than_hours: int = 24) -> List[dict]:
    """Return pending/running rows not updated within the cutoff (for recovery).

    Fix: the previous query embedded the literal string '%s hours' in the
    INTERVAL. asyncpg uses $n placeholders (never %s), so PostgreSQL was
    handed an unparseable interval plus an unreferenced bind argument and
    the query failed. Multiplying a unit interval by $1 binds correctly.

    Timestamps are returned as ISO-formatted strings (or None).
    """
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """
            SELECT run_id, celery_task_id, status, recipe, inputs, dag_json, output_name, actor_id, created_at, updated_at
            FROM pending_runs
            WHERE status IN ('pending', 'running')
            AND updated_at < NOW() - ($1 * INTERVAL '1 hour')
            ORDER BY created_at
            """,
            older_than_hours,
        )
    stale: List[dict] = []
    for row in rows:
        entry = dict(row)
        for ts_key in ("created_at", "updated_at"):
            stamp = row[ts_key]
            entry[ts_key] = stamp.isoformat() if stamp else None
        stale.append(entry)
    return stale
|
||||
|
||||
Reference in New Issue
Block a user