Add CI/CD workflow
Some checks failed
Build and Deploy / build-and-deploy (push) Failing after 1m2s
GPU Worker CI/CD / test (push) Failing after 1m4s
GPU Worker CI/CD / deploy (push) Has been skipped

This commit is contained in:
giles
2026-02-06 10:44:13 +00:00
parent 7411aa74c4
commit 48018d09b7
11 changed files with 934 additions and 20 deletions

View File

@@ -1071,6 +1071,159 @@ async def purge_failed_runs(
return {"purged": len(deleted), "run_ids": deleted}
@router.post("/{run_id}/pause")
async def pause_run(
    run_id: str,
    request: Request,
    ctx: UserContext = Depends(require_auth),
):
    """Signal the Celery task behind a running render and mark the run paused.

    The run's Celery task (if one is recorded) is revoked with
    ``terminate=True`` and ``SIGTERM``; the pending-run row is then set to
    ``'paused'`` immediately, without waiting for the worker to react.

    NOTE(review): whether the worker finishes the current segment and writes
    a checkpoint before exiting depends on the task's SIGTERM handling, which
    is not visible here - confirm.
    NOTE(review): ``ctx`` is only used to require authentication; no check
    that the caller owns ``run_id`` is made here - confirm this is intended.

    Returns:
        Dict with ``run_id``, ``status`` (always ``"paused"``) and
        ``checkpoint_frame``. The frame is the value read *before* the signal
        was sent, so it may be older than the checkpoint the worker ends on.

    Raises:
        HTTPException: 404 if the run does not exist, 400 if it is not
            currently in the ``'running'`` state.
    """
    import database
    from celery_app import app as celery_app

    await database.init_db()

    run_row = await database.get_pending_run(run_id)
    if not run_row:
        raise HTTPException(404, "Run not found")

    current_status = run_row['status']
    if current_status != 'running':
        raise HTTPException(400, f"Can only pause running renders (current status: {current_status})")

    # Rows without a recorded task id are still marked paused below; there is
    # simply nothing to signal for them.
    task_id = run_row.get('celery_task_id')
    if task_id:
        # SIGTERM rather than SIGKILL so the worker process gets a chance to
        # clean up.
        celery_app.control.revoke(task_id, terminate=True, signal='SIGTERM')
        logger.info(f"Sent SIGTERM to task {task_id} for run {run_id}")

    await database.update_pending_run_status(run_id, 'paused')

    response = {
        "run_id": run_id,
        "status": "paused",
        "checkpoint_frame": run_row.get('checkpoint_frame'),
    }
    return response
@router.post("/{run_id}/resume")
async def resume_run(
    run_id: str,
    request: Request,
    ctx: UserContext = Depends(require_auth),
):
    """Resume a paused or failed run from its last checkpoint.

    Queues a new ``run_stream`` task on the ``gpu`` queue with
    ``resume=True``, stores the new task id on the pending-run row and marks
    the run ``'running'``.

    If recording the new task in the database fails, the freshly queued task
    is revoked again so that no render keeps running without a matching row
    (which would make it impossible to pause and would let a retry submit a
    second task for the same run).

    NOTE(review): ``ctx`` is only used to require authentication; no check
    that the caller owns ``run_id`` is made here - confirm this is intended.
    NOTE(review): the status check and the submit are not atomic, so two
    concurrent resume requests can both pass validation and both queue a task.

    Returns:
        Dict with ``run_id``, ``status`` (``"running"``), ``celery_task_id``
        and ``resumed_from_frame``.

    Raises:
        HTTPException: 404 if the run does not exist; 400 if the run is not
            failed/paused, has no checkpoint, or is flagged as not resumable.
    """
    import database
    from tasks.streaming import run_stream

    await database.init_db()

    pending = await database.get_pending_run(run_id)
    if not pending:
        raise HTTPException(404, "Run not found")

    if pending['status'] not in ('failed', 'paused'):
        raise HTTPException(400, f"Can only resume failed/paused runs (current status: {pending['status']})")

    # Falsy check: a missing checkpoint and a checkpoint at frame 0 are both
    # treated as "nothing to resume from".
    if not pending.get('checkpoint_frame'):
        raise HTTPException(400, "No checkpoint available - use restart instead")

    # Rows without a 'resumable' value are treated as resumable.
    if not pending.get('resumable', True):
        raise HTTPException(400, "Run checkpoint is corrupted - use restart instead")

    # Submit new Celery task with resume=True
    task = run_stream.apply_async(
        kwargs=dict(
            run_id=run_id,
            recipe_sexp=pending.get('dag_json', ''),  # Recipe is stored in dag_json
            output_name=pending.get('output_name', 'output.mp4'),
            actor_id=pending.get('actor_id'),
            resume=True,
        ),
        queue='gpu',
    )

    try:
        # Store the new task id BEFORE flipping the status to 'running':
        # pause_run only acts on 'running' rows, so it must never see a
        # running row that still carries the previous (dead) task id.
        # Done manually because create_pending_run isn't called on resume.
        async with database.pool.acquire() as conn:
            await conn.execute(
                "UPDATE pending_runs SET celery_task_id = $2, updated_at = NOW() WHERE run_id = $1",
                run_id, task.id
            )
        await database.update_pending_run_status(run_id, 'running')
    except Exception:
        # The task is already queued but the row does not describe it; stop
        # it the same way pause_run does, then surface the original error.
        logger.exception(
            "Failed to record resumed task %s for run %s; revoking it", task.id, run_id
        )
        task.revoke(terminate=True, signal='SIGTERM')
        raise

    logger.info(f"Resumed run {run_id} from frame {pending.get('checkpoint_frame')} with task {task.id}")

    return {
        "run_id": run_id,
        "status": "running",
        "celery_task_id": task.id,
        "resumed_from_frame": pending.get('checkpoint_frame'),
    }
@router.post("/{run_id}/restart")
async def restart_run(
    run_id: str,
    request: Request,
    ctx: UserContext = Depends(require_auth),
):
    """Restart a failed/paused run from the beginning (discard checkpoint).

    All progress will be lost. Use resume instead to continue from checkpoint.

    Clears the stored checkpoint, queues a new ``run_stream`` task on the
    ``gpu`` queue with ``resume=False``, stores the new task id on the
    pending-run row and marks the run ``'running'``.

    If recording the new task in the database fails, the freshly queued task
    is revoked again so that no render keeps running without a matching row.
    The checkpoint has already been cleared by then and is not restored.

    NOTE(review): ``ctx`` is only used to require authentication; no check
    that the caller owns ``run_id`` is made here - confirm this is intended.

    Returns:
        Dict with ``run_id``, ``status`` (``"running"``) and
        ``celery_task_id``.

    Raises:
        HTTPException: 404 if the run does not exist; 400 if the run is not
            in the ``'failed'`` or ``'paused'`` state.
    """
    import database
    from tasks.streaming import run_stream

    await database.init_db()

    pending = await database.get_pending_run(run_id)
    if not pending:
        raise HTTPException(404, "Run not found")

    if pending['status'] not in ('failed', 'paused'):
        raise HTTPException(400, f"Can only restart failed/paused runs (current status: {pending['status']})")

    # Clear checkpoint data before the new task is queued so the worker
    # cannot pick up stale checkpoint state.
    await database.clear_run_checkpoint(run_id)

    # Submit new Celery task (without resume)
    task = run_stream.apply_async(
        kwargs=dict(
            run_id=run_id,
            recipe_sexp=pending.get('dag_json', ''),  # Recipe is stored in dag_json
            output_name=pending.get('output_name', 'output.mp4'),
            actor_id=pending.get('actor_id'),
            resume=False,
        ),
        queue='gpu',
    )

    try:
        # Store the new task id BEFORE flipping the status to 'running':
        # pause_run only acts on 'running' rows, so it must never see a
        # running row that still carries the previous (dead) task id.
        async with database.pool.acquire() as conn:
            await conn.execute(
                "UPDATE pending_runs SET celery_task_id = $2, updated_at = NOW() WHERE run_id = $1",
                run_id, task.id
            )
        await database.update_pending_run_status(run_id, 'running')
    except Exception:
        # The task is already queued but the row does not describe it; stop
        # it the same way pause_run does, then surface the original error.
        logger.exception(
            "Failed to record restarted task %s for run %s; revoking it", task.id, run_id
        )
        task.revoke(terminate=True, signal='SIGTERM')
        raise

    logger.info(f"Restarted run {run_id} from beginning with task {task.id}")

    return {
        "run_id": run_id,
        "status": "running",
        "celery_task_id": task.id,
    }
@router.get("/{run_id}/stream")
async def stream_run_output(
run_id: str,