Add CI/CD workflow
This commit is contained in:
@@ -1071,6 +1071,159 @@ async def purge_failed_runs(
|
||||
return {"purged": len(deleted), "run_ids": deleted}
|
||||
|
||||
|
||||
@router.post("/{run_id}/pause")
async def pause_run(
    run_id: str,
    request: Request,
    ctx: UserContext = Depends(require_auth),
):
    """Signal a running render to stop and mark the run as paused.

    The run is looked up by ``run_id``. If a Celery task id is recorded for
    it, that task is revoked with ``terminate=True`` and SIGTERM. The run's
    status is then set to ``'paused'`` right away; this handler does not
    wait for the worker to acknowledge the signal.

    NOTE(review): the worker is presumably expected to checkpoint at the
    next segment boundary when it receives SIGTERM - confirm against the
    task implementation. The ``checkpoint_frame`` in the response comes from
    the row fetched before the signal was sent.

    Returns:
        A dict with ``run_id``, ``status`` (always ``"paused"``) and
        ``checkpoint_frame``.

    Raises:
        HTTPException: 404 if the run does not exist, 400 if its status is
            anything other than ``'running'``.
    """
    import database
    from celery_app import app as celery_app

    await database.init_db()

    run = await database.get_pending_run(run_id)
    if not run:
        raise HTTPException(404, "Run not found")

    current_status = run['status']
    if current_status != 'running':
        raise HTTPException(400, f"Can only pause running renders (current status: {current_status})")

    # Soft termination: SIGTERM is meant to give the task a chance to clean up.
    # A run with no recorded task id skips the signal and is just marked paused.
    task_id = run.get('celery_task_id')
    if task_id:
        celery_app.control.revoke(task_id, terminate=True, signal='SIGTERM')
        logger.info(f"Sent SIGTERM to task {task_id} for run {run_id}")

    await database.update_pending_run_status(run_id, 'paused')

    response = {
        "run_id": run_id,
        "status": "paused",
        "checkpoint_frame": run.get('checkpoint_frame'),
    }
    return response
|
||||
|
||||
|
||||
@router.post("/{run_id}/resume")
async def resume_run(
    run_id: str,
    request: Request,
    ctx: UserContext = Depends(require_auth),
):
    """Re-enqueue a failed or paused run so it continues from its checkpoint.

    A new ``run_stream`` task is submitted to the ``gpu`` queue with
    ``resume=True``. Afterwards the run's status is set to ``'running'`` and
    the new Celery task id is written to ``pending_runs``.

    Returns:
        A dict with ``run_id``, ``status`` (``"running"``),
        ``celery_task_id`` and ``resumed_from_frame``.

    Raises:
        HTTPException: 404 if the run does not exist. 400 if the status is
            not ``'failed'`` or ``'paused'``, if ``checkpoint_frame`` is
            falsy (missing, ``None`` or 0), or if the run's ``resumable``
            flag is falsy (a missing flag counts as resumable).
    """
    import database
    from tasks.streaming import run_stream

    await database.init_db()

    run = await database.get_pending_run(run_id)
    if not run:
        raise HTTPException(404, "Run not found")

    current_status = run['status']
    if current_status not in ('failed', 'paused'):
        raise HTTPException(400, f"Can only resume failed/paused runs (current status: {current_status})")

    checkpoint_frame = run.get('checkpoint_frame')
    if not checkpoint_frame:
        raise HTTPException(400, "No checkpoint available - use restart instead")

    is_resumable = run.get('resumable', True)
    if not is_resumable:
        raise HTTPException(400, "Run checkpoint is corrupted - use restart instead")

    # The recipe source lives in the dag_json column of the pending run.
    task_kwargs = {
        'run_id': run_id,
        'recipe_sexp': run.get('dag_json', ''),
        'output_name': run.get('output_name', 'output.mp4'),
        'actor_id': run.get('actor_id'),
        'resume': True,
    }
    task = run_stream.apply_async(kwargs=task_kwargs, queue='gpu')
    new_task_id = task.id

    await database.update_pending_run_status(run_id, 'running')

    # create_pending_run is not involved on this path, so the task id has to
    # be stored with a direct UPDATE.
    async with database.pool.acquire() as conn:
        await conn.execute(
            "UPDATE pending_runs SET celery_task_id = $2, updated_at = NOW() WHERE run_id = $1",
            run_id,
            new_task_id,
        )

    logger.info(f"Resumed run {run_id} from frame {checkpoint_frame} with task {new_task_id}")

    response = {
        "run_id": run_id,
        "status": "running",
        "celery_task_id": new_task_id,
        "resumed_from_frame": checkpoint_frame,
    }
    return response
|
||||
|
||||
|
||||
@router.post("/{run_id}/restart")
async def restart_run(
    run_id: str,
    request: Request,
    ctx: UserContext = Depends(require_auth),
):
    """Re-enqueue a failed or paused run from the beginning.

    The stored checkpoint is cleared first, so earlier progress is
    discarded. A new ``run_stream`` task is then submitted to the ``gpu``
    queue with ``resume=False``, the status is set to ``'running'`` and the
    new Celery task id is written to ``pending_runs``.

    Returns:
        A dict with ``run_id``, ``status`` (``"running"``) and
        ``celery_task_id``.

    Raises:
        HTTPException: 404 if the run does not exist, 400 if its status is
            not ``'failed'`` or ``'paused'``.
    """
    import database
    from tasks.streaming import run_stream

    await database.init_db()

    run = await database.get_pending_run(run_id)
    if not run:
        raise HTTPException(404, "Run not found")

    current_status = run['status']
    if current_status not in ('failed', 'paused'):
        raise HTTPException(400, f"Can only restart failed/paused runs (current status: {current_status})")

    # Drop the checkpoint before enqueueing the fresh task.
    await database.clear_run_checkpoint(run_id)

    # The recipe source lives in the dag_json column of the pending run.
    task_kwargs = {
        'run_id': run_id,
        'recipe_sexp': run.get('dag_json', ''),
        'output_name': run.get('output_name', 'output.mp4'),
        'actor_id': run.get('actor_id'),
        'resume': False,
    }
    task = run_stream.apply_async(kwargs=task_kwargs, queue='gpu')
    new_task_id = task.id

    await database.update_pending_run_status(run_id, 'running')

    # Store the new task id with a direct UPDATE on the pending_runs row.
    async with database.pool.acquire() as conn:
        await conn.execute(
            "UPDATE pending_runs SET celery_task_id = $2, updated_at = NOW() WHERE run_id = $1",
            run_id,
            new_task_id,
        )

    logger.info(f"Restarted run {run_id} from beginning with task {new_task_id}")

    response = {
        "run_id": run_id,
        "status": "running",
        "celery_task_id": new_task_id,
    }
    return response
|
||||
|
||||
|
||||
@router.get("/{run_id}/stream")
|
||||
async def stream_run_output(
|
||||
run_id: str,
|
||||
|
||||
Reference in New Issue
Block a user