Add CI/CD workflow
Some checks failed
Build and Deploy / build-and-deploy (push) Failing after 1m2s
GPU Worker CI/CD / test (push) Failing after 1m4s
GPU Worker CI/CD / deploy (push) Has been skipped

This commit is contained in:
giles
2026-02-06 10:44:13 +00:00
parent 7411aa74c4
commit 48018d09b7
11 changed files with 934 additions and 20 deletions

View File

@@ -1071,6 +1071,159 @@ async def purge_failed_runs(
return {"purged": len(deleted), "run_ids": deleted}
@router.post("/{run_id}/pause")
async def pause_run(
    run_id: str,
    request: Request,
    ctx: UserContext = Depends(require_auth),
):
    """Pause a render that is currently running.

    Looks up the pending run, revokes its Celery task (when a task id is
    recorded) with SIGTERM, and then marks the run as 'paused' in the
    database. The stated intent is that the worker finishes its current
    segment and checkpoints at the next segment boundary before stopping;
    that behavior lives in the worker, not in this handler.

    Raises:
        HTTPException: 404 when no pending run exists for ``run_id``;
            400 when the run's status is anything other than 'running'.

    Returns:
        A dict with ``run_id``, the new ``status`` ('paused') and the
        ``checkpoint_frame`` value read before the task was revoked.
    """
    import database
    from celery_app import app as celery_app

    await database.init_db()

    run_record = await database.get_pending_run(run_id)
    if not run_record:
        raise HTTPException(404, "Run not found")

    current_status = run_record['status']
    if current_status != 'running':
        raise HTTPException(400, f"Can only pause running renders (current status: {current_status})")

    # SIGTERM (rather than a hard kill) gives the task a chance to clean up.
    task_id = run_record.get('celery_task_id')
    if task_id:
        celery_app.control.revoke(task_id, terminate=True, signal='SIGTERM')
        logger.info(f"Sent SIGTERM to task {task_id} for run {run_id}")

    # Record the new state only after the revoke request has been issued.
    await database.update_pending_run_status(run_id, 'paused')

    response = {
        "run_id": run_id,
        "status": "paused",
        "checkpoint_frame": run_record.get('checkpoint_frame'),
    }
    return response
@router.post("/{run_id}/resume")
async def resume_run(
    run_id: str,
    request: Request,
    ctx: UserContext = Depends(require_auth),
):
    """Resume a failed or paused run from its last checkpoint.

    Validates that the run exists, is in a resumable state and has a
    checkpoint, then submits a new ``run_stream`` task on the 'gpu' queue
    with ``resume=True`` and records the new task id against the run.

    Raises:
        HTTPException: 404 when the run is unknown; 400 when the status is
            not 'failed'/'paused', when there is no checkpoint frame, or
            when the run is flagged as not resumable.

    Returns:
        A dict with ``run_id``, ``status`` ('running'), the new
        ``celery_task_id`` and ``resumed_from_frame``.
    """
    import database
    from tasks.streaming import run_stream

    await database.init_db()

    run_record = await database.get_pending_run(run_id)
    if not run_record:
        raise HTTPException(404, "Run not found")

    current_status = run_record['status']
    if current_status not in ('failed', 'paused'):
        raise HTTPException(400, f"Can only resume failed/paused runs (current status: {current_status})")

    checkpoint_frame = run_record.get('checkpoint_frame')
    if not checkpoint_frame:
        raise HTTPException(400, "No checkpoint available - use restart instead")

    # A missing 'resumable' key is treated as resumable.
    if not run_record.get('resumable', True):
        raise HTTPException(400, "Run checkpoint is corrupted - use restart instead")

    # The recipe source is persisted in the 'dag_json' column.
    task_kwargs = {
        "run_id": run_id,
        "recipe_sexp": run_record.get('dag_json', ''),
        "output_name": run_record.get('output_name', 'output.mp4'),
        "actor_id": run_record.get('actor_id'),
        "resume": True,
    }
    task = run_stream.apply_async(kwargs=task_kwargs, queue='gpu')

    await database.update_pending_run_status(run_id, 'running')

    # create_pending_run is not involved on this path, so the task id is
    # written to the row directly.
    async with database.pool.acquire() as conn:
        await conn.execute(
            "UPDATE pending_runs SET celery_task_id = $2, updated_at = NOW() WHERE run_id = $1",
            run_id, task.id
        )

    logger.info(f"Resumed run {run_id} from frame {checkpoint_frame} with task {task.id}")

    response = {
        "run_id": run_id,
        "status": "running",
        "celery_task_id": task.id,
        "resumed_from_frame": checkpoint_frame,
    }
    return response
@router.post("/{run_id}/restart")
async def restart_run(
    run_id: str,
    request: Request,
    ctx: UserContext = Depends(require_auth),
):
    """Restart a failed or paused run from the beginning.

    Clears the stored checkpoint for the run, submits a new ``run_stream``
    task on the 'gpu' queue with ``resume=False`` and records the new task
    id against the run. Any progress captured in the checkpoint is
    discarded; use the resume endpoint to continue from it instead.

    Raises:
        HTTPException: 404 when the run is unknown; 400 when the status is
            not 'failed' or 'paused'.

    Returns:
        A dict with ``run_id``, ``status`` ('running') and the new
        ``celery_task_id``.
    """
    import database
    from tasks.streaming import run_stream

    await database.init_db()

    run_record = await database.get_pending_run(run_id)
    if not run_record:
        raise HTTPException(404, "Run not found")

    current_status = run_record['status']
    if current_status not in ('failed', 'paused'):
        raise HTTPException(400, f"Can only restart failed/paused runs (current status: {current_status})")

    # Drop the checkpoint before the fresh task is queued.
    await database.clear_run_checkpoint(run_id)

    # The recipe source is persisted in the 'dag_json' column.
    task_kwargs = {
        "run_id": run_id,
        "recipe_sexp": run_record.get('dag_json', ''),
        "output_name": run_record.get('output_name', 'output.mp4'),
        "actor_id": run_record.get('actor_id'),
        "resume": False,
    }
    task = run_stream.apply_async(kwargs=task_kwargs, queue='gpu')

    await database.update_pending_run_status(run_id, 'running')

    # Point the run row at the newly submitted task.
    async with database.pool.acquire() as conn:
        await conn.execute(
            "UPDATE pending_runs SET celery_task_id = $2, updated_at = NOW() WHERE run_id = $1",
            run_id, task.id
        )

    logger.info(f"Restarted run {run_id} from beginning with task {task.id}")

    response = {
        "run_id": run_id,
        "status": "running",
        "celery_task_id": task.id,
    }
    return response
@router.get("/{run_id}/stream")
async def stream_run_output(
run_id: str,

View File

@@ -150,6 +150,7 @@ class RunService:
"inputs": self._ensure_inputs_list(cached.get("inputs")),
"output_cid": output_cid,
"ipfs_cid": cached.get("ipfs_cid"),
"ipfs_playlist_cid": cached.get("ipfs_playlist_cid") or (pending.get("ipfs_playlist_cid") if pending else None),
"provenance_cid": cached.get("provenance_cid"),
"plan_cid": cached.get("plan_cid"),
"actor_id": cached.get("actor_id"),
@@ -174,6 +175,7 @@ class RunService:
status_map = {
"pending": "pending",
"started": "running",
"rendering": "running", # Custom status from streaming task
"success": "completed",
"failure": "failed",
"retry": "running",
@@ -192,6 +194,14 @@ class RunService:
"created_at": pending.get("created_at"),
"error": pending.get("error"),
"recipe_sexp": pending.get("dag_json"), # Recipe content for streaming runs
# Checkpoint fields for resumable renders
"checkpoint_frame": pending.get("checkpoint_frame"),
"checkpoint_t": pending.get("checkpoint_t"),
"total_frames": pending.get("total_frames"),
"resumable": pending.get("resumable", True),
# IPFS streaming info
"ipfs_playlist_cid": pending.get("ipfs_playlist_cid"),
"quality_playlists": pending.get("quality_playlists"),
}
# If task completed, get result
@@ -227,6 +237,14 @@ class RunService:
"created_at": pending.get("created_at"),
"error": pending.get("error"),
"recipe_sexp": pending.get("dag_json"), # Recipe content for streaming runs
# Checkpoint fields for resumable renders
"checkpoint_frame": pending.get("checkpoint_frame"),
"checkpoint_t": pending.get("checkpoint_t"),
"total_frames": pending.get("total_frames"),
"resumable": pending.get("resumable", True),
# IPFS streaming info
"ipfs_playlist_cid": pending.get("ipfs_playlist_cid"),
"quality_playlists": pending.get("quality_playlists"),
}
# Fallback: Check Redis for backwards compatibility
@@ -272,6 +290,7 @@ class RunService:
status_map = {
"pending": "pending",
"started": "running",
"rendering": "running", # Custom status from streaming task
"success": "completed",
"failure": "failed",
"retry": "running",

View File

@@ -11,7 +11,7 @@
{% endblock %}
{% block content %}
{% set status_colors = {'completed': 'green', 'running': 'blue', 'pending': 'yellow', 'failed': 'red'} %}
{% set status_colors = {'completed': 'green', 'running': 'blue', 'pending': 'yellow', 'failed': 'red', 'paused': 'yellow'} %}
{% set color = status_colors.get(run.status, 'gray') %}
<div class="max-w-6xl mx-auto">
@@ -28,7 +28,42 @@
{% if run.error %}
<span class="text-red-400 text-sm ml-2">{{ run.error }}</span>
{% endif %}
{% if run.checkpoint_frame %}
<span class="text-gray-400 text-sm ml-2">
Checkpoint: {{ run.checkpoint_frame }}{% if run.total_frames %} / {{ run.total_frames }}{% endif %} frames
</span>
{% endif %}
<div class="flex-grow"></div>
<!-- Pause button for running renders -->
{% if run.status == 'running' %}
<button hx-post="/runs/{{ run.run_id }}/pause"
hx-target="#action-result"
hx-swap="innerHTML"
class="bg-yellow-600 hover:bg-yellow-700 px-3 py-1 rounded text-sm font-medium">
Pause
</button>
{% endif %}
<!-- Resume/Restart buttons for failed/paused renders -->
{% if run.status in ['failed', 'paused'] %}
{% if run.checkpoint_frame %}
<button hx-post="/runs/{{ run.run_id }}/resume"
hx-target="#action-result"
hx-swap="innerHTML"
class="bg-green-600 hover:bg-green-700 px-3 py-1 rounded text-sm font-medium">
Resume{% if run.total_frames %} ({{ ((run.checkpoint_frame / run.total_frames) * 100)|round|int }}%){% endif %}
</button>
{% endif %}
<button hx-post="/runs/{{ run.run_id }}/restart"
hx-target="#action-result"
hx-swap="innerHTML"
hx-confirm="Discard progress and start over?"
class="bg-yellow-600 hover:bg-yellow-700 px-3 py-1 rounded text-sm font-medium">
Restart
</button>
{% endif %}
{% if run.recipe %}
<button hx-post="/runs/rerun/{{ run.recipe }}"
hx-target="#action-result"
@@ -88,14 +123,20 @@
</div>
</div>
<!-- Unified HLS Player (shown during rendering OR for completed HLS streams) -->
{% if run.status == 'rendering' or run.ipfs_playlist_cid %}
<!-- Unified HLS Player (shown during rendering, for paused/failed runs with checkpoint, OR for completed HLS streams) -->
{% if run.status == 'rendering' or run.ipfs_playlist_cid or (run.status in ['paused', 'failed'] and run.checkpoint_frame) %}
<div id="hls-player-container" class="mb-6 bg-gray-800 rounded-lg p-4">
<div class="flex items-center justify-between mb-4">
<h3 class="text-lg font-semibold flex items-center">
{% if run.status == 'rendering' %}
<span id="live-indicator" class="w-3 h-3 bg-red-500 rounded-full mr-2 animate-pulse"></span>
<span id="player-title">Live Preview</span>
{% elif run.status == 'paused' %}
<span id="live-indicator" class="w-3 h-3 bg-yellow-500 rounded-full mr-2"></span>
<span id="player-title">Partial Output (Paused)</span>
{% elif run.status == 'failed' and run.checkpoint_frame %}
<span id="live-indicator" class="w-3 h-3 bg-red-500 rounded-full mr-2"></span>
<span id="player-title">Partial Output (Failed)</span>
{% else %}
<span id="live-indicator" class="w-3 h-3 bg-green-500 rounded-full mr-2 hidden"></span>
<span id="player-title">Video</span>
@@ -144,12 +185,15 @@
const baseUrl = '/runs/{{ run.run_id }}/playlist.m3u8';
const isRendering = {{ 'true' if run.status == 'rendering' else 'false' }};
const isPausedOrFailed = {{ 'true' if run.status in ['paused', 'failed'] else 'false' }};
let hls = null;
let retryCount = 0;
const maxRetries = 120;
let segmentsLoaded = 0;
let currentMode = isRendering ? 'live' : 'replay'; // Default based on status
// Start in replay mode for paused/failed (shows partial output from start)
// Start in live mode for rendering (follows the render progress)
let currentMode = isRendering ? 'live' : 'replay';
function getHlsUrl() {
return baseUrl + '?_t=' + Date.now();