Add CI/CD workflow
Some checks failed
Build and Deploy / build-and-deploy (push) Failing after 1m2s
GPU Worker CI/CD / test (push) Failing after 1m4s
GPU Worker CI/CD / deploy (push) Has been skipped

This commit is contained in:
giles
2026-02-06 10:44:13 +00:00
parent 7411aa74c4
commit 48018d09b7
11 changed files with 934 additions and 20 deletions

View File

@@ -111,6 +111,27 @@ BEGIN
WHERE table_name = 'pending_runs' AND column_name = 'quality_playlists') THEN
ALTER TABLE pending_runs ADD COLUMN quality_playlists JSONB;
END IF;
-- Checkpoint columns for resumable renders
IF NOT EXISTS (SELECT 1 FROM information_schema.columns
WHERE table_name = 'pending_runs' AND column_name = 'checkpoint_frame') THEN
ALTER TABLE pending_runs ADD COLUMN checkpoint_frame INTEGER;
END IF;
IF NOT EXISTS (SELECT 1 FROM information_schema.columns
WHERE table_name = 'pending_runs' AND column_name = 'checkpoint_t') THEN
ALTER TABLE pending_runs ADD COLUMN checkpoint_t FLOAT;
END IF;
IF NOT EXISTS (SELECT 1 FROM information_schema.columns
WHERE table_name = 'pending_runs' AND column_name = 'checkpoint_scans') THEN
ALTER TABLE pending_runs ADD COLUMN checkpoint_scans JSONB;
END IF;
IF NOT EXISTS (SELECT 1 FROM information_schema.columns
WHERE table_name = 'pending_runs' AND column_name = 'total_frames') THEN
ALTER TABLE pending_runs ADD COLUMN total_frames INTEGER;
END IF;
IF NOT EXISTS (SELECT 1 FROM information_schema.columns
WHERE table_name = 'pending_runs' AND column_name = 'resumable') THEN
ALTER TABLE pending_runs ADD COLUMN resumable BOOLEAN DEFAULT TRUE;
END IF;
END $$;
CREATE INDEX IF NOT EXISTS idx_pending_runs_status ON pending_runs(status);
@@ -1530,7 +1551,9 @@ async def get_pending_run(run_id: str) -> Optional[dict]:
async with pool.acquire() as conn:
row = await conn.fetchrow(
"""
SELECT run_id, celery_task_id, status, recipe, inputs, dag_json, plan_cid, output_name, actor_id, error, ipfs_playlist_cid, quality_playlists, created_at, updated_at
SELECT run_id, celery_task_id, status, recipe, inputs, dag_json, plan_cid, output_name, actor_id, error,
ipfs_playlist_cid, quality_playlists, checkpoint_frame, checkpoint_t, checkpoint_scans,
total_frames, resumable, created_at, updated_at
FROM pending_runs WHERE run_id = $1
""",
run_id
@@ -1544,6 +1567,10 @@ async def get_pending_run(run_id: str) -> Optional[dict]:
quality_playlists = row.get("quality_playlists")
if isinstance(quality_playlists, str):
quality_playlists = _json.loads(quality_playlists)
# Parse checkpoint_scans if it's a string
checkpoint_scans = row.get("checkpoint_scans")
if isinstance(checkpoint_scans, str):
checkpoint_scans = _json.loads(checkpoint_scans)
return {
"run_id": row["run_id"],
"celery_task_id": row["celery_task_id"],
@@ -1557,6 +1584,11 @@ async def get_pending_run(run_id: str) -> Optional[dict]:
"error": row["error"],
"ipfs_playlist_cid": row["ipfs_playlist_cid"],
"quality_playlists": quality_playlists,
"checkpoint_frame": row.get("checkpoint_frame"),
"checkpoint_t": row.get("checkpoint_t"),
"checkpoint_scans": checkpoint_scans,
"total_frames": row.get("total_frames"),
"resumable": row.get("resumable", True),
"created_at": row["created_at"].isoformat() if row["created_at"] else None,
"updated_at": row["updated_at"].isoformat() if row["updated_at"] else None,
}
@@ -1666,6 +1698,109 @@ async def update_pending_run_playlist(run_id: str, ipfs_playlist_cid: str, quali
return "UPDATE 1" in result
async def update_pending_run_checkpoint(
    run_id: str,
    checkpoint_frame: int,
    checkpoint_t: float,
    checkpoint_scans: Optional[dict] = None,
    total_frames: Optional[int] = None,
) -> bool:
    """Persist the latest checkpoint of a streaming run.

    Meant to be called at segment boundaries so that a run can be resumed
    after a failure.

    Args:
        run_id: The run ID.
        checkpoint_frame: Last completed frame at the segment boundary.
        checkpoint_t: Time value for the checkpoint frame.
        checkpoint_scans: Accumulated scan state ``{scan_name: state_dict}``.
            Any falsy value (``None`` or an empty dict) is written as NULL.
        total_frames: Total expected frames (for progress %). When ``None``
            the value already stored in the row is kept (``COALESCE``).

    Returns:
        True if the status string returned by ``conn.execute`` contains
        ``"UPDATE 1"``, False otherwise.

    Raises:
        RuntimeError: If the database pool has not been initialized.
    """
    if pool is None:
        raise RuntimeError("Database pool not initialized - call init_db() first")

    # checkpoint_frame / checkpoint_t / checkpoint_scans are always overwritten;
    # only total_frames falls back to the stored value when $5 is NULL.
    query = (
        "\n"
        "UPDATE pending_runs SET\n"
        "checkpoint_frame = $2,\n"
        "checkpoint_t = $3,\n"
        "checkpoint_scans = $4,\n"
        "total_frames = COALESCE($5, total_frames),\n"
        "updated_at = NOW()\n"
        "WHERE run_id = $1\n"
    )

    async with pool.acquire() as conn:
        # The scan state is passed to the query as a JSON string, or as NULL
        # when there is nothing to store.
        if checkpoint_scans:
            scans_payload = _json.dumps(checkpoint_scans)
        else:
            scans_payload = None
        status = await conn.execute(
            query,
            run_id,
            checkpoint_frame,
            checkpoint_t,
            scans_payload,
            total_frames,
        )
    return "UPDATE 1" in status
async def get_run_checkpoint(run_id: str) -> Optional[dict]:
    """Fetch the stored checkpoint needed to resume a run.

    Args:
        run_id: The run ID.

    Returns:
        ``None`` when no row matches ``run_id`` or when the row's
        ``checkpoint_frame`` is NULL. Otherwise a dict with the keys
        ``frame_num``, ``t``, ``scans``, ``total_frames``,
        ``quality_playlists``, ``ipfs_playlist_cid`` and ``resumable``.
        ``scans`` is an empty dict when no scan state is stored.

    Raises:
        RuntimeError: If the database pool has not been initialized.
    """
    if pool is None:
        raise RuntimeError("Database pool not initialized - call init_db() first")

    def _decoded(value):
        # JSONB columns may come back as raw JSON text; decode only in that case.
        return _json.loads(value) if isinstance(value, str) else value

    query = (
        "\n"
        "SELECT checkpoint_frame, checkpoint_t, checkpoint_scans, total_frames,\n"
        "quality_playlists, ipfs_playlist_cid, resumable\n"
        "FROM pending_runs WHERE run_id = $1\n"
    )

    async with pool.acquire() as conn:
        record = await conn.fetchrow(query, run_id)

    # A missing row and a row without a checkpoint frame both mean "nothing to resume".
    if not record or record.get("checkpoint_frame") is None:
        return None

    scans = _decoded(record.get("checkpoint_scans"))
    playlists = _decoded(record.get("quality_playlists"))

    checkpoint = {
        "frame_num": record["checkpoint_frame"],
        "t": record["checkpoint_t"],
        "scans": scans or {},
        "total_frames": record.get("total_frames"),
        "quality_playlists": playlists,
        "ipfs_playlist_cid": record.get("ipfs_playlist_cid"),
        "resumable": record.get("resumable", True),
    }
    return checkpoint
async def clear_run_checkpoint(run_id: str) -> bool:
    """Reset the checkpoint state of a run (used on restart).

    Sets ``checkpoint_frame``, ``checkpoint_t``, ``checkpoint_scans``,
    ``quality_playlists`` and ``ipfs_playlist_cid`` to NULL and refreshes
    ``updated_at``. ``total_frames`` is not touched by this statement.

    Args:
        run_id: The run ID.

    Returns:
        True if the status string returned by ``conn.execute`` contains
        ``"UPDATE 1"``, False otherwise.

    Raises:
        RuntimeError: If the database pool has not been initialized.
    """
    if pool is None:
        raise RuntimeError("Database pool not initialized - call init_db() first")

    query = (
        "\n"
        "UPDATE pending_runs SET\n"
        "checkpoint_frame = NULL,\n"
        "checkpoint_t = NULL,\n"
        "checkpoint_scans = NULL,\n"
        "quality_playlists = NULL,\n"
        "ipfs_playlist_cid = NULL,\n"
        "updated_at = NOW()\n"
        "WHERE run_id = $1\n"
    )

    async with pool.acquire() as conn:
        status = await conn.execute(query, run_id)
    return "UPDATE 1" in status
async def complete_pending_run(run_id: str) -> bool:
"""Remove a pending run after it completes (moves to run_cache)."""
async with pool.acquire() as conn: