Add CI/CD workflow
Some checks failed
Build and Deploy / build-and-deploy (push) Failing after 1m2s
GPU Worker CI/CD / test (push) Failing after 1m4s
GPU Worker CI/CD / deploy (push) Has been skipped

This commit is contained in:
giles
2026-02-06 10:44:13 +00:00
parent 7411aa74c4
commit 48018d09b7
11 changed files with 934 additions and 20 deletions

View File

@@ -3,11 +3,13 @@ Streaming video rendering task.
Executes S-expression recipes for frame-by-frame video processing.
Supports CID and friendly name references for assets.
Supports pause/resume/restart for long renders.
"""
import hashlib
import logging
import os
import signal
import sys
import tempfile
from pathlib import Path
@@ -23,6 +25,11 @@ from cache_manager import get_cache_manager
logger = logging.getLogger(__name__)
class PauseRequested(Exception):
    """Signals a user-initiated graceful pause (delivered via SIGTERM).

    Raised from the checkpoint callback so the render loop unwinds only
    after state has been persisted; the docstring body suffices, so no
    explicit ``pass`` is needed.
    """
# NOTE(review): leftover import-time diagnostic. Downgraded from a bare
# print() to stderr to a debug-level log record so production stderr stays
# clean; module top level should be free of side effects. Lazy %-args avoid
# formatting when DEBUG logging is disabled.
logger.debug("DEBUG MODULE LOAD: tasks/streaming.py loaded at %s", __file__)
@@ -246,6 +253,7 @@ def run_stream(
actor_id: Optional[str] = None,
sources_sexp: Optional[str] = None,
audio_sexp: Optional[str] = None,
resume: bool = False,
) -> dict:
"""
Execute a streaming S-expression recipe.
@@ -259,13 +267,25 @@ def run_stream(
actor_id: User ID for friendly name resolution
sources_sexp: Optional sources config S-expression
audio_sexp: Optional audio config S-expression
resume: If True, load checkpoint and resume from where we left off
Returns:
Dict with output_cid, output_path, and status
"""
global _resolve_loop, _db_initialized
task_id = self.request.id
logger.info(f"Starting stream task {task_id} for run {run_id}")
logger.info(f"Starting stream task {task_id} for run {run_id} (resume={resume})")
# Handle graceful pause (SIGTERM from Celery revoke)
pause_requested = False
original_sigterm = signal.getsignal(signal.SIGTERM)
def handle_sigterm(signum, frame):
    """SIGTERM handler: request a graceful pause instead of dying mid-frame.

    Celery delivers revoke/terminate as SIGTERM (see the surrounding
    comment); rather than aborting immediately, we only set a flag here.
    The flag is honored at the next checkpoint boundary, after state has
    been persisted. Keep this handler minimal and async-signal-safe:
    set the flag first, then log.
    """
    nonlocal pause_requested
    pause_requested = True
    logger.info(f"Pause requested for run {run_id} (SIGTERM received)")
signal.signal(signal.SIGTERM, handle_sigterm)
self.update_state(state='INITIALIZING', meta={'progress': 0})
@@ -311,6 +331,28 @@ def run_stream(
# Import the streaming interpreter
from streaming.stream_sexp_generic import StreamInterpreter
# Load checkpoint if resuming
checkpoint = None
if resume:
import asyncio
import database
try:
if _resolve_loop is None or _resolve_loop.is_closed():
_resolve_loop = asyncio.new_event_loop()
asyncio.set_event_loop(_resolve_loop)
_db_initialized = False
if not _db_initialized:
_resolve_loop.run_until_complete(database.init_db())
_db_initialized = True
checkpoint = _resolve_loop.run_until_complete(database.get_run_checkpoint(run_id))
if checkpoint:
logger.info(f"Loaded checkpoint for run {run_id}: frame {checkpoint.get('frame_num')}")
else:
logger.warning(f"No checkpoint found for run {run_id}, starting from beginning")
except Exception as e:
logger.error(f"Failed to load checkpoint: {e}")
checkpoint = None
# Create interpreter (pass actor_id for friendly name resolution)
interp = StreamInterpreter(str(recipe_path), actor_id=actor_id)
@@ -361,6 +403,7 @@ def run_stream(
# Set up progress callback to update Celery task state
def on_progress(pct, frame_num, total_frames):
nonlocal pause_requested
# Scale progress: 5% (start) to 85% (before caching)
scaled_progress = 5 + (pct * 0.8) # 5% to 85%
self.update_state(state='RENDERING', meta={
@@ -372,9 +415,94 @@ def run_stream(
interp.on_progress = on_progress
# Set up checkpoint callback to save state at segment boundaries
def on_checkpoint(ckpt):
    """Save checkpoint state to database.

    Invoked by the interpreter at segment boundaries with a dict ``ckpt``
    containing at least 'frame_num' and 't', and optionally 'scans'.
    Persists the checkpoint so a later invocation with resume=True can
    continue from this frame, then honors any pending pause request.

    Raises:
        PauseRequested: if SIGTERM arrived since the last checkpoint —
            re-raised deliberately so the render stops only AFTER the
            checkpoint has been safely written.
    """
    nonlocal pause_requested
    global _resolve_loop, _db_initialized
    import asyncio
    import database
    try:
        # Lazily (re)create a private event loop to bridge this sync
        # callback into the async database API. A closed/absent loop
        # also forces DB re-initialization below.
        if _resolve_loop is None or _resolve_loop.is_closed():
            _resolve_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(_resolve_loop)
            _db_initialized = False
        if not _db_initialized:
            _resolve_loop.run_until_complete(database.init_db())
            _db_initialized = True
        # Get total frames from interpreter config
        total_frames = None
        if hasattr(interp, 'output') and hasattr(interp.output, '_frame_count'):
            # Estimate total frames based on duration
            fps_val = interp.config.get('fps', 30)
            # NOTE(review): picks the FIRST global exposing a .duration
            # attribute — assumes that object is the primary timeline;
            # confirm against StreamInterpreter's globals layout.
            for name, val in interp.globals.items():
                if hasattr(val, 'duration'):
                    total_frames = int(val.duration * fps_val)
                    break
        _resolve_loop.run_until_complete(database.update_pending_run_checkpoint(
            run_id=run_id,
            checkpoint_frame=ckpt['frame_num'],
            checkpoint_t=ckpt['t'],
            checkpoint_scans=ckpt.get('scans'),
            total_frames=total_frames,
        ))
        logger.info(f"Saved checkpoint for run {run_id}: frame {ckpt['frame_num']}")
        # Check if pause was requested after checkpoint
        if pause_requested:
            logger.info(f"Pause requested after checkpoint, raising PauseRequested")
            raise PauseRequested("Render paused by user")
    except PauseRequested:
        raise  # Re-raise to stop the render
    except Exception as e:
        # Best-effort persistence: a failed checkpoint save is logged but
        # must not kill an otherwise healthy render.
        logger.error(f"Failed to save checkpoint: {e}")
interp.on_checkpoint = on_checkpoint
# Build resume state for the interpreter (includes segment CIDs for output)
resume_from = None
if checkpoint:
resume_from = {
'frame_num': checkpoint.get('frame_num'),
't': checkpoint.get('t'),
'scans': checkpoint.get('scans', {}),
}
# Add segment CIDs if available (from quality_playlists in checkpoint)
# Note: We need to extract segment_cids from the output's state, which isn't
# directly stored. For now, the output will re-check existing segments on disk.
# Run rendering to file
logger.info(f"Rendering to {output_path}")
interp.run(duration=duration, output=str(output_path))
logger.info(f"Rendering to {output_path}" + (f" (resuming from frame {resume_from['frame_num']})" if resume_from else ""))
render_paused = False
try:
interp.run(duration=duration, output=str(output_path), resume_from=resume_from)
except PauseRequested:
# Graceful pause - checkpoint already saved
render_paused = True
logger.info(f"Render paused for run {run_id}")
# Restore original signal handler
signal.signal(signal.SIGTERM, original_sigterm)
if render_paused:
import asyncio
import database
try:
if _resolve_loop is None or _resolve_loop.is_closed():
_resolve_loop = asyncio.new_event_loop()
asyncio.set_event_loop(_resolve_loop)
_db_initialized = False
if not _db_initialized:
_resolve_loop.run_until_complete(database.init_db())
_db_initialized = True
_resolve_loop.run_until_complete(database.update_pending_run_status(run_id, 'paused'))
except Exception as e:
logger.error(f"Failed to update status to paused: {e}")
return {"status": "paused", "run_id": run_id, "task_id": task_id}
# Check for interpreter errors
if interp.errors:
@@ -395,7 +523,6 @@ def run_stream(
# Update pending run with playlist CID for live HLS redirect
if ipfs_playlist_cid:
global _resolve_loop, _db_initialized
import asyncio
import database
try: