Add CI/CD workflow
Some checks failed
Build and Deploy / build-and-deploy (push) Failing after 1m2s
GPU Worker CI/CD / test (push) Failing after 1m4s
GPU Worker CI/CD / deploy (push) Has been skipped

This commit is contained in:
giles
2026-02-06 10:44:13 +00:00
parent 7411aa74c4
commit 48018d09b7
11 changed files with 934 additions and 20 deletions

View File

@@ -3,11 +3,13 @@ Streaming video rendering task.
Executes S-expression recipes for frame-by-frame video processing.
Supports CID and friendly name references for assets.
Supports pause/resume/restart for long renders.
"""
import hashlib
import logging
import os
import signal
import sys
import tempfile
from pathlib import Path
@@ -23,6 +25,11 @@ from cache_manager import get_cache_manager
logger = logging.getLogger(__name__)
class PauseRequested(Exception):
    """Signals a user-initiated graceful pause (delivered via SIGTERM).

    Raised from the checkpoint callback so the render loop unwinds only
    after state has been persisted; the docstring body suffices, so no
    explicit ``pass`` is needed.
    """
# NOTE(review): leftover import-time diagnostic. Downgraded from a bare
# print() to stderr to a debug-level log record so production stderr stays
# clean; module top level should be free of side effects. Lazy %-args avoid
# formatting when DEBUG logging is disabled.
logger.debug("DEBUG MODULE LOAD: tasks/streaming.py loaded at %s", __file__)
@@ -246,6 +253,7 @@ def run_stream(
actor_id: Optional[str] = None,
sources_sexp: Optional[str] = None,
audio_sexp: Optional[str] = None,
resume: bool = False,
) -> dict:
"""
Execute a streaming S-expression recipe.
@@ -259,13 +267,25 @@ def run_stream(
actor_id: User ID for friendly name resolution
sources_sexp: Optional sources config S-expression
audio_sexp: Optional audio config S-expression
resume: If True, load checkpoint and resume from where we left off
Returns:
Dict with output_cid, output_path, and status
"""
global _resolve_loop, _db_initialized
task_id = self.request.id
logger.info(f"Starting stream task {task_id} for run {run_id}")
logger.info(f"Starting stream task {task_id} for run {run_id} (resume={resume})")
# Handle graceful pause (SIGTERM from Celery revoke)
pause_requested = False
original_sigterm = signal.getsignal(signal.SIGTERM)
def handle_sigterm(signum, frame):
    """SIGTERM handler: request a graceful pause instead of dying mid-frame.

    Celery delivers revoke/terminate as SIGTERM (see the surrounding
    comment); rather than aborting immediately, we only set a flag here.
    The flag is honored at the next checkpoint boundary, after state has
    been persisted. Keep this handler minimal and async-signal-safe:
    set the flag first, then log.
    """
    nonlocal pause_requested
    pause_requested = True
    logger.info(f"Pause requested for run {run_id} (SIGTERM received)")
signal.signal(signal.SIGTERM, handle_sigterm)
self.update_state(state='INITIALIZING', meta={'progress': 0})
@@ -311,6 +331,28 @@ def run_stream(
# Import the streaming interpreter
from streaming.stream_sexp_generic import StreamInterpreter
# Load checkpoint if resuming
checkpoint = None
if resume:
import asyncio
import database
try:
if _resolve_loop is None or _resolve_loop.is_closed():
_resolve_loop = asyncio.new_event_loop()
asyncio.set_event_loop(_resolve_loop)
_db_initialized = False
if not _db_initialized:
_resolve_loop.run_until_complete(database.init_db())
_db_initialized = True
checkpoint = _resolve_loop.run_until_complete(database.get_run_checkpoint(run_id))
if checkpoint:
logger.info(f"Loaded checkpoint for run {run_id}: frame {checkpoint.get('frame_num')}")
else:
logger.warning(f"No checkpoint found for run {run_id}, starting from beginning")
except Exception as e:
logger.error(f"Failed to load checkpoint: {e}")
checkpoint = None
# Create interpreter (pass actor_id for friendly name resolution)
interp = StreamInterpreter(str(recipe_path), actor_id=actor_id)
@@ -361,6 +403,7 @@ def run_stream(
# Set up progress callback to update Celery task state
def on_progress(pct, frame_num, total_frames):
nonlocal pause_requested
# Scale progress: 5% (start) to 85% (before caching)
scaled_progress = 5 + (pct * 0.8) # 5% to 85%
self.update_state(state='RENDERING', meta={
@@ -372,9 +415,94 @@ def run_stream(
interp.on_progress = on_progress
# Set up checkpoint callback to save state at segment boundaries
def on_checkpoint(ckpt):
    """Save checkpoint state to database.

    Invoked by the interpreter at segment boundaries with a dict ``ckpt``
    containing at least 'frame_num' and 't', and optionally 'scans'.
    Persists the checkpoint so a later invocation with resume=True can
    continue from this frame, then honors any pending pause request.

    Raises:
        PauseRequested: if SIGTERM arrived since the last checkpoint —
            re-raised deliberately so the render stops only AFTER the
            checkpoint has been safely written.
    """
    nonlocal pause_requested
    global _resolve_loop, _db_initialized
    import asyncio
    import database
    try:
        # Lazily (re)create a private event loop to bridge this sync
        # callback into the async database API. A closed/absent loop
        # also forces DB re-initialization below.
        if _resolve_loop is None or _resolve_loop.is_closed():
            _resolve_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(_resolve_loop)
            _db_initialized = False
        if not _db_initialized:
            _resolve_loop.run_until_complete(database.init_db())
            _db_initialized = True
        # Get total frames from interpreter config
        total_frames = None
        if hasattr(interp, 'output') and hasattr(interp.output, '_frame_count'):
            # Estimate total frames based on duration
            fps_val = interp.config.get('fps', 30)
            # NOTE(review): picks the FIRST global exposing a .duration
            # attribute — assumes that object is the primary timeline;
            # confirm against StreamInterpreter's globals layout.
            for name, val in interp.globals.items():
                if hasattr(val, 'duration'):
                    total_frames = int(val.duration * fps_val)
                    break
        _resolve_loop.run_until_complete(database.update_pending_run_checkpoint(
            run_id=run_id,
            checkpoint_frame=ckpt['frame_num'],
            checkpoint_t=ckpt['t'],
            checkpoint_scans=ckpt.get('scans'),
            total_frames=total_frames,
        ))
        logger.info(f"Saved checkpoint for run {run_id}: frame {ckpt['frame_num']}")
        # Check if pause was requested after checkpoint
        if pause_requested:
            logger.info(f"Pause requested after checkpoint, raising PauseRequested")
            raise PauseRequested("Render paused by user")
    except PauseRequested:
        raise  # Re-raise to stop the render
    except Exception as e:
        # Best-effort persistence: a failed checkpoint save is logged but
        # must not kill an otherwise healthy render.
        logger.error(f"Failed to save checkpoint: {e}")
interp.on_checkpoint = on_checkpoint
# Build resume state for the interpreter (includes segment CIDs for output)
resume_from = None
if checkpoint:
resume_from = {
'frame_num': checkpoint.get('frame_num'),
't': checkpoint.get('t'),
'scans': checkpoint.get('scans', {}),
}
# Add segment CIDs if available (from quality_playlists in checkpoint)
# Note: We need to extract segment_cids from the output's state, which isn't
# directly stored. For now, the output will re-check existing segments on disk.
# Run rendering to file
logger.info(f"Rendering to {output_path}")
interp.run(duration=duration, output=str(output_path))
logger.info(f"Rendering to {output_path}" + (f" (resuming from frame {resume_from['frame_num']})" if resume_from else ""))
render_paused = False
try:
interp.run(duration=duration, output=str(output_path), resume_from=resume_from)
except PauseRequested:
# Graceful pause - checkpoint already saved
render_paused = True
logger.info(f"Render paused for run {run_id}")
# Restore original signal handler
signal.signal(signal.SIGTERM, original_sigterm)
if render_paused:
import asyncio
import database
try:
if _resolve_loop is None or _resolve_loop.is_closed():
_resolve_loop = asyncio.new_event_loop()
asyncio.set_event_loop(_resolve_loop)
_db_initialized = False
if not _db_initialized:
_resolve_loop.run_until_complete(database.init_db())
_db_initialized = True
_resolve_loop.run_until_complete(database.update_pending_run_status(run_id, 'paused'))
except Exception as e:
logger.error(f"Failed to update status to paused: {e}")
return {"status": "paused", "run_id": run_id, "task_id": task_id}
# Check for interpreter errors
if interp.errors:
@@ -395,7 +523,6 @@ def run_stream(
# Update pending run with playlist CID for live HLS redirect
if ipfs_playlist_cid:
global _resolve_loop, _db_initialized
import asyncio
import database
try: