Add IPFS HLS streaming and GPU optimizations

- Add IPFSHLSOutput class that uploads segments to IPFS as they're created - Update streaming task to use IPFS HLS output for distributed streaming - Add /ipfs-stream endpoint to get IPFS playlist URL - Update /stream endpoint to redirect to IPFS when available - Add GPU persistence mode (STREAMING_GPU_PERSIST=1) to keep frames on GPU - Add hardware video decoding (NVDEC) support for faster video processing - Add GPU-accelerated primitive libraries: blending_gpu, color_ops_gpu, geometry_gpu - Add streaming_gpu module with GPUFrame class for tracking CPU/GPU data location - Add Dockerfile.gpu for building GPU-enabled worker image Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 20:23:16 +00:00
parent 5bc655f8c8
commit 86830019ad
24 changed files with 4025 additions and 96 deletions
--- a/streaming/output.py
+++ b/streaming/output.py
@@ -5,14 +5,99 @@ Supports:
 - Display window (preview)
 - File output (recording)
 - Stream output (RTMP, etc.) - future
+- NVENC hardware encoding (auto-detected)
+- CuPy GPU arrays (auto-converted to numpy for output)
 """

 import numpy as np
 import subprocess
 from abc import ABC, abstractmethod
-from typing import Tuple, Optional
+from typing import Tuple, Optional, List, Union
 from pathlib import Path

+# Try to import CuPy for GPU array support
+try:
+    import cupy as cp
+    CUPY_AVAILABLE = True
+except ImportError:
+    cp = None
+    CUPY_AVAILABLE = False
+
+
+def ensure_numpy(frame: Union[np.ndarray, 'cp.ndarray']) -> np.ndarray:
+    """Convert frame to numpy array if it's a CuPy array."""
+    if CUPY_AVAILABLE and isinstance(frame, cp.ndarray):
+        return cp.asnumpy(frame)
+    return frame
+
+# Cache NVENC availability check
+_nvenc_available: Optional[bool] = None
+
+
+def check_nvenc_available() -> bool:
+    """Check if NVENC hardware encoding is available."""
+    global _nvenc_available
+    if _nvenc_available is not None:
+        return _nvenc_available
+
+    try:
+        result = subprocess.run(
+            ["ffmpeg", "-encoders"],
+            capture_output=True,
+            text=True,
+            timeout=5
+        )
+        _nvenc_available = "h264_nvenc" in result.stdout
+    except Exception:
+        _nvenc_available = False
+
+    return _nvenc_available
+
+
+def get_encoder_params(codec: str, preset: str, crf: int) -> List[str]:
+    """
+    Get encoder-specific FFmpeg parameters.
+
+    For NVENC (h264_nvenc, hevc_nvenc):
+    - Uses -cq for constant quality (similar to CRF)
+    - Presets: p1 (fastest) to p7 (slowest/best quality)
+    - Mapping: fast->p4, medium->p5, slow->p6
+
+    For libx264:
+    - Uses -crf for constant rate factor
+    - Presets: ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow
+    """
+    if codec in ("h264_nvenc", "hevc_nvenc"):
+        # Map libx264 presets to NVENC presets
+        nvenc_preset_map = {
+            "ultrafast": "p1",
+            "superfast": "p2",
+            "veryfast": "p3",
+            "faster": "p3",
+            "fast": "p4",
+            "medium": "p5",
+            "slow": "p6",
+            "slower": "p6",
+            "veryslow": "p7",
+        }
+        nvenc_preset = nvenc_preset_map.get(preset, "p4")
+
+        # NVENC quality: 0 (best) to 51 (worst), similar to CRF
+        # CRF 18 = high quality, CRF 23 = good quality
+        return [
+            "-c:v", codec,
+            "-preset", nvenc_preset,
+            "-cq", str(crf),  # Constant quality mode
+            "-rc", "vbr",     # Variable bitrate with quality target
+        ]
+    else:
+        # Standard libx264 params
+        return [
+            "-c:v", codec,
+            "-preset", preset,
+            "-crf", str(crf),
+        ]
+

 class Output(ABC):
    """Abstract base class for output targets."""
@@ -91,6 +176,9 @@ class DisplayOutput(Output):
        if not self._is_open:
            return

+        # Convert GPU array to numpy if needed
+        frame = ensure_numpy(frame)
+
        # Ensure frame is correct format
        if frame.dtype != np.uint8:
            frame = np.clip(frame, 0, 255).astype(np.uint8)
@@ -136,6 +224,9 @@ class DisplayOutput(Output):
 class FileOutput(Output):
    """
    Write frames to a video file using ffmpeg.
+
+    Automatically uses NVENC hardware encoding when available,
+    falling back to libx264 CPU encoding otherwise.
    """

    def __init__(
@@ -143,7 +234,7 @@ class FileOutput(Output):
        path: str,
        size: Tuple[int, int],
        fps: float = 30,
-        codec: str = "libx264",
+        codec: str = "auto",  # "auto", "h264_nvenc", "libx264"
        crf: int = 18,
        preset: str = "fast",
        audio_source: str = None,
@@ -153,6 +244,11 @@ class FileOutput(Output):
        self.fps = fps
        self._is_open = True

+        # Auto-detect NVENC
+        if codec == "auto":
+            codec = "h264_nvenc" if check_nvenc_available() else "libx264"
+        self.codec = codec
+
        # Build ffmpeg command
        cmd = [
            "ffmpeg", "-y",
@@ -170,12 +266,9 @@ class FileOutput(Output):
            # Explicitly map: video from input 0 (rawvideo), audio from input 1
            cmd.extend(["-map", "0:v", "-map", "1:a"])

-        cmd.extend([
-            "-c:v", codec,
-            "-preset", preset,
-            "-crf", str(crf),
-            "-pix_fmt", "yuv420p",
-        ])
+        # Get encoder-specific params
+        cmd.extend(get_encoder_params(codec, preset, crf))
+        cmd.extend(["-pix_fmt", "yuv420p"])

        # Add audio codec if we have audio
        if audio_source:
@@ -201,11 +294,20 @@ class FileOutput(Output):
            self._is_open = False
            return

+        # Convert GPU array to numpy if needed
+        frame = ensure_numpy(frame)
+
        # Resize if needed
        if frame.shape[1] != self.size[0] or frame.shape[0] != self.size[1]:
            import cv2
            frame = cv2.resize(frame, self.size)

+        # Ensure correct format
+        if frame.dtype != np.uint8:
+            frame = np.clip(frame, 0, 255).astype(np.uint8)
+        if not frame.flags['C_CONTIGUOUS']:
+            frame = np.ascontiguousarray(frame)
+
        try:
            self._process.stdin.write(frame.tobytes())
        except BrokenPipeError:
@@ -335,6 +437,9 @@ class PipeOutput(Output):
            self._is_open = False
            return

+        # Convert GPU array to numpy if needed
+        frame = ensure_numpy(frame)
+
        # Resize if needed
        if frame.shape[1] != self.size[0] or frame.shape[0] != self.size[1]:
            import cv2
@@ -371,3 +476,424 @@ class PipeOutput(Output):
        if self._process and self._process.poll() is not None:
            self._is_open = False
        return self._is_open
+
+
+class HLSOutput(Output):
+    """
+    Write frames as HLS stream (m3u8 playlist + .ts segments).
+
+    This enables true live streaming where the browser can poll
+    for new segments as they become available.
+
+    Automatically uses NVENC hardware encoding when available.
+    """
+
+    def __init__(
+        self,
+        output_dir: str,
+        size: Tuple[int, int],
+        fps: float = 30,
+        segment_duration: float = 4.0,  # 4s segments for stability
+        codec: str = "auto",  # "auto", "h264_nvenc", "libx264"
+        crf: int = 23,
+        preset: str = "fast",  # Better quality than ultrafast
+        audio_source: str = None,
+    ):
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.size = size
+        self.fps = fps
+        self.segment_duration = segment_duration
+        self._is_open = True
+
+        # Auto-detect NVENC
+        if codec == "auto":
+            codec = "h264_nvenc" if check_nvenc_available() else "libx264"
+        self.codec = codec
+
+        # HLS playlist path
+        self.playlist_path = self.output_dir / "stream.m3u8"
+
+        # Build ffmpeg command for HLS output
+        cmd = [
+            "ffmpeg", "-y",
+            "-f", "rawvideo",
+            "-vcodec", "rawvideo",
+            "-pix_fmt", "rgb24",
+            "-s", f"{size[0]}x{size[1]}",
+            "-r", str(fps),
+            "-i", "-",
+        ]
+
+        # Add audio input if provided
+        if audio_source:
+            cmd.extend(["-i", str(audio_source)])
+            cmd.extend(["-map", "0:v", "-map", "1:a"])
+
+        # Keyframe interval - must be exactly segment_duration for clean cuts
+        gop_size = int(fps * segment_duration)
+
+        # Get encoder-specific params
+        cmd.extend(get_encoder_params(codec, preset, crf))
+        cmd.extend([
+            "-pix_fmt", "yuv420p",
+            # Force keyframes at exact intervals for clean segment boundaries
+            "-g", str(gop_size),
+            "-keyint_min", str(gop_size),
+            "-sc_threshold", "0",  # Disable scene change detection
+            "-force_key_frames", f"expr:gte(t,n_forced*{segment_duration})",
+            # Reduce buffering for faster segment availability
+            "-flush_packets", "1",
+        ])
+
+        # Add audio codec if we have audio
+        if audio_source:
+            cmd.extend(["-c:a", "aac", "-b:a", "128k"])
+
+        # HLS specific options for smooth live streaming
+        cmd.extend([
+            "-f", "hls",
+            "-hls_time", str(segment_duration),
+            "-hls_list_size", "0",  # Keep all segments in playlist
+            "-hls_flags", "independent_segments+append_list+split_by_time",
+            "-hls_segment_type", "mpegts",
+            "-hls_segment_filename", str(self.output_dir / "segment_%05d.ts"),
+            str(self.playlist_path),
+        ])
+
+        import sys
+        print(f"HLSOutput cmd: {' '.join(cmd)}", file=sys.stderr)
+        self._process = subprocess.Popen(
+            cmd,
+            stdin=subprocess.PIPE,
+            stderr=None,  # Show errors for debugging
+        )
+
+        # Track segments for status reporting
+        self.segments_written = 0
+        self._last_segment_check = 0
+
+    def write(self, frame: np.ndarray, t: float):
+        """Write frame to HLS stream."""
+        if not self._is_open or self._process.poll() is not None:
+            self._is_open = False
+            return
+
+        # Convert GPU array to numpy if needed
+        frame = ensure_numpy(frame)
+
+        # Resize if needed
+        if frame.shape[1] != self.size[0] or frame.shape[0] != self.size[1]:
+            import cv2
+            frame = cv2.resize(frame, self.size)
+
+        # Ensure correct format
+        if frame.dtype != np.uint8:
+            frame = np.clip(frame, 0, 255).astype(np.uint8)
+        if not frame.flags['C_CONTIGUOUS']:
+            frame = np.ascontiguousarray(frame)
+
+        try:
+            self._process.stdin.write(frame.tobytes())
+        except BrokenPipeError:
+            self._is_open = False
+
+        # Periodically count segments
+        if t - self._last_segment_check > 1.0:
+            self._last_segment_check = t
+            self.segments_written = len(list(self.output_dir.glob("segment_*.ts")))
+
+    def close(self):
+        """Close the HLS stream."""
+        if self._process:
+            self._process.stdin.close()
+            self._process.wait()
+        self._is_open = False
+
+        # Final segment count
+        self.segments_written = len(list(self.output_dir.glob("segment_*.ts")))
+
+        # Mark playlist as ended (VOD mode)
+        if self.playlist_path.exists():
+            with open(self.playlist_path, "a") as f:
+                f.write("#EXT-X-ENDLIST\n")
+
+    @property
+    def is_open(self) -> bool:
+        return self._is_open and self._process.poll() is None
+
+
+class IPFSHLSOutput(Output):
+    """
+    Write frames as HLS stream with segments uploaded to IPFS.
+
+    Each segment is uploaded to IPFS as it's created, enabling distributed
+    streaming where clients can fetch segments from any IPFS gateway.
+
+    The m3u8 playlist is continuously updated with IPFS URLs and can be
+    fetched via get_playlist() or the playlist_cid property.
+    """
+
+    def __init__(
+        self,
+        output_dir: str,
+        size: Tuple[int, int],
+        fps: float = 30,
+        segment_duration: float = 4.0,
+        codec: str = "auto",
+        crf: int = 23,
+        preset: str = "fast",
+        audio_source: str = None,
+        ipfs_gateway: str = "https://ipfs.io/ipfs",
+    ):
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.size = size
+        self.fps = fps
+        self.segment_duration = segment_duration
+        self.ipfs_gateway = ipfs_gateway.rstrip("/")
+        self._is_open = True
+
+        # Auto-detect NVENC
+        if codec == "auto":
+            codec = "h264_nvenc" if check_nvenc_available() else "libx264"
+        self.codec = codec
+
+        # Track segment CIDs
+        self.segment_cids: dict = {}  # segment_number -> cid
+        self._last_segment_checked = -1
+        self._playlist_cid: Optional[str] = None
+
+        # Import IPFS client
+        from ipfs_client import add_file, add_bytes
+        self._ipfs_add_file = add_file
+        self._ipfs_add_bytes = add_bytes
+
+        # Local HLS paths
+        self.local_playlist_path = self.output_dir / "stream.m3u8"
+
+        # Build ffmpeg command for HLS output
+        cmd = [
+            "ffmpeg", "-y",
+            "-f", "rawvideo",
+            "-vcodec", "rawvideo",
+            "-pix_fmt", "rgb24",
+            "-s", f"{size[0]}x{size[1]}",
+            "-r", str(fps),
+            "-i", "-",
+        ]
+
+        # Add audio input if provided
+        if audio_source:
+            cmd.extend(["-i", str(audio_source)])
+            cmd.extend(["-map", "0:v", "-map", "1:a"])
+
+        # Keyframe interval
+        gop_size = int(fps * segment_duration)
+
+        # Get encoder-specific params
+        cmd.extend(get_encoder_params(codec, preset, crf))
+        cmd.extend([
+            "-pix_fmt", "yuv420p",
+            "-g", str(gop_size),
+            "-keyint_min", str(gop_size),
+            "-sc_threshold", "0",
+            "-force_key_frames", f"expr:gte(t,n_forced*{segment_duration})",
+            "-flush_packets", "1",
+        ])
+
+        # Add audio codec if we have audio
+        if audio_source:
+            cmd.extend(["-c:a", "aac", "-b:a", "128k"])
+
+        # HLS options
+        cmd.extend([
+            "-f", "hls",
+            "-hls_time", str(segment_duration),
+            "-hls_list_size", "0",
+            "-hls_flags", "independent_segments+append_list+split_by_time",
+            "-hls_segment_type", "mpegts",
+            "-hls_segment_filename", str(self.output_dir / "segment_%05d.ts"),
+            str(self.local_playlist_path),
+        ])
+
+        import sys
+        print(f"IPFSHLSOutput: starting ffmpeg", file=sys.stderr)
+        self._process = subprocess.Popen(
+            cmd,
+            stdin=subprocess.PIPE,
+            stderr=None,
+        )
+
+    def _upload_new_segments(self):
+        """Check for new segments and upload them to IPFS."""
+        import sys
+
+        # Find all segments
+        segments = sorted(self.output_dir.glob("segment_*.ts"))
+
+        for seg_path in segments:
+            # Extract segment number from filename
+            seg_name = seg_path.stem  # segment_00000
+            seg_num = int(seg_name.split("_")[1])
+
+            # Skip if already uploaded
+            if seg_num in self.segment_cids:
+                continue
+
+            # Skip if segment is still being written (check if file size is stable)
+            try:
+                size1 = seg_path.stat().st_size
+                if size1 == 0:
+                    continue  # Empty file, still being created
+
+                import time
+                time.sleep(0.1)
+                size2 = seg_path.stat().st_size
+                if size1 != size2:
+                    continue  # File still being written
+            except FileNotFoundError:
+                continue
+
+            # Upload to IPFS
+            cid = self._ipfs_add_file(seg_path, pin=True)
+            if cid:
+                self.segment_cids[seg_num] = cid
+                print(f"IPFS: segment_{seg_num:05d}.ts -> {cid}", file=sys.stderr)
+
+                # Update playlist after each segment upload
+                self._update_ipfs_playlist()
+
+    def _update_ipfs_playlist(self):
+        """Generate and upload IPFS-aware m3u8 playlist."""
+        if not self.segment_cids:
+            return
+
+        import sys
+
+        # Build m3u8 content with IPFS URLs
+        lines = [
+            "#EXTM3U",
+            "#EXT-X-VERSION:3",
+            f"#EXT-X-TARGETDURATION:{int(self.segment_duration) + 1}",
+            "#EXT-X-MEDIA-SEQUENCE:0",
+        ]
+
+        # Add segments in order
+        for seg_num in sorted(self.segment_cids.keys()):
+            cid = self.segment_cids[seg_num]
+            lines.append(f"#EXTINF:{self.segment_duration:.3f},")
+            lines.append(f"{self.ipfs_gateway}/{cid}")
+
+        playlist_content = "\n".join(lines) + "\n"
+
+        # Upload playlist to IPFS
+        cid = self._ipfs_add_bytes(playlist_content.encode("utf-8"), pin=True)
+        if cid:
+            self._playlist_cid = cid
+            print(f"IPFS: playlist updated -> {cid} ({len(self.segment_cids)} segments)", file=sys.stderr)
+
+    def write(self, frame: np.ndarray, t: float):
+        """Write frame to HLS stream and upload segments to IPFS."""
+        if not self._is_open or self._process.poll() is not None:
+            self._is_open = False
+            return
+
+        # Convert GPU array to numpy if needed
+        frame = ensure_numpy(frame)
+
+        # Resize if needed
+        if frame.shape[1] != self.size[0] or frame.shape[0] != self.size[1]:
+            import cv2
+            frame = cv2.resize(frame, self.size)
+
+        # Ensure correct format
+        if frame.dtype != np.uint8:
+            frame = np.clip(frame, 0, 255).astype(np.uint8)
+        if not frame.flags['C_CONTIGUOUS']:
+            frame = np.ascontiguousarray(frame)
+
+        try:
+            self._process.stdin.write(frame.tobytes())
+        except BrokenPipeError:
+            self._is_open = False
+            return
+
+        # Check for new segments periodically (every second)
+        current_segment = int(t / self.segment_duration)
+        if current_segment > self._last_segment_checked:
+            self._last_segment_checked = current_segment
+            self._upload_new_segments()
+
+    def close(self):
+        """Close the HLS stream and finalize IPFS uploads."""
+        import sys
+
+        if self._process:
+            self._process.stdin.close()
+            self._process.wait()
+        self._is_open = False
+
+        # Upload any remaining segments
+        self._upload_new_segments()
+
+        # Generate final playlist with #EXT-X-ENDLIST
+        if self.segment_cids:
+            lines = [
+                "#EXTM3U",
+                "#EXT-X-VERSION:3",
+                f"#EXT-X-TARGETDURATION:{int(self.segment_duration) + 1}",
+                "#EXT-X-MEDIA-SEQUENCE:0",
+                "#EXT-X-PLAYLIST-TYPE:VOD",
+            ]
+
+            for seg_num in sorted(self.segment_cids.keys()):
+                cid = self.segment_cids[seg_num]
+                lines.append(f"#EXTINF:{self.segment_duration:.3f},")
+                lines.append(f"{self.ipfs_gateway}/{cid}")
+
+            lines.append("#EXT-X-ENDLIST")
+            playlist_content = "\n".join(lines) + "\n"
+
+            cid = self._ipfs_add_bytes(playlist_content.encode("utf-8"), pin=True)
+            if cid:
+                self._playlist_cid = cid
+                print(f"IPFS: final playlist -> {cid} ({len(self.segment_cids)} segments)", file=sys.stderr)
+
+    @property
+    def playlist_cid(self) -> Optional[str]:
+        """Get the current playlist CID."""
+        return self._playlist_cid
+
+    @property
+    def playlist_url(self) -> Optional[str]:
+        """Get the full IPFS URL for the playlist."""
+        if self._playlist_cid:
+            return f"{self.ipfs_gateway}/{self._playlist_cid}"
+        return None
+
+    def get_playlist(self) -> str:
+        """Get the current m3u8 playlist content with IPFS URLs."""
+        if not self.segment_cids:
+            return "#EXTM3U\n"
+
+        lines = [
+            "#EXTM3U",
+            "#EXT-X-VERSION:3",
+            f"#EXT-X-TARGETDURATION:{int(self.segment_duration) + 1}",
+            "#EXT-X-MEDIA-SEQUENCE:0",
+        ]
+
+        for seg_num in sorted(self.segment_cids.keys()):
+            cid = self.segment_cids[seg_num]
+            lines.append(f"#EXTINF:{self.segment_duration:.3f},")
+            lines.append(f"{self.ipfs_gateway}/{cid}")
+
+        if not self._is_open:
+            lines.append("#EXT-X-ENDLIST")
+
+        return "\n".join(lines) + "\n"
+
+    @property
+    def is_open(self) -> bool:
+        return self._is_open and self._process.poll() is None