Add IPFS HLS streaming and GPU optimizations
- Add IPFSHLSOutput class that uploads segments to IPFS as they're created - Update streaming task to use IPFS HLS output for distributed streaming - Add /ipfs-stream endpoint to get IPFS playlist URL - Update /stream endpoint to redirect to IPFS when available - Add GPU persistence mode (STREAMING_GPU_PERSIST=1) to keep frames on GPU - Add hardware video decoding (NVDEC) support for faster video processing - Add GPU-accelerated primitive libraries: blending_gpu, color_ops_gpu, geometry_gpu - Add streaming_gpu module with GPUFrame class for tracking CPU/GPU data location - Add Dockerfile.gpu for building GPU-enabled worker image Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -68,6 +68,8 @@ class NumpyBackend(Backend):
|
||||
|
||||
def load_effect(self, effect_path: Path) -> Any:
|
||||
"""Load an effect from sexp file."""
|
||||
if isinstance(effect_path, str):
|
||||
effect_path = Path(effect_path)
|
||||
effect_key = str(effect_path)
|
||||
if effect_key not in self._loaded_effects:
|
||||
interp = self._get_interpreter()
|
||||
@@ -260,23 +262,258 @@ class NumpyBackend(Backend):
|
||||
return np.clip(result, 0, 255).astype(np.uint8)
|
||||
|
||||
|
||||
class WGPUBackend(Backend):
    """
    GPU-based effect processing using wgpu/WebGPU compute shaders.

    Compiles sexp effects to WGSL at load time, executes on GPU.
    Achieves 30+ fps real-time processing on supported hardware.

    Requirements:
    - wgpu-py library
    - Vulkan-capable GPU (or software renderer)

    Falls back to NumpyBackend whenever the device cannot be created
    or an effect fails to compile.
    """

    def __init__(self, recipe_dir: Path = None):
        # Directory containing effect recipes; defaults to the CWD.
        self.recipe_dir = recipe_dir or Path(".")
        # wgpu device handle, created lazily by _ensure_device().
        self._device = None
        # effect path -> {'compiled', 'pipeline', 'name'} for GPU effects.
        self._loaded_effects: Dict[str, Any] = {}
        # CPU implementation used when the GPU path is unavailable.
        self._numpy_fallback = NumpyBackend(recipe_dir)
        # Reusable GPU buffers keyed by (width, height).
        self._buffer_pool: Dict[tuple, Dict] = {}
|
||||
|
||||
def _ensure_device(self):
    """Lazily initialize the wgpu device; safe to call repeatedly.

    On any failure (wgpu not installed, no usable adapter) the device
    stays None so callers fall back to the CPU path.
    """
    if self._device is not None:
        return

    try:
        import wgpu

        # Prefer a discrete GPU when more than one adapter exists.
        adapter = wgpu.gpu.request_adapter_sync(power_preference="high-performance")
        self._device = adapter.request_device_sync()
        print(f"[WGPUBackend] Using GPU: {adapter.info.get('device', 'unknown')}")
    except Exception as e:
        print(f"[WGPUBackend] GPU init failed: {e}, falling back to CPU")
        self._device = None
|
||||
|
||||
def load_effect(self, effect_path: Path) -> Any:
    """Load and compile an effect from a sexp file to WGSL.

    Returns the effect name on success (cached per path), or delegates
    to the numpy fallback's load_effect when the GPU is unavailable or
    compilation fails.
    """
    effect_key = str(effect_path)
    if effect_key in self._loaded_effects:
        # BUGFIX: the cache-miss path returns the effect *name*, but the
        # cache-hit path returned the whole info dict. Return the name
        # in both cases so callers see a consistent type.
        return self._loaded_effects[effect_key]['name']

    try:
        from sexp_effects.wgsl_compiler import compile_effect_file
        compiled = compile_effect_file(str(effect_path))

        self._ensure_device()
        if self._device is None:
            # GPU unavailable: delegate to the CPU implementation.
            return self._numpy_fallback.load_effect(effect_path)

        # Compile the generated WGSL into a compute pipeline.
        shader_module = self._device.create_shader_module(code=compiled.wgsl_code)
        pipeline = self._device.create_compute_pipeline(
            layout="auto",
            compute={"module": shader_module, "entry_point": "main"},
        )

        self._loaded_effects[effect_key] = {
            'compiled': compiled,
            'pipeline': pipeline,
            'name': compiled.name,
        }
        return compiled.name

    except Exception as e:
        print(f"[WGPUBackend] Failed to compile {effect_path}: {e}")
        # Fall back to numpy for this effect
        return self._numpy_fallback.load_effect(effect_path)
|
||||
|
||||
def _resolve_binding(self, value: Any, t: float, analysis_data: Dict) -> Any:
    """Resolve a parameter binding to its concrete value at time t.

    Binding semantics are identical on CPU and GPU, so this simply
    delegates to the numpy fallback backend's resolver.
    """
    fallback = self._numpy_fallback
    return fallback._resolve_binding(value, t, analysis_data)
|
||||
|
||||
def _get_or_create_buffers(self, w: int, h: int):
    """Return pooled GPU buffers for the given frame dimensions.

    Buffers are cached per (width, height) so consecutive frames of the
    same size reuse allocations instead of re-creating them every call.

    Returns a dict with 'staging', 'input', 'output', 'params' wgpu
    buffers plus 'size' (bytes per frame: one packed u32 per pixel).
    """
    import wgpu

    key = (w, h)
    cached = self._buffer_pool.get(key)
    if cached is not None:
        return cached

    size = w * h * 4  # u32 per pixel

    usage = wgpu.BufferUsage
    entry = {
        # Staging buffer for uploads (MAP_WRITE).
        'staging': self._device.create_buffer(
            size=size,
            usage=usage.MAP_WRITE | usage.COPY_SRC,
            mapped_at_creation=False,
        ),
        # Input buffer (STORAGE, receives data from staging).
        'input': self._device.create_buffer(
            size=size,
            usage=usage.STORAGE | usage.COPY_DST,
        ),
        # Output buffer (STORAGE + COPY_SRC for readback).
        'output': self._device.create_buffer(
            size=size,
            usage=usage.STORAGE | usage.COPY_SRC,
        ),
        # Uniform params block; 256 bytes covers dims/time + effect params.
        'params': self._device.create_buffer(
            size=256,
            usage=usage.UNIFORM | usage.COPY_DST,
        ),
        'size': size,
    }
    self._buffer_pool[key] = entry
    return entry
|
||||
|
||||
def _apply_effect_gpu(
    self,
    frame: np.ndarray,
    effect_name: str,
    params: Dict,
    t: float,
) -> Optional[np.ndarray]:
    """Apply a single effect to a frame on the GPU.

    Args:
        frame: HxWx3 uint8 RGB frame.
        effect_name: compiled effect name (as returned by load_effect).
        params: resolved parameter values keyed by param name.
        t: stream time in seconds (passed to the shader).

    Returns:
        Processed HxWx3 uint8 frame, or None when the effect has no
        compiled pipeline / no device exists (caller uses CPU fallback).
    """
    import struct

    # Locate the compiled pipeline by effect name (generator avoids the
    # explicit break-loop of the original).
    effect_info = next(
        (info for info in self._loaded_effects.values()
         if info.get('name') == effect_name),
        None,
    )
    if effect_info is None or self._device is None:
        return None

    compiled = effect_info['compiled']
    pipeline = effect_info['pipeline']

    h, w = frame.shape[:2]
    buffers = self._get_or_create_buffers(w, h)

    # Pack RGB into one u32 per pixel (0x00RRGGBB) for the shader.
    r = frame[:, :, 0].astype(np.uint32)
    g = frame[:, :, 1].astype(np.uint32)
    b = frame[:, :, 2].astype(np.uint32)
    packed = (r << 16) | (g << 8) | b
    input_data = np.ascontiguousarray(packed, dtype=np.uint32)

    # Upload via queue.write_buffer (avoids buffer re-creation).
    self._device.queue.write_buffer(buffers['input'], 0, input_data.tobytes())

    # Params struct: width(u32), height(u32), time(f32), then
    # effect-specific params in declaration order.
    param_values = [w, h, t]
    param_format = "IIf"
    for param in compiled.params:
        val = params.get(param.name, param.default)
        if val is None:
            val = 0
        if param.wgsl_type == 'f32':
            param_values.append(float(val))
            param_format += "f"
        elif param.wgsl_type == 'i32':
            param_values.append(int(val))
            param_format += "i"
        elif param.wgsl_type == 'u32':
            param_values.append(int(val))
            param_format += "I"

    # Pad to the 16-byte alignment WGSL uniform blocks require
    # (single expression instead of the original byte-append loop).
    param_bytes = struct.pack(param_format, *param_values)
    param_bytes += b'\x00' * (-len(param_bytes) % 16)
    self._device.queue.write_buffer(buffers['params'], 0, param_bytes)

    # Bind group must be rebuilt per effect (layouts differ per pipeline).
    bind_group = self._device.create_bind_group(
        layout=pipeline.get_bind_group_layout(0),
        entries=[
            {"binding": 0, "resource": {"buffer": buffers['input']}},
            {"binding": 1, "resource": {"buffer": buffers['output']}},
            {"binding": 2, "resource": {"buffer": buffers['params']}},
        ],
    )

    # One 16x16 workgroup per tile covering the frame.
    encoder = self._device.create_command_encoder()
    compute_pass = encoder.begin_compute_pass()
    compute_pass.set_pipeline(pipeline)
    compute_pass.set_bind_group(0, bind_group)
    compute_pass.dispatch_workgroups((w + 15) // 16, (h + 15) // 16, 1)
    compute_pass.end()
    self._device.queue.submit([encoder.finish()])

    # Read back and unpack u32 -> RGB. np.empty: every cell is assigned.
    result_data = self._device.queue.read_buffer(buffers['output'])
    result_packed = np.frombuffer(result_data, dtype=np.uint32).reshape(h, w)
    result = np.empty((h, w, 3), dtype=np.uint8)
    result[:, :, 0] = (result_packed >> 16) & 0xFF
    result[:, :, 1] = (result_packed >> 8) & 0xFF
    result[:, :, 2] = result_packed & 0xFF
    return result
|
||||
|
||||
def _apply_effect(
    self,
    frame: np.ndarray,
    effect_name: str,
    params: Dict,
    t: float,
    analysis_data: Dict,
) -> np.ndarray:
    """Apply a single effect to a frame, preferring the GPU path.

    Bindings are resolved to concrete values for the GPU call; the CPU
    fallback receives the raw params and applies its own resolution.
    """
    # Resolve bindings, skipping structural keys that are not params.
    skip_keys = ("effect", "effect_path", "cid", "analysis_refs")
    resolved_params = {"_time": t}
    for key, value in params.items():
        if key not in skip_keys:
            resolved_params[key] = self._resolve_binding(value, t, analysis_data)

    # GPU first; None means the effect could not run on the device.
    self._ensure_device()
    if self._device is not None:
        gpu_result = self._apply_effect_gpu(frame, effect_name, resolved_params, t)
        if gpu_result is not None:
            return gpu_result

    # CPU fallback.
    return self._numpy_fallback._apply_effect(
        frame, effect_name, params, t, analysis_data
    )
|
||||
|
||||
def process_frame(
|
||||
self,
|
||||
@@ -286,7 +523,34 @@ class GLSLBackend(Backend):
|
||||
t: float,
|
||||
analysis_data: Dict,
|
||||
) -> np.ndarray:
|
||||
pass
|
||||
"""Process frames through effects and composite."""
|
||||
if not frames:
|
||||
return np.zeros((720, 1280, 3), dtype=np.uint8)
|
||||
|
||||
processed = []
|
||||
|
||||
# Apply effects to each input frame
|
||||
for i, (frame, effects) in enumerate(zip(frames, effects_per_frame)):
|
||||
result = frame.copy()
|
||||
for effect_config in effects:
|
||||
effect_name = effect_config.get("effect", "")
|
||||
if effect_name:
|
||||
result = self._apply_effect(
|
||||
result, effect_name, effect_config, t, analysis_data
|
||||
)
|
||||
processed.append(result)
|
||||
|
||||
# Composite layers (use numpy backend for now)
|
||||
if len(processed) == 1:
|
||||
return processed[0]
|
||||
|
||||
return self._numpy_fallback._composite(
|
||||
processed, compositor_config, t, analysis_data
|
||||
)
|
||||
|
||||
|
||||
# Backwards-compatibility: older code imported the GPU backend as
# GLSLBackend, so keep that name pointing at the wgpu implementation.
GLSLBackend = WGPUBackend
|
||||
|
||||
|
||||
def get_backend(name: str = "numpy", **kwargs) -> Backend:
    """
    Get a backend by name.

    Args:
        name: "numpy" for the CPU backend, or "wgpu" / "gpu" / "glsl"
            (legacy alias) for the GPU backend
        **kwargs: Backend-specific options

    Returns:
        Backend instance

    Raises:
        ValueError: if name does not match a known backend.
    """
    # BUGFIX: the old standalone `elif name == "glsl"` branch shadowed
    # the combined ("wgpu", "glsl", "gpu") branch; with GLSLBackend now
    # aliased to WGPUBackend a single combined branch is correct.
    if name == "numpy":
        return NumpyBackend(**kwargs)
    elif name in ("wgpu", "glsl", "gpu"):
        return WGPUBackend(**kwargs)
    else:
        raise ValueError(f"Unknown backend: {name}")
|
||||
|
||||
@@ -5,14 +5,99 @@ Supports:
|
||||
- Display window (preview)
|
||||
- File output (recording)
|
||||
- Stream output (RTMP, etc.) - future
|
||||
- NVENC hardware encoding (auto-detected)
|
||||
- CuPy GPU arrays (auto-converted to numpy for output)
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import subprocess
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Tuple, Optional
|
||||
from typing import Tuple, Optional, List, Union
|
||||
from pathlib import Path
|
||||
|
||||
# Optional CuPy support: frames may arrive as GPU arrays when the
# GPU-persistence pipeline is active.
try:
    import cupy as cp
    CUPY_AVAILABLE = True
except ImportError:
    cp = None
    CUPY_AVAILABLE = False


def ensure_numpy(frame: Union[np.ndarray, 'cp.ndarray']) -> np.ndarray:
    """Return *frame* as a numpy array, downloading from the GPU if needed.

    Anything that is not a CuPy array is returned unchanged.
    """
    is_gpu_array = CUPY_AVAILABLE and isinstance(frame, cp.ndarray)
    return cp.asnumpy(frame) if is_gpu_array else frame
|
||||
|
||||
# Cached result of the NVENC probe (None = not yet probed).
_nvenc_available: Optional[bool] = None


def check_nvenc_available() -> bool:
    """Return True when ffmpeg reports the h264_nvenc encoder.

    The subprocess probe runs at most once per process; the result is
    cached in a module-level flag. Any failure (ffmpeg missing, timeout)
    is treated as "not available".
    """
    global _nvenc_available
    if _nvenc_available is None:
        try:
            probe = subprocess.run(
                ["ffmpeg", "-encoders"],
                capture_output=True,
                text=True,
                timeout=5,
            )
        except Exception:
            _nvenc_available = False
        else:
            _nvenc_available = "h264_nvenc" in probe.stdout
    return _nvenc_available
|
||||
|
||||
|
||||
def get_encoder_params(codec: str, preset: str, crf: int) -> List[str]:
    """
    Get encoder-specific FFmpeg parameters.

    For NVENC (h264_nvenc, hevc_nvenc):
    - Uses -cq for constant quality (similar to CRF)
    - Presets: p1 (fastest) to p7 (slowest/best quality)
    - Mapping: fast->p4, medium->p5, slow->p6

    For libx264:
    - Uses -crf for constant rate factor
    - Presets: ultrafast, superfast, veryfast, faster, fast, medium,
      slow, slower, veryslow
    """
    if codec not in ("h264_nvenc", "hevc_nvenc"):
        # Standard libx264-style rate control.
        return ["-c:v", codec, "-preset", preset, "-crf", str(crf)]

    # Translate libx264 preset names onto NVENC's p1..p7 scale;
    # unknown presets land on the balanced p4.
    nvenc_preset = {
        "ultrafast": "p1",
        "superfast": "p2",
        "veryfast": "p3",
        "faster": "p3",
        "fast": "p4",
        "medium": "p5",
        "slow": "p6",
        "slower": "p6",
        "veryslow": "p7",
    }.get(preset, "p4")

    # NVENC constant-quality mode: 0 (best) .. 51 (worst), like CRF.
    return [
        "-c:v", codec,
        "-preset", nvenc_preset,
        "-cq", str(crf),
        "-rc", "vbr",
    ]
|
||||
|
||||
|
||||
class Output(ABC):
|
||||
"""Abstract base class for output targets."""
|
||||
@@ -91,6 +176,9 @@ class DisplayOutput(Output):
|
||||
if not self._is_open:
|
||||
return
|
||||
|
||||
# Convert GPU array to numpy if needed
|
||||
frame = ensure_numpy(frame)
|
||||
|
||||
# Ensure frame is correct format
|
||||
if frame.dtype != np.uint8:
|
||||
frame = np.clip(frame, 0, 255).astype(np.uint8)
|
||||
@@ -136,6 +224,9 @@ class DisplayOutput(Output):
|
||||
class FileOutput(Output):
|
||||
"""
|
||||
Write frames to a video file using ffmpeg.
|
||||
|
||||
Automatically uses NVENC hardware encoding when available,
|
||||
falling back to libx264 CPU encoding otherwise.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -143,7 +234,7 @@ class FileOutput(Output):
|
||||
path: str,
|
||||
size: Tuple[int, int],
|
||||
fps: float = 30,
|
||||
codec: str = "libx264",
|
||||
codec: str = "auto", # "auto", "h264_nvenc", "libx264"
|
||||
crf: int = 18,
|
||||
preset: str = "fast",
|
||||
audio_source: str = None,
|
||||
@@ -153,6 +244,11 @@ class FileOutput(Output):
|
||||
self.fps = fps
|
||||
self._is_open = True
|
||||
|
||||
# Auto-detect NVENC
|
||||
if codec == "auto":
|
||||
codec = "h264_nvenc" if check_nvenc_available() else "libx264"
|
||||
self.codec = codec
|
||||
|
||||
# Build ffmpeg command
|
||||
cmd = [
|
||||
"ffmpeg", "-y",
|
||||
@@ -170,12 +266,9 @@ class FileOutput(Output):
|
||||
# Explicitly map: video from input 0 (rawvideo), audio from input 1
|
||||
cmd.extend(["-map", "0:v", "-map", "1:a"])
|
||||
|
||||
cmd.extend([
|
||||
"-c:v", codec,
|
||||
"-preset", preset,
|
||||
"-crf", str(crf),
|
||||
"-pix_fmt", "yuv420p",
|
||||
])
|
||||
# Get encoder-specific params
|
||||
cmd.extend(get_encoder_params(codec, preset, crf))
|
||||
cmd.extend(["-pix_fmt", "yuv420p"])
|
||||
|
||||
# Add audio codec if we have audio
|
||||
if audio_source:
|
||||
@@ -201,11 +294,20 @@ class FileOutput(Output):
|
||||
self._is_open = False
|
||||
return
|
||||
|
||||
# Convert GPU array to numpy if needed
|
||||
frame = ensure_numpy(frame)
|
||||
|
||||
# Resize if needed
|
||||
if frame.shape[1] != self.size[0] or frame.shape[0] != self.size[1]:
|
||||
import cv2
|
||||
frame = cv2.resize(frame, self.size)
|
||||
|
||||
# Ensure correct format
|
||||
if frame.dtype != np.uint8:
|
||||
frame = np.clip(frame, 0, 255).astype(np.uint8)
|
||||
if not frame.flags['C_CONTIGUOUS']:
|
||||
frame = np.ascontiguousarray(frame)
|
||||
|
||||
try:
|
||||
self._process.stdin.write(frame.tobytes())
|
||||
except BrokenPipeError:
|
||||
@@ -335,6 +437,9 @@ class PipeOutput(Output):
|
||||
self._is_open = False
|
||||
return
|
||||
|
||||
# Convert GPU array to numpy if needed
|
||||
frame = ensure_numpy(frame)
|
||||
|
||||
# Resize if needed
|
||||
if frame.shape[1] != self.size[0] or frame.shape[0] != self.size[1]:
|
||||
import cv2
|
||||
@@ -371,3 +476,424 @@ class PipeOutput(Output):
|
||||
if self._process and self._process.poll() is not None:
|
||||
self._is_open = False
|
||||
return self._is_open
|
||||
|
||||
|
||||
class HLSOutput(Output):
    """
    Write frames as HLS stream (m3u8 playlist + .ts segments).

    This enables true live streaming where the browser can poll
    for new segments as they become available.

    Automatically uses NVENC hardware encoding when available.
    """

    def __init__(
        self,
        output_dir: str,
        size: Tuple[int, int],
        fps: float = 30,
        segment_duration: float = 4.0,  # 4s segments for stability
        codec: str = "auto",  # "auto", "h264_nvenc", "libx264"
        crf: int = 23,
        preset: str = "fast",  # Better quality than ultrafast
        audio_source: str = None,
    ):
        """Start an ffmpeg process that muxes raw RGB frames into HLS.

        Args:
            output_dir: directory receiving stream.m3u8 + segment files.
            size: (width, height) of the output video.
            fps: output frame rate.
            segment_duration: target seconds per HLS segment.
            codec: "auto" probes for NVENC, otherwise an explicit encoder.
            crf: quality target (CRF for x264, CQ for NVENC).
            preset: encoder preset in libx264 naming.
            audio_source: optional path of an audio file to mux in.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.size = size
        self.fps = fps
        self.segment_duration = segment_duration
        self._is_open = True

        # Auto-detect NVENC
        if codec == "auto":
            codec = "h264_nvenc" if check_nvenc_available() else "libx264"
        self.codec = codec

        # HLS playlist path
        self.playlist_path = self.output_dir / "stream.m3u8"

        # Raw RGB frames arrive on stdin.
        cmd = [
            "ffmpeg", "-y",
            "-f", "rawvideo",
            "-vcodec", "rawvideo",
            "-pix_fmt", "rgb24",
            "-s", f"{size[0]}x{size[1]}",
            "-r", str(fps),
            "-i", "-",
        ]

        # Optional audio input, explicitly mapped alongside the video.
        if audio_source:
            cmd.extend(["-i", str(audio_source)])
            cmd.extend(["-map", "0:v", "-map", "1:a"])

        # Keyframe interval - must be exactly segment_duration for clean cuts
        gop_size = int(fps * segment_duration)

        # Get encoder-specific params
        cmd.extend(get_encoder_params(codec, preset, crf))
        cmd.extend([
            "-pix_fmt", "yuv420p",
            # Force keyframes at exact intervals for clean segment boundaries
            "-g", str(gop_size),
            "-keyint_min", str(gop_size),
            "-sc_threshold", "0",  # Disable scene change detection
            "-force_key_frames", f"expr:gte(t,n_forced*{segment_duration})",
            # Reduce buffering for faster segment availability
            "-flush_packets", "1",
        ])

        # Add audio codec if we have audio
        if audio_source:
            cmd.extend(["-c:a", "aac", "-b:a", "128k"])

        # HLS specific options for smooth live streaming
        cmd.extend([
            "-f", "hls",
            "-hls_time", str(segment_duration),
            "-hls_list_size", "0",  # Keep all segments in playlist
            "-hls_flags", "independent_segments+append_list+split_by_time",
            "-hls_segment_type", "mpegts",
            "-hls_segment_filename", str(self.output_dir / "segment_%05d.ts"),
            str(self.playlist_path),
        ])

        import sys
        print(f"HLSOutput cmd: {' '.join(cmd)}", file=sys.stderr)
        self._process = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stderr=None,  # Show errors for debugging
        )

        # Track segments for status reporting
        self.segments_written = 0
        self._last_segment_check = 0

    def write(self, frame: np.ndarray, t: float):
        """Write one RGB frame (at stream time t) to the HLS encoder."""
        if not self._is_open or self._process.poll() is not None:
            self._is_open = False
            return

        # Convert GPU array to numpy if needed
        frame = ensure_numpy(frame)

        # Resize if needed
        if frame.shape[1] != self.size[0] or frame.shape[0] != self.size[1]:
            import cv2
            frame = cv2.resize(frame, self.size)

        # Ensure correct format
        if frame.dtype != np.uint8:
            frame = np.clip(frame, 0, 255).astype(np.uint8)
        if not frame.flags['C_CONTIGUOUS']:
            frame = np.ascontiguousarray(frame)

        try:
            self._process.stdin.write(frame.tobytes())
        except BrokenPipeError:
            # BUGFIX: bail out here — the encoder is gone, so counting
            # segments below would only report stale state.
            self._is_open = False
            return

        # Periodically count segments
        if t - self._last_segment_check > 1.0:
            self._last_segment_check = t
            self.segments_written = len(list(self.output_dir.glob("segment_*.ts")))

    def close(self):
        """Flush and stop ffmpeg, then finalize the playlist as VOD."""
        if self._process:
            self._process.stdin.close()
            self._process.wait()
        self._is_open = False

        # Final segment count
        self.segments_written = len(list(self.output_dir.glob("segment_*.ts")))

        # Mark playlist as ended (VOD mode). BUGFIX: only append the tag
        # if ffmpeg did not already write one on clean shutdown, otherwise
        # the playlist ends up with a duplicate #EXT-X-ENDLIST.
        if self.playlist_path.exists():
            content = self.playlist_path.read_text()
            if "#EXT-X-ENDLIST" not in content:
                with open(self.playlist_path, "a") as f:
                    f.write("#EXT-X-ENDLIST\n")

    @property
    def is_open(self) -> bool:
        # Open only while our flag is set AND ffmpeg is still running.
        return self._is_open and self._process.poll() is None
|
||||
|
||||
|
||||
class IPFSHLSOutput(Output):
    """
    Write frames as HLS stream with segments uploaded to IPFS.

    Each segment is uploaded to IPFS as it's created, enabling distributed
    streaming where clients can fetch segments from any IPFS gateway.

    The m3u8 playlist is continuously updated with IPFS URLs and can be
    fetched via get_playlist() or the playlist_cid property.
    """

    def __init__(
        self,
        output_dir: str,
        size: Tuple[int, int],
        fps: float = 30,
        segment_duration: float = 4.0,
        codec: str = "auto",
        crf: int = 23,
        preset: str = "fast",
        audio_source: str = None,
        ipfs_gateway: str = "https://ipfs.io/ipfs",
    ):
        """Start ffmpeg producing local HLS; segments are mirrored to IPFS.

        Args mirror HLSOutput, plus:
            ipfs_gateway: base URL prepended to CIDs in generated playlists.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.size = size
        self.fps = fps
        self.segment_duration = segment_duration
        self.ipfs_gateway = ipfs_gateway.rstrip("/")
        self._is_open = True

        # Auto-detect NVENC
        if codec == "auto":
            codec = "h264_nvenc" if check_nvenc_available() else "libx264"
        self.codec = codec

        # segment_number -> CID for every segment already on IPFS.
        self.segment_cids: dict = {}
        self._last_segment_checked = -1
        self._playlist_cid: Optional[str] = None

        # Import IPFS client (kept in __init__ so failures surface early).
        from ipfs_client import add_file, add_bytes
        self._ipfs_add_file = add_file
        self._ipfs_add_bytes = add_bytes

        # Local HLS paths
        self.local_playlist_path = self.output_dir / "stream.m3u8"

        # Raw RGB frames arrive on stdin.
        cmd = [
            "ffmpeg", "-y",
            "-f", "rawvideo",
            "-vcodec", "rawvideo",
            "-pix_fmt", "rgb24",
            "-s", f"{size[0]}x{size[1]}",
            "-r", str(fps),
            "-i", "-",
        ]

        # Add audio input if provided
        if audio_source:
            cmd.extend(["-i", str(audio_source)])
            cmd.extend(["-map", "0:v", "-map", "1:a"])

        # Keyframe interval
        gop_size = int(fps * segment_duration)

        # Get encoder-specific params
        cmd.extend(get_encoder_params(codec, preset, crf))
        cmd.extend([
            "-pix_fmt", "yuv420p",
            "-g", str(gop_size),
            "-keyint_min", str(gop_size),
            "-sc_threshold", "0",
            "-force_key_frames", f"expr:gte(t,n_forced*{segment_duration})",
            "-flush_packets", "1",
        ])

        # Add audio codec if we have audio
        if audio_source:
            cmd.extend(["-c:a", "aac", "-b:a", "128k"])

        # HLS options
        cmd.extend([
            "-f", "hls",
            "-hls_time", str(segment_duration),
            "-hls_list_size", "0",
            "-hls_flags", "independent_segments+append_list+split_by_time",
            "-hls_segment_type", "mpegts",
            "-hls_segment_filename", str(self.output_dir / "segment_%05d.ts"),
            str(self.local_playlist_path),
        ])

        import sys
        print(f"IPFSHLSOutput: starting ffmpeg", file=sys.stderr)
        self._process = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stderr=None,
        )

    def _render_playlist(self, vod: bool = False, end: bool = False) -> str:
        """Render an m3u8 playlist over the current segment CIDs.

        Shared by the live update, final close, and get_playlist paths
        (previously three hand-rolled copies of the same logic).
        """
        lines = [
            "#EXTM3U",
            "#EXT-X-VERSION:3",
            f"#EXT-X-TARGETDURATION:{int(self.segment_duration) + 1}",
            "#EXT-X-MEDIA-SEQUENCE:0",
        ]
        if vod:
            lines.append("#EXT-X-PLAYLIST-TYPE:VOD")
        for seg_num in sorted(self.segment_cids):
            cid = self.segment_cids[seg_num]
            lines.append(f"#EXTINF:{self.segment_duration:.3f},")
            lines.append(f"{self.ipfs_gateway}/{cid}")
        if end:
            lines.append("#EXT-X-ENDLIST")
        return "\n".join(lines) + "\n"

    def _upload_new_segments(self):
        """Check for new finished segments and upload them to IPFS."""
        import sys
        import time

        uploaded_any = False
        for seg_path in sorted(self.output_dir.glob("segment_*.ts")):
            # Filename pattern is segment_NNNNN.ts.
            seg_num = int(seg_path.stem.split("_")[1])

            # Skip if already uploaded
            if seg_num in self.segment_cids:
                continue

            # Skip segments ffmpeg is still writing: size must be
            # non-zero and stable across a short delay.
            try:
                size1 = seg_path.stat().st_size
                if size1 == 0:
                    continue
                time.sleep(0.1)
                if seg_path.stat().st_size != size1:
                    continue
            except FileNotFoundError:
                continue

            # Upload to IPFS
            cid = self._ipfs_add_file(seg_path, pin=True)
            if cid:
                self.segment_cids[seg_num] = cid
                uploaded_any = True
                print(f"IPFS: segment_{seg_num:05d}.ts -> {cid}", file=sys.stderr)

        # IMPROVEMENT: publish the playlist once per batch instead of
        # re-uploading it after every individual segment.
        if uploaded_any:
            self._update_ipfs_playlist()

    def _update_ipfs_playlist(self):
        """Generate and upload the live (non-ended) IPFS-aware playlist."""
        if not self.segment_cids:
            return

        import sys

        playlist_content = self._render_playlist()
        cid = self._ipfs_add_bytes(playlist_content.encode("utf-8"), pin=True)
        if cid:
            self._playlist_cid = cid
            print(f"IPFS: playlist updated -> {cid} ({len(self.segment_cids)} segments)", file=sys.stderr)

    def write(self, frame: np.ndarray, t: float):
        """Write frame to HLS stream and upload segments to IPFS."""
        if not self._is_open or self._process.poll() is not None:
            self._is_open = False
            return

        # Convert GPU array to numpy if needed
        frame = ensure_numpy(frame)

        # Resize if needed
        if frame.shape[1] != self.size[0] or frame.shape[0] != self.size[1]:
            import cv2
            frame = cv2.resize(frame, self.size)

        # Ensure correct format
        if frame.dtype != np.uint8:
            frame = np.clip(frame, 0, 255).astype(np.uint8)
        if not frame.flags['C_CONTIGUOUS']:
            frame = np.ascontiguousarray(frame)

        try:
            self._process.stdin.write(frame.tobytes())
        except BrokenPipeError:
            self._is_open = False
            return

        # Check for new segments once per segment boundary crossed.
        current_segment = int(t / self.segment_duration)
        if current_segment > self._last_segment_checked:
            self._last_segment_checked = current_segment
            self._upload_new_segments()

    def close(self):
        """Close the HLS stream and finalize IPFS uploads."""
        import sys

        if self._process:
            self._process.stdin.close()
            self._process.wait()
        self._is_open = False

        # Upload any remaining segments
        self._upload_new_segments()

        # Publish the final VOD playlist with #EXT-X-ENDLIST.
        if self.segment_cids:
            playlist_content = self._render_playlist(vod=True, end=True)
            cid = self._ipfs_add_bytes(playlist_content.encode("utf-8"), pin=True)
            if cid:
                self._playlist_cid = cid
                print(f"IPFS: final playlist -> {cid} ({len(self.segment_cids)} segments)", file=sys.stderr)

    @property
    def playlist_cid(self) -> Optional[str]:
        """Get the current playlist CID."""
        return self._playlist_cid

    @property
    def playlist_url(self) -> Optional[str]:
        """Get the full IPFS URL for the playlist."""
        if self._playlist_cid:
            return f"{self.ipfs_gateway}/{self._playlist_cid}"
        return None

    def get_playlist(self) -> str:
        """Get the current m3u8 playlist content with IPFS URLs."""
        if not self.segment_cids:
            return "#EXTM3U\n"
        # ENDLIST is appended once the stream is closed.
        return self._render_playlist(end=not self._is_open)

    @property
    def is_open(self) -> bool:
        # Open only while our flag is set AND ffmpeg is still running.
        return self._is_open and self._process.poll() is None
|
||||
@@ -159,36 +159,51 @@ class StreamInterpreter:
|
||||
return config
|
||||
|
||||
def _load_primitives(self, lib_name: str):
|
||||
"""Load primitives from a Python library file."""
|
||||
"""Load primitives from a Python library file.
|
||||
|
||||
Prefers GPU-accelerated versions (*_gpu.py) when available.
|
||||
"""
|
||||
import importlib.util
|
||||
|
||||
lib_paths = [
|
||||
self.primitive_lib_dir / f"{lib_name}.py",
|
||||
self.sexp_dir / "primitive_libs" / f"{lib_name}.py",
|
||||
self.sexp_dir.parent / "sexp_effects" / "primitive_libs" / f"{lib_name}.py",
|
||||
]
|
||||
# Try GPU version first, then fall back to CPU version
|
||||
lib_names_to_try = [f"{lib_name}_gpu", lib_name]
|
||||
|
||||
lib_path = None
|
||||
for p in lib_paths:
|
||||
if p.exists():
|
||||
lib_path = p
|
||||
actual_lib_name = lib_name
|
||||
|
||||
for try_lib in lib_names_to_try:
|
||||
lib_paths = [
|
||||
self.primitive_lib_dir / f"{try_lib}.py",
|
||||
self.sexp_dir / "primitive_libs" / f"{try_lib}.py",
|
||||
self.sexp_dir.parent / "sexp_effects" / "primitive_libs" / f"{try_lib}.py",
|
||||
]
|
||||
for p in lib_paths:
|
||||
if p.exists():
|
||||
lib_path = p
|
||||
actual_lib_name = try_lib
|
||||
break
|
||||
if lib_path:
|
||||
break
|
||||
|
||||
if not lib_path:
|
||||
print(f"Warning: primitive library '{lib_name}' not found", file=sys.stderr)
|
||||
return
|
||||
|
||||
spec = importlib.util.spec_from_file_location(lib_name, lib_path)
|
||||
spec = importlib.util.spec_from_file_location(actual_lib_name, lib_path)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
|
||||
# Check if this is a GPU-accelerated module
|
||||
is_gpu = actual_lib_name.endswith('_gpu')
|
||||
gpu_tag = " [GPU]" if is_gpu else ""
|
||||
|
||||
count = 0
|
||||
for name in dir(module):
|
||||
if name.startswith('prim_'):
|
||||
func = getattr(module, name)
|
||||
prim_name = name[5:]
|
||||
dash_name = prim_name.replace('_', '-')
|
||||
# Register ONLY with namespace (geometry:ripple-displace)
|
||||
# Register with original lib_name namespace (geometry:rotate, not geometry_gpu:rotate)
|
||||
# Don't overwrite if already registered (allows pre-registration of overrides)
|
||||
key = f"{lib_name}:{dash_name}"
|
||||
if key not in self.primitives:
|
||||
@@ -199,7 +214,7 @@ class StreamInterpreter:
|
||||
prims = getattr(module, 'PRIMITIVES')
|
||||
if isinstance(prims, dict):
|
||||
for name, func in prims.items():
|
||||
# Register ONLY with namespace
|
||||
# Register with original lib_name namespace
|
||||
# Don't overwrite if already registered
|
||||
dash_name = name.replace('_', '-')
|
||||
key = f"{lib_name}:{dash_name}"
|
||||
@@ -207,7 +222,7 @@ class StreamInterpreter:
|
||||
self.primitives[key] = func
|
||||
count += 1
|
||||
|
||||
print(f"Loaded primitives: {lib_name} ({count} functions)", file=sys.stderr)
|
||||
print(f"Loaded primitives: {lib_name} ({count} functions){gpu_tag}", file=sys.stderr)
|
||||
|
||||
def _load_effect(self, effect_path: Path):
|
||||
"""Load and register an effect from a .sexp file."""
|
||||
@@ -807,8 +822,11 @@ class StreamInterpreter:
|
||||
self._record_error(f"Primitive {op} error: {e}")
|
||||
raise RuntimeError(f"Primitive {op} failed: {e}")
|
||||
|
||||
# Unknown - return as-is
|
||||
return expr
|
||||
# Unknown function call - raise meaningful error
|
||||
raise RuntimeError(f"Unknown function or primitive: '{op}'. "
|
||||
f"Available primitives: {sorted(list(self.primitives.keys())[:10])}... "
|
||||
f"Available effects: {sorted(list(self.effects.keys())[:10])}... "
|
||||
f"Available macros: {sorted(list(self.macros.keys())[:10])}...")
|
||||
|
||||
def _step_scans(self, ctx: Context, env: dict):
|
||||
"""Step scans based on trigger evaluation."""
|
||||
@@ -833,9 +851,9 @@ class StreamInterpreter:
|
||||
"""Run the streaming pipeline."""
|
||||
# Import output classes - handle both package and direct execution
|
||||
try:
|
||||
from .output import PipeOutput, DisplayOutput, FileOutput
|
||||
from .output import PipeOutput, DisplayOutput, FileOutput, HLSOutput, IPFSHLSOutput
|
||||
except ImportError:
|
||||
from output import PipeOutput, DisplayOutput, FileOutput
|
||||
from output import PipeOutput, DisplayOutput, FileOutput, HLSOutput, IPFSHLSOutput
|
||||
|
||||
self._init()
|
||||
|
||||
@@ -871,6 +889,16 @@ class StreamInterpreter:
|
||||
out = PipeOutput(size=(w, h), fps=fps, audio_source=audio)
|
||||
elif output == "preview":
|
||||
out = DisplayOutput(size=(w, h), fps=fps, audio_source=audio)
|
||||
elif output.endswith("/hls"):
|
||||
# HLS output - output is a directory path ending in /hls
|
||||
hls_dir = output[:-4] # Remove /hls suffix
|
||||
out = HLSOutput(hls_dir, size=(w, h), fps=fps, audio_source=audio)
|
||||
elif output.endswith("/ipfs-hls"):
|
||||
# IPFS HLS output - segments uploaded to IPFS as they're created
|
||||
hls_dir = output[:-9] # Remove /ipfs-hls suffix
|
||||
import os
|
||||
ipfs_gateway = os.environ.get("IPFS_GATEWAY_URL", "https://ipfs.io/ipfs")
|
||||
out = IPFSHLSOutput(hls_dir, size=(w, h), fps=fps, audio_source=audio, ipfs_gateway=ipfs_gateway)
|
||||
else:
|
||||
out = FileOutput(output, size=(w, h), fps=fps, audio_source=audio)
|
||||
|
||||
@@ -916,6 +944,8 @@ class StreamInterpreter:
|
||||
|
||||
finally:
|
||||
out.close()
|
||||
# Store output for access to properties like playlist_cid
|
||||
self.output = out
|
||||
print("\nDone", file=sys.stderr)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user