Add IPFS HLS streaming and GPU optimizations

- Add IPFSHLSOutput class that uploads segments to IPFS as they're created
- Update streaming task to use IPFS HLS output for distributed streaming
- Add /ipfs-stream endpoint to get IPFS playlist URL
- Update /stream endpoint to redirect to IPFS when available
- Add GPU persistence mode (STREAMING_GPU_PERSIST=1) to keep frames on GPU
- Add hardware video decoding (NVDEC) support for faster video processing
- Add GPU-accelerated primitive libraries: blending_gpu, color_ops_gpu, geometry_gpu
- Add streaming_gpu module with GPUFrame class for tracking CPU/GPU data location
- Add Dockerfile.gpu for building GPU-enabled worker image

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 20:23:16 +00:00
parent 5bc655f8c8
commit 86830019ad
24 changed files with 4025 additions and 96 deletions
--- a/sexp_effects/primitive_libs/blending_gpu.py
+++ b/sexp_effects/primitive_libs/blending_gpu.py
@@ -0,0 +1,220 @@
+"""
+GPU-Accelerated Blending Primitives Library
+
+Uses CuPy for CUDA-accelerated image blending and compositing.
+Keeps frames on GPU when STREAMING_GPU_PERSIST=1 for maximum performance.
+"""
+import os
+import numpy as np
+
+# Try to import CuPy for GPU acceleration.
+# When the import fails, `cp` is aliased to NumPy so that references such as
+# `cp.ndarray` still resolve; GPU_AVAILABLE records which backend is in use.
+try:
+    import cupy as cp
+    GPU_AVAILABLE = True
+    print("[blending_gpu] CuPy GPU acceleration enabled")
+except ImportError:
+    cp = np
+    GPU_AVAILABLE = False
+    print("[blending_gpu] CuPy not available, using CPU fallback")
+
+# GPU persistence mode - keep frames on GPU between operations.
+# The flag is on only when STREAMING_GPU_PERSIST is exactly "1"; an unset
+# variable defaults to "1", so persistence is enabled unless overridden.
+GPU_PERSIST = os.environ.get("STREAMING_GPU_PERSIST", "1") == "1"
+if GPU_AVAILABLE and GPU_PERSIST:
+    print("[blending_gpu] GPU persistence enabled - frames stay on GPU")
+
+
+def _to_gpu(img):
+    """Return ``img`` as a CuPy array when GPU acceleration is active.
+
+    The input is handed back untouched when CuPy is unavailable or when it
+    is already a CuPy array.
+    """
+    if not GPU_AVAILABLE or isinstance(img, cp.ndarray):
+        return img
+    return cp.asarray(img)
+
+
+def _to_cpu(img):
+    """Convert a CuPy array to NumPy unless GPU persistence is enabled.
+
+    With GPU_PERSIST on, or without CuPy, or for any non-CuPy input, ``img``
+    is returned as-is.
+    """
+    if GPU_PERSIST or not GPU_AVAILABLE:
+        return img
+    return cp.asnumpy(img) if isinstance(img, cp.ndarray) else img
+
+
+def _get_xp(img):
+    """Pick the array module matching ``img``: cupy for CuPy arrays, else numpy."""
+    on_device = GPU_AVAILABLE and isinstance(img, cp.ndarray)
+    return cp if on_device else np
+
+
+def prim_blend_images(a, b, alpha):
+    """Linearly mix two images as ``a * (1 - alpha) + b * alpha``.
+
+    ``alpha`` is converted to float and clamped to [0, 1]. The result is
+    cast to uint8; on the GPU path it is passed through ``_to_cpu`` before
+    being returned.
+    """
+    weight = max(0.0, min(1.0, float(alpha)))
+    keep = 1 - weight
+
+    if not GPU_AVAILABLE:
+        # CPU fallback: plain NumPy arithmetic in Python-float precision.
+        mixed = a.astype(float) * keep + b.astype(float) * weight
+        return mixed.astype(np.uint8)
+
+    first = _to_gpu(a)
+    second = _to_gpu(b)
+    mixed = first.astype(cp.float32) * keep + second.astype(cp.float32) * weight
+    return _to_cpu(mixed.astype(cp.uint8))
+
+
+def prim_blend_mode(a, b, mode):
+    """Combine two images with a named Photoshop-style blend mode.
+
+    Both inputs are scaled to the 0..1 range, the formula selected by
+    ``mode`` is applied, and the result is scaled back by 255 and cast to
+    uint8. A ``mode`` that matches none of the known names yields ``b``
+    (normal blending).
+    """
+    if GPU_AVAILABLE:
+        xp = cp
+        base = _to_gpu(a).astype(cp.float32) / 255
+        top = _to_gpu(b).astype(cp.float32) / 255
+    else:
+        xp = np
+        base = a.astype(float) / 255
+        top = b.astype(float) / 255
+
+    def _contrast(selector):
+        # Formula shared by overlay and hard-light; only the selector differs.
+        return xp.where(selector,
+                        2 * base * top,
+                        1 - 2 * (1 - base) * (1 - top))
+
+    def _soft_light():
+        return xp.where(top < 0.5,
+                        base - (1 - 2 * top) * base * (1 - base),
+                        base + (2 * top - 1) * (xp.sqrt(base) - base))
+
+    # Ordered (name, formula) pairs. Each formula is a thunk so that only the
+    # selected mode is computed, and names are matched with ``==`` in order.
+    formulas = (
+        ("multiply", lambda: base * top),
+        ("screen", lambda: 1 - (1 - base) * (1 - top)),
+        ("overlay", lambda: _contrast(base < 0.5)),
+        ("soft-light", _soft_light),
+        ("hard-light", lambda: _contrast(top < 0.5)),
+        ("color-dodge", lambda: xp.clip(base / (1 - top + 0.001), 0, 1)),
+        ("color-burn", lambda: 1 - xp.clip((1 - base) / (top + 0.001), 0, 1)),
+        ("difference", lambda: xp.abs(base - top)),
+        ("exclusion", lambda: base + top - 2 * base * top),
+        ("add", lambda: xp.clip(base + top, 0, 1)),
+        ("subtract", lambda: xp.clip(base - top, 0, 1)),
+        ("darken", lambda: xp.minimum(base, top)),
+        ("lighten", lambda: xp.maximum(base, top)),
+    )
+
+    for name, formula in formulas:
+        if mode == name:
+            mixed = formula()
+            break
+    else:
+        # Unknown mode: fall back to normal blending (just the top layer).
+        mixed = top
+
+    return _to_cpu((mixed * 255).astype(xp.uint8))
+
+
+def prim_mask(img, mask_img):
+    """Apply grayscale mask to image (white=opaque, black=transparent).
+
+    Args:
+        img: Image array, either 2-D (grayscale) or 3-D (with channels).
+        mask_img: Mask array with the same height and width as ``img``.
+            When it is 3-D only its first channel is used.
+
+    Returns:
+        ``img`` scaled per pixel by ``mask / 255`` and cast to uint8. On the
+        GPU path the result goes through ``_to_cpu`` before being returned.
+    """
+    if GPU_AVAILABLE:
+        xp, ftype = cp, cp.float32
+        img = _to_gpu(img)
+        mask_img = _to_gpu(mask_img)
+    else:
+        xp, ftype = np, float
+
+    weights = mask_img[:, :, 0] if mask_img.ndim == 3 else mask_img
+    weights = weights.astype(ftype) / 255
+
+    # Add a channel axis only for images that have one; doing it for a 2-D
+    # grayscale image would broadcast (H, W) against (H, W, 1) incorrectly.
+    if img.ndim == 3:
+        weights = weights[:, :, xp.newaxis]
+
+    result = (img.astype(ftype) * weights).astype(xp.uint8)
+    # _to_cpu is a no-op on the CPU path, so one return serves both paths.
+    return _to_cpu(result)
+
+
+def prim_alpha_composite(base, overlay, alpha_channel):
+    """Composite overlay onto base using alpha channel.
+
+    Computes ``base * (1 - alpha) + overlay * alpha`` per pixel, where
+    ``alpha`` is ``alpha_channel / 255``.
+
+    Args:
+        base: Background image array, 2-D (grayscale) or 3-D (with channels).
+        overlay: Foreground image array with the same shape as ``base``.
+        alpha_channel: Alpha array with the same height and width as
+            ``base``. When it is 3-D only its first channel is used.
+
+    Returns:
+        The composited image cast to uint8. On the GPU path the result goes
+        through ``_to_cpu`` before being returned.
+    """
+    if GPU_AVAILABLE:
+        xp, ftype = cp, cp.float32
+        base = _to_gpu(base)
+        overlay = _to_gpu(overlay)
+        alpha_channel = _to_gpu(alpha_channel)
+    else:
+        xp, ftype = np, float
+
+    opacity = alpha_channel[:, :, 0] if alpha_channel.ndim == 3 else alpha_channel
+    opacity = opacity.astype(ftype) / 255
+
+    # Add a channel axis only when the images have one; doing it for 2-D
+    # grayscale images would broadcast (H, W) against (H, W, 1) incorrectly.
+    if base.ndim == 3:
+        opacity = opacity[:, :, xp.newaxis]
+
+    result = base.astype(ftype) * (1 - opacity) + overlay.astype(ftype) * opacity
+    # _to_cpu is a no-op on the CPU path, so one return serves both paths.
+    return _to_cpu(result.astype(xp.uint8))
+
+def prim_overlay(base, overlay, x, y, alpha=1.0):
+    """Overlay image at position (x, y) with optional alpha.
+
+    Args:
+        base: Background image array; it is copied, never modified in place.
+        overlay: Image array to draw on top of ``base``.
+        x: Column in ``base`` of the overlay's left edge (converted to int;
+            may be negative).
+        y: Row in ``base`` of the overlay's top edge (converted to int;
+            may be negative).
+        alpha: Overlay opacity. Scalar values are clamped to [0, 1], matching
+            ``prim_blend_images``, so out-of-range values cannot wrap around
+            in the uint8 cast. Non-scalar values are used unchanged.
+
+    Returns:
+        A copy of ``base`` with the visible part of ``overlay`` blended in.
+        If the overlay lies entirely outside ``base`` the copy is unchanged.
+        On the GPU path the result goes through ``_to_cpu``.
+    """
+    if np.isscalar(alpha):
+        alpha = max(0.0, min(1.0, float(alpha)))
+
+    if GPU_AVAILABLE:
+        xp, ftype = cp, cp.float32
+        base = _to_gpu(base)
+        overlay = _to_gpu(overlay)
+    else:
+        xp, ftype = np, float
+
+    result = base.copy()
+    x, y = int(x), int(y)
+    oh, ow = overlay.shape[:2]
+    bh, bw = base.shape[:2]
+
+    # Clip to bounds: (sx*, sy*) index into the overlay, (dx1, dy1) is the
+    # top-left corner of the destination region inside the base image.
+    sx1, sy1 = max(0, -x), max(0, -y)
+    dx1, dy1 = max(0, x), max(0, y)
+    sx2, sy2 = min(ow, bw - x), min(oh, bh - y)
+
+    if sx2 > sx1 and sy2 > sy1:
+        src = overlay[sy1:sy2, sx1:sx2]
+        region = (slice(dy1, dy1 + (sy2 - sy1)), slice(dx1, dx1 + (sx2 - sx1)))
+        blended = result[region].astype(ftype) * (1 - alpha) + src.astype(ftype) * alpha
+        result[region] = blended.astype(xp.uint8)
+
+    # _to_cpu is a no-op on the CPU path, so one return serves both paths.
+    return _to_cpu(result)
+
+
+# Registry mapping each primitive's external (hyphenated) name to the function
+# in this module that implements it.
+# NOTE(review): presumably read by the s-expression effects interpreter to
+# resolve primitive names - confirm against the loader.
+PRIMITIVES = {
+    # Basic blending
+    'blend-images': prim_blend_images,
+    'blend-mode': prim_blend_mode,
+
+    # Masking
+    'mask': prim_mask,
+    'alpha-composite': prim_alpha_composite,
+
+    # Overlay
+    'overlay': prim_overlay,
+}