""" GPU-Accelerated Blending Primitives Library Uses CuPy for CUDA-accelerated image blending and compositing. Keeps frames on GPU when STREAMING_GPU_PERSIST=1 for maximum performance. """ import os import numpy as np # Try to import CuPy for GPU acceleration try: import cupy as cp GPU_AVAILABLE = True print("[blending_gpu] CuPy GPU acceleration enabled") except ImportError: cp = np GPU_AVAILABLE = False print("[blending_gpu] CuPy not available, using CPU fallback") # GPU persistence mode - keep frames on GPU between operations GPU_PERSIST = os.environ.get("STREAMING_GPU_PERSIST", "1") == "1" if GPU_AVAILABLE and GPU_PERSIST: print("[blending_gpu] GPU persistence enabled - frames stay on GPU") def _to_gpu(img): """Move image to GPU if available.""" if GPU_AVAILABLE and not isinstance(img, cp.ndarray): return cp.asarray(img) return img def _to_cpu(img): """Move image back to CPU (only if GPU_PERSIST is disabled).""" if not GPU_PERSIST and GPU_AVAILABLE and isinstance(img, cp.ndarray): return cp.asnumpy(img) return img def _get_xp(img): """Get the array module (numpy or cupy) for the given image.""" if GPU_AVAILABLE and isinstance(img, cp.ndarray): return cp return np def prim_blend_images(a, b, alpha): """Blend two images: a * (1-alpha) + b * alpha.""" alpha = max(0.0, min(1.0, float(alpha))) if GPU_AVAILABLE: a_gpu = _to_gpu(a) b_gpu = _to_gpu(b) result = (a_gpu.astype(cp.float32) * (1 - alpha) + b_gpu.astype(cp.float32) * alpha).astype(cp.uint8) return _to_cpu(result) return (a.astype(float) * (1 - alpha) + b.astype(float) * alpha).astype(np.uint8) def prim_blend_mode(a, b, mode): """Blend using Photoshop-style blend modes.""" if GPU_AVAILABLE: a_gpu = _to_gpu(a).astype(cp.float32) / 255 b_gpu = _to_gpu(b).astype(cp.float32) / 255 xp = cp else: a_gpu = a.astype(float) / 255 b_gpu = b.astype(float) / 255 xp = np if mode == "multiply": result = a_gpu * b_gpu elif mode == "screen": result = 1 - (1 - a_gpu) * (1 - b_gpu) elif mode == "overlay": mask = a_gpu < 0.5 result = xp.where(mask, 2 * a_gpu * b_gpu, 1 - 2 * (1 - a_gpu) * (1 - b_gpu)) elif mode == "soft-light": mask = b_gpu < 0.5 result = xp.where(mask, a_gpu - (1 - 2 * b_gpu) * a_gpu * (1 - a_gpu), a_gpu + (2 * b_gpu - 1) * (xp.sqrt(a_gpu) - a_gpu)) elif mode == "hard-light": mask = b_gpu < 0.5 result = xp.where(mask, 2 * a_gpu * b_gpu, 1 - 2 * (1 - a_gpu) * (1 - b_gpu)) elif mode == "color-dodge": result = xp.clip(a_gpu / (1 - b_gpu + 0.001), 0, 1) elif mode == "color-burn": result = 1 - xp.clip((1 - a_gpu) / (b_gpu + 0.001), 0, 1) elif mode == "difference": result = xp.abs(a_gpu - b_gpu) elif mode == "exclusion": result = a_gpu + b_gpu - 2 * a_gpu * b_gpu elif mode == "add": result = xp.clip(a_gpu + b_gpu, 0, 1) elif mode == "subtract": result = xp.clip(a_gpu - b_gpu, 0, 1) elif mode == "darken": result = xp.minimum(a_gpu, b_gpu) elif mode == "lighten": result = xp.maximum(a_gpu, b_gpu) else: # Default to normal (just return b) result = b_gpu result = (result * 255).astype(xp.uint8) return _to_cpu(result) def prim_mask(img, mask_img): """Apply grayscale mask to image (white=opaque, black=transparent).""" if GPU_AVAILABLE: img_gpu = _to_gpu(img) mask_gpu = _to_gpu(mask_img) if len(mask_gpu.shape) == 3: mask = mask_gpu[:, :, 0].astype(cp.float32) / 255 else: mask = mask_gpu.astype(cp.float32) / 255 mask = mask[:, :, cp.newaxis] result = (img_gpu.astype(cp.float32) * mask).astype(cp.uint8) return _to_cpu(result) if len(mask_img.shape) == 3: mask = mask_img[:, :, 0].astype(float) / 255 else: mask = mask_img.astype(float) / 255 mask = mask[:, :, np.newaxis] return (img.astype(float) * mask).astype(np.uint8) def prim_alpha_composite(base, overlay, alpha_channel): """Composite overlay onto base using alpha channel.""" if GPU_AVAILABLE: base_gpu = _to_gpu(base) overlay_gpu = _to_gpu(overlay) alpha_gpu = _to_gpu(alpha_channel) if len(alpha_gpu.shape) == 3: alpha = alpha_gpu[:, :, 0].astype(cp.float32) / 255 else: alpha = alpha_gpu.astype(cp.float32) / 255 alpha = alpha[:, :, cp.newaxis] result = base_gpu.astype(cp.float32) * (1 - alpha) + overlay_gpu.astype(cp.float32) * alpha return _to_cpu(result.astype(cp.uint8)) if len(alpha_channel.shape) == 3: alpha = alpha_channel[:, :, 0].astype(float) / 255 else: alpha = alpha_channel.astype(float) / 255 alpha = alpha[:, :, np.newaxis] result = base.astype(float) * (1 - alpha) + overlay.astype(float) * alpha return result.astype(np.uint8) def prim_overlay(base, overlay, x, y, alpha=1.0): """Overlay image at position (x, y) with optional alpha.""" if GPU_AVAILABLE: base_gpu = _to_gpu(base) overlay_gpu = _to_gpu(overlay) result = base_gpu.copy() x, y = int(x), int(y) oh, ow = overlay_gpu.shape[:2] bh, bw = base_gpu.shape[:2] # Clip to bounds sx1 = max(0, -x) sy1 = max(0, -y) dx1 = max(0, x) dy1 = max(0, y) sx2 = min(ow, bw - x) sy2 = min(oh, bh - y) if sx2 > sx1 and sy2 > sy1: src = overlay_gpu[sy1:sy2, sx1:sx2] dst = result[dy1:dy1+(sy2-sy1), dx1:dx1+(sx2-sx1)] blended = (dst.astype(cp.float32) * (1 - alpha) + src.astype(cp.float32) * alpha) result[dy1:dy1+(sy2-sy1), dx1:dx1+(sx2-sx1)] = blended.astype(cp.uint8) return _to_cpu(result) result = base.copy() x, y = int(x), int(y) oh, ow = overlay.shape[:2] bh, bw = base.shape[:2] # Clip to bounds sx1 = max(0, -x) sy1 = max(0, -y) dx1 = max(0, x) dy1 = max(0, y) sx2 = min(ow, bw - x) sy2 = min(oh, bh - y) if sx2 > sx1 and sy2 > sy1: src = overlay[sy1:sy2, sx1:sx2] dst = result[dy1:dy1+(sy2-sy1), dx1:dx1+(sx2-sx1)] blended = (dst.astype(float) * (1 - alpha) + src.astype(float) * alpha) result[dy1:dy1+(sy2-sy1), dx1:dx1+(sx2-sx1)] = blended.astype(np.uint8) return result PRIMITIVES = { # Basic blending 'blend-images': prim_blend_images, 'blend-mode': prim_blend_mode, # Masking 'mask': prim_mask, 'alpha-composite': prim_alpha_composite, # Overlay 'overlay': prim_overlay, }