"""
GPU-Accelerated Geometry Primitives Library

Uses CuPy for CUDA-accelerated image transforms.
Falls back to CPU if GPU unavailable.

Performance Mode:
- Set STREAMING_GPU_PERSIST=1 to keep frames on GPU between operations
  (this is the default; set it to any other value to disable)
- This dramatically improves performance by avoiding CPU<->GPU transfers
- Frames only transfer to CPU at final output
"""
import os

import numpy as np

# Try to import CuPy for GPU acceleration
try:
    import cupy as cp
    from cupyx.scipy import ndimage as cpndimage
    GPU_AVAILABLE = True
    print("[geometry_gpu] CuPy GPU acceleration enabled")
except ImportError:
    # Alias cp to numpy so that cp.* attribute lookups in CPU-only code
    # paths still resolve.
    cp = np
    GPU_AVAILABLE = False
    print("[geometry_gpu] CuPy not available, using CPU fallback")

# GPU persistence mode - keep frames on GPU between operations
# Set STREAMING_GPU_PERSIST=1 for maximum performance
GPU_PERSIST = os.environ.get("STREAMING_GPU_PERSIST", "1") == "1"
if GPU_AVAILABLE and GPU_PERSIST:
    print("[geometry_gpu] GPU persistence enabled - frames stay on GPU")

# Thread-block shape shared by the custom CUDA kernels below.
_KERNEL_BLOCK = (16, 16)


def _to_gpu(img):
    """Move image to GPU if available; arrays already on the GPU pass through."""
    if GPU_AVAILABLE and not isinstance(img, cp.ndarray):
        return cp.asarray(img)
    return img


def _to_cpu(img):
    """Move image back to CPU (only if GPU_PERSIST is disabled)."""
    if not GPU_PERSIST and GPU_AVAILABLE and isinstance(img, cp.ndarray):
        return cp.asnumpy(img)
    return img


def _ensure_output_format(img):
    """Ensure output is in correct format based on GPU_PERSIST setting."""
    return _to_cpu(img)


def _kernel_grid(w, h, block=_KERNEL_BLOCK):
    """Return the launch grid that covers a w x h image with ``block`` threads."""
    return (
        (w + block[0] - 1) // block[0],
        (h + block[1] - 1) // block[1],
    )


def _prepare_kernel_frame(img):
    """Return ``img`` as a C-contiguous uint8 (H, W, C) array on the GPU.

    The raw CUDA kernels index memory as ``(y * width + x) * channels + c``,
    so they need a packed, C-ordered buffer. Views with other strides (for
    example the result of ``cp.flip``) are copied into that layout here.
    """
    # copy=False avoids a redundant copy when the frame is already uint8;
    # the kernels only read from this buffer.
    frame = _to_gpu(img.astype(np.uint8, copy=False))
    if frame.ndim == 2:
        frame = frame[:, :, cp.newaxis]
    return cp.ascontiguousarray(frame)


def prim_rotate(img, angle, cx=None, cy=None):
    """Rotate image by angle degrees around center (cx, cy).

    Args:
        img: Image array, (H, W) or (H, W, C).
        angle: Rotation angle in degrees.
        cx: X coordinate of the rotation center. Defaults to the image center.
        cy: Y coordinate of the rotation center. Defaults to the image center.

    Returns:
        The rotated image with the same shape as the input. On the GPU path
        the result stays on the GPU when GPU_PERSIST is enabled.
    """
    if not GPU_AVAILABLE:
        # Fallback to OpenCV
        import cv2
        h, w = img.shape[:2]
        if cx is None:
            cx = w / 2
        if cy is None:
            cy = h / 2
        M = cv2.getRotationMatrix2D((cx, cy), angle, 1.0)
        return cv2.warpAffine(img, M, (w, h))

    img_gpu = _to_gpu(img)

    if cx is None and cy is None:
        # No explicit center: rotate about the array center.
        rotated = cpndimage.rotate(img_gpu, angle, reshape=False, order=1)
        return _to_cpu(rotated)

    # An explicit center was requested. ndimage.rotate cannot express that,
    # so build the equivalent output->input affine map ourselves:
    #     src = R @ (dst - center) + center
    h, w = img_gpu.shape[:2]
    if cx is None:
        cx = w / 2
    if cy is None:
        cy = h / 2

    rad = np.radians(angle)
    cos_a = float(np.cos(rad))
    sin_a = float(np.sin(rad))

    # Axes are ordered (y, x[, channel]); any channel axis keeps the identity.
    matrix = np.eye(img_gpu.ndim)
    matrix[0, 0] = cos_a
    matrix[0, 1] = sin_a
    matrix[1, 0] = -sin_a
    matrix[1, 1] = cos_a

    center = np.zeros(img_gpu.ndim)
    center[0] = cy
    center[1] = cx
    offset = center - matrix @ center

    rotated = cpndimage.affine_transform(
        img_gpu, cp.asarray(matrix), offset=offset.tolist(), order=1
    )
    return _to_cpu(rotated)


def prim_scale(img, sx, sy, cx=None, cy=None):
    """Scale image by (sx, sy) around center (cx, cy).

    Args:
        img: Image array, (H, W) or (H, W, C).
        sx: Horizontal scale factor.
        sy: Vertical scale factor.
        cx: X coordinate of the fixed point. Defaults to the image center.
        cy: Y coordinate of the fixed point. Defaults to the image center.

    Returns:
        The scaled image, cropped or zero-padded to the input size. On the
        GPU path the result stays on the GPU when GPU_PERSIST is enabled.
    """
    if not GPU_AVAILABLE:
        import cv2
        h, w = img.shape[:2]
        if cx is None:
            cx = w / 2
        if cy is None:
            cy = h / 2
        M = np.float32([
            [sx, 0, cx * (1 - sx)],
            [0, sy, cy * (1 - sy)],
        ])
        return cv2.warpAffine(img, M, (w, h))

    img_gpu = _to_gpu(img)
    h, w = img_gpu.shape[:2]

    if cx is not None or cy is not None:
        # An explicit fixed point was requested; zoom + center-crop cannot
        # express that, so resample with the inverse affine map:
        #     src = (dst - center) / scale + center
        if cx is None:
            cx = w / 2
        if cy is None:
            cy = h / 2
        if sx == 0 or sy == 0:
            # A zero factor has no inverse map; the image collapses to
            # nothing, so return an empty (black) frame.
            return _to_cpu(cp.zeros_like(img_gpu))

        inv_scale = np.ones(img_gpu.ndim)
        inv_scale[0] = 1.0 / sy
        inv_scale[1] = 1.0 / sx
        center = np.zeros(img_gpu.ndim)
        center[0] = cy
        center[1] = cx
        offset = center * (1.0 - inv_scale)

        scaled = cpndimage.affine_transform(
            img_gpu, cp.asarray(np.diag(inv_scale)),
            offset=offset.tolist(), order=1,
        )
        return _to_cpu(scaled)

    # Use cupyx.scipy.ndimage.zoom
    if img_gpu.ndim == 3:
        zoom_factors = (sy, sx, 1)  # Don't zoom color channels
    else:
        zoom_factors = (sy, sx)

    zoomed = cpndimage.zoom(img_gpu, zoom_factors, order=1)

    # Crop/pad to original size
    zh, zw = zoomed.shape[:2]
    result = cp.zeros_like(img_gpu)

    # Calculate offsets: crop the center of a larger image, or center a
    # smaller one inside the zero-filled result.
    src_y = max(0, (zh - h) // 2)
    src_x = max(0, (zw - w) // 2)
    dst_y = max(0, (h - zh) // 2)
    dst_x = max(0, (w - zw) // 2)

    copy_h = min(h - dst_y, zh - src_y)
    copy_w = min(w - dst_x, zw - src_x)

    result[dst_y:dst_y + copy_h, dst_x:dst_x + copy_w] = \
        zoomed[src_y:src_y + copy_h, src_x:src_x + copy_w]

    return _to_cpu(result)


def prim_translate(img, dx, dy):
    """Translate image by (dx, dy) pixels."""
    if not GPU_AVAILABLE:
        import cv2
        h, w = img.shape[:2]
        M = np.float32([[1, 0, dx], [0, 1, dy]])
        return cv2.warpAffine(img, M, (w, h))

    img_gpu = _to_gpu(img)

    # Use cupyx.scipy.ndimage.shift
    if img_gpu.ndim == 3:
        shift = (dy, dx, 0)  # Don't shift color channels
    else:
        shift = (dy, dx)

    shifted = cpndimage.shift(img_gpu, shift, order=1)
    return _to_cpu(shifted)


def prim_flip_h(img):
    """Flip image horizontally."""
    if GPU_AVAILABLE:
        img_gpu = _to_gpu(img)
        return _to_cpu(cp.flip(img_gpu, axis=1))
    return np.flip(img, axis=1)


def prim_flip_v(img):
    """Flip image vertically."""
    if GPU_AVAILABLE:
        img_gpu = _to_gpu(img)
        return _to_cpu(cp.flip(img_gpu, axis=0))
    return np.flip(img, axis=0)


def prim_flip(img, direction="horizontal"):
    """Flip image in given direction.

    Accepts "horizontal"/"h", "vertical"/"v", or "both"/"hv"/"vh". Any other
    value returns the image unchanged.
    """
    if direction in ("horizontal", "h"):
        return prim_flip_h(img)
    elif direction in ("vertical", "v"):
        return prim_flip_v(img)
    elif direction in ("both", "hv", "vh"):
        if GPU_AVAILABLE:
            img_gpu = _to_gpu(img)
            return _to_cpu(cp.flip(cp.flip(img_gpu, axis=0), axis=1))
        return np.flip(np.flip(img, axis=0), axis=1)
    return img


# CUDA kernel for ripple effect
if GPU_AVAILABLE:
    _ripple_kernel = cp.RawKernel(r'''
    extern "C" __global__
    void ripple(const unsigned char* src, unsigned char* dst,
                int width, int height, int channels,
                float amplitude, float frequency, float decay,
                float speed, float time, float cx, float cy) {
        int x = blockDim.x * blockIdx.x + threadIdx.x;
        int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x >= width || y >= height) return;

        // Distance from center
        float dx = x - cx;
        float dy = y - cy;
        float dist = sqrtf(dx * dx + dy * dy);

        // Ripple displacement
        float wave = sinf(dist * frequency * 0.1f - time * speed) * amplitude;
        float falloff = expf(-dist * decay * 0.01f);
        float displacement = wave * falloff;

        // Direction from center
        float len = dist + 0.0001f;  // Avoid division by zero
        float dir_x = dx / len;
        float dir_y = dy / len;

        // Source coordinates
        float src_x = x - dir_x * displacement;
        float src_y = y - dir_y * displacement;

        // Clamp to bounds
        src_x = fmaxf(0.0f, fminf(width - 1.0f, src_x));
        src_y = fmaxf(0.0f, fminf(height - 1.0f, src_y));

        // Bilinear interpolation
        int x0 = (int)src_x;
        int y0 = (int)src_y;
        int x1 = min(x0 + 1, width - 1);
        int y1 = min(y0 + 1, height - 1);

        float fx = src_x - x0;
        float fy = src_y - y0;

        for (int c = 0; c < channels; c++) {
            float v00 = src[(y0 * width + x0) * channels + c];
            float v10 = src[(y0 * width + x1) * channels + c];
            float v01 = src[(y1 * width + x0) * channels + c];
            float v11 = src[(y1 * width + x1) * channels + c];

            float v0 = v00 * (1 - fx) + v10 * fx;
            float v1 = v01 * (1 - fx) + v11 * fx;
            float val = v0 * (1 - fy) + v1 * fy;

            dst[(y * width + x) * channels + c] =
                (unsigned char)fminf(255.0f, fmaxf(0.0f, val));
        }
    }
    ''', 'ripple')


def prim_ripple(img, amplitude=10.0, frequency=8.0, decay=2.0, speed=5.0,
                time=0.0, center_x=None, center_y=None):
    """Apply ripple distortion effect.

    Each pixel is displaced along the line from the ripple center by
    ``sin(dist * frequency * 0.1 - time * speed) * amplitude``, attenuated by
    ``exp(-dist * decay * 0.01)``.

    Args:
        img: Image array, (H, W) or (H, W, C). The GPU path converts it to
            uint8.
        amplitude: Peak displacement in pixels.
        frequency: Spatial frequency of the wave.
        decay: Falloff rate with distance from the center.
        speed: Multiplier applied to ``time`` for the wave phase.
        time: Animation time.
        center_x: X coordinate of the ripple center. Defaults to w / 2.
        center_y: Y coordinate of the ripple center. Defaults to h / 2.

    Returns:
        The distorted image. Single-channel results are returned as 2-D.
    """
    h, w = img.shape[:2]
    channels = img.shape[2] if img.ndim == 3 else 1

    if center_x is None:
        center_x = w / 2
    if center_y is None:
        center_y = h / 2

    if not GPU_AVAILABLE:
        # CPU fallback using coordinate mapping
        import cv2
        y_coords, x_coords = np.mgrid[0:h, 0:w].astype(np.float32)
        dx = x_coords - center_x
        dy = y_coords - center_y
        dist = np.sqrt(dx**2 + dy**2)

        wave = np.sin(dist * frequency * 0.1 - time * speed) * amplitude
        falloff = np.exp(-dist * decay * 0.01)
        displacement = wave * falloff

        length = dist + 0.0001  # Avoid division by zero at the center
        dir_x = dx / length
        dir_y = dy / length

        map_x = (x_coords - dir_x * displacement).astype(np.float32)
        map_y = (y_coords - dir_y * displacement).astype(np.float32)

        return cv2.remap(img, map_x, map_y, cv2.INTER_LINEAR)

    # GPU implementation
    img_gpu = _prepare_kernel_frame(img)
    dst = cp.zeros_like(img_gpu)

    _ripple_kernel(_kernel_grid(w, h), _KERNEL_BLOCK, (
        img_gpu, dst,
        np.int32(w), np.int32(h), np.int32(channels),
        np.float32(amplitude), np.float32(frequency), np.float32(decay),
        np.float32(speed), np.float32(time),
        np.float32(center_x), np.float32(center_y)
    ))

    result = _to_cpu(dst)
    if channels == 1:
        # Drop the channel axis that was added for the kernel.
        result = result[:, :, 0]
    return result


# CUDA kernel for fast rotation with bilinear interpolation
if GPU_AVAILABLE:
    _rotate_kernel = cp.RawKernel(r'''
    extern "C" __global__
    void rotate_img(const unsigned char* src, unsigned char* dst,
                    int width, int height, int channels,
                    float cos_a, float sin_a, float cx, float cy) {
        int x = blockDim.x * blockIdx.x + threadIdx.x;
        int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x >= width || y >= height) return;

        // Translate to center, rotate, translate back
        float dx = x - cx;
        float dy = y - cy;

        float src_x = cos_a * dx + sin_a * dy + cx;
        float src_y = -sin_a * dx + cos_a * dy + cy;

        // Check bounds. Samples that land exactly on the last row/column
        // are still inside the image, so only reject coordinates beyond it.
        if (src_x < 0 || src_x > width - 1 ||
            src_y < 0 || src_y > height - 1) {
            for (int c = 0; c < channels; c++) {
                dst[(y * width + x) * channels + c] = 0;
            }
            return;
        }

        // Bilinear interpolation (neighbours clamped to the image edge)
        int x0 = (int)src_x;
        int y0 = (int)src_y;
        int x1 = min(x0 + 1, width - 1);
        int y1 = min(y0 + 1, height - 1);

        float fx = src_x - x0;
        float fy = src_y - y0;

        for (int c = 0; c < channels; c++) {
            float v00 = src[(y0 * width + x0) * channels + c];
            float v10 = src[(y0 * width + x1) * channels + c];
            float v01 = src[(y1 * width + x0) * channels + c];
            float v11 = src[(y1 * width + x1) * channels + c];

            float v0 = v00 * (1 - fx) + v10 * fx;
            float v1 = v01 * (1 - fx) + v11 * fx;
            float val = v0 * (1 - fy) + v1 * fy;

            dst[(y * width + x) * channels + c] =
                (unsigned char)fminf(255.0f, fmaxf(0.0f, val));
        }
    }
    ''', 'rotate_img')


def prim_rotate_gpu(img, angle, cx=None, cy=None):
    """Fast GPU rotation using custom CUDA kernel.

    Falls back to prim_rotate when no GPU is available.

    Args:
        img: Image array, (H, W) or (H, W, C). Converted to uint8.
        angle: Rotation angle in degrees.
        cx: X coordinate of the rotation center. Defaults to w / 2.
        cy: Y coordinate of the rotation center. Defaults to h / 2.

    Returns:
        The rotated uint8 image; pixels sampled from outside the source are
        zero. Single-channel results are returned as 2-D.
    """
    # NOTE(review): the kernel applies [[cos, sin], [-sin, cos]] as the
    # output->input map, while cv2.warpAffine in the prim_rotate fallback
    # treats the same matrix as input->output. The two paths therefore
    # appear to rotate in opposite directions for the same angle - confirm
    # which direction 'rotate-img' callers expect before changing either.
    if not GPU_AVAILABLE:
        return prim_rotate(img, angle, cx, cy)

    h, w = img.shape[:2]
    channels = img.shape[2] if img.ndim == 3 else 1

    if cx is None:
        cx = w / 2
    if cy is None:
        cy = h / 2

    img_gpu = _prepare_kernel_frame(img)
    dst = cp.zeros_like(img_gpu)

    # Convert angle to radians
    rad = np.radians(angle)
    cos_a = np.cos(rad)
    sin_a = np.sin(rad)

    _rotate_kernel(_kernel_grid(w, h), _KERNEL_BLOCK, (
        img_gpu, dst,
        np.int32(w), np.int32(h), np.int32(channels),
        np.float32(cos_a), np.float32(sin_a),
        np.float32(cx), np.float32(cy)
    ))

    result = _to_cpu(dst)
    if channels == 1:
        # Drop the channel axis that was added for the kernel.
        result = result[:, :, 0]
    return result


# Import CPU primitives as fallbacks for functions we don't GPU-accelerate
def _get_cpu_primitives():
    """Get all primitives from CPU geometry module as fallbacks."""
    from sexp_effects.primitive_libs import geometry
    return geometry.PRIMITIVES


# Export functions - start with CPU primitives, then override with GPU versions
PRIMITIVES = _get_cpu_primitives().copy()

# Override specific primitives with GPU-accelerated versions
PRIMITIVES.update({
    'translate': prim_translate,
    'rotate-img': prim_rotate_gpu if GPU_AVAILABLE else prim_rotate,
    'scale-img': prim_scale,
    'flip-h': prim_flip_h,
    'flip-v': prim_flip_v,
    'flip': prim_flip,
    # Note: ripple-displace uses CPU version (different API - returns coords, not image)
})