""" JIT Compiler for sexp frame pipelines. Compiles sexp expressions to fused CUDA kernels for maximum performance. """ import cupy as cp import numpy as np from typing import Dict, List, Any, Optional, Tuple, Callable import hashlib import sys # Cache for compiled kernels _KERNEL_CACHE: Dict[str, Callable] = {} def _generate_kernel_key(ops: List[Tuple]) -> str: """Generate cache key for operation sequence.""" return hashlib.md5(str(ops).encode()).hexdigest() # ============================================================================= # CUDA Kernel Templates # ============================================================================= AFFINE_WARP_KERNEL = cp.RawKernel(r''' extern "C" __global__ void affine_warp( const unsigned char* src, unsigned char* dst, int width, int height, int channels, float m00, float m01, float m02, float m10, float m11, float m12 ) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= width || y >= height) return; // Apply inverse affine transform float src_x = m00 * x + m01 * y + m02; float src_y = m10 * x + m11 * y + m12; int dst_idx = (y * width + x) * channels; // Bounds check if (src_x < 0 || src_x >= width - 1 || src_y < 0 || src_y >= height - 1) { for (int c = 0; c < channels; c++) { dst[dst_idx + c] = 0; } return; } // Bilinear interpolation int x0 = (int)src_x; int y0 = (int)src_y; int x1 = x0 + 1; int y1 = y0 + 1; float fx = src_x - x0; float fy = src_y - y0; for (int c = 0; c < channels; c++) { float v00 = src[(y0 * width + x0) * channels + c]; float v10 = src[(y0 * width + x1) * channels + c]; float v01 = src[(y1 * width + x0) * channels + c]; float v11 = src[(y1 * width + x1) * channels + c]; float v0 = v00 * (1 - fx) + v10 * fx; float v1 = v01 * (1 - fx) + v11 * fx; float v = v0 * (1 - fy) + v1 * fy; dst[dst_idx + c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v)); } } ''', 'affine_warp') BLEND_KERNEL = cp.RawKernel(r''' extern "C" __global__ void blend( const unsigned char* src1, const unsigned char* src2, unsigned char* dst, int size, float alpha ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) return; float v = src1[idx] * (1.0f - alpha) + src2[idx] * alpha; dst[idx] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v)); } ''', 'blend') BRIGHTNESS_CONTRAST_KERNEL = cp.RawKernel(r''' extern "C" __global__ void brightness_contrast( const unsigned char* src, unsigned char* dst, int size, float brightness, float contrast ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) return; float v = src[idx]; v = (v - 128.0f) * contrast + 128.0f + brightness; dst[idx] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v)); } ''', 'brightness_contrast') HUE_SHIFT_KERNEL = cp.RawKernel(r''' extern "C" __global__ void hue_shift( const unsigned char* src, unsigned char* dst, int width, int height, float hue_shift ) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= width || y >= height) return; int idx = (y * width + x) * 3; float r = src[idx] / 255.0f; float g = src[idx + 1] / 255.0f; float b = src[idx + 2] / 255.0f; // RGB to HSV float max_c = fmaxf(r, fmaxf(g, b)); float min_c = fminf(r, fminf(g, b)); float delta = max_c - min_c; float h = 0, s = 0, v = max_c; if (delta > 0.00001f) { s = delta / max_c; if (r >= max_c) h = (g - b) / delta; else if (g >= max_c) h = 2.0f + (b - r) / delta; else h = 4.0f + (r - g) / delta; h *= 60.0f; if (h < 0) h += 360.0f; } // Apply hue shift h = fmodf(h + hue_shift + 360.0f, 360.0f); // HSV to RGB float c = v * s; float x_val = c * (1 - fabsf(fmodf(h / 60.0f, 2.0f) - 1)); float m = v - c; float r2, g2, b2; if (h < 60) { r2 = c; g2 = x_val; b2 = 0; } else if (h < 120) { r2 = x_val; g2 = c; b2 = 0; } else if (h < 180) { r2 = 0; g2 = c; b2 = x_val; } else if (h < 240) { r2 = 0; g2 = x_val; b2 = c; } else if (h < 300) { r2 = x_val; g2 = 0; b2 = c; } else { r2 = c; g2 = 0; b2 = x_val; } dst[idx] = (unsigned char)((r2 + m) * 255); dst[idx + 1] = (unsigned char)((g2 + m) * 255); dst[idx + 2] = (unsigned char)((b2 + m) * 255); } ''', 'hue_shift') INVERT_KERNEL = cp.RawKernel(r''' extern "C" __global__ void invert( const unsigned char* src, unsigned char* dst, int size ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) return; dst[idx] = 255 - src[idx]; } ''', 'invert') ZOOM_KERNEL = cp.RawKernel(r''' extern "C" __global__ void zoom( const unsigned char* src, unsigned char* dst, int width, int height, int channels, float zoom_factor, float cx, float cy ) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= width || y >= height) return; // Map to source coordinates (zoom from center) float src_x = (x - cx) / zoom_factor + cx; float src_y = (y - cy) / zoom_factor + cy; int dst_idx = (y * width + x) * channels; if (src_x < 0 || src_x >= width - 1 || src_y < 0 || src_y >= height - 1) { for (int c = 0; c < channels; c++) { dst[dst_idx + c] = 0; } return; } // Bilinear interpolation int x0 = (int)src_x; int y0 = (int)src_y; float fx = src_x - x0; float fy = src_y - y0; for (int c = 0; c < channels; c++) { float v00 = src[(y0 * width + x0) * channels + c]; float v10 = src[(y0 * width + (x0+1)) * channels + c]; float v01 = src[((y0+1) * width + x0) * channels + c]; float v11 = src[((y0+1) * width + (x0+1)) * channels + c]; float v = v00*(1-fx)*(1-fy) + v10*fx*(1-fy) + v01*(1-fx)*fy + v11*fx*fy; dst[dst_idx + c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v)); } } ''', 'zoom') RIPPLE_KERNEL = cp.RawKernel(r''' extern "C" __global__ void ripple( const unsigned char* src, unsigned char* dst, int width, int height, int channels, float cx, float cy, float amplitude, float frequency, float decay, float phase ) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= width || y >= height) return; float dx = x - cx; float dy = y - cy; float dist = sqrtf(dx * dx + dy * dy); // Ripple displacement float wave = sinf(dist * frequency * 0.1f + phase); float amp = amplitude * expf(-dist * decay * 0.01f); float src_x = x + dx / (dist + 0.001f) * wave * amp; float src_y = y + dy / (dist + 0.001f) * wave * amp; int dst_idx = (y * width + x) * channels; if (src_x < 0 || src_x >= width - 1 || src_y < 0 || src_y >= height - 1) { for (int c = 0; c < channels; c++) { dst[dst_idx + c] = src[dst_idx + c]; // Keep original on boundary } return; } // Bilinear interpolation int x0 = (int)src_x; int y0 = (int)src_y; float fx = src_x - x0; float fy = src_y - y0; for (int c = 0; c < channels; c++) { float v00 = src[(y0 * width + x0) * channels + c]; float v10 = src[(y0 * width + (x0+1)) * channels + c]; float v01 = src[((y0+1) * width + x0) * channels + c]; float v11 = src[((y0+1) * width + (x0+1)) * channels + c]; float v = v00*(1-fx)*(1-fy) + v10*fx*(1-fy) + v01*(1-fx)*fy + v11*fx*fy; dst[dst_idx + c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v)); } } ''', 'ripple') # ============================================================================= # Fast GPU Operations # ============================================================================= class FastGPUOps: """Optimized GPU operations using CUDA kernels.""" def __init__(self, width: int, height: int): self.width = width self.height = height self.channels = 3 # Pre-allocate work buffers self._buf1 = cp.zeros((height, width, 3), dtype=cp.uint8) self._buf2 = cp.zeros((height, width, 3), dtype=cp.uint8) self._current_buf = 0 # Grid/block config self._block_2d = (16, 16) self._grid_2d = ((width + 15) // 16, (height + 15) // 16) self._block_1d = 256 self._grid_1d = (width * height * 3 + 255) // 256 def _get_buffers(self): """Get source and destination buffers (ping-pong).""" if self._current_buf == 0: return self._buf1, self._buf2 return self._buf2, self._buf1 def _swap_buffers(self): """Swap ping-pong buffers.""" self._current_buf = 1 - self._current_buf def set_input(self, frame: cp.ndarray): """Set input frame.""" if self._current_buf == 0: cp.copyto(self._buf1, frame) else: cp.copyto(self._buf2, frame) def get_output(self) -> cp.ndarray: """Get current output buffer.""" if self._current_buf == 0: return self._buf1 return self._buf2 def rotate(self, angle: float, cx: float = None, cy: float = None): """Fast GPU rotation.""" if cx is None: cx = self.width / 2 if cy is None: cy = self.height / 2 src, dst = self._get_buffers() # Compute inverse rotation matrix import math rad = math.radians(-angle) # Negative for inverse cos_a = math.cos(rad) sin_a = math.sin(rad) # Inverse affine matrix (rotate around center) m00 = cos_a m01 = -sin_a m02 = cx - cos_a * cx + sin_a * cy m10 = sin_a m11 = cos_a m12 = cy - sin_a * cx - cos_a * cy AFFINE_WARP_KERNEL( self._grid_2d, self._block_2d, (src, dst, self.width, self.height, self.channels, np.float32(m00), np.float32(m01), np.float32(m02), np.float32(m10), np.float32(m11), np.float32(m12)) ) self._swap_buffers() def zoom(self, factor: float, cx: float = None, cy: float = None): """Fast GPU zoom.""" if cx is None: cx = self.width / 2 if cy is None: cy = self.height / 2 src, dst = self._get_buffers() ZOOM_KERNEL( self._grid_2d, self._block_2d, (src, dst, self.width, self.height, self.channels, np.float32(factor), np.float32(cx), np.float32(cy)) ) self._swap_buffers() def blend(self, other: cp.ndarray, alpha: float): """Fast GPU blend.""" src, dst = self._get_buffers() size = self.width * self.height * self.channels BLEND_KERNEL( (self._grid_1d,), (self._block_1d,), (src.ravel(), other.ravel(), dst.ravel(), size, np.float32(alpha)) ) self._swap_buffers() def brightness(self, factor: float): """Fast GPU brightness adjustment.""" src, dst = self._get_buffers() size = self.width * self.height * self.channels BRIGHTNESS_CONTRAST_KERNEL( (self._grid_1d,), (self._block_1d,), (src.ravel(), dst.ravel(), size, np.float32((factor - 1) * 128), np.float32(1.0)) ) self._swap_buffers() def contrast(self, factor: float): """Fast GPU contrast adjustment.""" src, dst = self._get_buffers() size = self.width * self.height * self.channels BRIGHTNESS_CONTRAST_KERNEL( (self._grid_1d,), (self._block_1d,), (src.ravel(), dst.ravel(), size, np.float32(0), np.float32(factor)) ) self._swap_buffers() def hue_shift(self, degrees: float): """Fast GPU hue shift.""" src, dst = self._get_buffers() HUE_SHIFT_KERNEL( self._grid_2d, self._block_2d, (src, dst, self.width, self.height, np.float32(degrees)) ) self._swap_buffers() def invert(self): """Fast GPU invert.""" src, dst = self._get_buffers() size = self.width * self.height * self.channels INVERT_KERNEL( (self._grid_1d,), (self._block_1d,), (src.ravel(), dst.ravel(), size) ) self._swap_buffers() def ripple(self, amplitude: float, cx: float = None, cy: float = None, frequency: float = 8, decay: float = 2, phase: float = 0): """Fast GPU ripple effect.""" if cx is None: cx = self.width / 2 if cy is None: cy = self.height / 2 src, dst = self._get_buffers() RIPPLE_KERNEL( self._grid_2d, self._block_2d, (src, dst, self.width, self.height, self.channels, np.float32(cx), np.float32(cy), np.float32(amplitude), np.float32(frequency), np.float32(decay), np.float32(phase)) ) self._swap_buffers() # Global fast ops instance (created per resolution) _FAST_OPS: Dict[Tuple[int, int], FastGPUOps] = {} def get_fast_ops(width: int, height: int) -> FastGPUOps: """Get or create FastGPUOps for given resolution.""" key = (width, height) if key not in _FAST_OPS: _FAST_OPS[key] = FastGPUOps(width, height) return _FAST_OPS[key] # ============================================================================= # Fast effect functions (drop-in replacements) # ============================================================================= def fast_rotate(frame: cp.ndarray, angle: float, **kwargs) -> cp.ndarray: """Fast GPU rotation.""" h, w = frame.shape[:2] ops = get_fast_ops(w, h) ops.set_input(frame) ops.rotate(angle, kwargs.get('cx'), kwargs.get('cy')) return ops.get_output().copy() def fast_zoom(frame: cp.ndarray, factor: float, **kwargs) -> cp.ndarray: """Fast GPU zoom.""" h, w = frame.shape[:2] ops = get_fast_ops(w, h) ops.set_input(frame) ops.zoom(factor, kwargs.get('cx'), kwargs.get('cy')) return ops.get_output().copy() def fast_blend(frame1: cp.ndarray, frame2: cp.ndarray, alpha: float) -> cp.ndarray: """Fast GPU blend.""" h, w = frame1.shape[:2] ops = get_fast_ops(w, h) ops.set_input(frame1) ops.blend(frame2, alpha) return ops.get_output().copy() def fast_hue_shift(frame: cp.ndarray, degrees: float) -> cp.ndarray: """Fast GPU hue shift.""" h, w = frame.shape[:2] ops = get_fast_ops(w, h) ops.set_input(frame) ops.hue_shift(degrees) return ops.get_output().copy() def fast_invert(frame: cp.ndarray) -> cp.ndarray: """Fast GPU invert.""" h, w = frame.shape[:2] ops = get_fast_ops(w, h) ops.set_input(frame) ops.invert() return ops.get_output().copy() def fast_ripple(frame: cp.ndarray, amplitude: float, **kwargs) -> cp.ndarray: """Fast GPU ripple.""" h, w = frame.shape[:2] ops = get_fast_ops(w, h) ops.set_input(frame) ops.ripple( amplitude, kwargs.get('center_x', w/2), kwargs.get('center_y', h/2), kwargs.get('frequency', 8), kwargs.get('decay', 2), kwargs.get('speed', 0) * kwargs.get('t', 0) # phase from speed*time ) return ops.get_output().copy() print("[jit_compiler] CUDA kernels loaded", file=sys.stderr)