Files
rose-ash/l1/streaming/jit_compiler.py
2026-02-24 23:07:19 +00:00

532 lines
15 KiB
Python

"""
JIT Compiler for sexp frame pipelines.
Compiles sexp expressions to fused CUDA kernels for maximum performance.
"""
import cupy as cp
import numpy as np
from typing import Dict, List, Any, Optional, Tuple, Callable
import hashlib
import sys
# Cache mapping a string key to a compiled-kernel callable.
# NOTE(review): presumably keyed by _generate_kernel_key(); nothing in the
# visible part of this module reads or writes it - confirm against callers.
_KERNEL_CACHE: Dict[str, Callable] = {}


def _generate_kernel_key(ops: List[Tuple]) -> str:
    """Derive a cache key from a sequence of operations.

    Args:
        ops: Operation tuples; only their ``str()`` representation is used.

    Returns:
        The hexadecimal MD5 digest of the UTF-8 encoded ``str(ops)``.
    """
    hasher = hashlib.md5()
    hasher.update(str(ops).encode())
    return hasher.hexdigest()
# =============================================================================
# CUDA Kernel Templates
# =============================================================================
# Inverse-mapped affine warp with bilinear sampling.
#
# One thread per destination pixel (2-D launch).  For destination (x, y) the
# source position is
#     src_x = m00 * x + m01 * y + m02
#     src_y = m10 * x + m11 * y + m12
# so the caller must pass the *inverse* of the desired forward transform.
# Pixels whose source position falls outside the frame are written as 0.
# Both images are interleaved `channels`-byte pixels, row-major, `width`
# pixels per row.
AFFINE_WARP_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void affine_warp(
    const unsigned char* src,
    unsigned char* dst,
    int width, int height, int channels,
    float m00, float m01, float m02,
    float m10, float m11, float m12
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    // Apply inverse affine transform
    float src_x = m00 * x + m01 * y + m02;
    float src_y = m10 * x + m11 * y + m12;
    int dst_idx = (y * width + x) * channels;

    // Bounds check, phrased as a negated "inside" test so that a NaN
    // coordinate (every comparison false) is treated as outside instead of
    // slipping through and indexing src with an undefined integer.
    // The upper bounds are inclusive: a sample lying exactly on the last
    // row/column is valid, which keeps an identity transform lossless.
    bool inside = src_x >= 0.0f && src_x <= (float)(width - 1) &&
                  src_y >= 0.0f && src_y <= (float)(height - 1);
    if (!inside) {
        for (int c = 0; c < channels; c++) {
            dst[dst_idx + c] = 0;
        }
        return;
    }

    // Bilinear interpolation
    int x0 = (int)src_x;
    int y0 = (int)src_y;
    // Clamp the right/bottom neighbours so the inclusive bound above never
    // reads past the image; their weight (fx / fy) is 0 in that case.
    int x1 = (x0 + 1 < width) ? x0 + 1 : x0;
    int y1 = (y0 + 1 < height) ? y0 + 1 : y0;
    float fx = src_x - x0;
    float fy = src_y - y0;

    for (int c = 0; c < channels; c++) {
        float v00 = src[(y0 * width + x0) * channels + c];
        float v10 = src[(y0 * width + x1) * channels + c];
        float v01 = src[(y1 * width + x0) * channels + c];
        float v11 = src[(y1 * width + x1) * channels + c];
        float v0 = v00 * (1 - fx) + v10 * fx;
        float v1 = v01 * (1 - fx) + v11 * fx;
        float v = v0 * (1 - fy) + v1 * fy;
        dst[dst_idx + c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}
''', 'affine_warp')
# Per-byte linear blend of two buffers (1-D launch, one thread per byte):
#     dst[i] = src1[i] * (1 - alpha) + src2[i] * alpha
# The result is clamped to [0, 255] and truncated to unsigned char.
# Threads with idx >= size exit without writing, so all three buffers must
# hold at least `size` bytes.
BLEND_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void blend(
const unsigned char* src1,
const unsigned char* src2,
unsigned char* dst,
int size,
float alpha
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
float v = src1[idx] * (1.0f - alpha) + src2[idx] * alpha;
dst[idx] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
''', 'blend')
# Per-byte brightness/contrast adjustment (1-D launch, one thread per byte):
#     dst[i] = (src[i] - 128) * contrast + 128 + brightness
# i.e. contrast scales around the mid-grey value 128 and brightness is an
# additive offset in byte units.  The result is clamped to [0, 255] and
# truncated to unsigned char.  Threads with idx >= size exit without writing.
BRIGHTNESS_CONTRAST_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void brightness_contrast(
const unsigned char* src,
unsigned char* dst,
int size,
float brightness,
float contrast
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
float v = src[idx];
v = (v - 128.0f) * contrast + 128.0f + brightness;
dst[idx] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
''', 'brightness_contrast')
# Hue rotation of a 3-byte-per-pixel image (2-D launch, one thread per pixel).
#
# Each pixel is read as three consecutive bytes, interpreted as R, G, B,
# converted to HSV, has `hue_shift` degrees added to its hue, and is converted
# back.  Saturation and value are left untouched.  The pixel stride is
# hard-coded to 3 bytes.
HUE_SHIFT_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void hue_shift(
    const unsigned char* src,
    unsigned char* dst,
    int width, int height,
    float hue_shift
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    int idx = (y * width + x) * 3;
    float r = src[idx] / 255.0f;
    float g = src[idx + 1] / 255.0f;
    float b = src[idx + 2] / 255.0f;

    // RGB to HSV
    float max_c = fmaxf(r, fmaxf(g, b));
    float min_c = fminf(r, fminf(g, b));
    float delta = max_c - min_c;
    float h = 0, s = 0, v = max_c;
    if (delta > 0.00001f) {
        s = delta / max_c;
        if (r >= max_c) h = (g - b) / delta;
        else if (g >= max_c) h = 2.0f + (b - r) / delta;
        else h = 4.0f + (r - g) / delta;
        h *= 60.0f;
        if (h < 0) h += 360.0f;
    }

    // Apply hue shift.  fmodf keeps the sign of its first argument, so a
    // single "+ 360" is not enough for shifts below -360 degrees; wrap the
    // negative remainder explicitly to keep h in [0, 360].
    h = fmodf(h + hue_shift, 360.0f);
    if (h < 0.0f) h += 360.0f;

    // HSV to RGB
    float c = v * s;
    float x_val = c * (1 - fabsf(fmodf(h / 60.0f, 2.0f) - 1));
    float m = v - c;
    float r2, g2, b2;
    if (h < 60) { r2 = c; g2 = x_val; b2 = 0; }
    else if (h < 120) { r2 = x_val; g2 = c; b2 = 0; }
    else if (h < 180) { r2 = 0; g2 = c; b2 = x_val; }
    else if (h < 240) { r2 = 0; g2 = x_val; b2 = c; }
    else if (h < 300) { r2 = x_val; g2 = 0; b2 = c; }
    else { r2 = c; g2 = 0; b2 = x_val; }

    // Round to nearest (instead of truncating) so that float error in the
    // round trip cannot turn e.g. 199.99998 into 199, and clamp so the
    // conversion to unsigned char is always defined.
    dst[idx]     = (unsigned char)fminf(fmaxf((r2 + m) * 255.0f + 0.5f, 0.0f), 255.0f);
    dst[idx + 1] = (unsigned char)fminf(fmaxf((g2 + m) * 255.0f + 0.5f, 0.0f), 255.0f);
    dst[idx + 2] = (unsigned char)fminf(fmaxf((b2 + m) * 255.0f + 0.5f, 0.0f), 255.0f);
}
''', 'hue_shift')
# Per-byte inversion (1-D launch, one thread per byte): dst[i] = 255 - src[i].
# Threads with idx >= size exit without writing.
INVERT_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void invert(
const unsigned char* src,
unsigned char* dst,
int size
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
dst[idx] = 255 - src[idx];
}
''', 'invert')
# Zoom about a centre point with bilinear sampling.
#
# One thread per destination pixel (2-D launch).  Destination (x, y) samples
# the source at
#     src_x = (x - cx) / zoom_factor + cx
#     src_y = (y - cy) / zoom_factor + cy
# so zoom_factor > 1 magnifies around (cx, cy).  Pixels whose source position
# falls outside the frame are written as 0.
ZOOM_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void zoom(
    const unsigned char* src,
    unsigned char* dst,
    int width, int height, int channels,
    float zoom_factor,
    float cx, float cy
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    // Map to source coordinates (zoom from center)
    float src_x = (x - cx) / zoom_factor + cx;
    float src_y = (y - cy) / zoom_factor + cy;
    int dst_idx = (y * width + x) * channels;

    // Negated "inside" test: with zoom_factor == 0 the centre row/column
    // evaluates 0/0 = NaN, and NaN must count as outside rather than pass
    // every comparison and be used as an index.  Upper bounds are inclusive
    // so a factor of 1 reproduces the last row and column.
    bool inside = src_x >= 0.0f && src_x <= (float)(width - 1) &&
                  src_y >= 0.0f && src_y <= (float)(height - 1);
    if (!inside) {
        for (int c = 0; c < channels; c++) {
            dst[dst_idx + c] = 0;
        }
        return;
    }

    // Bilinear interpolation
    int x0 = (int)src_x;
    int y0 = (int)src_y;
    // Clamped neighbours: on the last row/column the weight fx / fy is 0,
    // so re-reading the same texel is exact and stays in bounds.
    int x1 = (x0 + 1 < width) ? x0 + 1 : x0;
    int y1 = (y0 + 1 < height) ? y0 + 1 : y0;
    float fx = src_x - x0;
    float fy = src_y - y0;

    for (int c = 0; c < channels; c++) {
        float v00 = src[(y0 * width + x0) * channels + c];
        float v10 = src[(y0 * width + x1) * channels + c];
        float v01 = src[(y1 * width + x0) * channels + c];
        float v11 = src[(y1 * width + x1) * channels + c];
        float v = v00*(1-fx)*(1-fy) + v10*fx*(1-fy) + v01*(1-fx)*fy + v11*fx*fy;
        dst[dst_idx + c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}
''', 'zoom')
# Radial ripple displacement with bilinear sampling.
#
# One thread per destination pixel (2-D launch).  Each pixel is displaced
# along the direction from (cx, cy) by
#     sin(dist * frequency * 0.1 + phase) * amplitude * exp(-dist * decay * 0.01)
# where dist is its distance from (cx, cy).  Pixels whose displaced source
# position falls outside the frame keep their original value.
RIPPLE_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void ripple(
    const unsigned char* src,
    unsigned char* dst,
    int width, int height, int channels,
    float cx, float cy,
    float amplitude, float frequency, float decay, float phase
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float dx = x - cx;
    float dy = y - cy;
    float dist = sqrtf(dx * dx + dy * dy);

    // Ripple displacement
    float wave = sinf(dist * frequency * 0.1f + phase);
    float amp = amplitude * expf(-dist * decay * 0.01f);
    // The 0.001 keeps the direction vector finite at the exact centre.
    float src_x = x + dx / (dist + 0.001f) * wave * amp;
    float src_y = y + dy / (dist + 0.001f) * wave * amp;
    int dst_idx = (y * width + x) * channels;

    // Negated "inside" test so that NaN (e.g. inf * 0 when expf overflows
    // for a negative decay) is treated as outside instead of being cast to
    // an integer index.  Upper bounds are inclusive; neighbours are clamped.
    bool inside = src_x >= 0.0f && src_x <= (float)(width - 1) &&
                  src_y >= 0.0f && src_y <= (float)(height - 1);
    if (!inside) {
        for (int c = 0; c < channels; c++) {
            dst[dst_idx + c] = src[dst_idx + c];  // Keep original on boundary
        }
        return;
    }

    // Bilinear interpolation
    int x0 = (int)src_x;
    int y0 = (int)src_y;
    int x1 = (x0 + 1 < width) ? x0 + 1 : x0;
    int y1 = (y0 + 1 < height) ? y0 + 1 : y0;
    float fx = src_x - x0;
    float fy = src_y - y0;

    for (int c = 0; c < channels; c++) {
        float v00 = src[(y0 * width + x0) * channels + c];
        float v10 = src[(y0 * width + x1) * channels + c];
        float v01 = src[(y1 * width + x0) * channels + c];
        float v11 = src[(y1 * width + x1) * channels + c];
        float v = v00*(1-fx)*(1-fy) + v10*fx*(1-fy) + v01*(1-fx)*fy + v11*fx*fy;
        dst[dst_idx + c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}
''', 'ripple')
# =============================================================================
# Fast GPU Operations
# =============================================================================
class FastGPUOps:
    """Chain of uint8 frame effects backed by this module's CUDA kernels.

    Two ``(height, width, 3)`` uint8 device buffers are allocated once and
    used in ping-pong fashion: every effect reads the current buffer, writes
    the other one and then swaps them.  Typical use is ``set_input(frame)``,
    one or more effect calls, then ``get_output()``.

    ``get_output()`` returns the internal buffer itself, not a copy; it is
    overwritten by later ``set_input`` / effect calls on the same instance.
    """

    def __init__(self, width: int, height: int):
        """Allocate the work buffers and launch configuration.

        Args:
            width: Frame width in pixels.
            height: Frame height in pixels.
        """
        self.width = width
        self.height = height
        self.channels = 3
        # Total number of bytes in one frame; used by the 1-D kernels.
        self._size = width * height * self.channels
        # Pre-allocate work buffers
        self._buf1 = cp.zeros((height, width, 3), dtype=cp.uint8)
        self._buf2 = cp.zeros((height, width, 3), dtype=cp.uint8)
        self._current_buf = 0
        # Grid/block config: 16x16 threads per block over the pixel grid for
        # 2-D kernels, 256 threads per block over the byte count for 1-D ones.
        self._block_2d = (16, 16)
        self._grid_2d = ((width + 15) // 16, (height + 15) // 16)
        self._block_1d = 256
        self._grid_1d = (self._size + 255) // 256

    def _get_buffers(self):
        """Get source and destination buffers (ping-pong)."""
        if self._current_buf == 0:
            return self._buf1, self._buf2
        return self._buf2, self._buf1

    def _swap_buffers(self):
        """Swap ping-pong buffers."""
        self._current_buf = 1 - self._current_buf

    def _dims(self):
        """Return (width, height, channels) as 32-bit ints for kernel calls.

        The kernels declare these parameters as C ``int``.  CuPy passes a
        plain Python ``int`` as ``long long``, so the values are wrapped
        explicitly to match the kernel signature.
        """
        return np.int32(self.width), np.int32(self.height), np.int32(self.channels)

    def set_input(self, frame: cp.ndarray):
        """Copy ``frame`` into the current (source) buffer."""
        current, _ = self._get_buffers()
        cp.copyto(current, frame)

    def get_output(self) -> cp.ndarray:
        """Return the current buffer (the result of the last effect).

        This is the internal buffer, not a copy.
        """
        current, _ = self._get_buffers()
        return current

    def rotate(self, angle: float, cx: Optional[float] = None,
               cy: Optional[float] = None):
        """Rotate the current frame by ``angle`` degrees about ``(cx, cy)``.

        Args:
            angle: Rotation angle in degrees.
            cx: Rotation centre x; defaults to ``width / 2``.
            cy: Rotation centre y; defaults to ``height / 2``.
        """
        import math

        if cx is None:
            cx = self.width / 2
        if cy is None:
            cy = self.height / 2
        src, dst = self._get_buffers()
        # The kernel maps destination -> source, so build the inverse
        # rotation (negated angle) about the centre.
        rad = math.radians(-angle)
        cos_a = math.cos(rad)
        sin_a = math.sin(rad)
        m00 = cos_a
        m01 = -sin_a
        m02 = cx - cos_a * cx + sin_a * cy
        m10 = sin_a
        m11 = cos_a
        m12 = cy - sin_a * cx - cos_a * cy
        width, height, channels = self._dims()
        AFFINE_WARP_KERNEL(
            self._grid_2d, self._block_2d,
            (src, dst, width, height, channels,
             np.float32(m00), np.float32(m01), np.float32(m02),
             np.float32(m10), np.float32(m11), np.float32(m12))
        )
        self._swap_buffers()

    def zoom(self, factor: float, cx: Optional[float] = None,
             cy: Optional[float] = None):
        """Zoom the current frame by ``factor`` about ``(cx, cy)``.

        Args:
            factor: Zoom factor; values above 1 magnify.  A factor of 0 (or
                NaN) produces an all-black frame.
            cx: Zoom centre x; defaults to ``width / 2``.
            cy: Zoom centre y; defaults to ``height / 2``.
        """
        import math

        if cx is None:
            cx = self.width / 2
        if cy is None:
            cy = self.height / 2
        src, dst = self._get_buffers()
        if factor == 0 or math.isnan(factor):
            # The kernel divides by the factor; 0 or NaN would give NaN
            # source coordinates.  Every source position is then outside
            # the frame, which the kernel renders as 0, so emit that
            # directly without launching.
            dst.fill(0)
            self._swap_buffers()
            return
        width, height, channels = self._dims()
        ZOOM_KERNEL(
            self._grid_2d, self._block_2d,
            (src, dst, width, height, channels,
             np.float32(factor), np.float32(cx), np.float32(cy))
        )
        self._swap_buffers()

    def blend(self, other: cp.ndarray, alpha: float):
        """Blend the current frame with ``other``.

        The result is ``current * (1 - alpha) + other * alpha`` per byte.

        Args:
            other: Frame of shape ``(height, width, 3)`` and dtype uint8.
            alpha: Weight of ``other``.

        Raises:
            ValueError: If ``other`` does not have the expected shape.
            TypeError: If ``other`` is not uint8.
        """
        expected_shape = (self.height, self.width, self.channels)
        # The kernel reads `size` raw bytes from `other`; a smaller or
        # differently typed array would be read out of bounds / as garbage.
        if tuple(other.shape) != expected_shape:
            raise ValueError(
                f"blend: expected frame of shape {expected_shape}, "
                f"got {tuple(other.shape)}"
            )
        if other.dtype != cp.uint8:
            raise TypeError(f"blend: expected uint8 frame, got {other.dtype}")
        src, dst = self._get_buffers()
        BLEND_KERNEL(
            (self._grid_1d,), (self._block_1d,),
            (src.ravel(), other.ravel(), dst.ravel(),
             np.int32(self._size), np.float32(alpha))
        )
        self._swap_buffers()

    def brightness(self, factor: float):
        """Adjust brightness by adding ``(factor - 1) * 128`` to every byte.

        A factor of 1 leaves the frame unchanged.
        """
        src, dst = self._get_buffers()
        BRIGHTNESS_CONTRAST_KERNEL(
            (self._grid_1d,), (self._block_1d,),
            (src.ravel(), dst.ravel(), np.int32(self._size),
             np.float32((factor - 1) * 128), np.float32(1.0))
        )
        self._swap_buffers()

    def contrast(self, factor: float):
        """Scale every byte's distance from 128 by ``factor``.

        A factor of 1 leaves the frame unchanged.
        """
        src, dst = self._get_buffers()
        BRIGHTNESS_CONTRAST_KERNEL(
            (self._grid_1d,), (self._block_1d,),
            (src.ravel(), dst.ravel(), np.int32(self._size),
             np.float32(0), np.float32(factor))
        )
        self._swap_buffers()

    def hue_shift(self, degrees: float):
        """Rotate the hue of the current frame by ``degrees``."""
        src, dst = self._get_buffers()
        # Hue is periodic, so fold the shift into [0, 360).  This keeps the
        # kernel's fmodf-based wrap-around valid for arbitrarily large
        # negative shifts.
        shift = degrees % 360.0
        HUE_SHIFT_KERNEL(
            self._grid_2d, self._block_2d,
            (src, dst, np.int32(self.width), np.int32(self.height),
             np.float32(shift))
        )
        self._swap_buffers()

    def invert(self):
        """Invert the current frame (``255 - value`` per byte)."""
        src, dst = self._get_buffers()
        INVERT_KERNEL(
            (self._grid_1d,), (self._block_1d,),
            (src.ravel(), dst.ravel(), np.int32(self._size))
        )
        self._swap_buffers()

    def ripple(self, amplitude: float, cx: Optional[float] = None,
               cy: Optional[float] = None,
               frequency: float = 8, decay: float = 2, phase: float = 0):
        """Apply a radial ripple centred on ``(cx, cy)``.

        Args:
            amplitude: Displacement amplitude passed to the kernel.
            cx: Ripple centre x; defaults to ``width / 2``.
            cy: Ripple centre y; defaults to ``height / 2``.
            frequency: Wave frequency passed to the kernel.
            decay: Amplitude decay with distance, passed to the kernel.
            phase: Wave phase offset passed to the kernel.
        """
        if cx is None:
            cx = self.width / 2
        if cy is None:
            cy = self.height / 2
        src, dst = self._get_buffers()
        width, height, channels = self._dims()
        RIPPLE_KERNEL(
            self._grid_2d, self._block_2d,
            (src, dst, width, height, channels,
             np.float32(cx), np.float32(cy),
             np.float32(amplitude), np.float32(frequency),
             np.float32(decay), np.float32(phase))
        )
        self._swap_buffers()
# One FastGPUOps instance per frame resolution, keyed by (width, height).
_FAST_OPS: Dict[Tuple[int, int], FastGPUOps] = {}


def get_fast_ops(width: int, height: int) -> FastGPUOps:
    """Return the shared FastGPUOps for ``(width, height)``.

    The instance is created on first request for a resolution and reused
    for every later request with the same resolution.
    """
    resolution = (width, height)
    ops = _FAST_OPS.get(resolution)
    if ops is None:
        ops = FastGPUOps(width, height)
        _FAST_OPS[resolution] = ops
    return ops
# =============================================================================
# Fast effect functions (drop-in replacements)
# =============================================================================
def fast_rotate(frame: cp.ndarray, angle: float, **kwargs) -> cp.ndarray:
    """Rotate ``frame`` by ``angle`` degrees on the GPU.

    Optional keyword arguments ``cx`` / ``cy`` give the rotation centre;
    when absent the frame centre is used.  Returns a copy of the result, so
    it is not affected by later calls that reuse the shared work buffers.
    """
    rows, cols = frame.shape[:2]
    gpu = get_fast_ops(cols, rows)
    gpu.set_input(frame)
    center_x = kwargs.get('cx')
    center_y = kwargs.get('cy')
    gpu.rotate(angle, center_x, center_y)
    result = gpu.get_output()
    return result.copy()
def fast_zoom(frame: cp.ndarray, factor: float, **kwargs) -> cp.ndarray:
    """Zoom ``frame`` by ``factor`` on the GPU.

    Optional keyword arguments ``cx`` / ``cy`` give the zoom centre; when
    absent the frame centre is used.  Returns a copy of the result, so it is
    not affected by later calls that reuse the shared work buffers.
    """
    rows, cols = frame.shape[:2]
    gpu = get_fast_ops(cols, rows)
    gpu.set_input(frame)
    center_x = kwargs.get('cx')
    center_y = kwargs.get('cy')
    gpu.zoom(factor, center_x, center_y)
    result = gpu.get_output()
    return result.copy()
def fast_blend(frame1: cp.ndarray, frame2: cp.ndarray, alpha: float) -> cp.ndarray:
    """Blend two frames on the GPU: ``frame1 * (1 - alpha) + frame2 * alpha``.

    Args:
        frame1: Base frame.
        frame2: Frame blended on top; must match ``frame1`` in shape and be
            uint8.
        alpha: Weight of ``frame2``.

    Returns:
        A copy of the blended frame, so it is not affected by later calls
        that reuse the shared work buffers.

    Raises:
        ValueError: If the two frames differ in shape.
        TypeError: If ``frame2`` is not uint8.
    """
    # The blend kernel reads frame2 as a raw byte buffer of frame1's size;
    # a smaller or differently typed frame2 would be read out of bounds or
    # reinterpreted as garbage, so reject it up front.
    if tuple(frame2.shape) != tuple(frame1.shape):
        raise ValueError(
            f"fast_blend: frame shapes differ: "
            f"{tuple(frame1.shape)} vs {tuple(frame2.shape)}"
        )
    if frame2.dtype != cp.uint8:
        raise TypeError(f"fast_blend: expected uint8 frame2, got {frame2.dtype}")
    h, w = frame1.shape[:2]
    ops = get_fast_ops(w, h)
    ops.set_input(frame1)
    ops.blend(frame2, alpha)
    return ops.get_output().copy()
def fast_hue_shift(frame: cp.ndarray, degrees: float) -> cp.ndarray:
    """Rotate the hue of ``frame`` by ``degrees`` on the GPU.

    Returns a copy of the result, so it is not affected by later calls that
    reuse the shared work buffers.
    """
    rows, cols = frame.shape[:2]
    gpu = get_fast_ops(cols, rows)
    gpu.set_input(frame)
    gpu.hue_shift(degrees)
    result = gpu.get_output()
    return result.copy()
def fast_invert(frame: cp.ndarray) -> cp.ndarray:
    """Invert ``frame`` on the GPU.

    Returns a copy of the result, so it is not affected by later calls that
    reuse the shared work buffers.
    """
    rows, cols = frame.shape[:2]
    gpu = get_fast_ops(cols, rows)
    gpu.set_input(frame)
    gpu.invert()
    result = gpu.get_output()
    return result.copy()
def fast_ripple(frame: cp.ndarray, amplitude: float, **kwargs) -> cp.ndarray:
    """Apply a radial ripple to ``frame`` on the GPU.

    Recognised keyword arguments (with defaults): ``center_x`` (``w / 2``),
    ``center_y`` (``h / 2``), ``frequency`` (8), ``decay`` (2), ``speed`` (0)
    and ``t`` (0).  The wave phase is ``speed * t``.  Returns a copy of the
    result, so it is not affected by later calls that reuse the shared work
    buffers.
    """
    rows, cols = frame.shape[:2]
    gpu = get_fast_ops(cols, rows)
    gpu.set_input(frame)
    center_x = kwargs.get('center_x', cols / 2)
    center_y = kwargs.get('center_y', rows / 2)
    frequency = kwargs.get('frequency', 8)
    decay = kwargs.get('decay', 2)
    # phase from speed*time
    phase = kwargs.get('speed', 0) * kwargs.get('t', 0)
    gpu.ripple(amplitude, center_x, center_y, frequency, decay, phase)
    result = gpu.get_output()
    return result.copy()
# Import-time notice written to stderr.  It runs once, after every RawKernel
# object above has been constructed, whenever this module is imported.
print("[jit_compiler] CUDA kernels loaded", file=sys.stderr)