# rose-ash/l1/streaming/jit_compiler.py

"""
JIT Compiler for sexp frame pipelines.

Compiles sexp expressions to fused CUDA kernels for maximum performance.
"""

import cupy as cp
import numpy as np
from typing import Dict, List, Any, Optional, Tuple, Callable
import hashlib
import sys

# Cache for compiled kernels: maps a string key to a callable.
# NOTE(review): nothing in the visible code reads or writes this dict;
# presumably it is meant to be keyed by _generate_kernel_key() — confirm.
_KERNEL_CACHE: Dict[str, Callable] = {}


def _generate_kernel_key(ops: List[Tuple]) -> str:
    """Generate cache key for operation sequence."""
    return hashlib.md5(str(ops).encode()).hexdigest()


# =============================================================================
# CUDA Kernel Templates
# =============================================================================

# Inverse-mapped affine warp with bilinear sampling.
#
# One thread per destination pixel (2-D launch). The six floats are the 2x3
# matrix that maps a destination pixel (x, y) to its source position:
#     src_x = m00 * x + m01 * y + m02
#     src_y = m10 * x + m11 * y + m12
# Destination pixels whose source position falls outside the image (or is
# NaN) are written as 0 in every channel. Both buffers are indexed as
# interleaved (y * width + x) * channels + c bytes.
AFFINE_WARP_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void affine_warp(
    const unsigned char* src,
    unsigned char* dst,
    int width, int height, int channels,
    float m00, float m01, float m02,
    float m10, float m11, float m12
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) return;

    // Apply inverse affine transform (destination pixel -> source position)
    float src_x = m00 * x + m01 * y + m02;
    float src_y = m10 * x + m11 * y + m12;

    int dst_idx = (y * width + x) * channels;

    // Bounds check. Expressed as a negated "inside" test so that NaN
    // coordinates are rejected as well, and inclusive of the last row and
    // column so that an identity transform reproduces the whole image
    // instead of blanking its right and bottom edges.
    bool inside = src_x >= 0.0f && src_x <= (float)(width - 1) &&
                  src_y >= 0.0f && src_y <= (float)(height - 1);
    if (!inside) {
        for (int c = 0; c < channels; c++) {
            dst[dst_idx + c] = 0;
        }
        return;
    }

    // Bilinear interpolation
    int x0 = (int)src_x;
    int y0 = (int)src_y;
    // The far neighbours are only out of range when the sample sits exactly
    // on the last row/column, where their weight (fx / fy) is zero, so
    // clamping them keeps every read inside the image.
    int x1 = (x0 + 1 < width) ? x0 + 1 : x0;
    int y1 = (y0 + 1 < height) ? y0 + 1 : y0;

    float fx = src_x - x0;
    float fy = src_y - y0;

    for (int c = 0; c < channels; c++) {
        float v00 = src[(y0 * width + x0) * channels + c];
        float v10 = src[(y0 * width + x1) * channels + c];
        float v01 = src[(y1 * width + x0) * channels + c];
        float v11 = src[(y1 * width + x1) * channels + c];

        float v0 = v00 * (1 - fx) + v10 * fx;
        float v1 = v01 * (1 - fx) + v11 * fx;
        float v = v0 * (1 - fy) + v1 * fy;

        dst[dst_idx + c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}
''', 'affine_warp')


# Per-byte linear blend of two buffers (1-D launch, one thread per byte):
#     dst[i] = src1[i] * (1 - alpha) + src2[i] * alpha
# The result is clamped to [0, 255] before conversion to unsigned char.
# Threads with idx >= size return without touching memory, so all three
# buffers must hold at least `size` bytes.
BLEND_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void blend(
    const unsigned char* src1,
    const unsigned char* src2,
    unsigned char* dst,
    int size,
    float alpha
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size) return;

    float v = src1[idx] * (1.0f - alpha) + src2[idx] * alpha;
    dst[idx] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
''', 'blend')


# Per-byte brightness/contrast adjustment (1-D launch, one thread per byte):
#     dst[i] = (src[i] - 128) * contrast + 128 + brightness
# i.e. contrast scales values about the mid-grey level 128 and brightness is
# an additive offset in byte units. The result is clamped to [0, 255].
BRIGHTNESS_CONTRAST_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void brightness_contrast(
    const unsigned char* src,
    unsigned char* dst,
    int size,
    float brightness,
    float contrast
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size) return;

    float v = src[idx];
    v = (v - 128.0f) * contrast + 128.0f + brightness;
    dst[idx] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
''', 'brightness_contrast')


# Per-pixel hue rotation (2-D launch, one thread per pixel).
#
# Each pixel is read as three interleaved bytes (the kernel hard-codes a
# stride of 3), converted to HSV, has `hue_shift` degrees added to its hue,
# and is converted back. Pixels with (near-)zero chroma keep s = 0 and are
# therefore unchanged apart from rounding. Any finite shift is accepted,
# including values below -360 degrees.
HUE_SHIFT_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void hue_shift(
    const unsigned char* src,
    unsigned char* dst,
    int width, int height,
    float hue_shift
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) return;

    int idx = (y * width + x) * 3;

    float r = src[idx] / 255.0f;
    float g = src[idx + 1] / 255.0f;
    float b = src[idx + 2] / 255.0f;

    // RGB to HSV
    float max_c = fmaxf(r, fmaxf(g, b));
    float min_c = fminf(r, fminf(g, b));
    float delta = max_c - min_c;

    float h = 0, s = 0, v = max_c;

    if (delta > 0.00001f) {
        s = delta / max_c;
        if (r >= max_c) h = (g - b) / delta;
        else if (g >= max_c) h = 2.0f + (b - r) / delta;
        else h = 4.0f + (r - g) / delta;
        h *= 60.0f;
        if (h < 0) h += 360.0f;
    }

    // Apply hue shift. fmodf keeps the sign of its first argument, so a
    // negative result is wrapped explicitly; adding a single 360 before the
    // fmodf would still go negative for shifts below -360 degrees.
    h = fmodf(h + hue_shift, 360.0f);
    if (h < 0.0f) h += 360.0f;

    // HSV to RGB
    float c = v * s;
    float x_val = c * (1 - fabsf(fmodf(h / 60.0f, 2.0f) - 1));
    float m = v - c;

    float r2, g2, b2;
    if (h < 60) { r2 = c; g2 = x_val; b2 = 0; }
    else if (h < 120) { r2 = x_val; g2 = c; b2 = 0; }
    else if (h < 180) { r2 = 0; g2 = c; b2 = x_val; }
    else if (h < 240) { r2 = 0; g2 = x_val; b2 = c; }
    else if (h < 300) { r2 = x_val; g2 = 0; b2 = c; }
    else { r2 = c; g2 = 0; b2 = x_val; }

    // Round to nearest instead of truncating, so tiny float errors in the
    // RGB -> HSV -> RGB round trip do not darken a channel by one, and clamp
    // to the byte range before converting.
    dst[idx] = (unsigned char)fminf(fmaxf((r2 + m) * 255.0f + 0.5f, 0.0f), 255.0f);
    dst[idx + 1] = (unsigned char)fminf(fmaxf((g2 + m) * 255.0f + 0.5f, 0.0f), 255.0f);
    dst[idx + 2] = (unsigned char)fminf(fmaxf((b2 + m) * 255.0f + 0.5f, 0.0f), 255.0f);
}
''', 'hue_shift')


# Per-byte inversion (1-D launch, one thread per byte): dst[i] = 255 - src[i].
# Threads with idx >= size return without touching memory.
INVERT_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void invert(
    const unsigned char* src,
    unsigned char* dst,
    int size
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size) return;
    dst[idx] = 255 - src[idx];
}
''', 'invert')


# Zoom about a centre point with bilinear sampling.
#
# One thread per destination pixel (2-D launch). A destination pixel (x, y)
# samples the source at
#     src_x = (x - cx) / zoom_factor + cx
#     src_y = (y - cy) / zoom_factor + cy
# so factors above 1 magnify and factors below 1 shrink the image around
# (cx, cy). Destination pixels whose source position is outside the image
# (or is NaN/inf, e.g. for zoom_factor == 0) are written as 0.
ZOOM_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void zoom(
    const unsigned char* src,
    unsigned char* dst,
    int width, int height, int channels,
    float zoom_factor,
    float cx, float cy
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) return;

    // Map to source coordinates (zoom from center)
    float src_x = (x - cx) / zoom_factor + cx;
    float src_y = (y - cy) / zoom_factor + cy;

    int dst_idx = (y * width + x) * channels;

    // Negated "inside" test: rejects NaN coordinates too, and is inclusive
    // of the last row/column so that zoom_factor == 1 reproduces the whole
    // image instead of blanking its right and bottom edges.
    bool inside = src_x >= 0.0f && src_x <= (float)(width - 1) &&
                  src_y >= 0.0f && src_y <= (float)(height - 1);
    if (!inside) {
        for (int c = 0; c < channels; c++) {
            dst[dst_idx + c] = 0;
        }
        return;
    }

    // Bilinear interpolation
    int x0 = (int)src_x;
    int y0 = (int)src_y;
    // Far neighbours are clamped; they only leave the image when the sample
    // lies exactly on the last row/column, where their weight is zero.
    int x1 = (x0 + 1 < width) ? x0 + 1 : x0;
    int y1 = (y0 + 1 < height) ? y0 + 1 : y0;
    float fx = src_x - x0;
    float fy = src_y - y0;

    for (int c = 0; c < channels; c++) {
        float v00 = src[(y0 * width + x0) * channels + c];
        float v10 = src[(y0 * width + x1) * channels + c];
        float v01 = src[(y1 * width + x0) * channels + c];
        float v11 = src[(y1 * width + x1) * channels + c];

        float v = v00*(1-fx)*(1-fy) + v10*fx*(1-fy) + v01*(1-fx)*fy + v11*fx*fy;
        dst[dst_idx + c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}
''', 'zoom')


# Radial ripple distortion with bilinear sampling.
#
# One thread per destination pixel (2-D launch). For a pixel at distance
# `dist` from (cx, cy):
#     wave = sin(dist * frequency * 0.1 + phase)
#     amp  = amplitude * exp(-dist * decay * 0.01)
# and the source position is the pixel displaced by wave * amp along the
# direction pointing away from the centre. The 0.001 added to `dist` in the
# division avoids dividing by zero at the centre pixel.
# When the displaced position falls outside [0, width - 1) x [0, height - 1)
# the destination pixel is copied unchanged from the source at the same
# location (rather than being blanked).
RIPPLE_KERNEL = cp.RawKernel(r'''
extern "C" __global__
void ripple(
    const unsigned char* src,
    unsigned char* dst,
    int width, int height, int channels,
    float cx, float cy,
    float amplitude, float frequency, float decay, float phase
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) return;

    float dx = x - cx;
    float dy = y - cy;
    float dist = sqrtf(dx * dx + dy * dy);

    // Ripple displacement
    float wave = sinf(dist * frequency * 0.1f + phase);
    float amp = amplitude * expf(-dist * decay * 0.01f);

    float src_x = x + dx / (dist + 0.001f) * wave * amp;
    float src_y = y + dy / (dist + 0.001f) * wave * amp;

    int dst_idx = (y * width + x) * channels;

    if (src_x < 0 || src_x >= width - 1 || src_y < 0 || src_y >= height - 1) {
        for (int c = 0; c < channels; c++) {
            dst[dst_idx + c] = src[dst_idx + c];  // Keep original on boundary
        }
        return;
    }

    // Bilinear interpolation
    int x0 = (int)src_x;
    int y0 = (int)src_y;
    float fx = src_x - x0;
    float fy = src_y - y0;

    for (int c = 0; c < channels; c++) {
        float v00 = src[(y0 * width + x0) * channels + c];
        float v10 = src[(y0 * width + (x0+1)) * channels + c];
        float v01 = src[((y0+1) * width + x0) * channels + c];
        float v11 = src[((y0+1) * width + (x0+1)) * channels + c];

        float v = v00*(1-fx)*(1-fy) + v10*fx*(1-fy) + v01*(1-fx)*fy + v11*fx*fy;
        dst[dst_idx + c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}
''', 'ripple')


# =============================================================================
# Fast GPU Operations
# =============================================================================

class FastGPUOps:
    """Chainable GPU image operations for one fixed frame resolution.

    Two pre-allocated ``(height, width, 3)`` uint8 buffers are used in a
    ping-pong arrangement: every operation reads the current buffer, writes
    its result into the other one, and then makes that one current. Load a
    frame with :meth:`set_input`, call any number of operations, then read
    the result with :meth:`get_output`.

    Integer kernel arguments are passed as ``np.int32`` so that their size
    matches the ``int`` parameters declared by the CUDA kernels (a plain
    Python ``int`` is handed to a raw kernel as a 64-bit value).
    """

    def __init__(self, width: int, height: int):
        """Allocate work buffers and launch geometry for ``width`` x ``height`` frames."""
        self.width = width
        self.height = height
        self.channels = 3

        # Pre-allocate work buffers
        self._buf1 = cp.zeros((height, width, 3), dtype=cp.uint8)
        self._buf2 = cp.zeros((height, width, 3), dtype=cp.uint8)
        self._current_buf = 0  # 0 -> _buf1 is current, 1 -> _buf2 is current

        # Grid/block config: 16x16 threads per block for per-pixel kernels,
        # 256 threads per block for per-byte kernels; grids are rounded up
        # so that every pixel/byte is covered.
        self._block_2d = (16, 16)
        self._grid_2d = ((width + 15) // 16, (height + 15) // 16)
        self._block_1d = 256
        self._grid_1d = (width * height * 3 + 255) // 256

    def _get_buffers(self):
        """Get source and destination buffers (ping-pong).

        Returns:
            ``(src, dst)`` where ``src`` is the current buffer and ``dst``
            is the spare one the next operation writes into.
        """
        if self._current_buf == 0:
            return self._buf1, self._buf2
        return self._buf2, self._buf1

    def _swap_buffers(self):
        """Swap ping-pong buffers so the last written buffer becomes current."""
        self._current_buf = 1 - self._current_buf

    def _element_count(self) -> int:
        """Return the number of bytes in one frame buffer."""
        return self.width * self.height * self.channels

    def _launch_2d(self, kernel, *scalars) -> None:
        """Run a per-pixel kernel from the current buffer into the spare one.

        The kernel receives ``(src, dst, width, height, *scalars)``; the
        buffers are swapped afterwards.
        """
        src, dst = self._get_buffers()
        kernel(
            self._grid_2d, self._block_2d,
            (src, dst, np.int32(self.width), np.int32(self.height), *scalars),
        )
        self._swap_buffers()

    def _launch_1d(self, kernel, *scalars, second_input=None) -> None:
        """Run a per-byte kernel from the current buffer into the spare one.

        The kernel receives ``(src, [second_input,] dst, size, *scalars)``
        with all arrays flattened; the buffers are swapped afterwards.
        """
        src, dst = self._get_buffers()
        sources = [src.ravel()]
        if second_input is not None:
            sources.append(second_input.ravel())
        kernel(
            (self._grid_1d,), (self._block_1d,),
            (*sources, dst.ravel(), np.int32(self._element_count()), *scalars),
        )
        self._swap_buffers()

    def set_input(self, frame: "cp.ndarray"):
        """Copy ``frame`` into the current buffer.

        The copy is done with ``cp.copyto``, so that function's shape and
        dtype casting rules decide whether ``frame`` is accepted.
        """
        cp.copyto(self.get_output(), frame)

    def get_output(self) -> "cp.ndarray":
        """Get current output buffer.

        This is the internal buffer itself, not a copy; later operations on
        this instance will overwrite it.
        """
        if self._current_buf == 0:
            return self._buf1
        return self._buf2

    def rotate(self, angle: float, cx: Optional[float] = None,
               cy: Optional[float] = None):
        """Fast GPU rotation.

        Args:
            angle: Rotation angle in degrees.
            cx: X coordinate of the rotation centre; defaults to ``width / 2``.
            cy: Y coordinate of the rotation centre; defaults to ``height / 2``.
        """
        if cx is None:
            cx = self.width / 2
        if cy is None:
            cy = self.height / 2

        # Compute inverse rotation matrix
        import math
        rad = math.radians(-angle)  # Negative for inverse
        cos_a = math.cos(rad)
        sin_a = math.sin(rad)

        # Inverse affine matrix (rotate around center): R * (p - c) + c
        m00 = cos_a
        m01 = -sin_a
        m02 = cx - cos_a * cx + sin_a * cy
        m10 = sin_a
        m11 = cos_a
        m12 = cy - sin_a * cx - cos_a * cy

        self._launch_2d(
            AFFINE_WARP_KERNEL,
            np.int32(self.channels),
            np.float32(m00), np.float32(m01), np.float32(m02),
            np.float32(m10), np.float32(m11), np.float32(m12),
        )

    def zoom(self, factor: float, cx: Optional[float] = None,
             cy: Optional[float] = None):
        """Fast GPU zoom.

        Args:
            factor: Zoom factor; source coordinates are divided by it, so
                values above 1 magnify.
            cx: X coordinate of the zoom centre; defaults to ``width / 2``.
            cy: Y coordinate of the zoom centre; defaults to ``height / 2``.
        """
        if cx is None:
            cx = self.width / 2
        if cy is None:
            cy = self.height / 2

        self._launch_2d(
            ZOOM_KERNEL,
            np.int32(self.channels),
            np.float32(factor), np.float32(cx), np.float32(cy),
        )

    def blend(self, other: "cp.ndarray", alpha: float):
        """Fast GPU blend of the current buffer with ``other``.

        Args:
            other: uint8 array with exactly as many elements as one frame
                buffer (``width * height * 3``).
            alpha: Weight of ``other``; the current buffer is weighted by
                ``1 - alpha``.

        Raises:
            ValueError: If ``other`` has the wrong element count or dtype.
                The kernel indexes raw bytes, so a mismatch would otherwise
                read out of bounds or reinterpret foreign data.
        """
        expected = self._element_count()
        if other.size != expected or other.dtype != np.uint8:
            raise ValueError(
                f"blend() needs a uint8 array with {expected} elements, "
                f"got shape {other.shape} and dtype {other.dtype}"
            )

        self._launch_1d(BLEND_KERNEL, np.float32(alpha), second_input=other)

    def brightness(self, factor: float):
        """Fast GPU brightness adjustment.

        Adds ``(factor - 1) * 128`` to every byte (clamped by the kernel),
        so a factor of 1.0 leaves the image unchanged.
        """
        self._launch_1d(
            BRIGHTNESS_CONTRAST_KERNEL,
            np.float32((factor - 1) * 128), np.float32(1.0),
        )

    def contrast(self, factor: float):
        """Fast GPU contrast adjustment.

        Scales every byte's distance from 128 by ``factor`` (clamped by the
        kernel), so a factor of 1.0 leaves the image unchanged.
        """
        self._launch_1d(
            BRIGHTNESS_CONTRAST_KERNEL,
            np.float32(0), np.float32(factor),
        )

    def hue_shift(self, degrees: float):
        """Fast GPU hue shift by ``degrees``."""
        self._launch_2d(HUE_SHIFT_KERNEL, np.float32(degrees))

    def invert(self):
        """Fast GPU invert (every byte becomes ``255 - value``)."""
        self._launch_1d(INVERT_KERNEL)

    def ripple(self, amplitude: float, cx: Optional[float] = None,
               cy: Optional[float] = None,
               frequency: float = 8, decay: float = 2, phase: float = 0):
        """Fast GPU ripple effect.

        Args:
            amplitude: Displacement scale passed to the kernel.
            cx: X coordinate of the ripple centre; defaults to ``width / 2``.
            cy: Y coordinate of the ripple centre; defaults to ``height / 2``.
            frequency: Wave frequency parameter passed to the kernel.
            decay: Distance falloff parameter passed to the kernel.
            phase: Phase offset added inside the kernel's sine term.
        """
        if cx is None:
            cx = self.width / 2
        if cy is None:
            cy = self.height / 2

        self._launch_2d(
            RIPPLE_KERNEL,
            np.int32(self.channels),
            np.float32(cx), np.float32(cy),
            np.float32(amplitude), np.float32(frequency),
            np.float32(decay), np.float32(phase),
        )


# Global fast ops instances, one per (width, height) resolution. Entries are
# created lazily by get_fast_ops() and are never removed by the visible code.
_FAST_OPS: Dict[Tuple[int, int], FastGPUOps] = {}


def get_fast_ops(width: int, height: int) -> FastGPUOps:
    """Return the shared FastGPUOps for a resolution, creating it on first use.

    Instances are cached in ``_FAST_OPS`` under the ``(width, height)`` key,
    so repeated calls with the same resolution return the same object.
    """
    resolution = (width, height)
    cached = _FAST_OPS.get(resolution)
    if cached is None:
        cached = FastGPUOps(width, height)
        _FAST_OPS[resolution] = cached
    return cached


# =============================================================================
# Fast effect functions (drop-in replacements)
# =============================================================================

def fast_rotate(frame: cp.ndarray, angle: float, **kwargs) -> cp.ndarray:
    """Rotate ``frame`` by ``angle`` on the GPU and return a new array.

    Optional ``cx`` / ``cy`` keyword arguments select the rotation centre;
    when absent, ``None`` is forwarded and the ops object picks its default.
    """
    height, width = frame.shape[:2]
    gpu = get_fast_ops(width, height)
    gpu.set_input(frame)
    gpu.rotate(angle, cx=kwargs.get('cx'), cy=kwargs.get('cy'))
    # Copy, because the ops object and its buffers are shared per resolution.
    result = gpu.get_output()
    return result.copy()


def fast_zoom(frame: cp.ndarray, factor: float, **kwargs) -> cp.ndarray:
    """Zoom ``frame`` by ``factor`` on the GPU and return a new array.

    Optional ``cx`` / ``cy`` keyword arguments select the zoom centre;
    when absent, ``None`` is forwarded and the ops object picks its default.
    """
    height, width = frame.shape[:2]
    gpu = get_fast_ops(width, height)
    gpu.set_input(frame)
    gpu.zoom(factor, cx=kwargs.get('cx'), cy=kwargs.get('cy'))
    # Copy, because the ops object and its buffers are shared per resolution.
    result = gpu.get_output()
    return result.copy()


def fast_blend(frame1: cp.ndarray, frame2: cp.ndarray, alpha: float) -> cp.ndarray:
    """Blend ``frame2`` over ``frame1`` with weight ``alpha`` on the GPU.

    The ops object is chosen from the resolution of ``frame1``; the result
    is returned as a new array.
    """
    height, width = frame1.shape[:2]
    gpu = get_fast_ops(width, height)
    gpu.set_input(frame1)
    gpu.blend(frame2, alpha)
    # Copy, because the ops object and its buffers are shared per resolution.
    result = gpu.get_output()
    return result.copy()


def fast_hue_shift(frame: cp.ndarray, degrees: float) -> cp.ndarray:
    """Shift the hue of ``frame`` by ``degrees`` on the GPU; returns a new array."""
    height, width = frame.shape[:2]
    gpu = get_fast_ops(width, height)
    gpu.set_input(frame)
    gpu.hue_shift(degrees)
    # Copy, because the ops object and its buffers are shared per resolution.
    result = gpu.get_output()
    return result.copy()


def fast_invert(frame: cp.ndarray) -> cp.ndarray:
    """Invert ``frame`` on the GPU and return the result as a new array."""
    height, width = frame.shape[:2]
    gpu = get_fast_ops(width, height)
    gpu.set_input(frame)
    gpu.invert()
    # Copy, because the ops object and its buffers are shared per resolution.
    result = gpu.get_output()
    return result.copy()


def fast_ripple(frame: cp.ndarray, amplitude: float, **kwargs) -> cp.ndarray:
    """Apply a ripple distortion to ``frame`` on the GPU; returns a new array.

    Recognised keyword arguments (all optional):
        center_x, center_y: ripple centre, defaulting to the frame centre.
        frequency: defaults to 8.
        decay: defaults to 2.
        speed, t: multiplied together to form the wave phase; each
            defaults to 0.
    """
    height, width = frame.shape[:2]
    gpu = get_fast_ops(width, height)
    gpu.set_input(frame)

    center_x = kwargs.get('center_x', width / 2)
    center_y = kwargs.get('center_y', height / 2)
    frequency = kwargs.get('frequency', 8)
    decay = kwargs.get('decay', 2)
    # phase from speed*time
    phase = kwargs.get('speed', 0) * kwargs.get('t', 0)

    gpu.ripple(
        amplitude,
        cx=center_x,
        cy=center_y,
        frequency=frequency,
        decay=decay,
        phase=phase,
    )
    # Copy, because the ops object and its buffers are shared per resolution.
    result = gpu.get_output()
    return result.copy()


# Import-time notice on stderr. Reaching this line means every module-level
# cp.RawKernel(...) object above was constructed without raising.
# NOTE(review): CuPy documents RawKernel compilation as happening on first
# launch, so this presumably does not prove the kernels compile — confirm.
print("[jit_compiler] CUDA kernels loaded", file=sys.stderr)