666 lines
20 KiB
Python
666 lines
20 KiB
Python
"""
|
|
Sexp to CUDA Kernel Compiler.
|
|
|
|
Compiles sexp frame pipelines to fused CUDA kernels for maximum performance.
|
|
Instead of interpreting sexp and launching 10+ kernels per frame,
|
|
generates a single kernel that does everything in one pass.
|
|
"""
|
|
|
|
import cupy as cp
|
|
import numpy as np
|
|
from typing import Dict, List, Any, Optional, Tuple
|
|
import hashlib
|
|
import sys
|
|
|
|
# Kernel cache
|
|
_COMPILED_KERNELS: Dict[str, Any] = {}
|
|
|
|
|
|
def compile_frame_pipeline(effects: List[dict], width: int, height: int) -> callable:
|
|
"""
|
|
Compile a list of effects to a fused CUDA kernel.
|
|
|
|
Args:
|
|
effects: List of effect dicts like:
|
|
[{'op': 'rotate', 'angle': 45.0},
|
|
{'op': 'blend', 'alpha': 0.5, 'src2': <gpu_array>},
|
|
{'op': 'hue_shift', 'degrees': 90.0},
|
|
{'op': 'ripple', 'amplitude': 10.0, 'frequency': 8.0, ...}]
|
|
width, height: Frame dimensions
|
|
|
|
Returns:
|
|
Callable that takes input frame and returns output frame
|
|
"""
|
|
|
|
# Generate cache key
|
|
ops_key = str([(e['op'], {k:v for k,v in e.items() if k != 'src2'}) for e in effects])
|
|
cache_key = f"{width}x{height}_{hashlib.md5(ops_key.encode()).hexdigest()}"
|
|
|
|
if cache_key in _COMPILED_KERNELS:
|
|
return _COMPILED_KERNELS[cache_key]
|
|
|
|
# Generate fused kernel code
|
|
kernel_code = _generate_fused_kernel(effects, width, height)
|
|
|
|
# Compile kernel
|
|
kernel = cp.RawKernel(kernel_code, 'fused_pipeline')
|
|
|
|
# Create wrapper function
|
|
def run_pipeline(frame: cp.ndarray, **dynamic_params) -> cp.ndarray:
|
|
"""Run the compiled pipeline on a frame."""
|
|
if frame.dtype != cp.uint8:
|
|
frame = cp.clip(frame, 0, 255).astype(cp.uint8)
|
|
if not frame.flags['C_CONTIGUOUS']:
|
|
frame = cp.ascontiguousarray(frame)
|
|
|
|
output = cp.zeros_like(frame)
|
|
|
|
block = (16, 16)
|
|
grid = ((width + 15) // 16, (height + 15) // 16)
|
|
|
|
# Build parameter array
|
|
params = _build_params(effects, dynamic_params)
|
|
|
|
kernel(grid, block, (frame, output, width, height, params))
|
|
|
|
return output
|
|
|
|
_COMPILED_KERNELS[cache_key] = run_pipeline
|
|
return run_pipeline
|
|
|
|
|
|
def _generate_fused_kernel(effects: List[dict], width: int, height: int) -> str:
|
|
"""Generate CUDA kernel code for fused effects pipeline."""
|
|
|
|
# Build the kernel
|
|
code = r'''
|
|
extern "C" __global__
|
|
void fused_pipeline(
|
|
const unsigned char* src,
|
|
unsigned char* dst,
|
|
int width, int height,
|
|
const float* params
|
|
) {
|
|
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
|
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
|
|
if (x >= width || y >= height) return;
|
|
|
|
// Start with source coordinates
|
|
float src_x = (float)x;
|
|
float src_y = (float)y;
|
|
float cx = width / 2.0f;
|
|
float cy = height / 2.0f;
|
|
|
|
// Track accumulated transforms
|
|
float total_cos = 1.0f, total_sin = 0.0f; // rotation
|
|
float total_zoom = 1.0f; // zoom
|
|
float ripple_dx = 0.0f, ripple_dy = 0.0f; // ripple displacement
|
|
|
|
int param_idx = 0;
|
|
|
|
'''
|
|
|
|
# Add effect-specific code
|
|
for i, effect in enumerate(effects):
|
|
op = effect['op']
|
|
|
|
if op == 'rotate':
|
|
code += f'''
|
|
// Rotate {i}
|
|
{{
|
|
float angle = params[param_idx++] * 3.14159265f / 180.0f;
|
|
float c = cosf(angle);
|
|
float s = sinf(angle);
|
|
// Compose with existing rotation
|
|
float nc = total_cos * c - total_sin * s;
|
|
float ns = total_cos * s + total_sin * c;
|
|
total_cos = nc;
|
|
total_sin = ns;
|
|
}}
|
|
'''
|
|
elif op == 'zoom':
|
|
code += f'''
|
|
// Zoom {i}
|
|
{{
|
|
float zoom = params[param_idx++];
|
|
total_zoom *= zoom;
|
|
}}
|
|
'''
|
|
elif op == 'ripple':
|
|
code += f'''
|
|
// Ripple {i}
|
|
{{
|
|
float amplitude = params[param_idx++];
|
|
float frequency = params[param_idx++];
|
|
float decay = params[param_idx++];
|
|
float phase = params[param_idx++];
|
|
float rcx = params[param_idx++];
|
|
float rcy = params[param_idx++];
|
|
|
|
float rdx = src_x - rcx;
|
|
float rdy = src_y - rcy;
|
|
float dist = sqrtf(rdx * rdx + rdy * rdy);
|
|
|
|
float wave = sinf(dist * frequency * 0.1f + phase);
|
|
float amp = amplitude * expf(-dist * decay * 0.01f);
|
|
|
|
if (dist > 0.001f) {{
|
|
ripple_dx += rdx / dist * wave * amp;
|
|
ripple_dy += rdy / dist * wave * amp;
|
|
}}
|
|
}}
|
|
'''
|
|
|
|
# Apply all geometric transforms at once
|
|
code += '''
|
|
// Apply accumulated geometric transforms
|
|
{
|
|
// Translate to center
|
|
float dx = src_x - cx;
|
|
float dy = src_y - cy;
|
|
|
|
// Apply rotation
|
|
float rx = total_cos * dx + total_sin * dy;
|
|
float ry = -total_sin * dx + total_cos * dy;
|
|
|
|
// Apply zoom (inverse for sampling)
|
|
rx /= total_zoom;
|
|
ry /= total_zoom;
|
|
|
|
// Translate back and apply ripple
|
|
src_x = rx + cx - ripple_dx;
|
|
src_y = ry + cy - ripple_dy;
|
|
}
|
|
|
|
// Sample source with bilinear interpolation
|
|
float r, g, b;
|
|
if (src_x < 0 || src_x >= width - 1 || src_y < 0 || src_y >= height - 1) {
|
|
r = g = b = 0;
|
|
} else {
|
|
int x0 = (int)src_x;
|
|
int y0 = (int)src_y;
|
|
float fx = src_x - x0;
|
|
float fy = src_y - y0;
|
|
|
|
int idx00 = (y0 * width + x0) * 3;
|
|
int idx10 = (y0 * width + x0 + 1) * 3;
|
|
int idx01 = ((y0 + 1) * width + x0) * 3;
|
|
int idx11 = ((y0 + 1) * width + x0 + 1) * 3;
|
|
|
|
#define BILERP(c) \\
|
|
(src[idx00 + c] * (1-fx) * (1-fy) + \\
|
|
src[idx10 + c] * fx * (1-fy) + \\
|
|
src[idx01 + c] * (1-fx) * fy + \\
|
|
src[idx11 + c] * fx * fy)
|
|
|
|
r = BILERP(0);
|
|
g = BILERP(1);
|
|
b = BILERP(2);
|
|
}
|
|
|
|
'''
|
|
|
|
# Add color transforms
|
|
for i, effect in enumerate(effects):
|
|
op = effect['op']
|
|
|
|
if op == 'invert':
|
|
code += f'''
|
|
// Invert {i}
|
|
{{
|
|
float amount = params[param_idx++];
|
|
if (amount > 0.5f) {{
|
|
r = 255.0f - r;
|
|
g = 255.0f - g;
|
|
b = 255.0f - b;
|
|
}}
|
|
}}
|
|
'''
|
|
elif op == 'hue_shift':
|
|
code += f'''
|
|
// Hue shift {i}
|
|
{{
|
|
float shift = params[param_idx++];
|
|
if (fabsf(shift) > 0.01f) {{
|
|
// RGB to HSV
|
|
float rf = r / 255.0f;
|
|
float gf = g / 255.0f;
|
|
float bf = b / 255.0f;
|
|
|
|
float max_c = fmaxf(rf, fmaxf(gf, bf));
|
|
float min_c = fminf(rf, fminf(gf, bf));
|
|
float delta = max_c - min_c;
|
|
|
|
float h = 0, s = 0, v = max_c;
|
|
|
|
if (delta > 0.00001f) {{
|
|
s = delta / max_c;
|
|
if (rf >= max_c) h = (gf - bf) / delta;
|
|
else if (gf >= max_c) h = 2.0f + (bf - rf) / delta;
|
|
else h = 4.0f + (rf - gf) / delta;
|
|
h *= 60.0f;
|
|
if (h < 0) h += 360.0f;
|
|
}}
|
|
|
|
h = fmodf(h + shift + 360.0f, 360.0f);
|
|
|
|
// HSV to RGB
|
|
float c = v * s;
|
|
float x_val = c * (1 - fabsf(fmodf(h / 60.0f, 2.0f) - 1));
|
|
float m = v - c;
|
|
|
|
float r2, g2, b2;
|
|
if (h < 60) {{ r2 = c; g2 = x_val; b2 = 0; }}
|
|
else if (h < 120) {{ r2 = x_val; g2 = c; b2 = 0; }}
|
|
else if (h < 180) {{ r2 = 0; g2 = c; b2 = x_val; }}
|
|
else if (h < 240) {{ r2 = 0; g2 = x_val; b2 = c; }}
|
|
else if (h < 300) {{ r2 = x_val; g2 = 0; b2 = c; }}
|
|
else {{ r2 = c; g2 = 0; b2 = x_val; }}
|
|
|
|
r = (r2 + m) * 255.0f;
|
|
g = (g2 + m) * 255.0f;
|
|
b = (b2 + m) * 255.0f;
|
|
}}
|
|
}}
|
|
'''
|
|
elif op == 'brightness':
|
|
code += f'''
|
|
// Brightness {i}
|
|
{{
|
|
float factor = params[param_idx++];
|
|
r *= factor;
|
|
g *= factor;
|
|
b *= factor;
|
|
}}
|
|
'''
|
|
|
|
# Write output
|
|
code += '''
|
|
// Write output
|
|
int dst_idx = (y * width + x) * 3;
|
|
dst[dst_idx] = (unsigned char)fminf(255.0f, fmaxf(0.0f, r));
|
|
dst[dst_idx + 1] = (unsigned char)fminf(255.0f, fmaxf(0.0f, g));
|
|
dst[dst_idx + 2] = (unsigned char)fminf(255.0f, fmaxf(0.0f, b));
|
|
}
|
|
'''
|
|
|
|
return code
|
|
|
|
|
|
def _build_params(effects: List[dict], dynamic_params: dict) -> cp.ndarray:
|
|
"""Build parameter array for kernel."""
|
|
params = []
|
|
|
|
for effect in effects:
|
|
op = effect['op']
|
|
|
|
if op == 'rotate':
|
|
params.append(float(dynamic_params.get('rotate_angle', effect.get('angle', 0))))
|
|
elif op == 'zoom':
|
|
params.append(float(dynamic_params.get('zoom_amount', effect.get('amount', 1.0))))
|
|
elif op == 'ripple':
|
|
params.append(float(effect.get('amplitude', 10)))
|
|
params.append(float(effect.get('frequency', 8)))
|
|
params.append(float(effect.get('decay', 2)))
|
|
params.append(float(dynamic_params.get('ripple_phase', effect.get('phase', 0))))
|
|
params.append(float(effect.get('center_x', 960)))
|
|
params.append(float(effect.get('center_y', 540)))
|
|
elif op == 'invert':
|
|
params.append(float(effect.get('amount', 0)))
|
|
elif op == 'hue_shift':
|
|
params.append(float(effect.get('degrees', 0)))
|
|
elif op == 'brightness':
|
|
params.append(float(effect.get('factor', 1.0)))
|
|
|
|
return cp.array(params, dtype=cp.float32)
|
|
|
|
|
|
def compile_autonomous_pipeline(effects: List[dict], width: int, height: int,
|
|
dynamic_expressions: dict = None) -> callable:
|
|
"""
|
|
Compile a fully autonomous pipeline that computes ALL parameters on GPU.
|
|
|
|
This eliminates Python from the hot path - the kernel computes time-based
|
|
parameters (sin, cos, etc.) directly on GPU.
|
|
|
|
Args:
|
|
effects: List of effect dicts
|
|
width, height: Frame dimensions
|
|
dynamic_expressions: Dict mapping param names to expressions, e.g.:
|
|
{'rotate_angle': 't * 30',
|
|
'ripple_phase': 't * 2',
|
|
'brightness_factor': '0.8 + 0.4 * sin(t * 2)'}
|
|
|
|
Returns:
|
|
Callable that takes (frame, frame_num, fps) and returns output frame
|
|
"""
|
|
if dynamic_expressions is None:
|
|
dynamic_expressions = {}
|
|
|
|
# Generate cache key
|
|
ops_key = str([(e['op'], {k:v for k,v in e.items() if k != 'src2'}) for e in effects])
|
|
expr_key = str(sorted(dynamic_expressions.items()))
|
|
cache_key = f"auto_{width}x{height}_{hashlib.md5((ops_key + expr_key).encode()).hexdigest()}"
|
|
|
|
if cache_key in _COMPILED_KERNELS:
|
|
return _COMPILED_KERNELS[cache_key]
|
|
|
|
# Generate autonomous kernel code
|
|
kernel_code = _generate_autonomous_kernel(effects, width, height, dynamic_expressions)
|
|
|
|
# Compile kernel
|
|
kernel = cp.RawKernel(kernel_code, 'autonomous_pipeline')
|
|
|
|
# Create wrapper function
|
|
def run_autonomous(frame: cp.ndarray, frame_num: int, fps: float = 30.0) -> cp.ndarray:
|
|
"""Run the autonomous pipeline - no Python in the hot path!"""
|
|
if frame.dtype != cp.uint8:
|
|
frame = cp.clip(frame, 0, 255).astype(cp.uint8)
|
|
if not frame.flags['C_CONTIGUOUS']:
|
|
frame = cp.ascontiguousarray(frame)
|
|
|
|
output = cp.zeros_like(frame)
|
|
|
|
block = (16, 16)
|
|
grid = ((width + 15) // 16, (height + 15) // 16)
|
|
|
|
# Only pass frame_num and fps - kernel computes everything else!
|
|
t = float(frame_num) / float(fps)
|
|
kernel(grid, block, (frame, output, np.int32(width), np.int32(height),
|
|
np.float32(t), np.int32(frame_num)))
|
|
|
|
return output
|
|
|
|
_COMPILED_KERNELS[cache_key] = run_autonomous
|
|
return run_autonomous
|
|
|
|
|
|
def _generate_autonomous_kernel(effects: List[dict], width: int, height: int,
|
|
dynamic_expressions: dict) -> str:
|
|
"""Generate CUDA kernel that computes everything autonomously."""
|
|
|
|
# Map simple expressions to CUDA code
|
|
def expr_to_cuda(expr: str) -> str:
|
|
"""Convert simple expression to CUDA."""
|
|
expr = expr.replace('sin(', 'sinf(')
|
|
expr = expr.replace('cos(', 'cosf(')
|
|
expr = expr.replace('abs(', 'fabsf(')
|
|
return expr
|
|
|
|
code = r'''
|
|
extern "C" __global__
|
|
void autonomous_pipeline(
|
|
const unsigned char* src,
|
|
unsigned char* dst,
|
|
int width, int height,
|
|
float t, int frame_num
|
|
) {
|
|
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
|
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
|
|
if (x >= width || y >= height) return;
|
|
|
|
// Compute dynamic parameters from time (ALL ON GPU!)
|
|
'''
|
|
|
|
# Add dynamic parameter calculations
|
|
rotate_expr = dynamic_expressions.get('rotate_angle', '0.0f')
|
|
ripple_phase_expr = dynamic_expressions.get('ripple_phase', '0.0f')
|
|
brightness_expr = dynamic_expressions.get('brightness_factor', '1.0f')
|
|
zoom_expr = dynamic_expressions.get('zoom_amount', '1.0f')
|
|
|
|
code += f'''
|
|
float rotate_angle = {expr_to_cuda(rotate_expr)};
|
|
float ripple_phase = {expr_to_cuda(ripple_phase_expr)};
|
|
float brightness_factor = {expr_to_cuda(brightness_expr)};
|
|
float zoom_amount = {expr_to_cuda(zoom_expr)};
|
|
|
|
// Start with source coordinates
|
|
float src_x = (float)x;
|
|
float src_y = (float)y;
|
|
float cx = width / 2.0f;
|
|
float cy = height / 2.0f;
|
|
|
|
// Accumulated transforms
|
|
float total_cos = 1.0f, total_sin = 0.0f;
|
|
float total_zoom = 1.0f;
|
|
float ripple_dx = 0.0f, ripple_dy = 0.0f;
|
|
|
|
'''
|
|
|
|
# Add effect-specific code
|
|
for i, effect in enumerate(effects):
|
|
op = effect['op']
|
|
|
|
if op == 'rotate':
|
|
code += f'''
|
|
// Rotate {i}
|
|
{{
|
|
float angle = rotate_angle * 3.14159265f / 180.0f;
|
|
float c = cosf(angle);
|
|
float s = sinf(angle);
|
|
float nc = total_cos * c - total_sin * s;
|
|
float ns = total_cos * s + total_sin * c;
|
|
total_cos = nc;
|
|
total_sin = ns;
|
|
}}
|
|
'''
|
|
elif op == 'zoom':
|
|
code += f'''
|
|
// Zoom {i}
|
|
{{
|
|
total_zoom *= zoom_amount;
|
|
}}
|
|
'''
|
|
elif op == 'ripple':
|
|
amp = effect.get('amplitude', 10)
|
|
freq = effect.get('frequency', 8)
|
|
decay = effect.get('decay', 2)
|
|
rcx = effect.get('center_x', width/2)
|
|
rcy = effect.get('center_y', height/2)
|
|
code += f'''
|
|
// Ripple {i}
|
|
{{
|
|
float amplitude = {amp}f;
|
|
float frequency = {freq}f;
|
|
float decay_val = {decay}f;
|
|
float rcx = {rcx}f;
|
|
float rcy = {rcy}f;
|
|
|
|
float rdx = src_x - rcx;
|
|
float rdy = src_y - rcy;
|
|
float dist = sqrtf(rdx * rdx + rdy * rdy);
|
|
|
|
float wave = sinf(dist * frequency * 0.1f + ripple_phase);
|
|
float amp = amplitude * expf(-dist * decay_val * 0.01f);
|
|
|
|
if (dist > 0.001f) {{
|
|
ripple_dx += rdx / dist * wave * amp;
|
|
ripple_dy += rdy / dist * wave * amp;
|
|
}}
|
|
}}
|
|
'''
|
|
|
|
# Apply geometric transforms
|
|
code += '''
|
|
// Apply accumulated transforms
|
|
{
|
|
float dx = src_x - cx;
|
|
float dy = src_y - cy;
|
|
float rx = total_cos * dx + total_sin * dy;
|
|
float ry = -total_sin * dx + total_cos * dy;
|
|
rx /= total_zoom;
|
|
ry /= total_zoom;
|
|
src_x = rx + cx - ripple_dx;
|
|
src_y = ry + cy - ripple_dy;
|
|
}
|
|
|
|
// Bilinear sample
|
|
float r, g, b;
|
|
if (src_x < 0 || src_x >= width - 1 || src_y < 0 || src_y >= height - 1) {
|
|
r = g = b = 0;
|
|
} else {
|
|
int x0 = (int)src_x;
|
|
int y0 = (int)src_y;
|
|
float fx = src_x - x0;
|
|
float fy = src_y - y0;
|
|
|
|
int idx00 = (y0 * width + x0) * 3;
|
|
int idx10 = (y0 * width + x0 + 1) * 3;
|
|
int idx01 = ((y0 + 1) * width + x0) * 3;
|
|
int idx11 = ((y0 + 1) * width + x0 + 1) * 3;
|
|
|
|
#define BILERP(c) \\
|
|
(src[idx00 + c] * (1-fx) * (1-fy) + \\
|
|
src[idx10 + c] * fx * (1-fy) + \\
|
|
src[idx01 + c] * (1-fx) * fy + \\
|
|
src[idx11 + c] * fx * fy)
|
|
|
|
r = BILERP(0);
|
|
g = BILERP(1);
|
|
b = BILERP(2);
|
|
}
|
|
|
|
'''
|
|
|
|
# Add color transforms
|
|
for i, effect in enumerate(effects):
|
|
op = effect['op']
|
|
|
|
if op == 'hue_shift':
|
|
degrees = effect.get('degrees', 0)
|
|
code += f'''
|
|
// Hue shift {i}
|
|
{{
|
|
float shift = {degrees}f;
|
|
float rf = r / 255.0f;
|
|
float gf = g / 255.0f;
|
|
float bf = b / 255.0f;
|
|
|
|
float max_c = fmaxf(rf, fmaxf(gf, bf));
|
|
float min_c = fminf(rf, fminf(gf, bf));
|
|
float delta = max_c - min_c;
|
|
|
|
float h = 0, s = 0, v = max_c;
|
|
|
|
if (delta > 0.00001f) {{
|
|
s = delta / max_c;
|
|
if (rf >= max_c) h = (gf - bf) / delta;
|
|
else if (gf >= max_c) h = 2.0f + (bf - rf) / delta;
|
|
else h = 4.0f + (rf - gf) / delta;
|
|
h *= 60.0f;
|
|
if (h < 0) h += 360.0f;
|
|
}}
|
|
|
|
h = fmodf(h + shift + 360.0f, 360.0f);
|
|
|
|
float c = v * s;
|
|
float x_val = c * (1 - fabsf(fmodf(h / 60.0f, 2.0f) - 1));
|
|
float m = v - c;
|
|
|
|
float r2, g2, b2;
|
|
if (h < 60) {{ r2 = c; g2 = x_val; b2 = 0; }}
|
|
else if (h < 120) {{ r2 = x_val; g2 = c; b2 = 0; }}
|
|
else if (h < 180) {{ r2 = 0; g2 = c; b2 = x_val; }}
|
|
else if (h < 240) {{ r2 = 0; g2 = x_val; b2 = c; }}
|
|
else if (h < 300) {{ r2 = x_val; g2 = 0; b2 = c; }}
|
|
else {{ r2 = c; g2 = 0; b2 = x_val; }}
|
|
|
|
r = (r2 + m) * 255.0f;
|
|
g = (g2 + m) * 255.0f;
|
|
b = (b2 + m) * 255.0f;
|
|
}}
|
|
'''
|
|
elif op == 'brightness':
|
|
code += '''
|
|
// Brightness
|
|
{
|
|
r *= brightness_factor;
|
|
g *= brightness_factor;
|
|
b *= brightness_factor;
|
|
}
|
|
'''
|
|
|
|
# Write output
|
|
code += '''
|
|
// Write output
|
|
int dst_idx = (y * width + x) * 3;
|
|
dst[dst_idx] = (unsigned char)fminf(255.0f, fmaxf(0.0f, r));
|
|
dst[dst_idx + 1] = (unsigned char)fminf(255.0f, fmaxf(0.0f, g));
|
|
dst[dst_idx + 2] = (unsigned char)fminf(255.0f, fmaxf(0.0f, b));
|
|
}
|
|
'''
|
|
|
|
return code
|
|
|
|
|
|
# Test the compiler
|
|
if __name__ == '__main__':
|
|
import time
|
|
|
|
print("[sexp_to_cuda] Testing fused kernel compiler...")
|
|
print("=" * 60)
|
|
|
|
# Define a test pipeline
|
|
effects = [
|
|
{'op': 'rotate', 'angle': 45.0},
|
|
{'op': 'hue_shift', 'degrees': 30.0},
|
|
{'op': 'ripple', 'amplitude': 15, 'frequency': 10, 'decay': 2, 'phase': 0, 'center_x': 960, 'center_y': 540},
|
|
{'op': 'brightness', 'factor': 1.0},
|
|
]
|
|
|
|
frame = cp.random.randint(0, 255, (1080, 1920, 3), dtype=cp.uint8)
|
|
|
|
# ===== Test 1: Standard fused kernel (params passed from Python) =====
|
|
print("\n[Test 1] Standard fused kernel (Python computes params)")
|
|
pipeline = compile_frame_pipeline(effects, 1920, 1080)
|
|
|
|
# Warmup
|
|
output = pipeline(frame)
|
|
cp.cuda.Stream.null.synchronize()
|
|
|
|
# Benchmark with Python param computation
|
|
start = time.time()
|
|
for i in range(100):
|
|
# Simulate Python computing params (like sexp interpreter does)
|
|
import math
|
|
t = i / 30.0
|
|
angle = t * 30
|
|
phase = t * 2
|
|
brightness = 0.8 + 0.4 * math.sin(t * 2)
|
|
output = pipeline(frame, rotate_angle=angle, ripple_phase=phase)
|
|
cp.cuda.Stream.null.synchronize()
|
|
elapsed = time.time() - start
|
|
|
|
print(f" Time: {elapsed/100*1000:.2f}ms per frame")
|
|
print(f" FPS: {100/elapsed:.0f}")
|
|
|
|
# ===== Test 2: Autonomous kernel (GPU computes everything) =====
|
|
print("\n[Test 2] Autonomous kernel (GPU computes ALL params)")
|
|
|
|
dynamic_expressions = {
|
|
'rotate_angle': 't * 30.0f',
|
|
'ripple_phase': 't * 2.0f',
|
|
'brightness_factor': '0.8f + 0.4f * sinf(t * 2.0f)',
|
|
}
|
|
|
|
auto_pipeline = compile_autonomous_pipeline(effects, 1920, 1080, dynamic_expressions)
|
|
|
|
# Warmup
|
|
output = auto_pipeline(frame, 0, 30.0)
|
|
cp.cuda.Stream.null.synchronize()
|
|
|
|
# Benchmark - NO Python computation in loop!
|
|
start = time.time()
|
|
for i in range(100):
|
|
output = auto_pipeline(frame, i, 30.0) # Just pass frame_num!
|
|
cp.cuda.Stream.null.synchronize()
|
|
elapsed = time.time() - start
|
|
|
|
print(f" Time: {elapsed/100*1000:.2f}ms per frame")
|
|
print(f" FPS: {100/elapsed:.0f}")
|
|
|
|
print("\n" + "=" * 60)
|
|
print("Autonomous kernel eliminates Python from hot path!")
|