diff --git a/streaming/sexp_to_cuda.py b/streaming/sexp_to_cuda.py index cb23058..9212b7c 100644 --- a/streaming/sexp_to_cuda.py +++ b/streaming/sexp_to_cuda.py @@ -316,35 +316,350 @@ def _build_params(effects: List[dict], dynamic_params: dict) -> cp.ndarray: return cp.array(params, dtype=cp.float32) +def compile_autonomous_pipeline(effects: List[dict], width: int, height: int, + dynamic_expressions: dict = None) -> callable: + """ + Compile a fully autonomous pipeline that computes ALL parameters on GPU. + + This eliminates Python from the hot path - the kernel computes time-based + parameters (sin, cos, etc.) directly on GPU. + + Args: + effects: List of effect dicts + width, height: Frame dimensions + dynamic_expressions: Dict mapping param names to expressions, e.g.: + {'rotate_angle': 't * 30', + 'ripple_phase': 't * 2', + 'brightness_factor': '0.8 + 0.4 * sin(t * 2)'} + + Returns: + Callable that takes (frame, frame_num, fps) and returns output frame + """ + if dynamic_expressions is None: + dynamic_expressions = {} + + # Generate cache key + ops_key = str([(e['op'], {k:v for k,v in e.items() if k != 'src2'}) for e in effects]) + expr_key = str(sorted(dynamic_expressions.items())) + cache_key = f"auto_{width}x{height}_{hashlib.md5((ops_key + expr_key).encode()).hexdigest()}" + + if cache_key in _COMPILED_KERNELS: + return _COMPILED_KERNELS[cache_key] + + # Generate autonomous kernel code + kernel_code = _generate_autonomous_kernel(effects, width, height, dynamic_expressions) + + # Compile kernel + kernel = cp.RawKernel(kernel_code, 'autonomous_pipeline') + + # Create wrapper function + def run_autonomous(frame: cp.ndarray, frame_num: int, fps: float = 30.0) -> cp.ndarray: + """Run the autonomous pipeline - no Python in the hot path!""" + if frame.dtype != cp.uint8: + frame = cp.clip(frame, 0, 255).astype(cp.uint8) + if not frame.flags['C_CONTIGUOUS']: + frame = cp.ascontiguousarray(frame) + + output = cp.zeros_like(frame) + + block = (16, 16) + grid = ((width + 15) // 16, (height + 15) // 16) + + # Only pass frame_num and fps - kernel computes everything else! + t = float(frame_num) / float(fps) + kernel(grid, block, (frame, output, np.int32(width), np.int32(height), + np.float32(t), np.int32(frame_num))) + + return output + + _COMPILED_KERNELS[cache_key] = run_autonomous + return run_autonomous + + +def _generate_autonomous_kernel(effects: List[dict], width: int, height: int, + dynamic_expressions: dict) -> str: + """Generate CUDA kernel that computes everything autonomously.""" + + # Map simple expressions to CUDA code + def expr_to_cuda(expr: str) -> str: + """Convert simple expression to CUDA.""" + expr = expr.replace('sin(', 'sinf(') + expr = expr.replace('cos(', 'cosf(') + expr = expr.replace('abs(', 'fabsf(') + return expr + + code = r''' +extern "C" __global__ +void autonomous_pipeline( + const unsigned char* src, + unsigned char* dst, + int width, int height, + float t, int frame_num +) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width || y >= height) return; + + // Compute dynamic parameters from time (ALL ON GPU!) +''' + + # Add dynamic parameter calculations + rotate_expr = dynamic_expressions.get('rotate_angle', '0.0f') + ripple_phase_expr = dynamic_expressions.get('ripple_phase', '0.0f') + brightness_expr = dynamic_expressions.get('brightness_factor', '1.0f') + zoom_expr = dynamic_expressions.get('zoom_amount', '1.0f') + + code += f''' + float rotate_angle = {expr_to_cuda(rotate_expr)}; + float ripple_phase = {expr_to_cuda(ripple_phase_expr)}; + float brightness_factor = {expr_to_cuda(brightness_expr)}; + float zoom_amount = {expr_to_cuda(zoom_expr)}; + + // Start with source coordinates + float src_x = (float)x; + float src_y = (float)y; + float cx = width / 2.0f; + float cy = height / 2.0f; + + // Accumulated transforms + float total_cos = 1.0f, total_sin = 0.0f; + float total_zoom = 1.0f; + float ripple_dx = 0.0f, ripple_dy = 0.0f; + +''' + + # Add effect-specific code + for i, effect in enumerate(effects): + op = effect['op'] + + if op == 'rotate': + code += f''' + // Rotate {i} + {{ + float angle = rotate_angle * 3.14159265f / 180.0f; + float c = cosf(angle); + float s = sinf(angle); + float nc = total_cos * c - total_sin * s; + float ns = total_cos * s + total_sin * c; + total_cos = nc; + total_sin = ns; + }} +''' + elif op == 'zoom': + code += f''' + // Zoom {i} + {{ + total_zoom *= zoom_amount; + }} +''' + elif op == 'ripple': + amp = effect.get('amplitude', 10) + freq = effect.get('frequency', 8) + decay = effect.get('decay', 2) + rcx = effect.get('center_x', width/2) + rcy = effect.get('center_y', height/2) + code += f''' + // Ripple {i} + {{ + float amplitude = {amp}f; + float frequency = {freq}f; + float decay_val = {decay}f; + float rcx = {rcx}f; + float rcy = {rcy}f; + + float rdx = src_x - rcx; + float rdy = src_y - rcy; + float dist = sqrtf(rdx * rdx + rdy * rdy); + + float wave = sinf(dist * frequency * 0.1f + ripple_phase); + float amp = amplitude * expf(-dist * decay_val * 0.01f); + + if (dist > 0.001f) {{ + ripple_dx += rdx / dist * wave * amp; + ripple_dy += rdy / dist * wave * amp; + }} + }} +''' + + # Apply geometric transforms + code += ''' + // Apply accumulated transforms + { + float dx = src_x - cx; + float dy = src_y - cy; + float rx = total_cos * dx + total_sin * dy; + float ry = -total_sin * dx + total_cos * dy; + rx /= total_zoom; + ry /= total_zoom; + src_x = rx + cx - ripple_dx; + src_y = ry + cy - ripple_dy; + } + + // Bilinear sample + float r, g, b; + if (src_x < 0 || src_x >= width - 1 || src_y < 0 || src_y >= height - 1) { + r = g = b = 0; + } else { + int x0 = (int)src_x; + int y0 = (int)src_y; + float fx = src_x - x0; + float fy = src_y - y0; + + int idx00 = (y0 * width + x0) * 3; + int idx10 = (y0 * width + x0 + 1) * 3; + int idx01 = ((y0 + 1) * width + x0) * 3; + int idx11 = ((y0 + 1) * width + x0 + 1) * 3; + + #define BILERP(c) \\ + (src[idx00 + c] * (1-fx) * (1-fy) + \\ + src[idx10 + c] * fx * (1-fy) + \\ + src[idx01 + c] * (1-fx) * fy + \\ + src[idx11 + c] * fx * fy) + + r = BILERP(0); + g = BILERP(1); + b = BILERP(2); + } + +''' + + # Add color transforms + for i, effect in enumerate(effects): + op = effect['op'] + + if op == 'hue_shift': + degrees = effect.get('degrees', 0) + code += f''' + // Hue shift {i} + {{ + float shift = {degrees}f; + float rf = r / 255.0f; + float gf = g / 255.0f; + float bf = b / 255.0f; + + float max_c = fmaxf(rf, fmaxf(gf, bf)); + float min_c = fminf(rf, fminf(gf, bf)); + float delta = max_c - min_c; + + float h = 0, s = 0, v = max_c; + + if (delta > 0.00001f) {{ + s = delta / max_c; + if (rf >= max_c) h = (gf - bf) / delta; + else if (gf >= max_c) h = 2.0f + (bf - rf) / delta; + else h = 4.0f + (rf - gf) / delta; + h *= 60.0f; + if (h < 0) h += 360.0f; + }} + + h = fmodf(h + shift + 360.0f, 360.0f); + + float c = v * s; + float x_val = c * (1 - fabsf(fmodf(h / 60.0f, 2.0f) - 1)); + float m = v - c; + + float r2, g2, b2; + if (h < 60) {{ r2 = c; g2 = x_val; b2 = 0; }} + else if (h < 120) {{ r2 = x_val; g2 = c; b2 = 0; }} + else if (h < 180) {{ r2 = 0; g2 = c; b2 = x_val; }} + else if (h < 240) {{ r2 = 0; g2 = x_val; b2 = c; }} + else if (h < 300) {{ r2 = x_val; g2 = 0; b2 = c; }} + else {{ r2 = c; g2 = 0; b2 = x_val; }} + + r = (r2 + m) * 255.0f; + g = (g2 + m) * 255.0f; + b = (b2 + m) * 255.0f; + }} +''' + elif op == 'brightness': + code += ''' + // Brightness + { + r *= brightness_factor; + g *= brightness_factor; + b *= brightness_factor; + } +''' + + # Write output + code += ''' + // Write output + int dst_idx = (y * width + x) * 3; + dst[dst_idx] = (unsigned char)fminf(255.0f, fmaxf(0.0f, r)); + dst[dst_idx + 1] = (unsigned char)fminf(255.0f, fmaxf(0.0f, g)); + dst[dst_idx + 2] = (unsigned char)fminf(255.0f, fmaxf(0.0f, b)); +} +''' + + return code + + # Test the compiler if __name__ == '__main__': + import time + print("[sexp_to_cuda] Testing fused kernel compiler...") + print("=" * 60) # Define a test pipeline effects = [ {'op': 'rotate', 'angle': 45.0}, - {'op': 'zoom', 'amount': 1.2}, {'op': 'hue_shift', 'degrees': 30.0}, - {'op': 'ripple', 'amplitude': 10, 'frequency': 8, 'decay': 2, 'phase': 0, 'center_x': 960, 'center_y': 540}, + {'op': 'ripple', 'amplitude': 15, 'frequency': 10, 'decay': 2, 'phase': 0, 'center_x': 960, 'center_y': 540}, + {'op': 'brightness', 'factor': 1.0}, ] - # Compile - pipeline = compile_frame_pipeline(effects, 1920, 1080) - - # Test with dummy frame - import time frame = cp.random.randint(0, 255, (1080, 1920, 3), dtype=cp.uint8) + # ===== Test 1: Standard fused kernel (params passed from Python) ===== + print("\n[Test 1] Standard fused kernel (Python computes params)") + pipeline = compile_frame_pipeline(effects, 1920, 1080) + # Warmup output = pipeline(frame) cp.cuda.Stream.null.synchronize() - # Benchmark + # Benchmark with Python param computation start = time.time() for i in range(100): - output = pipeline(frame, rotate_angle=i, ripple_phase=i*0.1) + # Simulate Python computing params (like sexp interpreter does) + import math + t = i / 30.0 + angle = t * 30 + phase = t * 2 + brightness = 0.8 + 0.4 * math.sin(t * 2) + output = pipeline(frame, rotate_angle=angle, ripple_phase=phase) cp.cuda.Stream.null.synchronize() elapsed = time.time() - start - print(f"Fused kernel: {elapsed/100*1000:.2f}ms per frame") - print(f"That's {100/elapsed:.0f} fps potential!") + print(f" Time: {elapsed/100*1000:.2f}ms per frame") + print(f" FPS: {100/elapsed:.0f}") + + # ===== Test 2: Autonomous kernel (GPU computes everything) ===== + print("\n[Test 2] Autonomous kernel (GPU computes ALL params)") + + dynamic_expressions = { + 'rotate_angle': 't * 30.0f', + 'ripple_phase': 't * 2.0f', + 'brightness_factor': '0.8f + 0.4f * sinf(t * 2.0f)', + } + + auto_pipeline = compile_autonomous_pipeline(effects, 1920, 1080, dynamic_expressions) + + # Warmup + output = auto_pipeline(frame, 0, 30.0) + cp.cuda.Stream.null.synchronize() + + # Benchmark - NO Python computation in loop! + start = time.time() + for i in range(100): + output = auto_pipeline(frame, i, 30.0) # Just pass frame_num! + cp.cuda.Stream.null.synchronize() + elapsed = time.time() - start + + print(f" Time: {elapsed/100*1000:.2f}ms per frame") + print(f" FPS: {100/elapsed:.0f}") + + print("\n" + "=" * 60) + print("Autonomous kernel eliminates Python from hot path!")