#!/usr/bin/env python3
"""
Direct test of fused pipeline primitive.

Compares performance of:
1. Fused kernel (single CUDA kernel for all effects)
2. Separate kernels (one CUDA kernel per effect)
"""
import time
import sys

# Check for CuPy
try:
    import cupy as cp
    print("[test] CuPy available")
except ImportError:
    print("[test] CuPy not available - can't run test")
    sys.exit(1)

# Add path for imports
sys.path.insert(0, '/app')

from streaming.sexp_to_cuda import compile_frame_pipeline
from streaming.jit_compiler import fast_rotate, fast_hue_shift, fast_ripple

# Per-frame increments applied to the animated parameters in both benchmarks.
# With the 100 frames used below, the rotate value sweeps 0 .. 356.4.
_ANGLE_STEP = 3.6
_PHASE_STEP = 0.1


def _time_frames(render, n_frames):
    """Return the elapsed seconds for rendering ``n_frames`` frames.

    Args:
        render: Callable taking ``(angle, phase)`` that processes one frame.
            Its return value is discarded.
        n_frames: Number of frames to render; frame ``i`` is rendered with
            ``angle = i * 3.6`` and ``phase = i * 0.1``.

    Returns:
        Seconds elapsed across all frames, measured with
        ``time.perf_counter()``.
    """
    start = time.perf_counter()
    for i in range(n_frames):
        render(i * _ANGLE_STEP, i * _PHASE_STEP)
    # Synchronize the null stream before stopping the clock so the
    # measurement covers the queued GPU work, not just the time to enqueue it.
    cp.cuda.Stream.null.synchronize()
    return time.perf_counter() - start


def test_fused_vs_separate():
    """Compare fused vs separate kernel performance.

    Runs the same rotate -> hue_shift -> ripple chain on one random
    1920x1080 RGB frame, first through the pipeline returned by
    ``compile_frame_pipeline`` and then through the three ``fast_*``
    functions called back to back. Prints per-frame time and fps for each
    path, the resulting speedup, and a comparison against a fixed 150 ms
    baseline figure.
    """
    width, height = 1920, 1080
    n_frames = 100
    # The ripple is centred on the frame; derive the centre from the frame
    # size instead of repeating the literals at every call site.
    center_x, center_y = width // 2, height // 2

    # Create test frame
    frame = cp.random.randint(0, 255, (height, width, 3), dtype=cp.uint8)

    # Define effects pipeline
    effects = [
        {'op': 'rotate', 'angle': 45.0},
        {'op': 'hue_shift', 'degrees': 30.0},
        {'op': 'ripple', 'amplitude': 15, 'frequency': 10, 'decay': 2,
         'phase': 0, 'center_x': center_x, 'center_y': center_y},
    ]

    print(f"\n[test] Testing {n_frames} frames at {width}x{height}")
    print("[test] Effects: rotate, hue_shift, ripple\n")

    # ========== Test fused kernel ==========
    print("[test] Compiling fused kernel...")
    pipeline = compile_frame_pipeline(effects, width, height)

    def run_fused(angle, phase):
        pipeline(frame, rotate_angle=angle, ripple_phase=phase)

    # Warmup: one untimed call so one-off first-call costs stay out of the
    # timed loop.
    run_fused(45, 0)
    cp.cuda.Stream.null.synchronize()

    print("[test] Running fused kernel benchmark...")
    fused_time = _time_frames(run_fused, n_frames)

    fused_ms = fused_time / n_frames * 1000
    fused_fps = n_frames / fused_time
    print(f"[test] Fused kernel: {fused_ms:.2f}ms/frame ({fused_fps:.0f} fps)")

    # ========== Test separate kernels ==========
    print("\n[test] Running separate kernels benchmark...")

    def run_separate(angle, phase):
        # Same three effects with the same parameters as `effects` above,
        # issued as three independent calls.
        stage = fast_rotate(frame, angle)
        stage = fast_hue_shift(stage, 30.0)
        fast_ripple(stage, 15, frequency=10, decay=2, phase=phase,
                    center_x=center_x, center_y=center_y)

    # Warmup
    run_separate(45.0, 0)
    cp.cuda.Stream.null.synchronize()

    separate_time = _time_frames(run_separate, n_frames)

    separate_ms = separate_time / n_frames * 1000
    separate_fps = n_frames / separate_time
    print(f"[test] Separate kernels: {separate_ms:.2f}ms/frame ({separate_fps:.0f} fps)")

    # ========== Summary ==========
    speedup = separate_time / fused_time
    print(f"\n{'='*50}")
    print(f"SPEEDUP: {speedup:.1f}x faster with fused kernel")
    print()
    print(f"Fused: {fused_ms:.2f}ms ({fused_fps:.0f} fps)")
    print(f"Separate: {separate_ms:.2f}ms ({separate_fps:.0f} fps)")
    print(f"{'='*50}")

    # Compare with original Python sexp interpreter baseline (126-205ms)
    python_baseline_ms = 150  # Approximate from profiling
    vs_python = python_baseline_ms / fused_ms
    print(f"\nVs Python sexp interpreter (~{python_baseline_ms}ms): {vs_python:.0f}x faster!")


if __name__ == '__main__':
    test_fused_vs_separate()