Add fused-pipeline primitive and test for compiled CUDA kernels

2026-02-04 09:51:56 +00:00
parent 8b9309a90b
commit 2d20a6f452
3 changed files with 241 additions and 0 deletions
--- a/test_fused_direct.py
+++ b/test_fused_direct.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""
+Direct test of fused pipeline primitive.
+
+Compares performance of:
+1. Fused kernel (single CUDA kernel for all effects)
+2. Separate kernels (one CUDA kernel per effect)
+"""
+
+import time
+import sys
+
+# CuPy is a hard requirement: if the import fails, print a notice and exit with status 1.
+try:
+    import cupy as cp
+    print("[test] CuPy available")
+except ImportError:
+    print("[test] CuPy not available - can't run test")
+    sys.exit(1)
+
+# Prepend /app to sys.path ahead of the `streaming` imports below (presumably where that package lives).
+sys.path.insert(0, '/app')
+
+from streaming.sexp_to_cuda import compile_frame_pipeline
+from streaming.jit_compiler import fast_rotate, fast_hue_shift, fast_ripple
+
+def test_fused_vs_separate():
+    """Compare fused vs separate kernel performance."""
+
+    width, height = 1920, 1080
+    n_frames = 100
+
+    # Create test frame
+    frame = cp.random.randint(0, 255, (height, width, 3), dtype=cp.uint8)
+
+    # Define effects pipeline
+    effects = [
+        {'op': 'rotate', 'angle': 45.0},
+        {'op': 'hue_shift', 'degrees': 30.0},
+        {'op': 'ripple', 'amplitude': 15, 'frequency': 10, 'decay': 2, 'phase': 0, 'center_x': 960, 'center_y': 540},
+    ]
+
+    print(f"\n[test] Testing {n_frames} frames at {width}x{height}")
+    print(f"[test] Effects: rotate, hue_shift, ripple\n")
+
+    # ========== Test fused kernel ==========
+    print("[test] Compiling fused kernel...")
+    pipeline = compile_frame_pipeline(effects, width, height)
+
+    # Warmup
+    output = pipeline(frame, rotate_angle=45, ripple_phase=0)
+    cp.cuda.Stream.null.synchronize()
+
+    print("[test] Running fused kernel benchmark...")
+    start = time.time()
+    for i in range(n_frames):
+        output = pipeline(frame, rotate_angle=i * 3.6, ripple_phase=i * 0.1)
+    cp.cuda.Stream.null.synchronize()
+    fused_time = time.time() - start
+
+    fused_ms = fused_time / n_frames * 1000
+    fused_fps = n_frames / fused_time
+    print(f"[test] Fused kernel: {fused_ms:.2f}ms/frame ({fused_fps:.0f} fps)")
+
+    # ========== Test separate kernels ==========
+    print("\n[test] Running separate kernels benchmark...")
+
+    # Warmup
+    temp = fast_rotate(frame, 45.0)
+    temp = fast_hue_shift(temp, 30.0)
+    temp = fast_ripple(temp, 15, 10, 2, 0, 960, 540)
+    cp.cuda.Stream.null.synchronize()
+
+    start = time.time()
+    for i in range(n_frames):
+        temp = fast_rotate(frame, i * 3.6)
+        temp = fast_hue_shift(temp, 30.0)
+        temp = fast_ripple(temp, 15, 10, 2, i * 0.1, 960, 540)
+    cp.cuda.Stream.null.synchronize()
+    separate_time = time.time() - start
+
+    separate_ms = separate_time / n_frames * 1000
+    separate_fps = n_frames / separate_time
+    print(f"[test] Separate kernels: {separate_ms:.2f}ms/frame ({separate_fps:.0f} fps)")
+
+    # ========== Summary ==========
+    speedup = separate_time / fused_time
+    print(f"\n{'='*50}")
+    print(f"SPEEDUP: {speedup:.1f}x faster with fused kernel")
+    print(f"")
+    print(f"Fused:    {fused_ms:.2f}ms ({fused_fps:.0f} fps)")
+    print(f"Separate: {separate_ms:.2f}ms ({separate_fps:.0f} fps)")
+    print(f"{'='*50}")
+
+    # Compare with original Python sexp interpreter baseline (126-205ms)
+    python_baseline_ms = 150  # Approximate from profiling
+    vs_python = python_baseline_ms / fused_ms
+    print(f"\nVs Python sexp interpreter (~{python_baseline_ms}ms): {vs_python:.0f}x faster!")
+
+
+if __name__ == '__main__':  # run the benchmark only when executed as a script
+    test_fused_vs_separate()