Add full optimized pipeline test
This commit is contained in:
161
test_full_optimized.py
Normal file
161
test_full_optimized.py
Normal file
@@ -0,0 +1,161 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Full Optimized GPU Pipeline Test
|
||||
|
||||
This demonstrates the maximum performance achievable:
|
||||
1. Pre-allocated GPU frame buffer
|
||||
2. Autonomous CUDA kernel (all params computed on GPU)
|
||||
3. GPU HLS encoder (zero-copy NVENC)
|
||||
|
||||
The entire pipeline runs on GPU with zero CPU involvement per frame!
|
||||
"""
|
||||
|
||||
import time
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, '/app')
|
||||
|
||||
import cupy as cp
|
||||
import numpy as np
|
||||
from streaming.sexp_to_cuda import compile_autonomous_pipeline
|
||||
|
||||
# Try to import the GPU HLS encoder; fall back to the ffmpeg path if the
# module is missing or NVENC probing fails.
try:
    from streaming.gpu_output import GPUHLSOutput, check_gpu_encode_available
    GPU_ENCODE = check_gpu_encode_available()
except Exception:
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still
    # propagate; any import or probe failure means "no GPU encode".
    GPU_ENCODE = False
|
||||
|
||||
def run_optimized_stream(duration: float = 10.0, fps: float = 30.0, output_dir: str = '/tmp/optimized'):
    """Run the fully GPU-resident effects pipeline and report throughput.

    Renders a synthetic 1080p gradient frame through an autonomous CUDA
    kernel (per-frame effect parameters are computed on the GPU from the
    frame time), then encodes either via the zero-copy GPU HLS encoder or,
    as a fallback, by piping raw RGB frames to an ffmpeg subprocess.

    Args:
        duration: Stream length in seconds.
        fps: Target frames per second (also passed to the encoder).
        output_dir: Directory receiving HLS segments or the fallback MP4.

    Raises:
        ValueError: If ``int(duration * fps)`` yields no frames — without
            this guard the statistics below would raise ZeroDivisionError.
    """
    width, height = 1920, 1080
    n_frames = int(duration * fps)
    if n_frames <= 0:
        raise ValueError(f"duration={duration} at fps={fps} produces no frames")

    print("=" * 60)
    print("FULL OPTIMIZED GPU PIPELINE")
    print("=" * 60)
    print(f"Resolution: {width}x{height}")
    print(f"Duration: {duration}s ({n_frames} frames @ {fps}fps)")
    print(f"GPU encode: {GPU_ENCODE}")
    print("=" * 60)

    # Pre-allocate the frame buffer once on the GPU; it is reused every frame.
    print("\n[1/4] Pre-allocating GPU frame buffer...")
    frame = cp.zeros((height, width, 3), dtype=cp.uint8)
    # Fill with a gradient test pattern (R varies with y, G with x, B flat).
    y_grad = cp.linspace(0, 255, height, dtype=cp.float32)[:, cp.newaxis]
    x_grad = cp.linspace(0, 255, width, dtype=cp.float32)[cp.newaxis, :]
    frame[:, :, 0] = (y_grad * 0.5).astype(cp.uint8)  # R
    frame[:, :, 1] = (x_grad * 0.5).astype(cp.uint8)  # G
    frame[:, :, 2] = 128  # B

    # Static effect chain; dynamic parameters below override these per frame.
    effects = [
        {'op': 'rotate', 'angle': 0},
        {'op': 'hue_shift', 'degrees': 30},
        {'op': 'ripple', 'amplitude': 20, 'frequency': 12, 'decay': 2, 'phase': 0, 'center_x': 960, 'center_y': 540},
        {'op': 'brightness', 'factor': 1.0},
    ]

    # Dynamic expressions: CUDA C expressions in `t` (seconds), evaluated
    # on the GPU each frame — no per-frame CPU parameter upload.
    dynamic_expressions = {
        'rotate_angle': 't * 45.0f',  # 45 degrees per second
        'ripple_phase': 't * 3.0f',  # Ripple animation
        'brightness_factor': '0.7f + 0.3f * sinf(t * 2.0f)',  # Pulsing brightness
    }

    # Compile the whole effect chain into one autonomous kernel.
    print("[2/4] Compiling autonomous CUDA kernel...")
    pipeline = compile_autonomous_pipeline(effects, width, height, dynamic_expressions)

    # Set up the output sink.
    print("[3/4] Setting up output...")
    os.makedirs(output_dir, exist_ok=True)

    if GPU_ENCODE:
        print(" Using GPU HLS encoder (zero-copy)")
        out = GPUHLSOutput(output_dir, size=(width, height), fps=fps)
    else:
        print(" Using ffmpeg encoder")
        import subprocess
        cmd = [
            'ffmpeg', '-y',
            '-f', 'rawvideo', '-vcodec', 'rawvideo',
            '-pix_fmt', 'rgb24', '-s', f'{width}x{height}', '-r', str(fps),
            '-i', '-',
            '-c:v', 'h264_nvenc', '-preset', 'p4', '-cq', '18',
            '-pix_fmt', 'yuv420p',
            f'{output_dir}/output.mp4'
        ]
        proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.DEVNULL)

    # Warmup launch: excludes one-time JIT/module-load cost from the timings.
    output = pipeline(frame, 0, fps)
    cp.cuda.Stream.null.synchronize()

    # Run the pipeline!
    print(f"[4/4] Running {n_frames} frames...")
    print("-" * 60)

    frame_times = []
    start_total = time.time()

    for i in range(n_frames):
        frame_start = time.time()

        # Apply effects (autonomous kernel - all on GPU!)
        output = pipeline(frame, i, fps)

        # Write output
        if GPU_ENCODE:
            out.write(output, i / fps)
        else:
            # Transfer to CPU for ffmpeg (slower path)
            cpu_frame = cp.asnumpy(output)
            proc.stdin.write(cpu_frame.tobytes())

        # Synchronize so the per-frame timing reflects completed GPU work,
        # not just asynchronous kernel launches.
        cp.cuda.Stream.null.synchronize()
        frame_times.append(time.time() - frame_start)

        # Progress report once per 30 frames (slice always has 30 entries
        # because the modulo guarantees i + 1 >= 30 here).
        if (i + 1) % 30 == 0:
            avg_ms = sum(frame_times[-30:]) / 30 * 1000
            print(f" Frame {i+1}/{n_frames}: {avg_ms:.1f}ms/frame")

    total_time = time.time() - start_total

    # Cleanup: flush and finalize the encoder.
    if GPU_ENCODE:
        out.close()
    else:
        proc.stdin.close()
        proc.wait()

    # Summary statistics (n_frames >= 1 is guaranteed by the guard above).
    print("-" * 60)
    avg_ms = sum(frame_times) / len(frame_times) * 1000
    actual_fps = n_frames / total_time

    print("\nRESULTS:")
    print("=" * 60)
    print(f"Total time: {total_time:.2f}s")
    print(f"Avg per frame: {avg_ms:.2f}ms")
    print(f"Actual FPS: {actual_fps:.0f}")
    print(f"Real-time: {actual_fps / fps:.1f}x")
    print("=" * 60)

    if GPU_ENCODE:
        print(f"\nOutput: {output_dir}/*.ts (HLS segments)")
    else:
        print(f"\nOutput: {output_dir}/output.mp4")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    import argparse

    # CLI entry point: mirrors run_optimized_stream's defaults so running
    # with no flags produces a 10 s, 30 fps stream in /tmp/optimized.
    parser = argparse.ArgumentParser(
        description='Benchmark the full GPU-resident effects + encode pipeline.')
    parser.add_argument('-d', '--duration', type=float, default=10.0,
                        help='stream length in seconds (default: 10)')
    parser.add_argument('-o', '--output', default='/tmp/optimized',
                        help='output directory for HLS segments / MP4 (default: /tmp/optimized)')
    parser.add_argument('--fps', type=float, default=30.0,
                        help='target frames per second (default: 30)')
    args = parser.parse_args()

    run_optimized_stream(args.duration, args.fps, args.output)
|
||||
Reference in New Issue
Block a user