#!/usr/bin/env python3
"""
Full Optimized GPU Pipeline Test

This demonstrates the maximum performance achievable:
1. Pre-allocated GPU frame buffer
2. Autonomous CUDA kernel (all params computed on GPU)
3. GPU HLS encoder (zero-copy NVENC)

The entire pipeline runs on GPU with zero CPU involvement per frame!
"""

import time
import sys
import os

sys.path.insert(0, '/app')

import cupy as cp
import numpy as np
from streaming.sexp_to_cuda import compile_autonomous_pipeline

# Try to import GPU encoder; fall back to the ffmpeg path if it is missing
# or its availability probe fails.  Narrowed from a bare `except:` so that
# SystemExit/KeyboardInterrupt are not silently swallowed.
try:
    from streaming.gpu_output import GPUHLSOutput, check_gpu_encode_available
    GPU_ENCODE = check_gpu_encode_available()
except Exception:
    GPU_ENCODE = False


def _build_test_frame(width: int, height: int):
    """Allocate an RGB uint8 frame on the GPU filled with a gradient test pattern.

    R carries a vertical gradient, G a horizontal gradient, B is constant.
    """
    frame = cp.zeros((height, width, 3), dtype=cp.uint8)
    y_grad = cp.linspace(0, 255, height, dtype=cp.float32)[:, cp.newaxis]
    x_grad = cp.linspace(0, 255, width, dtype=cp.float32)[cp.newaxis, :]
    frame[:, :, 0] = (y_grad * 0.5).astype(cp.uint8)  # R
    frame[:, :, 1] = (x_grad * 0.5).astype(cp.uint8)  # G
    frame[:, :, 2] = 128  # B
    return frame


def _spawn_ffmpeg(width: int, height: int, fps: float, output_dir: str):
    """Start an ffmpeg process that reads raw RGB frames on stdin and
    encodes them with NVENC to `{output_dir}/output.mp4`.  Returns the Popen."""
    import subprocess
    cmd = [
        'ffmpeg', '-y',
        '-f', 'rawvideo', '-vcodec', 'rawvideo',
        '-pix_fmt', 'rgb24', '-s', f'{width}x{height}', '-r', str(fps),
        '-i', '-',
        '-c:v', 'h264_nvenc', '-preset', 'p4', '-cq', '18',
        '-pix_fmt', 'yuv420p',
        f'{output_dir}/output.mp4'
    ]
    return subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.DEVNULL)


def run_optimized_stream(duration: float = 10.0, fps: float = 30.0, output_dir: str = '/tmp/optimized'):
    """Run the fully GPU-resident effects pipeline and report throughput.

    Args:
        duration: Stream length in seconds.
        fps: Target frames per second (also the encoder frame rate).
        output_dir: Directory for HLS segments (GPU path) or output.mp4
            (ffmpeg fallback path).  Created if missing.
    """
    width, height = 1920, 1080
    n_frames = int(duration * fps)

    print("=" * 60)
    print("FULL OPTIMIZED GPU PIPELINE")
    print("=" * 60)
    print(f"Resolution: {width}x{height}")
    print(f"Duration: {duration}s ({n_frames} frames @ {fps}fps)")
    print(f"GPU encode: {GPU_ENCODE}")
    print("=" * 60)

    # Pre-allocate frame buffer on GPU
    print("\n[1/4] Pre-allocating GPU frame buffer...")
    frame = _build_test_frame(width, height)

    # Define effects.  Parameter values listed here are placeholders for any
    # parameter that also appears in dynamic_expressions (those are computed
    # per-frame on the GPU).
    effects = [
        {'op': 'rotate', 'angle': 0},
        {'op': 'hue_shift', 'degrees': 30},
        {'op': 'ripple', 'amplitude': 20, 'frequency': 12, 'decay': 2, 'phase': 0, 'center_x': 960, 'center_y': 540},
        {'op': 'brightness', 'factor': 1.0},
    ]

    # Dynamic expressions (computed on GPU); `t` is the frame time in seconds.
    dynamic_expressions = {
        'rotate_angle': 't * 45.0f',  # 45 degrees per second
        'ripple_phase': 't * 3.0f',  # Ripple animation
        'brightness_factor': '0.7f + 0.3f * sinf(t * 2.0f)',  # Pulsing brightness
    }

    # Compile autonomous pipeline
    print("[2/4] Compiling autonomous CUDA kernel...")
    pipeline = compile_autonomous_pipeline(effects, width, height, dynamic_expressions)

    # Setup output
    print("[3/4] Setting up output...")
    os.makedirs(output_dir, exist_ok=True)

    if GPU_ENCODE:
        print("  Using GPU HLS encoder (zero-copy)")
        out = GPUHLSOutput(output_dir, size=(width, height), fps=fps)
    else:
        print("  Using ffmpeg encoder")
        proc = _spawn_ffmpeg(width, height, fps, output_dir)

    # Warmup: trigger kernel compilation/caching outside the timed loop.
    output = pipeline(frame, 0, fps)
    cp.cuda.Stream.null.synchronize()

    # Run the pipeline!
    print(f"[4/4] Running {n_frames} frames...")
    print("-" * 60)

    frame_times = []
    start_total = time.time()

    for i in range(n_frames):
        frame_start = time.time()

        # Apply effects (autonomous kernel - all on GPU!)
        output = pipeline(frame, i, fps)

        # Write output
        if GPU_ENCODE:
            out.write(output, i / fps)
        else:
            # Transfer to CPU for ffmpeg (slower path)
            cpu_frame = cp.asnumpy(output)
            proc.stdin.write(cpu_frame.tobytes())

        # Synchronize so the per-frame timing measures real GPU work,
        # not just async kernel launch latency.
        cp.cuda.Stream.null.synchronize()
        frame_times.append(time.time() - frame_start)

        # Progress
        if (i + 1) % 30 == 0:
            avg_ms = sum(frame_times[-30:]) / 30 * 1000
            print(f"  Frame {i+1}/{n_frames}: {avg_ms:.1f}ms/frame")

    total_time = time.time() - start_total

    # Cleanup
    if GPU_ENCODE:
        out.close()
    else:
        proc.stdin.close()
        proc.wait()

    # Results.  Guard the divisions: n_frames may be 0 (duration * fps < 1)
    # and total_time can round to 0 in that degenerate case.
    print("-" * 60)
    avg_ms = (sum(frame_times) / len(frame_times) * 1000) if frame_times else 0.0
    actual_fps = (n_frames / total_time) if total_time > 0 else 0.0

    print("\nRESULTS:")
    print("=" * 60)
    print(f"Total time: {total_time:.2f}s")
    print(f"Avg per frame: {avg_ms:.2f}ms")
    print(f"Actual FPS: {actual_fps:.0f}")
    print(f"Real-time: {actual_fps / fps:.1f}x")
    print("=" * 60)

    if GPU_ENCODE:
        print(f"\nOutput: {output_dir}/*.ts (HLS segments)")
    else:
        print(f"\nOutput: {output_dir}/output.mp4")


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--duration', type=float, default=10.0)
    parser.add_argument('-o', '--output', default='/tmp/optimized')
    parser.add_argument('--fps', type=float, default=30.0)
    args = parser.parse_args()

    run_optimized_stream(args.duration, args.fps, args.output)