Add full optimized pipeline test
This commit is contained in:
161
test_full_optimized.py
Normal file
161
test_full_optimized.py
Normal file
@@ -0,0 +1,161 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Full Optimized GPU Pipeline Test
|
||||
|
||||
This demonstrates the maximum performance achievable:
|
||||
1. Pre-allocated GPU frame buffer
|
||||
2. Autonomous CUDA kernel (all params computed on GPU)
|
||||
3. GPU HLS encoder (zero-copy NVENC)
|
||||
|
||||
The entire pipeline runs on GPU with zero CPU involvement per frame!
|
||||
"""
|
||||
|
||||
import time
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, '/app')
|
||||
|
||||
import cupy as cp
|
||||
import numpy as np
|
||||
from streaming.sexp_to_cuda import compile_autonomous_pipeline
|
||||
|
||||
# Try to import the GPU HLS encoder; fall back to the ffmpeg path if the
# module is missing or NVENC probing fails.
try:
    from streaming.gpu_output import GPUHLSOutput, check_gpu_encode_available
    GPU_ENCODE = check_gpu_encode_available()
except Exception:
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still
    # propagate; any import or probe failure means "no GPU encode".
    GPU_ENCODE = False
|
||||
|
||||
def run_optimized_stream(duration: float = 10.0, fps: float = 30.0, output_dir: str = '/tmp/optimized'):
    """Run the fully GPU-resident effects pipeline and report throughput.

    Renders a synthetic 1080p gradient frame through an autonomous CUDA
    kernel (per-frame effect parameters are computed on the GPU from the
    frame time), then encodes either via the zero-copy GPU HLS encoder or,
    as a fallback, by piping raw RGB frames to an ffmpeg subprocess.

    Args:
        duration: Stream length in seconds.
        fps: Target frames per second (also passed to the encoder).
        output_dir: Directory receiving HLS segments or the fallback MP4.

    Raises:
        ValueError: If ``int(duration * fps)`` yields no frames — without
            this guard the statistics below would raise ZeroDivisionError.
    """
    width, height = 1920, 1080
    n_frames = int(duration * fps)
    if n_frames <= 0:
        raise ValueError(f"duration={duration} at fps={fps} produces no frames")

    print("=" * 60)
    print("FULL OPTIMIZED GPU PIPELINE")
    print("=" * 60)
    print(f"Resolution: {width}x{height}")
    print(f"Duration: {duration}s ({n_frames} frames @ {fps}fps)")
    print(f"GPU encode: {GPU_ENCODE}")
    print("=" * 60)

    # Pre-allocate the frame buffer once on the GPU; it is reused every frame.
    print("\n[1/4] Pre-allocating GPU frame buffer...")
    frame = cp.zeros((height, width, 3), dtype=cp.uint8)
    # Fill with a gradient test pattern (R varies with y, G with x, B flat).
    y_grad = cp.linspace(0, 255, height, dtype=cp.float32)[:, cp.newaxis]
    x_grad = cp.linspace(0, 255, width, dtype=cp.float32)[cp.newaxis, :]
    frame[:, :, 0] = (y_grad * 0.5).astype(cp.uint8)  # R
    frame[:, :, 1] = (x_grad * 0.5).astype(cp.uint8)  # G
    frame[:, :, 2] = 128  # B

    # Static effect chain; dynamic parameters below override these per frame.
    effects = [
        {'op': 'rotate', 'angle': 0},
        {'op': 'hue_shift', 'degrees': 30},
        {'op': 'ripple', 'amplitude': 20, 'frequency': 12, 'decay': 2, 'phase': 0, 'center_x': 960, 'center_y': 540},
        {'op': 'brightness', 'factor': 1.0},
    ]

    # Dynamic expressions: CUDA C expressions in `t` (seconds), evaluated
    # on the GPU each frame — no per-frame CPU parameter upload.
    dynamic_expressions = {
        'rotate_angle': 't * 45.0f',  # 45 degrees per second
        'ripple_phase': 't * 3.0f',  # Ripple animation
        'brightness_factor': '0.7f + 0.3f * sinf(t * 2.0f)',  # Pulsing brightness
    }

    # Compile the whole effect chain into one autonomous kernel.
    print("[2/4] Compiling autonomous CUDA kernel...")
    pipeline = compile_autonomous_pipeline(effects, width, height, dynamic_expressions)

    # Set up the output sink.
    print("[3/4] Setting up output...")
    os.makedirs(output_dir, exist_ok=True)

    if GPU_ENCODE:
        print(" Using GPU HLS encoder (zero-copy)")
        out = GPUHLSOutput(output_dir, size=(width, height), fps=fps)
    else:
        print(" Using ffmpeg encoder")
        import subprocess
        cmd = [
            'ffmpeg', '-y',
            '-f', 'rawvideo', '-vcodec', 'rawvideo',
            '-pix_fmt', 'rgb24', '-s', f'{width}x{height}', '-r', str(fps),
            '-i', '-',
            '-c:v', 'h264_nvenc', '-preset', 'p4', '-cq', '18',
            '-pix_fmt', 'yuv420p',
            f'{output_dir}/output.mp4'
        ]
        proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.DEVNULL)

    # Warmup launch: excludes one-time JIT/module-load cost from the timings.
    output = pipeline(frame, 0, fps)
    cp.cuda.Stream.null.synchronize()

    # Run the pipeline!
    print(f"[4/4] Running {n_frames} frames...")
    print("-" * 60)

    frame_times = []
    start_total = time.time()

    for i in range(n_frames):
        frame_start = time.time()

        # Apply effects (autonomous kernel - all on GPU!)
        output = pipeline(frame, i, fps)

        # Write output
        if GPU_ENCODE:
            out.write(output, i / fps)
        else:
            # Transfer to CPU for ffmpeg (slower path)
            cpu_frame = cp.asnumpy(output)
            proc.stdin.write(cpu_frame.tobytes())

        # Synchronize so the per-frame timing reflects completed GPU work,
        # not just asynchronous kernel launches.
        cp.cuda.Stream.null.synchronize()
        frame_times.append(time.time() - frame_start)

        # Progress report once per 30 frames (slice always has 30 entries
        # because the modulo guarantees i + 1 >= 30 here).
        if (i + 1) % 30 == 0:
            avg_ms = sum(frame_times[-30:]) / 30 * 1000
            print(f" Frame {i+1}/{n_frames}: {avg_ms:.1f}ms/frame")

    total_time = time.time() - start_total

    # Cleanup: flush and finalize the encoder.
    if GPU_ENCODE:
        out.close()
    else:
        proc.stdin.close()
        proc.wait()

    # Summary statistics (n_frames >= 1 is guaranteed by the guard above).
    print("-" * 60)
    avg_ms = sum(frame_times) / len(frame_times) * 1000
    actual_fps = n_frames / total_time

    print("\nRESULTS:")
    print("=" * 60)
    print(f"Total time: {total_time:.2f}s")
    print(f"Avg per frame: {avg_ms:.2f}ms")
    print(f"Actual FPS: {actual_fps:.0f}")
    print(f"Real-time: {actual_fps / fps:.1f}x")
    print("=" * 60)

    if GPU_ENCODE:
        print(f"\nOutput: {output_dir}/*.ts (HLS segments)")
    else:
        print(f"\nOutput: {output_dir}/output.mp4")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    import argparse

    # CLI entry point: mirrors run_optimized_stream's defaults so running
    # with no flags produces a 10 s, 30 fps stream in /tmp/optimized.
    parser = argparse.ArgumentParser(
        description='Benchmark the full GPU-resident effects + encode pipeline.')
    parser.add_argument('-d', '--duration', type=float, default=10.0,
                        help='stream length in seconds (default: 10)')
    parser.add_argument('-o', '--output', default='/tmp/optimized',
                        help='output directory for HLS segments / MP4 (default: /tmp/optimized)')
    parser.add_argument('--fps', type=float, default=30.0,
                        help='target frames per second (default: 30)')
    args = parser.parse_args()

    run_optimized_stream(args.duration, args.fps, args.output)
|
||||
Reference in New Issue
Block a user