#!/usr/bin/env python3
"""
Full Optimized GPU Pipeline Test

This demonstrates the maximum performance achievable:
1. Pre-allocated GPU frame buffer
2. Autonomous CUDA kernel (all params computed on GPU)
3. GPU HLS encoder (zero-copy NVENC)

The entire pipeline runs on GPU with zero CPU involvement per frame!
"""

import time
import sys
import os

sys.path.insert(0, '/app')

import cupy as cp
import numpy as np
from streaming.sexp_to_cuda import compile_autonomous_pipeline

# Try to import GPU encoder; fall back to the ffmpeg path when unavailable.
# NOTE: narrowed from a bare `except:` — a bare except also swallows
# SystemExit/KeyboardInterrupt, which should never be masked here.
try:
    from streaming.gpu_output import GPUHLSOutput, check_gpu_encode_available
    GPU_ENCODE = check_gpu_encode_available()
except Exception:
    GPU_ENCODE = False

def run_optimized_stream(duration: float = 10.0, fps: float = 30.0, output_dir: str = '/tmp/optimized'):
    """Render and encode a GPU-generated test stream, printing timing stats.

    Builds a gradient test frame in GPU memory, compiles the autonomous
    CUDA effect pipeline, then renders ``duration * fps`` frames, writing
    them either through the zero-copy GPU HLS encoder (when ``GPU_ENCODE``
    is true) or by piping raw RGB frames into an ffmpeg subprocess.

    Args:
        duration: Length of the stream in seconds.
        fps: Frame rate; also used to derive per-frame timestamps.
        output_dir: Destination directory (HLS ``.ts`` segments or
            ``output.mp4``), created if missing.
    """
    width, height = 1920, 1080  # fixed 1080p benchmark resolution
    n_frames = int(duration * fps)

    print("=" * 60)
    print("FULL OPTIMIZED GPU PIPELINE")
    print("=" * 60)
    print(f"Resolution: {width}x{height}")
    print(f"Duration: {duration}s ({n_frames} frames @ {fps}fps)")
    print(f"GPU encode: {GPU_ENCODE}")
    print("=" * 60)

    # Pre-allocate frame buffer on GPU (reused for every frame; the kernel
    # reads from it and writes its output elsewhere)
    print("\n[1/4] Pre-allocating GPU frame buffer...")
    frame = cp.zeros((height, width, 3), dtype=cp.uint8)
    # Create a gradient pattern as the static source image
    y_grad = cp.linspace(0, 255, height, dtype=cp.float32)[:, cp.newaxis]
    x_grad = cp.linspace(0, 255, width, dtype=cp.float32)[cp.newaxis, :]
    frame[:, :, 0] = (y_grad * 0.5).astype(cp.uint8)  # R
    frame[:, :, 1] = (x_grad * 0.5).astype(cp.uint8)  # G
    frame[:, :, 2] = 128  # B

    # Define effects (static params here; time-varying ones are overridden
    # by the dynamic expressions below)
    effects = [
        {'op': 'rotate', 'angle': 0},
        {'op': 'hue_shift', 'degrees': 30},
        {'op': 'ripple', 'amplitude': 20, 'frequency': 12, 'decay': 2, 'phase': 0, 'center_x': 960, 'center_y': 540},
        {'op': 'brightness', 'factor': 1.0},
    ]

    # Dynamic expressions (computed on GPU) — CUDA C expressions in the
    # time variable `t`, so no per-frame CPU parameter updates are needed.
    dynamic_expressions = {
        'rotate_angle': 't * 45.0f',  # 45 degrees per second
        'ripple_phase': 't * 3.0f',  # Ripple animation
        'brightness_factor': '0.7f + 0.3f * sinf(t * 2.0f)',  # Pulsing brightness
    }

    # Compile autonomous pipeline
    print("[2/4] Compiling autonomous CUDA kernel...")
    pipeline = compile_autonomous_pipeline(effects, width, height, dynamic_expressions)

    # Setup output
    print("[3/4] Setting up output...")
    os.makedirs(output_dir, exist_ok=True)

    if GPU_ENCODE:
        print(" Using GPU HLS encoder (zero-copy)")
        out = GPUHLSOutput(output_dir, size=(width, height), fps=fps)
    else:
        print(" Using ffmpeg encoder")
        import subprocess
        # Raw RGB frames piped on stdin; NVENC still does the encode,
        # but each frame takes a GPU->CPU->GPU round trip.
        cmd = [
            'ffmpeg', '-y',
            '-f', 'rawvideo', '-vcodec', 'rawvideo',
            '-pix_fmt', 'rgb24', '-s', f'{width}x{height}', '-r', str(fps),
            '-i', '-',
            '-c:v', 'h264_nvenc', '-preset', 'p4', '-cq', '18',
            '-pix_fmt', 'yuv420p',
            f'{output_dir}/output.mp4'
        ]
        proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.DEVNULL)

    # Warmup: first call triggers kernel compilation/caching so it does
    # not skew the timed loop below.
    output = pipeline(frame, 0, fps)
    cp.cuda.Stream.null.synchronize()

    # Run the pipeline!
    print(f"[4/4] Running {n_frames} frames...")
    print("-" * 60)

    frame_times = []
    start_total = time.time()

    for i in range(n_frames):
        frame_start = time.time()

        # Apply effects (autonomous kernel - all on GPU!)
        output = pipeline(frame, i, fps)

        # Write output
        if GPU_ENCODE:
            out.write(output, i / fps)
        else:
            # Transfer to CPU for ffmpeg (slower path)
            cpu_frame = cp.asnumpy(output)
            proc.stdin.write(cpu_frame.tobytes())

        # Synchronize so the wall-clock delta includes the async GPU work.
        cp.cuda.Stream.null.synchronize()
        frame_times.append(time.time() - frame_start)

        # Progress: rolling average over the last 30 frames
        if (i + 1) % 30 == 0:
            avg_ms = sum(frame_times[-30:]) / 30 * 1000
            print(f" Frame {i+1}/{n_frames}: {avg_ms:.1f}ms/frame")

    total_time = time.time() - start_total

    # Cleanup
    if GPU_ENCODE:
        out.close()
    else:
        proc.stdin.close()
        proc.wait()

    # Results
    print("-" * 60)
    avg_ms = sum(frame_times) / len(frame_times) * 1000
    actual_fps = n_frames / total_time

    print("\nRESULTS:")
    print("=" * 60)
    print(f"Total time: {total_time:.2f}s")
    print(f"Avg per frame: {avg_ms:.2f}ms")
    print(f"Actual FPS: {actual_fps:.0f}")
    print(f"Real-time: {actual_fps / fps:.1f}x")
    print("=" * 60)

    if GPU_ENCODE:
        print(f"\nOutput: {output_dir}/*.ts (HLS segments)")
    else:
        print(f"\nOutput: {output_dir}/output.mp4")

if __name__ == '__main__':
    import argparse

    # CLI entry point: every option is optional and mirrors the
    # run_optimized_stream() defaults.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-d', '--duration', type=float, default=10.0)
    arg_parser.add_argument('-o', '--output', default='/tmp/optimized')
    arg_parser.add_argument('--fps', type=float, default=30.0)
    opts = arg_parser.parse_args()
    run_optimized_stream(opts.duration, opts.fps, opts.output)