# celery/diagnose_gpu.py

#!/usr/bin/env python3
"""
GPU Rendering Diagnostic Script

Checks for common issues that cause GPU rendering slowdowns in art-dag.
Run this script to identify potential performance bottlenecks.
"""

import sys
import subprocess
import os

def print_section(title):
    """Print a section banner.

    Emits a blank line, a 60-character ``=`` rule, the title indented by
    two spaces, and a closing rule, all on stdout.
    """
    rule = "=" * 60
    print("\n" + rule, "  {}".format(title), rule, sep="\n")

def check_pass(msg):
    """Print *msg* on stdout as a passing check, tagged ``[PASS]``."""
    print("  [PASS] {}".format(msg))

def check_fail(msg):
    """Print *msg* on stdout as a failed check, tagged ``[FAIL]``."""
    print("  [FAIL] {}".format(msg))

def check_warn(msg):
    """Print *msg* on stdout as a warning, tagged ``[WARN]``."""
    print("  [WARN] {}".format(msg))

def check_info(msg):
    """Print *msg* on stdout as an informational note, tagged ``[INFO]``."""
    print("  [INFO] {}".format(msg))

# ============================================================
# 1. Check GPU Availability
# ============================================================
print_section("1. GPU AVAILABILITY")

# Ask nvidia-smi for one CSV row per GPU (name, total/free memory,
# utilization). The call is bounded by a 5 second timeout.
try:
    result = subprocess.run(
        ["nvidia-smi", "--query-gpu=name,memory.total,memory.free,utilization.gpu",
         "--format=csv,noheader"],
        capture_output=True, text=True, timeout=5,
    )
except FileNotFoundError:
    check_fail("nvidia-smi not found - NVIDIA drivers not installed")
except Exception as e:
    # Covers subprocess.TimeoutExpired and any other launch failure.
    check_fail(f"nvidia-smi error: {e}")
else:
    # Drop blank rows so an empty report is never announced as a GPU.
    gpu_rows = [row.strip() for row in result.stdout.splitlines() if row.strip()]
    if result.returncode == 0 and gpu_rows:
        for row in gpu_rows:
            check_pass(f"GPU found: {row}")
    else:
        check_fail("nvidia-smi failed - no GPU detected")
        # Surface the tool's own explanation when it gave one.
        stderr_text = result.stderr.strip()
        if stderr_text:
            check_info(f"nvidia-smi stderr: {stderr_text}")

# ============================================================
# 2. Check CuPy
# ============================================================
print_section("2. CUPY (GPU ARRAY LIBRARY)")

# Step one: is CuPy importable at all?
try:
    import cupy as cp
except ImportError:
    check_fail("CuPy not installed - GPU rendering disabled")
else:
    check_pass(f"CuPy available, version {cp.__version__}")

    # Step two: allocate a small array on the device and synchronize the
    # default stream, so a broken GPU setup fails here rather than later.
    try:
        probe = cp.zeros((100, 100), dtype=cp.uint8)
        cp.cuda.Stream.null.synchronize()
        check_pass("CuPy GPU operations working")

        # Report the default memory pool's usage in MB (bytes / 1024**2).
        pool = cp.get_default_memory_pool()
        used_mb = pool.used_bytes() / 1024**2
        total_mb = pool.total_bytes() / 1024**2
        check_info(f"GPU memory pool: {used_mb:.1f} MB used, "
                   f"{total_mb:.1f} MB total")
    except Exception as err:
        check_fail(f"CuPy GPU test failed: {err}")

# ============================================================
# 3. Check PyNvVideoCodec (GPU Encoding)
# ============================================================
print_section("3. PYNVVIDEOCODEC (GPU ENCODING)")

# Import-only probe: the module being importable is the whole check.
try:
    import PyNvVideoCodec as nvc
except ImportError:
    check_warn("PyNvVideoCodec not available - using FFmpeg NVENC (slower)")
else:
    check_pass("PyNvVideoCodec available - zero-copy GPU encoding enabled")

# ============================================================
# 4. Check Decord GPU (Hardware Decode)
# ============================================================
print_section("4. DECORD GPU (HARDWARE DECODE)")

# Import decord and build a context for GPU 0.
# NOTE(review): gpu(0) may only construct a context handle without
# touching the device - confirm it really proves NVDEC is usable.
try:
    import decord
    from decord import gpu
    nvdec_ctx = gpu(0)
except ImportError:
    check_warn("Decord not installed - using FFmpeg decode")
except Exception as err:
    check_warn(f"Decord GPU not available ({err}) - using FFmpeg decode")
else:
    check_pass("Decord GPU (NVDEC) available - hardware video decode enabled")

# ============================================================
# 5. Check DLPack Support
# ============================================================
print_section("5. DLPACK (ZERO-COPY TRANSFER)")

# Decode one frame on GPU 0 with decord and hand it to CuPy via DLPack.
try:
    import decord
    from decord import VideoReader, gpu
    import cupy as cp
except ImportError:
    check_warn("Cannot test DLPack - decord or cupy not available")
else:
    # The check needs a real video file: take the first regular file with
    # a known video extension from the candidate directories.
    video_suffixes = ('.mp4', '.webm', '.mkv')
    test_video = None
    for path in ["/data/cache", "/tmp"]:
        try:
            # sorted() makes the pick reproducible between runs.
            names = sorted(os.listdir(path))
        except OSError:
            # Missing, unreadable, or not a directory: try the next one
            # instead of aborting the whole diagnostic run.
            continue
        for f in names:
            candidate = os.path.join(path, f)
            # isfile() rejects directories that merely look like videos.
            if f.lower().endswith(video_suffixes) and os.path.isfile(candidate):
                test_video = candidate
                break
        if test_video:
            break

    if test_video:
        try:
            vr = VideoReader(test_video, ctx=gpu(0))
            frame = vr[0]
            dlpack = frame.to_dlpack()
            gpu_frame = cp.from_dlpack(dlpack)
            check_pass(f"DLPack zero-copy working (tested with {os.path.basename(test_video)})")
        except Exception as e:
            check_fail(f"DLPack FAILED: {e}")
            check_info("This means every frame does GPU->CPU->GPU copy (SLOW)")
    else:
        check_warn("No test video found - cannot verify DLPack")

# ============================================================
# 6. Check Fast CUDA Kernels
# ============================================================
print_section("6. FAST CUDA KERNELS (JIT COMPILED)")

# Import the fast kernels, then run one of them once to confirm that it
# executes and not merely imports.
try:
    # Put the project directory first on sys.path so the `streaming`
    # package can be imported (presumably it lives under this path).
    sys.path.insert(0, '/root/art-dag/celery')
    from streaming.jit_compiler import (
        fast_rotate, fast_zoom, fast_blend, fast_hue_shift,
        fast_invert, fast_ripple, get_fast_ops
    )
except ImportError as e:
    check_warn(f"Fast CUDA kernels not available: {e}")
    check_info("Fallback to slower CuPy operations")
except Exception as e:
    # Any other failure while importing the module must not abort the
    # remaining checks of this diagnostic run.
    check_fail(f"Fast CUDA kernels failed to load: {e}")
    check_info("Fallback to slower CuPy operations")
else:
    check_pass("Fast CUDA kernels loaded successfully")

    # Smoke-test one kernel on a blank 1280x720 RGB frame.
    try:
        import cupy as cp
        test_img = cp.zeros((720, 1280, 3), dtype=cp.uint8)
        fast_rotate(test_img, 45.0)
        # Wait for the default stream so failures are reported here.
        cp.cuda.Stream.null.synchronize()
        check_pass("Fast rotate kernel working")
    except Exception as e:
        check_fail(f"Fast kernel execution failed: {e}")

# ============================================================
# 7. Check Fused Pipeline Compiler
# ============================================================
print_section("7. FUSED PIPELINE COMPILER")

# Import-only probe for the two pipeline compiler entry points.
try:
    # Put the project directory first on sys.path so the `streaming`
    # package can be imported (presumably it lives under this path).
    sys.path.insert(0, '/root/art-dag/celery')
    from streaming.sexp_to_cuda import compile_frame_pipeline, compile_autonomous_pipeline
except ImportError as e:
    check_warn(f"Fused pipeline compiler not available: {e}")
    check_info("Using per-operation fallback (slower for multi-effect pipelines)")
except Exception as e:
    # Any other failure while importing the module must not abort the
    # remaining checks of this diagnostic run.
    check_fail(f"Fused pipeline compiler failed to load: {e}")
    check_info("Using per-operation fallback (slower for multi-effect pipelines)")
else:
    check_pass("Fused CUDA pipeline compiler available")

# ============================================================
# 8. Check FFmpeg NVENC
# ============================================================
print_section("8. FFMPEG NVENC (HARDWARE ENCODE)")

# Search the text of `ffmpeg -encoders` for the NVENC encoder names.
try:
    encoder_listing = subprocess.run(
        ["ffmpeg", "-encoders"], capture_output=True, text=True, timeout=5
    ).stdout

    # H.264: report either way, since libx264 is the CPU fallback.
    if "h264_nvenc" not in encoder_listing:
        check_warn("FFmpeg h264_nvenc not available - using libx264 (CPU)")
    else:
        check_pass("FFmpeg h264_nvenc encoder available")

    # HEVC: only reported when present.
    if "hevc_nvenc" in encoder_listing:
        check_pass("FFmpeg hevc_nvenc encoder available")
except Exception as err:
    check_fail(f"FFmpeg check failed: {err}")

# ============================================================
# 9. Check FFmpeg NVDEC
# ============================================================
print_section("9. FFMPEG NVDEC (HARDWARE DECODE)")

# Search the text of `ffmpeg -hwaccels` for the "cuda" method.
try:
    hwaccel_listing = subprocess.run(
        ["ffmpeg", "-hwaccels"], capture_output=True, text=True, timeout=5
    ).stdout
    if "cuda" not in hwaccel_listing:
        check_warn("FFmpeg CUDA hwaccel not available - using CPU decode")
    else:
        check_pass("FFmpeg CUDA hwaccel available")
except Exception as err:
    check_fail(f"FFmpeg hwaccel check failed: {err}")

# ============================================================
# 10. Check Pipeline Cache Status
# ============================================================
print_section("10. PIPELINE CACHE STATUS")

# Report how many entries each pipeline cache currently holds. Any
# failure here is informational only.
try:
    sys.path.insert(0, '/root/art-dag/celery')
    from sexp_effects.primitive_libs.streaming_gpu import (
        _FUSED_PIPELINE_CACHE, _AUTONOMOUS_PIPELINE_CACHE
    )
    n_fused = len(_FUSED_PIPELINE_CACHE)
    n_auto = len(_AUTONOMOUS_PIPELINE_CACHE)

    if n_fused == 0 and n_auto == 0:
        check_info("Pipeline caches empty (no rendering done yet)")
    else:
        check_info(f"Fused pipeline cache: {n_fused} entries")
        check_info(f"Autonomous pipeline cache: {n_auto} entries")
        # Warn once if either cache has grown past 100 entries.
        if max(n_fused, n_auto) > 100:
            check_warn("Large pipeline cache - may cause memory pressure")
except Exception as err:
    check_info(f"Could not check pipeline cache: {err}")

# ============================================================
# Summary
# ============================================================
print_section("SUMMARY")

# Static guidance text, one list entry per output line. The empty first
# and last entries produce the leading and trailing blank lines.
summary_lines = [
    "",
    "Optimal GPU rendering requires:",
    "  1. [CRITICAL] CuPy with working GPU operations",
    "  2. [CRITICAL] DLPack zero-copy transfer (decord -> CuPy)",
    "  3. [HIGH] Fast CUDA kernels from jit_compiler",
    "  4. [MEDIUM] Fused pipeline compiler for multi-effect recipes",
    "  5. [MEDIUM] PyNvVideoCodec for zero-copy encoding",
    "  6. [LOW] FFmpeg NVENC/NVDEC as fallback",
    "",
    "If DLPack is failing, check:",
    "  - decord version (needs 0.6.0+ with DLPack support)",
    "  - CuPy version compatibility",
    "  - CUDA toolkit version match",
    "",
    "If fast kernels are not loading:",
    "  - Check if streaming/jit_compiler.py exists",
    "  - Verify CUDA compiler (nvcc) is available",
    "",
]
print("\n".join(summary_lines))