#!/usr/bin/env python3
"""
GPU Rendering Diagnostic Script

Checks for common issues that cause GPU rendering slowdowns in art-dag.
Run this script to identify potential performance bottlenecks.

Each numbered section is an independent check that prints its findings as
[PASS] / [FAIL] / [WARN] / [INFO] lines. A failing check never aborts the
run: later sections are still executed so one report covers everything.
"""

import sys
import subprocess
import os

# Directory added to sys.path so the art-dag project modules can be imported.
ART_DAG_CELERY_PATH = '/root/art-dag/celery'

# Directories scanned, in order, for a sample video used by the DLPack check.
TEST_VIDEO_DIRS = ("/data/cache", "/tmp")

# File extensions accepted as a sample video (compared case-insensitively).
VIDEO_EXTENSIONS = ('.mp4', '.webm', '.mkv')


def print_section(title):
    """Print a banner introducing a diagnostic section."""
    print(f"\n{'='*60}")
    print(f" {title}")
    print(f"{'='*60}")


def check_pass(msg):
    """Report a check that succeeded."""
    print(f" [PASS] {msg}")


def check_fail(msg):
    """Report a check that failed."""
    print(f" [FAIL] {msg}")


def check_warn(msg):
    """Report a degraded-but-usable condition."""
    print(f" [WARN] {msg}")


def check_info(msg):
    """Report supplementary information."""
    print(f" [INFO] {msg}")


def ensure_art_dag_on_path():
    """Put ART_DAG_CELERY_PATH at the front of sys.path exactly once.

    Several checks need the project modules; going through this helper keeps
    the entry first in the search order without stacking duplicate entries.
    """
    if ART_DAG_CELERY_PATH in sys.path:
        sys.path.remove(ART_DAG_CELERY_PATH)
    sys.path.insert(0, ART_DAG_CELERY_PATH)


def find_test_video(search_dirs=TEST_VIDEO_DIRS):
    """Return the path of the first video file found in *search_dirs*.

    Args:
        search_dirs: Iterable of directory paths, scanned in order.

    Returns:
        The full path of the first regular file whose name ends with one of
        VIDEO_EXTENSIONS (case-insensitive), or None when nothing matches.

    Paths that are missing, are not directories, or cannot be listed are
    skipped instead of raising, so a permissions problem on one directory
    cannot abort the whole diagnostic run.
    """
    for directory in search_dirs:
        if not os.path.isdir(directory):
            continue
        try:
            # Sorted so repeated runs test the same file.
            entries = sorted(os.listdir(directory))
        except OSError:
            # Unreadable directory: best-effort scan, move on to the next one.
            continue
        for entry in entries:
            if not entry.lower().endswith(VIDEO_EXTENSIONS):
                continue
            candidate = os.path.join(directory, entry)
            if os.path.isfile(candidate):
                return candidate
    return None


# ============================================================
# 1. Check GPU Availability
# ============================================================
def check_gpu_availability():
    """Query nvidia-smi and report every GPU row it prints."""
    print_section("1. GPU AVAILABILITY")
    try:
        result = subprocess.run(
            [
                "nvidia-smi",
                "--query-gpu=name,memory.total,memory.free,utilization.gpu",
                "--format=csv,noheader",
            ],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except FileNotFoundError:
        check_fail("nvidia-smi not found - NVIDIA drivers not installed")
        return
    except Exception as e:
        # Includes subprocess.TimeoutExpired when nvidia-smi hangs.
        check_fail(f"nvidia-smi error: {e}")
        return

    if result.returncode != 0:
        check_fail("nvidia-smi failed - no GPU detected")
        return

    # Drop blank lines so empty output is not reported as a found GPU.
    gpu_rows = [row.strip() for row in result.stdout.splitlines() if row.strip()]
    if not gpu_rows:
        check_fail("nvidia-smi returned no GPU entries - no GPU detected")
        return
    for row in gpu_rows:
        check_pass(f"GPU found: {row}")


# ============================================================
# 2. Check CuPy
# ============================================================
def check_cupy():
    """Verify CuPy imports and can run a trivial GPU operation."""
    print_section("2. CUPY (GPU ARRAY LIBRARY)")
    try:
        import cupy as cp
    except ImportError:
        check_fail("CuPy not installed - GPU rendering disabled")
        return

    check_pass(f"CuPy available, version {cp.__version__}")

    # Test basic GPU operation
    try:
        # Keep a reference so the allocation is still live when the memory
        # pool statistics are read below.
        probe = cp.zeros((100, 100), dtype=cp.uint8)
        cp.cuda.Stream.null.synchronize()
        check_pass("CuPy GPU operations working")

        # Check memory
        mempool = cp.get_default_memory_pool()
        check_info(f"GPU memory pool: {mempool.used_bytes() / 1024**2:.1f} MB used, "
                   f"{mempool.total_bytes() / 1024**2:.1f} MB total")
        del probe
    except Exception as e:
        check_fail(f"CuPy GPU test failed: {e}")


# ============================================================
# 3. Check PyNvVideoCodec (GPU Encoding)
# ============================================================
def check_pynvvideocodec():
    """Report whether the PyNvVideoCodec module can be imported."""
    print_section("3. PYNVVIDEOCODEC (GPU ENCODING)")
    try:
        import PyNvVideoCodec  # noqa: F401 - import probe only
        check_pass("PyNvVideoCodec available - zero-copy GPU encoding enabled")
    except ImportError:
        check_warn("PyNvVideoCodec not available - using FFmpeg NVENC (slower)")


# ============================================================
# 4. Check Decord GPU (Hardware Decode)
# ============================================================
def check_decord_gpu():
    """Report whether decord imports and a GPU context can be created."""
    print_section("4. DECORD GPU (HARDWARE DECODE)")
    try:
        import decord  # noqa: F401 - import probe only
        from decord import gpu
        gpu(0)
        check_pass("Decord GPU (NVDEC) available - hardware video decode enabled")
    except ImportError:
        check_warn("Decord not installed - using FFmpeg decode")
    except Exception as e:
        check_warn(f"Decord GPU not available ({e}) - using FFmpeg decode")


# ============================================================
# 5. Check DLPack Support
# ============================================================
def check_dlpack():
    """Decode one frame with decord on the GPU and hand it to CuPy via DLPack."""
    print_section("5. DLPACK (ZERO-COPY TRANSFER)")
    try:
        import decord  # noqa: F401 - import probe only
        from decord import VideoReader, gpu
        import cupy as cp
    except ImportError:
        check_warn("Cannot test DLPack - decord or cupy not available")
        return

    # Need a test video file
    test_video = find_test_video()
    if test_video is None:
        check_warn("No test video found - cannot verify DLPack")
        return

    try:
        vr = VideoReader(test_video, ctx=gpu(0))
        frame = vr[0]
        cp.from_dlpack(frame.to_dlpack())
        check_pass(f"DLPack zero-copy working (tested with {os.path.basename(test_video)})")
    except Exception as e:
        check_fail(f"DLPack FAILED: {e}")
        check_info("This means every frame does GPU->CPU->GPU copy (SLOW)")


# ============================================================
# 6. Check Fast CUDA Kernels
# ============================================================
def check_fast_kernels():
    """Import the fast kernels from streaming.jit_compiler and run fast_rotate once."""
    print_section("6. FAST CUDA KERNELS (JIT COMPILED)")
    ensure_art_dag_on_path()
    try:
        # All names are imported so a missing kernel is reported here.
        from streaming.jit_compiler import (  # noqa: F401
            fast_rotate, fast_zoom, fast_blend, fast_hue_shift,
            fast_invert, fast_ripple, get_fast_ops
        )
    except ImportError as e:
        check_warn(f"Fast CUDA kernels not available: {e}")
        check_info("Fallback to slower CuPy operations")
        return
    except Exception as e:
        # Any other error raised while the module is being imported must not
        # abort the remaining diagnostics.
        check_fail(f"Fast CUDA kernels failed to load: {e}")
        check_info("Fallback to slower CuPy operations")
        return

    check_pass("Fast CUDA kernels loaded successfully")

    # Test one kernel
    try:
        import cupy as cp
        test_img = cp.zeros((720, 1280, 3), dtype=cp.uint8)
        fast_rotate(test_img, 45.0)
        cp.cuda.Stream.null.synchronize()
        check_pass("Fast rotate kernel working")
    except Exception as e:
        check_fail(f"Fast kernel execution failed: {e}")


# ============================================================
# 7. Check Fused Pipeline Compiler
# ============================================================
def check_fused_pipeline_compiler():
    """Report whether streaming.sexp_to_cuda exposes the pipeline compilers."""
    print_section("7. FUSED PIPELINE COMPILER")
    ensure_art_dag_on_path()
    try:
        from streaming.sexp_to_cuda import (  # noqa: F401 - import probe only
            compile_frame_pipeline, compile_autonomous_pipeline
        )
        check_pass("Fused CUDA pipeline compiler available")
    except ImportError as e:
        check_warn(f"Fused pipeline compiler not available: {e}")
        check_info("Using per-operation fallback (slower for multi-effect pipelines)")
    except Exception as e:
        # Keep going even when the module raises something other than
        # ImportError while loading.
        check_fail(f"Fused pipeline compiler failed to load: {e}")
        check_info("Using per-operation fallback (slower for multi-effect pipelines)")


# ============================================================
# 8. Check FFmpeg NVENC
# ============================================================
def check_ffmpeg_nvenc():
    """Look for the NVENC encoders in the output of `ffmpeg -encoders`."""
    print_section("8. FFMPEG NVENC (HARDWARE ENCODE)")
    try:
        result = subprocess.run(["ffmpeg", "-encoders"],
                                capture_output=True, text=True, timeout=5)
        if "h264_nvenc" in result.stdout:
            check_pass("FFmpeg h264_nvenc encoder available")
        else:
            check_warn("FFmpeg h264_nvenc not available - using libx264 (CPU)")
        if "hevc_nvenc" in result.stdout:
            check_pass("FFmpeg hevc_nvenc encoder available")
    except Exception as e:
        check_fail(f"FFmpeg check failed: {e}")


# ============================================================
# 9. Check FFmpeg NVDEC
# ============================================================
def check_ffmpeg_nvdec():
    """Look for the cuda entry in the output of `ffmpeg -hwaccels`."""
    print_section("9. FFMPEG NVDEC (HARDWARE DECODE)")
    try:
        result = subprocess.run(["ffmpeg", "-hwaccels"],
                                capture_output=True, text=True, timeout=5)
        if "cuda" in result.stdout:
            check_pass("FFmpeg CUDA hwaccel available")
        else:
            check_warn("FFmpeg CUDA hwaccel not available - using CPU decode")
    except Exception as e:
        check_fail(f"FFmpeg hwaccel check failed: {e}")


# ============================================================
# 10. Check Pipeline Cache Status
# ============================================================
def check_pipeline_cache():
    """Report the number of entries in the two pipeline caches."""
    print_section("10. PIPELINE CACHE STATUS")
    try:
        ensure_art_dag_on_path()
        from sexp_effects.primitive_libs.streaming_gpu import (
            _FUSED_PIPELINE_CACHE, _AUTONOMOUS_PIPELINE_CACHE
        )
        fused_count = len(_FUSED_PIPELINE_CACHE)
        auto_count = len(_AUTONOMOUS_PIPELINE_CACHE)
        if fused_count > 0 or auto_count > 0:
            check_info(f"Fused pipeline cache: {fused_count} entries")
            check_info(f"Autonomous pipeline cache: {auto_count} entries")
            if fused_count > 100 or auto_count > 100:
                check_warn("Large pipeline cache - may cause memory pressure")
        else:
            check_info("Pipeline caches empty (no rendering done yet)")
    except Exception as e:
        check_info(f"Could not check pipeline cache: {e}")


# ============================================================
# Summary
# ============================================================
def print_summary():
    """Print the static checklist explaining how to read the results."""
    print_section("SUMMARY")
    print("""
Optimal GPU rendering requires:
  1. [CRITICAL] CuPy with working GPU operations
  2. [CRITICAL] DLPack zero-copy transfer (decord -> CuPy)
  3. [HIGH] Fast CUDA kernels from jit_compiler
  4. [MEDIUM] Fused pipeline compiler for multi-effect recipes
  5. [MEDIUM] PyNvVideoCodec for zero-copy encoding
  6. [LOW] FFmpeg NVENC/NVDEC as fallback

If DLPack is failing, check:
  - decord version (needs 0.6.0+ with DLPack support)
  - CuPy version compatibility
  - CUDA toolkit version match

If fast kernels are not loading:
  - Check if streaming/jit_compiler.py exists
  - Verify CUDA compiler (nvcc) is available
""")


def main():
    """Run every diagnostic section in order, then print the summary."""
    check_gpu_availability()
    check_cupy()
    check_pynvvideocodec()
    check_decord_gpu()
    check_dlpack()
    check_fast_kernels()
    check_fused_pipeline_compiler()
    check_ffmpeg_nvenc()
    check_ffmpeg_nvdec()
    check_pipeline_cache()
    print_summary()


if __name__ == "__main__":
    main()