250 lines
8.6 KiB
Python
Executable File
250 lines
8.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
GPU Rendering Diagnostic Script
|
|
|
|
Checks for common issues that cause GPU rendering slowdowns in art-dag.
|
|
Run this script to identify potential performance bottlenecks.
|
|
"""
|
|
|
|
import sys
|
|
import subprocess
|
|
import os
|
|
|
|
def print_section(title):
    """Print a banner that introduces a diagnostic section.

    Output is a blank line, a 60-character rule of ``=``, the title
    prefixed by a single space, and a closing rule.

    Args:
        title: Heading text shown between the two rules.
    """
    rule = "=" * 60
    # A single print with a newline separator emits the same lines as
    # three consecutive print calls would.
    print(f"\n{rule}", f" {title}", rule, sep="\n")
|
|
|
|
def check_pass(msg):
    """Report a passing check on stdout.

    Args:
        msg: Text printed after the ``[PASS]`` tag.
    """
    print(" [PASS] {}".format(msg))
|
|
|
|
def check_fail(msg):
    """Report a failed check on stdout.

    Args:
        msg: Text printed after the ``[FAIL]`` tag.
    """
    print(" [FAIL] {}".format(msg))
|
|
|
|
def check_warn(msg):
    """Report a warning-level check result on stdout.

    Args:
        msg: Text printed after the ``[WARN]`` tag.
    """
    print(" [WARN] {}".format(msg))
|
|
|
|
def check_info(msg):
    """Print an informational note on stdout.

    Args:
        msg: Text printed after the ``[INFO]`` tag.
    """
    print(" [INFO] {}".format(msg))
|
|
|
|
# ============================================================
# 1. Check GPU Availability
# ============================================================
print_section("1. GPU AVAILABILITY")

# Ask nvidia-smi for one CSV row per GPU (name, total/free memory,
# utilization).  The command is passed as an argv list and bounded by a
# 5 second timeout so a stuck driver cannot hang the remaining checks.
try:
    result = subprocess.run(
        [
            "nvidia-smi",
            "--query-gpu=name,memory.total,memory.free,utilization.gpu",
            "--format=csv,noheader",
        ],
        capture_output=True,
        text=True,
        timeout=5,
    )
    if result.returncode == 0:
        # Drop blank rows: splitting empty output would otherwise yield a
        # single empty string and be reported as a found GPU.
        gpu_rows = [row.strip() for row in result.stdout.splitlines() if row.strip()]
        if gpu_rows:
            for row in gpu_rows:
                check_pass(f"GPU found: {row}")
        else:
            check_fail("nvidia-smi succeeded but listed no GPUs")
    else:
        check_fail("nvidia-smi failed - no GPU detected")
except FileNotFoundError:
    # The executable itself is missing from PATH.
    check_fail("nvidia-smi not found - NVIDIA drivers not installed")
except Exception as e:
    # Includes subprocess.TimeoutExpired when the 5 s limit is hit.
    check_fail(f"nvidia-smi error: {e}")
|
|
|
|
# ============================================================
# 2. Check CuPy
# ============================================================
print_section("2. CUPY (GPU ARRAY LIBRARY)")

try:
    import cupy as cp
except ImportError:
    check_fail("CuPy not installed - GPU rendering disabled")
else:
    check_pass(f"CuPy available, version {cp.__version__}")

    # Smoke test: allocate a small uint8 array, synchronize the null
    # stream, then report the default memory pool's usage.
    try:
        probe = cp.zeros((100, 100), dtype=cp.uint8)
        cp.cuda.Stream.null.synchronize()
        check_pass("CuPy GPU operations working")

        pool = cp.get_default_memory_pool()
        used_mb = pool.used_bytes() / 1024**2
        total_mb = pool.total_bytes() / 1024**2
        check_info(f"GPU memory pool: {used_mb:.1f} MB used, {total_mb:.1f} MB total")
    except Exception as e:
        check_fail(f"CuPy GPU test failed: {e}")
|
|
|
|
# ============================================================
# 3. Check PyNvVideoCodec (GPU Encoding)
# ============================================================
print_section("3. PYNVVIDEOCODEC (GPU ENCODING)")

# Only importability is checked here; the module is not exercised further.
try:
    import PyNvVideoCodec as nvc
except ImportError:
    check_warn("PyNvVideoCodec not available - using FFmpeg NVENC (slower)")
else:
    check_pass("PyNvVideoCodec available - zero-copy GPU encoding enabled")
|
|
|
|
# ============================================================
# 4. Check Decord GPU (Hardware Decode)
# ============================================================
print_section("4. DECORD GPU (HARDWARE DECODE)")

# Passes when decord imports and gpu(0) returns without raising.
# NOTE(review): presumably gpu(0) only builds a context handle and does not
# decode anything - confirm whether this really proves NVDEC is usable.
try:
    import decord
    from decord import gpu

    ctx = gpu(0)
except ImportError:
    check_warn("Decord not installed - using FFmpeg decode")
except Exception as e:
    check_warn("Decord GPU not available ({}) - using FFmpeg decode".format(e))
else:
    check_pass("Decord GPU (NVDEC) available - hardware video decode enabled")
|
|
|
|
# ============================================================
# 5. Check DLPack Support
# ============================================================
print_section("5. DLPACK (ZERO-COPY TRANSFER)")


def _find_test_video(search_dirs=("/data/cache", "/tmp"),
                     extensions=(".mp4", ".webm", ".mkv")):
    """Return the first video file found in *search_dirs*, or None.

    Directories are tried in the given order and their entries in sorted
    order, so the selected file is the same from run to run.

    Args:
        search_dirs: Directories to scan (not recursive).
        extensions: Tuple of lowercase file suffixes accepted as video.

    Returns:
        Full path of the first matching regular file, or ``None`` when
        nothing matches.
    """
    for directory in search_dirs:
        try:
            names = sorted(os.listdir(directory))
        except OSError:
            # Missing, unreadable, or not a directory: skip it instead of
            # aborting the whole diagnostic run.
            continue
        for name in names:
            candidate = os.path.join(directory, name)
            # Case-insensitive suffix match; ignore directories that
            # merely have a video-like name.
            if name.lower().endswith(extensions) and os.path.isfile(candidate):
                return candidate
    return None


try:
    import decord
    from decord import VideoReader, gpu
    import cupy as cp

    # Need a test video file
    test_video = _find_test_video()

    if test_video:
        try:
            # Decode the first frame with a GPU context, export it as a
            # DLPack capsule and hand that capsule to CuPy.
            vr = VideoReader(test_video, ctx=gpu(0))
            frame = vr[0]
            dlpack = frame.to_dlpack()
            cp.from_dlpack(dlpack)
            check_pass(f"DLPack zero-copy working (tested with {os.path.basename(test_video)})")
        except Exception as e:
            # Any failure in the steps above (including opening the file)
            # is reported as a DLPack failure.
            check_fail(f"DLPack FAILED: {e}")
            check_info("This means every frame does GPU->CPU->GPU copy (SLOW)")
    else:
        check_warn("No test video found - cannot verify DLPack")
except ImportError:
    check_warn("Cannot test DLPack - decord or cupy not available")
|
|
|
|
# ============================================================
# 6. Check Fast CUDA Kernels
# ============================================================
print_section("6. FAST CUDA KERNELS (JIT COMPILED)")

try:
    # Put the project directory at the front of sys.path, but do not stack
    # a duplicate entry if it is already first.
    celery_root = '/root/art-dag/celery'
    if sys.path[:1] != [celery_root]:
        sys.path.insert(0, celery_root)

    # Importing every name doubles as a check that each one exists in
    # the module; a missing name raises ImportError.
    from streaming.jit_compiler import (
        fast_rotate, fast_zoom, fast_blend, fast_hue_shift,
        fast_invert, fast_ripple, get_fast_ops
    )
    check_pass("Fast CUDA kernels loaded successfully")

    # Test one kernel
    try:
        import cupy as cp
        test_img = cp.zeros((720, 1280, 3), dtype=cp.uint8)
        fast_rotate(test_img, 45.0)
        # Wait on the null stream so an asynchronous failure surfaces
        # inside this try block.
        cp.cuda.Stream.null.synchronize()
        check_pass("Fast rotate kernel working")
    except Exception as e:
        check_fail(f"Fast kernel execution failed: {e}")
except ImportError as e:
    check_warn(f"Fast CUDA kernels not available: {e}")
    check_info("Fallback to slower CuPy operations")
except Exception as e:
    # The module exists but raised something other than ImportError while
    # being imported; report it rather than aborting the remaining checks.
    check_fail(f"Fast CUDA kernels failed to load: {e}")
|
|
|
|
# ============================================================
# 7. Check Fused Pipeline Compiler
# ============================================================
print_section("7. FUSED PIPELINE COMPILER")

try:
    # Put the project directory at the front of sys.path, but do not stack
    # a duplicate entry if it is already first.
    celery_root = '/root/art-dag/celery'
    if sys.path[:1] != [celery_root]:
        sys.path.insert(0, celery_root)

    # Only importability of the two compiler entry points is verified.
    from streaming.sexp_to_cuda import compile_frame_pipeline, compile_autonomous_pipeline
    check_pass("Fused CUDA pipeline compiler available")
except ImportError as e:
    check_warn(f"Fused pipeline compiler not available: {e}")
    check_info("Using per-operation fallback (slower for multi-effect pipelines)")
except Exception as e:
    # The module exists but raised something other than ImportError while
    # being imported; report it rather than aborting the remaining checks.
    check_fail(f"Fused pipeline compiler failed to load: {e}")
|
|
|
|
# ============================================================
# 8. Check FFmpeg NVENC
# ============================================================
print_section("8. FFMPEG NVENC (HARDWARE ENCODE)")

# Look for the NVENC encoder names in the stdout of `ffmpeg -encoders`.
try:
    encoders_cmd = ["ffmpeg", "-encoders"]
    result = subprocess.run(encoders_cmd, capture_output=True, text=True, timeout=5)
    encoder_listing = result.stdout

    if "h264_nvenc" not in encoder_listing:
        check_warn("FFmpeg h264_nvenc not available - using libx264 (CPU)")
    else:
        check_pass("FFmpeg h264_nvenc encoder available")

    # HEVC is reported only when present; its absence is not a warning.
    if "hevc_nvenc" in encoder_listing:
        check_pass("FFmpeg hevc_nvenc encoder available")
except Exception as e:
    check_fail(f"FFmpeg check failed: {e}")
|
|
|
|
# ============================================================
# 9. Check FFmpeg NVDEC
# ============================================================
print_section("9. FFMPEG NVDEC (HARDWARE DECODE)")

# Look for the substring "cuda" in the stdout of `ffmpeg -hwaccels`.
try:
    hwaccels_cmd = ["ffmpeg", "-hwaccels"]
    result = subprocess.run(hwaccels_cmd, capture_output=True, text=True, timeout=5)
    hwaccel_listing = result.stdout

    if "cuda" not in hwaccel_listing:
        check_warn("FFmpeg CUDA hwaccel not available - using CPU decode")
    else:
        check_pass("FFmpeg CUDA hwaccel available")
except Exception as e:
    check_fail(f"FFmpeg hwaccel check failed: {e}")
|
|
|
|
# ============================================================
# 10. Check Pipeline Cache Status
# ============================================================
print_section("10. PIPELINE CACHE STATUS")

# Report the entry counts of the two module-level pipeline caches.  Any
# failure (including a failed import) is downgraded to an INFO line.
try:
    sys.path.insert(0, '/root/art-dag/celery')
    from sexp_effects.primitive_libs.streaming_gpu import (
        _FUSED_PIPELINE_CACHE, _AUTONOMOUS_PIPELINE_CACHE
    )
    fused_count = len(_FUSED_PIPELINE_CACHE)
    auto_count = len(_AUTONOMOUS_PIPELINE_CACHE)

    if not (fused_count or auto_count):
        check_info("Pipeline caches empty (no rendering done yet)")
    else:
        check_info(f"Fused pipeline cache: {fused_count} entries")
        check_info(f"Autonomous pipeline cache: {auto_count} entries")
        # Warn once if either cache holds more than 100 entries.
        if max(fused_count, auto_count) > 100:
            check_warn("Large pipeline cache - may cause memory pressure")
except Exception as e:
    check_info(f"Could not check pipeline cache: {e}")
|
|
|
|
# ============================================================
# Summary
# ============================================================
print_section("SUMMARY")

# Static guidance text.  The leading and trailing empty strings reproduce
# the newline right after the opening and right before the closing of the
# original triple-quoted literal.
summary_lines = (
    "",
    "Optimal GPU rendering requires:",
    "1. [CRITICAL] CuPy with working GPU operations",
    "2. [CRITICAL] DLPack zero-copy transfer (decord -> CuPy)",
    "3. [HIGH] Fast CUDA kernels from jit_compiler",
    "4. [MEDIUM] Fused pipeline compiler for multi-effect recipes",
    "5. [MEDIUM] PyNvVideoCodec for zero-copy encoding",
    "6. [LOW] FFmpeg NVENC/NVDEC as fallback",
    "",
    "If DLPack is failing, check:",
    "- decord version (needs 0.6.0+ with DLPack support)",
    "- CuPy version compatibility",
    "- CUDA toolkit version match",
    "",
    "If fast kernels are not loading:",
    "- Check if streaming/jit_compiler.py exists",
    "- Verify CUDA compiler (nvcc) is available",
    "",
)
print("\n".join(summary_lines))
|