Files
mono/artdag/l1/diagnose_gpu.py
giles 1a74d811f7 Incorporate art-dag-mono repo into artdag/ subfolder
Merges full history from art-dag/mono.git into the monorepo
under the artdag/ directory. Contains: core (DAG engine),
l1 (Celery rendering server), l2 (ActivityPub registry),
common (shared templates/middleware), client (CLI), test (e2e).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

git-subtree-dir: artdag
git-subtree-mainline: 1a179de547
git-subtree-split: 4c2e716558
2026-02-27 09:07:23 +00:00

250 lines
8.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""
GPU Rendering Diagnostic Script
Checks for common issues that cause GPU rendering slowdowns in art-dag.
Run this script to identify potential performance bottlenecks.
"""
import sys
import subprocess
import os
def print_section(title):
print(f"\n{'='*60}")
print(f" {title}")
print(f"{'='*60}")
def check_pass(msg):
print(f" [PASS] {msg}")
def check_fail(msg):
print(f" [FAIL] {msg}")
def check_warn(msg):
print(f" [WARN] {msg}")
def check_info(msg):
print(f" [INFO] {msg}")
# ============================================================
# 1. Check GPU Availability
# ============================================================
print_section("1. GPU AVAILABILITY")
# Check nvidia-smi
try:
result = subprocess.run(["nvidia-smi", "--query-gpu=name,memory.total,memory.free,utilization.gpu",
"--format=csv,noheader"], capture_output=True, text=True, timeout=5)
if result.returncode == 0:
for line in result.stdout.strip().split('\n'):
check_pass(f"GPU found: {line}")
else:
check_fail("nvidia-smi failed - no GPU detected")
except FileNotFoundError:
check_fail("nvidia-smi not found - NVIDIA drivers not installed")
except Exception as e:
check_fail(f"nvidia-smi error: {e}")
# ============================================================
# 2. Check CuPy
# ============================================================
print_section("2. CUPY (GPU ARRAY LIBRARY)")
try:
import cupy as cp
check_pass(f"CuPy available, version {cp.__version__}")
# Test basic GPU operation
try:
a = cp.zeros((100, 100), dtype=cp.uint8)
cp.cuda.Stream.null.synchronize()
check_pass("CuPy GPU operations working")
# Check memory
mempool = cp.get_default_memory_pool()
check_info(f"GPU memory pool: {mempool.used_bytes() / 1024**2:.1f} MB used, "
f"{mempool.total_bytes() / 1024**2:.1f} MB total")
except Exception as e:
check_fail(f"CuPy GPU test failed: {e}")
except ImportError:
check_fail("CuPy not installed - GPU rendering disabled")
# ============================================================
# 3. Check PyNvVideoCodec (GPU Encoding)
# ============================================================
print_section("3. PYNVVIDEOCODEC (GPU ENCODING)")
try:
import PyNvVideoCodec as nvc
check_pass("PyNvVideoCodec available - zero-copy GPU encoding enabled")
except ImportError:
check_warn("PyNvVideoCodec not available - using FFmpeg NVENC (slower)")
# ============================================================
# 4. Check Decord GPU (Hardware Decode)
# ============================================================
print_section("4. DECORD GPU (HARDWARE DECODE)")
try:
import decord
from decord import gpu
ctx = gpu(0)
check_pass(f"Decord GPU (NVDEC) available - hardware video decode enabled")
except ImportError:
check_warn("Decord not installed - using FFmpeg decode")
except Exception as e:
check_warn(f"Decord GPU not available ({e}) - using FFmpeg decode")
# ============================================================
# 5. Check DLPack Support
# ============================================================
print_section("5. DLPACK (ZERO-COPY TRANSFER)")
try:
import decord
from decord import VideoReader, gpu
import cupy as cp
# Need a test video file
test_video = None
for path in ["/data/cache", "/tmp"]:
if os.path.exists(path):
for f in os.listdir(path):
if f.endswith(('.mp4', '.webm', '.mkv')):
test_video = os.path.join(path, f)
break
if test_video:
break
if test_video:
try:
vr = VideoReader(test_video, ctx=gpu(0))
frame = vr[0]
dlpack = frame.to_dlpack()
gpu_frame = cp.from_dlpack(dlpack)
check_pass(f"DLPack zero-copy working (tested with {os.path.basename(test_video)})")
except Exception as e:
check_fail(f"DLPack FAILED: {e}")
check_info("This means every frame does GPU->CPU->GPU copy (SLOW)")
else:
check_warn("No test video found - cannot verify DLPack")
except ImportError:
check_warn("Cannot test DLPack - decord or cupy not available")
# ============================================================
# 6. Check Fast CUDA Kernels
# ============================================================
print_section("6. FAST CUDA KERNELS (JIT COMPILED)")
try:
sys.path.insert(0, '/root/art-dag/celery')
from streaming.jit_compiler import (
fast_rotate, fast_zoom, fast_blend, fast_hue_shift,
fast_invert, fast_ripple, get_fast_ops
)
check_pass("Fast CUDA kernels loaded successfully")
# Test one kernel
try:
import cupy as cp
test_img = cp.zeros((720, 1280, 3), dtype=cp.uint8)
result = fast_rotate(test_img, 45.0)
cp.cuda.Stream.null.synchronize()
check_pass("Fast rotate kernel working")
except Exception as e:
check_fail(f"Fast kernel execution failed: {e}")
except ImportError as e:
check_warn(f"Fast CUDA kernels not available: {e}")
check_info("Fallback to slower CuPy operations")
# ============================================================
# 7. Check Fused Pipeline Compiler
# ============================================================
print_section("7. FUSED PIPELINE COMPILER")
try:
sys.path.insert(0, '/root/art-dag/celery')
from streaming.sexp_to_cuda import compile_frame_pipeline, compile_autonomous_pipeline
check_pass("Fused CUDA pipeline compiler available")
except ImportError as e:
check_warn(f"Fused pipeline compiler not available: {e}")
check_info("Using per-operation fallback (slower for multi-effect pipelines)")
# ============================================================
# 8. Check FFmpeg NVENC
# ============================================================
print_section("8. FFMPEG NVENC (HARDWARE ENCODE)")
try:
result = subprocess.run(["ffmpeg", "-encoders"], capture_output=True, text=True, timeout=5)
if "h264_nvenc" in result.stdout:
check_pass("FFmpeg h264_nvenc encoder available")
else:
check_warn("FFmpeg h264_nvenc not available - using libx264 (CPU)")
if "hevc_nvenc" in result.stdout:
check_pass("FFmpeg hevc_nvenc encoder available")
except Exception as e:
check_fail(f"FFmpeg check failed: {e}")
# ============================================================
# 9. Check FFmpeg NVDEC
# ============================================================
print_section("9. FFMPEG NVDEC (HARDWARE DECODE)")
try:
result = subprocess.run(["ffmpeg", "-hwaccels"], capture_output=True, text=True, timeout=5)
if "cuda" in result.stdout:
check_pass("FFmpeg CUDA hwaccel available")
else:
check_warn("FFmpeg CUDA hwaccel not available - using CPU decode")
except Exception as e:
check_fail(f"FFmpeg hwaccel check failed: {e}")
# ============================================================
# 10. Check Pipeline Cache Status
# ============================================================
print_section("10. PIPELINE CACHE STATUS")
try:
sys.path.insert(0, '/root/art-dag/celery')
from sexp_effects.primitive_libs.streaming_gpu import (
_FUSED_PIPELINE_CACHE, _AUTONOMOUS_PIPELINE_CACHE
)
fused_count = len(_FUSED_PIPELINE_CACHE)
auto_count = len(_AUTONOMOUS_PIPELINE_CACHE)
if fused_count > 0 or auto_count > 0:
check_info(f"Fused pipeline cache: {fused_count} entries")
check_info(f"Autonomous pipeline cache: {auto_count} entries")
if fused_count > 100 or auto_count > 100:
check_warn("Large pipeline cache - may cause memory pressure")
else:
check_info("Pipeline caches empty (no rendering done yet)")
except Exception as e:
check_info(f"Could not check pipeline cache: {e}")
# ============================================================
# Summary
# ============================================================
print_section("SUMMARY")
print("""
Optimal GPU rendering requires:
1. [CRITICAL] CuPy with working GPU operations
2. [CRITICAL] DLPack zero-copy transfer (decord -> CuPy)
3. [HIGH] Fast CUDA kernels from jit_compiler
4. [MEDIUM] Fused pipeline compiler for multi-effect recipes
5. [MEDIUM] PyNvVideoCodec for zero-copy encoding
6. [LOW] FFmpeg NVENC/NVDEC as fallback
If DLPack is failing, check:
- decord version (needs 0.6.0+ with DLPack support)
- CuPy version compatibility
- CUDA toolkit version match
If fast kernels are not loading:
- Check if streaming/jit_compiler.py exists
- Verify CUDA compiler (nvcc) is available
""")