Build decord from source with CUDA for GPU video decode
Some checks are pending
GPU Worker CI/CD / test (push) Waiting to run
GPU Worker CI/CD / deploy (push) Blocked by required conditions

- Build decord with -DUSE_CUDA=ON for true NVDEC hardware decode
- Use DLPack for zero-copy transfer from decord to CuPy
- Frames stay on GPU throughout: decode -> process -> encode

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
giles
2026-02-04 01:50:14 +00:00
parent b7e3827fa2
commit 41adf058bd
2 changed files with 82 additions and 115 deletions

View File

@@ -26,9 +26,18 @@ RUN pip install --no-cache-dir -r requirements.txt
# Install GPU-specific dependencies (CuPy for CUDA 12.x) # Install GPU-specific dependencies (CuPy for CUDA 12.x)
RUN pip install --no-cache-dir cupy-cuda12x RUN pip install --no-cache-dir cupy-cuda12x
# Install PyNvCodec for true GPU-native video decoding (NVDEC) # Build decord from source with CUDA support for GPU-native video decoding
# Frames decode directly to GPU memory - zero CPU transfer RUN apt-get update && apt-get install -y --no-install-recommends \
RUN pip install --no-cache-dir PyNvCodec cmake build-essential \
&& rm -rf /var/lib/apt/lists/*
RUN git clone --recursive https://github.com/dmlc/decord /tmp/decord && \
cd /tmp/decord && \
mkdir build && cd build && \
cmake .. -DUSE_CUDA=ON -DCMAKE_BUILD_TYPE=Release && \
make -j$(nproc) && \
cd ../python && pip install . && \
rm -rf /tmp/decord
# Copy application # Copy application
COPY . . COPY . .

View File

@@ -34,7 +34,7 @@ except ImportError:
# Check for hardware decode support # Check for hardware decode support
_HWDEC_AVAILABLE: Optional[bool] = None _HWDEC_AVAILABLE: Optional[bool] = None
_PYNVCODEC_AVAILABLE: Optional[bool] = None _DECORD_GPU_AVAILABLE: Optional[bool] = None
def check_hwdec_available() -> bool: def check_hwdec_available() -> bool:
@@ -64,21 +64,24 @@ def check_hwdec_available() -> bool:
return _HWDEC_AVAILABLE return _HWDEC_AVAILABLE
def check_pynvcodec_available() -> bool: def check_decord_gpu_available() -> bool:
"""Check if PyNvCodec GPU decode is available.""" """Check if decord with CUDA GPU decode is available."""
global _PYNVCODEC_AVAILABLE global _DECORD_GPU_AVAILABLE
if _PYNVCODEC_AVAILABLE is not None: if _DECORD_GPU_AVAILABLE is not None:
return _PYNVCODEC_AVAILABLE return _DECORD_GPU_AVAILABLE
try: try:
import PyNvCodec as nvc import decord
_PYNVCODEC_AVAILABLE = True from decord import gpu
print("[streaming_gpu] PyNvCodec GPU decode available", file=sys.stderr) # Try to create a GPU context to verify CUDA support
ctx = gpu(0)
_DECORD_GPU_AVAILABLE = True
print("[streaming_gpu] decord GPU (CUDA) decode available", file=sys.stderr)
except Exception as e: except Exception as e:
_PYNVCODEC_AVAILABLE = False _DECORD_GPU_AVAILABLE = False
print(f"[streaming_gpu] PyNvCodec not available: {e}", file=sys.stderr) print(f"[streaming_gpu] decord GPU not available: {e}", file=sys.stderr)
return _PYNVCODEC_AVAILABLE return _DECORD_GPU_AVAILABLE
class GPUFrame: class GPUFrame:
@@ -168,17 +171,17 @@ class GPUVideoSource:
""" """
GPU-accelerated video source using hardware decode. GPU-accelerated video source using hardware decode.
Uses PyNvCodec for true zero-copy NVDEC decode - frames go directly Uses decord with CUDA GPU context for true NVDEC decode - frames
to GPU memory without any CPU transfer. decode directly to GPU memory via CUDA.
Falls back to FFmpeg pipe if PyNvCodec unavailable (slower due to CPU copy). Falls back to FFmpeg pipe if decord GPU unavailable (slower due to CPU copy).
""" """
def __init__(self, path: str, fps: float = 30, prefer_gpu: bool = True): def __init__(self, path: str, fps: float = 30, prefer_gpu: bool = True):
self.path = Path(path) self.path = Path(path)
self.fps = fps self.fps = fps
self.prefer_gpu = prefer_gpu and GPU_AVAILABLE self.prefer_gpu = prefer_gpu and GPU_AVAILABLE
self._use_pynvcodec = self.prefer_gpu and check_pynvcodec_available() self._use_decord_gpu = self.prefer_gpu and check_decord_gpu_available()
self._frame_size: Optional[Tuple[int, int]] = None self._frame_size: Optional[Tuple[int, int]] = None
self._duration: Optional[float] = None self._duration: Optional[float] = None
@@ -188,11 +191,9 @@ class GPUVideoSource:
self._last_read_time = -1 self._last_read_time = -1
self._cached_frame: Optional[GPUFrame] = None self._cached_frame: Optional[GPUFrame] = None
# PyNvCodec decoder components # Decord VideoReader with GPU context
self._nvdec = None self._vr = None
self._nv_cvt = None # NV12 to RGB converter self._decord_ctx = None
self._nv_dwn = None # GPU to CPU downloader (for fallback)
self._gpu_id = 0
# FFmpeg fallback state # FFmpeg fallback state
self._proc = None self._proc = None
@@ -201,44 +202,39 @@ class GPUVideoSource:
# Initialize video source # Initialize video source
self._init_video() self._init_video()
mode = "PyNvCodec-GPU" if self._use_pynvcodec else ("ffmpeg-hwaccel" if check_hwdec_available() else "ffmpeg-CPU") mode = "decord-GPU" if self._use_decord_gpu else ("ffmpeg-hwaccel" if check_hwdec_available() else "ffmpeg-CPU")
print(f"[GPUVideoSource] {self.path.name}: {self._frame_size}, " print(f"[GPUVideoSource] {self.path.name}: {self._frame_size}, "
f"duration={self._duration:.1f}s, mode={mode}", file=sys.stderr) f"duration={self._duration:.1f}s, mode={mode}", file=sys.stderr)
def _init_video(self): def _init_video(self):
"""Initialize video reader (PyNvCodec or probe for ffmpeg).""" """Initialize video reader (decord GPU or probe for ffmpeg)."""
# First probe video for metadata (needed for both paths) if self._use_decord_gpu:
self._probe_video()
if self._use_pynvcodec:
try: try:
import PyNvCodec as nvc from decord import VideoReader, gpu
# Create NVDEC decoder - decodes directly to GPU # Use GPU context for NVDEC hardware decode
self._nvdec = nvc.PyNvDecoder( self._decord_ctx = gpu(0)
str(self.path), self._vr = VideoReader(str(self.path), ctx=self._decord_ctx, num_threads=1)
self._gpu_id
)
# Get actual dimensions from decoder self._total_frames = len(self._vr)
self._frame_size = (self._nvdec.Width(), self._nvdec.Height()) self._video_fps = self._vr.get_avg_fps()
self._duration = self._total_frames / self._video_fps
# Create color converter: NV12 (decoder output) -> RGB # Get frame size from first frame
self._nv_cvt = nvc.PySurfaceConverter( first_frame = self._vr[0]
self._nvdec.Width(), self._frame_size = (first_frame.shape[1], first_frame.shape[0])
self._nvdec.Height(),
nvc.PixelFormat.NV12,
nvc.PixelFormat.RGB,
self._gpu_id
)
print(f"[GPUVideoSource] PyNvCodec initialized: {self._frame_size}", file=sys.stderr) print(f"[GPUVideoSource] decord GPU initialized: {self._frame_size}, "
f"{self._total_frames} frames @ {self._video_fps:.1f}fps", file=sys.stderr)
return return
except Exception as e: except Exception as e:
print(f"[GPUVideoSource] PyNvCodec init failed, falling back to ffmpeg: {e}", file=sys.stderr) print(f"[GPUVideoSource] decord GPU init failed, falling back to ffmpeg: {e}", file=sys.stderr)
self._use_pynvcodec = False self._use_decord_gpu = False
self._nvdec = None self._vr = None
self._nv_cvt = None self._decord_ctx = None
# FFmpeg fallback - probe video for metadata
self._probe_video()
def _probe_video(self): def _probe_video(self):
"""Probe video file for metadata (FFmpeg fallback).""" """Probe video file for metadata (FFmpeg fallback)."""
@@ -322,72 +318,34 @@ class GPUVideoSource:
return np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy() return np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy()
def _read_frame_pynvcodec(self, target_time: float) -> Optional[GPUFrame]: def _read_frame_decord_gpu(self, frame_idx: int) -> Optional[GPUFrame]:
"""Read frame using PyNvCodec (true GPU-native, zero CPU copy).""" """Read frame using decord with GPU context (NVDEC, zero-copy to CuPy)."""
if self._nvdec is None: if self._vr is None:
return None return None
try: try:
import PyNvCodec as nvc # Handle looping
frame_idx = frame_idx % max(1, self._total_frames)
# Seek if needed (PyNvCodec uses frame numbers) # Decode frame - with GPU context, this uses NVDEC
target_frame = int(target_time * self._video_fps) frame_tensor = self._vr[frame_idx]
target_frame = target_frame % max(1, self._total_frames) # Loop
# Decode frame - returns surface in GPU memory # Convert to CuPy via DLPack (zero-copy GPU transfer)
raw_surface = self._nvdec.DecodeSingleSurface()
if raw_surface.Empty():
# Try to seek and decode again
seek_ctx = nvc.SeekContext(target_frame)
self._nvdec.DecodeSingleSurface(seek_ctx)
raw_surface = self._nvdec.DecodeSingleSurface()
if raw_surface.Empty():
return None
# Convert NV12 -> RGB on GPU
rgb_surface = self._nv_cvt.Execute(raw_surface)
if rgb_surface.Empty():
return None
# Get as CuPy array - stays on GPU!
if GPU_AVAILABLE: if GPU_AVAILABLE:
# Create CuPy array from GPU surface pointer # decord tensors support DLPack for zero-copy conversion
# PyNvCodec surfaces can be converted to numpy, then to cupy # This keeps the frame on GPU without any CPU transfer
# But for true zero-copy, we use the CUDA pointer directly try:
frame_ptr = rgb_surface.PlanePtr() gpu_frame = cp.from_dlpack(frame_tensor)
pitch = rgb_surface.Pitch() return GPUFrame(gpu_frame, on_gpu=True)
height = rgb_surface.Height() except Exception:
width = rgb_surface.Width() # Fallback: convert via numpy (involves CPU copy)
frame_np = frame_tensor.asnumpy()
# Create cupy array from device pointer return GPUFrame(frame_np, on_gpu=True)
# Note: PyNvCodec stores data in pitched format
mem = cp.cuda.UnownedMemory(frame_ptr.GpuMem(), pitch * height * 3, None)
memptr = cp.cuda.MemoryPointer(mem, 0)
gpu_frame = cp.ndarray((height, width, 3), dtype=cp.uint8, memptr=memptr)
# Make a copy to ensure we own the memory (surface may be reused)
gpu_frame = gpu_frame.copy()
return GPUFrame(gpu_frame, on_gpu=True)
else: else:
# Fallback to CPU return GPUFrame(frame_tensor.asnumpy(), on_gpu=False)
frame_np = np.ndarray(
shape=(rgb_surface.Height(), rgb_surface.Width(), 3),
dtype=np.uint8
)
# Download to CPU (not ideal but works)
if self._nv_dwn is None:
self._nv_dwn = nvc.PySurfaceDownloader(
rgb_surface.Width(),
rgb_surface.Height(),
nvc.PixelFormat.RGB,
self._gpu_id
)
self._nv_dwn.DownloadSingleSurface(rgb_surface, frame_np)
return GPUFrame(frame_np, on_gpu=False)
except Exception as e: except Exception as e:
print(f"[GPUVideoSource] PyNvCodec read error at t={target_time:.2f}: {e}", file=sys.stderr) print(f"[GPUVideoSource] decord GPU read error at frame {frame_idx}: {e}", file=sys.stderr)
return None return None
def read_at(self, t: float) -> Optional[GPUFrame]: def read_at(self, t: float) -> Optional[GPUFrame]:
@@ -409,9 +367,10 @@ class GPUVideoSource:
self._last_read_time = t self._last_read_time = t
# Use PyNvCodec if available (true GPU-native decode, zero CPU copy) # Use decord GPU if available (NVDEC decode, zero-copy via DLPack)
if self._use_pynvcodec: if self._use_decord_gpu:
self._cached_frame = self._read_frame_pynvcodec(seek_time) frame_idx = int(seek_time * self._video_fps)
self._cached_frame = self._read_frame_decord_gpu(frame_idx)
if self._cached_frame is not None: if self._cached_frame is not None:
# Free CPU copy if on GPU (saves memory) # Free CPU copy if on GPU (saves memory)
if self.prefer_gpu and self._cached_frame.is_on_gpu: if self.prefer_gpu and self._cached_frame.is_on_gpu:
@@ -472,10 +431,9 @@ class GPUVideoSource:
if self._proc: if self._proc:
self._proc.kill() self._proc.kill()
self._proc = None self._proc = None
# Release PyNvCodec resources # Release decord resources
self._nvdec = None self._vr = None
self._nv_cvt = None self._decord_ctx = None
self._nv_dwn = None
# GPU-aware primitive functions # GPU-aware primitive functions