Build decord from source with CUDA for GPU video decode
Some checks are pending
GPU Worker CI/CD / test (push) Waiting to run
GPU Worker CI/CD / deploy (push) Blocked by required conditions

- Build decord with -DUSE_CUDA=ON for true NVDEC hardware decode
- Use DLPack for zero-copy transfer from decord to CuPy
- Frames stay on GPU throughout: decode -> process -> encode

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
giles
2026-02-04 01:50:14 +00:00
parent b7e3827fa2
commit 41adf058bd
2 changed files with 82 additions and 115 deletions

View File

@@ -26,9 +26,18 @@ RUN pip install --no-cache-dir -r requirements.txt
# Install GPU-specific dependencies (CuPy for CUDA 12.x)
RUN pip install --no-cache-dir cupy-cuda12x
# Install PyNvCodec for true GPU-native video decoding (NVDEC)
# Frames decode directly to GPU memory - zero CPU transfer
RUN pip install --no-cache-dir PyNvCodec
# Build decord from source with CUDA support for GPU-native video decoding
RUN apt-get update && apt-get install -y --no-install-recommends \
cmake build-essential \
&& rm -rf /var/lib/apt/lists/*
RUN git clone --recursive https://github.com/dmlc/decord /tmp/decord && \
cd /tmp/decord && \
mkdir build && cd build && \
cmake .. -DUSE_CUDA=ON -DCMAKE_BUILD_TYPE=Release && \
make -j$(nproc) && \
cd ../python && pip install . && \
rm -rf /tmp/decord
# Copy application
COPY . .

View File

@@ -34,7 +34,7 @@ except ImportError:
# Check for hardware decode support
_HWDEC_AVAILABLE: Optional[bool] = None
_PYNVCODEC_AVAILABLE: Optional[bool] = None
_DECORD_GPU_AVAILABLE: Optional[bool] = None
def check_hwdec_available() -> bool:
@@ -64,21 +64,24 @@ def check_hwdec_available() -> bool:
return _HWDEC_AVAILABLE
def check_pynvcodec_available() -> bool:
"""Check if PyNvCodec GPU decode is available."""
global _PYNVCODEC_AVAILABLE
if _PYNVCODEC_AVAILABLE is not None:
return _PYNVCODEC_AVAILABLE
def check_decord_gpu_available() -> bool:
"""Check if decord with CUDA GPU decode is available."""
global _DECORD_GPU_AVAILABLE
if _DECORD_GPU_AVAILABLE is not None:
return _DECORD_GPU_AVAILABLE
try:
import PyNvCodec as nvc
_PYNVCODEC_AVAILABLE = True
print("[streaming_gpu] PyNvCodec GPU decode available", file=sys.stderr)
import decord
from decord import gpu
# Try to create a GPU context to verify CUDA support
ctx = gpu(0)
_DECORD_GPU_AVAILABLE = True
print("[streaming_gpu] decord GPU (CUDA) decode available", file=sys.stderr)
except Exception as e:
_PYNVCODEC_AVAILABLE = False
print(f"[streaming_gpu] PyNvCodec not available: {e}", file=sys.stderr)
_DECORD_GPU_AVAILABLE = False
print(f"[streaming_gpu] decord GPU not available: {e}", file=sys.stderr)
return _PYNVCODEC_AVAILABLE
return _DECORD_GPU_AVAILABLE
class GPUFrame:
@@ -168,17 +171,17 @@ class GPUVideoSource:
"""
GPU-accelerated video source using hardware decode.
Uses PyNvCodec for true zero-copy NVDEC decode - frames go directly
to GPU memory without any CPU transfer.
Uses decord with CUDA GPU context for true NVDEC decode - frames
decode directly to GPU memory via CUDA.
Falls back to FFmpeg pipe if PyNvCodec unavailable (slower due to CPU copy).
Falls back to FFmpeg pipe if decord GPU unavailable (slower due to CPU copy).
"""
def __init__(self, path: str, fps: float = 30, prefer_gpu: bool = True):
self.path = Path(path)
self.fps = fps
self.prefer_gpu = prefer_gpu and GPU_AVAILABLE
self._use_pynvcodec = self.prefer_gpu and check_pynvcodec_available()
self._use_decord_gpu = self.prefer_gpu and check_decord_gpu_available()
self._frame_size: Optional[Tuple[int, int]] = None
self._duration: Optional[float] = None
@@ -188,11 +191,9 @@ class GPUVideoSource:
self._last_read_time = -1
self._cached_frame: Optional[GPUFrame] = None
# PyNvCodec decoder components
self._nvdec = None
self._nv_cvt = None # NV12 to RGB converter
self._nv_dwn = None # GPU to CPU downloader (for fallback)
self._gpu_id = 0
# Decord VideoReader with GPU context
self._vr = None
self._decord_ctx = None
# FFmpeg fallback state
self._proc = None
@@ -201,44 +202,39 @@ class GPUVideoSource:
# Initialize video source
self._init_video()
mode = "PyNvCodec-GPU" if self._use_pynvcodec else ("ffmpeg-hwaccel" if check_hwdec_available() else "ffmpeg-CPU")
mode = "decord-GPU" if self._use_decord_gpu else ("ffmpeg-hwaccel" if check_hwdec_available() else "ffmpeg-CPU")
print(f"[GPUVideoSource] {self.path.name}: {self._frame_size}, "
f"duration={self._duration:.1f}s, mode={mode}", file=sys.stderr)
def _init_video(self):
"""Initialize video reader (PyNvCodec or probe for ffmpeg)."""
# First probe video for metadata (needed for both paths)
self._probe_video()
if self._use_pynvcodec:
"""Initialize video reader (decord GPU or probe for ffmpeg)."""
if self._use_decord_gpu:
try:
import PyNvCodec as nvc
from decord import VideoReader, gpu
# Create NVDEC decoder - decodes directly to GPU
self._nvdec = nvc.PyNvDecoder(
str(self.path),
self._gpu_id
)
# Use GPU context for NVDEC hardware decode
self._decord_ctx = gpu(0)
self._vr = VideoReader(str(self.path), ctx=self._decord_ctx, num_threads=1)
# Get actual dimensions from decoder
self._frame_size = (self._nvdec.Width(), self._nvdec.Height())
self._total_frames = len(self._vr)
self._video_fps = self._vr.get_avg_fps()
self._duration = self._total_frames / self._video_fps
# Create color converter: NV12 (decoder output) -> RGB
self._nv_cvt = nvc.PySurfaceConverter(
self._nvdec.Width(),
self._nvdec.Height(),
nvc.PixelFormat.NV12,
nvc.PixelFormat.RGB,
self._gpu_id
)
# Get frame size from first frame
first_frame = self._vr[0]
self._frame_size = (first_frame.shape[1], first_frame.shape[0])
print(f"[GPUVideoSource] PyNvCodec initialized: {self._frame_size}", file=sys.stderr)
print(f"[GPUVideoSource] decord GPU initialized: {self._frame_size}, "
f"{self._total_frames} frames @ {self._video_fps:.1f}fps", file=sys.stderr)
return
except Exception as e:
print(f"[GPUVideoSource] PyNvCodec init failed, falling back to ffmpeg: {e}", file=sys.stderr)
self._use_pynvcodec = False
self._nvdec = None
self._nv_cvt = None
print(f"[GPUVideoSource] decord GPU init failed, falling back to ffmpeg: {e}", file=sys.stderr)
self._use_decord_gpu = False
self._vr = None
self._decord_ctx = None
# FFmpeg fallback - probe video for metadata
self._probe_video()
def _probe_video(self):
"""Probe video file for metadata (FFmpeg fallback)."""
@@ -322,72 +318,34 @@ class GPUVideoSource:
return np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy()
def _read_frame_pynvcodec(self, target_time: float) -> Optional[GPUFrame]:
"""Read frame using PyNvCodec (true GPU-native, zero CPU copy)."""
if self._nvdec is None:
def _read_frame_decord_gpu(self, frame_idx: int) -> Optional[GPUFrame]:
"""Read frame using decord with GPU context (NVDEC, zero-copy to CuPy)."""
if self._vr is None:
return None
try:
import PyNvCodec as nvc
# Handle looping
frame_idx = frame_idx % max(1, self._total_frames)
# Seek if needed (PyNvCodec uses frame numbers)
target_frame = int(target_time * self._video_fps)
target_frame = target_frame % max(1, self._total_frames) # Loop
# Decode frame - with GPU context, this uses NVDEC
frame_tensor = self._vr[frame_idx]
# Decode frame - returns surface in GPU memory
raw_surface = self._nvdec.DecodeSingleSurface()
if raw_surface.Empty():
# Try to seek and decode again
seek_ctx = nvc.SeekContext(target_frame)
self._nvdec.DecodeSingleSurface(seek_ctx)
raw_surface = self._nvdec.DecodeSingleSurface()
if raw_surface.Empty():
return None
# Convert NV12 -> RGB on GPU
rgb_surface = self._nv_cvt.Execute(raw_surface)
if rgb_surface.Empty():
return None
# Get as CuPy array - stays on GPU!
# Convert to CuPy via DLPack (zero-copy GPU transfer)
if GPU_AVAILABLE:
# Create CuPy array from GPU surface pointer
# PyNvCodec surfaces can be converted to numpy, then to cupy
# But for true zero-copy, we use the CUDA pointer directly
frame_ptr = rgb_surface.PlanePtr()
pitch = rgb_surface.Pitch()
height = rgb_surface.Height()
width = rgb_surface.Width()
# Create cupy array from device pointer
# Note: PyNvCodec stores data in pitched format
mem = cp.cuda.UnownedMemory(frame_ptr.GpuMem(), pitch * height * 3, None)
memptr = cp.cuda.MemoryPointer(mem, 0)
gpu_frame = cp.ndarray((height, width, 3), dtype=cp.uint8, memptr=memptr)
# Make a copy to ensure we own the memory (surface may be reused)
gpu_frame = gpu_frame.copy()
return GPUFrame(gpu_frame, on_gpu=True)
# decord tensors support DLPack for zero-copy conversion
# This keeps the frame on GPU without any CPU transfer
try:
gpu_frame = cp.from_dlpack(frame_tensor)
return GPUFrame(gpu_frame, on_gpu=True)
except Exception:
# Fallback: convert via numpy (involves CPU copy)
frame_np = frame_tensor.asnumpy()
return GPUFrame(frame_np, on_gpu=True)
else:
# Fallback to CPU
frame_np = np.ndarray(
shape=(rgb_surface.Height(), rgb_surface.Width(), 3),
dtype=np.uint8
)
# Download to CPU (not ideal but works)
if self._nv_dwn is None:
self._nv_dwn = nvc.PySurfaceDownloader(
rgb_surface.Width(),
rgb_surface.Height(),
nvc.PixelFormat.RGB,
self._gpu_id
)
self._nv_dwn.DownloadSingleSurface(rgb_surface, frame_np)
return GPUFrame(frame_np, on_gpu=False)
return GPUFrame(frame_tensor.asnumpy(), on_gpu=False)
except Exception as e:
print(f"[GPUVideoSource] PyNvCodec read error at t={target_time:.2f}: {e}", file=sys.stderr)
print(f"[GPUVideoSource] decord GPU read error at frame {frame_idx}: {e}", file=sys.stderr)
return None
def read_at(self, t: float) -> Optional[GPUFrame]:
@@ -409,9 +367,10 @@ class GPUVideoSource:
self._last_read_time = t
# Use PyNvCodec if available (true GPU-native decode, zero CPU copy)
if self._use_pynvcodec:
self._cached_frame = self._read_frame_pynvcodec(seek_time)
# Use decord GPU if available (NVDEC decode, zero-copy via DLPack)
if self._use_decord_gpu:
frame_idx = int(seek_time * self._video_fps)
self._cached_frame = self._read_frame_decord_gpu(frame_idx)
if self._cached_frame is not None:
# Free CPU copy if on GPU (saves memory)
if self.prefer_gpu and self._cached_frame.is_on_gpu:
@@ -472,10 +431,9 @@ class GPUVideoSource:
if self._proc:
self._proc.kill()
self._proc = None
# Release PyNvCodec resources
self._nvdec = None
self._nv_cvt = None
self._nv_dwn = None
# Release decord resources
self._vr = None
self._decord_ctx = None
# GPU-aware primitive functions