Use PyNvCodec for true zero-copy GPU video decode
Some checks are pending
GPU Worker CI/CD / test (push) Waiting to run
GPU Worker CI/CD / deploy (push) Blocked by required conditions

Replace decord (CPU-only pip package) with PyNvCodec which provides
direct NVDEC access. Frames decode straight to GPU memory without
any CPU transfer, eliminating the memory bandwidth bottleneck.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
giles
2026-02-04 01:47:03 +00:00
parent 771fb8cebc
commit b7e3827fa2
2 changed files with 120 additions and 66 deletions

View File

@@ -26,9 +26,9 @@ RUN pip install --no-cache-dir -r requirements.txt
# Install GPU-specific dependencies (CuPy for CUDA 12.x)
RUN pip install --no-cache-dir cupy-cuda12x

# Install PyNvCodec for true GPU-native video decoding (NVDEC)
# Frames decode directly to GPU memory - zero CPU transfer
RUN pip install --no-cache-dir PyNvCodec

# Copy application
COPY . .

View File

@@ -34,7 +34,7 @@ except ImportError:
# Check for hardware decode support
_HWDEC_AVAILABLE: Optional[bool] = None
_DECORD_AVAILABLE: Optional[bool] = None
_PYNVCODEC_AVAILABLE: Optional[bool] = None
def check_hwdec_available() -> bool:
@@ -64,23 +64,21 @@ def check_hwdec_available() -> bool:
return _HWDEC_AVAILABLE
def check_pynvcodec_available() -> bool:
    """Check if PyNvCodec GPU decode is available.

    Returns True when the PyNvCodec module imports cleanly, False
    otherwise. The result is memoized in the module-level
    ``_PYNVCODEC_AVAILABLE`` flag so the import is attempted only once.
    """
    global _PYNVCODEC_AVAILABLE
    # Memoized: only probe the import the first time we are called.
    if _PYNVCODEC_AVAILABLE is not None:
        return _PYNVCODEC_AVAILABLE
    try:
        import PyNvCodec as nvc  # noqa: F401 -- import success is the whole check

        _PYNVCODEC_AVAILABLE = True
        print("[streaming_gpu] PyNvCodec GPU decode available", file=sys.stderr)
    except Exception as e:
        # Broad catch on purpose: a broken CUDA install can raise more
        # than ImportError; any failure means "not available".
        _PYNVCODEC_AVAILABLE = False
        print(f"[streaming_gpu] PyNvCodec not available: {e}", file=sys.stderr)
    return _PYNVCODEC_AVAILABLE
class GPUFrame:
@@ -170,17 +168,17 @@ class GPUVideoSource:
"""
GPU-accelerated video source using hardware decode.
Uses decord with GPU context for true zero-copy NVDEC decode,
keeping decoded frames in GPU memory throughout.
Uses PyNvCodec for true zero-copy NVDEC decode - frames go directly
to GPU memory without any CPU transfer.
Falls back to FFmpeg pipe if decord unavailable (slower due to CPU copy).
Falls back to FFmpeg pipe if PyNvCodec unavailable (slower due to CPU copy).
"""
def __init__(self, path: str, fps: float = 30, prefer_gpu: bool = True):
self.path = Path(path)
self.fps = fps
self.prefer_gpu = prefer_gpu and GPU_AVAILABLE
self._use_decord = self.prefer_gpu and check_decord_available()
self._use_pynvcodec = self.prefer_gpu and check_pynvcodec_available()
self._frame_size: Optional[Tuple[int, int]] = None
self._duration: Optional[float] = None
@@ -190,8 +188,11 @@ class GPUVideoSource:
self._last_read_time = -1
self._cached_frame: Optional[GPUFrame] = None
# Decord VideoReader (GPU context)
self._vr = None
# PyNvCodec decoder components
self._nvdec = None
self._nv_cvt = None # NV12 to RGB converter
self._nv_dwn = None # GPU to CPU downloader (for fallback)
self._gpu_id = 0
# FFmpeg fallback state
self._proc = None
@@ -200,32 +201,44 @@ class GPUVideoSource:
# Initialize video source
self._init_video()
mode = "decord-GPU" if self._use_decord else ("ffmpeg-hwaccel" if check_hwdec_available() else "ffmpeg-CPU")
mode = "PyNvCodec-GPU" if self._use_pynvcodec else ("ffmpeg-hwaccel" if check_hwdec_available() else "ffmpeg-CPU")
print(f"[GPUVideoSource] {self.path.name}: {self._frame_size}, "
f"duration={self._duration:.1f}s, mode={mode}", file=sys.stderr)
def _init_video(self):
    """Initialize the video source (PyNvCodec or FFmpeg probe).

    Probes the file with FFmpeg first -- both decode paths need the
    metadata (fps, frame count, duration) that the probe fills in.
    Then, if PyNvCodec was requested and available, builds the NVDEC
    decoder plus an NV12->RGB GPU color converter. Any failure clears
    the PyNvCodec state and leaves the FFmpeg pipe path active.
    """
    # Probe is required for both paths, so do it unconditionally.
    self._probe_video()

    if self._use_pynvcodec:
        try:
            import PyNvCodec as nvc

            # NVDEC decoder: frames decode directly into GPU memory.
            self._nvdec = nvc.PyNvDecoder(str(self.path), self._gpu_id)
            # The decoder reports the true coded dimensions; prefer them
            # over whatever the probe found.
            self._frame_size = (self._nvdec.Width(), self._nvdec.Height())
            # Color converter: NV12 (decoder output) -> RGB, on the GPU.
            self._nv_cvt = nvc.PySurfaceConverter(
                self._nvdec.Width(),
                self._nvdec.Height(),
                nvc.PixelFormat.NV12,
                nvc.PixelFormat.RGB,
                self._gpu_id,
            )
            print(f"[GPUVideoSource] PyNvCodec initialized: {self._frame_size}", file=sys.stderr)
            return
        except Exception as e:
            # Fall back to the FFmpeg pipe path; clear partial state so
            # the read path never sees a half-initialized decoder.
            print(f"[GPUVideoSource] PyNvCodec init failed, falling back to ffmpeg: {e}", file=sys.stderr)
            self._use_pynvcodec = False
            self._nvdec = None
            self._nv_cvt = None
def _probe_video(self):
"""Probe video file for metadata (FFmpeg fallback)."""
@@ -309,32 +322,72 @@ class GPUVideoSource:
return np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy()
def _read_frame_pynvcodec(self, target_time: float) -> Optional[GPUFrame]:
    """Read the frame nearest *target_time* using PyNvCodec (GPU-native).

    Decodes one surface via NVDEC, converts NV12 -> RGB on the GPU, and
    wraps the result in a GPUFrame. When CuPy is available the pixels
    never leave GPU memory; otherwise the RGB surface is downloaded to
    a host numpy array. Returns None on any decode failure.
    """
    if self._nvdec is None:
        return None
    try:
        import PyNvCodec as nvc

        # PyNvCodec seeks by frame number; wrap around for looping.
        target_frame = int(target_time * self._video_fps)
        target_frame = target_frame % max(1, self._total_frames)

        # Decode one frame -- the returned surface lives in GPU memory.
        raw_surface = self._nvdec.DecodeSingleSurface()
        if raw_surface.Empty():
            # End of stream (or decoder stall): seek and retry once.
            seek_ctx = nvc.SeekContext(target_frame)
            self._nvdec.DecodeSingleSurface(seek_ctx)
            raw_surface = self._nvdec.DecodeSingleSurface()
            if raw_surface.Empty():
                return None

        # NV12 -> RGB conversion, still on the GPU.
        rgb_surface = self._nv_cvt.Execute(raw_surface)
        if rgb_surface.Empty():
            return None

        if GPU_AVAILABLE:
            # Wrap the surface's device pointer as a CuPy array with no
            # copy, then take an owning copy (the decoder reuses surfaces).
            frame_ptr = rgb_surface.PlanePtr()
            pitch = rgb_surface.Pitch()   # bytes per row, incl. padding
            height = rgb_surface.Height()
            width = rgb_surface.Width()
            # BUGFIX: the RGB plane occupies pitch * height bytes -- the
            # pitch already covers width*3 plus alignment padding, so the
            # previous size of pitch*height*3 over-claimed memory. The
            # explicit strides keep the row padding out of the pixel data
            # instead of letting padded rows skew every row after the first.
            mem = cp.cuda.UnownedMemory(frame_ptr.GpuMem(), pitch * height, None)
            memptr = cp.cuda.MemoryPointer(mem, 0)
            gpu_frame = cp.ndarray(
                (height, width, 3),
                dtype=cp.uint8,
                memptr=memptr,
                strides=(pitch, 3, 1),
            )
            # Compact, owning copy -- safe after the surface is recycled.
            gpu_frame = cp.ascontiguousarray(gpu_frame)
            return GPUFrame(gpu_frame, on_gpu=True)

        # CPU fallback: download the RGB surface into host memory.
        frame_np = np.ndarray(
            shape=(rgb_surface.Height(), rgb_surface.Width(), 3),
            dtype=np.uint8,
        )
        if self._nv_dwn is None:
            # Downloader is created lazily; dimensions come from the surface.
            self._nv_dwn = nvc.PySurfaceDownloader(
                rgb_surface.Width(),
                rgb_surface.Height(),
                nvc.PixelFormat.RGB,
                self._gpu_id,
            )
        self._nv_dwn.DownloadSingleSurface(rgb_surface, frame_np)
        return GPUFrame(frame_np, on_gpu=False)
    except Exception as e:
        print(f"[GPUVideoSource] PyNvCodec read error at t={target_time:.2f}: {e}", file=sys.stderr)
        return None
def read_at(self, t: float) -> Optional[GPUFrame]:
@@ -356,10 +409,9 @@ class GPUVideoSource:
self._last_read_time = t
# Use decord if available (GPU-native decode)
if self._use_decord:
frame_idx = int(seek_time * self._video_fps)
self._cached_frame = self._read_frame_decord(frame_idx)
# Use PyNvCodec if available (true GPU-native decode, zero CPU copy)
if self._use_pynvcodec:
self._cached_frame = self._read_frame_pynvcodec(seek_time)
if self._cached_frame is not None:
# Free CPU copy if on GPU (saves memory)
if self.prefer_gpu and self._cached_frame.is_on_gpu:
@@ -420,8 +472,10 @@ class GPUVideoSource:
if self._proc:
self._proc.kill()
self._proc = None
if self._vr is not None:
self._vr = None # Release decord VideoReader
# Release PyNvCodec resources
self._nvdec = None
self._nv_cvt = None
self._nv_dwn = None
# GPU-aware primitive functions