Use PyNvCodec for true zero-copy GPU video decode
Some checks are pending
GPU Worker CI/CD / test (push) Waiting to run
GPU Worker CI/CD / deploy (push) Blocked by required conditions

Replace decord (CPU-only pip package) with PyNvCodec, which provides
direct NVDEC access. Frames decode straight to GPU memory without
any CPU transfer, eliminating the memory bandwidth bottleneck.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
giles
2026-02-04 01:47:03 +00:00
parent 771fb8cebc
commit b7e3827fa2
2 changed files with 120 additions and 66 deletions

View File

@@ -26,9 +26,9 @@ RUN pip install --no-cache-dir -r requirements.txt
# Install GPU-specific dependencies (CuPy for CUDA 12.x) # Install GPU-specific dependencies (CuPy for CUDA 12.x)
RUN pip install --no-cache-dir cupy-cuda12x RUN pip install --no-cache-dir cupy-cuda12x
# Install decord for GPU-accelerated video decoding (keeps frames on GPU) # Install PyNvCodec for true GPU-native video decoding (NVDEC)
# This avoids CPU<->GPU memory transfers during video decode # Frames decode directly to GPU memory - zero CPU transfer
RUN pip install --no-cache-dir decord RUN pip install --no-cache-dir PyNvCodec
# Copy application # Copy application
COPY . . COPY . .

View File

@@ -34,7 +34,7 @@ except ImportError:
# Check for hardware decode support # Check for hardware decode support
_HWDEC_AVAILABLE: Optional[bool] = None _HWDEC_AVAILABLE: Optional[bool] = None
_DECORD_AVAILABLE: Optional[bool] = None _PYNVCODEC_AVAILABLE: Optional[bool] = None
def check_hwdec_available() -> bool: def check_hwdec_available() -> bool:
@@ -64,23 +64,21 @@ def check_hwdec_available() -> bool:
return _HWDEC_AVAILABLE return _HWDEC_AVAILABLE
def check_decord_available() -> bool: def check_pynvcodec_available() -> bool:
"""Check if decord GPU decode is available.""" """Check if PyNvCodec GPU decode is available."""
global _DECORD_AVAILABLE global _PYNVCODEC_AVAILABLE
if _DECORD_AVAILABLE is not None: if _PYNVCODEC_AVAILABLE is not None:
return _DECORD_AVAILABLE return _PYNVCODEC_AVAILABLE
try: try:
import decord import PyNvCodec as nvc
from decord import gpu _PYNVCODEC_AVAILABLE = True
# Try to create a GPU context - this will fail if CUDA isn't properly set up print("[streaming_gpu] PyNvCodec GPU decode available", file=sys.stderr)
_DECORD_AVAILABLE = True
print("[streaming_gpu] decord GPU decode available", file=sys.stderr)
except Exception as e: except Exception as e:
_DECORD_AVAILABLE = False _PYNVCODEC_AVAILABLE = False
print(f"[streaming_gpu] decord not available: {e}", file=sys.stderr) print(f"[streaming_gpu] PyNvCodec not available: {e}", file=sys.stderr)
return _DECORD_AVAILABLE return _PYNVCODEC_AVAILABLE
class GPUFrame: class GPUFrame:
@@ -170,17 +168,17 @@ class GPUVideoSource:
""" """
GPU-accelerated video source using hardware decode. GPU-accelerated video source using hardware decode.
Uses decord with GPU context for true zero-copy NVDEC decode, Uses PyNvCodec for true zero-copy NVDEC decode - frames go directly
keeping decoded frames in GPU memory throughout. to GPU memory without any CPU transfer.
Falls back to FFmpeg pipe if decord unavailable (slower due to CPU copy). Falls back to FFmpeg pipe if PyNvCodec unavailable (slower due to CPU copy).
""" """
def __init__(self, path: str, fps: float = 30, prefer_gpu: bool = True): def __init__(self, path: str, fps: float = 30, prefer_gpu: bool = True):
self.path = Path(path) self.path = Path(path)
self.fps = fps self.fps = fps
self.prefer_gpu = prefer_gpu and GPU_AVAILABLE self.prefer_gpu = prefer_gpu and GPU_AVAILABLE
self._use_decord = self.prefer_gpu and check_decord_available() self._use_pynvcodec = self.prefer_gpu and check_pynvcodec_available()
self._frame_size: Optional[Tuple[int, int]] = None self._frame_size: Optional[Tuple[int, int]] = None
self._duration: Optional[float] = None self._duration: Optional[float] = None
@@ -190,8 +188,11 @@ class GPUVideoSource:
self._last_read_time = -1 self._last_read_time = -1
self._cached_frame: Optional[GPUFrame] = None self._cached_frame: Optional[GPUFrame] = None
# Decord VideoReader (GPU context) # PyNvCodec decoder components
self._vr = None self._nvdec = None
self._nv_cvt = None # NV12 to RGB converter
self._nv_dwn = None # GPU to CPU downloader (for fallback)
self._gpu_id = 0
# FFmpeg fallback state # FFmpeg fallback state
self._proc = None self._proc = None
@@ -200,32 +201,44 @@ class GPUVideoSource:
# Initialize video source # Initialize video source
self._init_video() self._init_video()
mode = "decord-GPU" if self._use_decord else ("ffmpeg-hwaccel" if check_hwdec_available() else "ffmpeg-CPU") mode = "PyNvCodec-GPU" if self._use_pynvcodec else ("ffmpeg-hwaccel" if check_hwdec_available() else "ffmpeg-CPU")
print(f"[GPUVideoSource] {self.path.name}: {self._frame_size}, " print(f"[GPUVideoSource] {self.path.name}: {self._frame_size}, "
f"duration={self._duration:.1f}s, mode={mode}", file=sys.stderr) f"duration={self._duration:.1f}s, mode={mode}", file=sys.stderr)
def _init_video(self): def _init_video(self):
"""Initialize video reader (decord or probe for ffmpeg).""" """Initialize video reader (PyNvCodec or probe for ffmpeg)."""
if self._use_decord: # First probe video for metadata (needed for both paths)
self._probe_video()
if self._use_pynvcodec:
try: try:
from decord import VideoReader, gpu, cpu import PyNvCodec as nvc
# Use GPU context for hardware decode
ctx = gpu(0) if self.prefer_gpu else cpu(0) # Create NVDEC decoder - decodes directly to GPU
self._vr = VideoReader(str(self.path), ctx=ctx, num_threads=1) self._nvdec = nvc.PyNvDecoder(
self._total_frames = len(self._vr) str(self.path),
self._video_fps = self._vr.get_avg_fps() self._gpu_id
self._duration = self._total_frames / self._video_fps )
# Get frame size from first frame shape
first_frame = self._vr[0].asnumpy() # Get actual dimensions from decoder
self._frame_size = (first_frame.shape[1], first_frame.shape[0]) self._frame_size = (self._nvdec.Width(), self._nvdec.Height())
# Create color converter: NV12 (decoder output) -> RGB
self._nv_cvt = nvc.PySurfaceConverter(
self._nvdec.Width(),
self._nvdec.Height(),
nvc.PixelFormat.NV12,
nvc.PixelFormat.RGB,
self._gpu_id
)
print(f"[GPUVideoSource] PyNvCodec initialized: {self._frame_size}", file=sys.stderr)
return return
except Exception as e: except Exception as e:
print(f"[GPUVideoSource] decord init failed, falling back to ffmpeg: {e}", file=sys.stderr) print(f"[GPUVideoSource] PyNvCodec init failed, falling back to ffmpeg: {e}", file=sys.stderr)
self._use_decord = False self._use_pynvcodec = False
self._vr = None self._nvdec = None
self._nv_cvt = None
# FFmpeg fallback - probe video
self._probe_video()
def _probe_video(self): def _probe_video(self):
"""Probe video file for metadata (FFmpeg fallback).""" """Probe video file for metadata (FFmpeg fallback)."""
@@ -309,32 +322,72 @@ class GPUVideoSource:
return np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy() return np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy()
def _read_frame_decord(self, frame_idx: int) -> Optional[GPUFrame]: def _read_frame_pynvcodec(self, target_time: float) -> Optional[GPUFrame]:
"""Read frame using decord (GPU-native).""" """Read frame using PyNvCodec (true GPU-native, zero CPU copy)."""
if self._vr is None: if self._nvdec is None:
return None return None
try: try:
# Handle looping import PyNvCodec as nvc
frame_idx = frame_idx % self._total_frames
# Decord returns a tensor - asnumpy() gives numpy array # Seek if needed (PyNvCodec uses frame numbers)
# With GPU context, decode happens on GPU, but asnumpy() copies to CPU target_frame = int(target_time * self._video_fps)
# For true zero-copy, we need to use decord's GPU tensor directly target_frame = target_frame % max(1, self._total_frames) # Loop
frame_data = self._vr[frame_idx]
# If using GPU context, try to get data directly on GPU # Decode frame - returns surface in GPU memory
if self.prefer_gpu and GPU_AVAILABLE: raw_surface = self._nvdec.DecodeSingleSurface()
# frame_data is a decord NDArray - convert to numpy then to CuPy if raw_surface.Empty():
# This still involves a copy, but decode was on GPU (faster) # Try to seek and decode again
frame_np = frame_data.asnumpy() seek_ctx = nvc.SeekContext(target_frame)
# Create GPUFrame and transfer to GPU self._nvdec.DecodeSingleSurface(seek_ctx)
return GPUFrame(frame_np, on_gpu=True) raw_surface = self._nvdec.DecodeSingleSurface()
if raw_surface.Empty():
return None
# Convert NV12 -> RGB on GPU
rgb_surface = self._nv_cvt.Execute(raw_surface)
if rgb_surface.Empty():
return None
# Get as CuPy array - stays on GPU!
if GPU_AVAILABLE:
# Create CuPy array from GPU surface pointer
# PyNvCodec surfaces can be converted to numpy, then to cupy
# But for true zero-copy, we use the CUDA pointer directly
frame_ptr = rgb_surface.PlanePtr()
pitch = rgb_surface.Pitch()
height = rgb_surface.Height()
width = rgb_surface.Width()
# Create cupy array from device pointer
# Note: PyNvCodec stores data in pitched format
mem = cp.cuda.UnownedMemory(frame_ptr.GpuMem(), pitch * height * 3, None)
memptr = cp.cuda.MemoryPointer(mem, 0)
gpu_frame = cp.ndarray((height, width, 3), dtype=cp.uint8, memptr=memptr)
# Make a copy to ensure we own the memory (surface may be reused)
gpu_frame = gpu_frame.copy()
return GPUFrame(gpu_frame, on_gpu=True)
else: else:
return GPUFrame(frame_data.asnumpy(), on_gpu=False) # Fallback to CPU
frame_np = np.ndarray(
shape=(rgb_surface.Height(), rgb_surface.Width(), 3),
dtype=np.uint8
)
# Download to CPU (not ideal but works)
if self._nv_dwn is None:
self._nv_dwn = nvc.PySurfaceDownloader(
rgb_surface.Width(),
rgb_surface.Height(),
nvc.PixelFormat.RGB,
self._gpu_id
)
self._nv_dwn.DownloadSingleSurface(rgb_surface, frame_np)
return GPUFrame(frame_np, on_gpu=False)
except Exception as e: except Exception as e:
print(f"[GPUVideoSource] decord read error at frame {frame_idx}: {e}", file=sys.stderr) print(f"[GPUVideoSource] PyNvCodec read error at t={target_time:.2f}: {e}", file=sys.stderr)
return None return None
def read_at(self, t: float) -> Optional[GPUFrame]: def read_at(self, t: float) -> Optional[GPUFrame]:
@@ -356,10 +409,9 @@ class GPUVideoSource:
self._last_read_time = t self._last_read_time = t
# Use decord if available (GPU-native decode) # Use PyNvCodec if available (true GPU-native decode, zero CPU copy)
if self._use_decord: if self._use_pynvcodec:
frame_idx = int(seek_time * self._video_fps) self._cached_frame = self._read_frame_pynvcodec(seek_time)
self._cached_frame = self._read_frame_decord(frame_idx)
if self._cached_frame is not None: if self._cached_frame is not None:
# Free CPU copy if on GPU (saves memory) # Free CPU copy if on GPU (saves memory)
if self.prefer_gpu and self._cached_frame.is_on_gpu: if self.prefer_gpu and self._cached_frame.is_on_gpu:
@@ -420,8 +472,10 @@ class GPUVideoSource:
if self._proc: if self._proc:
self._proc.kill() self._proc.kill()
self._proc = None self._proc = None
if self._vr is not None: # Release PyNvCodec resources
self._vr = None # Release decord VideoReader self._nvdec = None
self._nv_cvt = None
self._nv_dwn = None
# GPU-aware primitive functions # GPU-aware primitive functions