From 41adf058bde40909e2f50a5910f2c019f581c474 Mon Sep 17 00:00:00 2001 From: giles Date: Wed, 4 Feb 2026 01:50:14 +0000 Subject: [PATCH] Build decord from source with CUDA for GPU video decode - Build decord with -DUSE_CUDA=ON for true NVDEC hardware decode - Use DLPack for zero-copy transfer from decord to CuPy - Frames stay on GPU throughout: decode -> process -> encode Co-Authored-By: Claude Opus 4.5 --- Dockerfile.gpu | 15 +- sexp_effects/primitive_libs/streaming_gpu.py | 182 +++++++------------ 2 files changed, 82 insertions(+), 115 deletions(-) diff --git a/Dockerfile.gpu b/Dockerfile.gpu index e927738..1dd164a 100644 --- a/Dockerfile.gpu +++ b/Dockerfile.gpu @@ -26,9 +26,18 @@ RUN pip install --no-cache-dir -r requirements.txt # Install GPU-specific dependencies (CuPy for CUDA 12.x) RUN pip install --no-cache-dir cupy-cuda12x -# Install PyNvCodec for true GPU-native video decoding (NVDEC) -# Frames decode directly to GPU memory - zero CPU transfer -RUN pip install --no-cache-dir PyNvCodec +# Build decord from source with CUDA support for GPU-native video decoding +RUN apt-get update && apt-get install -y --no-install-recommends \ + cmake build-essential \ + && rm -rf /var/lib/apt/lists/* + +RUN git clone --recursive https://github.com/dmlc/decord /tmp/decord && \ + cd /tmp/decord && \ + mkdir build && cd build && \ + cmake .. -DUSE_CUDA=ON -DCMAKE_BUILD_TYPE=Release && \ + make -j$(nproc) && \ + cd ../python && pip install . && \ + rm -rf /tmp/decord # Copy application COPY . . diff --git a/sexp_effects/primitive_libs/streaming_gpu.py b/sexp_effects/primitive_libs/streaming_gpu.py index 71b5065..3e76b34 100644 --- a/sexp_effects/primitive_libs/streaming_gpu.py +++ b/sexp_effects/primitive_libs/streaming_gpu.py @@ -34,7 +34,7 @@ except ImportError: # Check for hardware decode support _HWDEC_AVAILABLE: Optional[bool] = None -_PYNVCODEC_AVAILABLE: Optional[bool] = None +_DECORD_GPU_AVAILABLE: Optional[bool] = None def check_hwdec_available() -> bool: @@ -64,21 +64,24 @@ def check_hwdec_available() -> bool: return _HWDEC_AVAILABLE -def check_pynvcodec_available() -> bool: - """Check if PyNvCodec GPU decode is available.""" - global _PYNVCODEC_AVAILABLE - if _PYNVCODEC_AVAILABLE is not None: - return _PYNVCODEC_AVAILABLE +def check_decord_gpu_available() -> bool: + """Check if decord with CUDA GPU decode is available.""" + global _DECORD_GPU_AVAILABLE + if _DECORD_GPU_AVAILABLE is not None: + return _DECORD_GPU_AVAILABLE try: - import PyNvCodec as nvc - _PYNVCODEC_AVAILABLE = True - print("[streaming_gpu] PyNvCodec GPU decode available", file=sys.stderr) + import decord + from decord import gpu + # Try to create a GPU context to verify CUDA support + ctx = gpu(0) + _DECORD_GPU_AVAILABLE = True + print("[streaming_gpu] decord GPU (CUDA) decode available", file=sys.stderr) except Exception as e: - _PYNVCODEC_AVAILABLE = False - print(f"[streaming_gpu] PyNvCodec not available: {e}", file=sys.stderr) + _DECORD_GPU_AVAILABLE = False + print(f"[streaming_gpu] decord GPU not available: {e}", file=sys.stderr) - return _PYNVCODEC_AVAILABLE + return _DECORD_GPU_AVAILABLE class GPUFrame: @@ -168,17 +171,17 @@ class GPUVideoSource: """ GPU-accelerated video source using hardware decode. - Uses PyNvCodec for true zero-copy NVDEC decode - frames go directly - to GPU memory without any CPU transfer. + Uses decord with CUDA GPU context for true NVDEC decode - frames + decode directly to GPU memory via CUDA. - Falls back to FFmpeg pipe if PyNvCodec unavailable (slower due to CPU copy). + Falls back to FFmpeg pipe if decord GPU unavailable (slower due to CPU copy). """ def __init__(self, path: str, fps: float = 30, prefer_gpu: bool = True): self.path = Path(path) self.fps = fps self.prefer_gpu = prefer_gpu and GPU_AVAILABLE - self._use_pynvcodec = self.prefer_gpu and check_pynvcodec_available() + self._use_decord_gpu = self.prefer_gpu and check_decord_gpu_available() self._frame_size: Optional[Tuple[int, int]] = None self._duration: Optional[float] = None @@ -188,11 +191,9 @@ class GPUVideoSource: self._last_read_time = -1 self._cached_frame: Optional[GPUFrame] = None - # PyNvCodec decoder components - self._nvdec = None - self._nv_cvt = None # NV12 to RGB converter - self._nv_dwn = None # GPU to CPU downloader (for fallback) - self._gpu_id = 0 + # Decord VideoReader with GPU context + self._vr = None + self._decord_ctx = None # FFmpeg fallback state self._proc = None @@ -201,44 +202,39 @@ class GPUVideoSource: # Initialize video source self._init_video() - mode = "PyNvCodec-GPU" if self._use_pynvcodec else ("ffmpeg-hwaccel" if check_hwdec_available() else "ffmpeg-CPU") + mode = "decord-GPU" if self._use_decord_gpu else ("ffmpeg-hwaccel" if check_hwdec_available() else "ffmpeg-CPU") print(f"[GPUVideoSource] {self.path.name}: {self._frame_size}, " f"duration={self._duration:.1f}s, mode={mode}", file=sys.stderr) def _init_video(self): - """Initialize video reader (PyNvCodec or probe for ffmpeg).""" - # First probe video for metadata (needed for both paths) - self._probe_video() - - if self._use_pynvcodec: + """Initialize video reader (decord GPU or probe for ffmpeg).""" + if self._use_decord_gpu: try: - import PyNvCodec as nvc + from decord import VideoReader, gpu - # Create NVDEC decoder - decodes directly to GPU - self._nvdec = nvc.PyNvDecoder( - str(self.path), - self._gpu_id - ) + # Use GPU context for NVDEC hardware decode + self._decord_ctx = gpu(0) + self._vr = VideoReader(str(self.path), ctx=self._decord_ctx, num_threads=1) - # Get actual dimensions from decoder - self._frame_size = (self._nvdec.Width(), self._nvdec.Height()) + self._total_frames = len(self._vr) + self._video_fps = self._vr.get_avg_fps() + self._duration = self._total_frames / self._video_fps - # Create color converter: NV12 (decoder output) -> RGB - self._nv_cvt = nvc.PySurfaceConverter( - self._nvdec.Width(), - self._nvdec.Height(), - nvc.PixelFormat.NV12, - nvc.PixelFormat.RGB, - self._gpu_id - ) + # Get frame size from first frame + first_frame = self._vr[0] + self._frame_size = (first_frame.shape[1], first_frame.shape[0]) - print(f"[GPUVideoSource] PyNvCodec initialized: {self._frame_size}", file=sys.stderr) + print(f"[GPUVideoSource] decord GPU initialized: {self._frame_size}, " + f"{self._total_frames} frames @ {self._video_fps:.1f}fps", file=sys.stderr) return except Exception as e: - print(f"[GPUVideoSource] PyNvCodec init failed, falling back to ffmpeg: {e}", file=sys.stderr) - self._use_pynvcodec = False - self._nvdec = None - self._nv_cvt = None + print(f"[GPUVideoSource] decord GPU init failed, falling back to ffmpeg: {e}", file=sys.stderr) + self._use_decord_gpu = False + self._vr = None + self._decord_ctx = None + + # FFmpeg fallback - probe video for metadata + self._probe_video() def _probe_video(self): """Probe video file for metadata (FFmpeg fallback).""" @@ -322,72 +318,34 @@ class GPUVideoSource: return np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy() - def _read_frame_pynvcodec(self, target_time: float) -> Optional[GPUFrame]: - """Read frame using PyNvCodec (true GPU-native, zero CPU copy).""" - if self._nvdec is None: + def _read_frame_decord_gpu(self, frame_idx: int) -> Optional[GPUFrame]: + """Read frame using decord with GPU context (NVDEC, zero-copy to CuPy).""" + if self._vr is None: return None try: - import PyNvCodec as nvc + # Handle looping + frame_idx = frame_idx % max(1, self._total_frames) - # Seek if needed (PyNvCodec uses frame numbers) - target_frame = int(target_time * self._video_fps) - target_frame = target_frame % max(1, self._total_frames) # Loop + # Decode frame - with GPU context, this uses NVDEC + frame_tensor = self._vr[frame_idx] - # Decode frame - returns surface in GPU memory - raw_surface = self._nvdec.DecodeSingleSurface() - if raw_surface.Empty(): - # Try to seek and decode again - seek_ctx = nvc.SeekContext(target_frame) - self._nvdec.DecodeSingleSurface(seek_ctx) - raw_surface = self._nvdec.DecodeSingleSurface() - if raw_surface.Empty(): - return None - - # Convert NV12 -> RGB on GPU - rgb_surface = self._nv_cvt.Execute(raw_surface) - if rgb_surface.Empty(): - return None - - # Get as CuPy array - stays on GPU! + # Convert to CuPy via DLPack (zero-copy GPU transfer) if GPU_AVAILABLE: - # Create CuPy array from GPU surface pointer - # PyNvCodec surfaces can be converted to numpy, then to cupy - # But for true zero-copy, we use the CUDA pointer directly - frame_ptr = rgb_surface.PlanePtr() - pitch = rgb_surface.Pitch() - height = rgb_surface.Height() - width = rgb_surface.Width() - - # Create cupy array from device pointer - # Note: PyNvCodec stores data in pitched format - mem = cp.cuda.UnownedMemory(frame_ptr.GpuMem(), pitch * height * 3, None) - memptr = cp.cuda.MemoryPointer(mem, 0) - gpu_frame = cp.ndarray((height, width, 3), dtype=cp.uint8, memptr=memptr) - - # Make a copy to ensure we own the memory (surface may be reused) - gpu_frame = gpu_frame.copy() - - return GPUFrame(gpu_frame, on_gpu=True) + # decord tensors support DLPack for zero-copy conversion + # This keeps the frame on GPU without any CPU transfer + try: + gpu_frame = cp.from_dlpack(frame_tensor) + return GPUFrame(gpu_frame, on_gpu=True) + except Exception: + # Fallback: convert via numpy (involves CPU copy) + frame_np = frame_tensor.asnumpy() + return GPUFrame(frame_np, on_gpu=True) else: - # Fallback to CPU - frame_np = np.ndarray( - shape=(rgb_surface.Height(), rgb_surface.Width(), 3), - dtype=np.uint8 - ) - # Download to CPU (not ideal but works) - if self._nv_dwn is None: - self._nv_dwn = nvc.PySurfaceDownloader( - rgb_surface.Width(), - rgb_surface.Height(), - nvc.PixelFormat.RGB, - self._gpu_id - ) - self._nv_dwn.DownloadSingleSurface(rgb_surface, frame_np) - return GPUFrame(frame_np, on_gpu=False) + return GPUFrame(frame_tensor.asnumpy(), on_gpu=False) except Exception as e: - print(f"[GPUVideoSource] PyNvCodec read error at t={target_time:.2f}: {e}", file=sys.stderr) + print(f"[GPUVideoSource] decord GPU read error at frame {frame_idx}: {e}", file=sys.stderr) return None def read_at(self, t: float) -> Optional[GPUFrame]: @@ -409,9 +367,10 @@ class GPUVideoSource: self._last_read_time = t - # Use PyNvCodec if available (true GPU-native decode, zero CPU copy) - if self._use_pynvcodec: - self._cached_frame = self._read_frame_pynvcodec(seek_time) + # Use decord GPU if available (NVDEC decode, zero-copy via DLPack) + if self._use_decord_gpu: + frame_idx = int(seek_time * self._video_fps) + self._cached_frame = self._read_frame_decord_gpu(frame_idx) if self._cached_frame is not None: # Free CPU copy if on GPU (saves memory) if self.prefer_gpu and self._cached_frame.is_on_gpu: @@ -472,10 +431,9 @@ class GPUVideoSource: if self._proc: self._proc.kill() self._proc = None - # Release PyNvCodec resources - self._nvdec = None - self._nv_cvt = None - self._nv_dwn = None + # Release decord resources + self._vr = None + self._decord_ctx = None # GPU-aware primitive functions