From 41adf058bde40909e2f50a5910f2c019f581c474 Mon Sep 17 00:00:00 2001
From: giles <giles.bradshaw@sigyl.com>
Date: Wed, 4 Feb 2026 01:50:14 +0000
Subject: [PATCH] Build decord from source with CUDA for GPU video decode

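- Replace PyNvCodec with decord, built from source with -DUSE_CUDA=ON for
  true NVDEC hardware decode
- Use DLPack for zero-copy transfer from decord to CuPy; if the DLPack
  conversion fails, fall back to a copy through NumPy
- Frames stay on GPU throughout on the DLPack path: decode -> process -> encode
- Fall back to the FFmpeg pipe when decord GPU initialisation fails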

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 Dockerfile.gpu                               |  15 +-
 sexp_effects/primitive_libs/streaming_gpu.py | 182 +++++++------------
 2 files changed, 82 insertions(+), 115 deletions(-)

diff --git a/Dockerfile.gpu b/Dockerfile.gpu
index e927738..1dd164a 100644
--- a/Dockerfile.gpu
+++ b/Dockerfile.gpu
@@ -26,9 +26,18 @@ RUN pip install --no-cache-dir -r requirements.txt
 # Install GPU-specific dependencies (CuPy for CUDA 12.x)
 RUN pip install --no-cache-dir cupy-cuda12x
 
-# Install PyNvCodec for true GPU-native video decoding (NVDEC)
-# Frames decode directly to GPU memory - zero CPU transfer
-RUN pip install --no-cache-dir PyNvCodec
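+# Build decord from source with CUDA support for GPU-native (NVDEC) video decoding. NOTE(review): assumes git and the FFmpeg dev packages come from an earlier layer - confirm.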
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    cmake build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN git clone --recursive https://github.com/dmlc/decord /tmp/decord && \
+    cd /tmp/decord && \
+    mkdir build && cd build && \
+    cmake .. -DUSE_CUDA=ON -DCMAKE_BUILD_TYPE=Release && \
+    make -j$(nproc) && \
+    cd ../python && pip install . && \
+    rm -rf /tmp/decord
 
 # Copy application
 COPY . .
diff --git a/sexp_effects/primitive_libs/streaming_gpu.py b/sexp_effects/primitive_libs/streaming_gpu.py
index 71b5065..3e76b34 100644
--- a/sexp_effects/primitive_libs/streaming_gpu.py
+++ b/sexp_effects/primitive_libs/streaming_gpu.py
@@ -34,7 +34,7 @@ except ImportError:
 
 # Check for hardware decode support
 _HWDEC_AVAILABLE: Optional[bool] = None
-_PYNVCODEC_AVAILABLE: Optional[bool] = None
+_DECORD_GPU_AVAILABLE: Optional[bool] = None
 
 
 def check_hwdec_available() -> bool:
@@ -64,21 +64,24 @@ def check_hwdec_available() -> bool:
     return _HWDEC_AVAILABLE
 
 
-def check_pynvcodec_available() -> bool:
-    """Check if PyNvCodec GPU decode is available."""
-    global _PYNVCODEC_AVAILABLE
-    if _PYNVCODEC_AVAILABLE is not None:
-        return _PYNVCODEC_AVAILABLE
+def check_decord_gpu_available() -> bool:
+    """Check if decord with CUDA GPU decode is available."""
+    global _DECORD_GPU_AVAILABLE
+    if _DECORD_GPU_AVAILABLE is not None:
+        return _DECORD_GPU_AVAILABLE
 
     try:
-        import PyNvCodec as nvc
-        _PYNVCODEC_AVAILABLE = True
-        print("[streaming_gpu] PyNvCodec GPU decode available", file=sys.stderr)
+        import decord
+        from decord import gpu
+        # Create a GPU context handle. NOTE(review): gpu(0) may only build a descriptor without touching CUDA - confirm this really verifies CUDA support
+        ctx = gpu(0)
+        _DECORD_GPU_AVAILABLE = True
+        print("[streaming_gpu] decord GPU (CUDA) decode available", file=sys.stderr)
     except Exception as e:
-        _PYNVCODEC_AVAILABLE = False
-        print(f"[streaming_gpu] PyNvCodec not available: {e}", file=sys.stderr)
+        _DECORD_GPU_AVAILABLE = False
+        print(f"[streaming_gpu] decord GPU not available: {e}", file=sys.stderr)
 
-    return _PYNVCODEC_AVAILABLE
+    return _DECORD_GPU_AVAILABLE
 
 
 class GPUFrame:
@@ -168,17 +171,17 @@ class GPUVideoSource:
     """
     GPU-accelerated video source using hardware decode.
 
-    Uses PyNvCodec for true zero-copy NVDEC decode - frames go directly
-    to GPU memory without any CPU transfer.
+    Uses decord with CUDA GPU context for true NVDEC decode - frames
+    decode directly to GPU memory via CUDA.
 
-    Falls back to FFmpeg pipe if PyNvCodec unavailable (slower due to CPU copy).
+    Falls back to FFmpeg pipe if decord GPU unavailable (slower due to CPU copy).
     """
 
     def __init__(self, path: str, fps: float = 30, prefer_gpu: bool = True):
         self.path = Path(path)
         self.fps = fps
         self.prefer_gpu = prefer_gpu and GPU_AVAILABLE
-        self._use_pynvcodec = self.prefer_gpu and check_pynvcodec_available()
+        self._use_decord_gpu = self.prefer_gpu and check_decord_gpu_available()
 
         self._frame_size: Optional[Tuple[int, int]] = None
         self._duration: Optional[float] = None
@@ -188,11 +191,9 @@ class GPUVideoSource:
         self._last_read_time = -1
         self._cached_frame: Optional[GPUFrame] = None
 
-        # PyNvCodec decoder components
-        self._nvdec = None
-        self._nv_cvt = None  # NV12 to RGB converter
-        self._nv_dwn = None  # GPU to CPU downloader (for fallback)
-        self._gpu_id = 0
+        # Decord VideoReader with GPU context
+        self._vr = None
+        self._decord_ctx = None
 
         # FFmpeg fallback state
         self._proc = None
@@ -201,44 +202,39 @@ class GPUVideoSource:
         # Initialize video source
         self._init_video()
 
-        mode = "PyNvCodec-GPU" if self._use_pynvcodec else ("ffmpeg-hwaccel" if check_hwdec_available() else "ffmpeg-CPU")
+        mode = "decord-GPU" if self._use_decord_gpu else ("ffmpeg-hwaccel" if check_hwdec_available() else "ffmpeg-CPU")
         print(f"[GPUVideoSource] {self.path.name}: {self._frame_size}, "
               f"duration={self._duration:.1f}s, mode={mode}", file=sys.stderr)
 
     def _init_video(self):
-        """Initialize video reader (PyNvCodec or probe for ffmpeg)."""
-        # First probe video for metadata (needed for both paths)
-        self._probe_video()
-
-        if self._use_pynvcodec:
+        """Initialize video reader (decord GPU or probe for ffmpeg)."""
+        if self._use_decord_gpu:
             try:
-                import PyNvCodec as nvc
+                from decord import VideoReader, gpu
 
-                # Create NVDEC decoder - decodes directly to GPU
-                self._nvdec = nvc.PyNvDecoder(
-                    str(self.path),
-                    self._gpu_id
-                )
+                # Use GPU context for NVDEC hardware decode
+                self._decord_ctx = gpu(0)
+                self._vr = VideoReader(str(self.path), ctx=self._decord_ctx, num_threads=1)
 
-                # Get actual dimensions from decoder
-                self._frame_size = (self._nvdec.Width(), self._nvdec.Height())
+                self._total_frames = len(self._vr)
+                self._video_fps = self._vr.get_avg_fps()
+                self._duration = self._total_frames / self._video_fps
 
-                # Create color converter: NV12 (decoder output) -> RGB
-                self._nv_cvt = nvc.PySurfaceConverter(
-                    self._nvdec.Width(),
-                    self._nvdec.Height(),
-                    nvc.PixelFormat.NV12,
-                    nvc.PixelFormat.RGB,
-                    self._gpu_id
-                )
+                # Get frame size from first frame
+                first_frame = self._vr[0]
+                self._frame_size = (first_frame.shape[1], first_frame.shape[0])
 
-                print(f"[GPUVideoSource] PyNvCodec initialized: {self._frame_size}", file=sys.stderr)
+                print(f"[GPUVideoSource] decord GPU initialized: {self._frame_size}, "
+                      f"{self._total_frames} frames @ {self._video_fps:.1f}fps", file=sys.stderr)
                 return
             except Exception as e:
-                print(f"[GPUVideoSource] PyNvCodec init failed, falling back to ffmpeg: {e}", file=sys.stderr)
-                self._use_pynvcodec = False
-                self._nvdec = None
-                self._nv_cvt = None
+                print(f"[GPUVideoSource] decord GPU init failed, falling back to ffmpeg: {e}", file=sys.stderr)
+                self._use_decord_gpu = False
+                self._vr = None
+                self._decord_ctx = None
+
+        # FFmpeg fallback - probe video for metadata
+        self._probe_video()
 
     def _probe_video(self):
         """Probe video file for metadata (FFmpeg fallback)."""
@@ -322,72 +318,34 @@ class GPUVideoSource:
 
         return np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy()
 
-    def _read_frame_pynvcodec(self, target_time: float) -> Optional[GPUFrame]:
-        """Read frame using PyNvCodec (true GPU-native, zero CPU copy)."""
-        if self._nvdec is None:
+    def _read_frame_decord_gpu(self, frame_idx: int) -> Optional[GPUFrame]:
+        """Read frame by index using decord with GPU context (NVDEC); zero-copy to CuPy via DLPack, else a NumPy copy."""
+        if self._vr is None:
             return None
 
         try:
-            import PyNvCodec as nvc
+            # Handle looping
+            frame_idx = frame_idx % max(1, self._total_frames)
 
-            # Seek if needed (PyNvCodec uses frame numbers)
-            target_frame = int(target_time * self._video_fps)
-            target_frame = target_frame % max(1, self._total_frames)  # Loop
+            # Decode frame - with GPU context, this uses NVDEC
+            frame_tensor = self._vr[frame_idx]
 
-            # Decode frame - returns surface in GPU memory
-            raw_surface = self._nvdec.DecodeSingleSurface()
-            if raw_surface.Empty():
-                # Try to seek and decode again
-                seek_ctx = nvc.SeekContext(target_frame)
-                self._nvdec.DecodeSingleSurface(seek_ctx)
-                raw_surface = self._nvdec.DecodeSingleSurface()
-                if raw_surface.Empty():
-                    return None
-
-            # Convert NV12 -> RGB on GPU
-            rgb_surface = self._nv_cvt.Execute(raw_surface)
-            if rgb_surface.Empty():
-                return None
-
-            # Get as CuPy array - stays on GPU!
+            # Convert to CuPy via DLPack (zero-copy GPU transfer)
             if GPU_AVAILABLE:
-                # Create CuPy array from GPU surface pointer
-                # PyNvCodec surfaces can be converted to numpy, then to cupy
-                # But for true zero-copy, we use the CUDA pointer directly
-                frame_ptr = rgb_surface.PlanePtr()
-                pitch = rgb_surface.Pitch()
-                height = rgb_surface.Height()
-                width = rgb_surface.Width()
-
-                # Create cupy array from device pointer
-                # Note: PyNvCodec stores data in pitched format
-                mem = cp.cuda.UnownedMemory(frame_ptr.GpuMem(), pitch * height * 3, None)
-                memptr = cp.cuda.MemoryPointer(mem, 0)
-                gpu_frame = cp.ndarray((height, width, 3), dtype=cp.uint8, memptr=memptr)
-
-                # Make a copy to ensure we own the memory (surface may be reused)
-                gpu_frame = gpu_frame.copy()
-
-                return GPUFrame(gpu_frame, on_gpu=True)
+                # decord tensors support DLPack for zero-copy conversion
+                # This keeps the frame on GPU without any CPU transfer
+                try:
+                    gpu_frame = cp.from_dlpack(frame_tensor)
+                    return GPUFrame(gpu_frame, on_gpu=True)
+                except Exception:
+                    # Fallback: convert via numpy (involves CPU copy). NOTE(review): the numpy array is tagged on_gpu=True below - confirm GPUFrame uploads it
+                    frame_np = frame_tensor.asnumpy()
+                    return GPUFrame(frame_np, on_gpu=True)
             else:
-                # Fallback to CPU
-                frame_np = np.ndarray(
-                    shape=(rgb_surface.Height(), rgb_surface.Width(), 3),
-                    dtype=np.uint8
-                )
-                # Download to CPU (not ideal but works)
-                if self._nv_dwn is None:
-                    self._nv_dwn = nvc.PySurfaceDownloader(
-                        rgb_surface.Width(),
-                        rgb_surface.Height(),
-                        nvc.PixelFormat.RGB,
-                        self._gpu_id
-                    )
-                self._nv_dwn.DownloadSingleSurface(rgb_surface, frame_np)
-                return GPUFrame(frame_np, on_gpu=False)
+                return GPUFrame(frame_tensor.asnumpy(), on_gpu=False)
 
         except Exception as e:
-            print(f"[GPUVideoSource] PyNvCodec read error at t={target_time:.2f}: {e}", file=sys.stderr)
+            print(f"[GPUVideoSource] decord GPU read error at frame {frame_idx}: {e}", file=sys.stderr)
             return None
 
     def read_at(self, t: float) -> Optional[GPUFrame]:
@@ -409,9 +367,10 @@ class GPUVideoSource:
 
         self._last_read_time = t
 
-        # Use PyNvCodec if available (true GPU-native decode, zero CPU copy)
-        if self._use_pynvcodec:
-            self._cached_frame = self._read_frame_pynvcodec(seek_time)
+        # Use decord GPU if available (NVDEC decode, zero-copy via DLPack)
+        if self._use_decord_gpu:
+            frame_idx = int(seek_time * self._video_fps)
+            self._cached_frame = self._read_frame_decord_gpu(frame_idx)
             if self._cached_frame is not None:
                 # Free CPU copy if on GPU (saves memory)
                 if self.prefer_gpu and self._cached_frame.is_on_gpu:
@@ -472,10 +431,9 @@ class GPUVideoSource:
         if self._proc:
             self._proc.kill()
             self._proc = None
-        # Release PyNvCodec resources
-        self._nvdec = None
-        self._nv_cvt = None
-        self._nv_dwn = None
+        # Release decord resources
+        self._vr = None
+        self._decord_ctx = None
 
 
 # GPU-aware primitive functions