# Use cx/cy instead of center_x/center_y to match gpu_ripple signature.
"""
|
|
GPU-Accelerated Streaming Primitives
|
|
|
|
Provides GPU-native video source and frame processing.
|
|
Frames stay on GPU memory throughout the pipeline for maximum performance.
|
|
|
|
Architecture:
|
|
- GPUFrame: Wrapper that tracks whether data is on CPU or GPU
|
|
- GPUVideoSource: Hardware-accelerated decode to GPU memory
|
|
- GPU primitives operate directly on GPU frames using fast CUDA kernels
|
|
- Transfer to CPU only at final output
|
|
|
|
Requirements:
|
|
- CuPy for CUDA support
|
|
- FFmpeg with NVDEC support (for hardware decode)
|
|
- NVIDIA GPU with CUDA capability
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import subprocess
|
|
import numpy as np
|
|
from pathlib import Path
|
|
from typing import Optional, Tuple, Union
|
|
|
|
# Try to import CuPy
|
|
try:
|
|
import cupy as cp
|
|
GPU_AVAILABLE = True
|
|
except ImportError:
|
|
cp = None
|
|
GPU_AVAILABLE = False
|
|
|
|
# Try to import fast CUDA kernels from JIT compiler
|
|
_FAST_KERNELS_AVAILABLE = False
|
|
try:
|
|
if GPU_AVAILABLE:
|
|
from streaming.jit_compiler import (
|
|
fast_rotate, fast_zoom, fast_blend, fast_hue_shift,
|
|
fast_invert, fast_ripple, get_fast_ops
|
|
)
|
|
_FAST_KERNELS_AVAILABLE = True
|
|
print("[streaming_gpu] Fast CUDA kernels loaded", file=sys.stderr)
|
|
except ImportError as e:
|
|
print(f"[streaming_gpu] Fast kernels not available: {e}", file=sys.stderr)
|
|
|
|
# Check for hardware decode support
|
|
_HWDEC_AVAILABLE: Optional[bool] = None
|
|
_DECORD_GPU_AVAILABLE: Optional[bool] = None
|
|
|
|
|
|
def check_hwdec_available() -> bool:
    """Check if NVIDIA hardware decode is available.

    The probe runs once; the result is memoized in the module-level
    _HWDEC_AVAILABLE flag.
    """
    global _HWDEC_AVAILABLE
    if _HWDEC_AVAILABLE is not None:
        return _HWDEC_AVAILABLE

    try:
        # A working nvidia-smi means an NVIDIA GPU + driver is present.
        smi = subprocess.run(["nvidia-smi"], capture_output=True, timeout=2)
        if smi.returncode != 0:
            _HWDEC_AVAILABLE = False
            return False

        # FFmpeg must also list "cuda" among its hardware accelerators.
        probe = subprocess.run(
            ["ffmpeg", "-hwaccels"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        _HWDEC_AVAILABLE = "cuda" in probe.stdout
    except Exception:
        # Missing binaries / timeouts all mean "no hardware decode".
        _HWDEC_AVAILABLE = False

    return _HWDEC_AVAILABLE
|
|
|
|
|
|
def check_decord_gpu_available() -> bool:
    """Check if decord with CUDA GPU decode is available (memoized)."""
    global _DECORD_GPU_AVAILABLE
    if _DECORD_GPU_AVAILABLE is None:
        try:
            import decord
            from decord import gpu
            # Creating a GPU context is the only reliable probe: CPU-only
            # decord builds raise here.
            ctx = gpu(0)
            _DECORD_GPU_AVAILABLE = True
            print("[streaming_gpu] decord GPU (CUDA) decode available", file=sys.stderr)
        except Exception as e:
            _DECORD_GPU_AVAILABLE = False
            print(f"[streaming_gpu] decord GPU not available: {e}", file=sys.stderr)
    return _DECORD_GPU_AVAILABLE
|
|
|
|
|
|
class GPUFrame:
    """
    Frame container that tracks data location (CPU/GPU).

    Enables zero-copy operations when data is already on the right device.
    Lazy transfer - only moves data when actually needed.
    """

    def __init__(self, data: Union[np.ndarray, 'cp.ndarray'], on_gpu: Optional[bool] = None):
        """Wrap frame data, optionally forcing it onto a specific device.

        Args:
            data: numpy array, CuPy array, or anything np.asarray accepts.
            on_gpu: None = auto-detect from the array type;
                True = store on GPU (when CUDA is available);
                False = store on CPU.
        """
        self._cpu_data: Optional[np.ndarray] = None
        self._gpu_data = None  # Optional[cp.ndarray]

        if on_gpu is None:
            # Auto-detect based on type
            if GPU_AVAILABLE and isinstance(data, cp.ndarray):
                self._gpu_data = data
            else:
                self._cpu_data = np.asarray(data)
        elif on_gpu and GPU_AVAILABLE:
            self._gpu_data = cp.asarray(data) if not isinstance(data, cp.ndarray) else data
        else:
            # CPU storage. Only CuPy arrays need a device->host download;
            # everything else (including lists/tuples) goes through
            # np.asarray. The previous code routed any non-numpy input to
            # cp.asnumpy, which crashed when cp is None (no GPU installed).
            if GPU_AVAILABLE and isinstance(data, cp.ndarray):
                self._cpu_data = cp.asnumpy(data)
            else:
                self._cpu_data = np.asarray(data)

    @property
    def cpu(self) -> np.ndarray:
        """Get frame as numpy array (transfers from GPU if needed)."""
        if self._cpu_data is None:
            if self._gpu_data is not None and GPU_AVAILABLE:
                self._cpu_data = cp.asnumpy(self._gpu_data)
            else:
                raise ValueError("No frame data available")
        return self._cpu_data

    @property
    def gpu(self):
        """Get frame as CuPy array (transfers to GPU if needed)."""
        if not GPU_AVAILABLE:
            raise RuntimeError("GPU not available")
        if self._gpu_data is None:
            if self._cpu_data is not None:
                self._gpu_data = cp.asarray(self._cpu_data)
            else:
                raise ValueError("No frame data available")
        return self._gpu_data

    @property
    def is_on_gpu(self) -> bool:
        """Check if data is currently on GPU."""
        return self._gpu_data is not None

    @property
    def shape(self) -> Tuple[int, ...]:
        """Get frame shape."""
        if self._gpu_data is not None:
            return self._gpu_data.shape
        return self._cpu_data.shape

    @property
    def dtype(self):
        """Get frame dtype."""
        if self._gpu_data is not None:
            return self._gpu_data.dtype
        return self._cpu_data.dtype

    def numpy(self) -> np.ndarray:
        """Alias for cpu property."""
        return self.cpu

    def cupy(self):
        """Alias for gpu property."""
        return self.gpu

    def free_cpu(self):
        """Free CPU memory (keep GPU only)."""
        if self._gpu_data is not None:
            self._cpu_data = None

    def free_gpu(self):
        """Free GPU memory (keep CPU only)."""
        if self._cpu_data is not None:
            self._gpu_data = None
|
|
|
|
|
|
class GPUVideoSource:
    """
    GPU-accelerated video source using hardware decode.

    Uses decord with CUDA GPU context for true NVDEC decode - frames
    decode directly to GPU memory via CUDA.

    Falls back to FFmpeg pipe if decord GPU unavailable (slower due to CPU copy).
    """

    def __init__(self, path: str, fps: float = 30, prefer_gpu: bool = True):
        """Open a video source.

        Args:
            path: Path to the video file.
            fps: Output sampling rate used by read_at() (not the file's own fps).
            prefer_gpu: Keep decoded frames on GPU when CUDA is available.
        """
        self.path = Path(path)
        self.fps = fps
        self.prefer_gpu = prefer_gpu and GPU_AVAILABLE
        self._use_decord_gpu = self.prefer_gpu and check_decord_gpu_available()

        # Metadata filled in by _init_video() / _probe_video()
        self._frame_size: Optional[Tuple[int, int]] = None  # (width, height)
        self._duration: Optional[float] = None  # seconds
        self._video_fps: float = 30.0  # fps of the source file
        self._total_frames: int = 0
        self._frame_time = 1.0 / fps  # output frame period in seconds
        self._last_read_time = -1  # cache key for read_at()
        self._cached_frame: Optional[GPUFrame] = None

        # Decord VideoReader with GPU context
        self._vr = None
        self._decord_ctx = None

        # FFmpeg fallback state
        self._proc = None
        self._stream_time = 0.0

        # Initialize video source
        self._init_video()

        mode = "decord-GPU" if self._use_decord_gpu else ("ffmpeg-hwaccel" if check_hwdec_available() else "ffmpeg-CPU")
        print(f"[GPUVideoSource] {self.path.name}: {self._frame_size}, "
              f"duration={self._duration:.1f}s, mode={mode}", file=sys.stderr)

    def _init_video(self):
        """Initialize video reader (decord GPU or probe for ffmpeg)."""
        if self._use_decord_gpu:
            try:
                from decord import VideoReader, gpu

                # Use GPU context for NVDEC hardware decode
                self._decord_ctx = gpu(0)
                self._vr = VideoReader(str(self.path), ctx=self._decord_ctx, num_threads=1)

                self._total_frames = len(self._vr)
                self._video_fps = self._vr.get_avg_fps()
                self._duration = self._total_frames / self._video_fps

                # Get frame size from first frame
                first_frame = self._vr[0]
                self._frame_size = (first_frame.shape[1], first_frame.shape[0])

                print(f"[GPUVideoSource] decord GPU initialized: {self._frame_size}, "
                      f"{self._total_frames} frames @ {self._video_fps:.1f}fps", file=sys.stderr)
                return
            except Exception as e:
                # Any decord failure demotes this source to the ffmpeg path.
                print(f"[GPUVideoSource] decord GPU init failed, falling back to ffmpeg: {e}", file=sys.stderr)
                self._use_decord_gpu = False
                self._vr = None
                self._decord_ctx = None

        # FFmpeg fallback - probe video for metadata
        self._probe_video()

    def _probe_video(self):
        """Probe video file for metadata (FFmpeg fallback)."""
        cmd = ["ffprobe", "-v", "quiet", "-print_format", "json",
               "-show_streams", "-show_format", str(self.path)]
        result = subprocess.run(cmd, capture_output=True, text=True)
        info = json.loads(result.stdout)

        for stream in info.get("streams", []):
            if stream.get("codec_type") == "video":
                self._frame_size = (stream.get("width", 720), stream.get("height", 720))
                if "duration" in stream:
                    self._duration = float(stream["duration"])
                elif "tags" in stream and "DURATION" in stream["tags"]:
                    # Matroska-style duration tag, "HH:MM:SS.sss"
                    dur_str = stream["tags"]["DURATION"]
                    parts = dur_str.split(":")
                    if len(parts) == 3:
                        h, m, s = parts
                        self._duration = int(h) * 3600 + int(m) * 60 + float(s)
                # Get fps
                if "r_frame_rate" in stream:
                    fps_str = stream["r_frame_rate"]
                    if "/" in fps_str:
                        num, den = fps_str.split("/")
                        self._video_fps = float(num) / float(den)
                break

        # Container-level duration as a secondary source
        if self._duration is None and "format" in info:
            if "duration" in info["format"]:
                self._duration = float(info["format"]["duration"])

        # Last-resort defaults so downstream math never sees None
        if not self._frame_size:
            self._frame_size = (720, 720)
        if not self._duration:
            self._duration = 60.0

        self._total_frames = int(self._duration * self._video_fps)

    def _start_stream(self, seek_time: float = 0):
        """Start ffmpeg decode process (fallback mode).

        Kills any previous decoder process and spawns a new one seeked to
        `seek_time`, emitting raw rgb24 frames on stdout.
        """
        if self._proc:
            self._proc.kill()
            self._proc = None

        if not self.path.exists():
            raise FileNotFoundError(f"Video file not found: {self.path}")

        w, h = self._frame_size

        # Build ffmpeg command
        cmd = ["ffmpeg", "-v", "error"]

        # Hardware decode if available
        if check_hwdec_available():
            cmd.extend(["-hwaccel", "cuda"])

        # Raw RGB frames at the requested output size/rate, written to stdout
        cmd.extend([
            "-ss", f"{seek_time:.3f}",
            "-i", str(self.path),
            "-f", "rawvideo",
            "-pix_fmt", "rgb24",
            "-s", f"{w}x{h}",
            "-r", str(self.fps),
            "-"
        ])

        self._proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        self._stream_time = seek_time

    def _read_frame_raw(self) -> Optional[np.ndarray]:
        """Read one frame from ffmpeg pipe (fallback mode).

        Returns None when the decoder process is gone or the pipe yields a
        short read (end of stream).
        """
        w, h = self._frame_size
        frame_size = w * h * 3  # rgb24: 3 bytes per pixel

        if not self._proc or self._proc.poll() is not None:
            return None

        data = self._proc.stdout.read(frame_size)
        if len(data) < frame_size:
            return None

        # copy() detaches from the read buffer and yields a writable array
        return np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy()

    def _read_frame_decord_gpu(self, frame_idx: int) -> Optional[GPUFrame]:
        """Read frame using decord with GPU context (NVDEC, zero-copy to CuPy)."""
        if self._vr is None:
            return None

        try:
            # Handle looping
            frame_idx = frame_idx % max(1, self._total_frames)

            # Decode frame - with GPU context, this uses NVDEC
            frame_tensor = self._vr[frame_idx]

            # Convert to CuPy via DLPack (zero-copy GPU transfer)
            if GPU_AVAILABLE:
                # decord tensors have .to_dlpack() which returns a PyCapsule
                # that CuPy can consume for zero-copy GPU transfer
                try:
                    dlpack_capsule = frame_tensor.to_dlpack()
                    gpu_frame = cp.from_dlpack(dlpack_capsule)
                    # Log success once per source
                    if not getattr(self, '_dlpack_logged', False):
                        print(f"[GPUVideoSource] DLPack zero-copy SUCCESS - frames stay on GPU", file=sys.stderr)
                        self._dlpack_logged = True
                    return GPUFrame(gpu_frame, on_gpu=True)
                except Exception as dlpack_err:
                    # Fallback: convert via numpy (involves CPU copy)
                    if not getattr(self, '_dlpack_fail_logged', False):
                        print(f"[GPUVideoSource] DLPack FAILED ({dlpack_err}), using CPU copy fallback", file=sys.stderr)
                        self._dlpack_fail_logged = True
                    frame_np = frame_tensor.asnumpy()
                    return GPUFrame(frame_np, on_gpu=True)
            else:
                return GPUFrame(frame_tensor.asnumpy(), on_gpu=False)

        except Exception as e:
            print(f"[GPUVideoSource] decord GPU read error at frame {frame_idx}: {e}", file=sys.stderr)
            return None

    def read_at(self, t: float) -> Optional[GPUFrame]:
        """
        Read frame at specific time.

        Returns GPUFrame with data on GPU if GPU mode enabled.

        Args:
            t: Timeline position in seconds; wrapped modulo the source
               duration so short clips loop.
        """
        # Cache check
        if t == self._last_read_time and self._cached_frame is not None:
            return self._cached_frame

        # Loop time for shorter videos
        seek_time = t
        if self._duration and self._duration > 0:
            seek_time = t % self._duration
            # Snap the last ~100ms back to the start to avoid EOF reads
            if seek_time > self._duration - 0.1:
                seek_time = 0.0

        self._last_read_time = t

        # Use decord GPU if available (NVDEC decode, zero-copy via DLPack)
        if self._use_decord_gpu:
            frame_idx = int(seek_time * self._video_fps)
            self._cached_frame = self._read_frame_decord_gpu(frame_idx)
            if self._cached_frame is not None:
                # Free CPU copy if on GPU (saves memory)
                if self.prefer_gpu and self._cached_frame.is_on_gpu:
                    self._cached_frame.free_cpu()
            return self._cached_frame

        # FFmpeg fallback: restart the pipe when the process died, when
        # seeking backwards, or when the target is far (>2s) ahead.
        need_seek = (
            self._proc is None or
            self._proc.poll() is not None or
            seek_time < self._stream_time - self._frame_time or
            seek_time > self._stream_time + 2.0
        )

        if need_seek:
            self._start_stream(seek_time)

        # Skip frames to reach target
        while self._stream_time + self._frame_time <= seek_time:
            frame = self._read_frame_raw()
            if frame is None:
                self._start_stream(seek_time)
                break
            self._stream_time += self._frame_time

        # Read target frame
        frame_np = self._read_frame_raw()
        if frame_np is None:
            # Keep serving the previous frame rather than returning nothing
            return self._cached_frame

        self._stream_time += self._frame_time

        # Create GPUFrame - transfer to GPU if in GPU mode
        self._cached_frame = GPUFrame(frame_np, on_gpu=self.prefer_gpu)

        # Free CPU copy if on GPU (saves memory)
        if self.prefer_gpu and self._cached_frame.is_on_gpu:
            self._cached_frame.free_cpu()

        return self._cached_frame

    def read(self) -> Optional[GPUFrame]:
        """Read current frame (last cached, or the frame at t=0)."""
        if self._cached_frame is not None:
            return self._cached_frame
        return self.read_at(0)

    @property
    def size(self) -> Tuple[int, int]:
        # (width, height) of decoded frames
        return self._frame_size

    @property
    def duration(self) -> float:
        # Source duration in seconds
        return self._duration

    def close(self):
        """Close the video source."""
        if self._proc:
            self._proc.kill()
            self._proc = None
        # Release decord resources
        self._vr = None
        self._decord_ctx = None
|
|
|
|
|
|
# GPU-aware primitive functions
|
|
|
|
def gpu_blend(frame_a: GPUFrame, frame_b: GPUFrame, alpha: float = 0.5) -> GPUFrame:
    """
    Blend two frames on GPU using fast CUDA kernel.

    Both frames stay on GPU throughout - no CPU transfer.
    """
    if not GPU_AVAILABLE:
        # CPU path: weighted sum in float32, then back to uint8.
        blended = (frame_a.cpu.astype(np.float32) * alpha
                   + frame_b.cpu.astype(np.float32) * (1 - alpha))
        return GPUFrame(blended.astype(np.uint8), on_gpu=False)

    # Use fast CUDA kernel
    if _FAST_KERNELS_AVAILABLE:
        def as_u8(arr):
            # Kernel expects uint8 input; clip-convert anything else.
            if arr.dtype == cp.uint8:
                return arr
            return cp.clip(arr, 0, 255).astype(cp.uint8)

        blended = fast_blend(as_u8(frame_a.gpu), as_u8(frame_b.gpu), alpha)
        return GPUFrame(blended, on_gpu=True)

    # Fallback: plain CuPy arithmetic blend.
    blended = (frame_a.gpu.astype(cp.float32) * alpha
               + frame_b.gpu.astype(cp.float32) * (1 - alpha))
    return GPUFrame(blended.astype(cp.uint8), on_gpu=True)
|
|
|
|
|
|
def gpu_resize(frame: GPUFrame, size: Tuple[int, int]) -> GPUFrame:
    """Resize frame to a target size.

    CPU frames go through cv2.resize; GPU frames use
    cupyx.scipy.ndimage.zoom (bilinear, order=1) so data never leaves
    the GPU.

    Args:
        frame: Input frame.
        size: Target (width, height).

    Returns:
        Resized GPUFrame on the same device as the input.
    """
    if not GPU_AVAILABLE or not frame.is_on_gpu:
        # Import lazily so OpenCV is only required on the CPU path
        # (previously imported unconditionally, failing GPU-only installs).
        import cv2
        resized = cv2.resize(frame.cpu, size)
        return GPUFrame(resized, on_gpu=False)

    gpu_data = frame.gpu
    h, w = gpu_data.shape[:2]
    target_w, target_h = size

    # CuPy doesn't have built-in resize, use scipy zoom
    from cupyx.scipy import ndimage as cpndimage

    zoom_y = target_h / h
    zoom_x = target_w / w

    if gpu_data.ndim == 3:
        resized = cpndimage.zoom(gpu_data, (zoom_y, zoom_x, 1), order=1)
    else:
        resized = cpndimage.zoom(gpu_data, (zoom_y, zoom_x), order=1)

    return GPUFrame(resized, on_gpu=True)
|
|
|
|
|
|
def gpu_zoom(frame: GPUFrame, factor: float, cx: float = None, cy: float = None) -> GPUFrame:
    """Zoom frame on GPU using fast CUDA kernel.

    cx/cy give the zoom center; when None the frame center is used
    (CPU path) or left for the kernel to default (GPU path).
    """
    if not GPU_AVAILABLE or not frame.is_on_gpu:
        import cv2
        h, w = frame.cpu.shape[:2]
        center = (w / 2 if cx is None else cx,
                  h / 2 if cy is None else cy)
        # Rotation angle 0 + scale `factor` = pure zoom about `center`.
        matrix = cv2.getRotationMatrix2D(center, 0, factor)
        zoomed = cv2.warpAffine(frame.cpu, matrix, (w, h))
        return GPUFrame(zoomed, on_gpu=False)

    if _FAST_KERNELS_AVAILABLE:
        return GPUFrame(fast_zoom(frame.gpu, factor, cx=cx, cy=cy), on_gpu=True)

    # No kernel available - return the frame unchanged.
    return frame
|
|
|
|
|
|
def gpu_hue_shift(frame: GPUFrame, degrees: float) -> GPUFrame:
    """Shift hue on GPU using fast CUDA kernel."""
    if not GPU_AVAILABLE or not frame.is_on_gpu:
        import cv2
        # OpenCV's 8-bit hue channel spans 0-179 (half a degree per unit),
        # hence the degrees / 2 and the modulo 180.
        hsv = cv2.cvtColor(frame.cpu, cv2.COLOR_RGB2HSV)
        hsv[:, :, 0] = (hsv[:, :, 0].astype(np.float32) + degrees / 2) % 180
        return GPUFrame(cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB), on_gpu=False)

    if _FAST_KERNELS_AVAILABLE:
        pixels = frame.gpu
        if pixels.dtype != cp.uint8:
            pixels = cp.clip(pixels, 0, 255).astype(cp.uint8)
        return GPUFrame(fast_hue_shift(pixels, degrees), on_gpu=True)

    # Fallback - no GPU hue shift without fast kernels
    return frame
|
|
|
|
|
|
def gpu_invert(frame: GPUFrame) -> GPUFrame:
    """Invert colors on GPU using fast CUDA kernel."""
    if not GPU_AVAILABLE or not frame.is_on_gpu:
        return GPUFrame(255 - frame.cpu, on_gpu=False)

    if _FAST_KERNELS_AVAILABLE:
        pixels = frame.gpu
        if pixels.dtype != cp.uint8:
            pixels = cp.clip(pixels, 0, 255).astype(cp.uint8)
        return GPUFrame(fast_invert(pixels), on_gpu=True)

    # Fallback - plain CuPy arithmetic inversion
    return GPUFrame(255 - frame.gpu, on_gpu=True)
|
|
|
|
|
|
def gpu_ripple(frame: GPUFrame, amplitude: float, frequency: float = 8,
               decay: float = 2, phase: float = 0,
               cx: float = None, cy: float = None) -> GPUFrame:
    """Apply ripple effect on GPU using fast CUDA kernel.

    Args:
        frame: Input frame (must already be on GPU; there is no CPU path).
        amplitude: Displacement strength.
        frequency: Ring frequency of the ripple.
        decay: Radial falloff parameter passed to the kernel.
        phase: Animation phase (forwarded to the kernel as time `t`).
        cx, cy: Ripple center; defaults to the frame center when None.

    Returns:
        Rippled GPUFrame, or the input unchanged when GPU/kernels are
        unavailable.
    """
    if not GPU_AVAILABLE or not frame.is_on_gpu:
        return frame  # No CPU fallback for ripple

    if _FAST_KERNELS_AVAILABLE:
        gpu_data = frame.gpu
        if gpu_data.dtype != cp.uint8:
            gpu_data = cp.clip(gpu_data, 0, 255).astype(cp.uint8)
        h, w = gpu_data.shape[:2]
        # Bug fix: compare against None so an explicit center of 0 (frame
        # edge) is honored; `cx if cx else w/2` silently replaced 0 with
        # the frame center.
        rippled = fast_ripple(
            gpu_data, amplitude,
            center_x=cx if cx is not None else w / 2,
            center_y=cy if cy is not None else h / 2,
            frequency=frequency,
            decay=decay,
            speed=1.0,
            t=phase
        )
        return GPUFrame(rippled, on_gpu=True)

    return frame
|
|
|
|
|
|
def gpu_contrast(frame: GPUFrame, factor: float) -> GPUFrame:
    """Adjust contrast on GPU using fast CUDA kernel."""
    if not GPU_AVAILABLE or not frame.is_on_gpu:
        # CPU path: scale distance from mid-gray (128), clip to uint8.
        adjusted = (frame.cpu.astype(np.float32) - 128) * factor + 128
        return GPUFrame(np.clip(adjusted, 0, 255).astype(np.uint8), on_gpu=False)

    if _FAST_KERNELS_AVAILABLE:
        pixels = frame.gpu
        if pixels.dtype != cp.uint8:
            pixels = cp.clip(pixels, 0, 255).astype(cp.uint8)
        rows, cols = pixels.shape[:2]
        ops = get_fast_ops(cols, rows)
        ops.set_input(pixels)
        ops.contrast(factor)
        # copy() keeps the returned frame independent of the ops output buffer
        return GPUFrame(ops.get_output().copy(), on_gpu=True)

    # Fallback: CuPy arithmetic
    adjusted = (frame.gpu.astype(cp.float32) - 128) * factor + 128
    return GPUFrame(cp.clip(adjusted, 0, 255).astype(cp.uint8), on_gpu=True)
|
|
|
|
|
|
def gpu_rotate(frame: GPUFrame, angle: float) -> GPUFrame:
    """Rotate frame on GPU using fast CUDA kernel."""
    if not GPU_AVAILABLE or not frame.is_on_gpu:
        import cv2
        h, w = frame.cpu.shape[:2]
        matrix = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
        rotated = cv2.warpAffine(frame.cpu, matrix, (w, h))
        return GPUFrame(rotated, on_gpu=False)

    # Use fast CUDA kernel (< 1ms vs 20ms for scipy)
    if _FAST_KERNELS_AVAILABLE:
        return GPUFrame(fast_rotate(frame.gpu, angle), on_gpu=True)

    # Fallback to scipy (slow)
    from cupyx.scipy import ndimage as cpndimage
    rotated = cpndimage.rotate(frame.gpu, angle, reshape=False, order=1)
    return GPUFrame(rotated, on_gpu=True)
|
|
|
|
|
|
def gpu_brightness(frame: GPUFrame, factor: float) -> GPUFrame:
    """Adjust brightness on GPU using fast CUDA kernel."""
    if not GPU_AVAILABLE or not frame.is_on_gpu:
        # CPU path: multiplicative gain, clipped to the uint8 range.
        scaled = frame.cpu.astype(np.float32) * factor
        return GPUFrame(np.clip(scaled, 0, 255).astype(np.uint8), on_gpu=False)

    # Use fast CUDA kernel
    if _FAST_KERNELS_AVAILABLE:
        pixels = frame.gpu
        if pixels.dtype != cp.uint8:
            pixels = cp.clip(pixels, 0, 255).astype(cp.uint8)
        rows, cols = pixels.shape[:2]
        ops = get_fast_ops(cols, rows)
        ops.set_input(pixels)
        ops.brightness(factor)
        # copy() keeps the returned frame independent of the ops output buffer
        return GPUFrame(ops.get_output().copy(), on_gpu=True)

    # Fallback: CuPy arithmetic
    scaled = frame.gpu.astype(cp.float32) * factor
    return GPUFrame(cp.clip(scaled, 0, 255).astype(cp.uint8), on_gpu=True)
|
|
|
|
|
|
def gpu_composite(frames: list, weights: list = None) -> GPUFrame:
    """
    Composite multiple frames with weights.

    All frames processed on GPU for efficiency.

    Raises:
        ValueError: If `frames` is empty.
    """
    if not frames:
        raise ValueError("No frames to composite")
    if len(frames) == 1:
        return frames[0]

    # Default to a uniform mix, then normalize so weights sum to 1.
    if weights is None:
        weights = [1.0 / len(frames)] * len(frames)
    total = sum(weights)
    if total > 0:
        weights = [w / total for w in weights]

    if GPU_AVAILABLE and any(f.is_on_gpu for f in frames):
        # GPU path: float32 accumulator sized to the first frame.
        from cupyx.scipy import ndimage as cpndimage
        target_shape = frames[0].gpu.shape
        h, w = target_shape[:2]
        acc = cp.zeros(target_shape, dtype=cp.float32)

        for frm, weight in zip(frames, weights):
            data = frm.gpu.astype(cp.float32)
            if data.shape != target_shape:
                # Resize mismatched frames to the accumulator shape
                fh, fw = data.shape[:2]
                factors = (h / fh, w / fw, 1) if data.ndim == 3 else (h / fh, w / fw)
                data = cpndimage.zoom(data, factors, order=1)
            acc += data * weight

        return GPUFrame(cp.clip(acc, 0, 255).astype(cp.uint8), on_gpu=True)

    # CPU path
    import cv2
    target_shape = frames[0].cpu.shape
    acc = np.zeros(target_shape, dtype=np.float32)

    for frm, weight in zip(frames, weights):
        data = frm.cpu.astype(np.float32)
        if data.shape != target_shape:
            data = cv2.resize(data, (target_shape[1], target_shape[0]))
        acc += data * weight

    return GPUFrame(np.clip(acc, 0, 255).astype(np.uint8), on_gpu=False)
|
|
|
|
|
|
# Primitive registration for streaming interpreter
|
|
|
|
def _to_gpu_frame(img):
    """Convert any image type to GPUFrame, keeping data on GPU if possible."""
    if isinstance(img, GPUFrame):
        return img
    # CuPy arrays (exposing __cuda_array_interface__) are wrapped without a
    # transfer; numpy and other array-likes are uploaded by GPUFrame itself.
    # Both cases request GPU placement, so a single call covers them.
    return GPUFrame(img, on_gpu=True)
|
|
|
|
|
|
def get_primitives():
    """
    Get GPU-aware primitives for registration with interpreter.

    These wrap the GPU functions to work with the sexp interpreter.
    All use fast CUDA kernels when available for maximum performance.

    Primitives detect CuPy arrays and keep them on GPU (no CPU round-trips).

    Returns:
        Dict mapping interpreter primitive names to callables.
    """
    # Each wrapper converts the interpreter's value into a GPUFrame, applies
    # the GPU op, then unwraps back to a raw array (CuPy when the result
    # stayed on GPU, numpy otherwise) so the interpreter never handles
    # GPUFrame objects directly.

    def prim_make_video_source_gpu(path: str, fps: float = 30):
        """Create GPU-accelerated video source."""
        return GPUVideoSource(path, fps, prefer_gpu=True)

    def prim_gpu_blend(a, b, alpha=0.5):
        """Blend two frames using fast CUDA kernel."""
        fa = _to_gpu_frame(a)
        fb = _to_gpu_frame(b)
        result = gpu_blend(fa, fb, alpha)
        return result.gpu if result.is_on_gpu else result.cpu

    def prim_gpu_rotate(img, angle):
        """Rotate image using fast CUDA kernel (< 1ms)."""
        f = _to_gpu_frame(img)
        result = gpu_rotate(f, angle)
        return result.gpu if result.is_on_gpu else result.cpu

    def prim_gpu_brightness(img, factor):
        """Adjust brightness using fast CUDA kernel."""
        f = _to_gpu_frame(img)
        result = gpu_brightness(f, factor)
        return result.gpu if result.is_on_gpu else result.cpu

    def prim_gpu_contrast(img, factor):
        """Adjust contrast using fast CUDA kernel."""
        f = _to_gpu_frame(img)
        result = gpu_contrast(f, factor)
        return result.gpu if result.is_on_gpu else result.cpu

    def prim_gpu_zoom(img, factor, cx=None, cy=None):
        """Zoom image using fast CUDA kernel."""
        f = _to_gpu_frame(img)
        result = gpu_zoom(f, factor, cx, cy)
        return result.gpu if result.is_on_gpu else result.cpu

    def prim_gpu_hue_shift(img, degrees):
        """Shift hue using fast CUDA kernel."""
        f = _to_gpu_frame(img)
        result = gpu_hue_shift(f, degrees)
        return result.gpu if result.is_on_gpu else result.cpu

    def prim_gpu_invert(img):
        """Invert colors using fast CUDA kernel."""
        f = _to_gpu_frame(img)
        result = gpu_invert(f)
        return result.gpu if result.is_on_gpu else result.cpu

    def prim_gpu_ripple(img, amplitude, frequency=8, decay=2, phase=0, cx=None, cy=None):
        """Apply ripple effect using fast CUDA kernel."""
        f = _to_gpu_frame(img)
        result = gpu_ripple(f, amplitude, frequency, decay, phase, cx, cy)
        return result.gpu if result.is_on_gpu else result.cpu

    # Interpreter-visible names -> wrappers. The key strings are the public
    # contract with sexp programs; do not rename them.
    return {
        'streaming-gpu:make-video-source': prim_make_video_source_gpu,
        'gpu:blend': prim_gpu_blend,
        'gpu:rotate': prim_gpu_rotate,
        'gpu:brightness': prim_gpu_brightness,
        'gpu:contrast': prim_gpu_contrast,
        'gpu:zoom': prim_gpu_zoom,
        'gpu:hue-shift': prim_gpu_hue_shift,
        'gpu:invert': prim_gpu_invert,
        'gpu:ripple': prim_gpu_ripple,
    }
|
|
|
|
|
|
# Export
|
|
# Public API of this module. NOTE: 'PRIMITIVES' is defined near the bottom
# of the file (it merges the CPU primitive table with the GPU ones).
__all__ = [
    'GPU_AVAILABLE',
    'GPUFrame',
    'GPUVideoSource',
    'gpu_blend',
    'gpu_resize',
    'gpu_rotate',
    'gpu_brightness',
    'gpu_contrast',
    'gpu_zoom',
    'gpu_hue_shift',
    'gpu_invert',
    'gpu_ripple',
    'gpu_composite',
    'get_primitives',
    'check_hwdec_available',
    'PRIMITIVES',
]
|
|
|
|
|
|
# Import CPU primitives from streaming.py and include them in PRIMITIVES
|
|
# This ensures audio analysis primitives are available when streaming_gpu is loaded
|
|
def _get_cpu_primitives():
    """Fetch the CPU primitive table from the streaming module.

    Imported lazily inside the function to avoid a circular import at
    module load time.
    """
    from sexp_effects.primitive_libs import streaming as _streaming
    return _streaming.PRIMITIVES
|
|
|
|
|
|
# Seed the combined table from the CPU primitives; .copy() so later GPU
# additions don't mutate the streaming module's own dict.
PRIMITIVES = _get_cpu_primitives().copy()
|
|
|
|
# Try to import fused kernel compiler
|
|
_FUSED_KERNELS_AVAILABLE = False
|
|
_compile_frame_pipeline = None
|
|
_compile_autonomous_pipeline = None
|
|
try:
|
|
if GPU_AVAILABLE:
|
|
from streaming.sexp_to_cuda import compile_frame_pipeline as _compile_frame_pipeline
|
|
from streaming.sexp_to_cuda import compile_autonomous_pipeline as _compile_autonomous_pipeline
|
|
_FUSED_KERNELS_AVAILABLE = True
|
|
print("[streaming_gpu] Fused CUDA kernel compiler loaded", file=sys.stderr)
|
|
except ImportError as e:
|
|
print(f"[streaming_gpu] Fused kernels not available: {e}", file=sys.stderr)
|
|
|
|
|
|
# Fused pipeline cache
|
|
_FUSED_PIPELINE_CACHE = {}
|
|
|
|
|
|
def _normalize_effect_dict(effect):
|
|
"""Convert effect dict with Keyword keys to string keys."""
|
|
result = {}
|
|
for k, v in effect.items():
|
|
# Handle Keyword objects from sexp parser
|
|
if hasattr(k, 'name'): # Keyword object
|
|
key = k.name
|
|
else:
|
|
key = str(k)
|
|
result[key] = v
|
|
return result
|
|
|
|
|
|
def prim_fused_pipeline(img, effects_list, **dynamic_params):
    """
    Apply a fused CUDA kernel pipeline to an image.

    This compiles multiple effects into a single CUDA kernel that processes
    the entire pipeline in one GPU pass, eliminating Python interpreter overhead.

    Args:
        img: Input image (GPU array or numpy array)
        effects_list: List of effect dicts like:
            [{'op': 'rotate', 'angle': 45.0},
             {'op': 'hue_shift', 'degrees': 90.0},
             {'op': 'ripple', 'amplitude': 10, ...}]
        **dynamic_params: Parameters that change per-frame like:
            rotate_angle=45, ripple_phase=0.5

    Returns:
        Processed image as GPU array

    Raises:
        ValueError: If `img` has no shape attribute (fused path only).

    Supported ops: rotate, zoom, ripple, invert, hue_shift, brightness
    """
    # Normalize effects list - convert Keyword keys to strings
    effects_list = [_normalize_effect_dict(e) for e in effects_list]

    if not _FUSED_KERNELS_AVAILABLE:
        # Fallback: apply effects one by one
        # Wrap in GPUFrame if needed (GPU functions expect GPUFrame objects)
        if isinstance(img, GPUFrame):
            result = img
        else:
            on_gpu = hasattr(img, '__cuda_array_interface__')
            result = GPUFrame(img, on_gpu=on_gpu)
        for effect in effects_list:
            op = effect['op']
            if op == 'rotate':
                angle = dynamic_params.get('rotate_angle', effect.get('angle', 0))
                result = gpu_rotate(result, angle)
            elif op == 'zoom':
                amount = dynamic_params.get('zoom_amount', effect.get('amount', 1.0))
                result = gpu_zoom(result, amount)
            elif op == 'hue_shift':
                degrees = effect.get('degrees', 0)
                result = gpu_hue_shift(result, degrees)
            elif op == 'ripple':
                result = gpu_ripple(result,
                                    amplitude=effect.get('amplitude', 10),
                                    frequency=effect.get('frequency', 8),
                                    decay=effect.get('decay', 2),
                                    phase=dynamic_params.get('ripple_phase', effect.get('phase', 0)),
                                    cx=effect.get('center_x'),
                                    cy=effect.get('center_y'))
            elif op == 'brightness':
                # Bug fix: this previously called gpu_contrast(result, factor, 0),
                # which used the wrong effect AND passed an extra argument
                # (gpu_contrast takes only frame and factor -> TypeError).
                # Also honor the per-frame 'brightness_factor' dynamic param,
                # matching how rotate/zoom/ripple handle theirs.
                factor = dynamic_params.get('brightness_factor', effect.get('factor', 1.0))
                result = gpu_brightness(result, factor)
            elif op == 'invert':
                result = gpu_invert(result)
        return result

    # Get image dimensions
    if hasattr(img, 'shape'):
        h, w = img.shape[:2]
    else:
        raise ValueError("Image must have shape attribute")

    # Create cache key from effects (src2 values excluded - they are inputs,
    # not part of the compiled kernel's shape)
    import hashlib
    ops_key = str([(e['op'], {k: v for k, v in e.items() if k != 'src2'}) for e in effects_list])
    cache_key = f"{w}x{h}_{hashlib.md5(ops_key.encode()).hexdigest()}"

    # Compile or get cached pipeline
    if cache_key not in _FUSED_PIPELINE_CACHE:
        _FUSED_PIPELINE_CACHE[cache_key] = _compile_frame_pipeline(effects_list, w, h)

    pipeline = _FUSED_PIPELINE_CACHE[cache_key]

    # Ensure image is on GPU and uint8
    if hasattr(img, '__cuda_array_interface__'):
        gpu_img = img
    elif GPU_AVAILABLE:
        gpu_img = cp.asarray(img)
    else:
        gpu_img = img

    # Run the fused pipeline
    return pipeline(gpu_img, **dynamic_params)
|
|
|
|
|
|
# Autonomous pipeline cache (separate from fused)
|
|
_AUTONOMOUS_PIPELINE_CACHE = {}
|
|
|
|
|
|
def prim_autonomous_pipeline(img, effects_list, dynamic_expressions, frame_num, fps=30.0):
    """
    Apply a fully autonomous CUDA kernel pipeline.

    This computes ALL parameters on GPU - including time-based expressions
    like sin(t), t*30, etc. Zero Python in the hot path!

    Args:
        img: Input image (GPU array or numpy array)
        effects_list: List of effect dicts
        dynamic_expressions: Dict mapping param names to CUDA expressions:
            {'rotate_angle': 't * 30.0f',
             'ripple_phase': 't * 2.0f',
             'brightness_factor': '0.8f + 0.4f * sinf(t * 2.0f)'}
        frame_num: Current frame number
        fps: Frames per second (default 30)

    Returns:
        Processed image as GPU array

    Raises:
        ValueError: If `img` has no shape attribute (autonomous path only).

    Note: Expressions use CUDA syntax - use sinf() not sin(), etc.
    """
    # Normalize effects and expressions
    effects_list = [_normalize_effect_dict(e) for e in effects_list]
    dynamic_expressions = {
        (k.name if hasattr(k, 'name') else str(k)): v
        for k, v in dynamic_expressions.items()
    }

    if not _FUSED_KERNELS_AVAILABLE or _compile_autonomous_pipeline is None:
        # Fallback to regular fused pipeline with Python-computed params
        import math
        import re
        t = float(frame_num) / float(fps)
        # Translate each CUDA expression to Python and evaluate it here.
        dynamic_params = {}
        for key, expr in dynamic_expressions.items():
            # Strip the 'f' suffix from float literals (2.0f -> 2.0) and map
            # sinf/cosf (or bare sin/cos) to math.*. The previous code did
            # expr.replace('f', ''), which also mangled identifiers such as
            # frame_num -> rame_num, raising a NameError that the bare
            # except silently turned into 0.
            py_expr = re.sub(r'(\d+(?:\.\d+)?)f\b', r'\1', expr)
            py_expr = re.sub(r'\b(sin|cos)f?\(', r'math.\1(', py_expr)
            try:
                # SECURITY NOTE: eval of pipeline-supplied expressions -
                # only feed trusted effect definitions through here.
                dynamic_params[key] = eval(py_expr,
                                           {'t': t, 'math': math, 'frame_num': frame_num})
            except Exception:
                # Unparseable expression: fall back to a neutral 0.
                dynamic_params[key] = 0
        return prim_fused_pipeline(img, effects_list, **dynamic_params)

    # Get image dimensions
    if hasattr(img, 'shape'):
        h, w = img.shape[:2]
    else:
        raise ValueError("Image must have shape attribute")

    # Create cache key (effects + expressions both shape the compiled kernel)
    import hashlib
    ops_key = str([(e['op'], {k: v for k, v in e.items() if k != 'src2'}) for e in effects_list])
    expr_key = str(sorted(dynamic_expressions.items()))
    cache_key = f"auto_{w}x{h}_{hashlib.md5((ops_key + expr_key).encode()).hexdigest()}"

    # Compile or get cached pipeline
    if cache_key not in _AUTONOMOUS_PIPELINE_CACHE:
        _AUTONOMOUS_PIPELINE_CACHE[cache_key] = _compile_autonomous_pipeline(
            effects_list, w, h, dynamic_expressions)

    pipeline = _AUTONOMOUS_PIPELINE_CACHE[cache_key]

    # Ensure image is on GPU
    if hasattr(img, '__cuda_array_interface__'):
        gpu_img = img
    elif GPU_AVAILABLE:
        gpu_img = cp.asarray(img)
    else:
        gpu_img = img

    # Run - just pass frame_num and fps, kernel does the rest!
    return pipeline(gpu_img, int(frame_num), float(fps))
|
|
|
|
|
|
# ============================================================
|
|
# GPU Image Primitives (keep images on GPU)
|
|
# ============================================================
|
|
|
|
def gpu_make_image(w, h, color=None):
    """Create a new image filled with *color* (default black).

    Unlike image:make-image, this keeps the image on GPU memory,
    avoiding CPU<->GPU transfers in the pipeline.  Falls back to a
    NumPy array when no GPU is available.

    Args:
        w, h: Image dimensions in pixels (coerced to int).
        color: Optional [r, g, b] sequence; missing channels default to 0.

    Returns:
        (h, w, 3) uint8 array - CuPy when GPU_AVAILABLE, else NumPy.
    """
    w, h = int(w), int(h)
    if color is None:
        color = [0, 0, 0]

    # Interpret the color identically on both paths: short sequences are
    # zero-padded and channel values coerced to int.  (The CPU fallback
    # previously did `img[:] = color`, which raised whenever
    # len(color) != 3, unlike the GPU path.)
    r = int(color[0]) if len(color) > 0 else 0
    g = int(color[1]) if len(color) > 1 else 0
    b = int(color[2]) if len(color) > 2 else 0

    # np is imported at module top, so no local import is needed.
    xp = cp if GPU_AVAILABLE else np
    img = xp.zeros((h, w, 3), dtype=xp.uint8)
    img[:, :, 0] = r
    img[:, :, 1] = g
    img[:, :, 2] = b

    return img
|
|
|
|
|
|
def gpu_gradient_image(w, h, color1=None, color2=None, direction='horizontal'):
    """Create a gradient image on GPU (or CPU when no GPU is available).

    Args:
        w, h: Dimensions
        color1, color2: Start and end colors [r, g, b]
            (defaults: black and white)
        direction: 'horizontal', 'vertical', 'diagonal'

    Returns:
        (h, w, 3) uint8 array - CuPy when GPU_AVAILABLE, else NumPy.
        An unrecognized direction yields a black image (prior behavior).
    """
    w, h = int(w), int(h)
    if color1 is None:
        color1 = [0, 0, 0]
    if color2 is None:
        color2 = [255, 255, 255]

    # The vectorized gradient code is API-compatible between CuPy and NumPy,
    # so run it on whichever backend is available.  (Previously the CPU
    # fallback ignored color2/direction entirely and returned a solid
    # color1 fill, contradicting this function's contract.)
    xp = cp if GPU_AVAILABLE else np
    img = xp.zeros((h, w, 3), dtype=xp.uint8)

    if direction == 'horizontal':
        for c in range(3):
            grad = xp.linspace(color1[c], color2[c], w, dtype=xp.float32)
            img[:, :, c] = grad[xp.newaxis, :].astype(xp.uint8)
    elif direction == 'vertical':
        for c in range(3):
            grad = xp.linspace(color1[c], color2[c], h, dtype=xp.float32)
            img[:, :, c] = grad[:, xp.newaxis].astype(xp.uint8)
    elif direction == 'diagonal':
        # The normalized ramp is channel-independent; compute it once.
        x_grad = xp.linspace(0, 1, w, dtype=xp.float32)[xp.newaxis, :]
        y_grad = xp.linspace(0, 1, h, dtype=xp.float32)[:, xp.newaxis]
        combined = (x_grad + y_grad) / 2
        for c in range(3):
            img[:, :, c] = (color1[c] + (color2[c] - color1[c]) * combined).astype(xp.uint8)

    return img
|
|
|
|
|
|
# Add GPU-specific primitives
# Register the GPU pipeline/image helpers under their hyphenated primitive
# names so callers can dispatch by string.
# NOTE(review): PRIMITIVES is assumed to be the module-level primitive
# registry defined earlier in this file — not visible in this chunk; confirm.
PRIMITIVES['fused-pipeline'] = prim_fused_pipeline
PRIMITIVES['autonomous-pipeline'] = prim_autonomous_pipeline
PRIMITIVES['gpu-make-image'] = gpu_make_image
PRIMITIVES['gpu-gradient'] = gpu_gradient_image
# (The GPU video source will be added by create_cid_primitives in the task)
|