# celery/sexp_effects/primitive_libs/streaming.py

"""
Streaming primitives for video/audio processing.

These primitives handle video source reading and audio analysis,
keeping the interpreter completely generic.

GPU Acceleration:
- Set STREAMING_GPU_PERSIST=1 to output CuPy arrays (frames stay on GPU)
- Hardware video decoding (NVDEC) is used when available
- Dramatically improves performance on GPU nodes

Async Prefetching:
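- Background frame prefetching is enabled by default (STREAMING_PREFETCH=1);
  set STREAMING_PREFETCH to any other value (e.g. 0) to disable it
- Set STREAMING_PREFETCH_SIZE to change how many frames are buffered ahead
  (default 10)
- Decodes upcoming frames while current frame is being processed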
"""

import json
import os
import select
import subprocess
import sys
import threading
import time
from collections import deque
from pathlib import Path

import numpy as np

# CuPy is optional: when it cannot be imported, ``cp`` stays None and every
# GPU code path is switched off.
try:
    import cupy as cp
except ImportError:
    cp = None
CUPY_AVAILABLE = cp is not None

# GPU persistence mode - frames are handed out as CuPy arrays instead of numpy.
# Disabled by default until all primitives support GPU frames; it can only be
# switched on when CuPy imported successfully.
GPU_PERSIST = CUPY_AVAILABLE and os.getenv("STREAMING_GPU_PERSIST", "0") == "1"

# Async prefetch mode - frames are decoded ahead of time in a background thread.
# On unless STREAMING_PREFETCH is set to something other than "1".
PREFETCH_ENABLED = os.getenv("STREAMING_PREFETCH", "1") == "1"
PREFETCH_BUFFER_SIZE = int(os.getenv("STREAMING_PREFETCH_SIZE", "10"))

# Cached answer of the hardware-decode probe (None means "not probed yet")
_HWDEC_AVAILABLE = None


def _check_hwdec():
    """Report whether NVIDIA hardware decoding can be used.

    The probe runs at most once per process; its outcome is memoised in the
    module-level ``_HWDEC_AVAILABLE``. Hardware decode counts as available
    only when ``nvidia-smi`` exits successfully and ``ffmpeg -hwaccels``
    lists ``cuda``. Any failure while probing (missing binary, timeout, ...)
    is treated as "not available".
    """
    global _HWDEC_AVAILABLE
    if _HWDEC_AVAILABLE is None:
        available = False
        try:
            smi = subprocess.run(["nvidia-smi"], capture_output=True, timeout=2)
            if smi.returncode == 0:
                hwaccels = subprocess.run(
                    ["ffmpeg", "-hwaccels"], capture_output=True, text=True, timeout=5
                )
                available = "cuda" in hwaccels.stdout
        except Exception:
            # Best-effort probe: anything going wrong simply means no hwdec.
            available = False
        _HWDEC_AVAILABLE = available

    return _HWDEC_AVAILABLE


class VideoSource:
    """Video source with persistent streaming pipe for fast sequential reads.

    Frame size and duration are probed once with ``ffprobe``. Frames are then
    decoded by a long-lived ``ffmpeg`` child process that writes raw RGB24
    frames to a pipe: sequential reads just consume the pipe, while backward
    jumps (more than one frame) or forward jumps (more than two seconds)
    restart ``ffmpeg`` at the requested position. Requested times wrap around
    the probed duration, so the source loops.

    The object holds unsynchronised mutable state (the child process and the
    stream position), so callers sharing one instance between threads must
    serialise access themselves.
    """

    def __init__(self, path: str, fps: float = 30):
        """Probe *path* and prepare to stream it at *fps* output frames/second.

        Raises:
            ValueError: if *fps* is not a positive number.
            FileNotFoundError: if *path* does not exist.
            RuntimeError: if ffprobe fails or prints invalid JSON.
        """
        if not fps > 0:
            raise ValueError(f"fps must be positive, got {fps!r}")

        self.path = Path(path)
        self.fps = fps  # Output fps for the stream
        self._frame_size = None
        self._duration = None
        self._proc = None  # Persistent ffmpeg process
        self._stream_time = 0.0  # Current position in stream
        self._frame_time = 1.0 / fps  # Time per frame at output fps
        self._last_read_time = -1
        self._cached_frame = None

        # Check if file exists
        if not self.path.exists():
            raise FileNotFoundError(f"Video file not found: {self.path}")

        # Get video info
        cmd = ["ffprobe", "-v", "quiet", "-print_format", "json",
               "-show_streams", str(self.path)]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"Failed to probe video '{self.path}': {result.stderr}")
        try:
            info = json.loads(result.stdout)
        except json.JSONDecodeError as exc:
            raise RuntimeError(f"Invalid video file or ffprobe failed: {self.path}") from exc

        # Only the first video stream is considered.
        for stream in info.get("streams", []):
            if stream.get("codec_type") == "video":
                self._frame_size = (stream.get("width", 720), stream.get("height", 720))
                # Try direct duration field first
                self._duration = self._parse_duration(stream.get("duration"))
                # Fall back to tags.DURATION (webm format: "00:01:00.124000000")
                if self._duration is None:
                    tags = stream.get("tags") or {}
                    self._duration = self._parse_duration(tags.get("DURATION"))
                break

        # Fallback: check format duration if stream duration not found
        if self._duration is None:
            fmt = info.get("format") or {}
            self._duration = self._parse_duration(fmt.get("duration"))

        if not self._frame_size:
            self._frame_size = (720, 720)

        print(f"VideoSource: {self.path.name} duration={self._duration} size={self._frame_size}", file=sys.stderr)

    @staticmethod
    def _parse_duration(value):
        """Convert an ffprobe duration to seconds, or None if it is unusable.

        Accepts plain seconds (``"12.5"``) and ``HH:MM:SS.fff`` strings.
        Unparsable values such as ``"N/A"`` yield None instead of raising so
        that the caller can try the next duration source.
        """
        if value is None:
            return None
        text = str(value).strip()
        try:
            if ":" in text:
                parts = text.split(":")
                if len(parts) != 3:
                    return None
                h, m, s = parts
                return int(h) * 3600 + int(m) * 60 + float(s)
            return float(text)
        except ValueError:
            return None

    def _stop_stream(self):
        """Kill the current ffmpeg process (if any), reap it and close its pipes."""
        proc, self._proc = self._proc, None
        if proc is None:
            return
        try:
            proc.kill()
        except OSError:
            pass  # Process already gone
        try:
            # Reap the child so repeated seeks do not accumulate zombies.
            proc.wait(timeout=1.0)
        except subprocess.TimeoutExpired:
            pass
        for pipe in (proc.stdout, proc.stderr):
            if pipe is not None:
                try:
                    pipe.close()
                except OSError:
                    pass

    def _read_stderr(self, timeout: float, wake_on_output: bool = False) -> str:
        """Return ffmpeg stderr text that becomes available within *timeout*.

        Returns an empty string when nothing was written. With
        *wake_on_output* the wait also ends as soon as ffmpeg produces frame
        data on stdout, so a healthy stream is not delayed by the full
        timeout. This is best-effort diagnostics: select/read failures are
        swallowed.
        """
        proc = self._proc
        if proc is None or proc.stderr is None:
            return ""
        watched = [proc.stderr]
        if wake_on_output and proc.stdout is not None:
            watched.append(proc.stdout)
        try:
            readable, _, _ = select.select(watched, [], [], timeout)
            if proc.stderr not in readable:
                return ""
            # read1() issues at most one raw read. A plain read(4096) on the
            # buffered pipe would block until 4096 bytes or EOF, hanging on a
            # short message from a still-running ffmpeg.
            return proc.stderr.read1(4096).decode('utf-8', errors='ignore')
        except (OSError, ValueError):
            return ""

    def _start_stream(self, seek_time: float = 0):
        """Start or restart the ffmpeg streaming process at *seek_time*.

        Uses NVIDIA hardware decoding (NVDEC) when available for better performance.

        Raises:
            FileNotFoundError: if the video file has disappeared.
        """
        self._stop_stream()

        # Check file exists before trying to open
        if not self.path.exists():
            raise FileNotFoundError(f"Video file not found: {self.path}")

        w, h = self._frame_size

        # Build ffmpeg command with optional hardware decode
        cmd = ["ffmpeg", "-v", "error"]

        # Use hardware decode if available (significantly faster)
        if _check_hwdec():
            cmd.extend(["-hwaccel", "cuda"])

        cmd.extend([
            "-ss", f"{seek_time:.3f}",
            "-i", str(self.path),
            "-f", "rawvideo", "-pix_fmt", "rgb24",
            "-s", f"{w}x{h}",
            "-r", str(self.fps),  # Output at specified fps
            "-"
        ])

        self._proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        self._stream_time = seek_time

        # Surface an early ffmpeg error, but stop waiting as soon as frame
        # data shows up on stdout.
        err = self._read_stderr(0.5, wake_on_output=True)
        if err:
            print(f"ffmpeg error for {self.path.name}: {err}", file=sys.stderr)

    def _read_frame_from_stream(self):
        """Read one frame from the stream.

        Returns CuPy array if GPU_PERSIST is enabled, numpy array otherwise,
        or None when no complete frame could be read (no process, end of
        stream, or the pipe was closed).
        """
        proc = self._proc
        if proc is None or proc.stdout is None:
            return None

        w, h = self._frame_size
        frame_size = w * h * 3

        # No poll() check here: an ffmpeg that already exited may still have
        # complete frames sitting in the pipe, and at EOF read() simply
        # returns short data.
        try:
            data = proc.stdout.read(frame_size)
        except (OSError, ValueError):
            return None  # Pipe closed underneath us
        if len(data) < frame_size:
            return None

        # copy() detaches the frame from the read-only bytes buffer
        frame = np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy()

        # Transfer to GPU if persistence mode enabled
        if GPU_PERSIST:
            return cp.asarray(frame)
        return frame

    def read(self) -> np.ndarray:
        """Read frame (uses last cached or t=0)."""
        if self._cached_frame is not None:
            return self._cached_frame
        return self.read_at(0)

    def _loop_time(self, t: float) -> float:
        """Map *t* onto the video timeline, looping when the duration is known."""
        duration = self._duration
        if not duration or duration <= 0:
            return t
        seek_time = t % duration
        # If we're within 0.1s of the end, wrap to beginning to avoid EOF issues
        if seek_time > duration - 0.1:
            return 0.0
        return seek_time

    def _seek_reason(self, seek_time: float):
        """Return why the stream must be restarted for *seek_time*, or None.

        A restart is needed when there is no live ffmpeg process, when the
        target lies more than one frame behind the stream position (the small
        tolerance absorbs floating point and timing jitter), or when it lies
        more than two seconds ahead.
        """
        if self._proc is None:
            return "no proc"
        if self._proc.poll() is not None:
            return "proc dead"
        if seek_time < self._stream_time - self._frame_time:
            return "backward"
        if seek_time > self._stream_time + 2.0:
            return "jump"
        return None

    def read_at(self, t: float) -> np.ndarray:
        """Read frame at specific time using streaming with smart seeking.

        Repeated calls with the same *t* return the cached frame.

        Raises:
            RuntimeError: if no frame could be decoded after three attempts.
        """
        # Cache check - return same frame for same time
        if t == self._last_read_time and self._cached_frame is not None:
            return self._cached_frame

        seek_time = self._loop_time(t)

        # Decide whether to seek or continue streaming
        reason = self._seek_reason(seek_time)
        if reason is not None:
            print(f"SEEK {self.path.name}: t={t:.4f} seek={seek_time:.4f} stream={self._stream_time:.4f} ({reason})", file=sys.stderr)
            self._start_stream(seek_time)

        # Skip frames to reach target time
        while self._stream_time + self._frame_time <= seek_time:
            if self._read_frame_from_stream() is None:
                # Stream ended or failed - restart directly at the target.
                # _start_stream() moves _stream_time to seek_time, so there is
                # nothing left to skip afterwards.
                self._start_stream(seek_time)
                time.sleep(0.05)
                break
            self._stream_time += self._frame_time

        # Read the target frame with retry logic
        frame = None
        max_retries = 3
        for attempt in range(max_retries):
            frame = self._read_frame_from_stream()
            if frame is not None:
                break

            # Stream failed - try restarting
            print(f"RETRY {self.path.name}: attempt {attempt+1}/{max_retries} at t={t:.2f}", file=sys.stderr)

            # Check for ffmpeg errors
            err = self._read_stderr(0.1)
            if err:
                print(f"ffmpeg error: {err}", file=sys.stderr)

            # Wait a bit and restart
            time.sleep(0.1)
            self._start_stream(seek_time)

            # Give ffmpeg time to start
            time.sleep(0.1)

        if frame is None:
            raise RuntimeError(f"Failed to read video frame from {self.path.name} at t={t:.2f} after {max_retries} retries")

        self._stream_time += self._frame_time
        self._last_read_time = t
        self._cached_frame = frame
        return frame

    def skip(self):
        """No-op for seek-based reading."""
        pass

    @property
    def size(self):
        """The (width, height) of decoded frames."""
        return self._frame_size

    def close(self):
        """Stop the ffmpeg process; the source restarts it on the next read."""
        self._stop_stream()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()


class PrefetchingVideoSource:
    """
    Video source with background prefetching for improved performance.

    Wraps VideoSource and adds a background thread that pre-decodes
    upcoming frames while the main thread processes the current frame.

    Decoded frames are cached by integer frame index (``round(t * fps)``), so
    a request within half a frame of a cached time is a hit. The wrapped
    VideoSource has a single ffmpeg pipe and stream position, so every call
    into it (from the worker or from a caller that misses the cache) is
    serialised with ``_source_lock``.
    """

    def __init__(self, path: str, fps: float = 30, buffer_size: int = None):
        """Open *path* and start the prefetch thread.

        Args:
            path: Video file to read.
            fps: Output frame rate passed to the wrapped VideoSource.
            buffer_size: Frames to keep decoded ahead of the last requested
                time; falls back to PREFETCH_BUFFER_SIZE when not given.
        """
        self._source = VideoSource(path, fps)
        self._buffer_size = buffer_size or PREFETCH_BUFFER_SIZE
        self._buffer = {}  # frame index -> frame
        self._buffer_lock = threading.Lock()
        # Guards every call into self._source. Lock order: _source_lock
        # first, then _buffer_lock.
        self._source_lock = threading.Lock()
        self._prefetch_time = 0.0  # Unused; kept for backward compatibility
        self._frame_time = 1.0 / fps
        self._stop_event = threading.Event()
        self._request_event = threading.Event()
        self._target_time = 0.0
        self._last_frame = None  # Most recent frame handed to a caller

        # Start prefetch thread
        self._thread = threading.Thread(target=self._prefetch_loop, daemon=True)
        self._thread.start()

        print(f"PrefetchingVideoSource: {path} buffer_size={self._buffer_size}", file=sys.stderr)

    def _frame_index(self, t: float) -> int:
        """Return the index of the output frame nearest to time *t*."""
        return int(round(t / self._frame_time))

    def _next_prefetch_index(self, target_time: float):
        """Trim the buffer around *target_time* and pick the next frame to decode.

        Keeps frames from one second behind the target up to ``buffer_size``
        frames ahead of it and drops everything else, so leftovers from an
        earlier position can neither pile up nor be mistaken for look-ahead.

        Returns the first missing frame index in the look-ahead window, or
        None when the window is fully buffered.
        """
        target_idx = self._frame_index(target_time)
        frames_per_second = max(1, int(round(1.0 / self._frame_time)))
        lowest = target_idx - frames_per_second
        window_end = target_idx + self._buffer_size  # exclusive

        with self._buffer_lock:
            stale = [i for i in self._buffer if i < lowest or i >= window_end]
            for i in stale:
                del self._buffer[i]
            for idx in range(target_idx, window_end):
                if idx not in self._buffer:
                    return idx
        return None

    def _prefetch_loop(self):
        """Background thread that pre-reads frames."""
        while not self._stop_event.is_set():
            # Wait for work or timeout
            self._request_event.wait(timeout=0.01)
            self._request_event.clear()

            if self._stop_event.is_set():
                break

            next_idx = self._next_prefetch_index(self._target_time)
            if next_idx is None:
                continue  # Buffer is full

            # Read the frame (this is the slow part)
            prefetch_t = next_idx * self._frame_time
            try:
                with self._source_lock:
                    if self._stop_event.is_set():
                        break
                    frame = self._source.read_at(prefetch_t)
                    # Publish before releasing the source lock so a caller
                    # waiting on it finds the frame instead of decoding again.
                    with self._buffer_lock:
                        self._buffer[next_idx] = frame
            except Exception as e:
                print(f"Prefetch error at t={prefetch_t}: {e}", file=sys.stderr)

    def read_at(self, t: float) -> np.ndarray:
        """Read frame at specific time, using prefetch buffer if available.

        On a buffer miss the frame is decoded synchronously through the
        wrapped source; exceptions from that decode propagate to the caller.
        """
        self._target_time = t
        self._request_event.set()  # Wake up prefetch thread

        idx = self._frame_index(t)

        # Check buffer first
        with self._buffer_lock:
            frame = self._buffer.get(idx)

        if frame is None:
            # Not in buffer - read directly (blocking)
            with self._source_lock:
                # The worker may have produced the frame while we waited.
                with self._buffer_lock:
                    frame = self._buffer.get(idx)
                if frame is None:
                    frame = self._source.read_at(t)
                    with self._buffer_lock:
                        self._buffer[idx] = frame

        self._last_frame = frame
        return frame

    def read(self) -> np.ndarray:
        """Read frame (uses last cached or t=0)."""
        if self._last_frame is not None:
            return self._last_frame
        return self.read_at(0)

    def skip(self):
        """No-op for seek-based reading."""
        pass

    @property
    def size(self):
        """The (width, height) of decoded frames."""
        return self._source.size

    @property
    def path(self):
        """Path of the wrapped video file."""
        return self._source.path

    @property
    def _stream_time(self):
        """Stream position of the wrapped source (read by debug output)."""
        return self._source._stream_time

    def close(self):
        """Stop the prefetch thread and shut down the wrapped source."""
        self._stop_event.set()
        self._request_event.set()  # Wake up thread to exit
        self._thread.join(timeout=1.0)
        # Avoid closing while the worker is mid-read, since it would restart
        # ffmpeg. Do not wait forever on a stuck decode either.
        acquired = self._source_lock.acquire(timeout=1.0)
        try:
            self._source.close()
        finally:
            if acquired:
                self._source_lock.release()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()


class AudioAnalyzer:
    """Audio analyzer for energy and beat detection.

    The whole file is decoded up front (via ffmpeg) to mono float32 samples
    at ``sample_rate``; queries then only slice that array. Beat detection is
    stateful: it keeps a short history of spectral-flux values, and its
    refractory period and beat counter compare each query time with earlier
    ones, so times are expected to be queried in increasing order.
    """

    _FLUX_HISTORY_LEN = 50   # Flux samples retained
    _FLUX_THRESHOLD_LEN = 20  # Most recent flux samples used for the threshold

    def __init__(self, path: str, sample_rate: int = 22050):
        """Decode *path* to mono float32 at *sample_rate* and probe its duration.

        Raises:
            FileNotFoundError: if *path* does not exist.
            RuntimeError: if ffmpeg/ffprobe fail or no samples were decoded.
        """
        self.path = Path(path)
        self.sample_rate = sample_rate

        # Check if file exists
        if not self.path.exists():
            raise FileNotFoundError(f"Audio file not found: {self.path}")

        # Load audio via ffmpeg
        cmd = ["ffmpeg", "-v", "error", "-i", str(self.path),
               "-f", "f32le", "-ac", "1", "-ar", str(sample_rate), "-"]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode != 0:
            # errors="replace": a non-UTF-8 byte in ffmpeg's message must not
            # mask the real failure with a UnicodeDecodeError.
            message = result.stderr.decode(errors="replace")
            raise RuntimeError(f"Failed to load audio '{self.path}': {message}")
        self._audio = np.frombuffer(result.stdout, dtype=np.float32)
        if len(self._audio) == 0:
            raise RuntimeError(f"Audio file is empty or invalid: {self.path}")

        # Get duration
        cmd = ["ffprobe", "-v", "quiet", "-print_format", "json",
               "-show_format", str(self.path)]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"Failed to probe audio '{self.path}': {result.stderr}")
        self.duration = self._probe_duration(result.stdout)

        self._reset_beat_state()

    def _probe_duration(self, probe_json: str) -> float:
        """Return the duration reported by ffprobe, in seconds.

        When the report has no usable ``format.duration`` the duration is
        derived from the number of decoded samples.
        """
        try:
            return float(json.loads(probe_json)["format"]["duration"])
        except (ValueError, KeyError, TypeError):
            return len(self._audio) / float(self.sample_rate)

    def _reset_beat_state(self):
        """Initialise (or clear) all beat-detection state."""
        # (time, flux) pairs; deque drops the oldest entry automatically
        self._flux_history = deque(maxlen=self._FLUX_HISTORY_LEN)
        self._last_beat_time = -1
        self._beat_count = 0
        self._last_beat_check_time = -1
        # Cache beat result for current time (so multiple scans see same result)
        self._beat_cache_time = -1
        self._beat_cache_result = False

    def get_energy(self, t: float) -> float:
        """Get energy level at time t (0-1).

        Computed as the RMS of up to 1024 samples centred on *t*, scaled by 3
        and clamped to 1.0. Returns 0.0 when *t* lies outside the audio.
        """
        idx = int(t * self.sample_rate)
        start = max(0, idx - 512)
        end = min(len(self._audio), idx + 512)
        if start >= end:
            return 0.0
        rms = np.sqrt(np.mean(self._audio[start:end] ** 2))
        # float(): hand back a plain Python float rather than a numpy scalar
        return min(1.0, float(rms) * 3.0)

    def _detect_beat(self, t: float) -> bool:
        """Run one beat-detection step for time *t* and update the state.

        Compares the magnitude spectrum of a 2048-sample window centred on
        *t* with the window 512 samples earlier. The sum of the positive
        differences (spectral flux) is a beat when it exceeds mean + 1.5 std
        of the recent flux values and the previous beat is more than 0.1 s
        in the past.
        """
        idx = int(t * self.sample_rate)
        size = 2048

        start, end = max(0, idx - size//2), min(len(self._audio), idx + size//2)
        if end - start < size/2:
            return False  # Too close to the edge of the audio
        curr = self._audio[start:end]

        pstart, pend = max(0, start - 512), max(0, end - 512)
        if pend <= pstart:
            return False
        prev = self._audio[pstart:pend]

        curr_window = np.hanning(len(curr))
        prev_window = curr_window if len(prev) == len(curr) else np.hanning(len(prev))
        curr_spec = np.abs(np.fft.rfft(curr * curr_window))
        prev_spec = np.abs(np.fft.rfft(prev * prev_window))

        n = min(len(curr_spec), len(prev_spec))
        flux = float(np.sum(np.maximum(0, curr_spec[:n] - prev_spec[:n])) / (n + 1))

        self._flux_history.append((t, flux))
        if len(self._flux_history) < 5:
            return False  # Not enough history for a meaningful threshold

        recent = [f for _, f in list(self._flux_history)[-self._FLUX_THRESHOLD_LEN:]]
        threshold = float(np.mean(recent) + 1.5 * np.std(recent))

        # bool(): numpy comparisons would otherwise leak np.bool_ to callers
        is_beat = bool(flux > threshold and (t - self._last_beat_time) > 0.1)
        if is_beat:
            self._last_beat_time = t
            if t > self._last_beat_check_time:
                self._beat_count += 1
                self._last_beat_check_time = t
        return is_beat

    def get_beat(self, t: float) -> bool:
        """Check if there's a beat at time t."""
        # Return cached result if same time (multiple scans query same frame)
        if t == self._beat_cache_time:
            return self._beat_cache_result

        is_beat = self._detect_beat(t)

        # Cache result for this time
        self._beat_cache_time = t
        self._beat_cache_result = is_beat
        return is_beat

    def get_beat_count(self, t: float) -> int:
        """Get cumulative beat count up to time t."""
        # Ensure beat detection has run up to this time
        self.get_beat(t)
        return self._beat_count


# === Primitives ===

def prim_make_video_source(path: str, fps: float = 30):
    """Open the video at *path* as a frame source running at *fps*.

    The source is a PrefetchingVideoSource when prefetching is enabled
    (STREAMING_PREFETCH=1, the default) and a plain VideoSource otherwise.
    """
    source_cls = PrefetchingVideoSource if PREFETCH_ENABLED else VideoSource
    return source_cls(path, fps)


def prim_source_read(source: "VideoSource", t: float = None):
    """Read a frame from a video source.

    Args:
        source: A VideoSource or PrefetchingVideoSource.
        t: Time in seconds; when None the source's ``read()`` is used.

    Returns:
        The frame produced by the source.
    """
    if t is None:
        return source.read()

    frame = source.read_at(t)
    # Debug: show source and time
    if int(t * 10) % 10 == 0:  # First tenth of every second
        # Prefetching sources may not expose the stream position themselves;
        # fall back to the wrapped source, and never let debug output raise.
        stream_time = getattr(source, "_stream_time", None)
        if stream_time is None:
            stream_time = getattr(getattr(source, "_source", None), "_stream_time", None)
        position = f"{stream_time:.2f}" if stream_time is not None else "n/a"
        print(f"READ {source.path.name}: t={t:.2f} stream={position}", file=sys.stderr)
    return frame


def prim_source_skip(source: VideoSource):
    """Ask *source* to skip a frame by calling its ``skip()`` method."""
    source.skip()
    return None


def prim_source_size(source: VideoSource):
    """Return the frame dimensions of *source* as (width, height)."""
    dimensions = source.size
    return dimensions


def prim_make_audio_analyzer(path: str):
    """Build an AudioAnalyzer for the audio file at *path* (default sample rate)."""
    analyzer = AudioAnalyzer(path)
    return analyzer


def prim_audio_energy(analyzer: AudioAnalyzer, t: float) -> float:
    """Return the analyzer's energy level (0-1) at time *t*."""
    energy = analyzer.get_energy(t)
    return energy


def prim_audio_beat(analyzer: AudioAnalyzer, t: float) -> bool:
    """Report whether the analyzer detects a beat at time *t*."""
    beat_detected = analyzer.get_beat(t)
    return beat_detected


def prim_audio_beat_count(analyzer: AudioAnalyzer, t: float) -> int:
    """Return the analyzer's running beat total after evaluating time *t*."""
    total_beats = analyzer.get_beat_count(t)
    return total_beats


def prim_audio_duration(analyzer: AudioAnalyzer) -> float:
    """Return the length of the analyzed audio, in seconds."""
    seconds = analyzer.duration
    return seconds


# Export primitives
# Registry mapping each primitive's external (hyphenated) name to the function
# in this module that implements it. Presumably this is what the interpreter
# loads to make the primitives callable - confirm against the loader.
PRIMITIVES = {
    # Video source
    'make-video-source': prim_make_video_source,
    'source-read': prim_source_read,
    'source-skip': prim_source_skip,
    'source-size': prim_source_size,

    # Audio analyzer
    'make-audio-analyzer': prim_make_audio_analyzer,
    'audio-energy': prim_audio_energy,
    'audio-beat': prim_audio_beat,
    'audio-beat-count': prim_audio_beat_count,
    'audio-duration': prim_audio_duration,
}