Files
rose-ash/artdag/analysis/analyzer.py
giles cc2dcbddd4 Squashed 'core/' content from commit 4957443
git-subtree-dir: core
git-subtree-split: 4957443184ae0eb6323635a90a19acffb3e01d07
2026-02-24 23:09:39 +00:00

283 lines
8.7 KiB
Python

# artdag/analysis/analyzer.py
"""
Main Analyzer class for the Analysis phase.
Coordinates audio and video feature extraction with caching.
"""
import json
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from .schema import AnalysisResult, AudioFeatures, VideoFeatures
from .audio import analyze_audio, FEATURE_ALL as AUDIO_ALL
from .video import analyze_video, FEATURE_ALL as VIDEO_ALL
logger = logging.getLogger(__name__)
class AnalysisCache:
    """
    Simple file-based cache for analysis results.

    Stores each result as a JSON file named ``<cache_id>.json`` inside
    ``cache_dir``.
    """

    def __init__(self, cache_dir: Path):
        """
        Args:
            cache_dir: Directory holding the cache files; created if missing.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _path_for(self, cache_id: str) -> Path:
        """Get cache file path for a cache_id."""
        return self.cache_dir / f"{cache_id}.json"

    def get(self, cache_id: str) -> Optional[AnalysisResult]:
        """
        Retrieve cached analysis result.

        Returns:
            The cached AnalysisResult, or None on a miss or when the cache
            file is unreadable/malformed (logged, entry left in place).
        """
        path = self._path_for(cache_id)
        # EAFP: open directly instead of exists()-then-open so a concurrent
        # remove() between check and open cannot raise out of here.
        try:
            with open(path, "r") as f:
                data = json.load(f)
            return AnalysisResult.from_dict(data)
        except FileNotFoundError:
            return None
        except (OSError, json.JSONDecodeError, KeyError) as e:
            logger.warning(f"Failed to load analysis cache {cache_id}: {e}")
            return None

    def put(self, result: AnalysisResult) -> None:
        """Store analysis result in cache, overwriting any previous entry."""
        path = self._path_for(result.cache_id)
        with open(path, "w") as f:
            json.dump(result.to_dict(), f, indent=2)

    def has(self, cache_id: str) -> bool:
        """Check if analysis result is cached."""
        return self._path_for(cache_id).exists()

    def remove(self, cache_id: str) -> bool:
        """Remove cached analysis result. Returns True if an entry existed."""
        path = self._path_for(cache_id)
        if path.exists():
            path.unlink()
            return True
        return False
class Analyzer:
    """
    Analyzes media inputs to extract features.

    The Analyzer is the first phase of the 3-phase execution model.
    It extracts features from inputs that inform downstream processing.

    Example:
        analyzer = Analyzer(cache_dir=Path("./analysis_cache"))

        # Analyze a music file for beats
        result = analyzer.analyze(
            input_path=Path("/path/to/music.mp3"),
            input_hash="abc123...",
            features=["beats", "energy"]
        )
        print(f"Tempo: {result.tempo} BPM")
        print(f"Beats: {result.beat_times}")
    """

    # Feature names recognized by the audio / video analyzers respectively;
    # used by analyze() to split a mixed request into per-analyzer lists.
    _AUDIO_FEATURES = ("beats", "tempo", "energy", "spectrum", "onsets")
    _VIDEO_FEATURES = ("metadata", "motion_tempo", "scene_changes")

    def __init__(
        self,
        cache_dir: Optional[Path] = None,
        content_cache: Optional["Cache"] = None,  # artdag.Cache for input lookup
    ):
        """
        Initialize the Analyzer.

        Args:
            cache_dir: Directory for analysis cache. If None, no caching.
            content_cache: artdag Cache for looking up inputs by hash
        """
        self.cache = AnalysisCache(cache_dir) if cache_dir else None
        self.content_cache = content_cache

    def get_input_path(self, input_hash: str, input_path: Optional[Path] = None) -> Path:
        """
        Resolve input to a file path.

        Args:
            input_hash: Content hash of the input
            input_path: Optional direct path to file

        Returns:
            Path to the input file

        Raises:
            ValueError: If input cannot be resolved
        """
        # An explicit, existing path wins over a cache lookup.
        if input_path and input_path.exists():
            return input_path
        if self.content_cache:
            entry = self.content_cache.get(input_hash)
            if entry:
                return Path(entry.output_path)
        raise ValueError(f"Cannot resolve input {input_hash}: no path provided and not in cache")

    def analyze(
        self,
        input_hash: str,
        features: List[str],
        input_path: Optional[Path] = None,
        media_type: Optional[str] = None,
    ) -> AnalysisResult:
        """
        Analyze an input file and extract features.

        Args:
            input_hash: Content hash of the input (for cache key)
            features: List of features to extract:
                Audio: "beats", "tempo", "energy", "spectrum", "onsets"
                Video: "metadata", "motion_tempo", "scene_changes"
                Meta: "all" (extracts all relevant features)
            input_path: Optional direct path to file
            media_type: Optional hint ("audio", "video", or None for auto-detect)

        Returns:
            AnalysisResult with extracted features
        """
        # The cache ID depends only on input hash + sorted features, so it
        # is stable regardless of the order callers list the features in.
        temp_result = AnalysisResult(
            input_hash=input_hash,
            features_requested=sorted(features),
        )
        cache_id = temp_result.cache_id

        # Check cache
        if self.cache and self.cache.has(cache_id):
            cached = self.cache.get(cache_id)
            if cached:
                logger.info(f"Analysis cache hit: {cache_id[:16]}...")
                return cached

        # Resolve input path
        path = self.get_input_path(input_hash, input_path)
        logger.info(f"Analyzing {path} for features: {features}")

        # Detect media type if not specified
        if media_type is None:
            media_type = self._detect_media_type(path)

        audio_features = None
        video_features = None

        # Normalize the requested features into per-analyzer lists.
        if "all" in features:
            audio_features_list = [AUDIO_ALL]
            video_features_list = [VIDEO_ALL]
        else:
            audio_features_list = [f for f in features if f in self._AUDIO_FEATURES]
            video_features_list = [f for f in features if f in self._VIDEO_FEATURES]

        # Video files may carry an audio track, so audio analysis runs for
        # both media types; a failure degrades to a warning, not a crash.
        if media_type in ("audio", "video") and audio_features_list:
            try:
                audio_features = analyze_audio(path, features=audio_features_list)
            except Exception as e:
                logger.warning(f"Audio analysis failed: {e}")

        if media_type == "video" and video_features_list:
            try:
                video_features = analyze_video(path, features=video_features_list)
            except Exception as e:
                logger.warning(f"Video analysis failed: {e}")

        result = AnalysisResult(
            input_hash=input_hash,
            features_requested=sorted(features),
            audio=audio_features,
            video=video_features,
            analyzed_at=datetime.now(timezone.utc).isoformat(),
        )

        # Cache result
        if self.cache:
            self.cache.put(result)
        return result

    def analyze_multiple(
        self,
        inputs: Dict[str, Path],
        features: List[str],
    ) -> Dict[str, AnalysisResult]:
        """
        Analyze multiple inputs.

        Args:
            inputs: Dict mapping input_hash to file path
            features: Features to extract from all inputs

        Returns:
            Dict mapping input_hash to AnalysisResult

        Raises:
            Exception: Re-raised from the first failed analysis.
        """
        results = {}
        for input_hash, input_path in inputs.items():
            try:
                results[input_hash] = self.analyze(
                    input_hash=input_hash,
                    features=features,
                    input_path=input_path,
                )
            except Exception as e:
                logger.error(f"Analysis failed for {input_hash}: {e}")
                raise
        return results

    def _detect_media_type(self, path: Path) -> str:
        """
        Detect if file is audio or video via ffprobe stream inspection.

        Falls back to extension-based detection when ffprobe is not
        installed, exits non-zero, or emits unparseable output.

        Args:
            path: Path to media file

        Returns:
            "video", "audio", or "unknown"
        """
        import subprocess

        cmd = [
            "ffprobe", "-v", "quiet",
            "-print_format", "json",
            "-show_streams",
            str(path)
        ]
        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            data = json.loads(result.stdout)
            streams = data.get("streams", [])
            has_video = any(s.get("codec_type") == "video" for s in streams)
            has_audio = any(s.get("codec_type") == "audio" for s in streams)
            if has_video:
                return "video"
            elif has_audio:
                return "audio"
            else:
                return "unknown"
        # FileNotFoundError: ffprobe binary itself is missing — previously
        # this escaped and crashed instead of using the fallback below.
        except (subprocess.CalledProcessError, FileNotFoundError, json.JSONDecodeError):
            # Fall back to extension-based detection
            ext = path.suffix.lower()
            if ext in (".mp4", ".mov", ".avi", ".mkv", ".webm"):
                return "video"
            elif ext in (".mp3", ".wav", ".flac", ".ogg", ".m4a", ".aac"):
                return "audio"
            return "unknown"