Squashed 'core/' content from commit 4957443
git-subtree-dir: core git-subtree-split: 4957443184ae0eb6323635a90a19acffb3e01d07
This commit is contained in:
282
artdag/analysis/analyzer.py
Normal file
282
artdag/analysis/analyzer.py
Normal file
@@ -0,0 +1,282 @@
|
||||
# artdag/analysis/analyzer.py
|
||||
"""
|
||||
Main Analyzer class for the Analysis phase.
|
||||
|
||||
Coordinates audio and video feature extraction with caching.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from .schema import AnalysisResult, AudioFeatures, VideoFeatures
|
||||
from .audio import analyze_audio, FEATURE_ALL as AUDIO_ALL
|
||||
from .video import analyze_video, FEATURE_ALL as VIDEO_ALL
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AnalysisCache:
    """
    Simple file-based cache for analysis results.

    Stores each result as a JSON file named ``<cache_id>.json`` under
    ``cache_dir``.
    """

    def __init__(self, cache_dir: Path):
        """
        Initialize the cache, creating the directory if needed.

        Args:
            cache_dir: Directory where cache JSON files are stored.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _path_for(self, cache_id: str) -> Path:
        """Get cache file path for a cache_id."""
        return self.cache_dir / f"{cache_id}.json"

    def get(self, cache_id: str) -> Optional["AnalysisResult"]:
        """
        Retrieve a cached analysis result.

        Args:
            cache_id: Cache key produced by AnalysisResult.cache_id.

        Returns:
            The cached AnalysisResult, or None if absent or unreadable.
        """
        path = self._path_for(cache_id)
        if not path.exists():
            return None

        try:
            # Explicit encoding: cache files are always written as UTF-8,
            # independent of the platform's locale default.
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            return AnalysisResult.from_dict(data)
        except (OSError, json.JSONDecodeError, KeyError) as e:
            # OSError covers the file disappearing or becoming unreadable
            # between exists() and open(); any broken entry is a cache miss.
            logger.warning(f"Failed to load analysis cache {cache_id}: {e}")
            return None

    def put(self, result: "AnalysisResult") -> None:
        """Store analysis result in cache."""
        path = self._path_for(result.cache_id)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(result.to_dict(), f, indent=2)

    def has(self, cache_id: str) -> bool:
        """Check if analysis result is cached."""
        return self._path_for(cache_id).exists()

    def remove(self, cache_id: str) -> bool:
        """
        Remove a cached analysis result.

        Args:
            cache_id: Cache key of the entry to delete.

        Returns:
            True if an entry was deleted, False if none existed.
        """
        # unlink-and-catch avoids the TOCTOU race of an exists() check
        # followed by a separate deletion.
        try:
            self._path_for(cache_id).unlink()
            return True
        except FileNotFoundError:
            return False
|
||||
|
||||
|
||||
class Analyzer:
    """
    Analyzes media inputs to extract features.

    The Analyzer is the first phase of the 3-phase execution model.
    It extracts features from inputs that inform downstream processing.

    Example:
        analyzer = Analyzer(cache_dir=Path("./analysis_cache"))

        # Analyze a music file for beats
        result = analyzer.analyze(
            input_hash="abc123...",
            features=["beats", "energy"],
            input_path=Path("/path/to/music.mp3"),
        )
    """

    # Feature names dispatched to the audio / video analysis back-ends.
    _AUDIO_FEATURE_NAMES = ("beats", "tempo", "energy", "spectrum", "onsets")
    _VIDEO_FEATURE_NAMES = ("metadata", "motion_tempo", "scene_changes")

    def __init__(
        self,
        cache_dir: Optional[Path] = None,
        content_cache: Optional["Cache"] = None,  # artdag.Cache for input lookup
    ):
        """
        Initialize the Analyzer.

        Args:
            cache_dir: Directory for analysis cache. If None, no caching.
            content_cache: artdag Cache for looking up inputs by hash
        """
        self.cache = AnalysisCache(cache_dir) if cache_dir else None
        self.content_cache = content_cache

    def get_input_path(self, input_hash: str, input_path: Optional[Path] = None) -> Path:
        """
        Resolve input to a file path.

        Prefers an explicit existing path; otherwise falls back to the
        content cache.

        Args:
            input_hash: Content hash of the input
            input_path: Optional direct path to file

        Returns:
            Path to the input file

        Raises:
            ValueError: If input cannot be resolved
        """
        if input_path and input_path.exists():
            return input_path

        if self.content_cache:
            # NOTE(review): assumes cache entries expose an `output_path`
            # attribute pointing at the materialized file — confirm against
            # the artdag Cache entry type.
            entry = self.content_cache.get(input_hash)
            if entry:
                return Path(entry.output_path)

        raise ValueError(f"Cannot resolve input {input_hash}: no path provided and not in cache")

    def analyze(
        self,
        input_hash: str,
        features: List[str],
        input_path: Optional[Path] = None,
        media_type: Optional[str] = None,
    ) -> "AnalysisResult":
        """
        Analyze an input file and extract features.

        Args:
            input_hash: Content hash of the input (for cache key)
            features: List of features to extract:
                Audio: "beats", "tempo", "energy", "spectrum", "onsets"
                Video: "metadata", "motion_tempo", "scene_changes"
                Meta: "all" (extracts all relevant features)
            input_path: Optional direct path to file
            media_type: Optional hint ("audio", "video", or None for auto-detect)

        Returns:
            AnalysisResult with extracted features
        """
        # Cache ID depends only on the input hash and the sorted feature
        # list, so an ephemeral result is enough to compute it.
        temp_result = AnalysisResult(
            input_hash=input_hash,
            features_requested=sorted(features),
        )
        cache_id = temp_result.cache_id

        # Serve from cache when possible.
        if self.cache and self.cache.has(cache_id):
            cached = self.cache.get(cache_id)
            if cached:
                logger.info(f"Analysis cache hit: {cache_id[:16]}...")
                return cached

        # Resolve input path.
        path = self.get_input_path(input_hash, input_path)
        logger.info(f"Analyzing {path} for features: {features}")

        # Detect media type if not specified.
        if media_type is None:
            media_type = self._detect_media_type(path)

        audio_features = None
        video_features = None

        # Normalize the requested features into per-backend lists.
        if "all" in features:
            audio_features_list = [AUDIO_ALL]
            video_features_list = [VIDEO_ALL]
        else:
            audio_features_list = [f for f in features if f in self._AUDIO_FEATURE_NAMES]
            video_features_list = [f for f in features if f in self._VIDEO_FEATURE_NAMES]

        # Videos may carry an audio track, so audio analysis runs for both
        # media types. Extraction is best-effort: a failed back-end degrades
        # to missing features rather than aborting the whole analysis.
        if media_type in ("audio", "video") and audio_features_list:
            try:
                audio_features = analyze_audio(path, features=audio_features_list)
            except Exception as e:
                logger.warning(f"Audio analysis failed: {e}")

        if media_type == "video" and video_features_list:
            try:
                video_features = analyze_video(path, features=video_features_list)
            except Exception as e:
                logger.warning(f"Video analysis failed: {e}")

        result = AnalysisResult(
            input_hash=input_hash,
            features_requested=sorted(features),
            audio=audio_features,
            video=video_features,
            analyzed_at=datetime.now(timezone.utc).isoformat(),
        )

        # Cache the fresh result.
        if self.cache:
            self.cache.put(result)

        return result

    def analyze_multiple(
        self,
        inputs: Dict[str, Path],
        features: List[str],
    ) -> Dict[str, "AnalysisResult"]:
        """
        Analyze multiple inputs.

        Args:
            inputs: Dict mapping input_hash to file path
            features: Features to extract from all inputs

        Returns:
            Dict mapping input_hash to AnalysisResult

        Raises:
            Exception: Re-raises the first per-input failure after logging it.
        """
        results = {}
        for input_hash, input_path in inputs.items():
            try:
                results[input_hash] = self.analyze(
                    input_hash=input_hash,
                    features=features,
                    input_path=input_path,
                )
            except Exception as e:
                # Log which input failed before propagating — the hash alone
                # is otherwise hard to trace back from the traceback.
                logger.error(f"Analysis failed for {input_hash}: {e}")
                raise

        return results

    def _detect_media_type(self, path: Path) -> str:
        """
        Detect if file is audio or video.

        Probes streams with ffprobe; falls back to extension-based
        detection when ffprobe is missing or its output is unusable.

        Args:
            path: Path to media file

        Returns:
            "video", "audio", or "unknown"
        """
        import subprocess  # local: only needed for stream probing

        cmd = [
            "ffprobe", "-v", "quiet",
            "-print_format", "json",
            "-show_streams",
            str(path),
        ]

        try:
            proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
            streams = json.loads(proc.stdout).get("streams", [])

            # Any video stream makes the file "video", even if it also has
            # audio; audio analysis still runs on videos (see analyze()).
            if any(s.get("codec_type") == "video" for s in streams):
                return "video"
            if any(s.get("codec_type") == "audio" for s in streams):
                return "audio"
            return "unknown"

        except (FileNotFoundError, subprocess.CalledProcessError, json.JSONDecodeError):
            # FileNotFoundError: the ffprobe binary is not installed —
            # previously this crashed instead of using the fallback.
            # Fall back to extension-based detection.
            ext = path.suffix.lower()
            if ext in (".mp4", ".mov", ".avi", ".mkv", ".webm"):
                return "video"
            if ext in (".mp3", ".wav", ".flac", ".ogg", ".m4a", ".aac"):
                return "audio"
            return "unknown"
|
||||
Reference in New Issue
Block a user