Squashed 'core/' content from commit 4957443
git-subtree-dir: core git-subtree-split: 4957443184ae0eb6323635a90a19acffb3e01d07
This commit is contained in:
282
artdag/analysis/analyzer.py
Normal file
282
artdag/analysis/analyzer.py
Normal file
@@ -0,0 +1,282 @@
|
||||
# artdag/analysis/analyzer.py
|
||||
"""
|
||||
Main Analyzer class for the Analysis phase.
|
||||
|
||||
Coordinates audio and video feature extraction with caching.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from .schema import AnalysisResult, AudioFeatures, VideoFeatures
|
||||
from .audio import analyze_audio, FEATURE_ALL as AUDIO_ALL
|
||||
from .video import analyze_video, FEATURE_ALL as VIDEO_ALL
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AnalysisCache:
    """
    Simple file-based cache for analysis results.

    Stores each result as a JSON file named ``<cache_id>.json`` under
    ``cache_dir``.
    """

    def __init__(self, cache_dir: Path):
        """
        Initialize the cache, creating the directory if needed.

        Args:
            cache_dir: Directory where cache JSON files are stored.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _path_for(self, cache_id: str) -> Path:
        """Get cache file path for a cache_id."""
        return self.cache_dir / f"{cache_id}.json"

    def get(self, cache_id: str) -> Optional["AnalysisResult"]:
        """
        Retrieve a cached analysis result.

        Args:
            cache_id: Cache key produced by AnalysisResult.cache_id.

        Returns:
            The cached AnalysisResult, or None if absent or unreadable.
        """
        path = self._path_for(cache_id)
        if not path.exists():
            return None

        try:
            # Explicit encoding: cache files are always written as UTF-8,
            # independent of the platform's locale default.
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            return AnalysisResult.from_dict(data)
        except (OSError, json.JSONDecodeError, KeyError) as e:
            # OSError covers the file disappearing or becoming unreadable
            # between exists() and open(); any broken entry is a cache miss.
            logger.warning(f"Failed to load analysis cache {cache_id}: {e}")
            return None

    def put(self, result: "AnalysisResult") -> None:
        """Store analysis result in cache."""
        path = self._path_for(result.cache_id)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(result.to_dict(), f, indent=2)

    def has(self, cache_id: str) -> bool:
        """Check if analysis result is cached."""
        return self._path_for(cache_id).exists()

    def remove(self, cache_id: str) -> bool:
        """
        Remove a cached analysis result.

        Args:
            cache_id: Cache key of the entry to delete.

        Returns:
            True if an entry was deleted, False if none existed.
        """
        # unlink-and-catch avoids the TOCTOU race of an exists() check
        # followed by a separate deletion.
        try:
            self._path_for(cache_id).unlink()
            return True
        except FileNotFoundError:
            return False
|
||||
|
||||
|
||||
class Analyzer:
    """
    Analyzes media inputs to extract features.

    The Analyzer is the first phase of the 3-phase execution model.
    It extracts features from inputs that inform downstream processing.

    Example:
        analyzer = Analyzer(cache_dir=Path("./analysis_cache"))

        # Analyze a music file for beats
        result = analyzer.analyze(
            input_hash="abc123...",
            features=["beats", "energy"],
            input_path=Path("/path/to/music.mp3"),
        )
    """

    # Feature names dispatched to the audio / video analysis back-ends.
    _AUDIO_FEATURE_NAMES = ("beats", "tempo", "energy", "spectrum", "onsets")
    _VIDEO_FEATURE_NAMES = ("metadata", "motion_tempo", "scene_changes")

    def __init__(
        self,
        cache_dir: Optional[Path] = None,
        content_cache: Optional["Cache"] = None,  # artdag.Cache for input lookup
    ):
        """
        Initialize the Analyzer.

        Args:
            cache_dir: Directory for analysis cache. If None, no caching.
            content_cache: artdag Cache for looking up inputs by hash
        """
        self.cache = AnalysisCache(cache_dir) if cache_dir else None
        self.content_cache = content_cache

    def get_input_path(self, input_hash: str, input_path: Optional[Path] = None) -> Path:
        """
        Resolve input to a file path.

        Prefers an explicit existing path; otherwise falls back to the
        content cache.

        Args:
            input_hash: Content hash of the input
            input_path: Optional direct path to file

        Returns:
            Path to the input file

        Raises:
            ValueError: If input cannot be resolved
        """
        if input_path and input_path.exists():
            return input_path

        if self.content_cache:
            # NOTE(review): assumes cache entries expose an `output_path`
            # attribute pointing at the materialized file — confirm against
            # the artdag Cache entry type.
            entry = self.content_cache.get(input_hash)
            if entry:
                return Path(entry.output_path)

        raise ValueError(f"Cannot resolve input {input_hash}: no path provided and not in cache")

    def analyze(
        self,
        input_hash: str,
        features: List[str],
        input_path: Optional[Path] = None,
        media_type: Optional[str] = None,
    ) -> "AnalysisResult":
        """
        Analyze an input file and extract features.

        Args:
            input_hash: Content hash of the input (for cache key)
            features: List of features to extract:
                Audio: "beats", "tempo", "energy", "spectrum", "onsets"
                Video: "metadata", "motion_tempo", "scene_changes"
                Meta: "all" (extracts all relevant features)
            input_path: Optional direct path to file
            media_type: Optional hint ("audio", "video", or None for auto-detect)

        Returns:
            AnalysisResult with extracted features
        """
        # Cache ID depends only on the input hash and the sorted feature
        # list, so an ephemeral result is enough to compute it.
        temp_result = AnalysisResult(
            input_hash=input_hash,
            features_requested=sorted(features),
        )
        cache_id = temp_result.cache_id

        # Serve from cache when possible.
        if self.cache and self.cache.has(cache_id):
            cached = self.cache.get(cache_id)
            if cached:
                logger.info(f"Analysis cache hit: {cache_id[:16]}...")
                return cached

        # Resolve input path.
        path = self.get_input_path(input_hash, input_path)
        logger.info(f"Analyzing {path} for features: {features}")

        # Detect media type if not specified.
        if media_type is None:
            media_type = self._detect_media_type(path)

        audio_features = None
        video_features = None

        # Normalize the requested features into per-backend lists.
        if "all" in features:
            audio_features_list = [AUDIO_ALL]
            video_features_list = [VIDEO_ALL]
        else:
            audio_features_list = [f for f in features if f in self._AUDIO_FEATURE_NAMES]
            video_features_list = [f for f in features if f in self._VIDEO_FEATURE_NAMES]

        # Videos may carry an audio track, so audio analysis runs for both
        # media types. Extraction is best-effort: a failed back-end degrades
        # to missing features rather than aborting the whole analysis.
        if media_type in ("audio", "video") and audio_features_list:
            try:
                audio_features = analyze_audio(path, features=audio_features_list)
            except Exception as e:
                logger.warning(f"Audio analysis failed: {e}")

        if media_type == "video" and video_features_list:
            try:
                video_features = analyze_video(path, features=video_features_list)
            except Exception as e:
                logger.warning(f"Video analysis failed: {e}")

        result = AnalysisResult(
            input_hash=input_hash,
            features_requested=sorted(features),
            audio=audio_features,
            video=video_features,
            analyzed_at=datetime.now(timezone.utc).isoformat(),
        )

        # Cache the fresh result.
        if self.cache:
            self.cache.put(result)

        return result

    def analyze_multiple(
        self,
        inputs: Dict[str, Path],
        features: List[str],
    ) -> Dict[str, "AnalysisResult"]:
        """
        Analyze multiple inputs.

        Args:
            inputs: Dict mapping input_hash to file path
            features: Features to extract from all inputs

        Returns:
            Dict mapping input_hash to AnalysisResult

        Raises:
            Exception: Re-raises the first per-input failure after logging it.
        """
        results = {}
        for input_hash, input_path in inputs.items():
            try:
                results[input_hash] = self.analyze(
                    input_hash=input_hash,
                    features=features,
                    input_path=input_path,
                )
            except Exception as e:
                # Log which input failed before propagating — the hash alone
                # is otherwise hard to trace back from the traceback.
                logger.error(f"Analysis failed for {input_hash}: {e}")
                raise

        return results

    def _detect_media_type(self, path: Path) -> str:
        """
        Detect if file is audio or video.

        Probes streams with ffprobe; falls back to extension-based
        detection when ffprobe is missing or its output is unusable.

        Args:
            path: Path to media file

        Returns:
            "video", "audio", or "unknown"
        """
        import subprocess  # local: only needed for stream probing

        cmd = [
            "ffprobe", "-v", "quiet",
            "-print_format", "json",
            "-show_streams",
            str(path),
        ]

        try:
            proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
            streams = json.loads(proc.stdout).get("streams", [])

            # Any video stream makes the file "video", even if it also has
            # audio; audio analysis still runs on videos (see analyze()).
            if any(s.get("codec_type") == "video" for s in streams):
                return "video"
            if any(s.get("codec_type") == "audio" for s in streams):
                return "audio"
            return "unknown"

        except (FileNotFoundError, subprocess.CalledProcessError, json.JSONDecodeError):
            # FileNotFoundError: the ffprobe binary is not installed —
            # previously this crashed instead of using the fallback.
            # Fall back to extension-based detection.
            ext = path.suffix.lower()
            if ext in (".mp4", ".mov", ".avi", ".mkv", ".webm"):
                return "video"
            if ext in (".mp3", ".wav", ".flac", ".ogg", ".m4a", ".aac"):
                return "audio"
            return "unknown"
|
||||
Reference in New Issue
Block a user