Squashed 'core/' content from commit 4957443
git-subtree-dir: core git-subtree-split: 4957443184ae0eb6323635a90a19acffb3e01d07
This commit is contained in:
26
artdag/analysis/__init__.py
Normal file
26
artdag/analysis/__init__.py
Normal file
@@ -0,0 +1,26 @@
|
||||
# artdag/analysis - Audio and video feature extraction
|
||||
#
|
||||
# Provides the Analysis phase of the 3-phase execution model:
|
||||
# 1. ANALYZE - Extract features from inputs
|
||||
# 2. PLAN - Generate execution plan with cache IDs
|
||||
# 3. EXECUTE - Run steps with caching
|
||||
|
||||
from .schema import (
|
||||
AnalysisResult,
|
||||
AudioFeatures,
|
||||
VideoFeatures,
|
||||
BeatInfo,
|
||||
EnergyEnvelope,
|
||||
SpectrumBands,
|
||||
)
|
||||
from .analyzer import Analyzer
|
||||
|
||||
__all__ = [
|
||||
"Analyzer",
|
||||
"AnalysisResult",
|
||||
"AudioFeatures",
|
||||
"VideoFeatures",
|
||||
"BeatInfo",
|
||||
"EnergyEnvelope",
|
||||
"SpectrumBands",
|
||||
]
|
||||
282
artdag/analysis/analyzer.py
Normal file
282
artdag/analysis/analyzer.py
Normal file
@@ -0,0 +1,282 @@
|
||||
# artdag/analysis/analyzer.py
|
||||
"""
|
||||
Main Analyzer class for the Analysis phase.
|
||||
|
||||
Coordinates audio and video feature extraction with caching.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from .schema import AnalysisResult, AudioFeatures, VideoFeatures
|
||||
from .audio import analyze_audio, FEATURE_ALL as AUDIO_ALL
|
||||
from .video import analyze_video, FEATURE_ALL as VIDEO_ALL
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AnalysisCache:
    """
    Simple file-based cache for analysis results.

    Each result is stored as a standalone JSON file named after its
    analysis cache_id, inside cache_dir.
    """

    def __init__(self, cache_dir: Path):
        # Accept either str or Path; make sure the directory exists up front.
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _path_for(self, cache_id: str) -> Path:
        """Map a cache_id to its on-disk JSON file."""
        return self.cache_dir / f"{cache_id}.json"

    def get(self, cache_id: str) -> Optional[AnalysisResult]:
        """Return the cached AnalysisResult, or None if absent/corrupt."""
        entry = self._path_for(cache_id)
        if not entry.exists():
            return None

        try:
            payload = json.loads(entry.read_text())
            return AnalysisResult.from_dict(payload)
        except (json.JSONDecodeError, KeyError) as e:
            # A corrupt or schema-incompatible file is treated as a miss.
            logger.warning(f"Failed to load analysis cache {cache_id}: {e}")
            return None

    def put(self, result: AnalysisResult) -> None:
        """Persist an analysis result under its cache_id."""
        payload = json.dumps(result.to_dict(), indent=2)
        self._path_for(result.cache_id).write_text(payload)

    def has(self, cache_id: str) -> bool:
        """True if a cached file exists for cache_id."""
        return self._path_for(cache_id).exists()

    def remove(self, cache_id: str) -> bool:
        """Delete the cached entry; report whether anything was removed."""
        target = self._path_for(cache_id)
        if not target.exists():
            return False
        target.unlink()
        return True
|
||||
|
||||
|
||||
class Analyzer:
    """
    Analyzes media inputs to extract features.

    The Analyzer is the first phase of the 3-phase execution model.
    It extracts features from inputs that inform downstream processing.

    Example:
        analyzer = Analyzer(cache_dir=Path("./analysis_cache"))

        # Analyze a music file for beats
        result = analyzer.analyze(
            input_path=Path("/path/to/music.mp3"),
            input_hash="abc123...",
            features=["beats", "energy"]
        )

        print(f"Tempo: {result.tempo} BPM")
        print(f"Beats: {result.beat_times}")
    """

    def __init__(
        self,
        cache_dir: Optional[Path] = None,
        content_cache: Optional["Cache"] = None,  # artdag.Cache for input lookup
    ):
        """
        Initialize the Analyzer.

        Args:
            cache_dir: Directory for analysis cache. If None, no caching.
            content_cache: artdag Cache for looking up inputs by hash
        """
        # With no cache_dir, every analyze() call recomputes from scratch.
        self.cache = AnalysisCache(cache_dir) if cache_dir else None
        self.content_cache = content_cache

    def get_input_path(self, input_hash: str, input_path: Optional[Path] = None) -> Path:
        """
        Resolve input to a file path.

        Args:
            input_hash: Content hash of the input
            input_path: Optional direct path to file

        Returns:
            Path to the input file

        Raises:
            ValueError: If input cannot be resolved
        """
        # An explicit, existing path always wins over the content cache.
        if input_path and input_path.exists():
            return input_path

        # Fall back to the content cache keyed by hash.
        # NOTE(review): assumes cache entries expose an output_path attribute
        # pointing at a file on disk — confirm against artdag.Cache.
        if self.content_cache:
            entry = self.content_cache.get(input_hash)
            if entry:
                return Path(entry.output_path)

        raise ValueError(f"Cannot resolve input {input_hash}: no path provided and not in cache")

    def analyze(
        self,
        input_hash: str,
        features: List[str],
        input_path: Optional[Path] = None,
        media_type: Optional[str] = None,
    ) -> AnalysisResult:
        """
        Analyze an input file and extract features.

        Args:
            input_hash: Content hash of the input (for cache key)
            features: List of features to extract:
                Audio: "beats", "tempo", "energy", "spectrum", "onsets"
                Video: "metadata", "motion_tempo", "scene_changes"
                Meta: "all" (extracts all relevant features)
            input_path: Optional direct path to file
            media_type: Optional hint ("audio", "video", or None for auto-detect)

        Returns:
            AnalysisResult with extracted features
        """
        # Compute cache ID
        # A throwaway AnalysisResult is built solely so __post_init__
        # derives the cache_id from input_hash + sorted(features).
        temp_result = AnalysisResult(
            input_hash=input_hash,
            features_requested=sorted(features),
        )
        cache_id = temp_result.cache_id

        # Check cache
        if self.cache and self.cache.has(cache_id):
            cached = self.cache.get(cache_id)
            # get() may return None if the file is corrupt; fall through then.
            if cached:
                logger.info(f"Analysis cache hit: {cache_id[:16]}...")
                return cached

        # Resolve input path
        path = self.get_input_path(input_hash, input_path)
        logger.info(f"Analyzing {path} for features: {features}")

        # Detect media type if not specified
        if media_type is None:
            media_type = self._detect_media_type(path)

        # Extract features
        audio_features = None
        video_features = None

        # Normalize features
        # "all" expands to the module-level FEATURE_ALL sentinel for each
        # domain; otherwise split the request into audio vs. video features.
        if "all" in features:
            audio_features_list = [AUDIO_ALL]
            video_features_list = [VIDEO_ALL]
        else:
            audio_features_list = [f for f in features if f in ("beats", "tempo", "energy", "spectrum", "onsets")]
            video_features_list = [f for f in features if f in ("metadata", "motion_tempo", "scene_changes")]

        # Videos may carry an audio track, so audio analysis runs for both.
        if media_type in ("audio", "video") and audio_features_list:
            try:
                audio_features = analyze_audio(path, features=audio_features_list)
            except Exception as e:
                # Best-effort: a failed extraction degrades the result
                # rather than aborting the analysis.
                logger.warning(f"Audio analysis failed: {e}")

        if media_type == "video" and video_features_list:
            try:
                video_features = analyze_video(path, features=video_features_list)
            except Exception as e:
                logger.warning(f"Video analysis failed: {e}")

        result = AnalysisResult(
            input_hash=input_hash,
            features_requested=sorted(features),
            audio=audio_features,
            video=video_features,
            analyzed_at=datetime.now(timezone.utc).isoformat(),
        )

        # Cache result
        # NOTE(review): a result whose extraction failed (audio/video None)
        # is cached too and will be served on the next call — confirm intended.
        if self.cache:
            self.cache.put(result)

        return result

    def analyze_multiple(
        self,
        inputs: Dict[str, Path],
        features: List[str],
    ) -> Dict[str, AnalysisResult]:
        """
        Analyze multiple inputs.

        Args:
            inputs: Dict mapping input_hash to file path
            features: Features to extract from all inputs

        Returns:
            Dict mapping input_hash to AnalysisResult

        Raises:
            Exception: Re-raises the first per-input failure; inputs after
                the failing one are not analyzed.
        """
        results = {}
        for input_hash, input_path in inputs.items():
            try:
                results[input_hash] = self.analyze(
                    input_hash=input_hash,
                    features=features,
                    input_path=input_path,
                )
            except Exception as e:
                # Log for context, then propagate — partial results are
                # discarded by the caller when this raises.
                logger.error(f"Analysis failed for {input_hash}: {e}")
                raise

        return results

    def _detect_media_type(self, path: Path) -> str:
        """
        Detect if file is audio or video.

        Uses ffprobe stream inspection first; falls back to the file
        extension if ffprobe is unusable.

        Args:
            path: Path to media file

        Returns:
            "audio" or "video" (or "unknown" if neither can be determined)
        """
        import subprocess
        import json

        cmd = [
            "ffprobe", "-v", "quiet",
            "-print_format", "json",
            "-show_streams",
            str(path)
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            data = json.loads(result.stdout)
            streams = data.get("streams", [])

            has_video = any(s.get("codec_type") == "video" for s in streams)
            has_audio = any(s.get("codec_type") == "audio" for s in streams)

            # Video takes precedence: a file with both streams is "video".
            if has_video:
                return "video"
            elif has_audio:
                return "audio"
            else:
                return "unknown"

        except (subprocess.CalledProcessError, json.JSONDecodeError):
            # Fall back to extension-based detection
            ext = path.suffix.lower()
            if ext in (".mp4", ".mov", ".avi", ".mkv", ".webm"):
                return "video"
            elif ext in (".mp3", ".wav", ".flac", ".ogg", ".m4a", ".aac"):
                return "audio"
            return "unknown"
|
||||
336
artdag/analysis/audio.py
Normal file
336
artdag/analysis/audio.py
Normal file
@@ -0,0 +1,336 @@
|
||||
# artdag/analysis/audio.py
|
||||
"""
|
||||
Audio feature extraction.
|
||||
|
||||
Uses librosa for beat detection, energy analysis, and spectral features.
|
||||
Falls back to basic ffprobe if librosa is not available.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from .schema import AudioFeatures, BeatInfo, EnergyEnvelope, SpectrumBands
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Feature names for requesting specific analysis
|
||||
FEATURE_BEATS = "beats"
|
||||
FEATURE_TEMPO = "tempo"
|
||||
FEATURE_ENERGY = "energy"
|
||||
FEATURE_SPECTRUM = "spectrum"
|
||||
FEATURE_ONSETS = "onsets"
|
||||
FEATURE_ALL = "all"
|
||||
|
||||
|
||||
def _get_audio_info_ffprobe(path: Path) -> Tuple[float, int, int]:
    """
    Get basic audio info using ffprobe.

    Args:
        path: Path to a media file with at least one audio stream

    Returns:
        Tuple of (duration_seconds, sample_rate_hz, channel_count)

    Raises:
        ValueError: If ffprobe fails, its output is unparseable, or the
            file has no audio stream.
    """
    cmd = [
        "ffprobe", "-v", "quiet",
        "-print_format", "json",
        "-show_streams",
        "-select_streams", "a:0",  # first audio stream only
        str(path)
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        data = json.loads(result.stdout)
        if not data.get("streams"):
            # Plain ValueError is not caught by the except below
            # (it only lists JSONDecodeError), so this propagates as-is.
            raise ValueError("No audio stream found")

        stream = data["streams"][0]
        # Defaults cover containers that omit these fields; a missing
        # duration is reported as 0 rather than treated as an error.
        duration = float(stream.get("duration", 0))
        sample_rate = int(stream.get("sample_rate", 44100))
        channels = int(stream.get("channels", 2))
        return duration, sample_rate, channels
    except (subprocess.CalledProcessError, json.JSONDecodeError, KeyError) as e:
        logger.warning(f"ffprobe failed: {e}")
        # Chain the underlying error so the root cause survives in tracebacks.
        raise ValueError(f"Could not read audio info: {e}") from e
|
||||
|
||||
|
||||
def _extract_audio_to_wav(path: Path, duration: Optional[float] = None) -> Path:
    """
    Extract audio to a temporary WAV file for librosa processing.

    The caller is responsible for deleting the returned file once done.

    Args:
        path: Source media file
        duration: If given, only the first `duration` seconds are extracted

    Returns:
        Path to a mono, 16-bit, 22050 Hz temporary WAV file

    Raises:
        ValueError: If ffmpeg fails to extract the audio
    """
    import os
    import tempfile
    # mkstemp, unlike the deprecated and race-prone mktemp, actually creates
    # the file so no other process can claim the name. We only need the path,
    # so close the descriptor immediately; ffmpeg overwrites the file via -y.
    fd, tmp_name = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    wav_path = Path(tmp_name)

    cmd = ["ffmpeg", "-y", "-i", str(path)]
    if duration:
        cmd.extend(["-t", str(duration)])
    cmd.extend([
        "-vn",  # No video
        "-acodec", "pcm_s16le",
        "-ar", "22050",  # Resample to 22050 Hz for librosa
        "-ac", "1",  # Mono
        str(wav_path)
    ])

    try:
        subprocess.run(cmd, capture_output=True, check=True)
        return wav_path
    except subprocess.CalledProcessError as e:
        # Remove the temp file we created — on failure no caller will.
        wav_path.unlink(missing_ok=True)
        logger.error(f"Audio extraction failed: {e.stderr}")
        raise ValueError(f"Could not extract audio: {e}") from e
|
||||
|
||||
|
||||
def analyze_beats(path: Path, sample_rate: int = 22050) -> BeatInfo:
    """
    Detect beats and tempo using librosa.

    Args:
        path: Path to audio file (or pre-extracted WAV)
        sample_rate: Sample rate for analysis

    Returns:
        BeatInfo with beat times, tempo, and confidence

    Raises:
        ImportError: If librosa is not installed
    """
    try:
        import librosa
    except ImportError:
        raise ImportError("librosa required for beat detection. Install with: pip install librosa")

    # Load audio (mono, resampled to sample_rate)
    y, sr = librosa.load(str(path), sr=sample_rate, mono=True)

    # Detect tempo and beats
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

    # Convert frames to times
    beat_times = librosa.frames_to_time(beat_frames, sr=sr).tolist()

    # Estimate confidence from onset strength consistency:
    # mean onset strength at the detected beats, relative to the global peak.
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    beat_strength = onset_env[beat_frames] if len(beat_frames) > 0 else []
    # 0.5 is the neutral fallback when there are no beats or no onset energy.
    confidence = float(beat_strength.mean() / onset_env.max()) if len(beat_strength) > 0 and onset_env.max() > 0 else 0.5

    # Detect downbeats (first beat of each bar)
    # Use beat phase to estimate bar positions
    downbeat_times = None
    if len(beat_times) >= 4:
        # Assume 4/4 time signature, downbeats every 4 beats
        downbeat_times = [beat_times[i] for i in range(0, len(beat_times), 4)]

    return BeatInfo(
        beat_times=beat_times,
        # librosa may return tempo as a scalar or a 1-element ndarray
        # depending on version; coerce either shape to a plain float.
        # NOTE(review): a multi-element tempo array would make float(tempo)
        # raise here — confirm against the pinned librosa version.
        tempo=float(tempo) if hasattr(tempo, '__float__') else float(tempo[0]) if len(tempo) > 0 else 120.0,
        # Clamp the heuristic confidence into the documented 0-1 range.
        confidence=min(1.0, max(0.0, confidence)),
        downbeat_times=downbeat_times,
        time_signature=4,
    )
|
||||
|
||||
|
||||
def analyze_energy(path: Path, window_ms: float = 50.0, sample_rate: int = 22050) -> EnergyEnvelope:
    """
    Extract the energy (loudness) envelope of an audio file.

    Computes RMS energy per analysis window and scales the curve so the
    loudest window is 1.0.

    Args:
        path: Path to audio file
        window_ms: Analysis window size in milliseconds
        sample_rate: Sample rate for analysis

    Returns:
        EnergyEnvelope with time points and normalized values

    Raises:
        ImportError: If librosa or numpy is not installed
    """
    try:
        import librosa
        import numpy as np
    except ImportError:
        raise ImportError("librosa and numpy required. Install with: pip install librosa numpy")

    samples, sr = librosa.load(str(path), sr=sample_rate, mono=True)

    # Hop size in samples corresponding to the requested window.
    hop = int(sr * window_ms / 1000)

    # Root-mean-square energy per frame (rms returns shape (1, n_frames)).
    envelope = librosa.feature.rms(y=samples, hop_length=hop)[0]

    # Normalize so the loudest frame is 1.0; an all-silent signal is left as-is.
    peak = envelope.max()
    normalized = envelope / peak if peak > 0 else envelope

    frame_times = librosa.frames_to_time(np.arange(len(envelope)), sr=sr, hop_length=hop)

    return EnergyEnvelope(
        times=frame_times.tolist(),
        values=normalized.tolist(),
        window_ms=window_ms,
    )
|
||||
|
||||
|
||||
def analyze_spectrum(
    path: Path,
    band_ranges: Optional[dict] = None,
    window_ms: float = 50.0,
    sample_rate: int = 22050
) -> SpectrumBands:
    """
    Extract frequency band envelopes.

    Args:
        path: Path to audio file
        band_ranges: Dict mapping band name to (low_hz, high_hz); must
            contain "bass", "mid", and "high" keys. Defaults to the
            standard 20-200 / 200-2000 / 2000-20000 Hz split.
        window_ms: Analysis window size
        sample_rate: Sample rate

    Returns:
        SpectrumBands with bass, mid, high envelopes (each normalized to
        its own per-band peak)

    Raises:
        ImportError: If librosa or numpy is not installed
    """
    try:
        import librosa
        import numpy as np
    except ImportError:
        raise ImportError("librosa and numpy required")

    if band_ranges is None:
        band_ranges = {
            "bass": (20, 200),
            "mid": (200, 2000),
            "high": (2000, 20000),
        }

    y, sr = librosa.load(str(path), sr=sample_rate, mono=True)
    hop_length = int(sr * window_ms / 1000)

    # Compute STFT (magnitudes only; phase is irrelevant for band energy)
    n_fft = 2048
    stft = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))

    # Frequency bins (center frequency of each STFT row)
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    def band_energy(low_hz: float, high_hz: float) -> List[float]:
        """Sum energy in frequency band, normalized to the band's own peak."""
        mask = (freqs >= low_hz) & (freqs <= high_hz)
        if not mask.any():
            # Band falls entirely between/outside the FFT bins: all zeros.
            return [0.0] * stft.shape[1]
        band = stft[mask, :].sum(axis=0)
        # Normalize per band so each envelope peaks at 1.0 independently —
        # band values are not comparable across bands in absolute terms.
        band_max = band.max()
        if band_max > 0:
            band = band / band_max
        return band.tolist()

    times = librosa.frames_to_time(np.arange(stft.shape[1]), sr=sr, hop_length=hop_length)

    return SpectrumBands(
        bass=band_energy(*band_ranges["bass"]),
        mid=band_energy(*band_ranges["mid"]),
        high=band_energy(*band_ranges["high"]),
        times=times.tolist(),
        band_ranges=band_ranges,
    )
|
||||
|
||||
|
||||
def analyze_onsets(path: Path, sample_rate: int = 22050) -> List[float]:
    """
    Detect onset times (note/sound starts) in an audio file.

    Args:
        path: Path to audio file
        sample_rate: Sample rate for analysis

    Returns:
        Onset times in seconds

    Raises:
        ImportError: If librosa is not installed
    """
    try:
        import librosa
    except ImportError:
        raise ImportError("librosa required")

    samples, sr = librosa.load(str(path), sr=sample_rate, mono=True)

    # Convert detected onset frame indices straight to seconds.
    frames = librosa.onset.onset_detect(y=samples, sr=sr)
    return librosa.frames_to_time(frames, sr=sr).tolist()
|
||||
|
||||
|
||||
def analyze_audio(
    path: Path,
    features: Optional[List[str]] = None,
) -> AudioFeatures:
    """
    Extract audio features from file.

    Basic metadata (duration, sample rate, channels) always comes from
    ffprobe; advanced features require librosa and are skipped (with a
    warning) when it is not installed. Each advanced feature is
    best-effort: an individual failure is logged, not raised.

    Args:
        path: Path to audio/video file
        features: List of features to extract. Options:
            - "beats": Beat detection (tempo, beat times)
            - "energy": Loudness envelope
            - "spectrum": Frequency band envelopes
            - "onsets": Note onset times
            - "all": All features

    Returns:
        AudioFeatures with requested analysis

    Raises:
        ValueError: If ffprobe cannot read the file or audio extraction fails
    """
    if features is None:
        features = [FEATURE_ALL]

    # Normalize features: "all" expands to every concrete feature.
    if FEATURE_ALL in features:
        features = [FEATURE_BEATS, FEATURE_ENERGY, FEATURE_SPECTRUM, FEATURE_ONSETS]

    # Get basic info via ffprobe
    duration, sample_rate, channels = _get_audio_info_ffprobe(path)

    result = AudioFeatures(
        duration=duration,
        sample_rate=sample_rate,
        channels=channels,
    )

    # Check if librosa is available for advanced features
    try:
        import librosa  # noqa: F401
        has_librosa = True
    except ImportError:
        has_librosa = False
        if any(f in features for f in [FEATURE_BEATS, FEATURE_ENERGY, FEATURE_SPECTRUM, FEATURE_ONSETS]):
            logger.warning("librosa not available, skipping advanced audio features")

    # Without librosa only the ffprobe metadata is returned.
    if not has_librosa:
        return result

    # Extract audio to WAV for librosa (one decode shared by all features)
    wav_path = None
    try:
        wav_path = _extract_audio_to_wav(path)

        # "tempo" alone also triggers beat analysis, since tempo is a
        # by-product of beat tracking.
        if FEATURE_BEATS in features or FEATURE_TEMPO in features:
            try:
                result.beats = analyze_beats(wav_path)
            except Exception as e:
                logger.warning(f"Beat detection failed: {e}")

        if FEATURE_ENERGY in features:
            try:
                result.energy = analyze_energy(wav_path)
            except Exception as e:
                logger.warning(f"Energy analysis failed: {e}")

        if FEATURE_SPECTRUM in features:
            try:
                result.spectrum = analyze_spectrum(wav_path)
            except Exception as e:
                logger.warning(f"Spectrum analysis failed: {e}")

        if FEATURE_ONSETS in features:
            try:
                result.onsets = analyze_onsets(wav_path)
            except Exception as e:
                logger.warning(f"Onset detection failed: {e}")

    finally:
        # Clean up temporary WAV file even if extraction/analysis raised.
        if wav_path and wav_path.exists():
            wav_path.unlink()

    return result
|
||||
352
artdag/analysis/schema.py
Normal file
352
artdag/analysis/schema.py
Normal file
@@ -0,0 +1,352 @@
|
||||
# artdag/analysis/schema.py
|
||||
"""
|
||||
Data structures for analysis results.
|
||||
|
||||
Analysis extracts features from input media that inform downstream processing.
|
||||
Results are cached by: analysis_cache_id = SHA3-256(input_hash + sorted(features))
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
def _stable_hash(data: Any, algorithm: str = "sha3_256") -> str:
    """
    Create a deterministic hash of arbitrary JSON-serializable data.

    Keys are sorted and separators minimized so the same logical content
    always yields the same digest, regardless of dict insertion order.
    """
    canonical = json.dumps(data, sort_keys=True, separators=(",", ":"))
    return hashlib.new(algorithm, canonical.encode()).hexdigest()
|
||||
|
||||
|
||||
@dataclass
class BeatInfo:
    """
    Beat detection results.

    Attributes:
        beat_times: List of beat positions in seconds
        tempo: Estimated tempo in BPM
        confidence: Tempo detection confidence (0-1)
        downbeat_times: First beat of each bar (if detected)
        time_signature: Detected or assumed time signature (e.g., 4)
    """
    beat_times: List[float]
    tempo: float
    confidence: float = 1.0
    downbeat_times: Optional[List[float]] = None
    time_signature: int = 4

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        keys = ("beat_times", "tempo", "confidence", "downbeat_times", "time_signature")
        return {key: getattr(self, key) for key in keys}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "BeatInfo":
        """Reconstruct from a dict produced by to_dict()."""
        defaults = {"confidence": 1.0, "downbeat_times": None, "time_signature": 4}
        optional = {key: data.get(key, fallback) for key, fallback in defaults.items()}
        return cls(
            beat_times=data["beat_times"],
            tempo=data["tempo"],
            **optional,
        )
|
||||
|
||||
|
||||
@dataclass
class EnergyEnvelope:
    """
    Energy (loudness) over time.

    Attributes:
        times: Time points in seconds
        values: Energy values (0-1, normalized)
        window_ms: Analysis window size in milliseconds
    """
    times: List[float]
    values: List[float]
    window_ms: float = 50.0

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        return {
            "times": self.times,
            "values": self.values,
            "window_ms": self.window_ms,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EnergyEnvelope":
        """Reconstruct from a dict produced by to_dict()."""
        return cls(
            times=data["times"],
            values=data["values"],
            window_ms=data.get("window_ms", 50.0),
        )

    def at_time(self, t: float) -> float:
        """
        Linearly interpolate the energy value at time t (seconds).

        Clamps to the first/last sample outside the envelope's time range;
        an empty envelope yields 0.0.
        """
        if not self.times:
            return 0.0
        if t <= self.times[0]:
            return self.values[0]
        if t >= self.times[-1]:
            return self.values[-1]

        # Binary search for the two samples that bracket t.
        left, right = 0, len(self.times) - 1
        while right - left > 1:
            middle = (left + right) // 2
            if self.times[middle] <= t:
                left = middle
            else:
                right = middle

        # Linear interpolation between the bracketing samples.
        t0, t1 = self.times[left], self.times[right]
        v0, v1 = self.values[left], self.values[right]
        frac = (t - t0) / (t1 - t0) if t1 != t0 else 0
        return v0 + frac * (v1 - v0)
|
||||
|
||||
|
||||
@dataclass
class SpectrumBands:
    """
    Frequency band envelopes over time.

    Attributes:
        bass: Low frequency envelope (20-200 Hz typical)
        mid: Mid frequency envelope (200-2000 Hz typical)
        high: High frequency envelope (2000-20000 Hz typical)
        times: Time points in seconds
        band_ranges: Frequency ranges for each band in Hz
    """
    bass: List[float]
    mid: List[float]
    high: List[float]
    times: List[float]
    band_ranges: Dict[str, Tuple[float, float]] = field(default_factory=lambda: {
        "bass": (20, 200),
        "mid": (200, 2000),
        "high": (2000, 20000),
    })

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-safe dict (tuples become lists in JSON)."""
        return {
            "bass": self.bass,
            "mid": self.mid,
            "high": self.high,
            "times": self.times,
            "band_ranges": self.band_ranges,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SpectrumBands":
        """
        Reconstruct from a dict produced by to_dict().

        JSON serialization turns the (low_hz, high_hz) tuples in band_ranges
        into lists; coerce them back to tuples so round-tripped instances
        compare equal to freshly-constructed ones and match the declared
        Dict[str, Tuple[float, float]] field type.
        """
        raw_ranges = data.get("band_ranges", {
            "bass": (20, 200),
            "mid": (200, 2000),
            "high": (2000, 20000),
        })
        return cls(
            bass=data["bass"],
            mid=data["mid"],
            high=data["high"],
            times=data["times"],
            band_ranges={name: tuple(rng) for name, rng in raw_ranges.items()},
        )
|
||||
|
||||
|
||||
@dataclass
class AudioFeatures:
    """
    All extracted audio features.

    Attributes:
        duration: Audio duration in seconds
        sample_rate: Sample rate in Hz
        channels: Number of audio channels
        beats: Beat detection results
        energy: Energy envelope
        spectrum: Frequency band envelopes
        onsets: Note/sound onset times
    """
    duration: float
    sample_rate: int
    channels: int
    beats: Optional[BeatInfo] = None
    energy: Optional[EnergyEnvelope] = None
    spectrum: Optional[SpectrumBands] = None
    onsets: Optional[List[float]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-safe dict; absent features serialize as None."""
        def dump(feature):
            # Nested feature objects know how to serialize themselves.
            return feature.to_dict() if feature else None

        return {
            "duration": self.duration,
            "sample_rate": self.sample_rate,
            "channels": self.channels,
            "beats": dump(self.beats),
            "energy": dump(self.energy),
            "spectrum": dump(self.spectrum),
            "onsets": self.onsets,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AudioFeatures":
        """Reconstruct from a dict produced by to_dict()."""
        def load(key, feature_cls):
            raw = data.get(key)
            return feature_cls.from_dict(raw) if raw else None

        return cls(
            duration=data["duration"],
            sample_rate=data["sample_rate"],
            channels=data["channels"],
            beats=load("beats", BeatInfo),
            energy=load("energy", EnergyEnvelope),
            spectrum=load("spectrum", SpectrumBands),
            onsets=data.get("onsets"),
        )
|
||||
|
||||
|
||||
@dataclass
class VideoFeatures:
    """
    Extracted video features.

    Attributes:
        duration: Video duration in seconds
        frame_rate: Frames per second
        width: Frame width in pixels
        height: Frame height in pixels
        codec: Video codec name
        motion_tempo: Estimated tempo from motion analysis (optional)
        scene_changes: Times of detected scene changes
    """
    duration: float
    frame_rate: float
    width: int
    height: int
    codec: str = ""
    motion_tempo: Optional[float] = None
    scene_changes: Optional[List[float]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        keys = ("duration", "frame_rate", "width", "height",
                "codec", "motion_tempo", "scene_changes")
        return {key: getattr(self, key) for key in keys}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "VideoFeatures":
        """Reconstruct from a dict produced by to_dict()."""
        optional = {
            "codec": data.get("codec", ""),
            "motion_tempo": data.get("motion_tempo"),
            "scene_changes": data.get("scene_changes"),
        }
        return cls(
            duration=data["duration"],
            frame_rate=data["frame_rate"],
            width=data["width"],
            height=data["height"],
            **optional,
        )
|
||||
|
||||
|
||||
@dataclass
class AnalysisResult:
    """
    Complete analysis result for an input.

    Combines audio and video features with metadata for caching.

    Attributes:
        input_hash: Content hash of the analyzed input
        features_requested: List of features that were requested
        audio: Audio features (if input has audio)
        video: Video features (if input has video)
        cache_id: Computed cache ID for this analysis
        analyzed_at: Timestamp of analysis
    """
    input_hash: str
    features_requested: List[str]
    audio: Optional[AudioFeatures] = None
    video: Optional[VideoFeatures] = None
    cache_id: Optional[str] = None
    analyzed_at: Optional[str] = None

    def __post_init__(self):
        """Derive cache_id from the inputs unless one was supplied."""
        if self.cache_id is None:
            self.cache_id = self._compute_cache_id()

    def _compute_cache_id(self) -> str:
        """
        Compute cache ID from input hash and requested features.

        cache_id = SHA3-256(input_hash + sorted(features_requested))
        """
        # Sorting makes the id independent of request order.
        return _stable_hash({
            "input_hash": self.input_hash,
            "features": sorted(self.features_requested),
        })

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-safe dict; absent features serialize as None."""
        audio_dict = self.audio.to_dict() if self.audio else None
        video_dict = self.video.to_dict() if self.video else None
        return {
            "input_hash": self.input_hash,
            "features_requested": self.features_requested,
            "audio": audio_dict,
            "video": video_dict,
            "cache_id": self.cache_id,
            "analyzed_at": self.analyzed_at,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AnalysisResult":
        """Reconstruct from a dict produced by to_dict()."""
        audio_raw = data.get("audio")
        video_raw = data.get("video")
        return cls(
            input_hash=data["input_hash"],
            features_requested=data["features_requested"],
            audio=AudioFeatures.from_dict(audio_raw) if audio_raw else None,
            video=VideoFeatures.from_dict(video_raw) if video_raw else None,
            cache_id=data.get("cache_id"),
            analyzed_at=data.get("analyzed_at"),
        )

    def to_json(self) -> str:
        """Serialize to a JSON string."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "AnalysisResult":
        """Deserialize from a JSON string."""
        return cls.from_dict(json.loads(json_str))

    # Convenience accessors

    @property
    def tempo(self) -> Optional[float]:
        """Tempo in BPM, if beats were analyzed."""
        if self.audio and self.audio.beats:
            return self.audio.beats.tempo
        return None

    @property
    def beat_times(self) -> Optional[List[float]]:
        """Beat positions in seconds, if beats were analyzed."""
        if self.audio and self.audio.beats:
            return self.audio.beats.beat_times
        return None

    @property
    def downbeat_times(self) -> Optional[List[float]]:
        """Downbeat positions in seconds, if analyzed."""
        if self.audio and self.audio.beats:
            return self.audio.beats.downbeat_times
        return None

    @property
    def duration(self) -> float:
        """Duration in seconds, preferring video over audio; 0.0 if neither."""
        if self.video:
            return self.video.duration
        if self.audio:
            return self.audio.duration
        return 0.0

    @property
    def dimensions(self) -> Optional[Tuple[int, int]]:
        """(width, height) of the video, if video was analyzed."""
        if self.video:
            return (self.video.width, self.video.height)
        return None
|
||||
266
artdag/analysis/video.py
Normal file
266
artdag/analysis/video.py
Normal file
@@ -0,0 +1,266 @@
|
||||
# artdag/analysis/video.py
|
||||
"""
|
||||
Video feature extraction.
|
||||
|
||||
Uses ffprobe for basic metadata and optional OpenCV for motion analysis.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import subprocess
|
||||
from fractions import Fraction
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
from .schema import VideoFeatures
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Feature names accepted by analyze_video()'s `features` argument.
FEATURE_METADATA = "metadata"  # basic stream info via ffprobe (always extracted)
FEATURE_MOTION_TEMPO = "motion_tempo"  # BPM estimate from frame-difference periodicity
FEATURE_SCENE_CHANGES = "scene_changes"  # cut timestamps from ffmpeg's scene filter
FEATURE_ALL = "all"  # sentinel: expands to every feature above
|
||||
|
||||
|
||||
def _parse_frame_rate(rate_str: str) -> float:
|
||||
"""Parse frame rate string like '30000/1001' or '30'."""
|
||||
try:
|
||||
if "/" in rate_str:
|
||||
frac = Fraction(rate_str)
|
||||
return float(frac)
|
||||
return float(rate_str)
|
||||
except (ValueError, ZeroDivisionError):
|
||||
return 30.0 # Default
|
||||
|
||||
|
||||
def analyze_metadata(path: Path) -> VideoFeatures:
|
||||
"""
|
||||
Extract video metadata using ffprobe.
|
||||
|
||||
Args:
|
||||
path: Path to video file
|
||||
|
||||
Returns:
|
||||
VideoFeatures with basic metadata
|
||||
"""
|
||||
cmd = [
|
||||
"ffprobe", "-v", "quiet",
|
||||
"-print_format", "json",
|
||||
"-show_streams",
|
||||
"-show_format",
|
||||
"-select_streams", "v:0",
|
||||
str(path)
|
||||
]
|
||||
|
||||
try:
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
||||
data = json.loads(result.stdout)
|
||||
except (subprocess.CalledProcessError, json.JSONDecodeError) as e:
|
||||
raise ValueError(f"Could not read video info: {e}")
|
||||
|
||||
if not data.get("streams"):
|
||||
raise ValueError("No video stream found")
|
||||
|
||||
stream = data["streams"][0]
|
||||
fmt = data.get("format", {})
|
||||
|
||||
# Get duration from format or stream
|
||||
duration = float(fmt.get("duration", stream.get("duration", 0)))
|
||||
|
||||
# Parse frame rate
|
||||
frame_rate = _parse_frame_rate(stream.get("avg_frame_rate", "30"))
|
||||
|
||||
return VideoFeatures(
|
||||
duration=duration,
|
||||
frame_rate=frame_rate,
|
||||
width=int(stream.get("width", 0)),
|
||||
height=int(stream.get("height", 0)),
|
||||
codec=stream.get("codec_name", ""),
|
||||
)
|
||||
|
||||
|
||||
def analyze_scene_changes(path: Path, threshold: float = 0.3) -> List[float]:
    """
    Detect scene changes using ffmpeg scene detection.

    Args:
        path: Path to video file
        threshold: Scene change threshold (0-1, lower = more sensitive)

    Returns:
        List of scene change times in seconds; empty on failure (best-effort).
    """
    cmd = [
        "ffmpeg", "-i", str(path),
        "-vf", f"select='gt(scene,{threshold})',showinfo",
        "-f", "null", "-"
    ]

    try:
        # No check=True here on purpose: ffmpeg may exit nonzero even when the
        # showinfo lines we parse are present, so only launch failures abort.
        # (The old `except CalledProcessError` was dead code for that reason,
        # while a missing ffmpeg binary crashed the caller.)
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as e:  # ffmpeg binary missing or not executable
        logger.warning(f"Scene detection failed: {e}")
        return []
    stderr = result.stderr

    # showinfo logs one line per selected frame; extract its pts_time field.
    scene_times = []
    for line in stderr.split("\n"):
        if "pts_time:" not in line:
            continue
        try:
            for part in line.split():
                if part.startswith("pts_time:"):
                    time_str = part.split(":")[1]
                    scene_times.append(float(time_str))
                    break
        except (ValueError, IndexError):
            continue

    return scene_times
|
||||
|
||||
|
||||
def analyze_motion_tempo(path: Path, sample_duration: float = 30.0) -> Optional[float]:
    """
    Estimate tempo from video motion periodicity.

    Computes a per-frame difference-energy series and searches its
    autocorrelation for a dominant period. Useful for matching video speed
    to audio tempo.

    Args:
        path: Path to video file
        sample_duration: Duration to analyze (seconds)

    Returns:
        Estimated motion tempo in BPM, or None if not detectable
    """
    try:
        import cv2
        import numpy as np
    except ImportError:
        logger.warning("OpenCV not available, skipping motion tempo analysis")
        return None

    cap = cv2.VideoCapture(str(path))
    if not cap.isOpened():
        logger.warning(f"Could not open video: {path}")
        return None

    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps <= 0:
            fps = 30.0  # Some containers report 0; fall back to a common rate

        max_frames = int(sample_duration * fps)
        frame_diffs = []
        prev_gray = None

        frame_count = 0
        while frame_count < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            # Convert to grayscale and resize for speed
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            gray = cv2.resize(gray, (160, 90))

            if prev_gray is not None:
                # Mean absolute frame difference as a motion-energy proxy
                diff = cv2.absdiff(gray, prev_gray)
                frame_diffs.append(np.mean(diff))

            prev_gray = gray
            frame_count += 1

        if len(frame_diffs) < 60:  # Need at least 2 seconds at 30fps
            return None

        motion = np.array(frame_diffs)

        # Normalize to zero mean, unit variance
        motion = motion - motion.mean()
        std = motion.std()
        if std == 0:
            # BUG FIX: a static video yields an all-zero series. Previously
            # execution continued, acf[0] was 0, the normalization below
            # produced NaNs, `NaN < 0.1` evaluated False, and a bogus BPM at
            # min_lag was returned. There is no motion signal: report None.
            return None
        motion = motion / std

        # Autocorrelation to find periodicity
        n = len(motion)
        acf = np.correlate(motion, motion, mode="full")[n-1:]
        acf = acf / acf[0]  # acf[0] == n > 0 after unit-variance normalization

        # Find peaks in autocorrelation (potential beat periods)
        # Look for periods between 0.3s (200 BPM) and 2s (30 BPM)
        min_lag = int(0.3 * fps)
        max_lag = min(int(2.0 * fps), len(acf) - 1)

        if max_lag <= min_lag:
            return None

        # Find the highest peak in the valid range
        search_range = acf[min_lag:max_lag]
        if len(search_range) == 0:
            return None

        peak_idx = np.argmax(search_range) + min_lag
        peak_value = acf[peak_idx]

        # Only report if peak is significant
        if peak_value < 0.1:
            return None

        # Convert lag to BPM
        period_seconds = peak_idx / fps
        bpm = 60.0 / period_seconds

        # Sanity check: discard implausible tempi
        if 30 <= bpm <= 200:
            return round(bpm, 1)

        return None

    finally:
        cap.release()
|
||||
|
||||
|
||||
def analyze_video(
    path: Path,
    features: Optional[List[str]] = None,
) -> VideoFeatures:
    """
    Extract video features from file.

    Args:
        path: Path to video file
        features: Feature names to extract — "metadata" (basic video info,
            always included), "motion_tempo", "scene_changes", or "all".
            Defaults to metadata only.

    Returns:
        VideoFeatures with requested analysis
    """
    requested = list(features) if features is not None else [FEATURE_METADATA]
    if FEATURE_ALL in requested:
        requested = [FEATURE_METADATA, FEATURE_MOTION_TEMPO, FEATURE_SCENE_CHANGES]

    # Metadata is the baseline result; the optional passes decorate it.
    result = analyze_metadata(path)

    optional_passes = (
        (FEATURE_MOTION_TEMPO, "motion_tempo", analyze_motion_tempo, "Motion tempo analysis"),
        (FEATURE_SCENE_CHANGES, "scene_changes", analyze_scene_changes, "Scene change detection"),
    )
    for feature_name, attr, extractor, label in optional_passes:
        if feature_name not in requested:
            continue
        try:
            setattr(result, attr, extractor(path))
        except Exception as e:
            # Best-effort: optional features never abort the whole analysis.
            logger.warning(f"{label} failed: {e}")

    return result
|
||||
Reference in New Issue
Block a user