Files
mono/artdag/analysis/audio.py
giles cc2dcbddd4 Squashed 'core/' content from commit 4957443
git-subtree-dir: core
git-subtree-split: 4957443184ae0eb6323635a90a19acffb3e01d07
2026-02-24 23:09:39 +00:00

337 lines
9.8 KiB
Python

# artdag/analysis/audio.py
"""
Audio feature extraction.
Uses librosa for beat detection, energy analysis, and spectral features.
ffprobe always supplies basic stream info (duration, sample rate, channels); if librosa is not installed, only that basic info is returned.
"""
import json
import logging
import subprocess
from pathlib import Path
from typing import List, Optional, Tuple
from .schema import AudioFeatures, BeatInfo, EnergyEnvelope, SpectrumBands
logger = logging.getLogger(__name__)
# Feature names accepted in the ``features`` list passed to analyze_audio().
FEATURE_BEATS = "beats"  # beat times and tempo (analyze_beats)
FEATURE_TEMPO = "tempo"  # triggers the same analyze_beats pass as "beats"
FEATURE_ENERGY = "energy"  # peak-normalized RMS envelope (analyze_energy)
FEATURE_SPECTRUM = "spectrum"  # bass/mid/high band envelopes (analyze_spectrum)
FEATURE_ONSETS = "onsets"  # onset times in seconds (analyze_onsets)
FEATURE_ALL = "all"  # expands to beats, energy, spectrum and onsets
def _get_audio_info_ffprobe(path: Path) -> Tuple[float, int, int]:
    """
    Get basic info about the first audio stream using ffprobe.

    Args:
        path: Path to an audio/video file

    Returns:
        Tuple of (duration, sample_rate, channels) as reported by ffprobe.
        Fields missing from ffprobe's output fall back to 0, 44100 and 2.

    Raises:
        ValueError: If ffprobe cannot be run, exits with an error, prints
            output that cannot be parsed, or reports no audio stream.
    """
    cmd = [
        "ffprobe", "-v", "quiet",
        "-print_format", "json",
        "-show_streams",
        "-select_streams", "a:0",
        str(path),
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        data = json.loads(result.stdout)
    except (OSError, subprocess.CalledProcessError, json.JSONDecodeError) as e:
        # OSError covers a missing or non-executable ffprobe binary, which
        # would otherwise escape as FileNotFoundError instead of ValueError.
        logger.warning(f"ffprobe failed: {e}")
        raise ValueError(f"Could not read audio info: {e}") from e

    # Raised outside the try so this message is never re-wrapped.
    streams = data.get("streams") if isinstance(data, dict) else None
    if not streams:
        raise ValueError("No audio stream found")

    try:
        stream = streams[0]
        duration = float(stream.get("duration", 0))
        sample_rate = int(stream.get("sample_rate", 44100))
        channels = int(stream.get("channels", 2))
    except (AttributeError, KeyError, TypeError, ValueError) as e:
        # Unexpected JSON shape or a non-numeric field value.
        logger.warning(f"ffprobe failed: {e}")
        raise ValueError(f"Could not read audio info: {e}") from e

    return duration, sample_rate, channels
def _extract_audio_to_wav(path: Path, duration: Optional[float] = None) -> Path:
    """
    Extract audio to a temporary WAV file for librosa processing.

    ffmpeg converts the input to 16-bit PCM, mono, 22050 Hz with any video
    stream dropped.

    Args:
        path: Path to the source audio/video file
        duration: If given and non-zero, passed to ffmpeg's ``-t`` option to
            limit how much audio is written

    Returns:
        Path to the temporary WAV file. The caller is responsible for
        deleting it.

    Raises:
        ValueError: If ffmpeg cannot be run or exits with an error. The
            temporary file is removed before raising.
    """
    import os
    import tempfile

    # mkstemp atomically creates a uniquely named file; the deprecated mktemp
    # only returns a name, which another process could claim first.
    fd, tmp_name = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # ffmpeg reopens the path itself; "-y" lets it overwrite
    wav_path = Path(tmp_name)

    cmd = ["ffmpeg", "-y", "-i", str(path)]
    if duration:
        cmd.extend(["-t", str(duration)])
    cmd.extend([
        "-vn",  # No video
        "-acodec", "pcm_s16le",
        "-ar", "22050",  # Resample to 22050 Hz for librosa
        "-ac", "1",  # Mono
        str(wav_path),
    ])

    try:
        subprocess.run(cmd, capture_output=True, check=True)
    except (OSError, subprocess.CalledProcessError) as e:
        # The caller never learns this path on failure, so clean up here.
        try:
            wav_path.unlink()
        except OSError:
            pass  # best-effort cleanup; the extraction error matters more

        # CalledProcessError carries ffmpeg's stderr as bytes; OSError
        # (e.g. ffmpeg not installed) has none.
        detail = getattr(e, "stderr", None)
        if isinstance(detail, bytes):
            detail = detail.decode(errors="replace")
        logger.error(f"Audio extraction failed: {detail or e}")
        raise ValueError(f"Could not extract audio: {e}") from e

    return wav_path
def analyze_beats(path: Path, sample_rate: int = 22050) -> BeatInfo:
    """
    Detect beats and tempo using librosa.

    Args:
        path: Path to audio file (or pre-extracted WAV)
        sample_rate: Sample rate for analysis

    Returns:
        BeatInfo with beat times, tempo, a confidence clamped to [0, 1],
        and downbeat times estimated by assuming a 4/4 time signature
        (every fourth beat, starting from the first).

    Raises:
        ImportError: If librosa is not installed
    """
    try:
        import librosa
    except ImportError as e:
        raise ImportError(
            "librosa required for beat detection. Install with: pip install librosa"
        ) from e
    import numpy as np  # librosa depends on numpy, so it is importable here

    # Load audio
    y, sr = librosa.load(str(path), sr=sample_rate, mono=True)

    # Detect tempo and beats
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

    # Convert frames to times
    beat_times = librosa.frames_to_time(beat_frames, sr=sr).tolist()

    # Confidence: mean onset strength at the detected beats relative to the
    # peak onset strength. Falls back to 0.5 when it cannot be computed.
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    confidence = 0.5
    if len(beat_frames) > 0:
        peak = onset_env.max()
        if peak > 0:
            confidence = float(onset_env[beat_frames].mean() / peak)

    # Downbeats: assume 4/4, so every fourth beat starting at the first.
    downbeat_times = beat_times[::4] if len(beat_times) >= 4 else None

    # beat_track may return tempo as a scalar or as an array. ndarrays always
    # expose __float__, so a hasattr check cannot tell them apart, and
    # float() on a non-scalar array is deprecated and fails when it is empty.
    # Normalizing to 1-D handles every shape and keeps the 120.0 fallback
    # reachable.
    tempo_values = np.atleast_1d(np.asarray(tempo, dtype=float)).ravel()
    tempo_value = float(tempo_values[0]) if tempo_values.size > 0 else 120.0

    return BeatInfo(
        beat_times=beat_times,
        tempo=tempo_value,
        confidence=min(1.0, max(0.0, confidence)),
        downbeat_times=downbeat_times,
        time_signature=4,
    )
def analyze_energy(path: Path, window_ms: float = 50.0, sample_rate: int = 22050) -> EnergyEnvelope:
    """
    Extract energy (loudness) envelope.

    Args:
        path: Path to audio file
        window_ms: Spacing between successive RMS frames in milliseconds.
            It sets the hop length only; the RMS frame length is left at
            librosa's default.
        sample_rate: Sample rate for analysis

    Returns:
        EnergyEnvelope with frame times in seconds and RMS values divided
        by their peak (left as-is when the peak is 0)

    Raises:
        ImportError: If librosa or numpy is not installed
    """
    try:
        import librosa
        import numpy as np
    except ImportError as e:
        raise ImportError(
            "librosa and numpy required. Install with: pip install librosa numpy"
        ) from e

    y, sr = librosa.load(str(path), sr=sample_rate, mono=True)

    # Hop length in samples. A positive window shorter than one sample would
    # truncate to 0, which librosa rejects, so use at least one sample.
    # Non-positive windows are passed through for librosa to reject as before.
    hop_length = int(sr * window_ms / 1000)
    if window_ms > 0:
        hop_length = max(1, hop_length)

    # RMS energy
    rms = librosa.feature.rms(y=y, hop_length=hop_length)[0]

    # Normalize to 0-1 (skip for all-zero input to avoid dividing by zero)
    rms_max = rms.max()
    rms_normalized = rms / rms_max if rms_max > 0 else rms

    # Generate time points
    times = librosa.frames_to_time(np.arange(len(rms)), sr=sr, hop_length=hop_length)

    return EnergyEnvelope(
        times=times.tolist(),
        values=rms_normalized.tolist(),
        window_ms=window_ms,
    )
def analyze_spectrum(
    path: Path,
    band_ranges: Optional[dict] = None,
    window_ms: float = 50.0,
    sample_rate: int = 22050
) -> SpectrumBands:
    """
    Extract frequency band envelopes.

    Args:
        path: Path to audio file
        band_ranges: Dict mapping band name ("bass", "mid", "high") to
            (low_hz, high_hz), both bounds inclusive. Bands left out keep
            their defaults: bass 20-200, mid 200-2000, high 2000-20000.
        window_ms: Spacing between successive STFT frames in milliseconds.
            It sets the hop length only; the FFT size is fixed at 2048.
        sample_rate: Sample rate

    Returns:
        SpectrumBands with bass, mid, high envelopes (each divided by its
        own peak), the frame times, and the band ranges actually used

    Raises:
        ImportError: If librosa or numpy is not installed
    """
    try:
        import librosa
        import numpy as np
    except ImportError as e:
        raise ImportError("librosa and numpy required") from e

    default_ranges = {
        "bass": (20, 200),
        "mid": (200, 2000),
        "high": (2000, 20000),
    }
    # Caller-supplied ranges win; a partial dict no longer raises KeyError
    # because the missing bands fall back to the defaults.
    band_ranges = {**default_ranges, **(band_ranges or {})}

    y, sr = librosa.load(str(path), sr=sample_rate, mono=True)

    # Hop length in samples. A positive window shorter than one sample would
    # truncate to 0, which librosa rejects, so use at least one sample.
    hop_length = int(sr * window_ms / 1000)
    if window_ms > 0:
        hop_length = max(1, hop_length)

    # Magnitude spectrogram, shape (frequency bins, frames)
    n_fft = 2048
    stft = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))

    # Center frequency of each STFT row
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    def band_energy(low_hz: float, high_hz: float) -> List[float]:
        """Sum STFT magnitudes over the band per frame, scaled by the peak."""
        mask = (freqs >= low_hz) & (freqs <= high_hz)
        if not mask.any():
            # No bin falls in this band (e.g. entirely above Nyquist).
            return [0.0] * stft.shape[1]
        band = stft[mask, :].sum(axis=0)
        # Normalize
        band_max = band.max()
        if band_max > 0:
            band = band / band_max
        return band.tolist()

    times = librosa.frames_to_time(np.arange(stft.shape[1]), sr=sr, hop_length=hop_length)

    return SpectrumBands(
        bass=band_energy(*band_ranges["bass"]),
        mid=band_energy(*band_ranges["mid"]),
        high=band_energy(*band_ranges["high"]),
        times=times.tolist(),
        band_ranges=band_ranges,
    )
def analyze_onsets(path: Path, sample_rate: int = 22050) -> List[float]:
    """
    Find the times at which notes or sounds begin.

    Args:
        path: Path to audio file
        sample_rate: Sample rate the audio is loaded at

    Returns:
        Onset times in seconds, as a plain list of floats

    Raises:
        ImportError: If librosa is not installed
    """
    try:
        import librosa
    except ImportError:
        raise ImportError("librosa required")

    # Decode to a mono signal at the requested rate.
    samples, rate = librosa.load(str(path), sr=sample_rate, mono=True)

    # Onsets come back as frame indices; convert them to seconds.
    frames = librosa.onset.onset_detect(y=samples, sr=rate)
    return librosa.frames_to_time(frames, sr=rate).tolist()
def analyze_audio(
    path: Path,
    features: Optional[List[str]] = None,
) -> AudioFeatures:
    """
    Extract audio features from file.

    Basic stream info always comes from ffprobe. The requested analyses
    need librosa; if it is not installed a warning is logged and only the
    basic info is returned.

    Args:
        path: Path to audio/video file
        features: List of features to extract (defaults to ["all"]). Options:
            - "beats": Beat detection (tempo, beat times)
            - "tempo": Runs the same analysis as "beats"
            - "energy": Loudness envelope
            - "spectrum": Frequency band envelopes
            - "onsets": Note onset times
            - "all": beats, energy, spectrum and onsets

    Returns:
        AudioFeatures with duration, sample rate and channels, plus each
        requested analysis that succeeded. A failing analysis is logged as
        a warning and left unset.

    Raises:
        ValueError: Propagated from the ffprobe/ffmpeg helpers when stream
            info cannot be read or the audio cannot be extracted.
    """
    if features is None:
        features = [FEATURE_ALL]

    # Normalize features
    if FEATURE_ALL in features:
        features = [FEATURE_BEATS, FEATURE_ENERGY, FEATURE_SPECTRUM, FEATURE_ONSETS]

    # Get basic info via ffprobe
    duration, sample_rate, channels = _get_audio_info_ffprobe(path)
    result = AudioFeatures(
        duration=duration,
        sample_rate=sample_rate,
        channels=channels,
    )

    # (feature names that trigger it, result attribute, analyzer, log label)
    analyses = [
        ((FEATURE_BEATS, FEATURE_TEMPO), "beats", analyze_beats, "Beat detection"),
        ((FEATURE_ENERGY,), "energy", analyze_energy, "Energy analysis"),
        ((FEATURE_SPECTRUM,), "spectrum", analyze_spectrum, "Spectrum analysis"),
        ((FEATURE_ONSETS,), "onsets", analyze_onsets, "Onset detection"),
    ]
    requested = [
        entry for entry in analyses
        if any(name in features for name in entry[0])
    ]

    # Nothing librosa-backed was asked for: skip the librosa check and the
    # ffmpeg extraction entirely.
    if not requested:
        return result

    # Check if librosa is available for advanced features
    try:
        import librosa  # noqa: F401
    except ImportError:
        # Reached only when an advanced feature (including "tempo") was
        # actually requested, so the warning is always relevant.
        logger.warning("librosa not available, skipping advanced audio features")
        return result

    # Extract audio to WAV for librosa
    wav_path = None
    try:
        wav_path = _extract_audio_to_wav(path)

        for _names, attr, analyzer, label in requested:
            # Each analysis is best-effort: one failure must not discard
            # the results of the others.
            try:
                setattr(result, attr, analyzer(wav_path))
            except Exception as e:
                logger.warning(f"{label} failed: {e}")
    finally:
        # Clean up temporary WAV file
        if wav_path and wav_path.exists():
            wav_path.unlink()

    return result