rose-ash/artdag/analysis/audio.py

# artdag/analysis/audio.py
"""
Audio feature extraction.

Uses librosa for beat detection, energy analysis, and spectral features.
Basic stream info (duration, sample rate, channels) always comes from ffprobe;
if librosa is not available, only that basic info is returned.
"""

import json
import logging
import subprocess
from pathlib import Path
from typing import List, Optional, Tuple

from .schema import AudioFeatures, BeatInfo, EnergyEnvelope, SpectrumBands

logger = logging.getLogger(__name__)

# Feature names for requesting specific analysis.
# These are the strings accepted in the `features` list of analyze_audio().
FEATURE_BEATS = "beats"  # runs analyze_beats (beat times and tempo)
FEATURE_TEMPO = "tempo"  # triggers the same beat analysis as FEATURE_BEATS
FEATURE_ENERGY = "energy"  # runs analyze_energy
FEATURE_SPECTRUM = "spectrum"  # runs analyze_spectrum
FEATURE_ONSETS = "onsets"  # runs analyze_onsets
FEATURE_ALL = "all"  # expands to beats, energy, spectrum and onsets


def _get_audio_info_ffprobe(path: Path) -> Tuple[float, int, int]:
    """
    Get basic audio info using ffprobe.

    Args:
        path: Path to audio/video file

    Returns:
        Tuple of (duration in seconds, sample rate, channel count) for the
        first audio stream. Duration is taken from the stream, then from the
        container, and is 0.0 if neither reports a usable value. Sample rate
        and channels default to 44100 and 2 when ffprobe omits them.

    Raises:
        ValueError: If ffprobe cannot be run, its output cannot be parsed,
            or the file has no audio stream.
    """
    cmd = [
        "ffprobe", "-v", "quiet",
        "-print_format", "json",
        "-show_streams",
        "-show_format",  # container-level duration, used as a fallback below
        "-select_streams", "a:0",
        str(path)
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        data = json.loads(result.stdout)
    except (OSError, subprocess.CalledProcessError, json.JSONDecodeError) as e:
        # OSError covers a missing or non-executable ffprobe binary
        logger.warning(f"ffprobe failed: {e}")
        raise ValueError(f"Could not read audio info: {e}") from e

    streams = data.get("streams")
    if not streams:
        raise ValueError("No audio stream found")
    stream = streams[0]

    # The per-stream duration can be missing or non-numeric (e.g. "N/A");
    # try the container duration before giving up and reporting 0.
    duration = 0.0
    candidates = (
        stream.get("duration"),
        (data.get("format") or {}).get("duration"),
    )
    for raw in candidates:
        try:
            duration = float(raw)
        except (TypeError, ValueError):
            continue
        break

    sample_rate = int(stream.get("sample_rate", 44100))
    channels = int(stream.get("channels", 2))
    return duration, sample_rate, channels


def _extract_audio_to_wav(path: Path, duration: Optional[float] = None) -> Path:
    """
    Extract audio to temporary WAV file for librosa processing.

    The output is mono, 16-bit PCM, resampled to 22050 Hz. The caller owns the
    returned file and is responsible for deleting it.

    Args:
        path: Path to audio/video file
        duration: If given (and non-zero), only the first `duration` seconds
            are extracted

    Returns:
        Path to the temporary WAV file

    Raises:
        ValueError: If ffmpeg cannot be run or exits with an error. The
            temporary file is removed before raising.
    """
    import os
    import tempfile

    # mkstemp creates the file atomically; mktemp only returns a name and is
    # deprecated because another process can claim it first. ffmpeg's -y flag
    # lets it overwrite the empty placeholder.
    fd, tmp_name = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    wav_path = Path(tmp_name)

    cmd = ["ffmpeg", "-y", "-i", str(path)]
    if duration:
        cmd.extend(["-t", str(duration)])
    cmd.extend([
        "-vn",  # No video
        "-acodec", "pcm_s16le",
        "-ar", "22050",  # Resample to 22050 Hz for librosa
        "-ac", "1",  # Mono
        str(wav_path)
    ])

    try:
        # Decode output as text (tolerating odd bytes) so stderr logs readably
        subprocess.run(
            cmd, capture_output=True, text=True, errors="replace", check=True
        )
    except (OSError, subprocess.CalledProcessError) as e:
        # Do not leave the placeholder / partial output behind on failure
        try:
            wav_path.unlink()
        except OSError:
            pass
        # OSError (e.g. ffmpeg not installed) carries no stderr attribute
        detail = getattr(e, "stderr", None) or e
        logger.error(f"Audio extraction failed: {detail}")
        raise ValueError(f"Could not extract audio: {e}") from e

    return wav_path


def analyze_beats(path: Path, sample_rate: int = 22050) -> BeatInfo:
    """
    Detect beats and tempo using librosa.

    Args:
        path: Path to audio file (or pre-extracted WAV)
        sample_rate: Sample rate for analysis

    Returns:
        BeatInfo with beat times, tempo, and confidence. Confidence is the
        mean onset strength at the detected beats relative to the peak onset
        strength, clamped to [0, 1]; it is 0.5 when it cannot be estimated.
        Downbeats assume 4/4 time and are only reported when at least four
        beats were found.

    Raises:
        ImportError: If librosa is not installed
    """
    try:
        import librosa
        import numpy as np
    except ImportError as e:
        raise ImportError("librosa required for beat detection. Install with: pip install librosa") from e

    # Load audio
    y, sr = librosa.load(str(path), sr=sample_rate, mono=True)

    # Detect tempo and beats
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

    # Convert frames to times
    beat_times = librosa.frames_to_time(beat_frames, sr=sr).tolist()

    # Estimate confidence from onset strength consistency
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    confidence = 0.5
    if len(beat_frames) > 0:
        peak = onset_env.max()
        if peak > 0:
            confidence = float(onset_env[beat_frames].mean() / peak)

    # Detect downbeats (first beat of each bar).
    # Assume 4/4 time signature, so every 4th beat starts a bar.
    downbeat_times = beat_times[::4] if len(beat_times) >= 4 else None

    # beat_track may hand back tempo as a scalar or as an array; normalise to
    # 1-D so both cases (and an empty array) are handled without calling
    # float() on an array.
    tempo_values = np.atleast_1d(tempo).ravel()
    tempo_bpm = float(tempo_values[0]) if tempo_values.size > 0 else 120.0

    return BeatInfo(
        beat_times=beat_times,
        tempo=tempo_bpm,
        confidence=min(1.0, max(0.0, confidence)),
        downbeat_times=downbeat_times,
        time_signature=4,
    )


def analyze_energy(path: Path, window_ms: float = 50.0, sample_rate: int = 22050) -> EnergyEnvelope:
    """
    Extract energy (loudness) envelope.

    Args:
        path: Path to audio file
        window_ms: Spacing between successive energy values in milliseconds.
            This sets the hop between frames; the RMS frame length itself is
            left at librosa's default.
        sample_rate: Sample rate for analysis

    Returns:
        EnergyEnvelope with times and values normalized to 0-1 (values are
        left as-is when the signal is entirely silent)

    Raises:
        ValueError: If window_ms is not positive
        ImportError: If librosa or numpy is not installed
    """
    # Reject nonsensical windows up front, before the (slow) audio load
    if window_ms <= 0:
        raise ValueError(f"window_ms must be positive, got {window_ms}")

    try:
        import librosa
        import numpy as np
    except ImportError as e:
        raise ImportError("librosa and numpy required. Install with: pip install librosa numpy") from e

    y, sr = librosa.load(str(path), sr=sample_rate, mono=True)

    # Calculate hop from window_ms; a window shorter than one sample would
    # truncate to 0, which librosa rejects, so clamp to a single sample.
    hop_length = max(1, int(sr * window_ms / 1000))

    # RMS energy
    rms = librosa.feature.rms(y=y, hop_length=hop_length)[0]

    # Normalize to 0-1
    rms_max = rms.max()
    rms_normalized = rms / rms_max if rms_max > 0 else rms

    # Generate time points
    times = librosa.frames_to_time(np.arange(len(rms)), sr=sr, hop_length=hop_length)

    return EnergyEnvelope(
        times=times.tolist(),
        values=rms_normalized.tolist(),
        window_ms=window_ms,
    )


def analyze_spectrum(
    path: Path,
    band_ranges: Optional[dict] = None,
    window_ms: float = 50.0,
    sample_rate: int = 22050
) -> SpectrumBands:
    """
    Extract frequency band envelopes.

    Args:
        path: Path to audio file
        band_ranges: Dict mapping band name to (low_hz, high_hz). The bands
            "bass", "mid" and "high" are used; any of them missing from the
            dict falls back to its default range (20-200, 200-2000,
            2000-20000 Hz).
        window_ms: Spacing between successive spectrum frames in milliseconds
        sample_rate: Sample rate

    Returns:
        SpectrumBands with bass, mid, high envelopes, each normalized to its
        own maximum, plus the frame times and the band ranges used

    Raises:
        ValueError: If window_ms is not positive
        ImportError: If librosa or numpy is not installed
    """
    # Reject nonsensical windows up front, before the (slow) audio load
    if window_ms <= 0:
        raise ValueError(f"window_ms must be positive, got {window_ms}")

    try:
        import librosa
        import numpy as np
    except ImportError as e:
        raise ImportError("librosa and numpy required") from e

    default_ranges = {
        "bass": (20, 200),
        "mid": (200, 2000),
        "high": (2000, 20000),
    }
    if band_ranges is None:
        band_ranges = default_ranges
    else:
        # A partial mapping would otherwise fail with a bare KeyError below;
        # caller-supplied entries take precedence over the defaults.
        band_ranges = {**default_ranges, **band_ranges}

    y, sr = librosa.load(str(path), sr=sample_rate, mono=True)

    # A window shorter than one sample would truncate to 0, which librosa
    # rejects, so clamp to a single sample.
    hop_length = max(1, int(sr * window_ms / 1000))

    # Compute STFT magnitudes
    n_fft = 2048
    stft = np.abs(librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length))

    # Centre frequency of each STFT bin
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    def band_energy(low_hz: float, high_hz: float) -> List[float]:
        """Sum magnitude over the bins in [low_hz, high_hz], normalized to 0-1."""
        mask = (freqs >= low_hz) & (freqs <= high_hz)
        if not mask.any():
            # No bin falls inside the band: report silence for every frame
            return [0.0] * stft.shape[1]
        band = stft[mask, :].sum(axis=0)
        # Normalize
        band_max = band.max()
        if band_max > 0:
            band = band / band_max
        return band.tolist()

    times = librosa.frames_to_time(np.arange(stft.shape[1]), sr=sr, hop_length=hop_length)

    return SpectrumBands(
        bass=band_energy(*band_ranges["bass"]),
        mid=band_energy(*band_ranges["mid"]),
        high=band_energy(*band_ranges["high"]),
        times=times.tolist(),
        band_ranges=band_ranges,
    )


def analyze_onsets(path: Path, sample_rate: int = 22050) -> List[float]:
    """
    Find the moments at which new notes or sounds begin.

    Args:
        path: Path to audio file
        sample_rate: Sample rate the audio is loaded at

    Returns:
        Onset times in seconds, as a plain list of floats

    Raises:
        ImportError: If librosa is not installed
    """
    try:
        import librosa
    except ImportError:
        raise ImportError("librosa required")

    signal, rate = librosa.load(str(path), sr=sample_rate, mono=True)

    # onset_detect yields frame indices; convert them to seconds before returning
    frames = librosa.onset.onset_detect(y=signal, sr=rate)
    return librosa.frames_to_time(frames, sr=rate).tolist()


def analyze_audio(
    path: Path,
    features: Optional[List[str]] = None,
) -> AudioFeatures:
    """
    Extract audio features from file.

    Basic info (duration, sample rate, channels) is always read via ffprobe.
    The remaining features need librosa; if it is not installed they are
    skipped with a warning. A failure in one analysis is logged and does not
    prevent the others from running.

    Args:
        path: Path to audio/video file
        features: List of features to extract (defaults to all). Options:
            - "beats": Beat detection (tempo, beat times)
            - "tempo": Same analysis as "beats"
            - "energy": Loudness envelope
            - "spectrum": Frequency band envelopes
            - "onsets": Note onset times
            - "all": All features

    Returns:
        AudioFeatures with requested analysis

    Raises:
        ValueError: If basic info cannot be read or the audio cannot be
            extracted to a temporary WAV file
    """
    if features is None:
        features = [FEATURE_ALL]

    # Normalize features
    if FEATURE_ALL in features:
        features = [FEATURE_BEATS, FEATURE_ENERGY, FEATURE_SPECTRUM, FEATURE_ONSETS]

    # Get basic info via ffprobe
    duration, sample_rate, channels = _get_audio_info_ffprobe(path)

    result = AudioFeatures(
        duration=duration,
        sample_rate=sample_rate,
        channels=channels,
    )

    # (feature names that trigger it, result attribute, analyzer, log label)
    analyses = [
        ((FEATURE_BEATS, FEATURE_TEMPO), "beats", analyze_beats, "Beat detection"),
        ((FEATURE_ENERGY,), "energy", analyze_energy, "Energy analysis"),
        ((FEATURE_SPECTRUM,), "spectrum", analyze_spectrum, "Spectrum analysis"),
        ((FEATURE_ONSETS,), "onsets", analyze_onsets, "Onset detection"),
    ]
    requested = [
        entry for entry in analyses
        if any(name in features for name in entry[0])
    ]

    # Nothing librosa-backed was asked for: skip the ffmpeg extraction entirely
    if not requested:
        return result

    # Check if librosa is available for advanced features
    try:
        import librosa  # noqa: F401
    except ImportError:
        logger.warning("librosa not available, skipping advanced audio features")
        return result

    # Extract audio to WAV for librosa
    wav_path = None
    try:
        wav_path = _extract_audio_to_wav(path)

        for _, attr, analyzer, label in requested:
            try:
                # Assign only on success so a failed analysis leaves the
                # attribute at whatever AudioFeatures initialised it to
                setattr(result, attr, analyzer(wav_path))
            except Exception as e:
                logger.warning(f"{label} failed: {e}")

    finally:
        # Clean up temporary WAV file
        if wav_path and wav_path.exists():
            wav_path.unlink()

    return result