Audio-Classifier/backend/src/core/audio_processor.py

"""Audio feature extraction using librosa."""
import librosa
import numpy as np
from typing import Dict, Tuple, Optional
import warnings

from ..utils.logging import get_logger

logger = get_logger(__name__)

# Suppress librosa warnings
warnings.filterwarnings('ignore', category=UserWarning, module='librosa')


def load_audio(filepath: str, sr: int = 22050) -> Tuple[np.ndarray, int]:
    """Load audio file.

    Args:
        filepath: Path to audio file
        sr: Target sample rate (default: 22050 Hz)

    Returns:
        Tuple of (audio time series, sample rate)
    """
    try:
        y, sr = librosa.load(filepath, sr=sr, mono=True)
        return y, sr
    except Exception as e:
        logger.error(f"Failed to load audio file {filepath}: {e}")
        raise


def extract_tempo(y: np.ndarray, sr: int) -> float:
    """Extract tempo (BPM) from audio.

    Args:
        y: Audio time series
        sr: Sample rate

    Returns:
        Tempo in BPM
    """
    try:
        # Use onset_envelope for better beat tracking
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
        return float(tempo)
    except Exception as e:
        logger.warning(f"Failed to extract tempo: {e}")
        return 0.0


def extract_key(y: np.ndarray, sr: int) -> str:
    """Extract musical key from audio.

    Args:
        y: Audio time series
        sr: Sample rate

    Returns:
        Key as string (e.g., "C major", "D minor")
    """
    try:
        # Extract chroma features
        chromagram = librosa.feature.chroma_cqt(y=y, sr=sr)

        # Average chroma across time
        chroma_mean = np.mean(chromagram, axis=1)

        # Find dominant pitch class
        key_idx = np.argmax(chroma_mean)

        # Map to note names
        notes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

        # Simple major/minor detection (can be improved)
        # Check if minor third is prominent
        minor_third_idx = (key_idx + 3) % 12
        is_minor = chroma_mean[minor_third_idx] > chroma_mean.mean()

        mode = "minor" if is_minor else "major"
        return f"{notes[key_idx]} {mode}"

    except Exception as e:
        logger.warning(f"Failed to extract key: {e}")
        return "unknown"


def extract_spectral_features(y: np.ndarray, sr: int) -> Dict[str, float]:
    """Extract spectral features.

    Args:
        y: Audio time series
        sr: Sample rate

    Returns:
        Dictionary with spectral features
    """
    try:
        # Spectral centroid
        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        spectral_centroid_mean = float(np.mean(spectral_centroids))

        # Zero crossing rate
        zcr = librosa.feature.zero_crossing_rate(y)[0]
        zcr_mean = float(np.mean(zcr))

        # Spectral rolloff
        spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
        spectral_rolloff_mean = float(np.mean(spectral_rolloff))

        # Spectral bandwidth
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
        spectral_bandwidth_mean = float(np.mean(spectral_bandwidth))

        return {
            "spectral_centroid": spectral_centroid_mean,
            "zero_crossing_rate": zcr_mean,
            "spectral_rolloff": spectral_rolloff_mean,
            "spectral_bandwidth": spectral_bandwidth_mean,
        }

    except Exception as e:
        logger.warning(f"Failed to extract spectral features: {e}")
        return {
            "spectral_centroid": 0.0,
            "zero_crossing_rate": 0.0,
            "spectral_rolloff": 0.0,
            "spectral_bandwidth": 0.0,
        }


def extract_energy(y: np.ndarray, sr: int) -> float:
    """Extract RMS energy.

    Args:
        y: Audio time series
        sr: Sample rate

    Returns:
        Normalized energy value (0-1)
    """
    try:
        rms = librosa.feature.rms(y=y)[0]
        energy = float(np.mean(rms))
        # Normalize to 0-1 range (approximate)
        return min(energy * 10, 1.0)
    except Exception as e:
        logger.warning(f"Failed to extract energy: {e}")
        return 0.0


def estimate_danceability(y: np.ndarray, sr: int, tempo: float) -> float:
    """Estimate danceability based on rhythm and tempo.

    Args:
        y: Audio time series
        sr: Sample rate
        tempo: BPM

    Returns:
        Danceability score (0-1)
    """
    try:
        # Danceability is correlated with:
        # 1. Strong beat regularity
        # 2. Tempo in danceable range (90-150 BPM)
        # 3. Percussive content

        # Get onset strength
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)

        # Calculate beat regularity (autocorrelation of onset strength)
        ac = librosa.autocorrelate(onset_env, max_size=sr // 512)
        ac_peak = float(np.max(ac[1:]) / (ac[0] + 1e-8))  # Normalize by first value

        # Tempo factor (optimal around 90-150 BPM)
        if 90 <= tempo <= 150:
            tempo_factor = 1.0
        elif 70 <= tempo < 90 or 150 < tempo <= 180:
            tempo_factor = 0.7
        else:
            tempo_factor = 0.4

        # Combine factors
        danceability = min(ac_peak * tempo_factor, 1.0)
        return float(danceability)

    except Exception as e:
        logger.warning(f"Failed to estimate danceability: {e}")
        return 0.0


def estimate_valence(y: np.ndarray, sr: int) -> float:
    """Estimate valence (positivity) based on audio features.

    Args:
        y: Audio time series
        sr: Sample rate

    Returns:
        Valence score (0-1), where 1 is positive/happy
    """
    try:
        # Valence is correlated with:
        # 1. Major key vs minor key
        # 2. Higher tempo
        # 3. Brighter timbre (higher spectral centroid)

        # Get chroma for major/minor detection
        chromagram = librosa.feature.chroma_cqt(y=y, sr=sr)
        chroma_mean = np.mean(chromagram, axis=1)

        # Get spectral centroid (brightness)
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        brightness = float(np.mean(spectral_centroid) / (sr / 2))  # Normalize

        # Simple heuristic: combine brightness with mode
        # Higher spectral centroid = more positive
        valence = min(brightness * 1.5, 1.0)

        return float(valence)

    except Exception as e:
        logger.warning(f"Failed to estimate valence: {e}")
        return 0.5  # Neutral


def estimate_loudness(y: np.ndarray, sr: int) -> float:
    """Estimate loudness in LUFS (approximate).

    Args:
        y: Audio time series
        sr: Sample rate

    Returns:
        Approximate loudness in LUFS
    """
    try:
        # This is a simplified estimation
        # True LUFS requires ITU-R BS.1770 weighting
        rms = np.sqrt(np.mean(y**2))

        # Convert to dB
        db = 20 * np.log10(rms + 1e-10)

        # Approximate LUFS (very rough estimate)
        lufs = db + 0.691  # Offset to approximate LUFS

        return float(lufs)

    except Exception as e:
        logger.warning(f"Failed to estimate loudness: {e}")
        return -14.0  # Default target loudness


def extract_time_signature(y: np.ndarray, sr: int) -> str:
    """Estimate time signature.

    Args:
        y: Audio time series
        sr: Sample rate

    Returns:
        Time signature as string (e.g., "4/4", "3/4")

    Note:
        This is a simplified estimation. Accurate time signature detection
        is complex and often requires machine learning models.
    """
    try:
        # Get tempo and beat frames
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)

        # Analyze beat intervals
        if len(beats) < 4:
            return "4/4"  # Default

        beat_times = librosa.frames_to_time(beats, sr=sr)
        intervals = np.diff(beat_times)

        # Look for patterns (very simplified)
        # This is placeholder logic - real implementation would be much more complex
        return "4/4"  # Default to 4/4 for now

    except Exception as e:
        logger.warning(f"Failed to extract time signature: {e}")
        return "4/4"


def extract_all_features(filepath: str) -> Dict:
    """Extract all audio features from a file.

    Args:
        filepath: Path to audio file

    Returns:
        Dictionary with all extracted features
    """
    logger.info(f"Extracting features from: {filepath}")

    try:
        # Load audio
        y, sr = load_audio(filepath)

        # Get duration
        duration = float(librosa.get_duration(y=y, sr=sr))

        # Extract tempo first (used by other features)
        tempo = extract_tempo(y, sr)

        # Extract all features
        key = extract_key(y, sr)
        spectral_features = extract_spectral_features(y, sr)
        energy = extract_energy(y, sr)
        danceability = estimate_danceability(y, sr, tempo)
        valence = estimate_valence(y, sr)
        loudness = estimate_loudness(y, sr)
        time_signature = extract_time_signature(y, sr)

        features = {
            "duration_seconds": duration,
            "tempo_bpm": tempo,
            "key": key,
            "time_signature": time_signature,
            "energy": energy,
            "danceability": danceability,
            "valence": valence,
            "loudness_lufs": loudness,
            "spectral_centroid": spectral_features["spectral_centroid"],
            "zero_crossing_rate": spectral_features["zero_crossing_rate"],
            "spectral_rolloff": spectral_features["spectral_rolloff"],
            "spectral_bandwidth": spectral_features["spectral_bandwidth"],
        }

        logger.info(f"Successfully extracted features: tempo={tempo:.1f} BPM, key={key}")
        return features

    except Exception as e:
        logger.error(f"Failed to extract features from {filepath}: {e}")
        raise