"""Audio feature extraction using librosa.""" import librosa import numpy as np from typing import Dict, Tuple, Optional import warnings from ..utils.logging import get_logger logger = get_logger(__name__) # Suppress librosa warnings warnings.filterwarnings('ignore', category=UserWarning, module='librosa') def load_audio(filepath: str, sr: int = 22050) -> Tuple[np.ndarray, int]: """Load audio file. Args: filepath: Path to audio file sr: Target sample rate (default: 22050 Hz) Returns: Tuple of (audio time series, sample rate) """ try: y, sr = librosa.load(filepath, sr=sr, mono=True) return y, sr except Exception as e: logger.error(f"Failed to load audio file {filepath}: {e}") raise def extract_tempo(y: np.ndarray, sr: int) -> float: """Extract tempo (BPM) from audio. Args: y: Audio time series sr: Sample rate Returns: Tempo in BPM """ try: # Use onset_envelope for better beat tracking onset_env = librosa.onset.onset_strength(y=y, sr=sr) tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr) return float(tempo) except Exception as e: logger.warning(f"Failed to extract tempo: {e}") return 0.0 def extract_key(y: np.ndarray, sr: int) -> str: """Extract musical key from audio. Args: y: Audio time series sr: Sample rate Returns: Key as string (e.g., "C major", "D minor") """ try: # Extract chroma features chromagram = librosa.feature.chroma_cqt(y=y, sr=sr) # Average chroma across time chroma_mean = np.mean(chromagram, axis=1) # Find dominant pitch class key_idx = np.argmax(chroma_mean) # Map to note names notes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'] # Simple major/minor detection (can be improved) # Check if minor third is prominent minor_third_idx = (key_idx + 3) % 12 is_minor = chroma_mean[minor_third_idx] > chroma_mean.mean() mode = "minor" if is_minor else "major" return f"{notes[key_idx]} {mode}" except Exception as e: logger.warning(f"Failed to extract key: {e}") return "unknown" def extract_spectral_features(y: np.ndarray, sr: int) -> Dict[str, float]: """Extract spectral features. Args: y: Audio time series sr: Sample rate Returns: Dictionary with spectral features """ try: # Spectral centroid spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0] spectral_centroid_mean = float(np.mean(spectral_centroids)) # Zero crossing rate zcr = librosa.feature.zero_crossing_rate(y)[0] zcr_mean = float(np.mean(zcr)) # Spectral rolloff spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0] spectral_rolloff_mean = float(np.mean(spectral_rolloff)) # Spectral bandwidth spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0] spectral_bandwidth_mean = float(np.mean(spectral_bandwidth)) return { "spectral_centroid": spectral_centroid_mean, "zero_crossing_rate": zcr_mean, "spectral_rolloff": spectral_rolloff_mean, "spectral_bandwidth": spectral_bandwidth_mean, } except Exception as e: logger.warning(f"Failed to extract spectral features: {e}") return { "spectral_centroid": 0.0, "zero_crossing_rate": 0.0, "spectral_rolloff": 0.0, "spectral_bandwidth": 0.0, } def extract_energy(y: np.ndarray, sr: int) -> float: """Extract RMS energy. 


def extract_energy(y: np.ndarray, sr: int) -> float:
    """Extract RMS energy.

    Args:
        y: Audio time series
        sr: Sample rate (unused; kept for a consistent extractor signature)

    Returns:
        Normalized energy value (0-1)
    """
    try:
        rms = librosa.feature.rms(y=y)[0]
        energy = float(np.mean(rms))
        # Normalize to the 0-1 range (approximate scaling)
        return min(energy * 10, 1.0)
    except Exception as e:
        logger.warning(f"Failed to extract energy: {e}")
        return 0.0


def estimate_danceability(y: np.ndarray, sr: int, tempo: float) -> float:
    """Estimate danceability based on rhythm and tempo.

    Args:
        y: Audio time series
        sr: Sample rate
        tempo: BPM

    Returns:
        Danceability score (0-1)
    """
    try:
        # Danceability correlates with:
        # 1. Strong beat regularity
        # 2. Tempo in a danceable range (90-150 BPM)
        # 3. Percussive content

        # Get onset strength
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)

        # Beat regularity: autocorrelation of onset strength over roughly
        # one second of lags (onset_env has one frame per 512-sample hop)
        ac = librosa.autocorrelate(onset_env, max_size=sr // 512)
        ac_peak = float(np.max(ac[1:]) / (ac[0] + 1e-8))  # Normalize by lag-0 value

        # Tempo factor (optimal around 90-150 BPM)
        if 90 <= tempo <= 150:
            tempo_factor = 1.0
        elif 70 <= tempo < 90 or 150 < tempo <= 180:
            tempo_factor = 0.7
        else:
            tempo_factor = 0.4

        # Combine factors
        danceability = min(ac_peak * tempo_factor, 1.0)
        return float(danceability)
    except Exception as e:
        logger.warning(f"Failed to estimate danceability: {e}")
        return 0.0


def estimate_valence(y: np.ndarray, sr: int) -> float:
    """Estimate valence (positivity) based on audio features.

    Args:
        y: Audio time series
        sr: Sample rate

    Returns:
        Valence score (0-1), where 1 is positive/happy
    """
    try:
        # Valence correlates with:
        # 1. Major key vs. minor key
        # 2. Higher tempo
        # 3. Brighter timbre (higher spectral centroid)

        # Get spectral centroid (brightness), normalized by the Nyquist frequency
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        brightness = float(np.mean(spectral_centroid) / (sr / 2))

        # Simple heuristic: higher spectral centroid = more positive.
        # Mode (major/minor) is not yet folded into the score.
        valence = min(brightness * 1.5, 1.0)
        return float(valence)
    except Exception as e:
        logger.warning(f"Failed to estimate valence: {e}")
        return 0.5  # Neutral


def estimate_loudness(y: np.ndarray, sr: int) -> float:
    """Estimate loudness in LUFS (approximate).

    Args:
        y: Audio time series
        sr: Sample rate

    Returns:
        Approximate loudness in LUFS
    """
    try:
        # This is a simplified estimation; true LUFS requires the
        # ITU-R BS.1770 K-weighting filter and gating
        rms = np.sqrt(np.mean(y**2))

        # Convert to dB
        db = 20 * np.log10(rms + 1e-10)

        # Approximate LUFS (very rough estimate); BS.1770 applies a
        # -0.691 dB offset to the mean-square level
        lufs = db - 0.691

        return float(lufs)
    except Exception as e:
        logger.warning(f"Failed to estimate loudness: {e}")
        return -14.0  # Default target loudness
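
# For spec-compliant loudness, a BS.1770 meter such as pyloudnorm could
# replace the rough estimate above. A minimal sketch, assuming pyloudnorm
# is installed (it is not a dependency of this module):
#
#     import pyloudnorm as pyln
#
#     def measure_lufs(y: np.ndarray, sr: int) -> float:
#         meter = pyln.Meter(sr)  # K-weighting + gating per BS.1770
#         return float(meter.integrated_loudness(y))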
""" try: # Get tempo and beat frames onset_env = librosa.onset.onset_strength(y=y, sr=sr) tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr) # Analyze beat intervals if len(beats) < 4: return "4/4" # Default beat_times = librosa.frames_to_time(beats, sr=sr) intervals = np.diff(beat_times) # Look for patterns (very simplified) # This is placeholder logic - real implementation would be much more complex return "4/4" # Default to 4/4 for now except Exception as e: logger.warning(f"Failed to extract time signature: {e}") return "4/4" def extract_all_features(filepath: str) -> Dict: """Extract all audio features from a file. Args: filepath: Path to audio file Returns: Dictionary with all extracted features """ logger.info(f"Extracting features from: {filepath}") try: # Load audio y, sr = load_audio(filepath) # Get duration duration = float(librosa.get_duration(y=y, sr=sr)) # Extract tempo first (used by other features) tempo = extract_tempo(y, sr) # Extract all features key = extract_key(y, sr) spectral_features = extract_spectral_features(y, sr) energy = extract_energy(y, sr) danceability = estimate_danceability(y, sr, tempo) valence = estimate_valence(y, sr) loudness = estimate_loudness(y, sr) time_signature = extract_time_signature(y, sr) features = { "duration_seconds": duration, "tempo_bpm": tempo, "key": key, "time_signature": time_signature, "energy": energy, "danceability": danceability, "valence": valence, "loudness_lufs": loudness, "spectral_centroid": spectral_features["spectral_centroid"], "zero_crossing_rate": spectral_features["zero_crossing_rate"], "spectral_rolloff": spectral_features["spectral_rolloff"], "spectral_bandwidth": spectral_features["spectral_bandwidth"], } logger.info(f"Successfully extracted features: tempo={tempo:.1f} BPM, key={key}") return features except Exception as e: logger.error(f"Failed to extract features from {filepath}: {e}") raise