Files
Audio-Classifier/backend/src/cli/scanner.py
Benoit b74c6b0b40
All checks were successful
Build and Push Docker Images / Build Frontend Image (push) Successful in 57s
Fix scan infini: exclure dossiers transcoded et waveforms
Problème: Le scanner scannait TOUS les dossiers, y compris les dossiers
générés (transcoded/ et waveforms/), créant:
1. Boucle infinie: scan original → crée transcoded → re-scan transcoded
2. Segfaults: tentative de transcoder des fichiers déjà transcodés
3. Doublons en base de données

Solution:
- library.py: Exclut transcoded, waveforms, .transcoded, .waveforms
- scanner.py: Même exclusion dans le CLI

Technique: Modifie dirs[:] dans os.walk() pour skip ces dossiers.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-26 00:08:13 +01:00

224 lines
6.9 KiB
Python

#!/usr/bin/env python3
"""
Audio library scanner CLI tool.
Scans a directory for audio files and adds them to the database.
"""
import os
import sys
import argparse
from pathlib import Path
from typing import List
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from src.core.audio_processor import extract_all_features
from src.core.essentia_classifier import EssentiaClassifier
from src.core.transcoder import AudioTranscoder
from src.core.waveform_generator import save_waveform_to_file
from src.models.database import SessionLocal
from src.models.schema import AudioTrack
from src.utils.logging import get_logger
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)
# Supported audio formats: file suffixes (leading dot included, lower-case)
# accepted by find_audio_files(), which lower-cases a file's suffix before
# testing membership in this set.
AUDIO_EXTENSIONS = {'.mp3', '.wav', '.flac', '.m4a', '.aac', '.ogg', '.wma'}
def find_audio_files(directory: str) -> List[Path]:
    """Find all audio files in directory and subdirectories.

    Walks ``directory`` recursively and collects every file whose suffix,
    compared case-insensitively, is in ``AUDIO_EXTENSIONS``. Sub-directories
    named ``transcoded``, ``waveforms``, ``.transcoded`` or ``.waveforms``
    are pruned from the walk so their contents are never returned.

    Args:
        directory: Root directory to scan

    Returns:
        List of paths to audio files, in sorted traversal order. The list is
        empty when ``directory`` does not exist or is not a directory (an
        error is logged in both cases).
    """
    # Directory names that must never be descended into.
    excluded_dirs = {'transcoded', 'waveforms', '.transcoded', '.waveforms'}

    audio_files = []
    directory_path = Path(directory)

    if not directory_path.exists():
        logger.error(f"Directory does not exist: {directory}")
        return []

    if not directory_path.is_dir():
        # os.walk() yields nothing for a non-directory, which would otherwise
        # look like a successful scan that simply found no files.
        logger.error(f"Not a directory: {directory}")
        return []

    logger.info(f"Scanning directory: {directory}")

    for root, dirs, files in os.walk(directory_path):
        # Slice-assign so os.walk itself sees the pruned list and skips the
        # excluded directories; sorting makes the traversal deterministic.
        dirs[:] = sorted(d for d in dirs if d not in excluded_dirs)

        root_path = Path(root)
        for file in sorted(files):
            file_path = root_path / file
            if file_path.suffix.lower() in AUDIO_EXTENSIONS:
                audio_files.append(file_path)

    logger.info(f"Found {len(audio_files)} audio files")
    return audio_files
def analyze_and_store(file_path: Path, classifier: EssentiaClassifier, transcoder: AudioTranscoder, db) -> bool:
    """Analyze an audio file and store it in the database.

    Steps, in order:
      1. Skip the file if an ``AudioTrack`` with the same ``filepath``
         already exists.
      2. Extract audio features and run genre, mood and instrument
         prediction.
      3. Transcode the file to a 128 kbps MP3 for streaming.
      4. Write a waveform JSON file into a ``waveforms`` directory next to
         the source file.
      5. Insert and commit a new ``AudioTrack`` row.

    Any exception raised along the way is logged, the session is rolled
    back, and ``False`` is returned; the exception is not propagated.

    Args:
        file_path: Path to audio file
        classifier: Essentia classifier instance
        transcoder: Audio transcoder instance
        db: Database session

    Returns:
        True if the track was stored or was already present, False otherwise
    """
    try:
        logger.info(f"Processing: {file_path}")

        # Check if already in database
        existing = db.query(AudioTrack).filter(AudioTrack.filepath == str(file_path)).first()
        if existing:
            logger.info(f"Already in database, skipping: {file_path}")
            return True

        # Extract basic features
        features = extract_all_features(str(file_path))

        # Genre, mood and instrument predictions
        genre_result = classifier.predict_genre(str(file_path))
        mood_result = classifier.predict_mood(str(file_path))
        instruments = classifier.predict_instruments(str(file_path))

        # Transcode to MP3 128kbps for streaming
        logger.info(" → Transcoding to MP3 128kbps for streaming...")
        stream_path = transcoder.transcode_to_mp3(
            str(file_path),
            bitrate="128k",
            overwrite=False
        )

        # Pre-compute waveform into a sibling "waveforms" directory
        logger.info(" → Generating waveform...")
        waveform_dir = file_path.parent / "waveforms"
        waveform_dir.mkdir(parents=True, exist_ok=True)
        waveform_path = waveform_dir / f"{file_path.stem}.waveform.json"
        waveform_success = save_waveform_to_file(
            str(file_path),
            str(waveform_path),
            num_peaks=800
        )

        # Create track record; the waveform path is stored only when the
        # waveform file was actually produced.
        track = AudioTrack(
            filepath=str(file_path),
            stream_filepath=stream_path,
            waveform_filepath=str(waveform_path) if waveform_success else None,
            filename=file_path.name,
            duration_seconds=features['duration_seconds'],
            tempo_bpm=features['tempo_bpm'],
            key=features['key'],
            time_signature=features['time_signature'],
            energy=features['energy'],
            danceability=features['danceability'],
            valence=features['valence'],
            loudness_lufs=features['loudness_lufs'],
            spectral_centroid=features['spectral_centroid'],
            zero_crossing_rate=features['zero_crossing_rate'],
            genre_primary=genre_result['primary'],
            genre_secondary=genre_result['secondary'],
            genre_confidence=genre_result['confidence'],
            mood_primary=mood_result['primary'],
            mood_secondary=mood_result['secondary'],
            mood_arousal=mood_result['arousal'],
            mood_valence=mood_result['valence'],
            instruments=[i['name'] for i in instruments[:5]],  # Top 5
        )
        db.add(track)
        db.commit()

        logger.info(f"✓ Added to database: {file_path.name}")
        logger.info(f" Genre: {genre_result['primary']}, Mood: {mood_result['primary']}, "
                    f"Tempo: {features['tempo_bpm']:.1f} BPM")
        logger.info(f" Stream: {stream_path}")
        # Report the waveform outcome explicitly; previously both branches of
        # this conditional were empty strings, so nothing was reported.
        logger.info(f" Waveform: {'✓' if waveform_success else '✗'}")
        return True
    except Exception as e:
        # Per-file boundary: one bad file must not abort the whole scan.
        logger.error(f"Failed to process {file_path}: {e}")
        db.rollback()
        return False
def main():
    """Main scanner function.

    Parses the command line, collects audio files under the given directory,
    then analyzes and stores each one using a single database session. A
    summary with total, success and error counts is logged at the end.

    Returns early (without processing anything) when no audio files are
    found or when the transcoder reports that FFmpeg is unavailable.
    """
    parser = argparse.ArgumentParser(
        description='Scan audio library and add tracks to database'
    )
    parser.add_argument(
        'directory',
        help='Directory to scan for audio files'
    )
    parser.add_argument(
        '--workers',
        type=int,
        default=1,
        help='Number of parallel workers (default: 1)'
    )
    args = parser.parse_args()

    # The option is accepted for compatibility but files are processed one
    # at a time below; tell the user instead of silently ignoring it.
    if args.workers != 1:
        logger.warning(
            f"--workers={args.workers} was requested, but files are processed "
            "sequentially; the option currently has no effect."
        )

    # Find audio files
    audio_files = find_audio_files(args.directory)
    if not audio_files:
        logger.warning("No audio files found!")
        return

    # Initialize classifier
    logger.info("Initializing Essentia classifier...")
    classifier = EssentiaClassifier()

    # Initialize transcoder
    logger.info("Initializing audio transcoder...")
    transcoder = AudioTranscoder()

    # Check FFmpeg availability
    if not transcoder.check_ffmpeg_available():
        logger.error("FFmpeg is required for transcoding. Please install FFmpeg and try again.")
        return

    # Process files with one shared session, always closed afterwards
    db = SessionLocal()
    success_count = 0
    error_count = 0
    total = len(audio_files)
    try:
        for i, file_path in enumerate(audio_files, 1):
            logger.info(f"[{i}/{total}] Processing...")
            if analyze_and_store(file_path, classifier, transcoder, db):
                success_count += 1
            else:
                error_count += 1
    finally:
        db.close()

    # Summary
    logger.info("")
    logger.info("=" * 60)
    logger.info("Scan complete!")
    logger.info(f" Total files: {total}")
    logger.info(f" Successfully processed: {success_count}")
    logger.info(f" Errors: {error_count}")
    logger.info("=" * 60)
# Script entry point: run the scanner only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()