# Audio-Classifier/backend/src/core/file_scanner.py

"""File scanning and metadata extraction."""
import os
from pathlib import Path
from typing import List, Dict, Optional
from mutagen import File as MutagenFile

from ..utils.logging import get_logger
from ..utils.validators import get_audio_files, is_audio_file

logger = get_logger(__name__)


def scan_folder(path: str, recursive: bool = True) -> List[str]:
    """Collect the audio files located under a directory.

    The lookup itself is delegated to ``get_audio_files``; this function
    adds logging and converts any failure into an empty result.

    Args:
        path: Directory path to scan
        recursive: If True, scan subdirectories recursively

    Returns:
        The list produced by ``get_audio_files``, or an empty list when the
        scan raised an exception (the error is logged, not propagated).
    """
    logger.info(f"Scanning folder: {path} (recursive={recursive})")

    try:
        found = get_audio_files(path, recursive=recursive)
        logger.info(f"Found {len(found)} audio files")
    except Exception as exc:
        # Best-effort: callers get an empty list instead of an exception.
        logger.error(f"Failed to scan folder {path}: {exc}")
        return []

    return found


def get_file_metadata(filepath: str) -> Dict:
    """Get file metadata including ID3 tags.

    Basic file information is always collected; tag and duration data are
    read with mutagen on a best-effort basis and simply omitted when they
    cannot be read.

    Args:
        filepath: Path to audio file

    Returns:
        Dictionary with the keys ``filename``, ``file_size_bytes``,
        ``format`` and ``filepath``. When available it also contains
        ``id3_tags`` (a dict of string values for whichever of
        title/artist/album/genre/date are present) and
        ``duration_seconds``. If the basic file information cannot be read,
        a fallback dictionary with size 0 and format ``"unknown"`` is
        returned instead of raising.
    """
    try:
        file_path = Path(filepath)

        # Basic file info. stat() raises for a missing/unreadable file, which
        # sends us to the fallback dictionary in the outer handler.
        metadata = {
            "filename": file_path.name,
            "file_size_bytes": file_path.stat().st_size,
            "format": file_path.suffix.lstrip('.').lower(),
            "filepath": str(file_path.resolve()),
        }

        # Try to get ID3 tags; any failure here must not discard the basic
        # info gathered above.
        try:
            audio_file = MutagenFile(filepath, easy=True)
            if audio_file is not None:
                # Extract common tags
                tags = {}
                if hasattr(audio_file, 'tags') and audio_file.tags:
                    for key in ['title', 'artist', 'album', 'genre', 'date']:
                        if key not in audio_file.tags:
                            continue
                        value = audio_file.tags[key]
                        if isinstance(value, list):
                            # An empty list carries no value; skipping it
                            # avoids an IndexError that would abort the
                            # remaining tags and the duration lookup.
                            if not value:
                                continue
                            value = value[0]
                        # Always store plain strings: some tag formats hold
                        # attribute objects rather than str in their lists.
                        tags[key] = str(value)

                if tags:
                    metadata["id3_tags"] = tags

                # Get duration from mutagen if available; a missing or None
                # length is skipped rather than raising inside float().
                info = getattr(audio_file, 'info', None)
                length = getattr(info, 'length', None)
                if length is not None:
                    metadata["duration_seconds"] = float(length)

        except Exception as e:
            logger.debug(f"Could not read tags from {filepath}: {e}")

        return metadata

    except Exception as e:
        logger.error(f"Failed to get metadata for {filepath}: {e}")
        return {
            "filename": Path(filepath).name,
            "file_size_bytes": 0,
            "format": "unknown",
            "filepath": filepath,
        }


def validate_audio_files(filepaths: List[str]) -> List[str]:
    """Filter a list of paths down to existing, supported audio files.

    Each rejected path is reported with a warning; input order is kept for
    the accepted ones.

    Args:
        filepaths: List of file paths to validate

    Returns:
        List of valid audio file paths
    """
    accepted: List[str] = []

    for candidate in filepaths:
        if Path(candidate).exists():
            if is_audio_file(candidate):
                accepted.append(candidate)
            else:
                logger.warning(f"Not a supported audio file: {candidate}")
        else:
            logger.warning(f"File does not exist: {candidate}")

    return accepted