WIP essentia

This commit is contained in:
2025-12-22 12:59:20 +01:00
parent 83de840672
commit dec30019e2
9 changed files with 106 additions and 29 deletions

View File

@@ -0,0 +1,15 @@
{
"permissions": {
"allow": [
"Bash(node --version:*)",
"Bash(docker --version:*)",
"Bash(docker-compose:*)",
"Bash(test:*)",
"Bash(cp:*)",
"Bash(bash scripts/download-essentia-models.sh:*)",
"Bash(curl:*)",
"Bash(docker logs:*)",
"Bash(docker exec:*)"
]
}
}

View File

@@ -95,6 +95,10 @@ curl -X POST http://localhost:8001/api/analyze/folder \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{"path": "/audio/music", "recursive": true}' -d '{"path": "/audio/music", "recursive": true}'
``` ```
#### Sous Windows 10
````bash
curl.exe -X POST http://localhost:8001/api/analyze/folder -H "Content-Type: application/json" -d '{\"path\": \"/audio/\", \"recursive\": true}'
````
### Rechercher des pistes ### Rechercher des pistes

View File

@@ -39,8 +39,8 @@ COPY requirements.txt .
RUN pip install --no-cache-dir numpy==1.24.3 RUN pip install --no-cache-dir numpy==1.24.3
RUN pip install --no-cache-dir scipy==1.11.4 RUN pip install --no-cache-dir scipy==1.11.4
# Install Essentia - Python 3.9 with ARM64 support # Install Essentia-TensorFlow - Python 3.9 with AMD64 support
RUN pip install --no-cache-dir essentia RUN pip install --no-cache-dir essentia-tensorflow
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt

View File

@@ -14,7 +14,8 @@ try:
from essentia.standard import ( from essentia.standard import (
MonoLoader, MonoLoader,
TensorflowPredictEffnetDiscogs, TensorflowPredictEffnetDiscogs,
TensorflowPredict2D TensorflowPredict2D,
TensorflowPredictMusiCNN
) )
ESSENTIA_AVAILABLE = True ESSENTIA_AVAILABLE = True
except ImportError: except ImportError:
@@ -55,7 +56,17 @@ class EssentiaClassifier:
logger.warning(f"Models path {self.models_path} does not exist") logger.warning(f"Models path {self.models_path} does not exist")
return return
# Model file names # Check for embedding model first
embedding_file = "discogs-effnet-bs64-1.pb"
embedding_path = self.models_path / embedding_file
if embedding_path.exists():
logger.info(f"Loading embedding model from {embedding_path}")
self.models["embedding"] = str(embedding_path)
else:
logger.warning(f"Embedding model not found: {embedding_path}")
return # Cannot proceed without embeddings
# Model file names for classification heads
model_files = { model_files = {
"genre": "mtg_jamendo_genre-discogs-effnet-1.pb", "genre": "mtg_jamendo_genre-discogs-effnet-1.pb",
"mood": "mtg_jamendo_moodtheme-discogs-effnet-1.pb", "mood": "mtg_jamendo_moodtheme-discogs-effnet-1.pb",
@@ -135,15 +146,26 @@ class EssentiaClassifier:
return self._fallback_genre() return self._fallback_genre()
try: try:
# Load audio # Step 1: Load audio, then extract embeddings using discogs-effnet
audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)() audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
# Predict embedding_model = TensorflowPredictEffnetDiscogs(
model = TensorflowPredictEffnetDiscogs( graphFilename=self.models["embedding"],
graphFilename=self.models["genre"],
output="PartitionedCall:1" output="PartitionedCall:1"
) )
predictions = model(audio) embeddings = embedding_model(audio)
# Average embeddings over time
embeddings_mean = np.mean(embeddings, axis=0)
# Step 2: Feed embeddings to classification head
classifier = TensorflowPredict2D(
graphFilename=self.models["genre"],
input="model/Placeholder",
output="model/Sigmoid"
)
predictions = classifier(embeddings_mean.reshape(1, -1))
predictions = predictions[0] # Remove batch dimension
# Get top predictions # Get top predictions
top_indices = np.argsort(predictions)[::-1][:5] top_indices = np.argsort(predictions)[::-1][:5]
@@ -172,19 +194,28 @@ class EssentiaClassifier:
Returns: Returns:
Dictionary with mood predictions Dictionary with mood predictions
""" """
if not ESSENTIA_AVAILABLE or "mood" not in self.models: if not ESSENTIA_AVAILABLE or "mood" not in self.models or "embedding" not in self.models:
return self._fallback_mood() return self._fallback_mood()
try: try:
# Load audio # Step 1: Load audio, then extract embeddings using discogs-effnet
audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)() audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
# Predict embedding_model = TensorflowPredictEffnetDiscogs(
model = TensorflowPredictEffnetDiscogs( graphFilename=self.models["embedding"],
graphFilename=self.models["mood"],
output="PartitionedCall:1" output="PartitionedCall:1"
) )
predictions = model(audio) embeddings = embedding_model(audio)
embeddings_mean = np.mean(embeddings, axis=0)
# Step 2: Feed embeddings to classification head
classifier = TensorflowPredict2D(
graphFilename=self.models["mood"],
input="model/Placeholder",
output="model/Sigmoid"
)
predictions = classifier(embeddings_mean.reshape(1, -1))
predictions = predictions[0]
# Get top predictions # Get top predictions
top_indices = np.argsort(predictions)[::-1][:5] top_indices = np.argsort(predictions)[::-1][:5]
@@ -216,19 +247,28 @@ class EssentiaClassifier:
Returns: Returns:
List of instruments with confidence scores List of instruments with confidence scores
""" """
if not ESSENTIA_AVAILABLE or "instrument" not in self.models: if not ESSENTIA_AVAILABLE or "instrument" not in self.models or "embedding" not in self.models:
return self._fallback_instruments() return self._fallback_instruments()
try: try:
# Load audio # Step 1: Load audio, then extract embeddings using discogs-effnet
audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)() audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
# Predict embedding_model = TensorflowPredictEffnetDiscogs(
model = TensorflowPredictEffnetDiscogs( graphFilename=self.models["embedding"],
graphFilename=self.models["instrument"],
output="PartitionedCall:1" output="PartitionedCall:1"
) )
predictions = model(audio) embeddings = embedding_model(audio)
embeddings_mean = np.mean(embeddings, axis=0)
# Step 2: Feed embeddings to classification head
classifier = TensorflowPredict2D(
graphFilename=self.models["instrument"],
input="model/Placeholder",
output="model/Sigmoid"
)
predictions = classifier(embeddings_mean.reshape(1, -1))
predictions = predictions[0]
# Get instruments above threshold # Get instruments above threshold
threshold = 0.1 threshold = 0.1

View File

@@ -40,10 +40,15 @@ services:
restart: unless-stopped restart: unless-stopped
frontend: frontend:
build: ./frontend build:
context: ./frontend
args:
NEXT_PUBLIC_API_URL: http://localhost:8001
container_name: audio_classifier_ui container_name: audio_classifier_ui
environment: environment:
NEXT_PUBLIC_API_URL: http://backend:8000 # Use localhost:8001 because the browser (client-side) needs to access the API
# The backend is mapped to port 8001 on the host machine
NEXT_PUBLIC_API_URL: http://localhost:8001
ports: ports:
- "3000:3000" - "3000:3000"
depends_on: depends_on:

View File

@@ -1,7 +1,6 @@
node_modules node_modules
.next .next
.git .git
.env.local
npm-debug.log* npm-debug.log*
yarn-debug.log* yarn-debug.log*
yarn-error.log* yarn-error.log*

1
frontend/.env.local Normal file
View File

@@ -0,0 +1 @@
NEXT_PUBLIC_API_URL=http://localhost:8001

View File

@@ -12,6 +12,10 @@ RUN npm ci
# Copy application code # Copy application code
COPY . . COPY . .
# Build argument for API URL
ARG NEXT_PUBLIC_API_URL=http://localhost:8001
ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
# Build the application # Build the application
RUN npm run build RUN npm run build

View File

@@ -6,7 +6,8 @@
set -e # Exit on error set -e # Exit on error
MODELS_DIR="backend/models" MODELS_DIR="backend/models"
BASE_URL="https://essentia.upf.edu/models/classification-heads" CLASS_HEADS_URL="https://essentia.upf.edu/models/classification-heads"
EMBEDDINGS_URL="https://essentia.upf.edu/models/feature-extractors/discogs-effnet"
echo "📦 Downloading Essentia models..." echo "📦 Downloading Essentia models..."
echo "Models directory: $MODELS_DIR" echo "Models directory: $MODELS_DIR"
@@ -37,15 +38,23 @@ download_model() {
fi fi
} }
# Download each model # Download embedding model first (required for all classification heads)
echo ""
echo "Downloading embedding model..."
download_model "discogs-effnet-bs64-1.pb" \
"$EMBEDDINGS_URL/discogs-effnet-bs64-1.pb"
# Download classification heads
echo ""
echo "Downloading classification heads..."
download_model "mtg_jamendo_genre-discogs-effnet-1.pb" \ download_model "mtg_jamendo_genre-discogs-effnet-1.pb" \
"$BASE_URL/mtg_jamendo_genre/mtg_jamendo_genre-discogs-effnet-1.pb" "$CLASS_HEADS_URL/mtg_jamendo_genre/mtg_jamendo_genre-discogs-effnet-1.pb"
download_model "mtg_jamendo_moodtheme-discogs-effnet-1.pb" \ download_model "mtg_jamendo_moodtheme-discogs-effnet-1.pb" \
"$BASE_URL/mtg_jamendo_moodtheme/mtg_jamendo_moodtheme-discogs-effnet-1.pb" "$CLASS_HEADS_URL/mtg_jamendo_moodtheme/mtg_jamendo_moodtheme-discogs-effnet-1.pb"
download_model "mtg_jamendo_instrument-discogs-effnet-1.pb" \ download_model "mtg_jamendo_instrument-discogs-effnet-1.pb" \
"$BASE_URL/mtg_jamendo_instrument/mtg_jamendo_instrument-discogs-effnet-1.pb" "$CLASS_HEADS_URL/mtg_jamendo_instrument/mtg_jamendo_instrument-discogs-effnet-1.pb"
echo "" echo ""
echo "✅ All models downloaded successfully!" echo "✅ All models downloaded successfully!"