Fix beaucoup de choses : Genre OK, affichage des infos sur le front

WIP essentia
2025-12-22 13:26:55 +01:00 · 2025-12-22 12:59:20 +01:00
10 changed files with 166 additions and 36 deletions
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -0,0 +1,15 @@
 {
  "permissions": {
    "allow": [
      "Bash(node --version:*)",
      "Bash(docker --version:*)",
      "Bash(docker-compose:*)",
      "Bash(test:*)",
      "Bash(cp:*)",
      "Bash(bash scripts/download-essentia-models.sh:*)",
      "Bash(curl:*)",
      "Bash(docker logs:*)",
      "Bash(docker exec:*)"
    ]
  }
 }
--- a/README.md
+++ b/README.md
@@ -95,6 +95,10 @@ curl -X POST http://localhost:8001/api/analyze/folder \
  -H "Content-Type: application/json" \
  -d '{"path": "/audio/music", "recursive": true}'
 ```
 #### Sous Windows 10
 ````bash
 curl.exe -X POST http://localhost:8001/api/analyze/folder -H "Content-Type: application/json" -d '{\"path\": \"/audio/\", \"recursive\": true}'
 ````
 ### Rechercher des pistes
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -39,8 +39,8 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir numpy==1.24.3
 RUN pip install --no-cache-dir scipy==1.11.4
-# Install Essentia - Python 3.9 with ARM64 support
+# Install Essentia-TensorFlow - Python 3.9 AMD64 support
-RUN pip install --no-cache-dir essentia
+RUN pip install --no-cache-dir essentia-tensorflow
 RUN pip install --no-cache-dir -r requirements.txt
--- a/backend/src/core/essentia_classifier.py
+++ b/backend/src/core/essentia_classifier.py
@@ -14,7 +14,8 @@ try:
    from essentia.standard import (
        MonoLoader,
        TensorflowPredictEffnetDiscogs,
-        TensorflowPredict2D
+        TensorflowPredict2D,
        TensorflowPredictMusiCNN
    )
    ESSENTIA_AVAILABLE = True
 except ImportError:
@@ -55,7 +56,17 @@ class EssentiaClassifier:
            logger.warning(f"Models path {self.models_path} does not exist")
            return
-        # Model file names
+        # Check for embedding model first
        embedding_file = "discogs-effnet-bs64-1.pb"
        embedding_path = self.models_path / embedding_file
        if embedding_path.exists():
            logger.info(f"Loading embedding model from {embedding_path}")
            self.models["embedding"] = str(embedding_path)
        else:
            logger.warning(f"Embedding model not found: {embedding_path}")
            return  # Cannot proceed without embeddings
        # Model file names for classification heads
        model_files = {
            "genre": "mtg_jamendo_genre-discogs-effnet-1.pb",
            "mood": "mtg_jamendo_moodtheme-discogs-effnet-1.pb",
@@ -135,23 +146,47 @@ class EssentiaClassifier:
            return self._fallback_genre()
        try:
-            # Load audio
+            # Step 1: Extract embeddings using discogs-effnet
            audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
-            # Predict
+            embedding_model = TensorflowPredictEffnetDiscogs(
-            model = TensorflowPredictEffnetDiscogs(
+                graphFilename=self.models["embedding"],
                graphFilename=self.models["genre"],
                output="PartitionedCall:1"
            )
-            predictions = model(audio)
+            embeddings = embedding_model(audio)
            # Average embeddings over time
            embeddings_mean = np.mean(embeddings, axis=0)
            # Step 2: Feed embeddings to classification head
            classifier = TensorflowPredict2D(
                graphFilename=self.models["genre"],
                input="model/Placeholder",
                output="model/Sigmoid"
            )
            predictions = classifier(embeddings_mean.reshape(1, -1))
            predictions = predictions[0]  # Remove batch dimension
            # Get top predictions
            top_indices = np.argsort(predictions)[::-1][:5]
            labels = self.class_labels.get("genre", [])
            logger.info(f"Genre predictions shape: {predictions.shape}, num_labels: {len(labels)}")
-            primary = labels[top_indices[0]] if labels else "unknown"
+            # Ensure we don't go out of bounds
-            secondary = [labels[i] for i in top_indices[1:4]] if labels else []
+            if len(predictions) == 0:
-            confidence = float(predictions[top_indices[0]])
+                logger.warning("No predictions returned from genre model")
                return self._fallback_genre()
            top_indices = np.argsort(predictions)[::-1][:5]
            # Only use indices that are within the labels range
            valid_top_indices = [i for i in top_indices if i < len(labels)]
            if not valid_top_indices:
                logger.warning(f"No valid indices found. Predictions: {len(predictions)}, Labels: {len(labels)}")
                return self._fallback_genre()
            primary = labels[valid_top_indices[0]]
            secondary = [labels[i] for i in valid_top_indices[1:4]]
            confidence = float(predictions[valid_top_indices[0]])
            return {
                "primary": primary,
@@ -172,26 +207,43 @@ class EssentiaClassifier:
        Returns:
            Dictionary with mood predictions
        """
-        if not ESSENTIA_AVAILABLE or "mood" not in self.models:
+        if not ESSENTIA_AVAILABLE or "mood" not in self.models or "embedding" not in self.models:
            return self._fallback_mood()
        try:
-            # Load audio
+            # Step 1: Extract embeddings using discogs-effnet
            audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
-            # Predict
+            embedding_model = TensorflowPredictEffnetDiscogs(
-            model = TensorflowPredictEffnetDiscogs(
+                graphFilename=self.models["embedding"],
                graphFilename=self.models["mood"],
                output="PartitionedCall:1"
            )
-            predictions = model(audio)
+            embeddings = embedding_model(audio)
            embeddings_mean = np.mean(embeddings, axis=0)
            # Step 2: Feed embeddings to classification head
            classifier = TensorflowPredict2D(
                graphFilename=self.models["mood"],
                input="model/Placeholder",
                output="model/Sigmoid"
            )
            predictions = classifier(embeddings_mean.reshape(1, -1))
            predictions = predictions[0]
            # Get top predictions
            top_indices = np.argsort(predictions)[::-1][:5]
            labels = self.class_labels.get("mood", [])
-            primary = labels[top_indices[0]] if labels else "unknown"
+            if len(predictions) == 0:
-            secondary = [labels[i] for i in top_indices[1:3]] if labels else []
+                return self._fallback_mood()
            top_indices = np.argsort(predictions)[::-1][:5]
            valid_top_indices = [i for i in top_indices if i < len(labels)]
            if not valid_top_indices:
                return self._fallback_mood()
            primary = labels[valid_top_indices[0]] if valid_top_indices else "unknown"
            secondary = [labels[i] for i in valid_top_indices[1:3]] if len(valid_top_indices) > 1 else []
            # Estimate arousal and valence from mood labels (simplified)
            arousal, valence = self._estimate_arousal_valence(primary)
@@ -216,19 +268,28 @@ class EssentiaClassifier:
        Returns:
            List of instruments with confidence scores
        """
-        if not ESSENTIA_AVAILABLE or "instrument" not in self.models:
+        if not ESSENTIA_AVAILABLE or "instrument" not in self.models or "embedding" not in self.models:
            return self._fallback_instruments()
        try:
-            # Load audio
+            # Step 1: Extract embeddings using discogs-effnet
            audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
-            # Predict
+            embedding_model = TensorflowPredictEffnetDiscogs(
-            model = TensorflowPredictEffnetDiscogs(
+                graphFilename=self.models["embedding"],
                graphFilename=self.models["instrument"],
                output="PartitionedCall:1"
            )
-            predictions = model(audio)
+            embeddings = embedding_model(audio)
            embeddings_mean = np.mean(embeddings, axis=0)
            # Step 2: Feed embeddings to classification head
            classifier = TensorflowPredict2D(
                graphFilename=self.models["instrument"],
                input="model/Placeholder",
                output="model/Sigmoid"
            )
            predictions = classifier(embeddings_mean.reshape(1, -1))
            predictions = predictions[0]
            # Get instruments above threshold
            threshold = 0.1
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -40,10 +40,15 @@ services:
    restart: unless-stopped
  frontend:
-    build: ./frontend
+    build:
      context: ./frontend
      args:
        NEXT_PUBLIC_API_URL: http://localhost:8001
    container_name: audio_classifier_ui
    environment:
-      NEXT_PUBLIC_API_URL: http://backend:8000
+      # Use localhost:8001 because the browser (client-side) needs to access the API
      # The backend is mapped to port 8001 on the host machine
      NEXT_PUBLIC_API_URL: http://localhost:8001
    ports:
      - "3000:3000"
    depends_on:
--- a/frontend/.dockerignore
+++ b/frontend/.dockerignore
@@ -1,7 +1,6 @@
 node_modules
 .next
 .git
 .env.local
 npm-debug.log*
 yarn-debug.log*
 yarn-error.log*
--- a/frontend/.env.local
+++ b/frontend/.env.local
@@ -0,0 +1 @@
 NEXT_PUBLIC_API_URL=http://localhost:8001
--- a/frontend/Dockerfile
+++ b/frontend/Dockerfile
@@ -12,6 +12,10 @@ RUN npm ci
 # Copy application code
 COPY . .
 # Build argument for API URL
 ARG NEXT_PUBLIC_API_URL=http://localhost:8001
 ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
 # Build the application
 RUN npm run build
--- a/frontend/app/page.tsx
+++ b/frontend/app/page.tsx
@@ -76,6 +76,8 @@ export default function Home() {
                  <div className="flex justify-between items-start">
                    <div className="flex-1">
                      <h3 className="font-medium text-gray-900">{track.filename}</h3>
                      {/* Primary metadata */}
                      <div className="mt-1 flex flex-wrap gap-2">
                        <span className="inline-flex items-center px-2 py-1 rounded text-xs bg-blue-100 text-blue-800">
                          {track.classification.genre.primary}
@@ -86,10 +88,40 @@ export default function Home() {
                        <span className="text-xs text-gray-500">
                          {Math.round(track.features.tempo_bpm)} BPM
                        </span>
                        <span className="text-xs text-gray-500">
                          {track.features.key}
                        </span>
                        <span className="text-xs text-gray-500">
                          {Math.floor(track.duration_seconds / 60)}:{String(Math.floor(track.duration_seconds % 60)).padStart(2, '0')}
                        </span>
                      </div>
                      {/* Secondary moods */}
                      {track.classification.mood.secondary && track.classification.mood.secondary.length > 0 && (
                        <div className="mt-2 flex flex-wrap gap-1">
                          <span className="text-xs text-gray-400">Also:</span>
                          {track.classification.mood.secondary.map((mood, i) => (
                            <span key={i} className="inline-flex items-center px-2 py-0.5 rounded text-xs bg-purple-50 text-purple-600">
                              {mood}
                            </span>
                          ))}
                        </div>
                      )}
                      {/* Instruments */}
                      {track.classification.instruments && track.classification.instruments.length > 0 && (
                        <div className="mt-2 flex flex-wrap gap-1">
                          <span className="text-xs text-gray-400">Instruments:</span>
                          {track.classification.instruments.slice(0, 6).map((instrument, i) => (
                            <span key={i} className="inline-flex items-center px-2 py-0.5 rounded text-xs bg-green-50 text-green-700">
                              {instrument}
                            </span>
                          ))}
                          {track.classification.instruments.length > 6 && (
                            <span className="text-xs text-gray-400">+{track.classification.instruments.length - 6} more</span>
                          )}
                        </div>
                      )}
                    </div>
                    <div className="ml-4 flex gap-2">
                      <a
--- a/scripts/download-essentia-models.sh
+++ b/scripts/download-essentia-models.sh
@@ -6,7 +6,8 @@
 set -e  # Exit on error
 MODELS_DIR="backend/models"
-BASE_URL="https://essentia.upf.edu/models/classification-heads"
+CLASS_HEADS_URL="https://essentia.upf.edu/models/classification-heads"
 EMBEDDINGS_URL="https://essentia.upf.edu/models/feature-extractors/discogs-effnet"
 echo "📦 Downloading Essentia models..."
 echo "Models directory: $MODELS_DIR"
@@ -37,15 +38,23 @@ download_model() {
    fi
 }
-# Download each model
+# Download embedding model first (required for all classification heads)
 echo ""
 echo "Downloading embedding model..."
 download_model "discogs-effnet-bs64-1.pb" \
    "$EMBEDDINGS_URL/discogs-effnet-bs64-1.pb"
 # Download classification heads
 echo ""
 echo "Downloading classification heads..."
 download_model "mtg_jamendo_genre-discogs-effnet-1.pb" \
-    "$BASE_URL/mtg_jamendo_genre/mtg_jamendo_genre-discogs-effnet-1.pb"
+    "$CLASS_HEADS_URL/mtg_jamendo_genre/mtg_jamendo_genre-discogs-effnet-1.pb"
 download_model "mtg_jamendo_moodtheme-discogs-effnet-1.pb" \
-    "$BASE_URL/mtg_jamendo_moodtheme/mtg_jamendo_moodtheme-discogs-effnet-1.pb"
+    "$CLASS_HEADS_URL/mtg_jamendo_moodtheme/mtg_jamendo_moodtheme-discogs-effnet-1.pb"
 download_model "mtg_jamendo_instrument-discogs-effnet-1.pb" \
-    "$BASE_URL/mtg_jamendo_instrument/mtg_jamendo_instrument-discogs-effnet-1.pb"
+    "$CLASS_HEADS_URL/mtg_jamendo_instrument/mtg_jamendo_instrument-discogs-effnet-1.pb"
 echo ""
 echo "✅ All models downloaded successfully!"
Author	SHA1	Message	Date
Benoit	90c841310c	Fix beaucoup de choses : Genre OK, affichage des infos sur le front	2025-12-22 13:26:55 +01:00
Benoit	dec30019e2	WIP essentia	2025-12-22 12:59:20 +01:00