Compare commits
2 Commits
83de840672
...
90c841310c
| Author | SHA1 | Date | |
|---|---|---|---|
| 90c841310c | |||
| dec30019e2 |
15
.claude/settings.local.json
Normal file
15
.claude/settings.local.json
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
{
|
||||||
|
"permissions": {
|
||||||
|
"allow": [
|
||||||
|
"Bash(node --version:*)",
|
||||||
|
"Bash(docker --version:*)",
|
||||||
|
"Bash(docker-compose:*)",
|
||||||
|
"Bash(test:*)",
|
||||||
|
"Bash(cp:*)",
|
||||||
|
"Bash(bash scripts/download-essentia-models.sh:*)",
|
||||||
|
"Bash(curl:*)",
|
||||||
|
"Bash(docker logs:*)",
|
||||||
|
"Bash(docker exec:*)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -95,6 +95,10 @@ curl -X POST http://localhost:8001/api/analyze/folder \
|
|||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{"path": "/audio/music", "recursive": true}'
|
-d '{"path": "/audio/music", "recursive": true}'
|
||||||
```
|
```
|
||||||
|
#### Sous Windows 10
|
||||||
|
````bash
|
||||||
|
curl.exe -X POST http://localhost:8001/api/analyze/folder -H "Content-Type: application/json" -d '{\"path\": \"/audio/\", \"recursive\": true}'
|
||||||
|
````
|
||||||
|
|
||||||
### Rechercher des pistes
|
### Rechercher des pistes
|
||||||
|
|
||||||
|
|||||||
@@ -39,8 +39,8 @@ COPY requirements.txt .
|
|||||||
RUN pip install --no-cache-dir numpy==1.24.3
|
RUN pip install --no-cache-dir numpy==1.24.3
|
||||||
RUN pip install --no-cache-dir scipy==1.11.4
|
RUN pip install --no-cache-dir scipy==1.11.4
|
||||||
|
|
||||||
# Install Essentia - Python 3.9 with ARM64 support
|
# Install Essentia-TensorFlow - Python 3.9 AMD64 support
|
||||||
RUN pip install --no-cache-dir essentia
|
RUN pip install --no-cache-dir essentia-tensorflow
|
||||||
|
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,8 @@ try:
|
|||||||
from essentia.standard import (
|
from essentia.standard import (
|
||||||
MonoLoader,
|
MonoLoader,
|
||||||
TensorflowPredictEffnetDiscogs,
|
TensorflowPredictEffnetDiscogs,
|
||||||
TensorflowPredict2D
|
TensorflowPredict2D,
|
||||||
|
TensorflowPredictMusiCNN
|
||||||
)
|
)
|
||||||
ESSENTIA_AVAILABLE = True
|
ESSENTIA_AVAILABLE = True
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -55,7 +56,17 @@ class EssentiaClassifier:
|
|||||||
logger.warning(f"Models path {self.models_path} does not exist")
|
logger.warning(f"Models path {self.models_path} does not exist")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Model file names
|
# Check for embedding model first
|
||||||
|
embedding_file = "discogs-effnet-bs64-1.pb"
|
||||||
|
embedding_path = self.models_path / embedding_file
|
||||||
|
if embedding_path.exists():
|
||||||
|
logger.info(f"Loading embedding model from {embedding_path}")
|
||||||
|
self.models["embedding"] = str(embedding_path)
|
||||||
|
else:
|
||||||
|
logger.warning(f"Embedding model not found: {embedding_path}")
|
||||||
|
return # Cannot proceed without embeddings
|
||||||
|
|
||||||
|
# Model file names for classification heads
|
||||||
model_files = {
|
model_files = {
|
||||||
"genre": "mtg_jamendo_genre-discogs-effnet-1.pb",
|
"genre": "mtg_jamendo_genre-discogs-effnet-1.pb",
|
||||||
"mood": "mtg_jamendo_moodtheme-discogs-effnet-1.pb",
|
"mood": "mtg_jamendo_moodtheme-discogs-effnet-1.pb",
|
||||||
@@ -135,23 +146,47 @@ class EssentiaClassifier:
|
|||||||
return self._fallback_genre()
|
return self._fallback_genre()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Load audio
|
# Step 1: Extract embeddings using discogs-effnet
|
||||||
audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
|
audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
|
||||||
|
|
||||||
# Predict
|
embedding_model = TensorflowPredictEffnetDiscogs(
|
||||||
model = TensorflowPredictEffnetDiscogs(
|
graphFilename=self.models["embedding"],
|
||||||
graphFilename=self.models["genre"],
|
|
||||||
output="PartitionedCall:1"
|
output="PartitionedCall:1"
|
||||||
)
|
)
|
||||||
predictions = model(audio)
|
embeddings = embedding_model(audio)
|
||||||
|
|
||||||
|
# Average embeddings over time
|
||||||
|
embeddings_mean = np.mean(embeddings, axis=0)
|
||||||
|
|
||||||
|
# Step 2: Feed embeddings to classification head
|
||||||
|
classifier = TensorflowPredict2D(
|
||||||
|
graphFilename=self.models["genre"],
|
||||||
|
input="model/Placeholder",
|
||||||
|
output="model/Sigmoid"
|
||||||
|
)
|
||||||
|
predictions = classifier(embeddings_mean.reshape(1, -1))
|
||||||
|
predictions = predictions[0] # Remove batch dimension
|
||||||
|
|
||||||
# Get top predictions
|
# Get top predictions
|
||||||
top_indices = np.argsort(predictions)[::-1][:5]
|
|
||||||
labels = self.class_labels.get("genre", [])
|
labels = self.class_labels.get("genre", [])
|
||||||
|
logger.info(f"Genre predictions shape: {predictions.shape}, num_labels: {len(labels)}")
|
||||||
|
|
||||||
primary = labels[top_indices[0]] if labels else "unknown"
|
# Ensure we don't go out of bounds
|
||||||
secondary = [labels[i] for i in top_indices[1:4]] if labels else []
|
if len(predictions) == 0:
|
||||||
confidence = float(predictions[top_indices[0]])
|
logger.warning("No predictions returned from genre model")
|
||||||
|
return self._fallback_genre()
|
||||||
|
|
||||||
|
top_indices = np.argsort(predictions)[::-1][:5]
|
||||||
|
# Only use indices that are within the labels range
|
||||||
|
valid_top_indices = [i for i in top_indices if i < len(labels)]
|
||||||
|
|
||||||
|
if not valid_top_indices:
|
||||||
|
logger.warning(f"No valid indices found. Predictions: {len(predictions)}, Labels: {len(labels)}")
|
||||||
|
return self._fallback_genre()
|
||||||
|
|
||||||
|
primary = labels[valid_top_indices[0]]
|
||||||
|
secondary = [labels[i] for i in valid_top_indices[1:4]]
|
||||||
|
confidence = float(predictions[valid_top_indices[0]])
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"primary": primary,
|
"primary": primary,
|
||||||
@@ -172,26 +207,43 @@ class EssentiaClassifier:
|
|||||||
Returns:
|
Returns:
|
||||||
Dictionary with mood predictions
|
Dictionary with mood predictions
|
||||||
"""
|
"""
|
||||||
if not ESSENTIA_AVAILABLE or "mood" not in self.models:
|
if not ESSENTIA_AVAILABLE or "mood" not in self.models or "embedding" not in self.models:
|
||||||
return self._fallback_mood()
|
return self._fallback_mood()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Load audio
|
# Step 1: Extract embeddings using discogs-effnet
|
||||||
audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
|
audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
|
||||||
|
|
||||||
# Predict
|
embedding_model = TensorflowPredictEffnetDiscogs(
|
||||||
model = TensorflowPredictEffnetDiscogs(
|
graphFilename=self.models["embedding"],
|
||||||
graphFilename=self.models["mood"],
|
|
||||||
output="PartitionedCall:1"
|
output="PartitionedCall:1"
|
||||||
)
|
)
|
||||||
predictions = model(audio)
|
embeddings = embedding_model(audio)
|
||||||
|
embeddings_mean = np.mean(embeddings, axis=0)
|
||||||
|
|
||||||
|
# Step 2: Feed embeddings to classification head
|
||||||
|
classifier = TensorflowPredict2D(
|
||||||
|
graphFilename=self.models["mood"],
|
||||||
|
input="model/Placeholder",
|
||||||
|
output="model/Sigmoid"
|
||||||
|
)
|
||||||
|
predictions = classifier(embeddings_mean.reshape(1, -1))
|
||||||
|
predictions = predictions[0]
|
||||||
|
|
||||||
# Get top predictions
|
# Get top predictions
|
||||||
top_indices = np.argsort(predictions)[::-1][:5]
|
|
||||||
labels = self.class_labels.get("mood", [])
|
labels = self.class_labels.get("mood", [])
|
||||||
|
|
||||||
primary = labels[top_indices[0]] if labels else "unknown"
|
if len(predictions) == 0:
|
||||||
secondary = [labels[i] for i in top_indices[1:3]] if labels else []
|
return self._fallback_mood()
|
||||||
|
|
||||||
|
top_indices = np.argsort(predictions)[::-1][:5]
|
||||||
|
valid_top_indices = [i for i in top_indices if i < len(labels)]
|
||||||
|
|
||||||
|
if not valid_top_indices:
|
||||||
|
return self._fallback_mood()
|
||||||
|
|
||||||
|
primary = labels[valid_top_indices[0]] if valid_top_indices else "unknown"
|
||||||
|
secondary = [labels[i] for i in valid_top_indices[1:3]] if len(valid_top_indices) > 1 else []
|
||||||
|
|
||||||
# Estimate arousal and valence from mood labels (simplified)
|
# Estimate arousal and valence from mood labels (simplified)
|
||||||
arousal, valence = self._estimate_arousal_valence(primary)
|
arousal, valence = self._estimate_arousal_valence(primary)
|
||||||
@@ -216,19 +268,28 @@ class EssentiaClassifier:
|
|||||||
Returns:
|
Returns:
|
||||||
List of instruments with confidence scores
|
List of instruments with confidence scores
|
||||||
"""
|
"""
|
||||||
if not ESSENTIA_AVAILABLE or "instrument" not in self.models:
|
if not ESSENTIA_AVAILABLE or "instrument" not in self.models or "embedding" not in self.models:
|
||||||
return self._fallback_instruments()
|
return self._fallback_instruments()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Load audio
|
# Step 1: Extract embeddings using discogs-effnet
|
||||||
audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
|
audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
|
||||||
|
|
||||||
# Predict
|
embedding_model = TensorflowPredictEffnetDiscogs(
|
||||||
model = TensorflowPredictEffnetDiscogs(
|
graphFilename=self.models["embedding"],
|
||||||
graphFilename=self.models["instrument"],
|
|
||||||
output="PartitionedCall:1"
|
output="PartitionedCall:1"
|
||||||
)
|
)
|
||||||
predictions = model(audio)
|
embeddings = embedding_model(audio)
|
||||||
|
embeddings_mean = np.mean(embeddings, axis=0)
|
||||||
|
|
||||||
|
# Step 2: Feed embeddings to classification head
|
||||||
|
classifier = TensorflowPredict2D(
|
||||||
|
graphFilename=self.models["instrument"],
|
||||||
|
input="model/Placeholder",
|
||||||
|
output="model/Sigmoid"
|
||||||
|
)
|
||||||
|
predictions = classifier(embeddings_mean.reshape(1, -1))
|
||||||
|
predictions = predictions[0]
|
||||||
|
|
||||||
# Get instruments above threshold
|
# Get instruments above threshold
|
||||||
threshold = 0.1
|
threshold = 0.1
|
||||||
|
|||||||
@@ -40,10 +40,15 @@ services:
|
|||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
frontend:
|
frontend:
|
||||||
build: ./frontend
|
build:
|
||||||
|
context: ./frontend
|
||||||
|
args:
|
||||||
|
NEXT_PUBLIC_API_URL: http://localhost:8001
|
||||||
container_name: audio_classifier_ui
|
container_name: audio_classifier_ui
|
||||||
environment:
|
environment:
|
||||||
NEXT_PUBLIC_API_URL: http://backend:8000
|
# Use localhost:8001 because the browser (client-side) needs to access the API
|
||||||
|
# The backend is mapped to port 8001 on the host machine
|
||||||
|
NEXT_PUBLIC_API_URL: http://localhost:8001
|
||||||
ports:
|
ports:
|
||||||
- "3000:3000"
|
- "3000:3000"
|
||||||
depends_on:
|
depends_on:
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
node_modules
|
node_modules
|
||||||
.next
|
.next
|
||||||
.git
|
.git
|
||||||
.env.local
|
|
||||||
npm-debug.log*
|
npm-debug.log*
|
||||||
yarn-debug.log*
|
yarn-debug.log*
|
||||||
yarn-error.log*
|
yarn-error.log*
|
||||||
|
|||||||
1
frontend/.env.local
Normal file
1
frontend/.env.local
Normal file
@@ -0,0 +1 @@
|
|||||||
|
NEXT_PUBLIC_API_URL=http://localhost:8001
|
||||||
@@ -12,6 +12,10 @@ RUN npm ci
|
|||||||
# Copy application code
|
# Copy application code
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
|
# Build argument for API URL
|
||||||
|
ARG NEXT_PUBLIC_API_URL=http://localhost:8001
|
||||||
|
ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
|
||||||
|
|
||||||
# Build the application
|
# Build the application
|
||||||
RUN npm run build
|
RUN npm run build
|
||||||
|
|
||||||
|
|||||||
@@ -76,6 +76,8 @@ export default function Home() {
|
|||||||
<div className="flex justify-between items-start">
|
<div className="flex justify-between items-start">
|
||||||
<div className="flex-1">
|
<div className="flex-1">
|
||||||
<h3 className="font-medium text-gray-900">{track.filename}</h3>
|
<h3 className="font-medium text-gray-900">{track.filename}</h3>
|
||||||
|
|
||||||
|
{/* Primary metadata */}
|
||||||
<div className="mt-1 flex flex-wrap gap-2">
|
<div className="mt-1 flex flex-wrap gap-2">
|
||||||
<span className="inline-flex items-center px-2 py-1 rounded text-xs bg-blue-100 text-blue-800">
|
<span className="inline-flex items-center px-2 py-1 rounded text-xs bg-blue-100 text-blue-800">
|
||||||
{track.classification.genre.primary}
|
{track.classification.genre.primary}
|
||||||
@@ -86,10 +88,40 @@ export default function Home() {
|
|||||||
<span className="text-xs text-gray-500">
|
<span className="text-xs text-gray-500">
|
||||||
{Math.round(track.features.tempo_bpm)} BPM
|
{Math.round(track.features.tempo_bpm)} BPM
|
||||||
</span>
|
</span>
|
||||||
|
<span className="text-xs text-gray-500">
|
||||||
|
{track.features.key}
|
||||||
|
</span>
|
||||||
<span className="text-xs text-gray-500">
|
<span className="text-xs text-gray-500">
|
||||||
{Math.floor(track.duration_seconds / 60)}:{String(Math.floor(track.duration_seconds % 60)).padStart(2, '0')}
|
{Math.floor(track.duration_seconds / 60)}:{String(Math.floor(track.duration_seconds % 60)).padStart(2, '0')}
|
||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{/* Secondary moods */}
|
||||||
|
{track.classification.mood.secondary && track.classification.mood.secondary.length > 0 && (
|
||||||
|
<div className="mt-2 flex flex-wrap gap-1">
|
||||||
|
<span className="text-xs text-gray-400">Also:</span>
|
||||||
|
{track.classification.mood.secondary.map((mood, i) => (
|
||||||
|
<span key={i} className="inline-flex items-center px-2 py-0.5 rounded text-xs bg-purple-50 text-purple-600">
|
||||||
|
{mood}
|
||||||
|
</span>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{/* Instruments */}
|
||||||
|
{track.classification.instruments && track.classification.instruments.length > 0 && (
|
||||||
|
<div className="mt-2 flex flex-wrap gap-1">
|
||||||
|
<span className="text-xs text-gray-400">Instruments:</span>
|
||||||
|
{track.classification.instruments.slice(0, 6).map((instrument, i) => (
|
||||||
|
<span key={i} className="inline-flex items-center px-2 py-0.5 rounded text-xs bg-green-50 text-green-700">
|
||||||
|
{instrument}
|
||||||
|
</span>
|
||||||
|
))}
|
||||||
|
{track.classification.instruments.length > 6 && (
|
||||||
|
<span className="text-xs text-gray-400">+{track.classification.instruments.length - 6} more</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
<div className="ml-4 flex gap-2">
|
<div className="ml-4 flex gap-2">
|
||||||
<a
|
<a
|
||||||
|
|||||||
@@ -6,7 +6,8 @@
|
|||||||
set -e # Exit on error
|
set -e # Exit on error
|
||||||
|
|
||||||
MODELS_DIR="backend/models"
|
MODELS_DIR="backend/models"
|
||||||
BASE_URL="https://essentia.upf.edu/models/classification-heads"
|
CLASS_HEADS_URL="https://essentia.upf.edu/models/classification-heads"
|
||||||
|
EMBEDDINGS_URL="https://essentia.upf.edu/models/feature-extractors/discogs-effnet"
|
||||||
|
|
||||||
echo "📦 Downloading Essentia models..."
|
echo "📦 Downloading Essentia models..."
|
||||||
echo "Models directory: $MODELS_DIR"
|
echo "Models directory: $MODELS_DIR"
|
||||||
@@ -37,15 +38,23 @@ download_model() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Download each model
|
# Download embedding model first (required for all classification heads)
|
||||||
|
echo ""
|
||||||
|
echo "Downloading embedding model..."
|
||||||
|
download_model "discogs-effnet-bs64-1.pb" \
|
||||||
|
"$EMBEDDINGS_URL/discogs-effnet-bs64-1.pb"
|
||||||
|
|
||||||
|
# Download classification heads
|
||||||
|
echo ""
|
||||||
|
echo "Downloading classification heads..."
|
||||||
download_model "mtg_jamendo_genre-discogs-effnet-1.pb" \
|
download_model "mtg_jamendo_genre-discogs-effnet-1.pb" \
|
||||||
"$BASE_URL/mtg_jamendo_genre/mtg_jamendo_genre-discogs-effnet-1.pb"
|
"$CLASS_HEADS_URL/mtg_jamendo_genre/mtg_jamendo_genre-discogs-effnet-1.pb"
|
||||||
|
|
||||||
download_model "mtg_jamendo_moodtheme-discogs-effnet-1.pb" \
|
download_model "mtg_jamendo_moodtheme-discogs-effnet-1.pb" \
|
||||||
"$BASE_URL/mtg_jamendo_moodtheme/mtg_jamendo_moodtheme-discogs-effnet-1.pb"
|
"$CLASS_HEADS_URL/mtg_jamendo_moodtheme/mtg_jamendo_moodtheme-discogs-effnet-1.pb"
|
||||||
|
|
||||||
download_model "mtg_jamendo_instrument-discogs-effnet-1.pb" \
|
download_model "mtg_jamendo_instrument-discogs-effnet-1.pb" \
|
||||||
"$BASE_URL/mtg_jamendo_instrument/mtg_jamendo_instrument-discogs-effnet-1.pb"
|
"$CLASS_HEADS_URL/mtg_jamendo_instrument/mtg_jamendo_instrument-discogs-effnet-1.pb"
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "✅ All models downloaded successfully!"
|
echo "✅ All models downloaded successfully!"
|
||||||
|
|||||||
Reference in New Issue
Block a user