WIP essentia
This commit is contained in:
@@ -39,8 +39,8 @@ COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir numpy==1.24.3
|
||||
RUN pip install --no-cache-dir scipy==1.11.4
|
||||
|
||||
# Install Essentia - Python 3.9 with ARM64 support
|
||||
RUN pip install --no-cache-dir essentia
|
||||
# Install Essentia-TensorFlow - Python 3.9 with AMD64 support
|
||||
RUN pip install --no-cache-dir essentia-tensorflow
|
||||
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
|
||||
@@ -14,7 +14,8 @@ try:
|
||||
from essentia.standard import (
|
||||
MonoLoader,
|
||||
TensorflowPredictEffnetDiscogs,
|
||||
TensorflowPredict2D
|
||||
TensorflowPredict2D,
|
||||
TensorflowPredictMusiCNN
|
||||
)
|
||||
ESSENTIA_AVAILABLE = True
|
||||
except ImportError:
|
||||
@@ -55,7 +56,17 @@ class EssentiaClassifier:
|
||||
logger.warning(f"Models path {self.models_path} does not exist")
|
||||
return
|
||||
|
||||
# Model file names
|
||||
# Check for embedding model first
|
||||
embedding_file = "discogs-effnet-bs64-1.pb"
|
||||
embedding_path = self.models_path / embedding_file
|
||||
if embedding_path.exists():
|
||||
logger.info(f"Loading embedding model from {embedding_path}")
|
||||
self.models["embedding"] = str(embedding_path)
|
||||
else:
|
||||
logger.warning(f"Embedding model not found: {embedding_path}")
|
||||
return # Cannot proceed without embeddings
|
||||
|
||||
# Model file names for classification heads
|
||||
model_files = {
|
||||
"genre": "mtg_jamendo_genre-discogs-effnet-1.pb",
|
||||
"mood": "mtg_jamendo_moodtheme-discogs-effnet-1.pb",
|
||||
@@ -135,15 +146,26 @@ class EssentiaClassifier:
|
||||
return self._fallback_genre()
|
||||
|
||||
try:
|
||||
# Load audio
|
||||
# Step 1: Extract embeddings using discogs-effnet
|
||||
audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
|
||||
|
||||
# Predict
|
||||
model = TensorflowPredictEffnetDiscogs(
|
||||
graphFilename=self.models["genre"],
|
||||
embedding_model = TensorflowPredictEffnetDiscogs(
|
||||
graphFilename=self.models["embedding"],
|
||||
output="PartitionedCall:1"
|
||||
)
|
||||
predictions = model(audio)
|
||||
embeddings = embedding_model(audio)
|
||||
|
||||
# Average embeddings over time
|
||||
embeddings_mean = np.mean(embeddings, axis=0)
|
||||
|
||||
# Step 2: Feed embeddings to classification head
|
||||
classifier = TensorflowPredict2D(
|
||||
graphFilename=self.models["genre"],
|
||||
input="model/Placeholder",
|
||||
output="model/Sigmoid"
|
||||
)
|
||||
predictions = classifier(embeddings_mean.reshape(1, -1))
|
||||
predictions = predictions[0] # Remove batch dimension
|
||||
|
||||
# Get top predictions
|
||||
top_indices = np.argsort(predictions)[::-1][:5]
|
||||
@@ -172,19 +194,28 @@ class EssentiaClassifier:
|
||||
Returns:
|
||||
Dictionary with mood predictions
|
||||
"""
|
||||
if not ESSENTIA_AVAILABLE or "mood" not in self.models:
|
||||
if not ESSENTIA_AVAILABLE or "mood" not in self.models or "embedding" not in self.models:
|
||||
return self._fallback_mood()
|
||||
|
||||
try:
|
||||
# Load audio
|
||||
# Step 1: Extract embeddings using discogs-effnet
|
||||
audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
|
||||
|
||||
# Predict
|
||||
model = TensorflowPredictEffnetDiscogs(
|
||||
graphFilename=self.models["mood"],
|
||||
embedding_model = TensorflowPredictEffnetDiscogs(
|
||||
graphFilename=self.models["embedding"],
|
||||
output="PartitionedCall:1"
|
||||
)
|
||||
predictions = model(audio)
|
||||
embeddings = embedding_model(audio)
|
||||
embeddings_mean = np.mean(embeddings, axis=0)
|
||||
|
||||
# Step 2: Feed embeddings to classification head
|
||||
classifier = TensorflowPredict2D(
|
||||
graphFilename=self.models["mood"],
|
||||
input="model/Placeholder",
|
||||
output="model/Sigmoid"
|
||||
)
|
||||
predictions = classifier(embeddings_mean.reshape(1, -1))
|
||||
predictions = predictions[0]
|
||||
|
||||
# Get top predictions
|
||||
top_indices = np.argsort(predictions)[::-1][:5]
|
||||
@@ -216,19 +247,28 @@ class EssentiaClassifier:
|
||||
Returns:
|
||||
List of instruments with confidence scores
|
||||
"""
|
||||
if not ESSENTIA_AVAILABLE or "instrument" not in self.models:
|
||||
if not ESSENTIA_AVAILABLE or "instrument" not in self.models or "embedding" not in self.models:
|
||||
return self._fallback_instruments()
|
||||
|
||||
try:
|
||||
# Load audio
|
||||
# Step 1: Extract embeddings using discogs-effnet
|
||||
audio = MonoLoader(filename=audio_path, sampleRate=16000, resampleQuality=4)()
|
||||
|
||||
# Predict
|
||||
model = TensorflowPredictEffnetDiscogs(
|
||||
graphFilename=self.models["instrument"],
|
||||
embedding_model = TensorflowPredictEffnetDiscogs(
|
||||
graphFilename=self.models["embedding"],
|
||||
output="PartitionedCall:1"
|
||||
)
|
||||
predictions = model(audio)
|
||||
embeddings = embedding_model(audio)
|
||||
embeddings_mean = np.mean(embeddings, axis=0)
|
||||
|
||||
# Step 2: Feed embeddings to classification head
|
||||
classifier = TensorflowPredict2D(
|
||||
graphFilename=self.models["instrument"],
|
||||
input="model/Placeholder",
|
||||
output="model/Sigmoid"
|
||||
)
|
||||
predictions = classifier(embeddings_mean.reshape(1, -1))
|
||||
predictions = predictions[0]
|
||||
|
||||
# Get instruments above threshold
|
||||
threshold = 0.1
|
||||
|
||||
Reference in New Issue
Block a user