# Source: SpotifyRecAlg/swingmusic/services/metadata_aggregator.py
# Last commit: 6e8fedf534 "first commit" by Tomas Dvorak, 2026-04-13 17:46:58 +02:00
# Size as listed by the repository viewer: 325 lines, 11 KiB (Python)

"""
Enhanced Metadata Aggregation System for Universal Music Downloader
Provides cross-service matching and metadata enrichment without API keys
"""
import logging
import re
from dataclasses import dataclass
from typing import Any
# Module-level logger named after this module; not referenced elsewhere in
# the visible part of this file.
logger = logging.getLogger(__name__)
@dataclass
class CrossServiceMatch:
"""Cross-service song match information"""
service: str
service_id: str
title: str
artist: str
url: str
confidence: float
isrc: str | None = None
duration_ms: int | None = None
release_date: str | None = None
cover_art: str | None = None
@dataclass
class EnhancedMetadata:
"""Enhanced metadata with cross-service information"""
primary_metadata: Any
cross_matches: list[CrossServiceMatch]
canonical_info: dict[str, Any] | None = None
confidence_score: float = 0.0
recommendations: list[str] = None
class MetadataAggregator:
"""Aggregates and enhances metadata from multiple sources"""
def __init__(self):
self.canonical_cache = {}
self.artist_aliases = {}
def normalize_title(self, title: str) -> str:
"""Normalize song title for better matching"""
# Remove extra whitespace and convert to lowercase
normalized = title.strip().lower()
# Remove common prefixes and suffixes
prefixes_to_remove = [
"official video",
"official audio",
"lyrics",
"live",
"acoustic",
"remastered",
]
for prefix in prefixes_to_remove:
normalized = re.sub(rf"\s*{prefix}\s*", "", normalized, flags=re.IGNORECASE)
# Remove content in parentheses
normalized = re.sub(r"\s*\([^)]*\)\s*", "", normalized)
# Remove extra dashes and special characters
normalized = re.sub(r"\s*[-–—]\s*", " ", normalized)
return normalized.strip()
def normalize_artist(self, artist: str) -> str:
"""Normalize artist name for better matching"""
normalized = artist.strip().lower()
# Remove "feat." and similar
normalized = re.sub(r"\s*feat\.\s*", " feat. ", normalized)
# Handle "vs" collaborations
normalized = re.sub(r"\s+vs\s+", " vs ", normalized)
return normalized.strip()
def calculate_similarity_score(
self, title1: str, artist1: str, title2: str, artist2: str
) -> float:
"""Calculate similarity score between two songs"""
title_score = 0.0
artist_score = 0.0
# Title similarity
if title1 and title2:
norm_title1 = self.normalize_title(title1)
norm_title2 = self.normalize_title(title2)
if norm_title1 == norm_title2:
title_score = 1.0
elif norm_title1 in norm_title2 or norm_title2 in norm_title1:
title_score = 0.8
else:
# Partial match based on words
words1 = set(norm_title1.split())
words2 = set(norm_title2.split())
common_words = words1.intersection(words2)
title_score = (
len(common_words) / max(len(words1), len(words2))
if words1 and words2
else 0.0
)
# Artist similarity
if artist1 and artist2:
norm_artist1 = self.normalize_artist(artist1)
norm_artist2 = self.normalize_artist(artist2)
if norm_artist1 == norm_artist2:
artist_score = 1.0
elif norm_artist1 in norm_artist2 or norm_artist2 in norm_artist1:
artist_score = 0.8
else:
# Partial match based on words
words1 = set(norm_artist1.split())
words2 = set(norm_artist2.split())
common_words = words1.intersection(words2)
artist_score = (
len(common_words) / max(len(words1), len(words2))
if words1 and words2
else 0.0
)
# Combined score (title is more important)
return title_score * 0.7 + artist_score * 0.3
def find_cross_service_matches(
self, primary_metadata: Any, all_services_data: dict[str, Any]
) -> list[CrossServiceMatch]:
"""Find matches of the same song across other services"""
matches = []
if not primary_metadata:
return matches
primary_title = getattr(primary_metadata, "title", "")
primary_artist = getattr(primary_metadata, "artist", "")
getattr(primary_metadata, "isrc", None)
for service, data in all_services_data.items():
service_attr = getattr(primary_metadata, "service", None)
if service_attr and service == service_attr.value:
continue # Skip: same service
service_title = getattr(data, "title", "")
service_artist = getattr(data, "artist", "")
service_url = getattr(data, "original_url", "")
# Calculate similarity score
similarity = self.calculate_similarity_score(
primary_title, primary_artist, service_title, service_artist
)
# Only include matches with reasonable similarity
if similarity >= 0.6: # 60% similarity threshold
match = CrossServiceMatch(
service=service,
service_id=getattr(data, "service_id", ""),
title=service_title,
artist=service_artist,
url=service_url,
confidence=similarity,
isrc=getattr(data, "isrc", None),
duration_ms=getattr(data, "duration_ms", None),
release_date=getattr(data, "release_date", None),
cover_art=getattr(data, "image_url", None),
)
matches.append(match)
# Sort by confidence score
matches.sort(key=lambda x: x.confidence, reverse=True)
return matches
def get_canonical_info(self, isrc: str) -> dict[str, Any] | None:
"""Get canonical information from ISRC"""
if not isrc or len(isrc) != 12:
return None
# Parse ISRC: Country-Registration Year-Sequence Number
country = isrc[:2]
year = isrc[2:6]
sequence = isrc[6:]
return {
"isrc": isrc,
"country": country,
"year": year,
"sequence": sequence,
"type": "recording" if sequence.isdigit() else "other",
}
def generate_recommendations(
self, metadata: Any, cross_matches: list[CrossServiceMatch]
) -> list[str]:
"""Generate recommendations based on metadata and cross matches"""
recommendations = []
# Base recommendations on genre
genre = getattr(metadata, "genre", "")
if genre:
recommendations.append(f"Similar {genre} tracks")
# Add recommendations from high-confidence cross matches
high_confidence_matches = [m for m in cross_matches if m.confidence >= 0.8]
for match in high_confidence_matches[:3]: # Top 3 matches
recommendations.append(f"Also available on {match.service}")
# Add recommendations based on artist
artist = getattr(metadata, "artist", "")
if artist:
recommendations.append(f"More from {artist}")
return list(set(recommendations)) # Remove duplicates
def create_enhanced_metadata(
self, primary_metadata: Any, cross_matches: list[CrossServiceMatch]
) -> EnhancedMetadata:
"""Create enhanced metadata object"""
# Calculate confidence score
max_confidence = (
max([m.confidence for m in cross_matches]) if cross_matches else 0.0
)
# Get canonical info if ISRC exists
canonical_info = None
isrc = getattr(primary_metadata, "isrc", None)
if isrc:
canonical_info = self.get_canonical_info(isrc)
# Generate recommendations
recommendations = self.generate_recommendations(primary_metadata, cross_matches)
return EnhancedMetadata(
primary_metadata=primary_metadata,
cross_matches=cross_matches,
canonical_info=canonical_info,
confidence_score=max_confidence,
recommendations=recommendations,
)
class FreeMetadataEnricher:
    """Heuristic metadata enrichment that needs no API keys.

    Every method works purely on the values passed in; nothing here performs
    network access.
    """

    # Mood -> keywords looked for (as substrings) in the lowercased title and
    # artist.  Moods are checked in this order and the first hit wins.
    _MOOD_KEYWORDS = {
        "happy": ["love", "joy", "sun", "summer", "dance", "party"],
        "sad": ["cry", "tears", "rain", "winter", "goodbye", "broken"],
        "energetic": ["rock", "power", "energy", "loud", "fast"],
        "calm": ["peace", "quiet", "soft", "gentle", "acoustic"],
        "dark": ["dark", "death", "black", "night", "shadow"],
    }

    # (attribute name, weight) pairs used by calculate_quality_score.  The
    # order is kept fixed so the floating-point sum is reproducible.
    _QUALITY_WEIGHTS = (
        ("isrc", 0.3),  # ISRC is treated as the strongest quality signal
        ("release_date", 0.2),
        ("genre", 0.2),
        ("image_url", 0.1),
        ("duration_ms", 0.1),
        ("metadata", 0.1),  # extended metadata
    )

    def __init__(self):
        self.aggregator = MetadataAggregator()

    def extract_lyrics_snippet(self, title: str, artist: str) -> str:
        """Return a lyrics snippet for search enhancement.

        Currently a stub that always returns an empty string; the original
        author left lyrics scraping out to avoid copyright issues.
        """
        return ""

    def detect_language(self, title: str, artist: str) -> str:
        """Guess the language from the characters in *title* and *artist*.

        Returns ``"non-english"`` when any character is outside ASCII,
        otherwise ``"english"``.  ``None`` values are treated as empty.
        """
        combined = (title or "") + (artist or "")
        return "english" if combined.isascii() else "non-english"

    def estimate_mood(self, title: str, artist: str) -> str:
        """Guess a mood label from keywords in *title* and *artist*.

        Returns the first mood whose keyword occurs as a substring of the
        lowercased title or artist, or ``"neutral"`` when nothing matches.
        ``None`` values are treated as empty.
        """
        title_lower = (title or "").lower()
        artist_lower = (artist or "").lower()

        for mood, keywords in self._MOOD_KEYWORDS.items():
            for keyword in keywords:
                if keyword in title_lower or keyword in artist_lower:
                    return mood
        return "neutral"

    def calculate_quality_score(self, metadata: Any) -> float:
        """Score metadata completeness in the range 0.0-1.0.

        Each truthy attribute listed in ``_QUALITY_WEIGHTS`` adds its weight;
        the total is capped at 1.0.
        """
        score = 0.0
        for attribute, weight in self._QUALITY_WEIGHTS:
            if getattr(metadata, attribute, None):
                score += weight
        return min(score, 1.0)
# Global instances, created once at import time.
# Note: FreeMetadataEnricher() builds its own MetadataAggregator, so
# free_enricher.aggregator is a separate object from metadata_aggregator.
metadata_aggregator = MetadataAggregator()
free_enricher = FreeMetadataEnricher()