# Mirror of https://github.com/Dvorinka/SpotifyRecAlg.git
# Synced 2026-06-04 12:33:03 +00:00 - 325 lines, 11 KiB, Python
"""
Enhanced Metadata Aggregation System for Universal Music Downloader.

Provides cross-service matching and metadata enrichment without API keys.
"""
|
|
|
|
import logging
import re
from dataclasses import dataclass, field
from typing import Any
|
|
|
|
# Module-level logger, named after this module through __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class CrossServiceMatch:
    """Cross-service song match information.

    Describes one candidate for the same song on another service, together
    with the similarity score that qualified it. Field order is part of the
    positional constructor interface.
    """

    # Name of the service the candidate came from.
    service: str
    # Identifier of the track on that service.
    service_id: str
    # Title as reported by that service (not normalized).
    title: str
    # Artist as reported by that service (not normalized).
    artist: str
    # Link to the track on that service.
    url: str
    # Similarity to the primary track; MetadataAggregator produces 0.0-1.0.
    confidence: float
    # Recording code of the candidate, when the service exposes one.
    isrc: str | None = None
    # Track length; presumably milliseconds, judging by the name -- TODO confirm.
    duration_ms: int | None = None
    # Release date as a string; the format is not fixed by this module.
    release_date: str | None = None
    # Cover image reference; MetadataAggregator fills it from `image_url`.
    cover_art: str | None = None
|
|
|
|
|
|
@dataclass
class EnhancedMetadata:
    """Enhanced metadata with cross-service information.

    Bundles a primary track record with the matches found on other services
    and the values derived from them.
    """

    # Source record the enrichment was built from; its type is not
    # constrained by this module.
    primary_metadata: Any
    # Candidate matches for the same song on other services.
    cross_matches: list[CrossServiceMatch]
    # Parsed ISRC components, or None when none were derived.
    canonical_info: dict[str, Any] | None = None
    # Overall confidence for the enrichment; defaults to 0.0.
    confidence_score: float = 0.0
    # Human-readable suggestion strings. A default_factory gives each
    # instance its own empty list, so the attribute always matches its
    # annotation and can be iterated without a None check.
    recommendations: list[str] = field(default_factory=list)
|
|
|
|
|
|
class MetadataAggregator:
    """Aggregates and enhances metadata from multiple sources.

    Matching is purely textual: titles and artists are normalized, then
    compared by equality, containment and word overlap. Metadata objects are
    read with ``getattr`` and defaults, so any object exposing the expected
    attribute names can be passed in.
    """

    # Version/marketing tags dropped from titles before comparison.
    _TITLE_NOISE = (
        "official video",
        "official audio",
        "lyrics",
        "live",
        "acoustic",
        "remastered",
    )
    # Whole-word match only, so "live" is not carved out of "alive".
    _TITLE_NOISE_RE = re.compile(
        r"\b(?:" + "|".join(map(re.escape, _TITLE_NOISE)) + r")\b"
    )

    # The title counts for more than the artist in the combined score.
    TITLE_WEIGHT = 0.7
    ARTIST_WEIGHT = 0.3
    # Candidates scoring below this are not reported as matches.
    MIN_MATCH_CONFIDENCE = 0.6
    # Matches at or above this feed the "Also available on ..." suggestions.
    HIGH_CONFIDENCE = 0.8

    def __init__(self):
        """Create an aggregator with empty lookup tables."""
        # Neither table is read or written by the methods of this class;
        # they remain available as public attributes.
        self.canonical_cache: dict[str, Any] = {}
        self.artist_aliases: dict[str, Any] = {}

    def normalize_title(self, title: str) -> str:
        """Normalize song title for better matching.

        Lowercases the title, drops the known noise tags (as whole words),
        removes parenthesized content, turns dashes into spaces and collapses
        whitespace runs.

        Args:
            title: Raw title text.

        Returns:
            The normalized title. If stripping would leave nothing (the title
            consisted only of noise, e.g. "Live"), the lowercased,
            whitespace-collapsed original is returned instead, so unrelated
            titles never collapse to the same empty string.
        """
        lowered = title.strip().lower()

        # Replace with a space rather than nothing so neighbouring words
        # are not glued together.
        cleaned = self._TITLE_NOISE_RE.sub(" ", lowered)
        cleaned = re.sub(r"\([^)]*\)", " ", cleaned)
        # Square brackets emptied by the tag removal carry no information.
        cleaned = re.sub(r"\[\s*\]", " ", cleaned)
        cleaned = re.sub(r"[-–—]", " ", cleaned)
        cleaned = " ".join(cleaned.split())

        return cleaned or " ".join(lowered.split())

    def normalize_artist(self, artist: str) -> str:
        """Normalize artist name for better matching.

        Lowercases the name and canonicalizes the spacing around "feat." and
        "vs"; the markers themselves are kept.

        Args:
            artist: Raw artist text.

        Returns:
            The normalized artist string.
        """
        normalized = artist.strip().lower()

        # Uniform " feat. " spacing; the word boundary keeps words that
        # merely end in "feat." (e.g. "defeat.") intact.
        normalized = re.sub(r"\s*\bfeat\.\s*", " feat. ", normalized)

        # Collapse the whitespace around "vs" collaborations.
        normalized = re.sub(r"\s+vs\s+", " vs ", normalized)

        return normalized.strip()

    @staticmethod
    def _text_similarity(left: str, right: str) -> float:
        """Score two already-normalized strings between 0.0 and 1.0."""
        # An empty string is a substring of everything; without this guard
        # an empty side would score 0.8 against any text.
        if not left or not right:
            return 0.0
        if left == right:
            return 1.0
        if left in right or right in left:
            return 0.8

        # Fall back to the share of words the two strings have in common.
        left_words = set(left.split())
        right_words = set(right.split())
        if not left_words or not right_words:
            return 0.0
        return len(left_words & right_words) / max(len(left_words), len(right_words))

    def calculate_similarity_score(
        self, title1: str, artist1: str, title2: str, artist2: str
    ) -> float:
        """Calculate similarity score between two songs.

        Args:
            title1: Title of the first song.
            artist1: Artist of the first song.
            title2: Title of the second song.
            artist2: Artist of the second song.

        Returns:
            ``0.7 * title_score + 0.3 * artist_score``. Each component is 1.0
            for equal normalized text, 0.8 when one contains the other,
            otherwise the word-overlap ratio; it is 0.0 when either side is
            missing or empty.
        """
        title_score = 0.0
        artist_score = 0.0

        if title1 and title2:
            title_score = self._text_similarity(
                self.normalize_title(title1), self.normalize_title(title2)
            )

        if artist1 and artist2:
            artist_score = self._text_similarity(
                self.normalize_artist(artist1), self.normalize_artist(artist2)
            )

        return title_score * self.TITLE_WEIGHT + artist_score * self.ARTIST_WEIGHT

    def find_cross_service_matches(
        self, primary_metadata: Any, all_services_data: dict[str, Any]
    ) -> list[CrossServiceMatch]:
        """Find matches of the same song across other services.

        Args:
            primary_metadata: Record for the reference song. ``title``,
                ``artist`` and ``service`` are read when present.
            all_services_data: Mapping of service name to that service's
                record for the candidate song.

        Returns:
            Matches scoring at least ``MIN_MATCH_CONFIDENCE``, best first.
            Empty when ``primary_metadata`` is falsy.
        """
        matches = []

        if not primary_metadata:
            return matches

        primary_title = getattr(primary_metadata, "title", "")
        primary_artist = getattr(primary_metadata, "artist", "")

        # The primary's own service does not change per candidate, so it is
        # resolved once. Enum-like values are unwrapped through ``.value``;
        # a plain string is used as is.
        service_attr = getattr(primary_metadata, "service", None)
        own_service = (
            getattr(service_attr, "value", service_attr) if service_attr else None
        )

        for service, data in all_services_data.items():
            if own_service is not None and service == own_service:
                continue  # Skip: same service

            service_title = getattr(data, "title", "")
            service_artist = getattr(data, "artist", "")

            similarity = self.calculate_similarity_score(
                primary_title, primary_artist, service_title, service_artist
            )
            if similarity < self.MIN_MATCH_CONFIDENCE:
                continue

            matches.append(
                CrossServiceMatch(
                    service=service,
                    service_id=getattr(data, "service_id", ""),
                    title=service_title,
                    artist=service_artist,
                    url=getattr(data, "original_url", ""),
                    confidence=similarity,
                    isrc=getattr(data, "isrc", None),
                    duration_ms=getattr(data, "duration_ms", None),
                    release_date=getattr(data, "release_date", None),
                    cover_art=getattr(data, "image_url", None),
                )
            )

        # Highest confidence first.
        matches.sort(key=lambda match: match.confidence, reverse=True)
        return matches

    def get_canonical_info(self, isrc: str) -> dict[str, Any] | None:
        """Get canonical information from ISRC.

        An ISRC has twelve characters laid out as ``CC XXX YY NNNNN``:
        two-character country/prefix code, three-character registrant code,
        two-digit year of reference and five-digit designation code. The
        hyphenated presentation form (``CC-XXX-YY-NNNNN``) is accepted too.

        Args:
            isrc: The code to parse.

        Returns:
            A dict with ``isrc`` (hyphens removed), ``country``,
            ``registrant``, ``year``, ``sequence`` (the designation code) and
            ``type`` ("recording" when the designation code is all digits,
            otherwise "other"); ``None`` when the code is empty or does not
            have twelve characters.
        """
        if not isrc:
            return None

        compact = isrc.replace("-", "").strip()
        if len(compact) != 12:
            return None

        designation = compact[7:]
        return {
            "isrc": compact,
            "country": compact[:2],
            "registrant": compact[2:5],
            "year": compact[5:7],
            "sequence": designation,
            "type": "recording" if designation.isdigit() else "other",
        }

    def generate_recommendations(
        self, metadata: Any, cross_matches: list[CrossServiceMatch]
    ) -> list[str]:
        """Generate recommendations based on metadata and cross matches.

        Args:
            metadata: Record whose ``genre`` and ``artist`` are read when
                present.
            cross_matches: Matches to mine for availability hints.

        Returns:
            Unique suggestion strings in a stable order: genre, then up to
            three high-confidence services, then artist.
        """
        recommendations = []

        genre = getattr(metadata, "genre", "")
        if genre:
            recommendations.append(f"Similar {genre} tracks")

        strong_matches = [
            match for match in cross_matches if match.confidence >= self.HIGH_CONFIDENCE
        ]
        recommendations.extend(
            f"Also available on {match.service}" for match in strong_matches[:3]
        )

        artist = getattr(metadata, "artist", "")
        if artist:
            recommendations.append(f"More from {artist}")

        # dict.fromkeys drops duplicates while keeping first-seen order; a
        # set would return the strings in an arbitrary order.
        return list(dict.fromkeys(recommendations))

    def create_enhanced_metadata(
        self, primary_metadata: Any, cross_matches: list[CrossServiceMatch]
    ) -> EnhancedMetadata:
        """Create enhanced metadata object.

        Args:
            primary_metadata: Reference record; its ``isrc`` is parsed when
                present.
            cross_matches: Matches found for the reference record.

        Returns:
            An ``EnhancedMetadata`` whose confidence is the best match
            confidence (0.0 without matches).
        """
        max_confidence = max((match.confidence for match in cross_matches), default=0.0)

        isrc = getattr(primary_metadata, "isrc", None)
        canonical_info = self.get_canonical_info(isrc) if isrc else None

        return EnhancedMetadata(
            primary_metadata=primary_metadata,
            cross_matches=cross_matches,
            canonical_info=canonical_info,
            confidence_score=max_confidence,
            recommendations=self.generate_recommendations(
                primary_metadata, cross_matches
            ),
        )
|
|
|
|
|
|
class FreeMetadataEnricher:
    """Free metadata enrichment without API keys.

    Every method is a local heuristic over the values passed in.
    """

    def __init__(self):
        """Attach a fresh MetadataAggregator as ``self.aggregator``."""
        self.aggregator = MetadataAggregator()

    def extract_lyrics_snippet(self, title: str, artist: str) -> str:
        """Extract potential lyrics snippet for search enhancement.

        Placeholder: always returns an empty string, whatever the arguments.
        """
        # Intentionally empty for now to stay clear of copyright issues;
        # a real implementation would scrape lyrics sites.
        return ""

    def detect_language(self, title: str, artist: str) -> str:
        """Detect likely language from title and artist.

        Returns:
            "english" when title and artist contain only ASCII characters,
            otherwise "non-english".
        """
        combined = title + artist
        return "english" if combined.isascii() else "non-english"

    def estimate_mood(self, title: str, artist: str) -> str:
        """Estimate mood from title and artist name.

        Moods are tried in table order; the first one with a keyword found
        as a substring of the lowercased title or artist wins.

        Returns:
            The winning mood label, or "neutral" when no keyword is found.
        """
        haystacks = (title.lower(), artist.lower())

        cue_table = {
            "happy": ["love", "joy", "sun", "summer", "dance", "party"],
            "sad": ["cry", "tears", "rain", "winter", "goodbye", "broken"],
            "energetic": ["rock", "power", "energy", "loud", "fast"],
            "calm": ["peace", "quiet", "soft", "gentle", "acoustic"],
            "dark": ["dark", "death", "black", "night", "shadow"],
        }

        for label, cues in cue_table.items():
            for cue in cues:
                if cue in haystacks[0] or cue in haystacks[1]:
                    return label

        return "neutral"

    def calculate_quality_score(self, metadata: Any) -> float:
        """Calculate metadata quality score.

        Adds a fixed weight for every attribute of ``metadata`` that is
        present and truthy.

        Returns:
            The accumulated weight, capped at 1.0.
        """
        # (attribute name, weight); ISRC is weighted highest.
        weighted_fields = (
            ("isrc", 0.3),
            ("release_date", 0.2),
            ("genre", 0.2),
            ("image_url", 0.1),
            ("duration_ms", 0.1),
            ("metadata", 0.1),
        )

        total = 0.0
        for attribute, weight in weighted_fields:
            if getattr(metadata, attribute, None):
                total += weight

        return min(total, 1.0)
|
|
|
|
|
|
# Global instances: shared module-level objects, constructed once when this
# module is imported.
metadata_aggregator = MetadataAggregator()
free_enricher = FreeMetadataEnricher()
|