mirror of
https://github.com/Dvorinka/SpotifyRecAlg.git
synced 2026-06-04 20:43:04 +00:00
first commit
This commit is contained in:
@@ -0,0 +1,324 @@
|
||||
"""
|
||||
Enhanced Metadata Aggregation System for Universal Music Downloader.

Provides cross-service matching and metadata enrichment without API keys.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class CrossServiceMatch:
    """One candidate for the same song as found on another music service."""

    # --- identification of the matched track (all required) ---
    service: str
    service_id: str
    title: str
    artist: str
    url: str

    # Score attached to this candidate by whoever builds the match
    confidence: float

    # --- optional details; each defaults to None when not supplied ---
    isrc: str | None = None
    duration_ms: int | None = None
    release_date: str | None = None
    cover_art: str | None = None
|
||||
|
||||
|
||||
@dataclass
class EnhancedMetadata:
    """Primary track metadata bundled with its cross-service enrichment."""

    # Metadata object of the originating track; its type is not constrained here
    primary_metadata: Any
    # Candidate matches for the same track on other services
    cross_matches: list[CrossServiceMatch]
    # Parsed identifier details, or None when none were derived
    canonical_info: dict[str, Any] | None = None
    confidence_score: float = 0.0
    # The default is None, so the annotation has to admit None as well
    recommendations: list[str] | None = None
|
||||
|
||||
|
||||
class MetadataAggregator:
    """Aggregates and enhances metadata from multiple sources."""

    # Version/marketing tags dropped from titles before comparison. They are
    # matched as whole words so that e.g. "Alive" or "Deliver" stay intact.
    _TITLE_NOISE_RE = re.compile(
        r"\b(?:official video|official audio|lyrics|live|acoustic|remastered)\b"
    )

    # Minimum combined similarity for a candidate to count as a match.
    MATCH_THRESHOLD = 0.6

    def __init__(self):
        self.canonical_cache = {}
        self.artist_aliases = {}

    def normalize_title(self, title: str) -> str:
        """Normalize a song title for matching.

        Lower-cases the title, drops parenthesised segments and known noise
        tags, turns dashes into spaces and collapses whitespace.

        Args:
            title: Raw title text.

        Returns:
            The normalized title. If stripping would leave nothing (the title
            consists only of noise, e.g. "Lyrics"), the lower-cased,
            whitespace-collapsed original is returned instead.
        """
        lowered = " ".join(title.lower().split())

        # Replace with a space (not "") so neighbouring words are not glued
        # together when a segment in the middle of the title is removed.
        cleaned = re.sub(r"\([^)]*\)", " ", lowered)
        cleaned = self._TITLE_NOISE_RE.sub(" ", cleaned)
        cleaned = re.sub(r"[-–—]", " ", cleaned)
        cleaned = " ".join(cleaned.split())

        return cleaned or lowered

    def normalize_artist(self, artist: str) -> str:
        """Normalize an artist name for matching.

        Lower-cases the name and canonicalizes the spacing around "feat."
        and "vs" separators; the separators themselves are kept.
        """
        normalized = artist.strip().lower()

        # Uniform spacing around "feat."
        normalized = re.sub(r"\s*feat\.\s*", " feat. ", normalized)

        # Uniform spacing around "vs" collaborations
        normalized = re.sub(r"\s+vs\s+", " vs ", normalized)

        return normalized.strip()

    @staticmethod
    def _text_similarity(first: str, second: str) -> float:
        """Score two already-normalized strings in the range 0.0-1.0."""
        # An empty side carries no evidence; without this guard "" would
        # count as a substring of anything and score 0.8.
        if not first or not second:
            return 0.0
        if first == second:
            return 1.0
        if first in second or second in first:
            return 0.8

        # Fall back to the share of words the two strings have in common.
        words_a = set(first.split())
        words_b = set(second.split())
        if not words_a or not words_b:
            return 0.0
        return len(words_a & words_b) / max(len(words_a), len(words_b))

    def calculate_similarity_score(
        self, title1: str, artist1: str, title2: str, artist2: str
    ) -> float:
        """Calculate a similarity score between two songs.

        Args:
            title1: Title of the first song.
            artist1: Artist of the first song.
            title2: Title of the second song.
            artist2: Artist of the second song.

        Returns:
            ``0.7 * title similarity + 0.3 * artist similarity``. A component
            scores 0.0 when either of its inputs is empty or ``None``.
        """
        title_score = 0.0
        artist_score = 0.0

        if title1 and title2:
            title_score = self._text_similarity(
                self.normalize_title(title1), self.normalize_title(title2)
            )

        if artist1 and artist2:
            artist_score = self._text_similarity(
                self.normalize_artist(artist1), self.normalize_artist(artist2)
            )

        # Combined score (title is more important)
        return title_score * 0.7 + artist_score * 0.3

    def find_cross_service_matches(
        self, primary_metadata: Any, all_services_data: dict[str, Any]
    ) -> list[CrossServiceMatch]:
        """Find matches of the same song across other services.

        Args:
            primary_metadata: Object exposing ``title``, ``artist`` and
                optionally ``service`` attributes for the reference track.
            all_services_data: Mapping of service name to that service's
                metadata object for the candidate track.

        Returns:
            Matches whose similarity reaches ``MATCH_THRESHOLD``, sorted by
            descending confidence. Empty when ``primary_metadata`` is falsy.
        """
        matches = []

        if not primary_metadata:
            return matches

        primary_title = getattr(primary_metadata, "title", "")
        primary_artist = getattr(primary_metadata, "artist", "")

        # Loop-invariant: resolve the primary service once. Accept both an
        # enum-like object (with ``.value``) and a plain string.
        service_attr = getattr(primary_metadata, "service", None)
        primary_service = getattr(service_attr, "value", service_attr)

        for service, data in all_services_data.items():
            if service_attr and service == primary_service:
                continue  # Skip: same service

            service_title = getattr(data, "title", "")
            service_artist = getattr(data, "artist", "")

            similarity = self.calculate_similarity_score(
                primary_title, primary_artist, service_title, service_artist
            )

            # Only include matches with reasonable similarity
            if similarity < self.MATCH_THRESHOLD:
                continue

            matches.append(
                CrossServiceMatch(
                    service=service,
                    service_id=getattr(data, "service_id", ""),
                    title=service_title,
                    artist=service_artist,
                    url=getattr(data, "original_url", ""),
                    confidence=similarity,
                    isrc=getattr(data, "isrc", None),
                    duration_ms=getattr(data, "duration_ms", None),
                    release_date=getattr(data, "release_date", None),
                    cover_art=getattr(data, "image_url", None),
                )
            )

        # Best candidates first
        matches.sort(key=lambda m: m.confidence, reverse=True)
        return matches

    def get_canonical_info(self, isrc: str) -> dict[str, Any] | None:
        """Split an ISRC into its components.

        An ISRC is 12 characters: 2-character country/prefix code,
        3-character registrant code, 2-digit year of reference and 5-digit
        designation code. Hyphenated presentation ("CC-XXX-YY-NNNNN") is
        accepted.

        Args:
            isrc: The ISRC string, with or without hyphens.

        Returns:
            A dict with ``isrc`` (hyphen-free, upper-cased), ``country``,
            ``registrant``, ``year`` (two-digit string), ``sequence`` and
            ``type``; or ``None`` when the input is empty or not 12
            characters long after removing hyphens.
        """
        if not isrc:
            return None

        code = isrc.replace("-", "").strip().upper()
        if len(code) != 12:
            return None

        country = code[:2]
        registrant = code[2:5]
        year = code[5:7]
        sequence = code[7:]

        return {
            "isrc": code,
            "country": country,
            "registrant": registrant,
            "year": year,
            "sequence": sequence,
            "type": "recording" if sequence.isdigit() else "other",
        }

    def generate_recommendations(
        self, metadata: Any, cross_matches: list[CrossServiceMatch]
    ) -> list[str]:
        """Generate recommendation strings from metadata and cross matches.

        Args:
            metadata: Object optionally exposing ``genre`` and ``artist``.
            cross_matches: Candidate matches; those with confidence >= 0.8
                contribute an "Also available on ..." entry (first three).

        Returns:
            De-duplicated recommendations in a stable order: genre first,
            then services, then artist.
        """
        recommendations = []

        genre = getattr(metadata, "genre", "")
        if genre:
            recommendations.append(f"Similar {genre} tracks")

        high_confidence = [m for m in cross_matches if m.confidence >= 0.8]
        recommendations.extend(
            f"Also available on {match.service}" for match in high_confidence[:3]
        )

        artist = getattr(metadata, "artist", "")
        if artist:
            recommendations.append(f"More from {artist}")

        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would make the result order vary between runs.
        return list(dict.fromkeys(recommendations))

    def create_enhanced_metadata(
        self, primary_metadata: Any, cross_matches: list[CrossServiceMatch]
    ) -> EnhancedMetadata:
        """Bundle primary metadata with matches, ISRC info and recommendations.

        Args:
            primary_metadata: Metadata object of the reference track.
            cross_matches: Matches found on other services.

        Returns:
            An ``EnhancedMetadata`` whose ``confidence_score`` is the highest
            match confidence (0.0 when there are no matches).
        """
        max_confidence = max((m.confidence for m in cross_matches), default=0.0)

        # Canonical info is only derivable when the track carries an ISRC
        isrc = getattr(primary_metadata, "isrc", None)
        canonical_info = self.get_canonical_info(isrc) if isrc else None

        recommendations = self.generate_recommendations(primary_metadata, cross_matches)

        return EnhancedMetadata(
            primary_metadata=primary_metadata,
            cross_matches=cross_matches,
            canonical_info=canonical_info,
            confidence_score=max_confidence,
            recommendations=recommendations,
        )
|
||||
|
||||
|
||||
class FreeMetadataEnricher:
    """Free metadata enrichment without API keys."""

    # Mood -> trigger keywords. Moods are tried in this order and the first
    # one with a hit wins.
    _MOOD_KEYWORDS = {
        "happy": ["love", "joy", "sun", "summer", "dance", "party"],
        "sad": ["cry", "tears", "rain", "winter", "goodbye", "broken"],
        "energetic": ["rock", "power", "energy", "loud", "fast"],
        "calm": ["peace", "quiet", "soft", "gentle", "acoustic"],
        "dark": ["dark", "death", "black", "night", "shadow"],
    }

    # (attribute, weight in tenths). Integer weights keep the sum exact;
    # adding 0.1-style floats yields 0.9999999999999999 for a full record.
    _QUALITY_WEIGHTS = (
        ("isrc", 3),
        ("release_date", 2),
        ("genre", 2),
        ("image_url", 1),
        ("duration_ms", 1),
        ("metadata", 1),
    )

    def __init__(self):
        self.aggregator = MetadataAggregator()

    def extract_lyrics_snippet(self, title: str, artist: str) -> str:
        """Return a lyrics snippet for search enhancement.

        Currently a placeholder: always returns an empty string (the
        arguments are unused) to avoid copyright issues.
        """
        return ""

    def detect_language(self, title: str, artist: str) -> str:
        """Guess the language from the characters in title and artist.

        Returns:
            ``"non-english"`` if any character is outside ASCII, otherwise
            ``"english"``. This is a coarse heuristic: accented names in
            otherwise English metadata are reported as non-English.
        """
        if (title + artist).isascii():
            return "english"
        return "non-english"

    def estimate_mood(self, title: str, artist: str) -> str:
        """Estimate a mood label from keywords in title and artist name.

        Returns:
            The first mood in ``_MOOD_KEYWORDS`` with a keyword occurring in
            the lower-cased title or artist, or ``"neutral"`` if none does.
        """
        title_lower = title.lower()
        artist_lower = artist.lower()

        # NOTE(review): keywords are matched as substrings, so "train" hits
        # "rain" and "sunday" hits "sun" — confirm whether that is intended.
        for mood, keywords in self._MOOD_KEYWORDS.items():
            for keyword in keywords:
                if keyword in title_lower or keyword in artist_lower:
                    return mood

        return "neutral"

    def calculate_quality_score(self, metadata: Any) -> float:
        """Calculate a metadata completeness score.

        Each truthy attribute listed in ``_QUALITY_WEIGHTS`` adds its weight
        (ISRC 0.3, release date 0.2, genre 0.2, cover art 0.1, duration 0.1,
        extended metadata 0.1).

        Args:
            metadata: Any object; missing attributes count as absent.

        Returns:
            A score between 0.0 and 1.0.
        """
        tenths = sum(
            weight
            for attr, weight in self._QUALITY_WEIGHTS
            if getattr(metadata, attr, None)
        )
        return min(tenths / 10, 1.0)
|
||||
|
||||
|
||||
# Module-level shared instances, created once at import time
metadata_aggregator = MetadataAggregator()
free_enricher = FreeMetadataEnricher()
|
||||
Reference in New Issue
Block a user