first commit

2026-06-04 20:43:04 +00:00 · 2026-04-13 17:46:58 +02:00
commit 6e8fedf534
234 changed files with 53808 additions and 0 deletions
@@ -0,0 +1,324 @@
+"""
+Enhanced Metadata Aggregation System for Universal Music Downloader
+Provides cross-service matching and metadata enrichment without API keys
+"""
+
+import logging
+import re
+from dataclasses import dataclass
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class CrossServiceMatch:
+    """One candidate match for a track found on another service.
+
+    Instances are built by ``MetadataAggregator.find_cross_service_matches``;
+    the comments below describe how that method fills each field.
+    """
+
+    # Key of the entry in the ``all_services_data`` mapping.
+    service: str
+    # ``service_id`` attribute of the service's metadata ("" when absent).
+    service_id: str
+    # ``title`` attribute of the service's metadata, as provided.
+    title: str
+    # ``artist`` attribute of the service's metadata, as provided.
+    artist: str
+    # ``original_url`` attribute of the service's metadata ("" when absent).
+    url: str
+    # Similarity score against the primary track; matches below 0.6 are
+    # never created.
+    confidence: float
+    # Optional extras copied from the service's metadata when present.
+    isrc: str | None = None
+    duration_ms: int | None = None
+    release_date: str | None = None
+    # Taken from the ``image_url`` attribute of the service's metadata.
+    cover_art: str | None = None
+
+
+@dataclass
+class EnhancedMetadata:
+    """Primary track metadata bundled with its cross-service enrichment.
+
+    Produced by ``MetadataAggregator.create_enhanced_metadata``.
+    """
+
+    # Metadata object the enrichment was computed for; its type is not
+    # constrained here.
+    primary_metadata: Any
+    # Candidate matches for the same track on other services.
+    cross_matches: list[CrossServiceMatch]
+    # Fields parsed from the primary track's ISRC, or None when there is no
+    # usable ISRC.
+    canonical_info: dict[str, Any] | None = None
+    # Highest confidence among ``cross_matches``; 0.0 when there are none.
+    confidence_score: float = 0.0
+    # Human-readable suggestions. The default stays None for backward
+    # compatibility, so the annotation is optional rather than a bare list.
+    recommendations: list[str] | None = None
+
+
+class MetadataAggregator:
+    """Aggregates and enhances metadata from multiple sources.
+
+    All matching is done on plain attribute values read with ``getattr``;
+    no network access or API key is involved.
+    """
+
+    # Version / marketing tags stripped from titles before comparison.
+    # Word boundaries keep the tags from matching inside other words
+    # (e.g. "live" inside "deliver").
+    _TITLE_NOISE_RE = re.compile(
+        r"\b(?:official video|official audio|lyrics|live|acoustic|remastered)\b"
+    )
+
+    def __init__(self):
+        # Neither dictionary is read or written by the methods of this class.
+        self.canonical_cache = {}
+        self.artist_aliases = {}
+
+    def normalize_title(self, title: str) -> str:
+        """Return a lower-cased, tag-free form of *title* for matching.
+
+        Parenthesised segments, the tags in ``_TITLE_NOISE_RE`` and dashes
+        are each replaced by a space, then whitespace runs are collapsed so
+        that neighbouring words are never glued together.
+
+        Returns:
+            The normalized title; may be an empty string when the title
+            consists only of removable parts.
+        """
+        normalized = title.strip().lower()
+
+        # Drop "(feat. X)", "(Remastered 2011)" and similar segments.
+        normalized = re.sub(r"\([^)]*\)", " ", normalized)
+
+        # Drop stand-alone noise tags.
+        normalized = self._TITLE_NOISE_RE.sub(" ", normalized)
+
+        # Treat hyphen, en dash and em dash as word separators.
+        normalized = re.sub(r"[-–—]", " ", normalized)
+
+        return " ".join(normalized.split())
+
+    def normalize_artist(self, artist: str) -> str:
+        """Return a lower-cased form of *artist* with uniform separators.
+
+        "feat." and "vs" are kept, but the whitespace around them is
+        normalized to exactly one space on each side.
+        """
+        normalized = artist.strip().lower()
+
+        # Uniform spacing around "feat."; the word boundary avoids touching
+        # words that merely end in "feat." (e.g. "defeat.").
+        normalized = re.sub(r"\s*\bfeat\.\s*", " feat. ", normalized)
+
+        # Uniform spacing around "vs" collaborations.
+        normalized = re.sub(r"\s+vs\s+", " vs ", normalized)
+
+        return normalized.strip()
+
+    @staticmethod
+    def _text_similarity(first: str, second: str) -> float:
+        """Score two already-normalized strings between 0.0 and 1.0."""
+        # An empty string is a substring of everything, so it must not
+        # reach the containment test below.
+        if not first or not second:
+            return 0.0
+        if first == second:
+            return 1.0
+        if first in second or second in first:
+            return 0.8
+
+        # Partial match: shared words relative to the longer word set.
+        words_first = set(first.split())
+        words_second = set(second.split())
+        if not words_first or not words_second:
+            return 0.0
+        shared = words_first & words_second
+        return len(shared) / max(len(words_first), len(words_second))
+
+    def calculate_similarity_score(
+        self, title1: str, artist1: str, title2: str, artist2: str
+    ) -> float:
+        """Calculate a similarity score between two songs.
+
+        Each of title and artist scores 1.0 for an exact normalized match,
+        0.8 when one normalized string contains the other, and otherwise the
+        fraction of shared words. A missing title or artist on either side
+        scores 0.0 for that component.
+
+        Returns:
+            ``0.7 * title_score + 0.3 * artist_score``.
+        """
+        title_score = 0.0
+        artist_score = 0.0
+
+        if title1 and title2:
+            # A title made only of removable parts (e.g. "Live", "(Intro)")
+            # normalizes to ""; fall back to the raw lower-cased text so it
+            # can still match itself without matching everything else.
+            title_score = self._text_similarity(
+                self.normalize_title(title1) or title1.strip().lower(),
+                self.normalize_title(title2) or title2.strip().lower(),
+            )
+
+        if artist1 and artist2:
+            artist_score = self._text_similarity(
+                self.normalize_artist(artist1),
+                self.normalize_artist(artist2),
+            )
+
+        # Combined score (title is more important)
+        return title_score * 0.7 + artist_score * 0.3
+
+    def find_cross_service_matches(
+        self, primary_metadata: Any, all_services_data: dict[str, Any]
+    ) -> list[CrossServiceMatch]:
+        """Find matches of the same song across other services.
+
+        Args:
+            primary_metadata: Object whose ``title``, ``artist`` and
+                ``service`` attributes are read with ``getattr``.
+            all_services_data: Mapping of service name to a metadata object
+                for that service (``title``, ``artist``, ``original_url``,
+                ``service_id``, ``isrc``, ``duration_ms``, ``release_date``
+                and ``image_url`` are read when present).
+
+        Returns:
+            Matches with a similarity of at least 0.6, best first. Empty
+            when either argument is falsy.
+        """
+        matches = []
+
+        if not primary_metadata or not all_services_data:
+            return matches
+
+        primary_title = getattr(primary_metadata, "title", "")
+        primary_artist = getattr(primary_metadata, "artist", "")
+
+        # The primary service may be an enum-like object (``.value``) or a
+        # plain string; resolve it once, outside the loop.
+        primary_service = getattr(primary_metadata, "service", None)
+        primary_service_name = (
+            getattr(primary_service, "value", primary_service)
+            if primary_service
+            else None
+        )
+
+        for service, data in all_services_data.items():
+            if primary_service_name and service == primary_service_name:
+                continue  # Skip: same service
+
+            service_title = getattr(data, "title", "")
+            service_artist = getattr(data, "artist", "")
+            service_url = getattr(data, "original_url", "")
+
+            similarity = self.calculate_similarity_score(
+                primary_title, primary_artist, service_title, service_artist
+            )
+
+            # Only include matches with reasonable similarity
+            if similarity >= 0.6:  # 60% similarity threshold
+                matches.append(
+                    CrossServiceMatch(
+                        service=service,
+                        service_id=getattr(data, "service_id", ""),
+                        title=service_title,
+                        artist=service_artist,
+                        url=service_url,
+                        confidence=similarity,
+                        isrc=getattr(data, "isrc", None),
+                        duration_ms=getattr(data, "duration_ms", None),
+                        release_date=getattr(data, "release_date", None),
+                        cover_art=getattr(data, "image_url", None),
+                    )
+                )
+
+        # Sort by confidence score
+        matches.sort(key=lambda match: match.confidence, reverse=True)
+        return matches
+
+    def get_canonical_info(self, isrc: str) -> dict[str, Any] | None:
+        """Split an ISRC into its components.
+
+        An ISRC has 12 characters: a 2-character country code, a
+        3-character registrant code, a 2-digit year of reference and a
+        5-digit designation code. Hyphens and whitespace in the usual
+        presentation form (``CC-XXX-YY-NNNNN``) are ignored.
+
+        Returns:
+            A dict with the keys ``isrc``, ``country``, ``registrant``,
+            ``year``, ``sequence`` and ``type``, or None when *isrc* is
+            empty or does not contain exactly 12 characters.
+        """
+        if not isrc:
+            return None
+
+        compact = re.sub(r"[\s-]", "", isrc)
+        if len(compact) != 12:
+            return None
+
+        sequence = compact[7:]
+
+        return {
+            "isrc": compact,
+            "country": compact[:2],
+            "registrant": compact[2:5],
+            "year": compact[5:7],
+            "sequence": sequence,
+            "type": "recording" if sequence.isdigit() else "other",
+        }
+
+    def generate_recommendations(
+        self, metadata: Any, cross_matches: list[CrossServiceMatch]
+    ) -> list[str]:
+        """Generate recommendations based on metadata and cross matches.
+
+        Returns:
+            Unique recommendation strings in a stable order: genre first,
+            then up to three services whose match confidence is at least
+            0.8 (in the order given), then the artist.
+        """
+        recommendations = []
+
+        genre = getattr(metadata, "genre", "")
+        if genre:
+            recommendations.append(f"Similar {genre} tracks")
+
+        high_confidence_matches = [
+            match for match in (cross_matches or []) if match.confidence >= 0.8
+        ]
+        recommendations.extend(
+            f"Also available on {match.service}"
+            for match in high_confidence_matches[:3]
+        )
+
+        artist = getattr(metadata, "artist", "")
+        if artist:
+            recommendations.append(f"More from {artist}")
+
+        # dict.fromkeys removes duplicates while keeping first-seen order;
+        # a set would make the order vary between runs.
+        return list(dict.fromkeys(recommendations))
+
+    def create_enhanced_metadata(
+        self, primary_metadata: Any, cross_matches: list[CrossServiceMatch]
+    ) -> EnhancedMetadata:
+        """Bundle *primary_metadata* with its matches and derived data.
+
+        The confidence score is the highest match confidence (0.0 without
+        matches); canonical info is parsed from the primary ``isrc``
+        attribute when it is set.
+        """
+        max_confidence = max(
+            (match.confidence for match in cross_matches or ()), default=0.0
+        )
+
+        isrc = getattr(primary_metadata, "isrc", None)
+        canonical_info = self.get_canonical_info(isrc) if isrc else None
+
+        recommendations = self.generate_recommendations(primary_metadata, cross_matches)
+
+        return EnhancedMetadata(
+            primary_metadata=primary_metadata,
+            cross_matches=cross_matches,
+            canonical_info=canonical_info,
+            confidence_score=max_confidence,
+            recommendations=recommendations,
+        )
+
+
+class FreeMetadataEnricher:
+    """Free metadata enrichment without API keys.
+
+    Every method is a local heuristic on the values it is given.
+    """
+
+    # Moods are checked in this order; the first mood with a keyword hit
+    # wins.
+    _MOOD_KEYWORDS = {
+        "happy": ["love", "joy", "sun", "summer", "dance", "party"],
+        "sad": ["cry", "tears", "rain", "winter", "goodbye", "broken"],
+        "energetic": ["rock", "power", "energy", "loud", "fast"],
+        "calm": ["peace", "quiet", "soft", "gentle", "acoustic"],
+        "dark": ["dark", "death", "black", "night", "shadow"],
+    }
+
+    # (attribute name, points out of 100). Integer points keep the score
+    # free of float accumulation error, so a complete record is exactly 1.0.
+    _QUALITY_POINTS = (
+        ("isrc", 30),
+        ("release_date", 20),
+        ("genre", 20),
+        ("image_url", 10),
+        ("duration_ms", 10),
+        ("metadata", 10),
+    )
+
+    def __init__(self):
+        self.aggregator = MetadataAggregator()
+
+    def extract_lyrics_snippet(self, title: str, artist: str) -> str:
+        """Return a lyrics snippet for search enhancement.
+
+        Placeholder: always returns an empty string. Scraping lyrics sites
+        is deliberately not implemented to avoid copyright issues.
+        """
+        return ""
+
+    def detect_language(self, title: str, artist: str) -> str:
+        """Guess the language from the characters in title and artist.
+
+        Returns:
+            ``"non-english"`` when any character is outside ASCII,
+            otherwise ``"english"``. None is treated as an empty string.
+        """
+        combined = (title or "") + (artist or "")
+        return "english" if combined.isascii() else "non-english"
+
+    def estimate_mood(self, title: str, artist: str) -> str:
+        """Estimate a mood from keywords in the title and artist name.
+
+        Keywords are matched as case-insensitive substrings, so a keyword
+        also matches inside a longer word.
+
+        Returns:
+            The first mood in ``_MOOD_KEYWORDS`` with a keyword hit, or
+            ``"neutral"`` when nothing matches. None is treated as an
+            empty string.
+        """
+        title_lower = (title or "").lower()
+        artist_lower = (artist or "").lower()
+
+        for mood, keywords in self._MOOD_KEYWORDS.items():
+            if any(
+                keyword in title_lower or keyword in artist_lower
+                for keyword in keywords
+            ):
+                return mood
+
+        return "neutral"
+
+    def calculate_quality_score(self, metadata: Any) -> float:
+        """Score how complete *metadata* is, from 0.0 to 1.0.
+
+        Each truthy attribute listed in ``_QUALITY_POINTS`` contributes its
+        points; attributes that are missing or falsy contribute nothing.
+        """
+        points = sum(
+            weight
+            for attribute, weight in self._QUALITY_POINTS
+            if getattr(metadata, attribute, None)
+        )
+        return min(points / 100, 1.0)
+
+
+# Global instances, created once when this module is imported.
+# Note: free_enricher builds its own MetadataAggregator in __init__, so it
+# does not share the metadata_aggregator object below.
+metadata_aggregator = MetadataAggregator()
+free_enricher = FreeMetadataEnricher()