# SpotifyRecAlg/swingmusic/services/metadata_aggregator.py

"""
Enhanced Metadata Aggregation System for Universal Music Downloader
Provides cross-service matching and metadata enrichment without API keys
"""

import logging
import re
from dataclasses import dataclass, field
from typing import Any

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


@dataclass
class CrossServiceMatch:
    """One candidate for the same song found on another music service.

    Instances are built by ``MetadataAggregator.find_cross_service_matches``
    from the per-service metadata objects it is given.
    """

    # Key under which the service appeared in the services mapping.
    service: str
    # Identifier of the track on that service ("" when the source object has none).
    service_id: str
    # Title and artist exactly as reported by that service (not normalised).
    title: str
    artist: str
    # Taken from the source object's ``original_url`` attribute.
    url: str
    # Combined title/artist similarity to the primary track.
    confidence: float
    # Optional details copied over when the source object provides them.
    isrc: str | None = None
    duration_ms: int | None = None
    release_date: str | None = None
    # Taken from the source object's ``image_url`` attribute.
    cover_art: str | None = None


@dataclass
class EnhancedMetadata:
    """Primary track metadata bundled with its cross-service enrichment."""

    # Metadata object of the track that was looked up (shape is caller-defined).
    primary_metadata: Any
    # Candidates for the same track on other services.
    cross_matches: list[CrossServiceMatch]
    # Parsed ISRC components, or None when no usable ISRC was available.
    canonical_info: dict[str, Any] | None = None
    # MetadataAggregator.create_enhanced_metadata stores the best match
    # confidence here (0.0 when there are no matches).
    confidence_score: float = 0.0
    # Human-readable suggestion strings. default_factory gives every instance
    # its own empty list, so the field always honours its ``list[str]``
    # annotation instead of defaulting to None.
    recommendations: list[str] = field(default_factory=list)


class MetadataAggregator:
    """Aggregate and enrich track metadata gathered from multiple services.

    Offers title/artist normalisation, a combined similarity score,
    cross-service match discovery, ISRC parsing and simple recommendation
    strings. Metadata objects are only ever read through ``getattr`` with
    defaults, so any object exposing the relevant attributes (``title``,
    ``artist``, ``isrc``, ...) can be passed in.
    """

    # Minimum combined similarity for another service's track to count as a match.
    MATCH_THRESHOLD = 0.6
    # Minimum confidence for a match to yield an "Also available on ..." hint.
    HIGH_CONFIDENCE_THRESHOLD = 0.8
    # Relative weight of title and artist in the combined score (title dominates).
    TITLE_WEIGHT = 0.7
    ARTIST_WEIGHT = 0.3

    # Marketing/version decorations that carry no identity information.
    _NOISE_TERMS = (
        "official video",
        "official audio",
        "lyrics",
        "live",
        "acoustic",
        "remastered",
    )
    # Whole-word match only, so "Alive" or "Deliver" are left intact.
    _NOISE_RE = re.compile(r"\b(?:" + "|".join(map(re.escape, _NOISE_TERMS)) + r")\b")
    _PARENS_RE = re.compile(r"\([^)]*\)")
    _DASH_RE = re.compile(r"[-–—]")
    _FEAT_RE = re.compile(r"\s*\bfeat\.\s*")
    _VS_RE = re.compile(r"\s+vs\s+")
    _ISRC_SEPARATOR_RE = re.compile(r"[\s-]")

    def __init__(self):
        """Create an aggregator with empty lookup tables."""
        self.canonical_cache: dict[str, Any] = {}
        self.artist_aliases: dict[str, Any] = {}

    def normalize_title(self, title: str) -> str:
        """Return a lower-cased, decoration-free form of *title* for matching.

        Whole-word noise terms ("official video", "live", ...), parenthesised
        segments and dashes are each replaced by a space, then whitespace is
        collapsed. Replacing with a space (rather than nothing) keeps
        neighbouring words from being glued together.

        If stripping removes everything (e.g. a song literally called
        "Live"), the lower-cased original is returned instead, so the title
        still carries information for comparison.
        """
        lowered = title.strip().lower()
        cleaned = self._NOISE_RE.sub(" ", lowered)
        cleaned = self._PARENS_RE.sub(" ", cleaned)
        cleaned = self._DASH_RE.sub(" ", cleaned)
        cleaned = " ".join(cleaned.split())
        return cleaned or " ".join(lowered.split())

    def normalize_artist(self, artist: str) -> str:
        """Return a lower-cased *artist* with uniform collaboration markers.

        "feat." and "vs" are kept, but the whitespace around them is
        standardised to exactly one space on each side.
        """
        normalized = artist.strip().lower()
        # \b stops words that merely end in "feat." (e.g. "defeat.") from
        # being split apart.
        normalized = self._FEAT_RE.sub(" feat. ", normalized)
        normalized = self._VS_RE.sub(" vs ", normalized)
        return normalized.strip()

    @staticmethod
    def _text_similarity(first: str, second: str) -> float:
        """Score two already-normalised strings between 0.0 and 1.0.

        1.0 for equality, 0.8 when one contains the other, otherwise the
        share of common words relative to the longer word set. An empty
        string scores 0.0: it is a substring of everything and would
        otherwise be reported as a containment match.
        """
        if not first or not second:
            return 0.0
        if first == second:
            return 1.0
        if first in second or second in first:
            return 0.8
        first_words = set(first.split())
        second_words = set(second.split())
        if not first_words or not second_words:
            return 0.0
        shared = first_words & second_words
        return len(shared) / max(len(first_words), len(second_words))

    def calculate_similarity_score(
        self, title1: str, artist1: str, title2: str, artist2: str
    ) -> float:
        """Return the weighted title/artist similarity of two songs.

        Each component is scored only when both sides are non-empty; a
        missing title or artist contributes 0.0. The result is
        ``title * TITLE_WEIGHT + artist * ARTIST_WEIGHT``.
        """
        title_score = 0.0
        artist_score = 0.0

        if title1 and title2:
            title_score = self._text_similarity(
                self.normalize_title(title1), self.normalize_title(title2)
            )

        if artist1 and artist2:
            artist_score = self._text_similarity(
                self.normalize_artist(artist1), self.normalize_artist(artist2)
            )

        return title_score * self.TITLE_WEIGHT + artist_score * self.ARTIST_WEIGHT

    def find_cross_service_matches(
        self, primary_metadata: Any, all_services_data: dict[str, Any]
    ) -> list[CrossServiceMatch]:
        """Find the same song on other services.

        Args:
            primary_metadata: Metadata object of the reference track.
            all_services_data: Mapping of service name to that service's
                metadata object for the candidate track.

        Returns:
            Matches scoring at least ``MATCH_THRESHOLD``, best first. The
            entry for the primary track's own service is skipped. Empty when
            either argument is missing or empty.
        """
        if not primary_metadata or not all_services_data:
            return []

        primary_title = getattr(primary_metadata, "title", "")
        primary_artist = getattr(primary_metadata, "artist", "")

        # ``service`` may be an Enum member or a plain string; compare by
        # value in the first case and directly in the second.
        primary_service = getattr(primary_metadata, "service", None)
        primary_service_name = getattr(primary_service, "value", primary_service)

        matches = []
        for service, data in all_services_data.items():
            if primary_service_name and service == primary_service_name:
                continue  # Skip: same service

            candidate_title = getattr(data, "title", "")
            candidate_artist = getattr(data, "artist", "")

            similarity = self.calculate_similarity_score(
                primary_title, primary_artist, candidate_title, candidate_artist
            )
            if similarity < self.MATCH_THRESHOLD:
                continue

            matches.append(
                CrossServiceMatch(
                    service=service,
                    service_id=getattr(data, "service_id", ""),
                    title=candidate_title,
                    artist=candidate_artist,
                    url=getattr(data, "original_url", ""),
                    confidence=similarity,
                    isrc=getattr(data, "isrc", None),
                    duration_ms=getattr(data, "duration_ms", None),
                    release_date=getattr(data, "release_date", None),
                    cover_art=getattr(data, "image_url", None),
                )
            )

        matches.sort(key=lambda match: match.confidence, reverse=True)
        return matches

    def get_canonical_info(self, isrc: str) -> dict[str, Any] | None:
        """Split an ISRC into its components.

        An ISRC is 12 characters laid out as ``CC XXX YY NNNNN``: 2-character
        country/prefix code, 3-character registrant code, 2-digit year of
        reference and 5-digit designation code. Hyphens and whitespace
        (``US-RC1-76-07839``) are ignored and letters are upper-cased.

        Returns:
            A dict with keys ``isrc`` (compact form), ``country``,
            ``registrant``, ``year``, ``sequence`` and ``type``
            ("recording" when the designation code is numeric, otherwise
            "other"), or None when the code is missing or not 12 characters
            long after clean-up.
        """
        if not isrc:
            return None

        compact = self._ISRC_SEPARATOR_RE.sub("", isrc).upper()
        if len(compact) != 12:
            return None

        sequence = compact[7:]
        return {
            "isrc": compact,
            "country": compact[:2],
            "registrant": compact[2:5],
            "year": compact[5:7],
            "sequence": sequence,
            "type": "recording" if sequence.isdigit() else "other",
        }

    def generate_recommendations(
        self, metadata: Any, cross_matches: list[CrossServiceMatch]
    ) -> list[str]:
        """Build human-readable suggestion strings for a track.

        Produces, in this order: a genre hint, one availability hint for each
        of the first three matches at or above ``HIGH_CONFIDENCE_THRESHOLD``,
        and an artist hint. Duplicates are dropped while the order is kept,
        so the result is deterministic.
        """
        recommendations = []

        genre = getattr(metadata, "genre", "")
        if genre:
            recommendations.append(f"Similar {genre} tracks")

        confident = [
            match
            for match in cross_matches
            if match.confidence >= self.HIGH_CONFIDENCE_THRESHOLD
        ]
        recommendations.extend(
            f"Also available on {match.service}" for match in confident[:3]
        )

        artist = getattr(metadata, "artist", "")
        if artist:
            recommendations.append(f"More from {artist}")

        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(recommendations))

    def create_enhanced_metadata(
        self, primary_metadata: Any, cross_matches: list[CrossServiceMatch]
    ) -> EnhancedMetadata:
        """Bundle *primary_metadata* with its matches and derived details.

        The confidence score is the best match confidence (0.0 without
        matches); canonical info comes from the primary track's ``isrc``
        attribute when it has a usable one.
        """
        best_confidence = max(
            (match.confidence for match in cross_matches), default=0.0
        )
        # get_canonical_info returns None for a missing or malformed ISRC.
        canonical_info = self.get_canonical_info(
            getattr(primary_metadata, "isrc", None)
        )

        return EnhancedMetadata(
            primary_metadata=primary_metadata,
            cross_matches=cross_matches,
            canonical_info=canonical_info,
            confidence_score=best_confidence,
            recommendations=self.generate_recommendations(
                primary_metadata, cross_matches
            ),
        )


class FreeMetadataEnricher:
    """Derive extra track details locally, using heuristics that need no API keys."""

    # Moods are checked in this order; the first mood with a keyword hit wins.
    _MOOD_KEYWORDS = {
        "happy": ["love", "joy", "sun", "summer", "dance", "party"],
        "sad": ["cry", "tears", "rain", "winter", "goodbye", "broken"],
        "energetic": ["rock", "power", "energy", "loud", "fast"],
        "calm": ["peace", "quiet", "soft", "gentle", "acoustic"],
        "dark": ["dark", "death", "black", "night", "shadow"],
    }

    # (attribute, points out of 100). Integer points keep the total exact;
    # adding 0.3 + 0.2 + 0.2 + 0.1 as floats yields 0.7999999999999999, which
    # silently fails checks such as ``score >= 0.8``.
    _QUALITY_POINTS = (
        ("isrc", 30),  # strongest indicator of well-curated metadata
        ("release_date", 20),
        ("genre", 20),
        ("image_url", 10),
        ("duration_ms", 10),
        ("metadata", 10),  # extended metadata payload
    )

    def __init__(self):
        """Create the enricher together with its own MetadataAggregator."""
        self.aggregator = MetadataAggregator()

    def extract_lyrics_snippet(self, title: str, artist: str) -> str:
        """Return a lyrics snippet for search enhancement.

        Currently a placeholder that always returns "" (no lyrics are
        fetched, to avoid copyright issues); both arguments are unused.
        """
        return ""

    def detect_language(self, title: str, artist: str) -> str:
        """Guess the language from the characters used in title and artist.

        Returns "english" when every character is ASCII and "non-english"
        otherwise. This is a coarse heuristic: accented Latin names such as
        "Beyoncé" are classified as "non-english".
        """
        return "english" if (title + artist).isascii() else "non-english"

    def estimate_mood(self, title: str, artist: str) -> str:
        """Guess a mood label from keywords in the title or artist name.

        Returns the first mood in ``_MOOD_KEYWORDS`` order that has a keyword
        occurring in either lower-cased string, or "neutral" when nothing
        matches.

        NOTE: matching is by substring, so "rain" also matches "train".
        """
        title_lower = title.lower()
        artist_lower = artist.lower()

        for mood, keywords in self._MOOD_KEYWORDS.items():
            if any(word in title_lower or word in artist_lower for word in keywords):
                return mood

        return "neutral"

    def calculate_quality_score(self, metadata: Any) -> float:
        """Rate how complete *metadata* is, from 0.0 (empty) to 1.0 (complete).

        Every attribute listed in ``_QUALITY_POINTS`` that is present and
        truthy adds its points; the total is capped at 100 and scaled to
        the 0.0-1.0 range.
        """
        points = sum(
            weight
            for attribute, weight in self._QUALITY_POINTS
            if getattr(metadata, attribute, None)
        )
        return min(points, 100) / 100


# Shared module-level instances, created once when this module is imported.
# Note: free_enricher builds its own separate MetadataAggregator in __init__,
# so it does not share state with metadata_aggregator.
metadata_aggregator = MetadataAggregator()
free_enricher = FreeMetadataEnricher()