""" Enhanced Album Grouper for SwingMusic Handles proper album grouping with various artists, compilations, and metadata normalization """ import re import unicodedata from dataclasses import dataclass from difflib import SequenceMatcher from swingmusic import logger from swingmusic.db.sqlite.utils import get_db_connection @dataclass class AlbumGroupingKey: """Key for album grouping with normalization""" normalized_artist: str normalized_album: str year: str | None is_compilation: bool album_type: str # album, single, compilation, etc. @dataclass class AlbumInfo: """Enhanced album information""" album_id: str title: str artists: list[str] primary_artist: str year: str | None album_type: str is_compilation: bool track_count: int total_duration: int image_url: str | None folder_path: str grouping_key: str class MetadataNormalizer: """Normalizes metadata for consistent grouping""" # Common variations that should be normalized ARTIST_VARIATIONS = { "various artists": ["various artists", "va", "various", "multiple artists"], "soundtrack": [ "soundtrack", "ost", "original soundtrack", "original sound track", ], "various": ["various", "various artists", "va"], } # Words to remove for better matching STOP_WORDS = { "the", "a", "an", "and", "or", "but", "for", "nor", "so", "yet", "to", "of", "in", "on", "at", "by", "with", "about", "as", } # Patterns to clean up CLEANUP_PATTERNS = [ r"\[.*?\]", # Remove brackets and content r"\(.*?\)", # Remove parentheses and content r"\{.*?\}", # Remove braces and content r"<.*?>", # Remove angle brackets and content r" feat\. .*", # Remove featuring info r" ft\. .*", # Remove featuring info r" featuring .*", # Remove featuring info ] @classmethod def normalize_string(cls, text: str) -> str: """Normalize string for comparison""" if not text: return "" # Convert to lowercase and normalize unicode text = unicodedata.normalize("NFKD", text.lower()) # Remove accents and diacritics text = "".join(c for c in text if not unicodedata.combining(c)) # Apply cleanup patterns for pattern in cls.CLEANUP_PATTERNS: text = re.sub(pattern, "", text, flags=re.IGNORECASE) # Remove extra whitespace and punctuation text = re.sub(r"[^\w\s]", " ", text) text = re.sub(r"\s+", " ", text).strip() # Remove stop words (optional for album names) # words = text.split() # text = ' '.join(word for word in words if word not in cls.STOP_WORDS) return text @classmethod def normalize_artist(cls, artist: str) -> str: """Normalize artist name for grouping""" if not artist: return "" normalized = cls.normalize_string(artist) # Handle common variations for standard, variations in cls.ARTIST_VARIATIONS.items(): if normalized in variations: return standard return normalized @classmethod def normalize_album(cls, album: str) -> str: """Normalize album name for grouping""" return cls.normalize_string(album) @classmethod def extract_year(cls, date_str: str) -> str | None: """Extract year from date string""" if not date_str: return None # Look for 4-digit year patterns year_match = re.search(r"\b(19|20)\d{2}\b", date_str) if year_match: return year_match.group() return None @classmethod def is_compilation(cls, artists: list[str], albumartist: str = None) -> bool: """Determine if album is a compilation""" if not artists: return False # Check if albumartist is "Various Artists" if albumartist: normalized_albumartist = cls.normalize_artist(albumartist) if normalized_albumartist in ["various artists", "va", "various"]: return True # Check if there are many different artists unique_artists = {cls.normalize_artist(artist) for artist in artists} # If more than 3 unique artists, likely a compilation if len(unique_artists) > 3: return True # Check for common compilation indicators album_lower = " ".join(artists).lower() compilation_indicators = [ "various artists", "soundtrack", "ost", "compilation", "various", "multiple artists", "collection", "greatest hits", ] return any(indicator in album_lower for indicator in compilation_indicators) class ArtistAliasResolver: """Resolves artist aliases to canonical names""" def __init__(self): self.aliases: dict[str, str] = {} self._load_common_aliases() def _load_common_aliases(self): """Load common artist aliases""" # Common artist name variations common_aliases = { "taylor swift": ["t. swift", "taylor", "swift"], "the beatles": ["beatles", "the fab four"], "led zeppelin": ["zeppelin", "led zep"], "pink floyd": ["floyd"], "the rolling stones": ["rolling stones", "stones"], "bob dylan": ["dylan", "bobby dylan"], "david bowie": ["bowie", "ziggy stardust"], # Add more as needed } for canonical, aliases in common_aliases.items(): for alias in aliases: self.aliases[MetadataNormalizer.normalize_string(alias)] = canonical def resolve_alias(self, artist: str) -> str: """Resolve artist alias to canonical name""" normalized = MetadataNormalizer.normalize_string(artist) return self.aliases.get(normalized, artist) def add_alias(self, canonical: str, alias: str): """Add a new artist alias""" normalized_alias = MetadataNormalizer.normalize_string(alias) self.aliases[normalized_alias] = canonical class AlbumGrouper: """Enhanced album grouping with proper normalization""" def __init__(self): self.metadata_normalizer = MetadataNormalizer() self.alias_resolver = ArtistAliasResolver() self.grouping_cache: dict[str, AlbumGroupingKey] = {} def normalize_album_artist(self, track_metadata: dict[str, any]) -> str: """Normalize album artist for proper grouping""" # Try albumartist first albumartist = track_metadata.get("albumartist") if albumartist: normalized = self.metadata_normalizer.normalize_artist(albumartist) resolved = self.alias_resolver.resolve_alias(normalized) return resolved # Fall back to artist artist = track_metadata.get("artist") if artist: normalized = self.metadata_normalizer.normalize_artist(artist) resolved = self.alias_resolver.resolve_alias(normalized) return resolved return "Unknown Artist" def create_grouping_key(self, track_metadata: dict[str, any]) -> AlbumGroupingKey: """Create consistent grouping key for albums""" # Extract and normalize artist artists = self._extract_artists(track_metadata) primary_artist = self.normalize_album_artist(track_metadata) # Normalize album name album_name = track_metadata.get("album", "") normalized_album = self.metadata_normalizer.normalize_album(album_name) # Extract year release_date = track_metadata.get("date") or track_metadata.get("year") year = ( self.metadata_normalizer.extract_year(str(release_date)) if release_date else None ) # Determine if compilation is_compilation = self.metadata_normalizer.is_compilation( artists, track_metadata.get("albumartist") ) # Determine album type album_type = track_metadata.get("albumtype", "album") if is_compilation: album_type = "compilation" return AlbumGroupingKey( normalized_artist=primary_artist, normalized_album=normalized_album, year=year, is_compilation=is_compilation, album_type=album_type, ) def create_grouping_key_string(self, track_metadata: dict[str, any]) -> str: """Create string-based grouping key for database storage""" key = self.create_grouping_key(track_metadata) # Include year for different editions but allow fallback year_part = f"::{key.year}" if key.year else "" # Mark compilations specially compilation_part = "::COMP" if key.is_compilation else "" return f"{key.normalized_artist}::{key.normalized_album}{year_part}{compilation_part}" def _extract_artists(self, track_metadata: dict[str, any]) -> list[str]: """Extract list of artists from track metadata""" artists = [] # Try artists field (array) if "artists" in track_metadata: if isinstance(track_metadata["artists"], list): artists.extend(track_metadata["artists"]) else: artists.append(str(track_metadata["artists"])) # Try artist field if "artist" in track_metadata: artist_str = track_metadata["artist"] if isinstance(artist_str, list): artists.extend(artist_str) else: # Split common separators for sep in [",", ";", "&", " and ", " ft ", " feat "]: if sep in artist_str: artists.extend([a.strip() for a in artist_str.split(sep)]) break else: artists.append(artist_str) # Remove duplicates and empty strings return list(set(filter(None, artists))) def calculate_similarity(self, str1: str, str2: str) -> float: """Calculate similarity between two strings""" return SequenceMatcher(None, str1, str2).ratio() def should_group_together( self, key1: AlbumGroupingKey, key2: AlbumGroupingKey ) -> bool: """Determine if two albums should be grouped together""" # Different artists - don't group unless both are compilations if key1.normalized_artist != key2.normalized_artist: if not (key1.is_compilation and key2.is_compilation): return False # Check album name similarity album_similarity = self.calculate_similarity( key1.normalized_album, key2.normalized_album ) if album_similarity < 0.8: # 80% similarity threshold return False # If years are available, they should be close or identical if key1.year and key2.year and key1.year != key2.year: # Allow grouping if years are close (e.g., reissues) year_diff = abs(int(key1.year) - int(key2.year)) if year_diff > 5: # More than 5 years difference return False return True def group_albums_from_database(self) -> dict[str, list[dict[str, any]]]: """Group albums from database tracks""" try: with get_db_connection() as conn: # Get all tracks with album information query = """ SELECT t.trackhash, t.title, t.artist, t.albumartist, t.album, t.date, t.year, t.albumtype, t.image, t.folderpath, t.duration FROM tracks t WHERE t.album IS NOT NULL AND t.album != '' ORDER BY t.albumartist, t.album, t.date, t.tracknumber """ cursor = conn.execute(query) tracks = cursor.fetchall() # Group tracks by album key album_groups: dict[str, list[dict[str, any]]] = {} for track in tracks: track_dict = dict(track) # Create grouping key grouping_key = self.create_grouping_key_string(track_dict) # Add to group if grouping_key not in album_groups: album_groups[grouping_key] = [] album_groups[grouping_key].append(track_dict) return album_groups except Exception as e: logger.error(f"Error grouping albums from database: {e}") return {} def create_album_info( self, grouping_key: str, tracks: list[dict[str, any]] ) -> AlbumInfo: """Create album info from grouped tracks""" if not tracks: raise ValueError("No tracks provided") first_track = tracks[0] key = self.create_grouping_key(first_track) # Extract unique artists all_artists = set() for track in tracks: artists = self._extract_artists(track) all_artists.update(artists) # Calculate total duration total_duration = sum(track.get("duration", 0) for track in tracks) # Get image from first track (could be enhanced to find best image) image_url = first_track.get("image") return AlbumInfo( album_id=grouping_key, title=first_track.get("album", ""), artists=list(all_artists), primary_artist=key.normalized_artist, year=key.year, album_type=key.album_type, is_compilation=key.is_compilation, track_count=len(tracks), total_duration=total_duration, image_url=image_url, folder_path=first_track.get("folderpath", ""), grouping_key=grouping_key, ) def fix_album_grouping_in_database(self) -> int: """Fix album grouping in database and return number of fixes""" fixes_made = 0 try: with get_db_connection() as conn: # Get all tracks cursor = conn.execute(""" SELECT trackhash, artist, albumartist, album, date, year, albumtype FROM tracks WHERE album IS NOT NULL AND album != '' """) tracks = cursor.fetchall() for track in tracks: track_dict = dict(track) # Create proper grouping key self.create_grouping_key_string(track_dict) # Check if we need to update albumartist proper_albumartist = self.normalize_album_artist(track_dict) current_albumartist = track_dict.get("albumartist", "") if proper_albumartist != current_albumartist: cursor = conn.execute( """ UPDATE tracks SET albumartist = ? WHERE trackhash = ? """, (proper_albumartist, track_dict["trackhash"]), ) fixes_made += 1 logger.info( f"Fixed albumartist for {track_dict['trackhash']}: '{current_albumartist}' -> '{proper_albumartist}'" ) conn.commit() except Exception as e: logger.error(f"Error fixing album grouping: {e}") return fixes_made # Global album grouper instance album_grouper = AlbumGrouper()