mirror of
https://github.com/Dvorinka/SpotifyRecAlg.git
synced 2026-06-04 12:33:03 +00:00
488 lines
16 KiB
Python
488 lines
16 KiB
Python
"""
Enhanced Album Grouper for SwingMusic

Handles proper album grouping with various artists, compilations, and metadata normalization
"""
|
|
|
|
import re
import unicodedata
from dataclasses import dataclass
from difflib import SequenceMatcher
from typing import Any

from swingmusic import logger
from swingmusic.db.sqlite.utils import get_db_connection
|
|
|
|
|
|
@dataclass
|
|
class AlbumGroupingKey:
|
|
"""Key for album grouping with normalization"""
|
|
|
|
normalized_artist: str
|
|
normalized_album: str
|
|
year: str | None
|
|
is_compilation: bool
|
|
album_type: str # album, single, compilation, etc.
|
|
|
|
|
|
@dataclass
|
|
class AlbumInfo:
|
|
"""Enhanced album information"""
|
|
|
|
album_id: str
|
|
title: str
|
|
artists: list[str]
|
|
primary_artist: str
|
|
year: str | None
|
|
album_type: str
|
|
is_compilation: bool
|
|
track_count: int
|
|
total_duration: int
|
|
image_url: str | None
|
|
folder_path: str
|
|
grouping_key: str
|
|
|
|
|
|
class MetadataNormalizer:
|
|
"""Normalizes metadata for consistent grouping"""
|
|
|
|
# Common variations that should be normalized
|
|
ARTIST_VARIATIONS = {
|
|
"various artists": ["various artists", "va", "various", "multiple artists"],
|
|
"soundtrack": [
|
|
"soundtrack",
|
|
"ost",
|
|
"original soundtrack",
|
|
"original sound track",
|
|
],
|
|
"various": ["various", "various artists", "va"],
|
|
}
|
|
|
|
# Words to remove for better matching
|
|
STOP_WORDS = {
|
|
"the",
|
|
"a",
|
|
"an",
|
|
"and",
|
|
"or",
|
|
"but",
|
|
"for",
|
|
"nor",
|
|
"so",
|
|
"yet",
|
|
"to",
|
|
"of",
|
|
"in",
|
|
"on",
|
|
"at",
|
|
"by",
|
|
"with",
|
|
"about",
|
|
"as",
|
|
}
|
|
|
|
# Patterns to clean up
|
|
CLEANUP_PATTERNS = [
|
|
r"\[.*?\]", # Remove brackets and content
|
|
r"\(.*?\)", # Remove parentheses and content
|
|
r"\{.*?\}", # Remove braces and content
|
|
r"<.*?>", # Remove angle brackets and content
|
|
r" feat\. .*", # Remove featuring info
|
|
r" ft\. .*", # Remove featuring info
|
|
r" featuring .*", # Remove featuring info
|
|
]
|
|
|
|
@classmethod
|
|
def normalize_string(cls, text: str) -> str:
|
|
"""Normalize string for comparison"""
|
|
if not text:
|
|
return ""
|
|
|
|
# Convert to lowercase and normalize unicode
|
|
text = unicodedata.normalize("NFKD", text.lower())
|
|
|
|
# Remove accents and diacritics
|
|
text = "".join(c for c in text if not unicodedata.combining(c))
|
|
|
|
# Apply cleanup patterns
|
|
for pattern in cls.CLEANUP_PATTERNS:
|
|
text = re.sub(pattern, "", text, flags=re.IGNORECASE)
|
|
|
|
# Remove extra whitespace and punctuation
|
|
text = re.sub(r"[^\w\s]", " ", text)
|
|
text = re.sub(r"\s+", " ", text).strip()
|
|
|
|
# Remove stop words (optional for album names)
|
|
# words = text.split()
|
|
# text = ' '.join(word for word in words if word not in cls.STOP_WORDS)
|
|
|
|
return text
|
|
|
|
@classmethod
|
|
def normalize_artist(cls, artist: str) -> str:
|
|
"""Normalize artist name for grouping"""
|
|
if not artist:
|
|
return ""
|
|
|
|
normalized = cls.normalize_string(artist)
|
|
|
|
# Handle common variations
|
|
for standard, variations in cls.ARTIST_VARIATIONS.items():
|
|
if normalized in variations:
|
|
return standard
|
|
|
|
return normalized
|
|
|
|
@classmethod
|
|
def normalize_album(cls, album: str) -> str:
|
|
"""Normalize album name for grouping"""
|
|
return cls.normalize_string(album)
|
|
|
|
@classmethod
|
|
def extract_year(cls, date_str: str) -> str | None:
|
|
"""Extract year from date string"""
|
|
if not date_str:
|
|
return None
|
|
|
|
# Look for 4-digit year patterns
|
|
year_match = re.search(r"\b(19|20)\d{2}\b", date_str)
|
|
if year_match:
|
|
return year_match.group()
|
|
|
|
return None
|
|
|
|
@classmethod
|
|
def is_compilation(cls, artists: list[str], albumartist: str = None) -> bool:
|
|
"""Determine if album is a compilation"""
|
|
if not artists:
|
|
return False
|
|
|
|
# Check if albumartist is "Various Artists"
|
|
if albumartist:
|
|
normalized_albumartist = cls.normalize_artist(albumartist)
|
|
if normalized_albumartist in ["various artists", "va", "various"]:
|
|
return True
|
|
|
|
# Check if there are many different artists
|
|
unique_artists = {cls.normalize_artist(artist) for artist in artists}
|
|
|
|
# If more than 3 unique artists, likely a compilation
|
|
if len(unique_artists) > 3:
|
|
return True
|
|
|
|
# Check for common compilation indicators
|
|
album_lower = " ".join(artists).lower()
|
|
compilation_indicators = [
|
|
"various artists",
|
|
"soundtrack",
|
|
"ost",
|
|
"compilation",
|
|
"various",
|
|
"multiple artists",
|
|
"collection",
|
|
"greatest hits",
|
|
]
|
|
|
|
return any(indicator in album_lower for indicator in compilation_indicators)
|
|
|
|
|
|
class ArtistAliasResolver:
    """Maps known alternative artist spellings onto a canonical artist name."""

    def __init__(self):
        # normalized alias -> canonical artist name
        self.aliases: dict[str, str] = {}
        self._load_common_aliases()

    def _load_common_aliases(self):
        """Seed ``self.aliases`` with the built-in alias table."""
        builtin_table = {
            "taylor swift": ["t. swift", "taylor", "swift"],
            "the beatles": ["beatles", "the fab four"],
            "led zeppelin": ["zeppelin", "led zep"],
            "pink floyd": ["floyd"],
            "the rolling stones": ["rolling stones", "stones"],
            "bob dylan": ["dylan", "bobby dylan"],
            "david bowie": ["bowie", "ziggy stardust"],
        }

        normalize = MetadataNormalizer.normalize_string
        self.aliases.update(
            {
                normalize(nickname): canonical_name
                for canonical_name, nicknames in builtin_table.items()
                for nickname in nicknames
            }
        )

    def resolve_alias(self, artist: str) -> str:
        """Return the canonical name for *artist*, or *artist* unchanged.

        The lookup uses the normalized form of *artist*; when no alias is
        registered for it, the argument is returned exactly as passed in.
        """
        lookup_key = MetadataNormalizer.normalize_string(artist)
        if lookup_key in self.aliases:
            return self.aliases[lookup_key]
        return artist

    def add_alias(self, canonical: str, alias: str):
        """Register *alias* (stored in normalized form) for *canonical*."""
        self.aliases[MetadataNormalizer.normalize_string(alias)] = canonical
|
|
|
|
|
|
class AlbumGrouper:
    """Groups tracks into albums using normalized artist/album metadata."""

    # Separators tried, in this order, when splitting a combined artist
    # string; only the first one found in the string is used.
    _ARTIST_SEPARATORS = (",", ";", "&", " and ", " ft ", " feat ")

    def __init__(self):
        self.metadata_normalizer = MetadataNormalizer()
        self.alias_resolver = ArtistAliasResolver()
        # Currently not used by any method of this class.
        self.grouping_cache: dict[str, AlbumGroupingKey] = {}

    @staticmethod
    def _primary_name(value: Any) -> Any:
        """Return the first non-empty entry of a list value, else *value* as-is.

        _extract_artists() accepts list-valued artist fields; this keeps the
        single-name code paths working for the same input.
        """
        if isinstance(value, list):
            return next((str(item) for item in value if item), "")
        return value

    def normalize_album_artist(self, track_metadata: dict[str, Any]) -> str:
        """Return the normalized, alias-resolved album artist for a track.

        ``albumartist`` is preferred; ``artist`` is the fallback. When neither
        field holds a value, ``"Unknown Artist"`` is returned.
        """
        for field in ("albumartist", "artist"):
            name = self._primary_name(track_metadata.get(field))
            if name:
                normalized = self.metadata_normalizer.normalize_artist(name)
                return self.alias_resolver.resolve_alias(normalized)

        return "Unknown Artist"

    def create_grouping_key(
        self, track_metadata: dict[str, Any]
    ) -> "AlbumGroupingKey":
        """Build the structured grouping key for one track's metadata.

        Reads the ``artists``/``artist``, ``albumartist``, ``album``,
        ``date``/``year`` and ``albumtype`` entries of *track_metadata*.
        """
        artists = self._extract_artists(track_metadata)
        primary_artist = self.normalize_album_artist(track_metadata)

        album_name = track_metadata.get("album", "")
        normalized_album = self.metadata_normalizer.normalize_album(album_name)

        # "date" wins over "year" when both are present and truthy.
        release_date = track_metadata.get("date") or track_metadata.get("year")
        year = (
            self.metadata_normalizer.extract_year(str(release_date))
            if release_date
            else None
        )

        is_compilation = self.metadata_normalizer.is_compilation(
            artists, self._primary_name(track_metadata.get("albumartist"))
        )

        # A key that is present but None/empty also falls back to "album".
        album_type = track_metadata.get("albumtype") or "album"
        if is_compilation:
            album_type = "compilation"

        return AlbumGroupingKey(
            normalized_artist=primary_artist,
            normalized_album=normalized_album,
            year=year,
            is_compilation=is_compilation,
            album_type=album_type,
        )

    def create_grouping_key_string(self, track_metadata: dict[str, Any]) -> str:
        """Return the grouping key as ``artist::album[::year][::COMP]``."""
        key = self.create_grouping_key(track_metadata)

        # The year separates different editions; it is omitted when unknown.
        year_part = f"::{key.year}" if key.year else ""

        # Compilations get a dedicated marker.
        compilation_part = "::COMP" if key.is_compilation else ""

        return f"{key.normalized_artist}::{key.normalized_album}{year_part}{compilation_part}"

    def _extract_artists(self, track_metadata: dict[str, Any]) -> list[str]:
        """Collect the artist names credited in *track_metadata*.

        ``artists`` may be a list or a single value; ``artist`` may be a list
        or a string, which is split on the first separator from
        ``_ARTIST_SEPARATORS`` that occurs in it. Missing/None fields are
        ignored.

        Returns:
            The non-empty names without duplicates, in first-seen order.
        """
        artists: list[str] = []

        listed = track_metadata.get("artists")
        if isinstance(listed, list):
            artists.extend(listed)
        elif listed is not None:
            artists.append(str(listed))

        credited = track_metadata.get("artist")
        if isinstance(credited, list):
            artists.extend(credited)
        elif credited:
            for sep in self._ARTIST_SEPARATORS:
                if sep in credited:
                    artists.extend(part.strip() for part in credited.split(sep))
                    break
            else:
                # No separator present: the whole string is one artist.
                artists.append(credited)

        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(name for name in artists if name))

    def calculate_similarity(self, str1: str, str2: str) -> float:
        """Return the ``SequenceMatcher`` ratio (0.0-1.0) of the two strings."""
        return SequenceMatcher(None, str1, str2).ratio()

    def should_group_together(
        self, key1: "AlbumGroupingKey", key2: "AlbumGroupingKey"
    ) -> bool:
        """Decide whether two grouping keys describe the same album.

        Keys match when the artists are equal (or both keys are
        compilations), the album names are at least 80% similar, and, if both
        years are known, the years differ by at most five.
        """
        if key1.normalized_artist != key2.normalized_artist:
            if not (key1.is_compilation and key2.is_compilation):
                return False

        album_similarity = self.calculate_similarity(
            key1.normalized_album, key2.normalized_album
        )
        if album_similarity < 0.8:  # 80% similarity threshold
            return False

        if key1.year and key2.year and key1.year != key2.year:
            # Nearby years are tolerated (e.g. reissues).
            year_diff = abs(int(key1.year) - int(key2.year))
            if year_diff > 5:
                return False

        return True

    def group_albums_from_database(self) -> dict[str, list[dict[str, Any]]]:
        """Load all tracks that have an album and bucket them by grouping key.

        Returns:
            Mapping of grouping key string to the track rows (as dicts) that
            share it. Any exception is logged and an empty dict is returned.
        """
        try:
            with get_db_connection() as conn:
                query = """
                    SELECT
                        t.trackhash,
                        t.title,
                        t.artist,
                        t.albumartist,
                        t.album,
                        t.date,
                        t.year,
                        t.albumtype,
                        t.image,
                        t.folderpath,
                        t.duration
                    FROM tracks t
                    WHERE t.album IS NOT NULL AND t.album != ''
                    ORDER BY t.albumartist, t.album, t.date, t.tracknumber
                """

                cursor = conn.execute(query)
                tracks = cursor.fetchall()

                album_groups: dict[str, list[dict[str, Any]]] = {}

                for track in tracks:
                    # Assumes rows are mapping-like (dict() must accept them).
                    track_dict = dict(track)
                    grouping_key = self.create_grouping_key_string(track_dict)
                    album_groups.setdefault(grouping_key, []).append(track_dict)

                return album_groups

        except Exception as e:
            logger.error(f"Error grouping albums from database: {e}")
            return {}

    def create_album_info(
        self, grouping_key: str, tracks: list[dict[str, Any]]
    ) -> "AlbumInfo":
        """Summarize a group of tracks as an ``AlbumInfo``.

        Title, year, type, image and folder come from the first track;
        artists and duration are aggregated over all tracks.

        Raises:
            ValueError: If *tracks* is empty.
        """
        if not tracks:
            raise ValueError("No tracks provided")

        first_track = tracks[0]
        key = self.create_grouping_key(first_track)

        # Union of all credited artists, in first-seen order.
        all_artists: dict[str, None] = {}
        for track in tracks:
            all_artists.update(dict.fromkeys(self._extract_artists(track)))

        # A missing or None duration counts as 0.
        total_duration = sum(track.get("duration") or 0 for track in tracks)

        # Image of the first track (could be enhanced to find the best image).
        image_url = first_track.get("image")

        return AlbumInfo(
            album_id=grouping_key,
            title=first_track.get("album", ""),
            artists=list(all_artists),
            primary_artist=key.normalized_artist,
            year=key.year,
            album_type=key.album_type,
            is_compilation=key.is_compilation,
            track_count=len(tracks),
            total_duration=total_duration,
            image_url=image_url,
            folder_path=first_track.get("folderpath", ""),
            grouping_key=grouping_key,
        )

    def fix_album_grouping_in_database(self) -> int:
        """Rewrite ``albumartist`` on tracks where it differs from the
        normalized album artist, and return how many rows were updated.

        Exceptions are logged, not raised.

        NOTE(review): the value written is the *normalized* name (lowercased,
        punctuation stripped), so display metadata is overwritten - confirm
        this is intended.
        NOTE(review): if an exception interrupts the loop, the returned count
        includes updates issued before ``commit()``; whether those persist
        depends on ``get_db_connection`` - confirm.
        """
        fixes_made = 0

        try:
            with get_db_connection() as conn:
                cursor = conn.execute("""
                    SELECT trackhash, artist, albumartist, album, date, year, albumtype
                    FROM tracks
                    WHERE album IS NOT NULL AND album != ''
                """)

                tracks = cursor.fetchall()

                for track in tracks:
                    track_dict = dict(track)

                    proper_albumartist = self.normalize_album_artist(track_dict)
                    current_albumartist = track_dict.get("albumartist", "")

                    if proper_albumartist != current_albumartist:
                        conn.execute(
                            """
                            UPDATE tracks
                            SET albumartist = ?
                            WHERE trackhash = ?
                            """,
                            (proper_albumartist, track_dict["trackhash"]),
                        )

                        fixes_made += 1
                        logger.info(
                            f"Fixed albumartist for {track_dict['trackhash']}: '{current_albumartist}' -> '{proper_albumartist}'"
                        )

                conn.commit()

        except Exception as e:
            logger.error(f"Error fixing album grouping: {e}")

        return fixes_made
|
|
|
|
|
|
# Shared module-level AlbumGrouper instance, created when the module is imported.
album_grouper = AlbumGrouper()
|