mirror of
https://github.com/Dvorinka/SpotifyRecAlg.git
synced 2026-06-03 20:13:03 +00:00
455 lines
13 KiB
Python
455 lines
13 KiB
Python
"""
|
|
Analytics Event Queue using DragonflyDB.
|
|
|
|
Provides a high-performance event queue for analytics events using DragonflyDB
|
|
lists. Events are queued for batch processing, reducing database load and
|
|
enabling real-time analytics aggregation.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import time
|
|
import uuid
|
|
from dataclasses import dataclass, field
|
|
from enum import StrEnum
|
|
from typing import Any
|
|
|
|
from swingmusic.db.dragonfly_client import get_dragonfly_client
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Maximum queue size before forced flush
|
|
MAX_QUEUE_SIZE = 10000
|
|
|
|
# Event expiry (7 days)
|
|
EVENT_TTL = 7 * 24 * 60 * 60
|
|
|
|
|
|
class EventType(StrEnum):
|
|
"""Types of analytics events."""
|
|
|
|
# Playback events
|
|
TRACK_PLAY = "track_play"
|
|
TRACK_SKIP = "track_skip"
|
|
TRACK_COMPLETE = "track_complete"
|
|
ALBUM_PLAY = "album_play"
|
|
ARTIST_PLAY = "artist_play"
|
|
|
|
# User interaction events
|
|
FAVORITE_ADD = "favorite_add"
|
|
FAVORITE_REMOVE = "favorite_remove"
|
|
PLAYLIST_CREATE = "playlist_create"
|
|
PLAYLIST_ADD_TRACK = "playlist_add_track"
|
|
PLAYLIST_REMOVE_TRACK = "playlist_remove_track"
|
|
|
|
# Search events
|
|
SEARCH_QUERY = "search_query"
|
|
SEARCH_RESULT_CLICK = "search_result_click"
|
|
|
|
# Download events
|
|
DOWNLOAD_START = "download_start"
|
|
DOWNLOAD_COMPLETE = "download_complete"
|
|
DOWNLOAD_FAIL = "download_fail"
|
|
|
|
# Library events
|
|
LIBRARY_SCAN = "library_scan"
|
|
LIBRARY_UPDATE = "library_update"
|
|
|
|
# Session events
|
|
SESSION_START = "session_start"
|
|
SESSION_END = "session_end"
|
|
|
|
|
|
@dataclass
|
|
class AnalyticsEvent:
|
|
"""Represents a single analytics event."""
|
|
|
|
event_type: EventType
|
|
timestamp: int
|
|
userid: int
|
|
data: dict[str, Any]
|
|
event_id: str = field(default_factory=lambda: uuid.uuid4().hex)
|
|
|
|
def to_dict(self) -> dict[str, Any]:
|
|
return {
|
|
"event_id": self.event_id,
|
|
"event_type": self.event_type.value,
|
|
"timestamp": self.timestamp,
|
|
"userid": self.userid,
|
|
"data": self.data,
|
|
}
|
|
|
|
@classmethod
|
|
def from_dict(cls, data: dict[str, Any]) -> "AnalyticsEvent":
|
|
return cls(
|
|
event_id=data["event_id"],
|
|
event_type=EventType(data["event_type"]),
|
|
timestamp=data["timestamp"],
|
|
userid=data["userid"],
|
|
data=data["data"],
|
|
)
|
|
|
|
|
|
class AnalyticsEventQueue:
|
|
"""
|
|
Manages analytics events using DragonflyDB lists.
|
|
|
|
Events are pushed to a Redis list and can be processed in batches
|
|
by a background worker. This decouples event collection from
|
|
event processing, improving application responsiveness.
|
|
"""
|
|
|
|
def __init__(self, max_queue_size: int = MAX_QUEUE_SIZE):
|
|
self.max_queue_size = max_queue_size
|
|
self._client = None
|
|
|
|
@property
|
|
def client(self):
|
|
if self._client is None:
|
|
self._client = get_dragonfly_client()
|
|
return self._client
|
|
|
|
def _get_queue_key(self) -> str:
|
|
"""Get the main event queue key."""
|
|
return "analytics:events:queue"
|
|
|
|
def _get_processing_key(self) -> str:
|
|
"""Get the processing list key (for reliable queue pattern)."""
|
|
return "analytics:events:processing"
|
|
|
|
def _get_stats_key(self, event_type: EventType) -> str:
|
|
"""Get the key for event type statistics."""
|
|
return f"analytics:stats:{event_type.value}"
|
|
|
|
def _get_hourly_key(self, event_type: EventType, hour: int) -> str:
|
|
"""Get the key for hourly event counts."""
|
|
return f"analytics:hourly:{event_type.value}:{hour}"
|
|
|
|
def enqueue(self, event: AnalyticsEvent) -> bool:
|
|
"""
|
|
Add an event to the queue.
|
|
|
|
Args:
|
|
event: The analytics event to enqueue
|
|
|
|
Returns:
|
|
True if successful, False otherwise
|
|
"""
|
|
if not self.client.is_available():
|
|
return False
|
|
|
|
try:
|
|
queue_key = self._get_queue_key()
|
|
|
|
# Push event to queue (RPUSH for FIFO processing)
|
|
self.client.client.rpush(queue_key, json.dumps(event.to_dict()))
|
|
|
|
# Increment event type counter
|
|
stats_key = self._get_stats_key(event.event_type)
|
|
self.client.client.incr(stats_key)
|
|
|
|
# Increment hourly counter
|
|
hour = event.timestamp // 3600
|
|
hourly_key = self._get_hourly_key(event.event_type, hour)
|
|
self.client.client.incr(hourly_key)
|
|
self.client.client.expire(hourly_key, EVENT_TTL)
|
|
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to enqueue analytics event: {e}")
|
|
return False
|
|
|
|
def enqueue_batch(self, events: list[AnalyticsEvent]) -> int:
|
|
"""
|
|
Add multiple events to the queue atomically.
|
|
|
|
Returns:
|
|
Number of events successfully enqueued
|
|
"""
|
|
if not self.client.is_available():
|
|
return 0
|
|
|
|
try:
|
|
queue_key = self._get_queue_key()
|
|
pipe = self.client.client.pipeline()
|
|
|
|
for event in events:
|
|
pipe.rpush(queue_key, json.dumps(event.to_dict()))
|
|
|
|
# Increment counters
|
|
stats_key = self._get_stats_key(event.event_type)
|
|
pipe.incr(stats_key)
|
|
|
|
hour = event.timestamp // 3600
|
|
hourly_key = self._get_hourly_key(event.event_type, hour)
|
|
pipe.incr(hourly_key)
|
|
pipe.expire(hourly_key, EVENT_TTL)
|
|
|
|
pipe.execute()
|
|
return len(events)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to enqueue batch: {e}")
|
|
return 0
|
|
|
|
def dequeue(self, batch_size: int = 100) -> list[AnalyticsEvent]:
|
|
"""
|
|
Get a batch of events from the queue for processing.
|
|
|
|
Uses the reliable queue pattern: events are moved to a processing
|
|
list before being returned. They should be acknowledged after
|
|
successful processing.
|
|
|
|
Args:
|
|
batch_size: Maximum number of events to dequeue
|
|
|
|
Returns:
|
|
List of analytics events
|
|
"""
|
|
if not self.client.is_available():
|
|
return []
|
|
|
|
try:
|
|
queue_key = self._get_queue_key()
|
|
processing_key = self._get_processing_key()
|
|
|
|
events = []
|
|
|
|
for _ in range(batch_size):
|
|
# RPOPLPUSH: atomically pop from queue and push to processing
|
|
result = self.client.client.rpoplpush(queue_key, processing_key)
|
|
|
|
if not result:
|
|
break
|
|
|
|
try:
|
|
event = AnalyticsEvent.from_dict(json.loads(result))
|
|
events.append(event)
|
|
except json.JSONDecodeError:
|
|
# Invalid event, remove from processing
|
|
self.client.client.lrem(processing_key, 1, result)
|
|
continue
|
|
|
|
return events
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to dequeue events: {e}")
|
|
return []
|
|
|
|
def acknowledge(self, event_ids: list[str]) -> int:
|
|
"""
|
|
Acknowledge processed events, removing them from the processing list.
|
|
|
|
Args:
|
|
event_ids: List of event IDs to acknowledge
|
|
|
|
Returns:
|
|
Number of events acknowledged
|
|
"""
|
|
if not self.client.is_available():
|
|
return 0
|
|
|
|
try:
|
|
processing_key = self._get_processing_key()
|
|
|
|
# Get all events in processing list
|
|
all_events = self.client.client.lrange(processing_key, 0, -1)
|
|
|
|
acknowledged = 0
|
|
for event_json in all_events:
|
|
try:
|
|
event_data = json.loads(event_json)
|
|
if event_data.get("event_id") in event_ids:
|
|
self.client.client.lrem(processing_key, 1, event_json)
|
|
acknowledged += 1
|
|
except json.JSONDecodeError:
|
|
continue
|
|
|
|
return acknowledged
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to acknowledge events: {e}")
|
|
return 0
|
|
|
|
def requeue_unprocessed(self, timeout_seconds: int = 300) -> int:
|
|
"""
|
|
Re-queue events that have been in processing for too long.
|
|
|
|
This handles the case where a worker crashes while processing events.
|
|
|
|
Args:
|
|
timeout_seconds: Time after which processing events are considered stale
|
|
|
|
Returns:
|
|
Number of events re-queued
|
|
"""
|
|
if not self.client.is_available():
|
|
return 0
|
|
|
|
try:
|
|
queue_key = self._get_queue_key()
|
|
processing_key = self._get_processing_key()
|
|
|
|
# Move all processing events back to queue
|
|
# In production, you'd check timestamps for timeout
|
|
requeued = 0
|
|
while True:
|
|
result = self.client.client.rpoplpush(processing_key, queue_key)
|
|
if not result:
|
|
break
|
|
requeued += 1
|
|
|
|
return requeued
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to requeue unprocessed events: {e}")
|
|
return 0
|
|
|
|
def get_queue_size(self) -> int:
|
|
"""Get the number of events in the queue."""
|
|
if not self.client.is_available():
|
|
return 0
|
|
|
|
try:
|
|
return self.client.client.llen(self._get_queue_key())
|
|
except Exception:
|
|
return 0
|
|
|
|
def get_event_count(self, event_type: EventType) -> int:
|
|
"""Get the total count for an event type."""
|
|
if not self.client.is_available():
|
|
return 0
|
|
|
|
try:
|
|
key = self._get_stats_key(event_type)
|
|
value = self.client.get(key)
|
|
return int(value) if value else 0
|
|
except Exception:
|
|
return 0
|
|
|
|
def get_hourly_counts(
|
|
self, event_type: EventType, start_hour: int, end_hour: int
|
|
) -> dict[int, int]:
|
|
"""
|
|
Get hourly event counts for a time range.
|
|
|
|
Args:
|
|
event_type: The event type to query
|
|
start_hour: Start hour (Unix timestamp / 3600)
|
|
end_hour: End hour (Unix timestamp / 3600)
|
|
|
|
Returns:
|
|
Dict mapping hour to event count
|
|
"""
|
|
if not self.client.is_available():
|
|
return {}
|
|
|
|
try:
|
|
counts = {}
|
|
for hour in range(start_hour, end_hour + 1):
|
|
key = self._get_hourly_key(event_type, hour)
|
|
value = self.client.get(key)
|
|
counts[hour] = int(value) if value else 0
|
|
|
|
return counts
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to get hourly counts: {e}")
|
|
return {}
|
|
|
|
def clear_queue(self) -> bool:
|
|
"""Clear all events from the queue."""
|
|
if not self.client.is_available():
|
|
return False
|
|
|
|
try:
|
|
self.client.client.delete(self._get_queue_key())
|
|
self.client.client.delete(self._get_processing_key())
|
|
return True
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
# Helper functions for common events
|
|
def track_played(
|
|
userid: int,
|
|
trackhash: str,
|
|
duration: int,
|
|
source: str,
|
|
) -> AnalyticsEvent:
|
|
"""Create a track play event."""
|
|
return AnalyticsEvent(
|
|
event_type=EventType.TRACK_PLAY,
|
|
timestamp=int(time.time()),
|
|
userid=userid,
|
|
data={
|
|
"trackhash": trackhash,
|
|
"duration": duration,
|
|
"source": source,
|
|
},
|
|
)
|
|
|
|
|
|
def track_skipped(
|
|
userid: int,
|
|
trackhash: str,
|
|
position: int,
|
|
) -> AnalyticsEvent:
|
|
"""Create a track skip event."""
|
|
return AnalyticsEvent(
|
|
event_type=EventType.TRACK_SKIP,
|
|
timestamp=int(time.time()),
|
|
userid=userid,
|
|
data={
|
|
"trackhash": trackhash,
|
|
"position": position,
|
|
},
|
|
)
|
|
|
|
|
|
def favorite_toggled(
|
|
userid: int,
|
|
item_type: str,
|
|
itemhash: str,
|
|
added: bool,
|
|
) -> AnalyticsEvent:
|
|
"""Create a favorite toggle event."""
|
|
return AnalyticsEvent(
|
|
event_type=EventType.FAVORITE_ADD if added else EventType.FAVORITE_REMOVE,
|
|
timestamp=int(time.time()),
|
|
userid=userid,
|
|
data={
|
|
"item_type": item_type,
|
|
"itemhash": itemhash,
|
|
},
|
|
)
|
|
|
|
|
|
def search_performed(
|
|
userid: int,
|
|
query: str,
|
|
results_count: int,
|
|
filters: dict[str, Any] | None = None,
|
|
) -> AnalyticsEvent:
|
|
"""Create a search event."""
|
|
return AnalyticsEvent(
|
|
event_type=EventType.SEARCH_QUERY,
|
|
timestamp=int(time.time()),
|
|
userid=userid,
|
|
data={
|
|
"query": query,
|
|
"results_count": results_count,
|
|
"filters": filters or {},
|
|
},
|
|
)
|
|
|
|
|
|
# Global instance
|
|
analytics_queue = AnalyticsEventQueue()
|
|
|
|
|
|
def get_analytics_queue() -> AnalyticsEventQueue:
|
|
"""Get the global analytics queue instance."""
|
|
return analytics_queue
|