// Trackeep/backend/handlers/web_scraping.go

package handlers

import (
	"fmt"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/gin-gonic/gin"
	"github.com/gocolly/colly/v2"
	"github.com/trackeep/backend/models"
	"gorm.io/gorm"
)

// WebScrapingHandler groups the HTTP handlers and helper routines used to
// create, run, list, search and delete web scraping jobs and the content they
// produce.
type WebScrapingHandler struct {
	db *gorm.DB // database handle used by every handler and helper in this file
}

// NewWebScrapingHandler returns a WebScrapingHandler that performs all of its
// database work through db.
func NewWebScrapingHandler(db *gorm.DB) *WebScrapingHandler {
	handler := new(WebScrapingHandler)
	handler.db = db
	return handler
}

// CreateScrapingJob creates a new web scraping job for the user identified by
// the "user_id" context value and starts processing it in the background.
//
// The JSON body must contain "url"; "job_type" defaults to "full_scrape" and
// "priority" to "normal". The remaining boolean flags select what is
// extracted from the page.
//
// Responses:
//   - 201 with the stored job on success
//   - 400 when the body cannot be bound or the URL is not an absolute
//     http/https URL with a host
//   - 500 when the job cannot be stored
func (h *WebScrapingHandler) CreateScrapingJob(c *gin.Context) {
	userID := c.GetUint("user_id")

	var req struct {
		URL             string `json:"url" binding:"required"`
		JobType         string `json:"job_type"`
		Priority        string `json:"priority"`
		ExtractImages   bool   `json:"extract_images"`
		ExtractLinks    bool   `json:"extract_links"`
		ExtractVideos   bool   `json:"extract_videos"`
		GenerateSummary bool   `json:"generate_summary"`
		DownloadImages  bool   `json:"download_images"`
		ExtractMetadata bool   `json:"extract_metadata"`
	}

	if err := c.ShouldBindJSON(&req); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
		return
	}

	// Validate URL. ParseRequestURI on its own also accepts path-only values
	// such as "/foo" and schemes like "ftp" or "file"; neither can be scraped
	// over HTTP, so reject them here instead of creating a job that is
	// guaranteed to fail later.
	target, err := url.ParseRequestURI(req.URL)
	if err != nil || target.Host == "" || (target.Scheme != "http" && target.Scheme != "https") {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid URL format"})
		return
	}

	// Set defaults
	if req.JobType == "" {
		req.JobType = "full_scrape"
	}
	if req.Priority == "" {
		req.Priority = "normal"
	}

	job := models.ScrapingJob{
		UserID:          userID,
		URL:             req.URL,
		JobType:         req.JobType,
		Priority:        req.Priority,
		ExtractImages:   req.ExtractImages,
		ExtractLinks:    req.ExtractLinks,
		ExtractVideos:   req.ExtractVideos,
		GenerateSummary: req.GenerateSummary,
		DownloadImages:  req.DownloadImages,
		ExtractMetadata: req.ExtractMetadata,
		Status:          "pending",
	}

	if err := h.db.Create(&job).Error; err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create scraping job"})
		return
	}

	// Start processing the job asynchronously; the response below reflects the
	// job as stored, i.e. still "pending".
	go h.processScrapingJob(job.ID)

	c.JSON(http.StatusCreated, job)
}

// GetScrapingJobs lists the calling user's scraping jobs, newest first.
//
// Query parameters:
//   - status: optional exact-match filter on the job status
//   - limit:  page size in the range 1..100; anything else falls back to 20
func (h *WebScrapingHandler) GetScrapingJobs(c *gin.Context) {
	pageSize := 20
	if raw := c.Query("limit"); raw != "" {
		n, convErr := strconv.Atoi(raw)
		if convErr == nil && n > 0 && n <= 100 {
			pageSize = n
		}
	}

	scope := h.db.Where("user_id = ?", c.GetUint("user_id"))
	if wanted := c.Query("status"); wanted != "" {
		scope = scope.Where("status = ?", wanted)
	}

	var jobs []models.ScrapingJob
	result := scope.Order("created_at DESC").Limit(pageSize).Find(&jobs)
	if result.Error != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to fetch scraping jobs"})
		return
	}

	payload := gin.H{
		"jobs":  jobs,
		"limit": pageSize,
	}
	c.JSON(http.StatusOK, payload)
}

// GetScrapingJob returns one scraping job owned by the calling user, with its
// ScrapedContent association preloaded. It answers 400 for a non-numeric id
// and 404 when the lookup fails.
func (h *WebScrapingHandler) GetScrapingJob(c *gin.Context) {
	owner := c.GetUint("user_id")

	id, parseErr := strconv.ParseUint(c.Param("id"), 10, 32)
	if parseErr != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid job ID"})
		return
	}

	var job models.ScrapingJob
	lookup := h.db.
		Where("id = ? AND user_id = ?", id, owner).
		Preload("ScrapedContent").
		First(&job)
	if lookup.Error != nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "Scraping job not found"})
		return
	}

	c.JSON(http.StatusOK, job)
}

// GetScrapedContent returns one scraped-content record owned by the calling
// user together with its Images, Links, Videos and Tags associations. It
// answers 400 for a non-numeric id and 404 when the lookup fails.
func (h *WebScrapingHandler) GetScrapedContent(c *gin.Context) {
	owner := c.GetUint("user_id")

	id, parseErr := strconv.ParseUint(c.Param("id"), 10, 32)
	if parseErr != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid content ID"})
		return
	}

	// Preload every association in the same order the query always used.
	scope := h.db.Where("id = ? AND user_id = ?", id, owner)
	for _, relation := range []string{"Images", "Links", "Videos", "Tags"} {
		scope = scope.Preload(relation)
	}

	var record models.ScrapedContent
	if lookupErr := scope.First(&record).Error; lookupErr != nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "Scraped content not found"})
		return
	}

	c.JSON(http.StatusOK, record)
}

// GetScrapedContentList lists the calling user's scraped content, most
// recently scraped first.
//
// Query parameters:
//   - content_type: optional exact-match filter
//   - domain:       optional exact-match filter
//   - limit:        page size in the range 1..100; anything else falls back to 20
func (h *WebScrapingHandler) GetScrapedContentList(c *gin.Context) {
	pageSize := 20
	if raw := c.Query("limit"); raw != "" {
		if n, convErr := strconv.Atoi(raw); convErr == nil && n > 0 && n <= 100 {
			pageSize = n
		}
	}

	scope := h.db.Where("user_id = ?", c.GetUint("user_id"))

	// Optional equality filters; the column names are fixed, only the values
	// come from the request and they are always bound as parameters.
	filters := []struct{ column, value string }{
		{"content_type", c.Query("content_type")},
		{"domain", c.Query("domain")},
	}
	for _, f := range filters {
		if f.value != "" {
			scope = scope.Where(f.column+" = ?", f.value)
		}
	}

	var records []models.ScrapedContent
	result := scope.Order("last_scraped DESC").Limit(pageSize).Find(&records)
	if result.Error != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to fetch scraped content"})
		return
	}

	payload := gin.H{
		"content": records,
		"limit":   pageSize,
	}
	c.JSON(http.StatusOK, payload)
}

// DeleteScrapingJob removes a scraping job owned by the calling user.
// Jobs whose status is "processing" are refused with 400; an unknown id
// answers 404 and a failed delete answers 500.
func (h *WebScrapingHandler) DeleteScrapingJob(c *gin.Context) {
	owner := c.GetUint("user_id")

	id, parseErr := strconv.ParseUint(c.Param("id"), 10, 32)
	if parseErr != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid job ID"})
		return
	}

	var job models.ScrapingJob
	lookup := h.db.Where("id = ? AND user_id = ?", id, owner).First(&job)
	if lookup.Error != nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "Scraping job not found"})
		return
	}

	// A job that is mid-scrape must not be deleted.
	if job.Status == "processing" {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Cannot delete job that is currently processing"})
		return
	}

	if delErr := h.db.Delete(&job).Error; delErr != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to delete scraping job"})
		return
	}

	c.JSON(http.StatusOK, gin.H{"message": "Scraping job deleted successfully"})
}

// DeleteScrapedContent removes a scraped-content record owned by the calling
// user. It answers 400 for a non-numeric id, 404 when the record cannot be
// found and 500 when the delete itself fails.
func (h *WebScrapingHandler) DeleteScrapedContent(c *gin.Context) {
	owner := c.GetUint("user_id")

	id, parseErr := strconv.ParseUint(c.Param("id"), 10, 32)
	if parseErr != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid content ID"})
		return
	}

	var record models.ScrapedContent
	lookup := h.db.Where("id = ? AND user_id = ?", id, owner).First(&record)
	if lookup.Error != nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "Scraped content not found"})
		return
	}

	if delErr := h.db.Delete(&record).Error; delErr != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to delete scraped content"})
		return
	}

	c.JSON(http.StatusOK, gin.H{"message": "Scraped content deleted successfully"})
}

// SearchScrapedContent performs a case-insensitive substring search over the
// title, content and description of the calling user's scraped content and
// returns the matches, most recently scraped first.
//
// Query parameters:
//   - q:            required search text, matched literally
//   - content_type: optional exact-match filter
//   - domain:       optional exact-match filter
//   - limit:        page size in the range 1..100; anything else falls back to 20
func (h *WebScrapingHandler) SearchScrapedContent(c *gin.Context) {
	userID := c.GetUint("user_id")

	query := c.Query("q")
	if query == "" {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Search query is required"})
		return
	}

	contentType := c.Query("content_type")
	domain := c.Query("domain")
	limit := 20
	if l := c.Query("limit"); l != "" {
		if parsed, err := strconv.Atoi(l); err == nil && parsed > 0 && parsed <= 100 {
			limit = parsed
		}
	}

	// Escape the LIKE metacharacters so the user's text is matched literally:
	// without this a query such as "100%" or "a_b" is treated as a wildcard
	// pattern. ILIKE is PostgreSQL syntax, where backslash is the default
	// escape character for LIKE patterns.
	escaped := strings.NewReplacer(`\`, `\\`, `%`, `\%`, `_`, `\_`).Replace(query)
	pattern := "%" + escaped + "%"

	// Build search query
	dbQuery := h.db.Where("user_id = ?", userID)

	// Search in title, content, and description. The three alternatives are
	// passed as one grouped condition so the OR cannot escape the user_id
	// restriction above.
	searchCondition := h.db.Where("title ILIKE ?", pattern).
		Or("content ILIKE ?", pattern).
		Or("description ILIKE ?", pattern)

	dbQuery = dbQuery.Where(searchCondition)

	if contentType != "" {
		dbQuery = dbQuery.Where("content_type = ?", contentType)
	}
	if domain != "" {
		dbQuery = dbQuery.Where("domain = ?", domain)
	}

	var content []models.ScrapedContent
	if err := dbQuery.Order("last_scraped DESC").Limit(limit).Find(&content).Error; err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to search scraped content"})
		return
	}

	c.JSON(http.StatusOK, gin.H{
		"content": content,
		"query":   query,
		"limit":   limit,
	})
}

// Helper functions

// processScrapingJob runs one scraping job to completion. It is started in its
// own goroutine by CreateScrapingJob, so every failure is handled here: the
// job row is moved to "processing", the page is scraped, and the row is then
// marked "completed" (with the scraped content id and progress 100) or
// "failed" (with the error message). Persistence errors are logged because
// there is no caller to return them to.
func (h *WebScrapingHandler) processScrapingJob(jobID uint) {
	var job models.ScrapingJob
	if err := h.db.First(&job, jobID).Error; err != nil {
		log.Printf("scraping job %d: loading job: %v", jobID, err)
		return
	}

	// persist writes the current in-memory job state and logs a failure
	// instead of silently dropping it.
	persist := func(stage string) {
		if err := h.db.Save(&job).Error; err != nil {
			log.Printf("scraping job %d: saving %s state: %v", jobID, stage, err)
		}
	}

	// markFailed records a terminal failure on the job.
	markFailed := func(message string) {
		completedAt := time.Now()
		job.Status = "failed"
		job.ErrorMessage = message
		job.CompletedAt = &completedAt
		persist("failed")
	}

	// This function runs outside any request, so an unrecovered panic would
	// terminate the whole process. Turn it into a failed job instead.
	defer func() {
		if r := recover(); r != nil {
			log.Printf("scraping job %d: panic while scraping: %v", jobID, r)
			markFailed(fmt.Sprintf("internal error: %v", r))
		}
	}()

	// Update job status to processing
	now := time.Now()
	job.Status = "processing"
	job.StartedAt = &now
	persist("processing")

	// Perform the scraping
	scrapedContent, err := h.scrapeWebPage(job.URL, job)
	if err != nil {
		markFailed(err.Error())
		return
	}

	// Update job with results
	completedAt := time.Now()
	job.Status = "completed"
	job.ScrapedContentID = &scrapedContent.ID
	job.Progress = 100
	job.CompletedAt = &completedAt
	persist("completed")
}

// scrapeWebPage fetches pageURL with a colly collector, extracts the parts of
// the page selected by the job's Extract*/GenerateSummary flags, and stores
// the result as a ScrapedContent row plus its image, link and video rows.
//
// It returns the stored content, or an error when the URL cannot be parsed,
// the page cannot be fetched (including HTTP error responses reported through
// the collector's error callback), or the content row cannot be saved.
// Failures to save the related rows or the quality score are logged and do
// not fail the scrape.
func (h *WebScrapingHandler) scrapeWebPage(pageURL string, job models.ScrapingJob) (*models.ScrapedContent, error) {
	parsedURL, err := url.Parse(pageURL)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}
	pageHost := strings.ToLower(parsedURL.Hostname())

	// resolve turns a reference found in the page (href/src) into an absolute
	// URL relative to the page URL. It handles path-relative, root-relative
	// and protocol-relative ("//host/x") references and leaves absolute ones
	// (including data: URIs) untouched. It returns "" for blank or
	// unparseable references.
	resolve := func(ref string) string {
		ref = strings.TrimSpace(ref)
		if ref == "" {
			return ""
		}
		abs, parseErr := parsedURL.Parse(ref)
		if parseErr != nil {
			return ""
		}
		return abs.String()
	}

	// isInternal reports whether host is the page's host or one of its
	// subdomains. Comparing hosts (rather than searching the whole URL for the
	// host text) keeps links such as https://other.site/?ref=<page host> from
	// being classified as internal.
	isInternal := func(host string) bool {
		host = strings.ToLower(host)
		if host == "" || pageHost == "" {
			return false
		}
		return host == pageHost || strings.HasSuffix(host, "."+pageHost)
	}

	// truncate shortens s to at most maxRunes runes and appends "..." when it
	// had to cut. Cutting on rune boundaries keeps the result valid UTF-8.
	truncate := func(s string, maxRunes int) string {
		seen := 0
		for i := range s {
			if seen == maxRunes {
				return s[:i] + "..."
			}
			seen++
		}
		return s
	}

	// logSaveErr reports a non-fatal persistence failure.
	logSaveErr := func(what string, saveErr error) {
		if saveErr != nil {
			log.Printf("scraping %s: saving %s: %v", pageURL, what, saveErr)
		}
	}

	// Create a new collector
	c := colly.NewCollector(
		colly.AllowURLRevisit(),
		colly.Async(true),
	)

	// Set up content extraction variables. Only one page is visited, and
	// c.Wait() below is called before any of these are read.
	var title, description, content string
	var keywords []string
	var images []models.ScrapedImage
	var links []models.ScrapedLink
	var videos []models.ScrapedVideo
	var fetchErr error

	// Extract title
	c.OnHTML("title", func(e *colly.HTMLElement) {
		title = strings.TrimSpace(e.Text)
	})

	// Extract meta description (first one wins)
	c.OnHTML("meta[name='description']", func(e *colly.HTMLElement) {
		if description == "" {
			description = e.Attr("content")
		}
	})

	// Extract meta keywords (first non-empty list wins); blank entries such
	// as the one produced by a trailing comma are dropped.
	c.OnHTML("meta[name='keywords']", func(e *colly.HTMLElement) {
		if len(keywords) > 0 {
			return
		}
		for _, kw := range strings.Split(e.Attr("content"), ",") {
			if kw = strings.TrimSpace(kw); kw != "" {
				keywords = append(keywords, kw)
			}
		}
	})

	// Extract main content
	c.OnHTML("article, main, .content, .post-content, .entry-content", func(e *colly.HTMLElement) {
		content = strings.TrimSpace(e.Text)
	})

	// Fallback to body content if no specific content found
	c.OnHTML("body", func(e *colly.HTMLElement) {
		if content == "" {
			content = strings.TrimSpace(e.Text)
		}
	})

	// Extract images if requested
	if job.ExtractImages {
		c.OnHTML("img", func(e *colly.HTMLElement) {
			src := resolve(e.Attr("src"))
			if src == "" {
				return
			}
			images = append(images, models.ScrapedImage{
				URL:         src,
				AltText:     e.Attr("alt"),
				Format:      h.getImageFormat(src),
				IsMainImage: false,
			})
		})
	}

	// Extract links if requested; links without visible text are skipped.
	if job.ExtractLinks {
		c.OnHTML("a[href]", func(e *colly.HTMLElement) {
			text := strings.TrimSpace(e.Text)
			href := resolve(e.Attr("href"))
			if href == "" || text == "" {
				return
			}

			linkHost := h.getDomainFromURL(href)
			linkType := "external"
			if isInternal(linkHost) {
				linkType = "internal"
			}

			links = append(links, models.ScrapedLink{
				URL:      href,
				Text:     text,
				LinkType: linkType,
				Domain:   linkHost,
			})
		})
	}

	// Extract videos if requested
	if job.ExtractVideos {
		c.OnHTML("iframe[src], video source", func(e *colly.HTMLElement) {
			src := resolve(e.Attr("src"))
			if src == "" {
				return
			}
			platform := h.getVideoPlatform(src)
			videos = append(videos, models.ScrapedVideo{
				URL:      src,
				Title:    e.Attr("title"),
				Platform: platform,
				VideoID:  h.getVideoID(src, platform),
			})
		})
	}

	// Remember the fetch error so it can be returned after Wait. Only logging
	// it would let a page that could not be fetched be stored as successfully
	// scraped placeholder content.
	c.OnError(func(_ *colly.Response, scrapeErr error) {
		fetchErr = scrapeErr
	})

	// Start scraping
	if err := c.Visit(pageURL); err != nil {
		return nil, fmt.Errorf("failed to visit page: %w", err)
	}

	c.Wait()

	if fetchErr != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", fetchErr)
	}

	// Clean and process content
	if content == "" {
		content = "No content could be extracted from this page."
	}

	if description == "" {
		description = truncate(content, 200)
	}

	// Generate keywords if none found
	if len(keywords) == 0 && job.ExtractMetadata {
		keywords = h.extractKeywordsFromContent(content)
	}

	wordCount := len(strings.Fields(content))

	// Create the scraped content
	scrapedContent := models.ScrapedContent{
		UserID:       job.UserID,
		URL:          pageURL,
		Domain:       parsedURL.Hostname(),
		Title:        title,
		Description:  description,
		Content:      content,
		Keywords:     keywords,
		ContentType:  h.detectContentType(title, content),
		WordCount:    wordCount,
		ReadingTime:  h.estimateReadingTime(wordCount),
		QualityScore: 0, // Will be calculated below
		Status:       "completed",
		LastScraped:  time.Now(),
	}

	// Generate summary if requested
	if job.GenerateSummary {
		scrapedContent.Summary = h.generateSummary(content)
	}

	// Create the content in database
	if err := h.db.Create(&scrapedContent).Error; err != nil {
		return nil, fmt.Errorf("failed to save scraped content: %w", err)
	}

	// Save related content. These rows are secondary to the page itself, so a
	// failure is logged rather than failing the whole scrape.
	if len(images) > 0 {
		for i := range images {
			images[i].ScrapedContentID = scrapedContent.ID
		}
		logSaveErr("images", h.db.Create(&images).Error)
	}

	if len(links) > 0 {
		for i := range links {
			links[i].ScrapedContentID = scrapedContent.ID
		}
		logSaveErr("links", h.db.Create(&links).Error)
	}

	if len(videos) > 0 {
		for i := range videos {
			videos[i].ScrapedContentID = scrapedContent.ID
		}
		logSaveErr("videos", h.db.Create(&videos).Error)
	}

	// Calculate and save quality score.
	// NOTE(review): the collected images are saved separately and never
	// attached to scrapedContent, so the image bonus in calculateQualityScore
	// cannot apply here — confirm the Images field type and attach them if
	// that bonus is intended.
	scrapedContent.QualityScore = h.calculateQualityScore(scrapedContent)
	logSaveErr("quality score", h.db.Save(&scrapedContent).Error)

	return &scrapedContent, nil
}

// extractTextFromHTML returns the text of an HTML fragment: everything that
// looks like a tag (<...>) is dropped, surrounding whitespace is trimmed and
// each remaining run of whitespace is collapsed to a single space.
func (h *WebScrapingHandler) extractTextFromHTML(html string) string {
	tagPattern := regexp.MustCompile(`<[^>]*>`)
	spacePattern := regexp.MustCompile(`\s+`)

	withoutTags := tagPattern.ReplaceAllString(html, "")
	trimmed := strings.TrimSpace(withoutTags)
	return spacePattern.ReplaceAllString(trimmed, " ")
}

// estimateReadingTime converts a word count into whole minutes of reading at
// 225 words per minute (integer division), never returning less than one
// minute.
func (h *WebScrapingHandler) estimateReadingTime(wordCount int) int {
	// Midpoint of the commonly quoted 200-250 words-per-minute range.
	const wordsPerMinute = 225

	if minutes := wordCount / wordsPerMinute; minutes >= 1 {
		return minutes
	}
	return 1
}

// calculateQualityScore rates a piece of scraped content on a 0-100 scale.
// Every record starts at 50 and earns fixed bonuses for having a title, more
// than 100 words, more than 500 words, a description, at least one image and
// at least one keyword. The total is capped at 100.
func (h *WebScrapingHandler) calculateQualityScore(content models.ScrapedContent) float64 {
	const (
		baseScore = 50.0
		maxScore  = 100.0
	)

	bonuses := []struct {
		earned bool
		points float64
	}{
		{content.Title != "", 10},
		{content.WordCount > 100, 10},
		{content.WordCount > 500, 10},
		{content.Description != "", 10},
		{len(content.Images) > 0, 5},
		{len(content.Keywords) > 0, 5},
	}

	total := baseScore
	for _, bonus := range bonuses {
		if bonus.earned {
			total += bonus.points
		}
	}

	if total > maxScore {
		return maxScore
	}
	return total
}

// Helper methods for web scraping

// getImageFormat derives an image format name from the file extension in an
// image URL. Matching is case-insensitive and ignores any query string or
// fragment, so "pic.JPG?w=100" is recognised as "jpg". It returns one of
// "jpg", "png", "gif", "svg", "webp", or "unknown" when the extension is
// missing or not recognised.
func (h *WebScrapingHandler) getImageFormat(url string) string {
	lower := strings.ToLower(url)

	// Drop "?query" and "#fragment": image URLs commonly carry resize or
	// cache-busting parameters after the file name, which would otherwise
	// hide the extension from the suffix checks below.
	if cut := strings.IndexAny(lower, "?#"); cut >= 0 {
		lower = lower[:cut]
	}

	switch {
	case strings.HasSuffix(lower, ".jpg"), strings.HasSuffix(lower, ".jpeg"):
		return "jpg"
	case strings.HasSuffix(lower, ".png"):
		return "png"
	case strings.HasSuffix(lower, ".gif"):
		return "gif"
	case strings.HasSuffix(lower, ".svg"):
		return "svg"
	case strings.HasSuffix(lower, ".webp"):
		return "webp"
	}
	return "unknown"
}

// getDomainFromURL returns the host name (without port) of urlStr, or an
// empty string when urlStr cannot be parsed as a URL.
func (h *WebScrapingHandler) getDomainFromURL(urlStr string) string {
	parsed, parseErr := url.Parse(urlStr)
	if parseErr != nil {
		return ""
	}
	return parsed.Hostname()
}

// getVideoPlatform names the video platform a URL belongs to by looking for a
// known domain anywhere in the lower-cased URL. It returns "youtube", "vimeo",
// "twitch", or "unknown" when none of them is present.
func (h *WebScrapingHandler) getVideoPlatform(urlStr string) string {
	lowered := strings.ToLower(urlStr)

	switch {
	case strings.Contains(lowered, "youtube.com"), strings.Contains(lowered, "youtu.be"):
		return "youtube"
	case strings.Contains(lowered, "vimeo.com"):
		return "vimeo"
	case strings.Contains(lowered, "twitch.tv"):
		return "twitch"
	default:
		return "unknown"
	}
}

// getVideoID extracts the platform-specific video identifier from a video URL.
//
// Supported forms:
//   - youtube: youtube.com/watch?v=ID (also when v is not the first
//     parameter), youtube.com/embed/ID, youtube.com/shorts/ID,
//     youtube.com/v/ID and youtu.be/ID
//   - vimeo:   vimeo.com/ID and player.vimeo.com/video/ID
//
// It returns "" for any other platform or when no identifier can be found.
func (h *WebScrapingHandler) getVideoID(urlStr, platform string) string {
	// after returns the text following the first occurrence of marker in
	// urlStr, cut at the first character contained in stops. The boolean is
	// false when marker does not occur.
	after := func(marker, stops string) (string, bool) {
		start := strings.Index(urlStr, marker)
		if start < 0 {
			return "", false
		}
		rest := urlStr[start+len(marker):]
		if end := strings.IndexAny(rest, stops); end >= 0 {
			rest = rest[:end]
		}
		return rest, true
	}

	switch platform {
	case "youtube":
		// Embedded players (the iframes this scraper collects) use the
		// /embed/ form, so it must be recognised alongside watch links.
		markers := []string{
			"youtube.com/watch?v=",
			"youtube.com/embed/",
			"youtube.com/shorts/",
			"youtube.com/v/",
			"youtu.be/",
		}
		for _, marker := range markers {
			if id, ok := after(marker, "?&#/"); ok {
				return id
			}
		}
		// Watch link whose v parameter is not the first one.
		if strings.Contains(urlStr, "youtube.com/watch?") {
			if id, ok := after("&v=", "&#"); ok {
				return id
			}
		}
	case "vimeo":
		if id, ok := after("vimeo.com/", "?#"); ok {
			// player.vimeo.com URLs put the id under a "video/" path prefix.
			return strings.TrimPrefix(id, "video/")
		}
	}
	return ""
}

// extractKeywordsFromContent picks up to ten keywords from free text using
// simple word-frequency counting (no NLP).
//
// The text is lower-cased and split on whitespace; surrounding punctuation is
// stripped from each word so "scraper," and "scraper" count as the same word.
// Words of three bytes or fewer, and words rejected by isCommonWord, are
// ignored. Only words that occur more than once qualify. The result is
// ordered by descending frequency, with ties broken alphabetically so the
// same input always yields the same output. It returns nil when no word
// qualifies.
func (h *WebScrapingHandler) extractKeywordsFromContent(content string) []string {
	const (
		maxKeywords = 10
		punctuation = ".,;:!?\"'()[]{}<>"
	)

	// Count word frequency
	freq := make(map[string]int)
	for _, raw := range strings.Fields(strings.ToLower(content)) {
		word := strings.Trim(raw, punctuation)
		if len(word) > 3 && !h.isCommonWord(word) {
			freq[word]++
		}
	}

	// Select the best remaining word up to maxKeywords times. This needs at
	// most maxKeywords passes over the map instead of sorting every candidate,
	// and the alphabetical tie-break makes the outcome independent of map
	// iteration order.
	var keywords []string
	for len(keywords) < maxKeywords {
		best, bestCount := "", 1 // starting at 1 excludes words seen only once
		for word, count := range freq {
			if count > bestCount || (count == bestCount && best != "" && word < best) {
				best, bestCount = word, count
			}
		}
		if best == "" {
			break
		}
		keywords = append(keywords, best)
		delete(freq, best)
	}

	return keywords
}

// isCommonWord reports whether word is one of a fixed list of very frequent
// English words that should not be used as keywords. The comparison is exact,
// so callers are expected to pass lower-case input.
func (h *WebScrapingHandler) isCommonWord(word string) bool {
	switch word {
	case "the", "and", "for", "are", "but", "not", "you", "all", "can", "had",
		"her", "was", "one", "our", "out", "day", "get", "has", "him", "his",
		"how", "man", "new", "now", "old", "see", "two", "way", "who", "boy",
		"did", "its", "let", "put", "say", "she", "too", "use",
		"with", "have", "this", "that", "from", "they", "been", "call", "come",
		"each", "find", "give", "hand", "keep", "know", "last", "leave", "life",
		"long", "made", "many", "move", "must", "name", "need", "only", "over",
		"part", "said", "same", "show", "tell", "time", "turn", "well", "went",
		"were", "what", "will", "your",
		"about", "after", "again", "before", "being", "below", "could", "every",
		"first", "found", "great", "house", "large", "never", "other", "place",
		"right", "small", "sound", "still", "their", "there", "think", "under",
		"water", "where", "which", "world", "would", "write", "years":
		return true
	default:
		return false
	}
}

// detectContentType classifies a page from substrings of its lower-cased
// title and content. The categories are tried in a fixed order and the first
// match wins: "tutorial", "documentation", "news", "blog"; anything else is
// an "article".
func (h *WebScrapingHandler) detectContentType(title, content string) string {
	loweredTitle := strings.ToLower(title)
	loweredBody := strings.ToLower(content)

	inTitle := func(needle string) bool { return strings.Contains(loweredTitle, needle) }
	inBody := func(needle string) bool { return strings.Contains(loweredBody, needle) }

	switch {
	case inTitle("tutorial") || inTitle("how to") || inBody("step by step"):
		return "tutorial"
	case inTitle("documentation") || inTitle("api") || inBody("function"):
		return "documentation"
	case inTitle("news") || inBody("breaking") || inBody("report"):
		return "news"
	case inTitle("blog") || inBody("posted") || inBody("opinion"):
		return "blog"
	default:
		return "article"
	}
}

// generateSummary builds a short summary from the leading sentences of
// content. The text is split on "."; the first two pieces are considered, or
// the first three when the split yields more than three pieces. Blank pieces
// are skipped, and each kept piece is trimmed and terminated with a period.
func (h *WebScrapingHandler) generateSummary(content string) string {
	pieces := strings.Split(content, ".")

	// How many leading pieces to look at.
	take := 2
	switch n := len(pieces); {
	case n < 2:
		take = n
	case n > 3:
		take = 3
	}

	kept := make([]string, 0, take)
	for _, piece := range pieces[:take] {
		if trimmed := strings.TrimSpace(piece); trimmed != "" {
			kept = append(kept, trimmed)
		}
	}
	if len(kept) == 0 {
		return ""
	}

	return strings.Join(kept, ". ") + "."
}