package handlers

import (
	"fmt"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/gin-gonic/gin"
	"github.com/gocolly/colly/v2"
	"github.com/trackeep/backend/models"
	"gorm.io/gorm"
)

const (
	// scrapingDefaultLimit is the page size used when the "limit" query
	// parameter is missing or invalid.
	scrapingDefaultLimit = 20
	// scrapingMaxLimit is the largest page size a client may request.
	scrapingMaxLimit = 100
	// scrapingDescriptionRunes is the maximum number of runes kept when a
	// description is derived from the page content.
	scrapingDescriptionRunes = 200
	// scrapingWordsPerMinute is the assumed reading speed (average of the
	// commonly quoted 200-250 words per minute).
	scrapingWordsPerMinute = 225
	// scrapingMaxKeywords is the maximum number of generated keywords.
	scrapingMaxKeywords = 10
	// scrapingWordPunctuation is trimmed from both ends of a word before it
	// is counted as a keyword candidate.
	scrapingWordPunctuation = ".,;:!?\"'()[]{}<>“”‘’"
)

var (
	// Compiled once at package scope instead of on every call.
	scrapingHTMLTagRe    = regexp.MustCompile(`<[^>]*>`)
	scrapingWhitespaceRe = regexp.MustCompile(`\s+`)

	// scrapingLikeEscaper escapes the LIKE/ILIKE metacharacters so that user
	// input is matched literally (backslash is the default escape character).
	scrapingLikeEscaper = strings.NewReplacer(`\`, `\\`, `%`, `\%`, `_`, `\_`)

	// scrapingCommonWords is the set of words considered too common to be
	// useful keywords.
	scrapingCommonWords = func() map[string]struct{} {
		words := []string{
			"the", "and", "for", "are", "but", "not", "you", "all", "can", "had",
			"her", "was", "one", "our", "out", "day", "get", "has", "him", "his",
			"how", "man", "new", "now", "old", "see", "two", "way", "who", "boy",
			"did", "its", "let", "put", "say", "she", "too", "use", "with", "have",
			"this", "that", "from", "they", "been", "call", "come", "each", "find",
			"give", "hand", "keep", "know", "last", "leave", "life", "long", "made",
			"many", "move", "must", "name", "need", "only", "over", "part", "said",
			"same", "show", "tell", "time", "turn", "well", "went", "were", "what",
			"will", "your", "about", "after", "again", "before", "being", "below",
			"could", "every", "first", "found", "great", "house", "large", "never",
			"other", "place", "right", "small", "sound", "still", "their", "there",
			"think", "under", "water", "where", "which", "world", "would", "write",
			"years",
		}
		set := make(map[string]struct{}, len(words))
		for _, w := range words {
			set[w] = struct{}{}
		}
		return set
	}()
)

// WebScrapingHandler handles web scraping operations.
type WebScrapingHandler struct {
	db *gorm.DB
}

// NewWebScrapingHandler creates a new web scraping handler backed by db.
func NewWebScrapingHandler(db *gorm.DB) *WebScrapingHandler {
	return &WebScrapingHandler{db: db}
}

// CreateScrapingJob creates a new web scraping job for the authenticated user
// and starts processing it in a background goroutine.
//
// It responds with 400 when the body cannot be bound or the URL is not an
// absolute http/https URL, 500 when the job cannot be stored, and 201 with the
// stored job otherwise.
func (h *WebScrapingHandler) CreateScrapingJob(c *gin.Context) {
	userID := c.GetUint("user_id")

	var req struct {
		URL             string `json:"url" binding:"required"`
		JobType         string `json:"job_type"`
		Priority        string `json:"priority"`
		ExtractImages   bool   `json:"extract_images"`
		ExtractLinks    bool   `json:"extract_links"`
		ExtractVideos   bool   `json:"extract_videos"`
		GenerateSummary bool   `json:"generate_summary"`
		DownloadImages  bool   `json:"download_images"`
		ExtractMetadata bool   `json:"extract_metadata"`
	}

	if err := c.ShouldBindJSON(&req); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
		return
	}

	// Only absolute http(s) URLs are scrapable. ParseRequestURI alone would
	// also accept bare paths and schemes such as file://.
	// NOTE(review): hosts resolving to internal addresses are not rejected
	// here; consider an allow/deny list if this service can reach private
	// networks.
	target, err := url.ParseRequestURI(req.URL)
	if err != nil || target.Host == "" || (target.Scheme != "http" && target.Scheme != "https") {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid URL format"})
		return
	}

	// Set defaults.
	if req.JobType == "" {
		req.JobType = "full_scrape"
	}
	if req.Priority == "" {
		req.Priority = "normal"
	}

	job := models.ScrapingJob{
		UserID:          userID,
		URL:             req.URL,
		JobType:         req.JobType,
		Priority:        req.Priority,
		ExtractImages:   req.ExtractImages,
		ExtractLinks:    req.ExtractLinks,
		ExtractVideos:   req.ExtractVideos,
		GenerateSummary: req.GenerateSummary,
		DownloadImages:  req.DownloadImages,
		ExtractMetadata: req.ExtractMetadata,
		Status:          "pending",
	}

	if err := h.db.Create(&job).Error; err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create scraping job"})
		return
	}

	// Start processing the job asynchronously; the response returns the job
	// in its initial "pending" state.
	go h.processScrapingJob(job.ID)

	c.JSON(http.StatusCreated, job)
}

// listLimit reads the "limit" query parameter. It returns the parsed value
// when it lies in [1, scrapingMaxLimit] and scrapingDefaultLimit otherwise.
func (h *WebScrapingHandler) listLimit(c *gin.Context) int {
	raw := c.Query("limit")
	if raw == "" {
		return scrapingDefaultLimit
	}
	parsed, err := strconv.Atoi(raw)
	if err != nil || parsed <= 0 || parsed > scrapingMaxLimit {
		return scrapingDefaultLimit
	}
	return parsed
}

// GetScrapingJobs returns the user's scraping jobs, newest first, optionally
// filtered by the "status" query parameter and capped by "limit".
func (h *WebScrapingHandler) GetScrapingJobs(c *gin.Context) {
	userID := c.GetUint("user_id")
	status := c.Query("status")
	limit := h.listLimit(c)

	query := h.db.Where("user_id = ?", userID)
	if status != "" {
		query = query.Where("status = ?", status)
	}

	var jobs []models.ScrapingJob
	if err := query.Order("created_at DESC").Limit(limit).Find(&jobs).Error; err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to fetch scraping jobs"})
		return
	}

	c.JSON(http.StatusOK, gin.H{
		"jobs":  jobs,
		"limit": limit,
	})
}

// GetScrapingJob returns a single scraping job owned by the user, with its
// scraped content preloaded.
func (h *WebScrapingHandler) GetScrapingJob(c *gin.Context) {
	userID := c.GetUint("user_id")
	jobID, err := strconv.ParseUint(c.Param("id"), 10, 32)
	if err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid job ID"})
		return
	}

	var job models.ScrapingJob
	if err := h.db.Where("id = ? AND user_id = ?", jobID, userID).
		Preload("ScrapedContent").
		First(&job).Error; err != nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "Scraping job not found"})
		return
	}

	c.JSON(http.StatusOK, job)
}

// GetScrapedContent returns a single scraped content record owned by the
// user, with its images, links, videos and tags preloaded.
func (h *WebScrapingHandler) GetScrapedContent(c *gin.Context) {
	userID := c.GetUint("user_id")
	contentID, err := strconv.ParseUint(c.Param("id"), 10, 32)
	if err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid content ID"})
		return
	}

	var content models.ScrapedContent
	if err := h.db.Where("id = ? AND user_id = ?", contentID, userID).
		Preload("Images").
		Preload("Links").
		Preload("Videos").
		Preload("Tags").
		First(&content).Error; err != nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "Scraped content not found"})
		return
	}

	c.JSON(http.StatusOK, content)
}

// GetScrapedContentList returns the user's scraped content, most recently
// scraped first, optionally filtered by "content_type" and "domain" and
// capped by "limit".
func (h *WebScrapingHandler) GetScrapedContentList(c *gin.Context) {
	userID := c.GetUint("user_id")
	contentType := c.Query("content_type")
	domain := c.Query("domain")
	limit := h.listLimit(c)

	query := h.db.Where("user_id = ?", userID)
	if contentType != "" {
		query = query.Where("content_type = ?", contentType)
	}
	if domain != "" {
		query = query.Where("domain = ?", domain)
	}

	var content []models.ScrapedContent
	if err := query.Order("last_scraped DESC").Limit(limit).Find(&content).Error; err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to fetch scraped content"})
		return
	}

	c.JSON(http.StatusOK, gin.H{
		"content": content,
		"limit":   limit,
	})
}

// DeleteScrapingJob deletes a scraping job owned by the user. Jobs whose
// status is "processing" cannot be deleted.
func (h *WebScrapingHandler) DeleteScrapingJob(c *gin.Context) {
	userID := c.GetUint("user_id")
	jobID, err := strconv.ParseUint(c.Param("id"), 10, 32)
	if err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid job ID"})
		return
	}

	var job models.ScrapingJob
	if err := h.db.Where("id = ? AND user_id = ?", jobID, userID).First(&job).Error; err != nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "Scraping job not found"})
		return
	}

	// Only allow deletion of pending, completed, or failed jobs.
	if job.Status == "processing" {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Cannot delete job that is currently processing"})
		return
	}

	if err := h.db.Delete(&job).Error; err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to delete scraping job"})
		return
	}

	c.JSON(http.StatusOK, gin.H{"message": "Scraping job deleted successfully"})
}

// DeleteScrapedContent deletes a scraped content record owned by the user.
func (h *WebScrapingHandler) DeleteScrapedContent(c *gin.Context) {
	userID := c.GetUint("user_id")
	contentID, err := strconv.ParseUint(c.Param("id"), 10, 32)
	if err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid content ID"})
		return
	}

	var content models.ScrapedContent
	if err := h.db.Where("id = ? AND user_id = ?", contentID, userID).First(&content).Error; err != nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "Scraped content not found"})
		return
	}

	if err := h.db.Delete(&content).Error; err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to delete scraped content"})
		return
	}

	c.JSON(http.StatusOK, gin.H{"message": "Scraped content deleted successfully"})
}

// SearchScrapedContent searches the user's scraped content for the "q" query
// parameter in title, content and description (case-insensitive substring
// match), optionally filtered by "content_type" and "domain".
func (h *WebScrapingHandler) SearchScrapedContent(c *gin.Context) {
	userID := c.GetUint("user_id")
	term := c.Query("q")
	if term == "" {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Search query is required"})
		return
	}

	contentType := c.Query("content_type")
	domain := c.Query("domain")
	limit := h.listLimit(c)

	// Escape LIKE metacharacters so that e.g. "100%" is matched literally
	// rather than as a wildcard.
	pattern := "%" + scrapingLikeEscaper.Replace(term) + "%"

	// Group the three OR-ed conditions so they are AND-ed with the user
	// filter as a unit.
	searchCondition := h.db.Where("title ILIKE ?", pattern).
		Or("content ILIKE ?", pattern).
		Or("description ILIKE ?", pattern)

	dbQuery := h.db.Where("user_id = ?", userID).Where(searchCondition)
	if contentType != "" {
		dbQuery = dbQuery.Where("content_type = ?", contentType)
	}
	if domain != "" {
		dbQuery = dbQuery.Where("domain = ?", domain)
	}

	var content []models.ScrapedContent
	if err := dbQuery.Order("last_scraped DESC").Limit(limit).Find(&content).Error; err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to search scraped content"})
		return
	}

	c.JSON(http.StatusOK, gin.H{
		"content": content,
		"query":   term,
		"limit":   limit,
	})
}

// Helper functions

// saveJob persists job and logs a failure. It runs in the background worker
// where there is no request to report the error to.
func (h *WebScrapingHandler) saveJob(job *models.ScrapingJob) {
	if err := h.db.Save(job).Error; err != nil {
		log.Printf("scraping job %d: saving status %q: %v", job.ID, job.Status, err)
	}
}

// processScrapingJob runs the scraping job with the given ID: it marks the
// job "processing", scrapes the page, and records either "completed" (with
// the scraped content ID and 100% progress) or "failed" (with the error
// message). It is intended to be run in its own goroutine.
func (h *WebScrapingHandler) processScrapingJob(jobID uint) {
	var job models.ScrapingJob
	if err := h.db.First(&job, jobID).Error; err != nil {
		log.Printf("scraping job %d: loading job: %v", jobID, err)
		return
	}

	// Update job status to processing.
	startedAt := time.Now()
	job.Status = "processing"
	job.StartedAt = &startedAt
	h.saveJob(&job)

	// Perform the scraping.
	scrapedContent, err := h.scrapeWebPage(job.URL, job)

	completedAt := time.Now()
	job.CompletedAt = &completedAt
	if err != nil {
		job.Status = "failed"
		job.ErrorMessage = err.Error()
	} else {
		job.Status = "completed"
		job.ScrapedContentID = &scrapedContent.ID
		job.Progress = 100
	}
	h.saveJob(&job)
}

// resolveURL resolves ref against base and returns the absolute URL. It
// handles root-relative, path-relative and protocol-relative references;
// references that already carry a scheme are returned unchanged. If ref
// cannot be parsed it is returned as-is.
func (h *WebScrapingHandler) resolveURL(base *url.URL, ref string) string {
	parsed, err := url.Parse(strings.TrimSpace(ref))
	if err != nil {
		return ref
	}
	return base.ResolveReference(parsed).String()
}

// scrapeWebPage fetches pageURL, extracts title, description, keywords and
// main text, plus images, links and videos when the corresponding job flags
// are set, and stores the result.
//
// It returns the stored content, or an error when the URL is invalid, the
// page cannot be fetched, or the result cannot be saved. All database writes
// happen in one transaction, so a failure leaves no partial record behind.
func (h *WebScrapingHandler) scrapeWebPage(pageURL string, job models.ScrapingJob) (*models.ScrapedContent, error) {
	parsedURL, err := url.Parse(pageURL)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	collector := colly.NewCollector(
		colly.AllowURLRevisit(),
		colly.Async(true),
	)

	// State filled in by the collector callbacks below.
	var (
		title, description, content string
		keywords                    []string
		images                      []models.ScrapedImage
		links                       []models.ScrapedLink
		videos                      []models.ScrapedVideo
		fetchErr                    error
	)

	// Extract title.
	collector.OnHTML("title", func(e *colly.HTMLElement) {
		title = strings.TrimSpace(e.Text)
	})

	// Extract meta description (first one wins).
	collector.OnHTML("meta[name='description']", func(e *colly.HTMLElement) {
		if description == "" {
			description = e.Attr("content")
		}
	})

	// Extract meta keywords (first non-empty tag wins).
	collector.OnHTML("meta[name='keywords']", func(e *colly.HTMLElement) {
		if len(keywords) != 0 {
			return
		}
		raw := e.Attr("content")
		if raw == "" {
			return
		}
		keywords = strings.Split(raw, ",")
		for i, kw := range keywords {
			keywords[i] = strings.TrimSpace(kw)
		}
	})

	// Extract main content; a later matching element overwrites an earlier one.
	collector.OnHTML("article, main, .content, .post-content, .entry-content", func(e *colly.HTMLElement) {
		content = strings.TrimSpace(e.Text)
	})

	// Fall back to body content if no specific content was found.
	collector.OnHTML("body", func(e *colly.HTMLElement) {
		if content == "" {
			content = strings.TrimSpace(e.Text)
		}
	})

	// Extract images if requested.
	if job.ExtractImages {
		collector.OnHTML("img", func(e *colly.HTMLElement) {
			src := e.Attr("src")
			if src == "" {
				return
			}
			src = h.resolveURL(parsedURL, src)
			images = append(images, models.ScrapedImage{
				URL:         src,
				AltText:     e.Attr("alt"),
				Format:      h.getImageFormat(src),
				IsMainImage: false,
			})
		})
	}

	// Extract links if requested; links without visible text are skipped.
	if job.ExtractLinks {
		pageHost := parsedURL.Hostname()
		collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
			href := e.Attr("href")
			text := strings.TrimSpace(e.Text)
			if href == "" || text == "" {
				return
			}

			href = h.resolveURL(parsedURL, href)
			linkDomain := h.getDomainFromURL(href)

			// Compare hostnames rather than searching for the host anywhere
			// in the URL, which misclassified links that merely mention it.
			linkType := "external"
			if linkDomain != "" && strings.EqualFold(linkDomain, pageHost) {
				linkType = "internal"
			}

			links = append(links, models.ScrapedLink{
				URL:      href,
				Text:     text,
				LinkType: linkType,
				Domain:   linkDomain,
			})
		})
	}

	// Extract videos if requested.
	if job.ExtractVideos {
		collector.OnHTML("iframe[src], video source", func(e *colly.HTMLElement) {
			src := e.Attr("src")
			if src == "" {
				return
			}
			// Embeds are frequently protocol-relative ("//www.youtube.com/...").
			src = h.resolveURL(parsedURL, src)
			platform := h.getVideoPlatform(src)
			videos = append(videos, models.ScrapedVideo{
				URL:      src,
				Title:    e.Attr("title"),
				Platform: platform,
				VideoID:  h.getVideoID(src, platform),
			})
		})
	}

	// In async mode fetch failures are reported only through OnError, so
	// capture the error here and surface it after Wait.
	collector.OnError(func(_ *colly.Response, err error) {
		fetchErr = err
	})

	// Start scraping.
	if err := collector.Visit(pageURL); err != nil {
		return nil, fmt.Errorf("failed to visit page: %w", err)
	}
	collector.Wait()

	if fetchErr != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", fetchErr)
	}

	// Clean and process content.
	if content == "" {
		content = "No content could be extracted from this page."
	}

	if description == "" {
		description = content
		// Truncate on a rune boundary so multi-byte characters are never split.
		seen := 0
		for i := range description {
			if seen == scrapingDescriptionRunes {
				description = description[:i] + "..."
				break
			}
			seen++
		}
	}

	// Generate keywords if none were found.
	if len(keywords) == 0 && job.ExtractMetadata {
		keywords = h.extractKeywordsFromContent(content)
	}

	wordCount := len(strings.Fields(content))

	scrapedContent := models.ScrapedContent{
		UserID:      job.UserID,
		URL:         pageURL,
		Domain:      parsedURL.Hostname(),
		Title:       title,
		Description: description,
		Content:     content,
		Keywords:    keywords,
		ContentType: h.detectContentType(title, content),
		WordCount:   wordCount,
		ReadingTime: h.estimateReadingTime(wordCount),
		Status:      "completed",
		LastScraped: time.Now(),
	}

	// Generate summary if requested.
	if job.GenerateSummary {
		scrapedContent.Summary = h.generateSummary(content)
	}

	// The score depends only on fields already set, so compute it before the
	// insert and avoid a second write. Images are stored separately and are
	// not attached to scrapedContent, so their points are added here.
	scrapedContent.QualityScore = h.calculateQualityScore(scrapedContent)
	if len(images) > 0 {
		scrapedContent.QualityScore += 5
		if scrapedContent.QualityScore > 100 {
			scrapedContent.QualityScore = 100
		}
	}

	// Store the content and everything extracted from it atomically.
	err = h.db.Transaction(func(tx *gorm.DB) error {
		if err := tx.Create(&scrapedContent).Error; err != nil {
			return fmt.Errorf("failed to save scraped content: %w", err)
		}

		if len(images) > 0 {
			for i := range images {
				images[i].ScrapedContentID = scrapedContent.ID
			}
			if err := tx.Create(&images).Error; err != nil {
				return fmt.Errorf("failed to save scraped images: %w", err)
			}
		}

		if len(links) > 0 {
			for i := range links {
				links[i].ScrapedContentID = scrapedContent.ID
			}
			if err := tx.Create(&links).Error; err != nil {
				return fmt.Errorf("failed to save scraped links: %w", err)
			}
		}

		if len(videos) > 0 {
			for i := range videos {
				videos[i].ScrapedContentID = scrapedContent.ID
			}
			if err := tx.Create(&videos).Error; err != nil {
				return fmt.Errorf("failed to save scraped videos: %w", err)
			}
		}

		return nil
	})
	if err != nil {
		return nil, err
	}

	return &scrapedContent, nil
}

// extractTextFromHTML strips everything that looks like an HTML tag from html
// and collapses runs of whitespace into single spaces.
func (h *WebScrapingHandler) extractTextFromHTML(html string) string {
	text := scrapingHTMLTagRe.ReplaceAllString(html, "")
	text = strings.TrimSpace(text)
	return scrapingWhitespaceRe.ReplaceAllString(text, " ")
}

// estimateReadingTime estimates the reading time in whole minutes for the
// given word count. The result is never less than 1.
func (h *WebScrapingHandler) estimateReadingTime(wordCount int) int {
	minutes := wordCount / scrapingWordsPerMinute
	if minutes < 1 {
		return 1
	}
	return minutes
}

// calculateQualityScore calculates a quality score in [50, 100] for the
// content: a base of 50, plus 10 each for a title, more than 100 words, more
// than 500 words and a description, plus 5 each for images and keywords.
func (h *WebScrapingHandler) calculateQualityScore(content models.ScrapedContent) float64 {
	score := 50.0 // Base score.

	if content.Title != "" {
		score += 10
	}
	if content.WordCount > 100 {
		score += 10
	}
	if content.WordCount > 500 {
		score += 10
	}
	if content.Description != "" {
		score += 10
	}
	if len(content.Images) > 0 {
		score += 5
	}
	if len(content.Keywords) > 0 {
		score += 5
	}

	// Cap at 100.
	if score > 100 {
		score = 100
	}
	return score
}

// Helper methods for web scraping

// getImageFormat derives the image format ("jpg", "png", "gif", "svg" or
// "webp") from the file extension in url, ignoring any query string or
// fragment. It returns "unknown" for anything else.
func (h *WebScrapingHandler) getImageFormat(url string) string {
	lower := strings.ToLower(url)
	// Drop "?query" and "#fragment" so "a.png?v=2" is still recognised.
	if end := strings.IndexAny(lower, "?#"); end >= 0 {
		lower = lower[:end]
	}

	switch {
	case strings.HasSuffix(lower, ".jpg"), strings.HasSuffix(lower, ".jpeg"):
		return "jpg"
	case strings.HasSuffix(lower, ".png"):
		return "png"
	case strings.HasSuffix(lower, ".gif"):
		return "gif"
	case strings.HasSuffix(lower, ".svg"):
		return "svg"
	case strings.HasSuffix(lower, ".webp"):
		return "webp"
	}
	return "unknown"
}

// getDomainFromURL returns the hostname of urlStr, or "" if it cannot be
// parsed.
func (h *WebScrapingHandler) getDomainFromURL(urlStr string) string {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return ""
	}
	return parsed.Hostname()
}

// getVideoPlatform detects the video platform ("youtube", "vimeo" or
// "twitch") from urlStr, returning "unknown" when none matches.
func (h *WebScrapingHandler) getVideoPlatform(urlStr string) string {
	lower := strings.ToLower(urlStr)
	switch {
	case strings.Contains(lower, "youtube.com"), strings.Contains(lower, "youtu.be"):
		return "youtube"
	case strings.Contains(lower, "vimeo.com"):
		return "vimeo"
	case strings.Contains(lower, "twitch.tv"):
		return "twitch"
	}
	return "unknown"
}

// getVideoID extracts the video ID from urlStr for the given platform.
//
// For "youtube" it understands watch URLs (watch?v=ID), short URLs
// (youtu.be/ID) and embed URLs (/embed/ID). For "vimeo" it understands
// vimeo.com/ID and player.vimeo.com/video/ID. It returns "" when no ID can be
// found or the platform is not supported.
func (h *WebScrapingHandler) getVideoID(urlStr, platform string) string {
	// tail returns the text following the first occurrence of marker.
	tail := func(marker string) (string, bool) {
		idx := strings.Index(urlStr, marker)
		if idx < 0 {
			return "", false
		}
		return urlStr[idx+len(marker):], true
	}
	// leadingID cuts s at the first character that cannot be part of an ID.
	leadingID := func(s string) string {
		if end := strings.IndexAny(s, "?&#/"); end >= 0 {
			return s[:end]
		}
		return s
	}

	switch platform {
	case "youtube":
		for _, marker := range []string{"youtube.com/watch?v=", "youtu.be/", "/embed/"} {
			if rest, ok := tail(marker); ok {
				return leadingID(rest)
			}
		}
	case "vimeo":
		if rest, ok := tail("vimeo.com/"); ok {
			// Player embeds use the form player.vimeo.com/video/ID.
			return leadingID(strings.TrimPrefix(rest, "video/"))
		}
	}
	return ""
}

// extractKeywordsFromContent returns up to scrapingMaxKeywords keywords from
// content: lower-cased words longer than three bytes that are not common
// words and occur more than once, ordered by descending frequency with ties
// broken alphabetically. It returns nil when no word qualifies.
//
// This is a simple frequency count; in production, more sophisticated NLP
// would give better results.
func (h *WebScrapingHandler) extractKeywordsFromContent(content string) []string {
	// Count word frequency.
	frequency := make(map[string]int)
	for _, word := range strings.Fields(strings.ToLower(content)) {
		// Strip surrounding punctuation so "scraping," and "scraping" are
		// counted as the same word.
		word = strings.Trim(word, scrapingWordPunctuation)
		if len(word) > 3 && !h.isCommonWord(word) {
			frequency[word]++
		}
	}

	type wordFreq struct {
		word  string
		count int
	}

	// Only include words that appear more than once.
	candidates := make([]wordFreq, 0, len(frequency))
	for word, count := range frequency {
		if count > 1 {
			candidates = append(candidates, wordFreq{word, count})
		}
	}

	// Map iteration order is random, so break ties by word to keep the
	// result deterministic.
	sort.Slice(candidates, func(i, j int) bool {
		if candidates[i].count != candidates[j].count {
			return candidates[i].count > candidates[j].count
		}
		return candidates[i].word < candidates[j].word
	})

	var keywords []string
	for i := 0; i < len(candidates) && i < scrapingMaxKeywords; i++ {
		keywords = append(keywords, candidates[i].word)
	}
	return keywords
}

// isCommonWord reports whether word is too common to be a keyword. The
// comparison is exact, so callers should pass lower-cased words.
func (h *WebScrapingHandler) isCommonWord(word string) bool {
	_, common := scrapingCommonWords[word]
	return common
}

// detectContentType classifies the page as "tutorial", "documentation",
// "news" or "blog" based on marker words in the title and content, checked in
// that order. It returns "article" when nothing matches.
func (h *WebScrapingHandler) detectContentType(title, content string) string {
	titleLower := strings.ToLower(title)
	contentLower := strings.ToLower(content)

	switch {
	case strings.Contains(titleLower, "tutorial"),
		strings.Contains(titleLower, "how to"),
		strings.Contains(contentLower, "step by step"):
		return "tutorial"
	case strings.Contains(titleLower, "documentation"),
		strings.Contains(titleLower, "api"),
		strings.Contains(contentLower, "function"):
		return "documentation"
	case strings.Contains(titleLower, "news"),
		strings.Contains(contentLower, "breaking"),
		strings.Contains(contentLower, "report"):
		return "news"
	case strings.Contains(titleLower, "blog"),
		strings.Contains(contentLower, "posted"),
		strings.Contains(contentLower, "opinion"):
		return "blog"
	}

	// Default to article.
	return "article"
}

// generateSummary builds a simple summary from the leading sentences of
// content, where sentences are the pieces between "." characters. It uses the
// first three pieces when there are more than three, otherwise the first two
// (or the only one); empty pieces are skipped.
func (h *WebScrapingHandler) generateSummary(content string) string {
	sentences := strings.Split(content, ".")

	summaryLength := 2
	if len(sentences) < 2 {
		summaryLength = len(sentences)
	} else if len(sentences) > 3 {
		summaryLength = 3
	}

	var summary strings.Builder
	for _, sentence := range sentences[:summaryLength] {
		sentence = strings.TrimSpace(sentence)
		if sentence == "" {
			continue
		}
		summary.WriteString(sentence)
		summary.WriteString(". ")
	}
	return strings.TrimSpace(summary.String())
}