mirror of
https://github.com/Dvorinka/Trackeep.git
synced 2026-06-03 20:12:58 +00:00
784 lines
21 KiB
Go
784 lines
21 KiB
Go
package handlers
|
|
|
|
import (
	"errors"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/gin-gonic/gin"
	"github.com/gocolly/colly/v2"
	"github.com/trackeep/backend/models"
	"gorm.io/gorm"
)
|
|
|
|
// WebScrapingHandler handles web scraping operations
|
|
type WebScrapingHandler struct {
|
|
db *gorm.DB
|
|
}
|
|
|
|
// NewWebScrapingHandler creates a new web scraping handler
|
|
func NewWebScrapingHandler(db *gorm.DB) *WebScrapingHandler {
|
|
return &WebScrapingHandler{db: db}
|
|
}
|
|
|
|
// CreateScrapingJob creates a new web scraping job
|
|
func (h *WebScrapingHandler) CreateScrapingJob(c *gin.Context) {
|
|
userID := c.GetUint("user_id")
|
|
|
|
var req struct {
|
|
URL string `json:"url" binding:"required"`
|
|
JobType string `json:"job_type"`
|
|
Priority string `json:"priority"`
|
|
ExtractImages bool `json:"extract_images"`
|
|
ExtractLinks bool `json:"extract_links"`
|
|
ExtractVideos bool `json:"extract_videos"`
|
|
GenerateSummary bool `json:"generate_summary"`
|
|
DownloadImages bool `json:"download_images"`
|
|
ExtractMetadata bool `json:"extract_metadata"`
|
|
}
|
|
|
|
if err := c.ShouldBindJSON(&req); err != nil {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
|
return
|
|
}
|
|
|
|
// Validate URL
|
|
if _, err := url.ParseRequestURI(req.URL); err != nil {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid URL format"})
|
|
return
|
|
}
|
|
|
|
// Set defaults
|
|
if req.JobType == "" {
|
|
req.JobType = "full_scrape"
|
|
}
|
|
if req.Priority == "" {
|
|
req.Priority = "normal"
|
|
}
|
|
|
|
job := models.ScrapingJob{
|
|
UserID: userID,
|
|
URL: req.URL,
|
|
JobType: req.JobType,
|
|
Priority: req.Priority,
|
|
ExtractImages: req.ExtractImages,
|
|
ExtractLinks: req.ExtractLinks,
|
|
ExtractVideos: req.ExtractVideos,
|
|
GenerateSummary: req.GenerateSummary,
|
|
DownloadImages: req.DownloadImages,
|
|
ExtractMetadata: req.ExtractMetadata,
|
|
Status: "pending",
|
|
}
|
|
|
|
if err := h.db.Create(&job).Error; err != nil {
|
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create scraping job"})
|
|
return
|
|
}
|
|
|
|
// Start processing the job asynchronously
|
|
go h.processScrapingJob(job.ID)
|
|
|
|
c.JSON(http.StatusCreated, job)
|
|
}
|
|
|
|
// GetScrapingJobs returns user's scraping jobs
|
|
func (h *WebScrapingHandler) GetScrapingJobs(c *gin.Context) {
|
|
userID := c.GetUint("user_id")
|
|
|
|
status := c.Query("status")
|
|
limit := 20
|
|
if l := c.Query("limit"); l != "" {
|
|
if parsed, err := strconv.Atoi(l); err == nil && parsed > 0 && parsed <= 100 {
|
|
limit = parsed
|
|
}
|
|
}
|
|
|
|
query := h.db.Where("user_id = ?", userID)
|
|
if status != "" {
|
|
query = query.Where("status = ?", status)
|
|
}
|
|
|
|
var jobs []models.ScrapingJob
|
|
if err := query.Order("created_at DESC").Limit(limit).Find(&jobs).Error; err != nil {
|
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to fetch scraping jobs"})
|
|
return
|
|
}
|
|
|
|
c.JSON(http.StatusOK, gin.H{
|
|
"jobs": jobs,
|
|
"limit": limit,
|
|
})
|
|
}
|
|
|
|
// GetScrapingJob returns a specific scraping job
|
|
func (h *WebScrapingHandler) GetScrapingJob(c *gin.Context) {
|
|
userID := c.GetUint("user_id")
|
|
jobID, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
|
if err != nil {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid job ID"})
|
|
return
|
|
}
|
|
|
|
var job models.ScrapingJob
|
|
if err := h.db.Where("id = ? AND user_id = ?", jobID, userID).
|
|
Preload("ScrapedContent").
|
|
First(&job).Error; err != nil {
|
|
c.JSON(http.StatusNotFound, gin.H{"error": "Scraping job not found"})
|
|
return
|
|
}
|
|
|
|
c.JSON(http.StatusOK, job)
|
|
}
|
|
|
|
// GetScrapedContent returns scraped content
|
|
func (h *WebScrapingHandler) GetScrapedContent(c *gin.Context) {
|
|
userID := c.GetUint("user_id")
|
|
contentID, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
|
if err != nil {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid content ID"})
|
|
return
|
|
}
|
|
|
|
var content models.ScrapedContent
|
|
if err := h.db.Where("id = ? AND user_id = ?", contentID, userID).
|
|
Preload("Images").
|
|
Preload("Links").
|
|
Preload("Videos").
|
|
Preload("Tags").
|
|
First(&content).Error; err != nil {
|
|
c.JSON(http.StatusNotFound, gin.H{"error": "Scraped content not found"})
|
|
return
|
|
}
|
|
|
|
c.JSON(http.StatusOK, content)
|
|
}
|
|
|
|
// GetScrapedContentList returns user's scraped content
|
|
func (h *WebScrapingHandler) GetScrapedContentList(c *gin.Context) {
|
|
userID := c.GetUint("user_id")
|
|
|
|
contentType := c.Query("content_type")
|
|
domain := c.Query("domain")
|
|
limit := 20
|
|
if l := c.Query("limit"); l != "" {
|
|
if parsed, err := strconv.Atoi(l); err == nil && parsed > 0 && parsed <= 100 {
|
|
limit = parsed
|
|
}
|
|
}
|
|
|
|
query := h.db.Where("user_id = ?", userID)
|
|
if contentType != "" {
|
|
query = query.Where("content_type = ?", contentType)
|
|
}
|
|
if domain != "" {
|
|
query = query.Where("domain = ?", domain)
|
|
}
|
|
|
|
var content []models.ScrapedContent
|
|
if err := query.Order("last_scraped DESC").Limit(limit).Find(&content).Error; err != nil {
|
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to fetch scraped content"})
|
|
return
|
|
}
|
|
|
|
c.JSON(http.StatusOK, gin.H{
|
|
"content": content,
|
|
"limit": limit,
|
|
})
|
|
}
|
|
|
|
// DeleteScrapingJob deletes a scraping job
|
|
func (h *WebScrapingHandler) DeleteScrapingJob(c *gin.Context) {
|
|
userID := c.GetUint("user_id")
|
|
jobID, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
|
if err != nil {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid job ID"})
|
|
return
|
|
}
|
|
|
|
var job models.ScrapingJob
|
|
if err := h.db.Where("id = ? AND user_id = ?", jobID, userID).First(&job).Error; err != nil {
|
|
c.JSON(http.StatusNotFound, gin.H{"error": "Scraping job not found"})
|
|
return
|
|
}
|
|
|
|
// Only allow deletion of pending, completed, or failed jobs
|
|
if job.Status == "processing" {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": "Cannot delete job that is currently processing"})
|
|
return
|
|
}
|
|
|
|
if err := h.db.Delete(&job).Error; err != nil {
|
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to delete scraping job"})
|
|
return
|
|
}
|
|
|
|
c.JSON(http.StatusOK, gin.H{"message": "Scraping job deleted successfully"})
|
|
}
|
|
|
|
// DeleteScrapedContent deletes scraped content
|
|
func (h *WebScrapingHandler) DeleteScrapedContent(c *gin.Context) {
|
|
userID := c.GetUint("user_id")
|
|
contentID, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
|
if err != nil {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid content ID"})
|
|
return
|
|
}
|
|
|
|
var content models.ScrapedContent
|
|
if err := h.db.Where("id = ? AND user_id = ?", contentID, userID).First(&content).Error; err != nil {
|
|
c.JSON(http.StatusNotFound, gin.H{"error": "Scraped content not found"})
|
|
return
|
|
}
|
|
|
|
if err := h.db.Delete(&content).Error; err != nil {
|
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to delete scraped content"})
|
|
return
|
|
}
|
|
|
|
c.JSON(http.StatusOK, gin.H{"message": "Scraped content deleted successfully"})
|
|
}
|
|
|
|
// SearchScrapedContent searches within scraped content
|
|
func (h *WebScrapingHandler) SearchScrapedContent(c *gin.Context) {
|
|
userID := c.GetUint("user_id")
|
|
|
|
query := c.Query("q")
|
|
if query == "" {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": "Search query is required"})
|
|
return
|
|
}
|
|
|
|
contentType := c.Query("content_type")
|
|
domain := c.Query("domain")
|
|
limit := 20
|
|
if l := c.Query("limit"); l != "" {
|
|
if parsed, err := strconv.Atoi(l); err == nil && parsed > 0 && parsed <= 100 {
|
|
limit = parsed
|
|
}
|
|
}
|
|
|
|
// Build search query
|
|
dbQuery := h.db.Where("user_id = ?", userID)
|
|
|
|
// Search in title, content, and description
|
|
searchCondition := h.db.Where("title ILIKE ?", "%"+query+"%").
|
|
Or("content ILIKE ?", "%"+query+"%").
|
|
Or("description ILIKE ?", "%"+query+"%")
|
|
|
|
dbQuery = dbQuery.Where(searchCondition)
|
|
|
|
if contentType != "" {
|
|
dbQuery = dbQuery.Where("content_type = ?", contentType)
|
|
}
|
|
if domain != "" {
|
|
dbQuery = dbQuery.Where("domain = ?", domain)
|
|
}
|
|
|
|
var content []models.ScrapedContent
|
|
if err := dbQuery.Order("last_scraped DESC").Limit(limit).Find(&content).Error; err != nil {
|
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to search scraped content"})
|
|
return
|
|
}
|
|
|
|
c.JSON(http.StatusOK, gin.H{
|
|
"content": content,
|
|
"query": query,
|
|
"limit": limit,
|
|
})
|
|
}
|
|
|
|
// Helper functions
|
|
|
|
// processScrapingJob processes a scraping job asynchronously
|
|
func (h *WebScrapingHandler) processScrapingJob(jobID uint) {
|
|
var job models.ScrapingJob
|
|
if err := h.db.First(&job, jobID).Error; err != nil {
|
|
return
|
|
}
|
|
|
|
// Update job status to processing
|
|
now := time.Now()
|
|
job.Status = "processing"
|
|
job.StartedAt = &now
|
|
h.db.Save(&job)
|
|
|
|
// Perform the scraping
|
|
scrapedContent, err := h.scrapeWebPage(job.URL, job)
|
|
if err != nil {
|
|
job.Status = "failed"
|
|
job.ErrorMessage = err.Error()
|
|
completedAt := time.Now()
|
|
job.CompletedAt = &completedAt
|
|
h.db.Save(&job)
|
|
return
|
|
}
|
|
|
|
// Update job with results
|
|
job.Status = "completed"
|
|
job.ScrapedContentID = &scrapedContent.ID
|
|
job.Progress = 100
|
|
completedAt := time.Now()
|
|
job.CompletedAt = &completedAt
|
|
h.db.Save(&job)
|
|
}
|
|
|
|
// scrapeWebPage scrapes a web page and extracts content
|
|
func (h *WebScrapingHandler) scrapeWebPage(pageURL string, job models.ScrapingJob) (*models.ScrapedContent, error) {
|
|
parsedURL, err := url.Parse(pageURL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("invalid URL: %w", err)
|
|
}
|
|
|
|
// Create a new collector
|
|
c := colly.NewCollector(
|
|
colly.AllowURLRevisit(),
|
|
colly.Async(true),
|
|
)
|
|
|
|
// Set up content extraction variables
|
|
var title, description, content string
|
|
var keywords []string
|
|
var images []models.ScrapedImage
|
|
var links []models.ScrapedLink
|
|
var videos []models.ScrapedVideo
|
|
|
|
// Extract title
|
|
c.OnHTML("title", func(e *colly.HTMLElement) {
|
|
title = strings.TrimSpace(e.Text)
|
|
})
|
|
|
|
// Extract meta description
|
|
c.OnHTML("meta[name='description']", func(e *colly.HTMLElement) {
|
|
if description == "" {
|
|
description = e.Attr("content")
|
|
}
|
|
})
|
|
|
|
// Extract meta keywords
|
|
c.OnHTML("meta[name='keywords']", func(e *colly.HTMLElement) {
|
|
if len(keywords) == 0 {
|
|
keywordsStr := e.Attr("content")
|
|
if keywordsStr != "" {
|
|
keywords = strings.Split(keywordsStr, ",")
|
|
for i, kw := range keywords {
|
|
keywords[i] = strings.TrimSpace(kw)
|
|
}
|
|
}
|
|
}
|
|
})
|
|
|
|
// Extract main content
|
|
c.OnHTML("article, main, .content, .post-content, .entry-content", func(e *colly.HTMLElement) {
|
|
content = strings.TrimSpace(e.Text)
|
|
})
|
|
|
|
// Fallback to body content if no specific content found
|
|
c.OnHTML("body", func(e *colly.HTMLElement) {
|
|
if content == "" {
|
|
content = strings.TrimSpace(e.Text)
|
|
}
|
|
})
|
|
|
|
// Extract images if requested
|
|
if job.ExtractImages {
|
|
c.OnHTML("img", func(e *colly.HTMLElement) {
|
|
src := e.Attr("src")
|
|
alt := e.Attr("alt")
|
|
|
|
// Convert relative URLs to absolute
|
|
if src != "" {
|
|
if strings.HasPrefix(src, "/") {
|
|
src = parsedURL.Scheme + "://" + parsedURL.Host + src
|
|
} else if !strings.HasPrefix(src, "http") {
|
|
src = parsedURL.Scheme + "://" + parsedURL.Host + "/" + src
|
|
}
|
|
|
|
images = append(images, models.ScrapedImage{
|
|
URL: src,
|
|
AltText: alt,
|
|
Format: h.getImageFormat(src),
|
|
IsMainImage: false,
|
|
})
|
|
}
|
|
})
|
|
}
|
|
|
|
// Extract links if requested
|
|
if job.ExtractLinks {
|
|
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
|
href := e.Attr("href")
|
|
text := strings.TrimSpace(e.Text)
|
|
|
|
if href != "" && text != "" {
|
|
// Convert relative URLs to absolute
|
|
if strings.HasPrefix(href, "/") {
|
|
href = parsedURL.Scheme + "://" + parsedURL.Host + href
|
|
}
|
|
|
|
linkType := "external"
|
|
if strings.Contains(href, parsedURL.Host) {
|
|
linkType = "internal"
|
|
}
|
|
|
|
links = append(links, models.ScrapedLink{
|
|
URL: href,
|
|
Text: text,
|
|
LinkType: linkType,
|
|
Domain: h.getDomainFromURL(href),
|
|
})
|
|
}
|
|
})
|
|
}
|
|
|
|
// Extract videos if requested
|
|
if job.ExtractVideos {
|
|
c.OnHTML("iframe[src], video source", func(e *colly.HTMLElement) {
|
|
src := e.Attr("src")
|
|
title := e.Attr("title")
|
|
|
|
if src != "" {
|
|
platform := h.getVideoPlatform(src)
|
|
videos = append(videos, models.ScrapedVideo{
|
|
URL: src,
|
|
Title: title,
|
|
Platform: platform,
|
|
VideoID: h.getVideoID(src, platform),
|
|
})
|
|
}
|
|
})
|
|
}
|
|
|
|
// Set error handler
|
|
c.OnError(func(r *colly.Response, err error) {
|
|
log.Printf("Error scraping %s: %v", r.Request.URL, err)
|
|
})
|
|
|
|
// Start scraping
|
|
err = c.Visit(pageURL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to visit page: %w", err)
|
|
}
|
|
|
|
c.Wait()
|
|
|
|
// Clean and process content
|
|
if content == "" {
|
|
content = "No content could be extracted from this page."
|
|
}
|
|
|
|
if description == "" {
|
|
description = content
|
|
if len(description) > 200 {
|
|
description = description[:200] + "..."
|
|
}
|
|
}
|
|
|
|
// Generate keywords if none found
|
|
if len(keywords) == 0 && job.ExtractMetadata {
|
|
keywords = h.extractKeywordsFromContent(content)
|
|
}
|
|
|
|
// Create the scraped content
|
|
scrapedContent := models.ScrapedContent{
|
|
UserID: job.UserID,
|
|
URL: pageURL,
|
|
Domain: parsedURL.Hostname(),
|
|
Title: title,
|
|
Description: description,
|
|
Content: content,
|
|
Keywords: keywords,
|
|
ContentType: h.detectContentType(title, content),
|
|
WordCount: len(strings.Fields(content)),
|
|
ReadingTime: h.estimateReadingTime(len(strings.Fields(content))),
|
|
QualityScore: 0, // Will be calculated below
|
|
Status: "completed",
|
|
LastScraped: time.Now(),
|
|
}
|
|
|
|
// Generate summary if requested
|
|
if job.GenerateSummary {
|
|
scrapedContent.Summary = h.generateSummary(content)
|
|
}
|
|
|
|
// Create the content in database
|
|
if err := h.db.Create(&scrapedContent).Error; err != nil {
|
|
return nil, fmt.Errorf("failed to save scraped content: %w", err)
|
|
}
|
|
|
|
// Save related content
|
|
if len(images) > 0 {
|
|
for i := range images {
|
|
images[i].ScrapedContentID = scrapedContent.ID
|
|
}
|
|
h.db.Create(&images)
|
|
}
|
|
|
|
if len(links) > 0 {
|
|
for i := range links {
|
|
links[i].ScrapedContentID = scrapedContent.ID
|
|
}
|
|
h.db.Create(&links)
|
|
}
|
|
|
|
if len(videos) > 0 {
|
|
for i := range videos {
|
|
videos[i].ScrapedContentID = scrapedContent.ID
|
|
}
|
|
h.db.Create(&videos)
|
|
}
|
|
|
|
// Calculate and save quality score
|
|
scrapedContent.QualityScore = h.calculateQualityScore(scrapedContent)
|
|
h.db.Save(&scrapedContent)
|
|
|
|
return &scrapedContent, nil
|
|
}
|
|
|
|
// extractTextFromHTML extracts text content from HTML
|
|
func (h *WebScrapingHandler) extractTextFromHTML(html string) string {
|
|
// Remove HTML tags
|
|
re := regexp.MustCompile(`<[^>]*>`)
|
|
text := re.ReplaceAllString(html, "")
|
|
|
|
// Clean up whitespace
|
|
text = strings.TrimSpace(text)
|
|
text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ")
|
|
|
|
return text
|
|
}
|
|
|
|
// estimateReadingTime estimates reading time in minutes
|
|
func (h *WebScrapingHandler) estimateReadingTime(wordCount int) int {
|
|
// Average reading speed: 200-250 words per minute
|
|
readingSpeed := 225
|
|
readingTime := wordCount / readingSpeed
|
|
if readingTime < 1 {
|
|
readingTime = 1
|
|
}
|
|
return readingTime
|
|
}
|
|
|
|
// calculateQualityScore calculates a quality score for the content
|
|
func (h *WebScrapingHandler) calculateQualityScore(content models.ScrapedContent) float64 {
|
|
score := 50.0 // Base score
|
|
|
|
// Add points for having title
|
|
if content.Title != "" {
|
|
score += 10
|
|
}
|
|
|
|
// Add points for content length
|
|
if content.WordCount > 100 {
|
|
score += 10
|
|
}
|
|
if content.WordCount > 500 {
|
|
score += 10
|
|
}
|
|
|
|
// Add points for having description
|
|
if content.Description != "" {
|
|
score += 10
|
|
}
|
|
|
|
// Add points for having images
|
|
if len(content.Images) > 0 {
|
|
score += 5
|
|
}
|
|
|
|
// Add points for having keywords
|
|
if len(content.Keywords) > 0 {
|
|
score += 5
|
|
}
|
|
|
|
// Cap at 100
|
|
if score > 100 {
|
|
score = 100
|
|
}
|
|
|
|
return score
|
|
}
|
|
|
|
// Helper methods for web scraping
|
|
|
|
// getImageFormat extracts image format from URL
|
|
func (h *WebScrapingHandler) getImageFormat(url string) string {
|
|
lower := strings.ToLower(url)
|
|
if strings.HasSuffix(lower, ".jpg") || strings.HasSuffix(lower, ".jpeg") {
|
|
return "jpg"
|
|
} else if strings.HasSuffix(lower, ".png") {
|
|
return "png"
|
|
} else if strings.HasSuffix(lower, ".gif") {
|
|
return "gif"
|
|
} else if strings.HasSuffix(lower, ".svg") {
|
|
return "svg"
|
|
} else if strings.HasSuffix(lower, ".webp") {
|
|
return "webp"
|
|
}
|
|
return "unknown"
|
|
}
|
|
|
|
// getDomainFromURL extracts domain from URL
|
|
func (h *WebScrapingHandler) getDomainFromURL(urlStr string) string {
|
|
if parsedURL, err := url.Parse(urlStr); err == nil {
|
|
return parsedURL.Hostname()
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// getVideoPlatform detects video platform from URL
|
|
func (h *WebScrapingHandler) getVideoPlatform(urlStr string) string {
|
|
lower := strings.ToLower(urlStr)
|
|
if strings.Contains(lower, "youtube.com") || strings.Contains(lower, "youtu.be") {
|
|
return "youtube"
|
|
} else if strings.Contains(lower, "vimeo.com") {
|
|
return "vimeo"
|
|
} else if strings.Contains(lower, "twitch.tv") {
|
|
return "twitch"
|
|
}
|
|
return "unknown"
|
|
}
|
|
|
|
// getVideoID extracts video ID from URL
|
|
func (h *WebScrapingHandler) getVideoID(urlStr, platform string) string {
|
|
switch platform {
|
|
case "youtube":
|
|
if strings.Contains(urlStr, "youtube.com/watch?v=") {
|
|
parts := strings.Split(urlStr, "v=")
|
|
if len(parts) > 1 {
|
|
id := strings.Split(parts[1], "&")[0]
|
|
return id
|
|
}
|
|
} else if strings.Contains(urlStr, "youtu.be/") {
|
|
parts := strings.Split(urlStr, "youtu.be/")
|
|
if len(parts) > 1 {
|
|
return strings.Split(parts[1], "?")[0]
|
|
}
|
|
}
|
|
case "vimeo":
|
|
parts := strings.Split(urlStr, "vimeo.com/")
|
|
if len(parts) > 1 {
|
|
return strings.Split(parts[1], "?")[0]
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// extractKeywordsFromContent extracts keywords from content
|
|
func (h *WebScrapingHandler) extractKeywordsFromContent(content string) []string {
|
|
// Simple keyword extraction - in production, you'd use more sophisticated NLP
|
|
words := strings.Fields(strings.ToLower(content))
|
|
wordCount := make(map[string]int)
|
|
|
|
// Count word frequency
|
|
for _, word := range words {
|
|
// Filter out common words
|
|
if len(word) > 3 && !h.isCommonWord(word) {
|
|
wordCount[word]++
|
|
}
|
|
}
|
|
|
|
// Get top keywords
|
|
type wordFreq struct {
|
|
word string
|
|
count int
|
|
}
|
|
|
|
var sortedWords []wordFreq
|
|
for word, count := range wordCount {
|
|
if count > 1 { // Only include words that appear more than once
|
|
sortedWords = append(sortedWords, wordFreq{word, count})
|
|
}
|
|
}
|
|
|
|
// Sort by frequency
|
|
for i := 0; i < len(sortedWords)-1; i++ {
|
|
for j := i + 1; j < len(sortedWords); j++ {
|
|
if sortedWords[j].count > sortedWords[i].count {
|
|
sortedWords[i], sortedWords[j] = sortedWords[j], sortedWords[i]
|
|
}
|
|
}
|
|
}
|
|
|
|
// Return top 10 keywords
|
|
var keywords []string
|
|
for i := 0; i < len(sortedWords) && i < 10; i++ {
|
|
keywords = append(keywords, sortedWords[i].word)
|
|
}
|
|
|
|
return keywords
|
|
}
|
|
|
|
// isCommonWord checks if a word is too common to be a keyword
|
|
func (h *WebScrapingHandler) isCommonWord(word string) bool {
|
|
commonWords := []string{
|
|
"the", "and", "for", "are", "but", "not", "you", "all", "can", "had", "her", "was", "one", "our", "out", "day", "get", "has", "him", "his", "how", "man", "new", "now", "old", "see", "two", "way", "who", "boy", "did", "its", "let", "put", "say", "she", "too", "use", "with", "have", "this", "that", "from", "they", "been", "call", "come", "each", "find", "give", "hand", "keep", "know", "last", "leave", "life", "long", "made", "many", "move", "must", "name", "need", "only", "over", "part", "said", "same", "show", "tell", "time", "turn", "well", "went", "were", "what", "will", "your", "about", "after", "again", "before", "being", "below", "could", "every", "first", "found", "great", "house", "large", "never", "other", "place", "right", "small", "sound", "still", "their", "there", "think", "under", "water", "where", "which", "world", "would", "write", "years",
|
|
}
|
|
|
|
for _, common := range commonWords {
|
|
if word == common {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// detectContentType detects the type of content
|
|
func (h *WebScrapingHandler) detectContentType(title, content string) string {
|
|
titleLower := strings.ToLower(title)
|
|
contentLower := strings.ToLower(content)
|
|
|
|
// Check for tutorial
|
|
if strings.Contains(titleLower, "tutorial") || strings.Contains(titleLower, "how to") || strings.Contains(contentLower, "step by step") {
|
|
return "tutorial"
|
|
}
|
|
|
|
// Check for documentation
|
|
if strings.Contains(titleLower, "documentation") || strings.Contains(titleLower, "api") || strings.Contains(contentLower, "function") {
|
|
return "documentation"
|
|
}
|
|
|
|
// Check for news
|
|
if strings.Contains(titleLower, "news") || strings.Contains(contentLower, "breaking") || strings.Contains(contentLower, "report") {
|
|
return "news"
|
|
}
|
|
|
|
// Check for blog
|
|
if strings.Contains(titleLower, "blog") || strings.Contains(contentLower, "posted") || strings.Contains(contentLower, "opinion") {
|
|
return "blog"
|
|
}
|
|
|
|
// Default to article
|
|
return "article"
|
|
}
|
|
|
|
// generateSummary generates a simple summary
|
|
func (h *WebScrapingHandler) generateSummary(content string) string {
|
|
sentences := strings.Split(content, ".")
|
|
if len(sentences) == 0 {
|
|
return ""
|
|
}
|
|
|
|
// Take first 2-3 sentences as summary
|
|
summaryLength := 2
|
|
if len(sentences) < 2 {
|
|
summaryLength = len(sentences)
|
|
} else if len(sentences) > 3 {
|
|
summaryLength = 3
|
|
}
|
|
|
|
var summary string
|
|
for i := 0; i < summaryLength; i++ {
|
|
sentence := strings.TrimSpace(sentences[i])
|
|
if sentence != "" {
|
|
summary += sentence + ". "
|
|
}
|
|
}
|
|
|
|
return strings.TrimSpace(summary)
|
|
}
|