mirror of
https://github.com/Dvorinka/Trackeep.git
synced 2026-06-03 20:12:58 +00:00
first test
This commit is contained in:
@@ -0,0 +1,782 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/gin-gonic/gin"
|
||||
"github.com/gocolly/colly/v2"
|
||||
"github.com/trackeep/backend/models"
|
||||
"gorm.io/gorm"
|
||||
)
|
||||
|
||||
// WebScrapingHandler handles web scraping operations
|
||||
type WebScrapingHandler struct {
|
||||
db *gorm.DB
|
||||
}
|
||||
|
||||
// NewWebScrapingHandler creates a new web scraping handler
|
||||
func NewWebScrapingHandler(db *gorm.DB) *WebScrapingHandler {
|
||||
return &WebScrapingHandler{db: db}
|
||||
}
|
||||
|
||||
// CreateScrapingJob creates a new web scraping job
|
||||
func (h *WebScrapingHandler) CreateScrapingJob(c *gin.Context) {
|
||||
userID := c.GetUint("user_id")
|
||||
|
||||
var req struct {
|
||||
URL string `json:"url" binding:"required"`
|
||||
JobType string `json:"job_type"`
|
||||
Priority string `json:"priority"`
|
||||
ExtractImages bool `json:"extract_images"`
|
||||
ExtractLinks bool `json:"extract_links"`
|
||||
ExtractVideos bool `json:"extract_videos"`
|
||||
GenerateSummary bool `json:"generate_summary"`
|
||||
DownloadImages bool `json:"download_images"`
|
||||
ExtractMetadata bool `json:"extract_metadata"`
|
||||
}
|
||||
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Validate URL
|
||||
if _, err := url.ParseRequestURI(req.URL); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid URL format"})
|
||||
return
|
||||
}
|
||||
|
||||
// Set defaults
|
||||
if req.JobType == "" {
|
||||
req.JobType = "full_scrape"
|
||||
}
|
||||
if req.Priority == "" {
|
||||
req.Priority = "normal"
|
||||
}
|
||||
|
||||
job := models.ScrapingJob{
|
||||
UserID: userID,
|
||||
URL: req.URL,
|
||||
JobType: req.JobType,
|
||||
Priority: req.Priority,
|
||||
ExtractImages: req.ExtractImages,
|
||||
ExtractLinks: req.ExtractLinks,
|
||||
ExtractVideos: req.ExtractVideos,
|
||||
GenerateSummary: req.GenerateSummary,
|
||||
DownloadImages: req.DownloadImages,
|
||||
ExtractMetadata: req.ExtractMetadata,
|
||||
Status: "pending",
|
||||
}
|
||||
|
||||
if err := h.db.Create(&job).Error; err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create scraping job"})
|
||||
return
|
||||
}
|
||||
|
||||
// Start processing the job asynchronously
|
||||
go h.processScrapingJob(job.ID)
|
||||
|
||||
c.JSON(http.StatusCreated, job)
|
||||
}
|
||||
|
||||
// GetScrapingJobs returns user's scraping jobs
|
||||
func (h *WebScrapingHandler) GetScrapingJobs(c *gin.Context) {
|
||||
userID := c.GetUint("user_id")
|
||||
|
||||
status := c.Query("status")
|
||||
limit := 20
|
||||
if l := c.Query("limit"); l != "" {
|
||||
if parsed, err := strconv.Atoi(l); err == nil && parsed > 0 && parsed <= 100 {
|
||||
limit = parsed
|
||||
}
|
||||
}
|
||||
|
||||
query := h.db.Where("user_id = ?", userID)
|
||||
if status != "" {
|
||||
query = query.Where("status = ?", status)
|
||||
}
|
||||
|
||||
var jobs []models.ScrapingJob
|
||||
if err := query.Order("created_at DESC").Limit(limit).Find(&jobs).Error; err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to fetch scraping jobs"})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"jobs": jobs,
|
||||
"limit": limit,
|
||||
})
|
||||
}
|
||||
|
||||
// GetScrapingJob returns a specific scraping job
|
||||
func (h *WebScrapingHandler) GetScrapingJob(c *gin.Context) {
|
||||
userID := c.GetUint("user_id")
|
||||
jobID, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid job ID"})
|
||||
return
|
||||
}
|
||||
|
||||
var job models.ScrapingJob
|
||||
if err := h.db.Where("id = ? AND user_id = ?", jobID, userID).
|
||||
Preload("ScrapedContent").
|
||||
First(&job).Error; err != nil {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "Scraping job not found"})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, job)
|
||||
}
|
||||
|
||||
// GetScrapedContent returns scraped content
|
||||
func (h *WebScrapingHandler) GetScrapedContent(c *gin.Context) {
|
||||
userID := c.GetUint("user_id")
|
||||
contentID, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid content ID"})
|
||||
return
|
||||
}
|
||||
|
||||
var content models.ScrapedContent
|
||||
if err := h.db.Where("id = ? AND user_id = ?", contentID, userID).
|
||||
Preload("Images").
|
||||
Preload("Links").
|
||||
Preload("Videos").
|
||||
Preload("Tags").
|
||||
First(&content).Error; err != nil {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "Scraped content not found"})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, content)
|
||||
}
|
||||
|
||||
// GetScrapedContentList returns user's scraped content
|
||||
func (h *WebScrapingHandler) GetScrapedContentList(c *gin.Context) {
|
||||
userID := c.GetUint("user_id")
|
||||
|
||||
contentType := c.Query("content_type")
|
||||
domain := c.Query("domain")
|
||||
limit := 20
|
||||
if l := c.Query("limit"); l != "" {
|
||||
if parsed, err := strconv.Atoi(l); err == nil && parsed > 0 && parsed <= 100 {
|
||||
limit = parsed
|
||||
}
|
||||
}
|
||||
|
||||
query := h.db.Where("user_id = ?", userID)
|
||||
if contentType != "" {
|
||||
query = query.Where("content_type = ?", contentType)
|
||||
}
|
||||
if domain != "" {
|
||||
query = query.Where("domain = ?", domain)
|
||||
}
|
||||
|
||||
var content []models.ScrapedContent
|
||||
if err := query.Order("last_scraped DESC").Limit(limit).Find(&content).Error; err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to fetch scraped content"})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"content": content,
|
||||
"limit": limit,
|
||||
})
|
||||
}
|
||||
|
||||
// DeleteScrapingJob deletes a scraping job
|
||||
func (h *WebScrapingHandler) DeleteScrapingJob(c *gin.Context) {
|
||||
userID := c.GetUint("user_id")
|
||||
jobID, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid job ID"})
|
||||
return
|
||||
}
|
||||
|
||||
var job models.ScrapingJob
|
||||
if err := h.db.Where("id = ? AND user_id = ?", jobID, userID).First(&job).Error; err != nil {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "Scraping job not found"})
|
||||
return
|
||||
}
|
||||
|
||||
// Only allow deletion of pending, completed, or failed jobs
|
||||
if job.Status == "processing" {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Cannot delete job that is currently processing"})
|
||||
return
|
||||
}
|
||||
|
||||
if err := h.db.Delete(&job).Error; err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to delete scraping job"})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"message": "Scraping job deleted successfully"})
|
||||
}
|
||||
|
||||
// DeleteScrapedContent deletes scraped content
|
||||
func (h *WebScrapingHandler) DeleteScrapedContent(c *gin.Context) {
|
||||
userID := c.GetUint("user_id")
|
||||
contentID, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid content ID"})
|
||||
return
|
||||
}
|
||||
|
||||
var content models.ScrapedContent
|
||||
if err := h.db.Where("id = ? AND user_id = ?", contentID, userID).First(&content).Error; err != nil {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "Scraped content not found"})
|
||||
return
|
||||
}
|
||||
|
||||
if err := h.db.Delete(&content).Error; err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to delete scraped content"})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"message": "Scraped content deleted successfully"})
|
||||
}
|
||||
|
||||
// SearchScrapedContent searches within scraped content
|
||||
func (h *WebScrapingHandler) SearchScrapedContent(c *gin.Context) {
|
||||
userID := c.GetUint("user_id")
|
||||
|
||||
query := c.Query("q")
|
||||
if query == "" {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Search query is required"})
|
||||
return
|
||||
}
|
||||
|
||||
contentType := c.Query("content_type")
|
||||
domain := c.Query("domain")
|
||||
limit := 20
|
||||
if l := c.Query("limit"); l != "" {
|
||||
if parsed, err := strconv.Atoi(l); err == nil && parsed > 0 && parsed <= 100 {
|
||||
limit = parsed
|
||||
}
|
||||
}
|
||||
|
||||
// Build search query
|
||||
dbQuery := h.db.Where("user_id = ?", userID)
|
||||
|
||||
// Search in title, content, and description
|
||||
searchCondition := h.db.Where("title ILIKE ?", "%"+query+"%").
|
||||
Or("content ILIKE ?", "%"+query+"%").
|
||||
Or("description ILIKE ?", "%"+query+"%")
|
||||
|
||||
dbQuery = dbQuery.Where(searchCondition)
|
||||
|
||||
if contentType != "" {
|
||||
dbQuery = dbQuery.Where("content_type = ?", contentType)
|
||||
}
|
||||
if domain != "" {
|
||||
dbQuery = dbQuery.Where("domain = ?", domain)
|
||||
}
|
||||
|
||||
var content []models.ScrapedContent
|
||||
if err := dbQuery.Order("last_scraped DESC").Limit(limit).Find(&content).Error; err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to search scraped content"})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"content": content,
|
||||
"query": query,
|
||||
"limit": limit,
|
||||
})
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
|
||||
// processScrapingJob processes a scraping job asynchronously
|
||||
func (h *WebScrapingHandler) processScrapingJob(jobID uint) {
|
||||
var job models.ScrapingJob
|
||||
if err := h.db.First(&job, jobID).Error; err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Update job status to processing
|
||||
now := time.Now()
|
||||
job.Status = "processing"
|
||||
job.StartedAt = &now
|
||||
h.db.Save(&job)
|
||||
|
||||
// Perform the scraping
|
||||
scrapedContent, err := h.scrapeWebPage(job.URL, job)
|
||||
if err != nil {
|
||||
job.Status = "failed"
|
||||
job.ErrorMessage = err.Error()
|
||||
completedAt := time.Now()
|
||||
job.CompletedAt = &completedAt
|
||||
h.db.Save(&job)
|
||||
return
|
||||
}
|
||||
|
||||
// Update job with results
|
||||
job.Status = "completed"
|
||||
job.ScrapedContentID = &scrapedContent.ID
|
||||
job.Progress = 100
|
||||
completedAt := time.Now()
|
||||
job.CompletedAt = &completedAt
|
||||
h.db.Save(&job)
|
||||
}
|
||||
|
||||
// scrapeWebPage scrapes a web page and extracts content
|
||||
func (h *WebScrapingHandler) scrapeWebPage(pageURL string, job models.ScrapingJob) (*models.ScrapedContent, error) {
|
||||
parsedURL, err := url.Parse(pageURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid URL: %w", err)
|
||||
}
|
||||
|
||||
// Create a new collector
|
||||
c := colly.NewCollector(
|
||||
colly.AllowURLRevisit(),
|
||||
colly.Async(true),
|
||||
)
|
||||
|
||||
// Set up content extraction variables
|
||||
var title, description, content string
|
||||
var keywords []string
|
||||
var images []models.ScrapedImage
|
||||
var links []models.ScrapedLink
|
||||
var videos []models.ScrapedVideo
|
||||
|
||||
// Extract title
|
||||
c.OnHTML("title", func(e *colly.HTMLElement) {
|
||||
title = strings.TrimSpace(e.Text)
|
||||
})
|
||||
|
||||
// Extract meta description
|
||||
c.OnHTML("meta[name='description']", func(e *colly.HTMLElement) {
|
||||
if description == "" {
|
||||
description = e.Attr("content")
|
||||
}
|
||||
})
|
||||
|
||||
// Extract meta keywords
|
||||
c.OnHTML("meta[name='keywords']", func(e *colly.HTMLElement) {
|
||||
if len(keywords) == 0 {
|
||||
keywordsStr := e.Attr("content")
|
||||
if keywordsStr != "" {
|
||||
keywords = strings.Split(keywordsStr, ",")
|
||||
for i, kw := range keywords {
|
||||
keywords[i] = strings.TrimSpace(kw)
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
// Extract main content
|
||||
c.OnHTML("article, main, .content, .post-content, .entry-content", func(e *colly.HTMLElement) {
|
||||
content = strings.TrimSpace(e.Text)
|
||||
})
|
||||
|
||||
// Fallback to body content if no specific content found
|
||||
c.OnHTML("body", func(e *colly.HTMLElement) {
|
||||
if content == "" {
|
||||
content = strings.TrimSpace(e.Text)
|
||||
}
|
||||
})
|
||||
|
||||
// Extract images if requested
|
||||
if job.ExtractImages {
|
||||
c.OnHTML("img", func(e *colly.HTMLElement) {
|
||||
src := e.Attr("src")
|
||||
alt := e.Attr("alt")
|
||||
|
||||
// Convert relative URLs to absolute
|
||||
if src != "" {
|
||||
if strings.HasPrefix(src, "/") {
|
||||
src = parsedURL.Scheme + "://" + parsedURL.Host + src
|
||||
} else if !strings.HasPrefix(src, "http") {
|
||||
src = parsedURL.Scheme + "://" + parsedURL.Host + "/" + src
|
||||
}
|
||||
|
||||
images = append(images, models.ScrapedImage{
|
||||
URL: src,
|
||||
AltText: alt,
|
||||
Format: h.getImageFormat(src),
|
||||
IsMainImage: false,
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Extract links if requested
|
||||
if job.ExtractLinks {
|
||||
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||
href := e.Attr("href")
|
||||
text := strings.TrimSpace(e.Text)
|
||||
|
||||
if href != "" && text != "" {
|
||||
// Convert relative URLs to absolute
|
||||
if strings.HasPrefix(href, "/") {
|
||||
href = parsedURL.Scheme + "://" + parsedURL.Host + href
|
||||
}
|
||||
|
||||
linkType := "external"
|
||||
if strings.Contains(href, parsedURL.Host) {
|
||||
linkType = "internal"
|
||||
}
|
||||
|
||||
links = append(links, models.ScrapedLink{
|
||||
URL: href,
|
||||
Text: text,
|
||||
LinkType: linkType,
|
||||
Domain: h.getDomainFromURL(href),
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Extract videos if requested
|
||||
if job.ExtractVideos {
|
||||
c.OnHTML("iframe[src], video source", func(e *colly.HTMLElement) {
|
||||
src := e.Attr("src")
|
||||
title := e.Attr("title")
|
||||
|
||||
if src != "" {
|
||||
platform := h.getVideoPlatform(src)
|
||||
videos = append(videos, models.ScrapedVideo{
|
||||
URL: src,
|
||||
Title: title,
|
||||
Platform: platform,
|
||||
VideoID: h.getVideoID(src, platform),
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Set error handler
|
||||
c.OnError(func(r *colly.Response, err error) {
|
||||
fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
|
||||
})
|
||||
|
||||
// Start scraping
|
||||
err = c.Visit(pageURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to visit page: %w", err)
|
||||
}
|
||||
|
||||
c.Wait()
|
||||
|
||||
// Clean and process content
|
||||
if content == "" {
|
||||
content = "No content could be extracted from this page."
|
||||
}
|
||||
|
||||
if description == "" {
|
||||
description = content
|
||||
if len(description) > 200 {
|
||||
description = description[:200] + "..."
|
||||
}
|
||||
}
|
||||
|
||||
// Generate keywords if none found
|
||||
if len(keywords) == 0 && job.ExtractMetadata {
|
||||
keywords = h.extractKeywordsFromContent(content)
|
||||
}
|
||||
|
||||
// Create the scraped content
|
||||
scrapedContent := models.ScrapedContent{
|
||||
UserID: job.UserID,
|
||||
URL: pageURL,
|
||||
Domain: parsedURL.Hostname(),
|
||||
Title: title,
|
||||
Description: description,
|
||||
Content: content,
|
||||
Keywords: keywords,
|
||||
ContentType: h.detectContentType(title, content),
|
||||
WordCount: len(strings.Fields(content)),
|
||||
ReadingTime: h.estimateReadingTime(len(strings.Fields(content))),
|
||||
QualityScore: 0, // Will be calculated below
|
||||
Status: "completed",
|
||||
LastScraped: time.Now(),
|
||||
}
|
||||
|
||||
// Generate summary if requested
|
||||
if job.GenerateSummary {
|
||||
scrapedContent.Summary = h.generateSummary(content)
|
||||
}
|
||||
|
||||
// Create the content in database
|
||||
if err := h.db.Create(&scrapedContent).Error; err != nil {
|
||||
return nil, fmt.Errorf("failed to save scraped content: %w", err)
|
||||
}
|
||||
|
||||
// Save related content
|
||||
if len(images) > 0 {
|
||||
for i := range images {
|
||||
images[i].ScrapedContentID = scrapedContent.ID
|
||||
}
|
||||
h.db.Create(&images)
|
||||
}
|
||||
|
||||
if len(links) > 0 {
|
||||
for i := range links {
|
||||
links[i].ScrapedContentID = scrapedContent.ID
|
||||
}
|
||||
h.db.Create(&links)
|
||||
}
|
||||
|
||||
if len(videos) > 0 {
|
||||
for i := range videos {
|
||||
videos[i].ScrapedContentID = scrapedContent.ID
|
||||
}
|
||||
h.db.Create(&videos)
|
||||
}
|
||||
|
||||
// Calculate and save quality score
|
||||
scrapedContent.QualityScore = h.calculateQualityScore(scrapedContent)
|
||||
h.db.Save(&scrapedContent)
|
||||
|
||||
return &scrapedContent, nil
|
||||
}
|
||||
|
||||
// extractTextFromHTML extracts text content from HTML
|
||||
func (h *WebScrapingHandler) extractTextFromHTML(html string) string {
|
||||
// Remove HTML tags
|
||||
re := regexp.MustCompile(`<[^>]*>`)
|
||||
text := re.ReplaceAllString(html, "")
|
||||
|
||||
// Clean up whitespace
|
||||
text = strings.TrimSpace(text)
|
||||
text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ")
|
||||
|
||||
return text
|
||||
}
|
||||
|
||||
// estimateReadingTime estimates reading time in minutes
|
||||
func (h *WebScrapingHandler) estimateReadingTime(wordCount int) int {
|
||||
// Average reading speed: 200-250 words per minute
|
||||
readingSpeed := 225
|
||||
readingTime := wordCount / readingSpeed
|
||||
if readingTime < 1 {
|
||||
readingTime = 1
|
||||
}
|
||||
return readingTime
|
||||
}
|
||||
|
||||
// calculateQualityScore calculates a quality score for the content
|
||||
func (h *WebScrapingHandler) calculateQualityScore(content models.ScrapedContent) float64 {
|
||||
score := 50.0 // Base score
|
||||
|
||||
// Add points for having title
|
||||
if content.Title != "" {
|
||||
score += 10
|
||||
}
|
||||
|
||||
// Add points for content length
|
||||
if content.WordCount > 100 {
|
||||
score += 10
|
||||
}
|
||||
if content.WordCount > 500 {
|
||||
score += 10
|
||||
}
|
||||
|
||||
// Add points for having description
|
||||
if content.Description != "" {
|
||||
score += 10
|
||||
}
|
||||
|
||||
// Add points for having images
|
||||
if len(content.Images) > 0 {
|
||||
score += 5
|
||||
}
|
||||
|
||||
// Add points for having keywords
|
||||
if len(content.Keywords) > 0 {
|
||||
score += 5
|
||||
}
|
||||
|
||||
// Cap at 100
|
||||
if score > 100 {
|
||||
score = 100
|
||||
}
|
||||
|
||||
return score
|
||||
}
|
||||
|
||||
// Helper methods for web scraping
|
||||
|
||||
// getImageFormat extracts image format from URL
|
||||
func (h *WebScrapingHandler) getImageFormat(url string) string {
|
||||
lower := strings.ToLower(url)
|
||||
if strings.HasSuffix(lower, ".jpg") || strings.HasSuffix(lower, ".jpeg") {
|
||||
return "jpg"
|
||||
} else if strings.HasSuffix(lower, ".png") {
|
||||
return "png"
|
||||
} else if strings.HasSuffix(lower, ".gif") {
|
||||
return "gif"
|
||||
} else if strings.HasSuffix(lower, ".svg") {
|
||||
return "svg"
|
||||
} else if strings.HasSuffix(lower, ".webp") {
|
||||
return "webp"
|
||||
}
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
// getDomainFromURL extracts domain from URL
|
||||
func (h *WebScrapingHandler) getDomainFromURL(urlStr string) string {
|
||||
if parsedURL, err := url.Parse(urlStr); err == nil {
|
||||
return parsedURL.Hostname()
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// getVideoPlatform detects video platform from URL
|
||||
func (h *WebScrapingHandler) getVideoPlatform(urlStr string) string {
|
||||
lower := strings.ToLower(urlStr)
|
||||
if strings.Contains(lower, "youtube.com") || strings.Contains(lower, "youtu.be") {
|
||||
return "youtube"
|
||||
} else if strings.Contains(lower, "vimeo.com") {
|
||||
return "vimeo"
|
||||
} else if strings.Contains(lower, "twitch.tv") {
|
||||
return "twitch"
|
||||
}
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
// getVideoID extracts video ID from URL
|
||||
func (h *WebScrapingHandler) getVideoID(urlStr, platform string) string {
|
||||
switch platform {
|
||||
case "youtube":
|
||||
if strings.Contains(urlStr, "youtube.com/watch?v=") {
|
||||
parts := strings.Split(urlStr, "v=")
|
||||
if len(parts) > 1 {
|
||||
id := strings.Split(parts[1], "&")[0]
|
||||
return id
|
||||
}
|
||||
} else if strings.Contains(urlStr, "youtu.be/") {
|
||||
parts := strings.Split(urlStr, "youtu.be/")
|
||||
if len(parts) > 1 {
|
||||
return strings.Split(parts[1], "?")[0]
|
||||
}
|
||||
}
|
||||
case "vimeo":
|
||||
parts := strings.Split(urlStr, "vimeo.com/")
|
||||
if len(parts) > 1 {
|
||||
return strings.Split(parts[1], "?")[0]
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// extractKeywordsFromContent extracts keywords from content
|
||||
func (h *WebScrapingHandler) extractKeywordsFromContent(content string) []string {
|
||||
// Simple keyword extraction - in production, you'd use more sophisticated NLP
|
||||
words := strings.Fields(strings.ToLower(content))
|
||||
wordCount := make(map[string]int)
|
||||
|
||||
// Count word frequency
|
||||
for _, word := range words {
|
||||
// Filter out common words
|
||||
if len(word) > 3 && !h.isCommonWord(word) {
|
||||
wordCount[word]++
|
||||
}
|
||||
}
|
||||
|
||||
// Get top keywords
|
||||
type wordFreq struct {
|
||||
word string
|
||||
count int
|
||||
}
|
||||
|
||||
var sortedWords []wordFreq
|
||||
for word, count := range wordCount {
|
||||
if count > 1 { // Only include words that appear more than once
|
||||
sortedWords = append(sortedWords, wordFreq{word, count})
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by frequency
|
||||
for i := 0; i < len(sortedWords)-1; i++ {
|
||||
for j := i + 1; j < len(sortedWords); j++ {
|
||||
if sortedWords[j].count > sortedWords[i].count {
|
||||
sortedWords[i], sortedWords[j] = sortedWords[j], sortedWords[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return top 10 keywords
|
||||
var keywords []string
|
||||
for i := 0; i < len(sortedWords) && i < 10; i++ {
|
||||
keywords = append(keywords, sortedWords[i].word)
|
||||
}
|
||||
|
||||
return keywords
|
||||
}
|
||||
|
||||
// isCommonWord checks if a word is too common to be a keyword
|
||||
func (h *WebScrapingHandler) isCommonWord(word string) bool {
|
||||
commonWords := []string{
|
||||
"the", "and", "for", "are", "but", "not", "you", "all", "can", "had", "her", "was", "one", "our", "out", "day", "get", "has", "him", "his", "how", "man", "new", "now", "old", "see", "two", "way", "who", "boy", "did", "its", "let", "put", "say", "she", "too", "use", "with", "have", "this", "that", "from", "they", "been", "call", "come", "each", "find", "give", "hand", "keep", "know", "last", "leave", "life", "long", "made", "many", "move", "must", "name", "need", "only", "over", "part", "said", "same", "show", "tell", "time", "turn", "well", "went", "were", "what", "will", "your", "about", "after", "again", "before", "being", "below", "could", "every", "first", "found", "great", "house", "large", "never", "other", "place", "right", "small", "sound", "still", "their", "there", "think", "under", "water", "where", "which", "world", "would", "write", "years",
|
||||
}
|
||||
|
||||
for _, common := range commonWords {
|
||||
if word == common {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// detectContentType detects the type of content
|
||||
func (h *WebScrapingHandler) detectContentType(title, content string) string {
|
||||
titleLower := strings.ToLower(title)
|
||||
contentLower := strings.ToLower(content)
|
||||
|
||||
// Check for tutorial
|
||||
if strings.Contains(titleLower, "tutorial") || strings.Contains(titleLower, "how to") || strings.Contains(contentLower, "step by step") {
|
||||
return "tutorial"
|
||||
}
|
||||
|
||||
// Check for documentation
|
||||
if strings.Contains(titleLower, "documentation") || strings.Contains(titleLower, "api") || strings.Contains(contentLower, "function") {
|
||||
return "documentation"
|
||||
}
|
||||
|
||||
// Check for news
|
||||
if strings.Contains(titleLower, "news") || strings.Contains(contentLower, "breaking") || strings.Contains(contentLower, "report") {
|
||||
return "news"
|
||||
}
|
||||
|
||||
// Check for blog
|
||||
if strings.Contains(titleLower, "blog") || strings.Contains(contentLower, "posted") || strings.Contains(contentLower, "opinion") {
|
||||
return "blog"
|
||||
}
|
||||
|
||||
// Default to article
|
||||
return "article"
|
||||
}
|
||||
|
||||
// generateSummary generates a simple summary
|
||||
func (h *WebScrapingHandler) generateSummary(content string) string {
|
||||
sentences := strings.Split(content, ".")
|
||||
if len(sentences) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Take first 2-3 sentences as summary
|
||||
summaryLength := 2
|
||||
if len(sentences) < 2 {
|
||||
summaryLength = len(sentences)
|
||||
} else if len(sentences) > 3 {
|
||||
summaryLength = 3
|
||||
}
|
||||
|
||||
var summary string
|
||||
for i := 0; i < summaryLength; i++ {
|
||||
sentence := strings.TrimSpace(sentences[i])
|
||||
if sentence != "" {
|
||||
summary += sentence + ". "
|
||||
}
|
||||
}
|
||||
|
||||
return strings.TrimSpace(summary)
|
||||
}
|
||||
Reference in New Issue
Block a user