mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
297 lines
6.5 KiB
Go
297 lines
6.5 KiB
Go
package scraper
|
|
|
|
import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)
|
|
|
|
// WebScraper scrapes documentation from web URLs.
|
|
type WebScraper struct {
|
|
config *Config
|
|
}
|
|
|
|
// NewWebScraper creates a new web scraper.
|
|
func NewWebScraper(config *Config) *WebScraper {
|
|
return &WebScraper{config: config}
|
|
}
|
|
|
|
// Scrape fetches and parses documents from a web source.
|
|
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
|
var documents []*Document
|
|
visited := make(map[string]bool)
|
|
|
|
// Parse base URL for domain restrictions
|
|
baseURL, err := url.Parse(source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("invalid URL: %w", err)
|
|
}
|
|
|
|
// Create Colly collector
|
|
c := colly.NewCollector(
|
|
colly.AllowedDomains(baseURL.Host),
|
|
colly.MaxDepth(s.config.MaxDepth),
|
|
colly.Async(true),
|
|
colly.UserAgent(s.config.UserAgent),
|
|
)
|
|
|
|
// Set rate limiting
|
|
if s.config.RateLimit > 0 {
|
|
c.Limit(&colly.LimitRule{
|
|
DomainGlob: "*",
|
|
Parallelism: s.config.Concurrency,
|
|
Delay: s.config.RateLimit,
|
|
})
|
|
}
|
|
|
|
// Set timeout
|
|
if s.config.Timeout > 0 {
|
|
c.SetRequestTimeout(s.config.Timeout)
|
|
}
|
|
|
|
// Enable caching if cache directory is set
|
|
if s.config.CacheDir != "" {
|
|
c.CacheDir = s.config.CacheDir
|
|
}
|
|
|
|
// Handle errors
|
|
c.OnError(func(r *colly.Response, err error) {
|
|
fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
|
|
})
|
|
|
|
// Extract content from pages
|
|
c.OnHTML("html", func(e *colly.HTMLElement) {
|
|
pageURL := e.Request.URL.String()
|
|
|
|
// Skip if already visited
|
|
if visited[pageURL] {
|
|
return
|
|
}
|
|
visited[pageURL] = true
|
|
|
|
// Check include/exclude patterns
|
|
if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
|
|
return
|
|
}
|
|
|
|
// Extract title
|
|
title := e.ChildText("title")
|
|
if title == "" {
|
|
title = e.ChildText("h1")
|
|
}
|
|
|
|
// Extract main content
|
|
content := s.extractContent(e)
|
|
|
|
// Skip if content is too short
|
|
if len(content) < 100 {
|
|
return
|
|
}
|
|
|
|
// Generate hash for change detection
|
|
hash := s.generateHash(content)
|
|
|
|
// Extract metadata
|
|
metadata := map[string]interface{}{
|
|
"headings": s.extractHeadings(e),
|
|
"links": s.extractLinks(e),
|
|
"images": s.extractImages(e),
|
|
"description": e.ChildAttr(`meta[name="description"]`, "content"),
|
|
}
|
|
|
|
doc := &Document{
|
|
ID: generateDocID(pageURL),
|
|
Source: source.Name,
|
|
Type: "html",
|
|
Title: strings.TrimSpace(title),
|
|
Content: content,
|
|
URL: pageURL,
|
|
Metadata: metadata,
|
|
Hash: hash,
|
|
Timestamp: time.Now(),
|
|
}
|
|
|
|
documents = append(documents, doc)
|
|
})
|
|
|
|
// Follow links
|
|
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
|
link := e.Attr("href")
|
|
absoluteURL := e.Request.AbsoluteURL(link)
|
|
|
|
// Skip if already visited
|
|
if visited[absoluteURL] {
|
|
return
|
|
}
|
|
|
|
// Check include/exclude patterns
|
|
if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
|
|
return
|
|
}
|
|
|
|
c.Visit(absoluteURL)
|
|
})
|
|
|
|
// Start scraping
|
|
if err := c.Visit(source.URL); err != nil {
|
|
return nil, fmt.Errorf("failed to start scraping: %w", err)
|
|
}
|
|
|
|
// Wait for async scraping to complete
|
|
c.Wait()
|
|
|
|
return documents, nil
|
|
}
|
|
|
|
// DetectChanges checks if the web source has changed.
|
|
func (s *WebScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
|
// Quick check by fetching just the main page
|
|
c := colly.NewCollector(
|
|
colly.UserAgent(s.config.UserAgent),
|
|
)
|
|
c.SetRequestTimeout(s.config.Timeout)
|
|
|
|
var content string
|
|
c.OnHTML("html", func(e *colly.HTMLElement) {
|
|
content = s.extractContent(e)
|
|
})
|
|
|
|
if err := c.Visit(source.URL); err != nil {
|
|
return false, "", err
|
|
}
|
|
|
|
currentHash := s.generateHash(content)
|
|
changed := currentHash != lastHash
|
|
|
|
return changed, currentHash, nil
|
|
}
|
|
|
|
// extractContent extracts the main text content from a page.
|
|
func (s *WebScraper) extractContent(e *colly.HTMLElement) string {
|
|
// Try common content selectors
|
|
selectors := []string{
|
|
"article",
|
|
"main",
|
|
".content",
|
|
".documentation",
|
|
".docs",
|
|
".markdown-body",
|
|
"[role='main']",
|
|
"#content",
|
|
"#main",
|
|
}
|
|
|
|
var content string
|
|
for _, selector := range selectors {
|
|
content = e.ChildText(selector)
|
|
if len(content) > 200 {
|
|
break
|
|
}
|
|
}
|
|
|
|
// Fallback to body if no content found
|
|
if content == "" {
|
|
content = e.ChildText("body")
|
|
}
|
|
|
|
// Clean up content
|
|
content = cleanText(content)
|
|
|
|
return content
|
|
}
|
|
|
|
// extractHeadings extracts heading structure.
|
|
func (s *WebScraper) extractHeadings(e *colly.HTMLElement) []string {
|
|
var headings []string
|
|
e.ForEach("h1, h2, h3, h4, h5, h6", func(_ int, h *colly.HTMLElement) {
|
|
text := strings.TrimSpace(h.Text)
|
|
if text != "" {
|
|
headings = append(headings, text)
|
|
}
|
|
})
|
|
return headings
|
|
}
|
|
|
|
// extractLinks extracts internal links.
|
|
func (s *WebScraper) extractLinks(e *colly.HTMLElement) []string {
|
|
var links []string
|
|
seen := make(map[string]bool)
|
|
e.ForEach("a[href]", func(_ int, a *colly.HTMLElement) {
|
|
href := a.Attr("href")
|
|
if href != "" && !seen[href] && !strings.HasPrefix(href, "#") {
|
|
links = append(links, href)
|
|
seen[href] = true
|
|
}
|
|
})
|
|
return links
|
|
}
|
|
|
|
// extractImages extracts image URLs.
|
|
func (s *WebScraper) extractImages(e *colly.HTMLElement) []string {
|
|
var images []string
|
|
e.ForEach("img[src]", func(_ int, img *colly.HTMLElement) {
|
|
src := img.Attr("src")
|
|
if src != "" {
|
|
images = append(images, src)
|
|
}
|
|
})
|
|
return images
|
|
}
|
|
|
|
// shouldInclude checks if a URL should be included based on patterns.
|
|
func (s *WebScraper) shouldInclude(urlStr string, include, exclude []string) bool {
|
|
// Check exclude patterns first
|
|
for _, pattern := range exclude {
|
|
matched, _ := regexp.MatchString(pattern, urlStr)
|
|
if matched {
|
|
return false
|
|
}
|
|
}
|
|
|
|
// If no include patterns, include all
|
|
if len(include) == 0 {
|
|
return true
|
|
}
|
|
|
|
// Check include patterns
|
|
for _, pattern := range include {
|
|
matched, _ := regexp.MatchString(pattern, urlStr)
|
|
if matched {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// generateHash generates a SHA256 hash of content.
|
|
func (s *WebScraper) generateHash(content string) string {
|
|
hash := sha256.Sum256([]byte(content))
|
|
return hex.EncodeToString(hash[:])
|
|
}
|
|
|
|
// whitespaceRunRE matches one or more consecutive whitespace characters as
// defined by the regexp class \s. It is compiled once at package
// initialisation instead of on every cleanText call.
var whitespaceRunRE = regexp.MustCompile(`\s+`)

// cleanText collapses every run of whitespace in text into a single space and
// trims leading and trailing whitespace from the result.
func cleanText(text string) string {
	return strings.TrimSpace(whitespaceRunRE.ReplaceAllString(text, " "))
}
|
|
|
|
// generateDocID derives a deterministic document ID from urlStr: the first
// 12 bytes of the URL's SHA-256 digest, hex-encoded to 24 lowercase
// characters. The same URL always produces the same ID.
func generateDocID(urlStr string) string {
	// Number of digest bytes kept for the ID (96 bits).
	const idBytes = 12

	sum := sha256.Sum256([]byte(urlStr))
	return hex.EncodeToString(sum[:idBytes])
}
|