Files
Devour/internal/scraper/web.go
Tomas Dvorak 55885a0e8f first commit
2026-02-22 10:42:17 +01:00

297 lines
6.5 KiB
Go

package scraper
import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)
// WebScraper scrapes documentation from web URLs.
//
// It carries no state besides its configuration; each call to Scrape or
// DetectChanges builds its own colly collector.
type WebScraper struct {
// config supplies the crawl settings read by Scrape and DetectChanges
// (MaxDepth, UserAgent, RateLimit, Concurrency, Timeout, CacheDir).
config *Config
}
// NewWebScraper returns a WebScraper that uses the given configuration.
// The config pointer is stored as-is; it is neither copied nor validated.
func NewWebScraper(config *Config) *WebScraper {
	scraper := new(WebScraper)
	scraper.config = config
	return scraper
}
// Scrape crawls the web source starting at source.URL and returns one
// Document per accepted page.
//
// The crawl is restricted to the host of source.URL, follows only links that
// pass the source's include/exclude patterns, and is bounded by the
// configured MaxDepth. Pages whose extracted content is shorter than 100
// bytes are skipped. Per-request failures are printed and do not abort the
// crawl. Because pages are fetched asynchronously, the order of the returned
// documents is not deterministic.
//
// An error is returned when source.URL cannot be parsed, the rate-limit rule
// is rejected, the first request cannot be started, or ctx is cancelled
// before the crawl finishes.
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	// Tolerate a nil context so callers that relied on ctx being unused keep working.
	if ctx == nil {
		ctx = context.Background()
	}
	if err := ctx.Err(); err != nil {
		return nil, err
	}

	// Parse base URL for domain restrictions
	baseURL, err := url.Parse(source.URL)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	// Create Colly collector
	c := colly.NewCollector(
		colly.AllowedDomains(baseURL.Host),
		colly.MaxDepth(s.config.MaxDepth),
		colly.Async(true),
		colly.UserAgent(s.config.UserAgent),
	)

	// Set rate limiting
	if s.config.RateLimit > 0 {
		rule := &colly.LimitRule{
			DomainGlob:  "*",
			Parallelism: s.config.Concurrency,
			Delay:       s.config.RateLimit,
		}
		if err := c.Limit(rule); err != nil {
			return nil, fmt.Errorf("invalid rate limit rule: %w", err)
		}
	}

	// Set timeout
	if s.config.Timeout > 0 {
		c.SetRequestTimeout(s.config.Timeout)
	}

	// Enable caching if cache directory is set
	if s.config.CacheDir != "" {
		c.CacheDir = s.config.CacheDir
	}

	// Stop issuing new requests once the caller has cancelled the context.
	c.OnRequest(func(r *colly.Request) {
		if ctx.Err() != nil {
			r.Abort()
		}
	})

	// Handle errors
	c.OnError(func(r *colly.Response, err error) {
		fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
	})

	// The collector is asynchronous, so callbacks for different responses run
	// concurrently; mu guards the state they share.
	var (
		mu        sync.Mutex
		documents []*Document
		visited   = make(map[string]bool)
	)

	// Extract content from pages
	c.OnHTML("html", func(e *colly.HTMLElement) {
		pageURL := e.Request.URL.String()

		// Check-and-mark atomically so the same page is never processed twice.
		mu.Lock()
		alreadySeen := visited[pageURL]
		visited[pageURL] = true
		mu.Unlock()
		if alreadySeen {
			return
		}

		// Check include/exclude patterns
		if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
			return
		}

		// Prefer the <title>; fall back to the first-level heading text.
		title := e.ChildText("title")
		if title == "" {
			title = e.ChildText("h1")
		}

		// Skip pages without enough content to be useful.
		content := s.extractContent(e)
		if len(content) < 100 {
			return
		}

		doc := &Document{
			ID:      generateDocID(pageURL),
			Source:  source.Name,
			Type:    "html",
			Title:   strings.TrimSpace(title),
			Content: content,
			URL:     pageURL,
			Metadata: map[string]interface{}{
				"headings":    s.extractHeadings(e),
				"links":       s.extractLinks(e),
				"images":      s.extractImages(e),
				"description": e.ChildAttr(`meta[name="description"]`, "content"),
			},
			// Hash of the cleaned content, used for change detection.
			Hash:      s.generateHash(content),
			Timestamp: time.Now(),
		}

		mu.Lock()
		documents = append(documents, doc)
		mu.Unlock()
	})

	// Follow links
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		absoluteURL := e.Request.AbsoluteURL(e.Attr("href"))
		if absoluteURL == "" {
			// Fragment-only or unparsable href; nothing to visit.
			return
		}

		mu.Lock()
		alreadySeen := visited[absoluteURL]
		mu.Unlock()
		if alreadySeen {
			return
		}

		// Check include/exclude patterns
		if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
			return
		}

		// Visit through the current request so the crawl depth is carried
		// forward and MaxDepth is enforced; Collector.Visit would restart the
		// depth counter for every link. The returned error is intentionally
		// dropped: already-visited, max-depth and forbidden-domain rejections
		// are expected during a crawl.
		_ = e.Request.Visit(absoluteURL)
	})

	// Start scraping
	if err := c.Visit(source.URL); err != nil {
		return nil, fmt.Errorf("failed to start scraping: %w", err)
	}

	// Wait for async scraping to complete
	c.Wait()

	if err := ctx.Err(); err != nil {
		return nil, err
	}
	return documents, nil
}
// DetectChanges checks if the web source has changed by fetching only the
// page at source.URL, hashing its extracted content and comparing the result
// with lastHash.
//
// It returns whether the hash differs from lastHash, the newly computed hash,
// and any error reported while fetching the page. When ctx is already
// cancelled, its error is returned without issuing a request.
func (s *WebScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	// Tolerate a nil context so callers that relied on ctx being unused keep working.
	if ctx == nil {
		ctx = context.Background()
	}
	if err := ctx.Err(); err != nil {
		return false, "", err
	}

	// Quick check by fetching just the main page
	c := colly.NewCollector(
		colly.UserAgent(s.config.UserAgent),
	)

	// Only override the timeout when one is configured, matching Scrape; an
	// unset (zero) value would otherwise replace the collector's own default.
	if s.config.Timeout > 0 {
		c.SetRequestTimeout(s.config.Timeout)
	}

	var content string
	c.OnHTML("html", func(e *colly.HTMLElement) {
		content = s.extractContent(e)
	})

	if err := c.Visit(source.URL); err != nil {
		return false, "", err
	}

	currentHash := s.generateHash(content)
	return currentHash != lastHash, currentHash, nil
}
// extractContent extracts the main text content from a page.
//
// It tries a list of common content-container selectors in order and uses the
// first one whose text is longer than 200 bytes. When no selector yields
// enough text, the text of <body> is used instead. The result is passed
// through cleanText before being returned.
func (s *WebScraper) extractContent(e *colly.HTMLElement) string {
	// Try common content selectors
	selectors := []string{
		"article",
		"main",
		".content",
		".documentation",
		".docs",
		".markdown-body",
		"[role='main']",
		"#content",
		"#main",
	}

	// A selector match only counts when it is longer than this many bytes.
	const minSelectorLen = 200

	var content string
	for _, selector := range selectors {
		// Keep the candidate separate from content so that a too-short match
		// (including one from the last selector) never suppresses the body
		// fallback below.
		if candidate := e.ChildText(selector); len(candidate) > minSelectorLen {
			content = candidate
			break
		}
	}

	// Fallback to body if no selector produced enough content
	if content == "" {
		content = e.ChildText("body")
	}

	return cleanText(content)
}
// extractHeadings returns the trimmed text of every h1-h6 element under e,
// in the order ForEach yields them. Headings that are empty after trimming
// are left out.
func (s *WebScraper) extractHeadings(e *colly.HTMLElement) []string {
	const headingSelector = "h1, h2, h3, h4, h5, h6"

	var found []string
	collect := func(_ int, node *colly.HTMLElement) {
		trimmed := strings.TrimSpace(node.Text)
		if trimmed == "" {
			return
		}
		found = append(found, trimmed)
	}
	e.ForEach(headingSelector, collect)

	return found
}
// extractLinks returns the distinct href values of the anchors under e, in
// order of first occurrence. Empty hrefs and fragment-only hrefs (those
// starting with "#") are skipped. Values are returned exactly as written in
// the page; they are not resolved against the page URL.
func (s *WebScraper) extractLinks(e *colly.HTMLElement) []string {
	var (
		unique  = make(map[string]bool)
		targets []string
	)

	e.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) {
		target := anchor.Attr("href")
		switch {
		case target == "":
			return
		case unique[target]:
			return
		case strings.HasPrefix(target, "#"):
			return
		}
		targets = append(targets, target)
		unique[target] = true
	})

	return targets
}
// extractImages returns the non-empty src attribute of every img element
// under e. Duplicates are kept and values are returned as written in the page.
func (s *WebScraper) extractImages(e *colly.HTMLElement) []string {
	var sources []string

	appendSrc := func(_ int, node *colly.HTMLElement) {
		value := node.Attr("src")
		if value == "" {
			return
		}
		sources = append(sources, value)
	}
	e.ForEach("img[src]", appendSrc)

	return sources
}
// shouldInclude reports whether urlStr passes the given regular-expression
// filters.
//
// Exclude patterns take precedence: a match on any of them rejects the URL.
// Otherwise the URL is accepted when there are no include patterns or when at
// least one include pattern matches. A pattern that fails to compile is
// treated as not matching.
func (s *WebScraper) shouldInclude(urlStr string, include, exclude []string) bool {
	matchesAny := func(patterns []string) bool {
		for _, expr := range patterns {
			// Compile errors are deliberately ignored; ok is false in that case.
			if ok, _ := regexp.MatchString(expr, urlStr); ok {
				return true
			}
		}
		return false
	}

	if matchesAny(exclude) {
		return false
	}
	return len(include) == 0 || matchesAny(include)
}
// generateHash returns the SHA-256 digest of content as a lowercase
// hexadecimal string (64 characters).
func (s *WebScraper) generateHash(content string) string {
	hasher := sha256.New()
	// Writes to a hash.Hash never return an error.
	hasher.Write([]byte(content))
	digest := hasher.Sum(nil)
	return hex.EncodeToString(digest)
}
// whitespaceRun matches one or more consecutive whitespace characters. It is
// compiled once at package initialisation instead of on every cleanText call.
var whitespaceRun = regexp.MustCompile(`\s+`)

// cleanText normalizes text by collapsing every run of whitespace into a
// single space and trimming leading and trailing whitespace.
func cleanText(text string) string {
	collapsed := whitespaceRun.ReplaceAllString(text, " ")
	return strings.TrimSpace(collapsed)
}
// generateDocID derives a document ID from a URL: the first 12 bytes of the
// URL's SHA-256 digest, hex-encoded (24 characters). The same URL always
// yields the same ID.
func generateDocID(urlStr string) string {
	const idBytes = 12

	hasher := sha256.New()
	// Writes to a hash.Hash never return an error.
	hasher.Write([]byte(urlStr))
	digest := hasher.Sum(nil)

	return hex.EncodeToString(digest[:idBytes])
}