// Files: Devour/internal/scraper/web.go
// 2026-02-24 12:10:13 +01:00
// 480 lines, 10 KiB, Go

package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"errors"
"fmt"
"net/url"
"path"
"regexp"
"strings"
"sync"
"time"
"github.com/gocolly/colly/v2"
)
// WebScraper scrapes documentation from web URLs.
//
// It carries only the scraper configuration; the methods in this file read
// UserAgent, Timeout, RateLimit, Concurrency, MaxDepth and CacheDir from it.
type WebScraper struct {
	// config supplies the crawl settings. It is dereferenced without a nil
	// check, so it must be non-nil before Scrape or DetectChanges is called.
	config *Config
}
// NewWebScraper returns a WebScraper that uses config for every crawl.
// The pointer is stored as given; it is neither copied nor validated.
func NewWebScraper(config *Config) *WebScraper {
	scraper := new(WebScraper)
	scraper.config = config
	return scraper
}
// Scrape crawls source.URL with an asynchronous Colly collector and returns
// one Document per unique in-scope page.
//
// The crawl is restricted to the host of source.URL, to the path scope derived
// from it (pathScopePrefix / pathScopeLeaf), to config.MaxDepth link hops
// (2 when unset) and to a page budget of Concurrency*40 clamped to [20, 300]
// (at most 30 when the depth is 1). Pages whose cleaned content is shorter
// than 100 bytes, or byte-identical to a page already collected, are dropped.
// Links that differ only by their #fragment are treated as the same page.
//
// Cancelling ctx stops new requests from being issued and makes Scrape return
// ctx's error once the collector has drained. Otherwise an error is returned
// only when the URL is invalid, the collector cannot be configured or started,
// or no document at all was extracted; per-page fetch errors (at most 20 are
// kept) are reported only in that last case.
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	// Tolerate a nil context so callers that pass nil keep working.
	if ctx == nil {
		ctx = context.Background()
	}
	if err := ctx.Err(); err != nil {
		return nil, err
	}

	// Upper bound on retained per-page errors, to keep the joined error readable.
	const maxRecordedErrors = 20

	// mu guards documents, visited, scheduled, contentHashes and scrapeErrors;
	// Colly invokes the callbacks below from multiple goroutines.
	var mu sync.Mutex
	var documents []*Document
	var scrapeErrors []error
	visited := make(map[string]bool)
	scheduled := make(map[string]bool)
	contentHashes := make(map[string]bool)

	// recordError stores a per-URL error unless it is Colly's benign
	// "already visited" notice or the cap has been reached.
	recordError := func(target string, err error) {
		if strings.Contains(strings.ToLower(err.Error()), "already visited") {
			return
		}
		mu.Lock()
		defer mu.Unlock()
		if len(scrapeErrors) < maxRecordedErrors {
			scrapeErrors = append(scrapeErrors, fmt.Errorf("%s: %w", target, err))
		}
	}

	// Parse base URL for domain restrictions
	baseURL, err := url.Parse(source.URL)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}
	allowedDomain := baseURL.Hostname()
	if allowedDomain == "" {
		allowedDomain = baseURL.Host
	}

	maxDepth := s.config.MaxDepth
	if maxDepth <= 0 {
		maxDepth = 2
	}

	// Page budget: scales with concurrency, bounded on both sides.
	maxPages := s.config.Concurrency * 40
	if maxPages < 20 {
		maxPages = 20
	}
	if maxDepth <= 1 && maxPages > 30 {
		maxPages = 30
	}
	if maxPages > 300 {
		maxPages = 300
	}

	scopePrefix := pathScopePrefix(baseURL.Path)
	scopeLeaf := pathScopeLeaf(baseURL.Path)

	// Create Colly collector
	c := colly.NewCollector(
		colly.AllowedDomains(allowedDomain),
		colly.MaxDepth(maxDepth),
		colly.Async(true),
		colly.UserAgent(s.config.UserAgent),
	)

	// Set rate limiting
	if s.config.RateLimit > 0 {
		if err := c.Limit(&colly.LimitRule{
			DomainGlob:  "*",
			Parallelism: s.config.Concurrency,
			Delay:       s.config.RateLimit,
		}); err != nil {
			return nil, fmt.Errorf("failed to set rate limiting: %w", err)
		}
	}

	// Set timeout
	if s.config.Timeout > 0 {
		c.SetRequestTimeout(s.config.Timeout)
	}

	// Enable caching if cache directory is set
	if s.config.CacheDir != "" {
		c.CacheDir = s.config.CacheDir
	}

	// Drop requests that are still queued once the caller has given up.
	c.OnRequest(func(r *colly.Request) {
		if ctx.Err() != nil {
			r.Abort()
		}
	})

	// Handle errors
	c.OnError(func(r *colly.Response, err error) {
		reqURL := source.URL
		if r != nil && r.Request != nil && r.Request.URL != nil {
			reqURL = r.Request.URL.String()
		}
		recordError(reqURL, err)
	})

	// Extract content from pages
	c.OnHTML("html", func(e *colly.HTMLElement) {
		pageURL := e.Request.URL.String()
		if !withinScope(e.Request.URL, baseURL, scopePrefix, scopeLeaf) {
			return
		}

		// Skip if already visited or the page budget is spent. The page is
		// marked visited before pattern filtering, so filtered pages still
		// count against the budget.
		mu.Lock()
		if visited[pageURL] || len(visited) >= maxPages {
			mu.Unlock()
			return
		}
		visited[pageURL] = true
		mu.Unlock()

		// Check include/exclude patterns
		if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
			return
		}

		// Extract title
		title := e.ChildText("title")
		if title == "" {
			title = e.ChildText("h1")
		}

		// Extract main content
		content := s.extractContent(e)

		// Skip if content is too short
		if len(content) < 100 {
			return
		}

		// The hash doubles as change-detection value and duplicate filter.
		hash := s.generateHash(content)
		mu.Lock()
		if contentHashes[hash] {
			mu.Unlock()
			return
		}
		contentHashes[hash] = true
		mu.Unlock()

		// Extract metadata
		metadata := map[string]interface{}{
			"headings":    s.extractHeadings(e),
			"links":       s.extractLinks(e),
			"images":      s.extractImages(e),
			"description": e.ChildAttr(`meta[name="description"]`, "content"),
		}

		doc := &Document{
			ID:        generateDocID(pageURL),
			Source:    source.Name,
			Type:      "html",
			Title:     strings.TrimSpace(title),
			Content:   content,
			URL:       pageURL,
			Metadata:  metadata,
			Hash:      hash,
			Timestamp: time.Now(),
		}

		mu.Lock()
		documents = append(documents, doc)
		mu.Unlock()
	})

	// Follow links
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		// Do not schedule further work after cancellation.
		if ctx.Err() != nil {
			return
		}

		link := e.Attr("href")
		absoluteURL := e.Request.AbsoluteURL(link)
		if absoluteURL == "" {
			return
		}
		linkURL, err := url.Parse(absoluteURL)
		if err != nil {
			return
		}
		if linkURL.Scheme != "http" && linkURL.Scheme != "https" {
			return
		}

		// The fragment never reaches the server, so "page#a" and "page#b"
		// are the same document; strip it so in-page anchors neither
		// trigger refetches nor consume the scheduling budget.
		linkURL.Fragment = ""
		linkURL.RawFragment = ""
		absoluteURL = linkURL.String()

		if !withinScope(linkURL, baseURL, scopePrefix, scopeLeaf) {
			return
		}

		// Skip if already visited
		mu.Lock()
		if visited[absoluteURL] || len(visited) >= maxPages {
			mu.Unlock()
			return
		}
		mu.Unlock()

		// Check include/exclude patterns
		if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
			return
		}

		mu.Lock()
		if scheduled[absoluteURL] || len(scheduled) >= maxPages {
			mu.Unlock()
			return
		}
		scheduled[absoluteURL] = true
		mu.Unlock()

		if err := c.Visit(absoluteURL); err != nil {
			recordError(absoluteURL, err)
		}
	})

	// Start scraping. No callback is running yet, so the map can be written
	// without holding the lock.
	scheduled[source.URL] = true
	if err := c.Visit(source.URL); err != nil {
		return nil, fmt.Errorf("failed to start scraping: %w", err)
	}

	// Wait for async scraping to complete
	c.Wait()

	if err := ctx.Err(); err != nil {
		return nil, fmt.Errorf("web scrape of %s interrupted: %w", source.URL, err)
	}

	mu.Lock()
	defer mu.Unlock()
	if len(documents) == 0 {
		if len(scrapeErrors) > 0 {
			return nil, fmt.Errorf("web scrape failed: %w", errors.Join(scrapeErrors...))
		}
		return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
	}
	return documents, nil
}
// DetectChanges reports whether the main page of source differs from the
// content that produced lastHash.
//
// Only source.URL itself is fetched; linked pages are not followed. The page
// text is extracted with extractContent and hashed with generateHash.
//
// Returns (changed, currentHash, err). err is ctx's error when ctx is already
// done, or the error reported by the collector's Visit call. If the response
// contains no <html> element the extracted content stays empty and
// currentHash is the hash of the empty string.
func (s *WebScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	// Tolerate a nil context so callers that pass nil keep working.
	if ctx == nil {
		ctx = context.Background()
	}
	if err := ctx.Err(); err != nil {
		return false, "", err
	}

	// Quick check by fetching just the main page
	c := colly.NewCollector(
		colly.UserAgent(s.config.UserAgent),
	)
	// Only override the collector's timeout when one is configured; passing
	// zero would disable the HTTP client timeout altogether. Scrape applies
	// the same guard.
	if s.config.Timeout > 0 {
		c.SetRequestTimeout(s.config.Timeout)
	}

	var content string
	c.OnHTML("html", func(e *colly.HTMLElement) {
		content = s.extractContent(e)
	})

	if err := c.Visit(source.URL); err != nil {
		return false, "", err
	}

	currentHash := s.generateHash(content)
	return currentHash != lastHash, currentHash, nil
}
// extractContent returns the cleaned main text of a page.
//
// Selectors for common documentation containers are tried in order; the first
// one whose text is longer than minContentLen bytes wins. When none
// qualifies, the text of the whole <body> is used instead, so a short or
// empty match from a late selector can no longer mask the rest of the page.
// The chosen text is passed through cleanText before being returned.
func (s *WebScraper) extractContent(e *colly.HTMLElement) string {
	// Matches at or below this length are treated as "not the main content".
	const minContentLen = 200

	// Try common content selectors
	selectors := []string{
		"article",
		"main",
		".content",
		".documentation",
		".docs",
		".markdown-body",
		"[role='main']",
		"#content",
		"#main",
	}
	for _, selector := range selectors {
		if text := e.ChildText(selector); len(text) > minContentLen {
			return cleanText(text)
		}
	}

	// Fallback to body if no selector produced enough content
	return cleanText(e.ChildText("body"))
}
// extractHeadings returns the trimmed text of every h1-h6 element under e,
// in the order ForEach yields them. Headings that are empty after trimming
// are skipped. The result is nil when there are none.
func (s *WebScraper) extractHeadings(e *colly.HTMLElement) []string {
	var collected []string
	e.ForEach("h1, h2, h3, h4, h5, h6", func(_ int, node *colly.HTMLElement) {
		heading := strings.TrimSpace(node.Text)
		if heading == "" {
			return
		}
		collected = append(collected, heading)
	})
	return collected
}
// extractLinks returns the distinct href values of the anchors under e, in
// first-seen order. Empty hrefs and same-page fragment links (starting with
// "#") are skipped. Values are returned exactly as written in the markup:
// they are not resolved against the page URL and not filtered by host.
func (s *WebScraper) extractLinks(e *colly.HTMLElement) []string {
	var hrefs []string
	known := map[string]bool{}
	e.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) {
		target := anchor.Attr("href")
		switch {
		case target == "", known[target], strings.HasPrefix(target, "#"):
			return
		}
		hrefs = append(hrefs, target)
		known[target] = true
	})
	return hrefs
}
// extractImages returns the non-empty src attribute of every img element
// under e, in the order ForEach yields them. Duplicates are kept and values
// are not resolved against the page URL.
func (s *WebScraper) extractImages(e *colly.HTMLElement) []string {
	var sources []string
	e.ForEach("img[src]", func(_ int, node *colly.HTMLElement) {
		if src := node.Attr("src"); src != "" {
			sources = append(sources, src)
		}
	})
	return sources
}
// urlPatternCache memoizes compiled include/exclude patterns so they are not
// recompiled for every link of every page. Keys are pattern strings, values
// are *regexp.Regexp (a nil pointer marks a pattern that failed to compile).
// A sync.Map is used because shouldInclude runs inside concurrent collector
// callbacks.
var urlPatternCache sync.Map

// compiledURLPattern returns the compiled form of pattern, or nil when the
// pattern is not a valid regular expression.
func compiledURLPattern(pattern string) *regexp.Regexp {
	if cached, ok := urlPatternCache.Load(pattern); ok {
		re, _ := cached.(*regexp.Regexp)
		return re
	}
	re, err := regexp.Compile(pattern)
	if err != nil {
		// Remember the failure too, so a bad pattern is only parsed once.
		re = nil
	}
	urlPatternCache.Store(pattern, re)
	return re
}

// matchesAnyURLPattern reports whether urlStr matches at least one of the
// given regular expressions. Invalid patterns never match.
func matchesAnyURLPattern(patterns []string, urlStr string) bool {
	for _, pattern := range patterns {
		if re := compiledURLPattern(pattern); re != nil && re.MatchString(urlStr) {
			return true
		}
	}
	return false
}

// shouldInclude checks if a URL should be included based on patterns.
//
// Patterns are regular expressions matched anywhere in urlStr. Exclude
// patterns take precedence: a single exclude match rejects the URL. With no
// include patterns every non-excluded URL is accepted; otherwise at least one
// include pattern must match. A pattern that does not compile is treated as
// matching nothing, so an include list made up solely of invalid patterns
// rejects every URL.
func (s *WebScraper) shouldInclude(urlStr string, include, exclude []string) bool {
	if matchesAnyURLPattern(exclude, urlStr) {
		return false
	}
	return len(include) == 0 || matchesAnyURLPattern(include, urlStr)
}
// generateHash returns the SHA-256 digest of content as a lowercase
// hexadecimal string (64 characters).
func (s *WebScraper) generateHash(content string) string {
	hasher := sha256.New()
	hasher.Write([]byte(content)) // hash.Hash writes never return an error
	return hex.EncodeToString(hasher.Sum(nil))
}
// cleanTextNoiseRe matches boilerplate phrases that documentation sites
// commonly inject around the real content. It is compiled once at package
// initialisation. Matching is case-insensitive, anchored on word boundaries
// (so "sign in" does not fire inside "design interface"), and tolerant of any
// run of whitespace between the words of a phrase.
var cleanTextNoiseRe = func() *regexp.Regexp {
	noisePhrases := []string{
		"table of contents",
		"in this article",
		"additional resources",
		"feedback",
		"collaborate with us on github",
		"copyright",
		"all rights reserved",
		"privacy policy",
		"terms of service",
		"sign in",
		"skip to main content",
		"ask learn",
	}
	alternatives := make([]string, len(noisePhrases))
	for i, phrase := range noisePhrases {
		words := strings.Fields(phrase)
		for j, word := range words {
			words[j] = regexp.QuoteMeta(word)
		}
		alternatives[i] = strings.Join(words, `\s+`)
	}
	return regexp.MustCompile(`(?i)\b(?:` + strings.Join(alternatives, "|") + `)\b`)
}()

// cleanTextSpaceRe matches any run of whitespace.
var cleanTextSpaceRe = regexp.MustCompile(`\s+`)

// cleanText removes boilerplate phrases and normalizes whitespace.
//
// Every noise phrase is replaced by a single space, then each run of
// whitespace is collapsed to one space and the result is trimmed.
func cleanText(text string) string {
	text = cleanTextNoiseRe.ReplaceAllString(text, " ")
	// Replace multiple whitespace with single space
	text = cleanTextSpaceRe.ReplaceAllString(text, " ")
	return strings.TrimSpace(text)
}
// pathScopePrefix derives the path prefix that bounds a crawl started at
// rawPath.
//
// The path is cleaned first. A root or empty path yields "", which withinScope
// treats as "no path restriction". Otherwise the parent directory of the
// cleaned path is returned, except when that parent is the root: then the
// cleaned path itself is returned, so a root-level page or section does not
// widen the scope to the whole site.
func pathScopePrefix(rawPath string) string {
	clean := path.Clean(rawPath)
	// path.Clean never returns ""; an empty input comes back as ".".
	if clean == "." || clean == "/" {
		return ""
	}
	if dir := path.Dir(clean); dir != "/" {
		return dir
	}
	return clean
}
// pathScopeLeaf returns the final element of the cleaned rawPath when that
// element contains a dot (i.e. looks like a file name such as "index.html").
// It returns "" for root or empty paths and for final elements without a dot.
func pathScopeLeaf(rawPath string) string {
	switch cleaned := path.Clean(rawPath); cleaned {
	case ".", "/", "":
		return ""
	default:
		if base := path.Base(cleaned); strings.IndexByte(base, '.') >= 0 {
			return base
		}
		return ""
	}
}
// withinScope reports whether target may be crawled given the crawl's base
// URL and the path scope derived from it.
//
// target must be on the same host as base (compared case-insensitively). An
// empty prefix means the whole host is in scope. Otherwise target's path must
// either lie under prefix, or its final element must equal leaf when leaf is
// non-empty. "Under prefix" is decided on path-segment boundaries: prefix
// "/docs" admits "/docs" and "/docs/intro" but not "/docs-old" or "/docsify".
// A nil target or base is never in scope.
func withinScope(target, base *url.URL, prefix, leaf string) bool {
	if target == nil || base == nil {
		return false
	}
	if !strings.EqualFold(target.Hostname(), base.Hostname()) {
		return false
	}
	if prefix == "" {
		return true
	}

	targetPath := target.Path
	if targetPath == "" {
		targetPath = "/"
	}

	if strings.HasPrefix(targetPath, prefix) {
		// Accept only an exact match or a match that ends at a "/" so that
		// sibling directories sharing a name prefix stay out of scope.
		rest := targetPath[len(prefix):]
		if rest == "" || strings.HasSuffix(prefix, "/") || rest[0] == '/' {
			return true
		}
	}
	return leaf != "" && path.Base(targetPath) == leaf
}