// Devour/internal/scraper/web.go

package scraper

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)

// WebScraper scrapes documentation from web URLs.
//
// It crawls pages with a colly collector (see Scrape) and can cheaply probe a
// source's main page for content changes (see DetectChanges).
type WebScraper struct {
	// config supplies the crawl settings read by the methods in this file:
	// MaxDepth, UserAgent, RateLimit, Concurrency, Timeout and CacheDir.
	config *Config
}

// NewWebScraper returns a WebScraper that reads its crawl settings from config.
// The pointer is stored as given; no copy or validation is performed.
func NewWebScraper(config *Config) *WebScraper {
	scraper := new(WebScraper)
	scraper.config = config
	return scraper
}

// Scrape crawls the site rooted at source.URL and returns one Document per
// accepted HTML page.
//
// The crawl is restricted to the host of source.URL and honours the scraper's
// MaxDepth, UserAgent, RateLimit/Concurrency, Timeout and CacheDir settings.
// A page is turned into a Document only if its URL passes the source's
// include/exclude patterns and its extracted content is at least 100 bytes
// long. Pages are fetched asynchronously, so the order of the returned
// documents is not deterministic.
//
// Cancelling ctx stops new requests from being issued (requests already in
// flight are allowed to finish); Scrape then returns ctx.Err() and no
// documents. An error is also returned when source.URL cannot be parsed, when
// the rate limit rule is rejected, or when the initial visit cannot be started.
// Failures on individual pages are printed and do not abort the crawl.
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	// Parse base URL for domain restrictions
	baseURL, err := url.Parse(source.URL)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	// The collector below runs in async mode, so callbacks for different
	// pages execute on different goroutines. mu guards the state they share.
	var (
		mu        sync.Mutex
		documents []*Document
	)
	visited := make(map[string]bool)

	// markVisited records pageURL and reports whether it had not been seen yet.
	markVisited := func(pageURL string) bool {
		mu.Lock()
		defer mu.Unlock()
		if visited[pageURL] {
			return false
		}
		visited[pageURL] = true
		return true
	}

	// wasVisited reports whether pageURL has already been processed.
	wasVisited := func(pageURL string) bool {
		mu.Lock()
		defer mu.Unlock()
		return visited[pageURL]
	}

	// Create Colly collector
	c := colly.NewCollector(
		colly.AllowedDomains(baseURL.Host),
		colly.MaxDepth(s.config.MaxDepth),
		colly.Async(true),
		colly.UserAgent(s.config.UserAgent),
	)

	// Set rate limiting
	if s.config.RateLimit > 0 {
		rule := &colly.LimitRule{
			DomainGlob:  "*",
			Parallelism: s.config.Concurrency,
			Delay:       s.config.RateLimit,
		}
		if err := c.Limit(rule); err != nil {
			return nil, fmt.Errorf("configuring rate limit: %w", err)
		}
	}

	// Set timeout
	if s.config.Timeout > 0 {
		c.SetRequestTimeout(s.config.Timeout)
	}

	// Enable caching if cache directory is set
	if s.config.CacheDir != "" {
		c.CacheDir = s.config.CacheDir
	}

	// Stop issuing new requests once the caller has cancelled the crawl.
	c.OnRequest(func(r *colly.Request) {
		if ctx.Err() != nil {
			r.Abort()
		}
	})

	// Handle errors
	c.OnError(func(r *colly.Response, err error) {
		fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
	})

	// Extract content from pages
	c.OnHTML("html", func(e *colly.HTMLElement) {
		pageURL := e.Request.URL.String()

		// Skip if already visited; otherwise claim the URL atomically so two
		// goroutines can never both produce a document for it.
		if !markVisited(pageURL) {
			return
		}

		// Check include/exclude patterns
		if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
			return
		}

		// Extract title, falling back to the first-level heading text
		title := e.ChildText("title")
		if title == "" {
			title = e.ChildText("h1")
		}

		// Extract main content
		content := s.extractContent(e)

		// Skip if content is too short
		if len(content) < 100 {
			return
		}

		// Extract metadata
		metadata := map[string]interface{}{
			"headings":    s.extractHeadings(e),
			"links":       s.extractLinks(e),
			"images":      s.extractImages(e),
			"description": e.ChildAttr(`meta[name="description"]`, "content"),
		}

		doc := &Document{
			ID:        generateDocID(pageURL),
			Source:    source.Name,
			Type:      "html",
			Title:     strings.TrimSpace(title),
			Content:   content,
			URL:       pageURL,
			Metadata:  metadata,
			Hash:      s.generateHash(content), // hash for change detection
			Timestamp: time.Now(),
		}

		mu.Lock()
		documents = append(documents, doc)
		mu.Unlock()
	})

	// Follow links
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		absoluteURL := e.Request.AbsoluteURL(e.Attr("href"))

		// AbsoluteURL yields "" for links it cannot resolve.
		if absoluteURL == "" {
			return
		}

		// Skip if already visited
		if wasVisited(absoluteURL) {
			return
		}

		// Check include/exclude patterns
		if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
			return
		}

		// Visit through the originating request rather than the collector:
		// Request.Visit propagates the parent's depth, which is what lets the
		// MaxDepth limit take effect. Errors here are routine crawl-control
		// outcomes (already visited, depth or domain limits), so they are
		// deliberately ignored.
		_ = e.Request.Visit(absoluteURL)
	})

	// Start scraping
	if err := c.Visit(source.URL); err != nil {
		return nil, fmt.Errorf("failed to start scraping: %w", err)
	}

	// Wait for async scraping to complete
	c.Wait()

	if err := ctx.Err(); err != nil {
		return nil, err
	}

	return documents, nil
}

// DetectChanges checks if the web source has changed.
//
// It fetches only source.URL (no links are followed), extracts the page's main
// content and compares its SHA-256 hash with lastHash. It returns whether the
// hashes differ, the freshly computed hash, and any error from the fetch. If
// ctx is already cancelled, ctx.Err() is returned without touching the
// network.
func (s *WebScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if err := ctx.Err(); err != nil {
		return false, "", err
	}

	// Quick check by fetching just the main page
	c := colly.NewCollector(
		colly.UserAgent(s.config.UserAgent),
	)

	// Only override the collector's timeout when one is configured, matching
	// Scrape; passing a zero duration would replace the collector's own
	// timeout with "none".
	if s.config.Timeout > 0 {
		c.SetRequestTimeout(s.config.Timeout)
	}

	var content string
	c.OnHTML("html", func(e *colly.HTMLElement) {
		content = s.extractContent(e)
	})

	// The collector is not put into async mode, so content is expected to be
	// populated by the time Visit returns.
	if err := c.Visit(source.URL); err != nil {
		return false, "", err
	}

	currentHash := s.generateHash(content)
	changed := currentHash != lastHash

	return changed, currentHash, nil
}

// extractContent extracts the main text content from a page.
//
// A list of common "main content" selectors is tried in priority order. The
// first selector whose text is longer than 200 bytes wins. If none reaches
// that threshold, the longest non-empty candidate is used instead, so a short
// but real match is not replaced by whatever the last selector happened to
// return. Only when every selector comes back empty does the function fall
// back to the text of <body>. The chosen text is whitespace-normalised with
// cleanText before it is returned.
func (s *WebScraper) extractContent(e *colly.HTMLElement) string {
	// A candidate longer than this is considered substantial enough to stop
	// searching.
	const minContentLen = 200

	// Try common content selectors
	selectors := []string{
		"article",
		"main",
		".content",
		".documentation",
		".docs",
		".markdown-body",
		"[role='main']",
		"#content",
		"#main",
	}

	var best string
	for _, selector := range selectors {
		candidate := e.ChildText(selector)
		if len(candidate) > minContentLen {
			best = candidate
			break
		}
		// Remember the longest short match; strict comparison keeps the
		// higher-priority selector on ties.
		if len(candidate) > len(best) {
			best = candidate
		}
	}

	// Fallback to body if no content found
	if best == "" {
		best = e.ChildText("body")
	}

	// Clean up content
	return cleanText(best)
}

// extractHeadings returns the trimmed text of every h1-h6 element under e, in
// the order ForEach visits them. Headings whose text is empty after trimming
// are left out; the result is nil when nothing qualifies.
func (s *WebScraper) extractHeadings(e *colly.HTMLElement) []string {
	var collected []string
	collect := func(_ int, node *colly.HTMLElement) {
		if label := strings.TrimSpace(node.Text); label != "" {
			collected = append(collected, label)
		}
	}
	e.ForEach("h1, h2, h3, h4, h5, h6", collect)
	return collected
}

// extractLinks returns the distinct href values of the anchors under e, in
// first-seen order. Empty hrefs and fragment-only links (those starting with
// "#") are skipped. Values are returned exactly as written in the markup: they
// are not resolved against the page URL and are not filtered by domain.
func (s *WebScraper) extractLinks(e *colly.HTMLElement) []string {
	var unique []string
	recorded := map[string]bool{}
	e.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) {
		target := anchor.Attr("href")
		switch {
		case target == "":
			return
		case recorded[target]:
			return
		case strings.HasPrefix(target, "#"):
			return
		}
		unique = append(unique, target)
		recorded[target] = true
	})
	return unique
}

// extractImages returns the src attribute of every img element under e that
// has a non-empty src, in the order ForEach visits them. Values are returned
// as written in the markup and duplicates are kept.
func (s *WebScraper) extractImages(e *colly.HTMLElement) []string {
	var sources []string
	collect := func(_ int, node *colly.HTMLElement) {
		if location := node.Attr("src"); location != "" {
			sources = append(sources, location)
		}
	}
	e.ForEach("img[src]", collect)
	return sources
}

// shouldInclude decides whether urlStr passes the include/exclude filters.
//
// Each pattern is a regular expression matched against urlStr. Exclusion takes
// precedence: a URL matching any exclude pattern is rejected. Otherwise the URL
// is accepted when include is empty or when it matches at least one include
// pattern. A pattern that fails to compile is treated as never matching,
// because the compile error is discarded.
func (s *WebScraper) shouldInclude(urlStr string, include, exclude []string) bool {
	// matchesAny reports whether urlStr matches at least one of patterns.
	matchesAny := func(patterns []string) bool {
		for i := range patterns {
			if ok, _ := regexp.MatchString(patterns[i], urlStr); ok {
				return true
			}
		}
		return false
	}

	switch {
	case matchesAny(exclude):
		return false
	case len(include) == 0:
		return true
	default:
		return matchesAny(include)
	}
}

// generateHash returns the SHA-256 digest of content as a lowercase
// hexadecimal string (64 characters).
func (s *WebScraper) generateHash(content string) string {
	hasher := sha256.New()
	hasher.Write([]byte(content)) // hash.Hash.Write never returns an error
	return hex.EncodeToString(hasher.Sum(nil))
}

// whitespaceRunRe matches one or more consecutive whitespace characters. It is
// compiled once at package initialisation so cleanText does not recompile the
// pattern for every page.
var whitespaceRunRe = regexp.MustCompile(`\s+`)

// cleanText normalizes text by collapsing every run of whitespace matched by
// \s+ into a single space and then trimming leading and trailing whitespace.
func cleanText(text string) string {
	collapsed := whitespaceRunRe.ReplaceAllString(text, " ")
	return strings.TrimSpace(collapsed)
}

// generateDocID derives a document ID from urlStr: the first 12 bytes of the
// URL's SHA-256 digest, hex-encoded (24 characters). The same URL always maps
// to the same ID.
func generateDocID(urlStr string) string {
	const idHexLen = 24 // 12 digest bytes, two hex characters each
	digest := sha256.Sum256([]byte(urlStr))
	return hex.EncodeToString(digest[:])[:idHexLen]
}