// Devour/internal/scraper/web.go

package scraper

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"errors"
	"fmt"
	"net/url"
	"path"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)

// WebScraper scrapes documentation from web URLs.
// Its methods crawl with colly and convert fetched HTML pages into Documents.
type WebScraper struct {
	// config holds the crawl settings the methods read: MaxDepth, Concurrency,
	// RateLimit, Timeout, UserAgent and CacheDir.
	config *Config
}

// NewWebScraper returns a WebScraper that reads its crawl settings from
// config. The pointer is stored as-is (not copied).
func NewWebScraper(config *Config) *WebScraper {
	scraper := new(WebScraper)
	scraper.config = config
	return scraper
}

// Scrape crawls source.URL with an asynchronous colly collector and returns
// one Document per distinct in-scope HTML page.
//
// The crawl stays on the source host and, when the source URL has a path,
// inside the scope computed by pathScopePrefix/pathScopeLeaf. It is bounded by
// the configured MaxDepth (2 when unset) and by a page budget of
// Concurrency*40, clamped to [20, 300] and to 30 when the depth is <= 1.
// Pages rejected by source.Include/source.Exclude, pages whose cleaned content
// is shorter than 100 bytes, and pages whose content hash was already seen are
// skipped.
//
// Cancelling ctx stops new requests from being issued and makes Scrape return
// the context error once the collector drains; requests already in flight are
// not interrupted.
//
// An error is returned when the URL is invalid, the limit rule is rejected,
// the initial visit fails, ctx is done, or no document could be extracted. In
// the last case the recorded request errors (at most 20) are joined into the
// returned error.
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	// ctx used to be ignored entirely, so tolerate a nil context from existing
	// callers instead of panicking on ctx.Err().
	if ctx == nil {
		ctx = context.Background()
	}
	if err := ctx.Err(); err != nil {
		return nil, fmt.Errorf("web scrape canceled: %w", err)
	}

	var documents []*Document
	visited := make(map[string]bool)
	scheduled := make(map[string]bool)
	contentHashes := make(map[string]bool)
	var mu sync.Mutex // guards documents, scrapeErrors and the three maps above
	var scrapeErrors []error

	// recordError keeps a bounded sample of request failures. Revisit and
	// depth-limit rejections are expected during a crawl and are not failures.
	const maxRecordedErrors = 20
	recordError := func(target string, err error) {
		if errors.Is(err, colly.ErrMaxDepth) ||
			strings.Contains(strings.ToLower(err.Error()), "already visited") {
			return
		}
		mu.Lock()
		defer mu.Unlock()
		if len(scrapeErrors) < maxRecordedErrors {
			scrapeErrors = append(scrapeErrors, fmt.Errorf("%s: %w", target, err))
		}
	}

	// Parse base URL for domain restrictions
	baseURL, err := url.Parse(source.URL)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}
	allowedDomain := baseURL.Hostname()
	if allowedDomain == "" {
		allowedDomain = baseURL.Host
	}

	maxDepth := s.config.MaxDepth
	if maxDepth <= 0 {
		maxDepth = 2
	}
	maxPages := s.config.Concurrency * 40
	if maxPages < 20 {
		maxPages = 20
	}
	if maxDepth <= 1 && maxPages > 30 {
		maxPages = 30
	}
	if maxPages > 300 {
		maxPages = 300
	}
	scopePrefix := pathScopePrefix(baseURL.Path)
	scopeLeaf := pathScopeLeaf(baseURL.Path)

	// Create Colly collector
	c := colly.NewCollector(
		colly.AllowedDomains(allowedDomain),
		colly.MaxDepth(maxDepth),
		colly.Async(true),
		colly.UserAgent(s.config.UserAgent),
	)

	// Set rate limiting
	if s.config.RateLimit > 0 {
		if err := c.Limit(&colly.LimitRule{
			DomainGlob:  "*",
			Parallelism: s.config.Concurrency,
			Delay:       s.config.RateLimit,
		}); err != nil {
			return nil, fmt.Errorf("failed to set rate limiting: %w", err)
		}
	}

	// Set timeout
	if s.config.Timeout > 0 {
		c.SetRequestTimeout(s.config.Timeout)
	}

	// Enable caching if cache directory is set
	if s.config.CacheDir != "" {
		c.CacheDir = s.config.CacheDir
	}

	// Drop queued requests once the caller has given up.
	c.OnRequest(func(r *colly.Request) {
		if ctx.Err() != nil {
			r.Abort()
		}
	})

	// Handle errors
	c.OnError(func(r *colly.Response, err error) {
		reqURL := source.URL
		if r != nil && r.Request != nil && r.Request.URL != nil {
			reqURL = r.Request.URL.String()
		}
		recordError(reqURL, err)
	})

	// Extract content from pages
	c.OnHTML("html", func(e *colly.HTMLElement) {
		if ctx.Err() != nil {
			return
		}
		pageURL := e.Request.URL.String()
		if !withinScope(e.Request.URL, baseURL, scopePrefix, scopeLeaf) {
			return
		}

		// Claim the page; it counts against the budget even if it is later
		// filtered out or turns out to be too short.
		mu.Lock()
		if visited[pageURL] || len(visited) >= maxPages {
			mu.Unlock()
			return
		}
		visited[pageURL] = true
		mu.Unlock()

		// Check include/exclude patterns
		if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
			return
		}

		// Extract title
		title := e.ChildText("title")
		if title == "" {
			title = e.ChildText("h1")
		}

		// Extract main content
		content := s.extractContent(e)

		// Skip if content is too short
		if len(content) < 100 {
			return
		}

		// The hash doubles as a duplicate filter: identical content reached
		// through different URLs is stored once.
		hash := s.generateHash(content)
		mu.Lock()
		if contentHashes[hash] {
			mu.Unlock()
			return
		}
		contentHashes[hash] = true
		mu.Unlock()

		// Extract metadata
		metadata := map[string]interface{}{
			"headings":    s.extractHeadings(e),
			"links":       s.extractLinks(e),
			"images":      s.extractImages(e),
			"description": e.ChildAttr(`meta[name="description"]`, "content"),
		}

		doc := &Document{
			ID:        generateDocID(pageURL),
			Source:    source.Name,
			Type:      "html",
			Title:     strings.TrimSpace(title),
			Content:   content,
			URL:       pageURL,
			Metadata:  metadata,
			Hash:      hash,
			Timestamp: time.Now(),
		}

		mu.Lock()
		documents = append(documents, doc)
		mu.Unlock()
	})

	// Follow links
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		if ctx.Err() != nil {
			return
		}
		absoluteURL := e.Request.AbsoluteURL(e.Attr("href"))
		if absoluteURL == "" {
			return
		}

		linkURL, err := url.Parse(absoluteURL)
		if err != nil {
			return
		}
		if linkURL.Scheme != "http" && linkURL.Scheme != "https" {
			return
		}
		if !withinScope(linkURL, baseURL, scopePrefix, scopeLeaf) {
			return
		}

		// Check include/exclude patterns
		if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
			return
		}

		// Reserve the URL under a single lock so two pages linking to the same
		// target cannot both schedule it.
		mu.Lock()
		skip := visited[absoluteURL] || len(visited) >= maxPages ||
			scheduled[absoluteURL] || len(scheduled) >= maxPages
		if !skip {
			scheduled[absoluteURL] = true
		}
		mu.Unlock()
		if skip {
			return
		}

		// Visit through the originating request so colly tracks link depth.
		// Collector.Visit always starts at depth 1, which silently disabled
		// the MaxDepth limit configured above.
		if err := e.Request.Visit(absoluteURL); err != nil {
			recordError(absoluteURL, err)
		}
	})

	// Start scraping
	scheduled[source.URL] = true
	if err := c.Visit(source.URL); err != nil {
		return nil, fmt.Errorf("failed to start scraping: %w", err)
	}

	// Wait for async scraping to complete
	c.Wait()

	if err := ctx.Err(); err != nil {
		return nil, fmt.Errorf("web scrape canceled: %w", err)
	}

	mu.Lock()
	defer mu.Unlock()

	if len(documents) == 0 {
		if len(scrapeErrors) > 0 {
			return nil, fmt.Errorf("web scrape failed: %w", errors.Join(scrapeErrors...))
		}
		return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
	}

	return documents, nil
}

// DetectChanges fetches only source.URL, hashes its extracted content and
// compares that hash with lastHash.
//
// It returns whether the hashes differ, the freshly computed hash, and any
// error from the fetch or from ctx being done. When the response yields no
// "html" element the hash is that of the empty string.
func (s *WebScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	// ctx used to be ignored, so tolerate nil from existing callers.
	if ctx == nil {
		ctx = context.Background()
	}
	if err := ctx.Err(); err != nil {
		return false, "", err
	}

	// Quick check by fetching just the main page
	c := colly.NewCollector(
		colly.UserAgent(s.config.UserAgent),
	)
	// Only override the collector's default timeout when one is configured;
	// passing zero would disable the timeout altogether. Scrape applies the
	// same guard.
	if s.config.Timeout > 0 {
		c.SetRequestTimeout(s.config.Timeout)
	}

	var content string
	c.OnHTML("html", func(e *colly.HTMLElement) {
		content = s.extractContent(e)
	})

	if err := c.Visit(source.URL); err != nil {
		return false, "", err
	}
	// Do not report a result the caller stopped waiting for.
	if err := ctx.Err(); err != nil {
		return false, "", err
	}

	currentHash := s.generateHash(content)
	return currentHash != lastHash, currentHash, nil
}

// extractContent returns the cleaned main text of a page.
//
// It tries a list of common content-container selectors in order and uses the
// first whose text is longer than minContentLen bytes. If none qualifies it
// falls back to the text of <body>. The result is passed through cleanText.
func (s *WebScraper) extractContent(e *colly.HTMLElement) string {
	// Try common content selectors
	selectors := []string{
		"article",
		"main",
		".content",
		".documentation",
		".docs",
		".markdown-body",
		"[role='main']",
		"#content",
		"#main",
	}
	const minContentLen = 200

	// Keep a candidate only when it qualifies. Previously the variable was
	// overwritten on every iteration, so a short non-empty match on the last
	// selector was returned instead of falling back to <body>.
	var content string
	for _, selector := range selectors {
		if text := e.ChildText(selector); len(text) > minContentLen {
			content = text
			break
		}
	}

	// Fallback to body if no container held enough text
	if content == "" {
		content = e.ChildText("body")
	}

	return cleanText(content)
}

// extractHeadings returns the trimmed text of every h1-h6 element under e, in
// the order ForEach visits them. Headings that are empty after trimming are
// dropped; the result is nil when there are none.
func (s *WebScraper) extractHeadings(e *colly.HTMLElement) []string {
	var found []string
	collect := func(_ int, heading *colly.HTMLElement) {
		if trimmed := strings.TrimSpace(heading.Text); trimmed != "" {
			found = append(found, trimmed)
		}
	}
	e.ForEach("h1, h2, h3, h4, h5, h6", collect)
	return found
}

// extractLinks returns each distinct href found on anchors under e, in first
// occurrence order. Empty values and same-page fragment links (those starting
// with "#") are left out. Values are returned as written in the page; they
// are not resolved or filtered by host.
func (s *WebScraper) extractLinks(e *colly.HTMLElement) []string {
	var hrefs []string
	known := make(map[string]bool)
	e.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) {
		target := anchor.Attr("href")
		switch {
		case target == "", known[target], strings.HasPrefix(target, "#"):
			return
		}
		known[target] = true
		hrefs = append(hrefs, target)
	})
	return hrefs
}

// extractImages returns the non-empty src attribute of every img element
// under e, in visit order. Duplicates are kept and values are not resolved
// against the page URL.
func (s *WebScraper) extractImages(e *colly.HTMLElement) []string {
	var sources []string
	e.ForEach("img[src]", func(_ int, image *colly.HTMLElement) {
		if value := image.Attr("src"); value != "" {
			sources = append(sources, value)
		}
	})
	return sources
}

// shouldInclude reports whether urlStr passes the regular-expression filters.
//
// A match on any exclude pattern rejects the URL. Otherwise the URL is
// accepted when include is empty or when at least one include pattern matches.
// Patterns are unanchored regular expressions matched against the full URL.
func (s *WebScraper) shouldInclude(urlStr string, include, exclude []string) bool {
	matchesAny := func(patterns []string) bool {
		for i := range patterns {
			// The error is dropped on purpose: a pattern that does not compile
			// is treated as one that never matches.
			if hit, _ := regexp.MatchString(patterns[i], urlStr); hit {
				return true
			}
		}
		return false
	}

	switch {
	case matchesAny(exclude):
		// Exclusion always wins over inclusion.
		return false
	case len(include) == 0:
		// No allow-list configured: everything not excluded is accepted.
		return true
	default:
		return matchesAny(include)
	}
}

// generateHash returns the SHA-256 digest of content as a lowercase
// hexadecimal string.
func (s *WebScraper) generateHash(content string) string {
	hasher := sha256.New()
	hasher.Write([]byte(content)) // hash.Hash documents that Write never returns an error
	return hex.EncodeToString(hasher.Sum(nil))
}

// noisePhrasePatterns holds one case-insensitive literal matcher per
// boilerplate phrase stripped by cleanText. The patterns are compiled once at
// package initialization rather than on every call; a compiled Regexp is safe
// for concurrent use.
var noisePhrasePatterns = func() []*regexp.Regexp {
	phrases := []string{
		"table of contents",
		"in this article",
		"additional resources",
		"feedback",
		"collaborate with us on github",
		"copyright",
		"all rights reserved",
		"privacy policy",
		"terms of service",
		"sign in",
		"skip to main content",
		"ask learn",
	}
	patterns := make([]*regexp.Regexp, 0, len(phrases))
	for _, phrase := range phrases {
		patterns = append(patterns, regexp.MustCompile(`(?i)`+regexp.QuoteMeta(phrase)))
	}
	return patterns
}()

// whitespaceRunPattern matches a run of one or more whitespace characters.
var whitespaceRunPattern = regexp.MustCompile(`\s+`)

// cleanText strips boilerplate phrases and normalizes whitespace.
//
// Every occurrence of each noise phrase is replaced by a single space,
// regardless of case and of where it appears in the text (including inside
// longer words). The phrases are applied one after another in list order.
// Whitespace runs are then collapsed to one space and the result is trimmed.
func cleanText(text string) string {
	for _, pattern := range noisePhrasePatterns {
		text = pattern.ReplaceAllString(text, " ")
	}

	// Replace multiple whitespace with single space
	text = whitespaceRunPattern.ReplaceAllString(text, " ")

	return strings.TrimSpace(text)
}

// pathScopePrefix derives the path prefix used to confine a crawl from the
// path of the start URL.
//
// An empty or root path yields "" (no path restriction). A path whose cleaned
// form sits directly under the root is returned unchanged; any deeper path
// yields its parent directory. Whether the final segment looks like a file
// name (contains a dot) makes no difference to the result.
func pathScopePrefix(rawPath string) string {
	normalized := path.Clean(rawPath)
	switch normalized {
	case ".", "/", "":
		return ""
	}

	// The only slash being the leading one means the parent is the root:
	// scope to the path itself rather than to the whole site.
	if strings.LastIndexByte(normalized, '/') == 0 {
		return normalized
	}
	return path.Dir(normalized)
}

// pathScopeLeaf returns the final segment of the cleaned path when that
// segment contains a dot (for example "index.html"), and "" otherwise. An
// empty or root path also yields "".
func pathScopeLeaf(rawPath string) string {
	switch normalized := path.Clean(rawPath); normalized {
	case ".", "/", "":
		return ""
	default:
		if segment := path.Base(normalized); strings.ContainsRune(segment, '.') {
			return segment
		}
		return ""
	}
}

// withinScope reports whether target may be crawled for a source rooted at
// base.
//
// The hosts must match case-insensitively. With an empty prefix every path on
// that host is in scope. Otherwise the target path must equal prefix or lie
// beneath it on a path-segment boundary, so prefix "/docs" admits "/docs" and
// "/docs/x" but not "/docs-old/x". As an alternative, a non-empty leaf admits
// any path on the host whose final segment equals leaf. A nil target or base
// is never in scope.
func withinScope(target, base *url.URL, prefix, leaf string) bool {
	if target == nil || base == nil {
		return false
	}
	if !strings.EqualFold(target.Hostname(), base.Hostname()) {
		return false
	}
	if prefix == "" {
		return true
	}

	targetPath := target.Path
	if targetPath == "" {
		targetPath = "/"
	}
	if targetPath == prefix {
		return true
	}

	// Compare against "prefix/" so that sibling paths merely sharing leading
	// characters with the prefix are not mistaken for descendants.
	boundary := prefix
	if !strings.HasSuffix(boundary, "/") {
		boundary += "/"
	}
	if strings.HasPrefix(targetPath, boundary) {
		return true
	}

	return leaf != "" && path.Base(targetPath) == leaf
}