first commit

2026-07-29 07:33:48 +00:00 · 2026-02-22 10:42:17 +01:00
commit 55885a0e8f
239 changed files with 103690 additions and 0 deletions
@@ -0,0 +1,296 @@
+package scraper
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"net/url"
+	"regexp"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/gocolly/colly/v2"
+)
+
+// WebScraper scrapes documentation from web URLs.
+//
+// It holds only the shared scraper configuration; each Scrape or DetectChanges
+// call builds its own colly collector from it.
+type WebScraper struct {
+	// config supplies the crawl settings read by the methods below
+	// (MaxDepth, UserAgent, RateLimit, Concurrency, Timeout, CacheDir).
+	config *Config
+}
+
+// NewWebScraper returns a WebScraper that reads its settings from config.
+// The pointer is stored as given; the configuration is not copied.
+func NewWebScraper(config *Config) *WebScraper {
+	scraper := new(WebScraper)
+	scraper.config = config
+	return scraper
+}
+
+// Scrape crawls source.URL and returns one Document per page that passes the
+// include/exclude patterns and yields at least 100 bytes of extracted text.
+//
+// The crawl is restricted to the host of source.URL and bounded by
+// config.MaxDepth. Pages are fetched asynchronously, so the state shared by the
+// callbacks (the visited set and the result slice) is guarded by a mutex.
+// Per-page fetch errors are printed and do not abort the crawl.
+//
+// If ctx is cancelled, requests that have not started yet are aborted and
+// ctx.Err() is returned once in-flight work has drained. An error is also
+// returned when source.URL cannot be parsed, the rate-limit rule is rejected,
+// or the initial request cannot be queued.
+func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
+	// Parse base URL for domain restrictions
+	baseURL, err := url.Parse(source.URL)
+	if err != nil {
+		return nil, fmt.Errorf("invalid URL: %w", err)
+	}
+
+	// Allow the bare hostname and, when the URL carries a port, the host:port
+	// form as well, so the restriction works whichever form the collector
+	// compares against.
+	allowed := []string{baseURL.Hostname()}
+	if baseURL.Host != baseURL.Hostname() {
+		allowed = append(allowed, baseURL.Host)
+	}
+
+	// Create Colly collector
+	c := colly.NewCollector(
+		colly.AllowedDomains(allowed...),
+		colly.MaxDepth(s.config.MaxDepth),
+		colly.Async(true),
+		colly.UserAgent(s.config.UserAgent),
+	)
+
+	// Set rate limiting
+	if s.config.RateLimit > 0 {
+		rule := &colly.LimitRule{
+			DomainGlob:  "*",
+			Parallelism: s.config.Concurrency,
+			Delay:       s.config.RateLimit,
+		}
+		if err := c.Limit(rule); err != nil {
+			return nil, fmt.Errorf("invalid rate limit rule: %w", err)
+		}
+	}
+
+	// Set timeout
+	if s.config.Timeout > 0 {
+		c.SetRequestTimeout(s.config.Timeout)
+	}
+
+	// Enable caching if cache directory is set
+	if s.config.CacheDir != "" {
+		c.CacheDir = s.config.CacheDir
+	}
+
+	// The collector is async, so callbacks run on several goroutines at once;
+	// mu guards both visited and documents.
+	var (
+		mu        sync.Mutex
+		documents []*Document
+		visited   = make(map[string]bool)
+	)
+
+	// Stop issuing new requests once the caller has cancelled.
+	c.OnRequest(func(r *colly.Request) {
+		if ctx.Err() != nil {
+			r.Abort()
+		}
+	})
+
+	// Handle errors
+	c.OnError(func(r *colly.Response, err error) {
+		fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
+	})
+
+	// Extract content from pages
+	c.OnHTML("html", func(e *colly.HTMLElement) {
+		pageURL := e.Request.URL.String()
+
+		// Test-and-set under the lock so a page reached twice (for example
+		// via a redirect) is processed by exactly one callback.
+		mu.Lock()
+		alreadySeen := visited[pageURL]
+		visited[pageURL] = true
+		mu.Unlock()
+		if alreadySeen {
+			return
+		}
+
+		// Check include/exclude patterns
+		if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
+			return
+		}
+
+		// Extract title, falling back to the first-level heading text
+		title := e.ChildText("title")
+		if title == "" {
+			title = e.ChildText("h1")
+		}
+
+		// Extract main content
+		content := s.extractContent(e)
+
+		// Skip if content is too short
+		if len(content) < 100 {
+			return
+		}
+
+		doc := &Document{
+			ID:      generateDocID(pageURL),
+			Source:  source.Name,
+			Type:    "html",
+			Title:   strings.TrimSpace(title),
+			Content: content,
+			URL:     pageURL,
+			Metadata: map[string]interface{}{
+				"headings":    s.extractHeadings(e),
+				"links":       s.extractLinks(e),
+				"images":      s.extractImages(e),
+				"description": e.ChildAttr(`meta[name="description"]`, "content"),
+			},
+			// Hash of the cleaned content, used for change detection
+			Hash:      s.generateHash(content),
+			Timestamp: time.Now(),
+		}
+
+		mu.Lock()
+		documents = append(documents, doc)
+		mu.Unlock()
+	})
+
+	// Follow links
+	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
+		absoluteURL := e.Request.AbsoluteURL(e.Attr("href"))
+		if absoluteURL == "" {
+			// Unresolvable or fragment-only link: nothing to fetch.
+			return
+		}
+
+		// Skip if already visited
+		mu.Lock()
+		alreadySeen := visited[absoluteURL]
+		mu.Unlock()
+		if alreadySeen {
+			return
+		}
+
+		// Check include/exclude patterns
+		if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
+			return
+		}
+
+		// Visit through the current request, not the collector: the
+		// collector's Visit restarts at depth 1, which would make MaxDepth
+		// ineffective. The returned error is deliberately ignored because
+		// already-visited, out-of-domain and too-deep links are expected
+		// during a crawl.
+		_ = e.Request.Visit(absoluteURL)
+	})
+
+	// Start scraping
+	if err := c.Visit(source.URL); err != nil {
+		return nil, fmt.Errorf("failed to start scraping: %w", err)
+	}
+
+	// Wait for async scraping to complete
+	c.Wait()
+
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
+	return documents, nil
+}
+
+// DetectChanges reports whether the page at source.URL differs from lastHash.
+//
+// Only the main page is fetched. Its extracted content is hashed with
+// generateHash and compared to lastHash. It returns whether the hashes differ,
+// the current hash, and any error from the fetch. If ctx is already cancelled
+// the fetch is skipped and ctx.Err() is returned.
+func (s *WebScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
+	if err := ctx.Err(); err != nil {
+		return false, "", err
+	}
+
+	// Quick check by fetching just the main page
+	c := colly.NewCollector(
+		colly.UserAgent(s.config.UserAgent),
+	)
+
+	// Only override the timeout when one is configured, as Scrape does; a
+	// zero value would otherwise remove the collector's default timeout.
+	if s.config.Timeout > 0 {
+		c.SetRequestTimeout(s.config.Timeout)
+	}
+
+	var content string
+	c.OnHTML("html", func(e *colly.HTMLElement) {
+		content = s.extractContent(e)
+	})
+
+	if err := c.Visit(source.URL); err != nil {
+		return false, "", err
+	}
+
+	currentHash := s.generateHash(content)
+	return currentHash != lastHash, currentHash, nil
+}
+
+// extractContent extracts the main text content from a page.
+//
+// Common content-container selectors are tried in order and the first one
+// whose text is longer than 200 bytes wins. If none is that long, the longest
+// text found among them is used, so a short match from an early selector is
+// not discarded by a later selector that matches nothing. The whole body is
+// used only when no selector produced any text. The result is passed through
+// cleanText.
+func (s *WebScraper) extractContent(e *colly.HTMLElement) string {
+	// Minimum raw length for a selector to be accepted immediately.
+	const minContentLen = 200
+
+	// Try common content selectors
+	selectors := []string{
+		"article",
+		"main",
+		".content",
+		".documentation",
+		".docs",
+		".markdown-body",
+		"[role='main']",
+		"#content",
+		"#main",
+	}
+
+	var best string
+	for _, selector := range selectors {
+		candidate := e.ChildText(selector)
+		if len(candidate) > minContentLen {
+			best = candidate
+			break
+		}
+		// Remember the longest short candidate as a fallback.
+		if len(candidate) > len(best) {
+			best = candidate
+		}
+	}
+
+	// Fallback to body if no content found
+	if best == "" {
+		best = e.ChildText("body")
+	}
+
+	// Clean up content
+	return cleanText(best)
+}
+
+// extractHeadings returns the trimmed text of every h1-h6 element under e,
+// in the order ForEach visits them. Headings with no text are left out.
+func (s *WebScraper) extractHeadings(e *colly.HTMLElement) []string {
+	var found []string
+	collect := func(_ int, node *colly.HTMLElement) {
+		if trimmed := strings.TrimSpace(node.Text); trimmed != "" {
+			found = append(found, trimmed)
+		}
+	}
+	e.ForEach("h1, h2, h3, h4, h5, h6", collect)
+	return found
+}
+
+// extractLinks returns the distinct href values of the anchors under e, in
+// first-seen order. Empty hrefs and fragment-only links (starting with "#")
+// are skipped. Values are returned exactly as written in the page; they are
+// neither resolved against the page URL nor filtered by host.
+func (s *WebScraper) extractLinks(e *colly.HTMLElement) []string {
+	recorded := make(map[string]bool)
+	var hrefs []string
+	e.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) {
+		target := anchor.Attr("href")
+		switch {
+		case target == "", recorded[target], strings.HasPrefix(target, "#"):
+			return
+		}
+		hrefs = append(hrefs, target)
+		recorded[target] = true
+	})
+	return hrefs
+}
+
+// extractImages returns the non-empty src attribute of every img element
+// under e, in the order ForEach visits them. Duplicates are kept.
+func (s *WebScraper) extractImages(e *colly.HTMLElement) []string {
+	var sources []string
+	e.ForEach("img[src]", func(_ int, node *colly.HTMLElement) {
+		location := node.Attr("src")
+		if location == "" {
+			return
+		}
+		sources = append(sources, location)
+	})
+	return sources
+}
+
+// shouldInclude decides whether urlStr passes the include/exclude filters.
+//
+// Patterns are regular expressions matched against the full URL string.
+// Exclusion wins: any matching exclude pattern rejects the URL. Otherwise the
+// URL is accepted when include is empty or when at least one include pattern
+// matches. A pattern that fails to compile is treated as not matching.
+func (s *WebScraper) shouldInclude(urlStr string, include, exclude []string) bool {
+	// anyMatch reports whether at least one pattern in the list matches urlStr.
+	anyMatch := func(patterns []string) bool {
+		for _, expr := range patterns {
+			if hit, _ := regexp.MatchString(expr, urlStr); hit {
+				return true
+			}
+		}
+		return false
+	}
+
+	switch {
+	case anyMatch(exclude):
+		return false
+	case len(include) == 0:
+		return true
+	default:
+		return anyMatch(include)
+	}
+}
+
+// generateHash returns the SHA-256 digest of content as a lowercase hex string.
+func (s *WebScraper) generateHash(content string) string {
+	hasher := sha256.New()
+	hasher.Write([]byte(content)) // hash.Hash writes never return an error
+	return hex.EncodeToString(hasher.Sum(nil))
+}
+
+// whitespaceRun matches one or more consecutive whitespace characters. It is
+// compiled once at package scope because cleanText runs for every page.
+var whitespaceRun = regexp.MustCompile(`\s+`)
+
+// cleanText removes extra whitespace and normalizes text: every run of
+// whitespace is collapsed to a single space and leading and trailing space is
+// trimmed.
+func cleanText(text string) string {
+	collapsed := whitespaceRun.ReplaceAllString(text, " ")
+	return strings.TrimSpace(collapsed)
+}
+
+// generateDocID derives a document ID from a URL: the first 12 bytes of the
+// URL's SHA-256 digest, hex-encoded (24 characters). The same URL always
+// yields the same ID.
+func generateDocID(urlStr string) string {
+	digest := sha256.Sum256([]byte(urlStr))
+	encoded := hex.EncodeToString(digest[:])
+	// Two hex characters per byte: 24 characters cover the first 12 bytes.
+	return encoded[:24]
+}