update

2026-07-29 07:33:48 +00:00 · 2026-02-24 10:33:59 +01:00
parent 409acd2e08
commit 898a3c303f
1374 changed files with 290408 additions and 29186 deletions
@@ -6,8 +6,10 @@ import (
 	"encoding/hex"
 	"fmt"
 	"net/url"
+	"path"
 	"regexp"
 	"strings"
+	"sync"
 	"time"

 	"github.com/gocolly/colly/v2"
@@ -27,17 +29,42 @@ func NewWebScraper(config *Config) *WebScraper {
 func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
 	var documents []*Document
 	visited := make(map[string]bool)
+	scheduled := make(map[string]bool)
+	contentHashes := make(map[string]bool)
+	var mu sync.Mutex
+	var scrapeErrors []string

 	// Parse base URL for domain restrictions
 	baseURL, err := url.Parse(source.URL)
 	if err != nil {
 		return nil, fmt.Errorf("invalid URL: %w", err)
 	}
+	allowedDomain := baseURL.Hostname()
+	if allowedDomain == "" {
+		allowedDomain = baseURL.Host
+	}
+
+	maxDepth := s.config.MaxDepth
+	if maxDepth <= 0 {
+		maxDepth = 2
+	}
+	maxPages := s.config.Concurrency * 40
+	if maxPages < 20 {
+		maxPages = 20
+	}
+	if maxDepth <= 1 && maxPages > 30 {
+		maxPages = 30
+	}
+	if maxPages > 300 {
+		maxPages = 300
+	}
+	scopePrefix := pathScopePrefix(baseURL.Path)
+	scopeLeaf := pathScopeLeaf(baseURL.Path)

 	// Create Colly collector
 	c := colly.NewCollector(
-		colly.AllowedDomains(baseURL.Host),
-		colly.MaxDepth(s.config.MaxDepth),
+		colly.AllowedDomains(allowedDomain),
+		colly.MaxDepth(maxDepth),
 		colly.Async(true),
 		colly.UserAgent(s.config.UserAgent),
 	)
@@ -65,18 +92,40 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e

 	// Handle errors
 	c.OnError(func(r *colly.Response, err error) {
-		fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
+		errText := strings.ToLower(err.Error())
+		if strings.Contains(errText, "already visited") {
+			return
+		}
+		reqURL := source.URL
+		if r != nil && r.Request != nil && r.Request.URL != nil {
+			reqURL = r.Request.URL.String()
+		}
+		mu.Lock()
+		if len(scrapeErrors) < 20 {
+			scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err))
+		}
+		mu.Unlock()
 	})

 	// Extract content from pages
 	c.OnHTML("html", func(e *colly.HTMLElement) {
 		pageURL := e.Request.URL.String()
+		if !withinScope(e.Request.URL, baseURL, scopePrefix, scopeLeaf) {
+			return
+		}

 		// Skip if already visited
+		mu.Lock()
 		if visited[pageURL] {
+			mu.Unlock()
+			return
+		}
+		if len(visited) >= maxPages {
+			mu.Unlock()
 			return
 		}
 		visited[pageURL] = true
+		mu.Unlock()

 		// Check include/exclude patterns
 		if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
@@ -99,6 +148,13 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e

 		// Generate hash for change detection
 		hash := s.generateHash(content)
+		mu.Lock()
+		if contentHashes[hash] {
+			mu.Unlock()
+			return
+		}
+		contentHashes[hash] = true
+		mu.Unlock()

 		// Extract metadata
 		metadata := map[string]interface{}{
@@ -120,30 +176,74 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
 			Timestamp: time.Now(),
 		}

+		mu.Lock()
 		documents = append(documents, doc)
+		mu.Unlock()
 	})

 	// Follow links
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
 		link := e.Attr("href")
 		absoluteURL := e.Request.AbsoluteURL(link)
-
-		// Skip if already visited
-		if visited[absoluteURL] {
+		if absoluteURL == "" {
 			return
 		}

+		linkURL, err := url.Parse(absoluteURL)
+		if err != nil {
+			return
+		}
+		if linkURL.Scheme != "http" && linkURL.Scheme != "https" {
+			return
+		}
+		if !withinScope(linkURL, baseURL, scopePrefix, scopeLeaf) {
+			return
+		}
+
+		// Skip if already visited
+		mu.Lock()
+		if visited[absoluteURL] {
+			mu.Unlock()
+			return
+		}
+		if len(visited) >= maxPages {
+			mu.Unlock()
+			return
+		}
+		mu.Unlock()
+
 		// Check include/exclude patterns
 		if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
 			return
 		}

+		mu.Lock()
+		if scheduled[absoluteURL] {
+			mu.Unlock()
+			return
+		}
+		if len(scheduled) >= maxPages {
+			mu.Unlock()
+			return
+		}
+		scheduled[absoluteURL] = true
+		mu.Unlock()
+
 		if err := c.Visit(absoluteURL); err != nil {
-			fmt.Printf("Error visiting %s: %v\n", absoluteURL, err)
+			errText := strings.ToLower(err.Error())
+			if strings.Contains(errText, "already visited") {
+				return
+			}
+			mu.Lock()
+			if len(scrapeErrors) < 20 {
+				scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err))
+			}
+			mu.Unlock()
 		}
 	})

 	// Start scraping
+	scheduled[source.URL] = true
 	if err := c.Visit(source.URL); err != nil {
 		return nil, fmt.Errorf("failed to start scraping: %w", err)
 	}
@@ -151,6 +251,16 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
 	// Wait for async scraping to complete
 	c.Wait()

+	mu.Lock()
+	defer mu.Unlock()
+
+	if len(documents) == 0 {
+		if len(scrapeErrors) > 0 {
+			return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; "))
+		}
+		return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
+	}
+
 	return documents, nil
 }

@@ -283,6 +393,25 @@ func (s *WebScraper) generateHash(content string) string {

 // cleanText removes extra whitespace and normalizes text.
 func cleanText(text string) string {
+	noisePhrases := []string{
+		"table of contents",
+		"in this article",
+		"additional resources",
+		"feedback",
+		"collaborate with us on github",
+		"copyright",
+		"all rights reserved",
+		"privacy policy",
+		"terms of service",
+		"sign in",
+		"skip to main content",
+		"ask learn",
+	}
+	for _, phrase := range noisePhrases {
+		re := regexp.MustCompile(`(?i)` + regexp.QuoteMeta(phrase))
+		text = re.ReplaceAllString(text, " ")
+	}
+
 	// Replace multiple whitespace with single space
 	re := regexp.MustCompile(`\s+`)
 	text = re.ReplaceAllString(text, " ")
@@ -292,3 +421,58 @@ func cleanText(text string) string {

 	return text
 }
+
+// pathScopePrefix derives the URL path prefix that bounds a crawl started at
+// rawPath. The path is normalized with path.Clean first, so trailing slashes
+// and "." / ".." segments do not affect the result. It returns:
+//   - "" when the cleaned path is "." or "/" (withinScope treats an empty
+//     prefix as "no path restriction");
+//   - the cleaned path itself when it sits directly under the root, so a
+//     root-level page or section stays scoped to its own path;
+//   - the parent directory of the cleaned path otherwise.
+//
+// Whether the last segment looks like a file name (contains ".") does not
+// change the prefix; pathScopeLeaf reports that separately.
+func pathScopePrefix(rawPath string) string {
+	clean := path.Clean(rawPath)
+	if clean == "." || clean == "/" {
+		return ""
+	}
+	if dir := path.Dir(clean); dir != "/" {
+		return dir
+	}
+	// Directly under the root: path.Dir would widen the scope to the whole site.
+	return clean
+}
+
+// pathScopeLeaf returns the last segment of rawPath, after path.Clean, when
+// that segment contains a "." (for example "index.html"). It returns "" for
+// cleaned paths of ".", "/" or "" and for a last segment without a ".".
+func pathScopeLeaf(rawPath string) string {
+	cleaned := path.Clean(rawPath)
+	rootLike := cleaned == "." || cleaned == "/" || cleaned == ""
+	if base := path.Base(cleaned); !rootLike && strings.Contains(base, ".") {
+		return base
+	}
+	return ""
+}
+
+// withinScope reports whether target may be crawled. Its host must equal
+// base's (case-insensitively) and, unless prefix is "", its path must equal
+// prefix, lie below it on a "/" boundary (so "/docs" does not admit
+// "/docs-old"), or end in a non-empty leaf. A nil URL is never in scope.
+func withinScope(target, base *url.URL, prefix, leaf string) bool {
+	if target == nil || base == nil || !strings.EqualFold(target.Hostname(), base.Hostname()) {
+		return false
+	}
+	if prefix == "" {
+		return true
+	}
+	targetPath := target.Path
+	if targetPath == "" {
+		targetPath = "/" // treat an empty URL path as the root
+	}
+	below := strings.TrimSuffix(prefix, "/") + "/"
+	return targetPath == prefix || strings.HasPrefix(targetPath, below) ||
+		(leaf != "" && path.Base(targetPath) == leaf)
+}