This commit is contained in:
Tomas Dvorak
2026-02-24 10:33:59 +01:00
parent 409acd2e08
commit 898a3c303f
1374 changed files with 290409 additions and 29187 deletions
+191 -7
View File
@@ -6,8 +6,10 @@ import (
"encoding/hex"
"fmt"
"net/url"
"path"
"regexp"
"strings"
"sync"
"time"
"github.com/gocolly/colly/v2"
@@ -27,17 +29,42 @@ func NewWebScraper(config *Config) *WebScraper {
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
var documents []*Document
visited := make(map[string]bool)
scheduled := make(map[string]bool)
contentHashes := make(map[string]bool)
var mu sync.Mutex
var scrapeErrors []string
// Parse base URL for domain restrictions
baseURL, err := url.Parse(source.URL)
if err != nil {
return nil, fmt.Errorf("invalid URL: %w", err)
}
allowedDomain := baseURL.Hostname()
if allowedDomain == "" {
allowedDomain = baseURL.Host
}
maxDepth := s.config.MaxDepth
if maxDepth <= 0 {
maxDepth = 2
}
maxPages := s.config.Concurrency * 40
if maxPages < 20 {
maxPages = 20
}
if maxDepth <= 1 && maxPages > 30 {
maxPages = 30
}
if maxPages > 300 {
maxPages = 300
}
scopePrefix := pathScopePrefix(baseURL.Path)
scopeLeaf := pathScopeLeaf(baseURL.Path)
// Create Colly collector
c := colly.NewCollector(
colly.AllowedDomains(baseURL.Host),
colly.MaxDepth(s.config.MaxDepth),
colly.AllowedDomains(allowedDomain),
colly.MaxDepth(maxDepth),
colly.Async(true),
colly.UserAgent(s.config.UserAgent),
)
@@ -65,18 +92,40 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
// Handle errors
c.OnError(func(r *colly.Response, err error) {
fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
errText := strings.ToLower(err.Error())
if strings.Contains(errText, "already visited") {
return
}
reqURL := source.URL
if r != nil && r.Request != nil && r.Request.URL != nil {
reqURL = r.Request.URL.String()
}
mu.Lock()
if len(scrapeErrors) < 20 {
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err))
}
mu.Unlock()
})
// Extract content from pages
c.OnHTML("html", func(e *colly.HTMLElement) {
pageURL := e.Request.URL.String()
if !withinScope(e.Request.URL, baseURL, scopePrefix, scopeLeaf) {
return
}
// Skip if already visited
mu.Lock()
if visited[pageURL] {
mu.Unlock()
return
}
if len(visited) >= maxPages {
mu.Unlock()
return
}
visited[pageURL] = true
mu.Unlock()
// Check include/exclude patterns
if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
@@ -99,6 +148,13 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
// Generate hash for change detection
hash := s.generateHash(content)
mu.Lock()
if contentHashes[hash] {
mu.Unlock()
return
}
contentHashes[hash] = true
mu.Unlock()
// Extract metadata
metadata := map[string]interface{}{
@@ -120,30 +176,74 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
Timestamp: time.Now(),
}
mu.Lock()
documents = append(documents, doc)
mu.Unlock()
})
// Follow links
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
link := e.Attr("href")
absoluteURL := e.Request.AbsoluteURL(link)
// Skip if already visited
if visited[absoluteURL] {
if absoluteURL == "" {
return
}
linkURL, err := url.Parse(absoluteURL)
if err != nil {
return
}
if linkURL.Scheme != "http" && linkURL.Scheme != "https" {
return
}
if !withinScope(linkURL, baseURL, scopePrefix, scopeLeaf) {
return
}
// Skip if already visited
mu.Lock()
if visited[absoluteURL] {
mu.Unlock()
return
}
if len(visited) >= maxPages {
mu.Unlock()
return
}
mu.Unlock()
// Check include/exclude patterns
if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
return
}
mu.Lock()
if scheduled[absoluteURL] {
mu.Unlock()
return
}
if len(scheduled) >= maxPages {
mu.Unlock()
return
}
scheduled[absoluteURL] = true
mu.Unlock()
if err := c.Visit(absoluteURL); err != nil {
fmt.Printf("Error visiting %s: %v\n", absoluteURL, err)
errText := strings.ToLower(err.Error())
if strings.Contains(errText, "already visited") {
return
}
mu.Lock()
if len(scrapeErrors) < 20 {
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err))
}
mu.Unlock()
}
})
// Start scraping
scheduled[source.URL] = true
if err := c.Visit(source.URL); err != nil {
return nil, fmt.Errorf("failed to start scraping: %w", err)
}
@@ -151,6 +251,16 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
// Wait for async scraping to complete
c.Wait()
mu.Lock()
defer mu.Unlock()
if len(documents) == 0 {
if len(scrapeErrors) > 0 {
return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; "))
}
return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
}
return documents, nil
}
@@ -283,6 +393,25 @@ func (s *WebScraper) generateHash(content string) string {
// cleanText removes extra whitespace and normalizes text.
func cleanText(text string) string {
noisePhrases := []string{
"table of contents",
"in this article",
"additional resources",
"feedback",
"collaborate with us on github",
"copyright",
"all rights reserved",
"privacy policy",
"terms of service",
"sign in",
"skip to main content",
"ask learn",
}
for _, phrase := range noisePhrases {
re := regexp.MustCompile(`(?i)` + regexp.QuoteMeta(phrase))
text = re.ReplaceAllString(text, " ")
}
// Replace multiple whitespace with single space
re := regexp.MustCompile(`\s+`)
text = re.ReplaceAllString(text, " ")
@@ -292,3 +421,58 @@ func cleanText(text string) string {
return text
}
// pathScopePrefix derives the path prefix that bounds a crawl from the path of
// the start URL.
//
// rawPath is normalized with path.Clean first. An empty or root path yields
// "", which withinScope treats as "no path restriction". Otherwise the result
// is the parent directory of the cleaned path, except when that parent is the
// root: then the cleaned path itself is returned, so a top-level page or
// section keeps the crawl scoped to that path instead of the whole site.
//
// Whether the last element looks like a document (contains a ".") makes no
// difference to the prefix; that distinction is only made by pathScopeLeaf.
func pathScopePrefix(rawPath string) string {
	// path.Clean never returns "", it returns "." for empty input.
	clean := path.Clean(rawPath)
	if clean == "." || clean == "/" {
		return ""
	}
	if dir := path.Dir(clean); dir != "/" {
		return dir
	}
	// The parent is the site root: scope to this path rather than to "/".
	return clean
}
// pathScopeLeaf returns the final element of rawPath when that element looks
// like a document name, i.e. it contains a ".". The path is normalized with
// path.Clean before inspection. An empty or root path, or a final element
// without a ".", yields "".
func pathScopeLeaf(rawPath string) string {
	switch cleaned := path.Clean(rawPath); cleaned {
	case ".", "/", "":
		// Nothing after the root to inspect.
		return ""
	default:
		if name := path.Base(cleaned); strings.Contains(name, ".") {
			return name
		}
		return ""
	}
}
// withinScope reports whether target may be crawled for a scrape rooted at
// base.
//
// A nil target or base is never in scope. Otherwise the target's hostname must
// equal base's hostname (compared case-insensitively, ports ignored), and one
// of the following must hold:
//   - prefix is empty, so the whole host is allowed;
//   - the target path is prefix itself or lies beneath it on a path-segment
//     boundary ("/docs" admits "/docs" and "/docs/api", but not
//     "/docs-archive" or "/documentation");
//   - leaf is non-empty and equals the last element of the target path.
func withinScope(target, base *url.URL, prefix, leaf string) bool {
	if target == nil || base == nil {
		return false
	}
	if !strings.EqualFold(target.Hostname(), base.Hostname()) {
		return false
	}
	if prefix == "" {
		return true
	}

	targetPath := target.Path
	if targetPath == "" {
		// A URL without a path refers to the site root.
		targetPath = "/"
	}

	// Compare whole path segments: a bare strings.HasPrefix would let sibling
	// paths that merely share leading characters leak into the crawl scope.
	// TrimSuffix keeps a caller-supplied trailing slash from doubling up.
	subtree := strings.TrimSuffix(prefix, "/") + "/"
	if targetPath == prefix || strings.HasPrefix(targetPath, subtree) {
		return true
	}
	return leaf != "" && path.Base(targetPath) == leaf
}