This commit is contained in:
Tomas Dvorak
2026-02-22 15:41:27 +01:00
parent 0b88627e54
commit 409acd2e08
84 changed files with 65382 additions and 27475 deletions
+7 -9
View File
@@ -44,11 +44,13 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
// Set rate limiting
if s.config.RateLimit > 0 {
c.Limit(&colly.LimitRule{
if err := c.Limit(&colly.LimitRule{
DomainGlob: "*",
Parallelism: s.config.Concurrency,
Delay: s.config.RateLimit,
})
}); err != nil {
return nil, fmt.Errorf("failed to set rate limiting: %w", err)
}
}
// Set timeout
@@ -136,7 +138,9 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
return
}
c.Visit(absoluteURL)
if err := c.Visit(absoluteURL); err != nil {
fmt.Printf("Error visiting %s: %v\n", absoluteURL, err)
}
})
// Start scraping
@@ -288,9 +292,3 @@ func cleanText(text string) string {
return text
}
// generateDocID derives a deterministic document identifier from urlStr.
//
// The identifier is the lowercase hexadecimal encoding of the first 12 bytes
// of the SHA-256 digest of urlStr, so it is always 24 characters long and the
// same input always yields the same ID.
func generateDocID(urlStr string) string {
	// Number of leading digest bytes kept; each byte becomes two hex characters.
	const idBytes = 12

	digest := sha256.Sum256([]byte(urlStr))
	prefix := digest[:idBytes]
	return hex.EncodeToString(prefix)
}