mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
update
This commit is contained in:
@@ -44,11 +44,13 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
|
||||
// Set rate limiting
|
||||
if s.config.RateLimit > 0 {
|
||||
c.Limit(&colly.LimitRule{
|
||||
if err := c.Limit(&colly.LimitRule{
|
||||
DomainGlob: "*",
|
||||
Parallelism: s.config.Concurrency,
|
||||
Delay: s.config.RateLimit,
|
||||
})
|
||||
}); err != nil {
|
||||
return nil, fmt.Errorf("failed to set rate limiting: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Set timeout
|
||||
@@ -136,7 +138,9 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
return
|
||||
}
|
||||
|
||||
c.Visit(absoluteURL)
|
||||
if err := c.Visit(absoluteURL); err != nil {
|
||||
fmt.Printf("Error visiting %s: %v\n", absoluteURL, err)
|
||||
}
|
||||
})
|
||||
|
||||
// Start scraping
|
||||
@@ -288,9 +292,3 @@ func cleanText(text string) string {
|
||||
|
||||
return text
|
||||
}
|
||||
|
||||
// generateDocID generates a unique ID for a document.
|
||||
func generateDocID(urlStr string) string {
|
||||
hash := sha256.Sum256([]byte(urlStr))
|
||||
return hex.EncodeToString(hash[:12])
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user