mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
@@ -0,0 +1,98 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// wrappedScraper adds retry and normalization behavior for all scraper implementations.
|
||||
type wrappedScraper struct {
|
||||
inner Scraper
|
||||
}
|
||||
|
||||
func wrapScraper(inner Scraper) Scraper {
|
||||
if inner == nil {
|
||||
return nil
|
||||
}
|
||||
return &wrappedScraper{inner: inner}
|
||||
}
|
||||
|
||||
func (w *wrappedScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
retries := 2
|
||||
delay := 300 * time.Millisecond
|
||||
|
||||
var lastErr error
|
||||
for attempt := 0; attempt <= retries; attempt++ {
|
||||
docs, err := w.inner.Scrape(ctx, source)
|
||||
if err == nil {
|
||||
return NormalizeDocuments(docs), nil
|
||||
}
|
||||
lastErr = err
|
||||
|
||||
// One fallback: add trailing slash for doc sites when URL path looks page-like.
|
||||
if attempt == 0 && strings.HasPrefix(source.URL, "http") && !strings.HasSuffix(source.URL, "/") {
|
||||
alt := *source
|
||||
alt.URL = source.URL + "/"
|
||||
docs, altErr := w.inner.Scrape(ctx, &alt)
|
||||
if altErr == nil {
|
||||
return NormalizeDocuments(docs), nil
|
||||
}
|
||||
}
|
||||
|
||||
if attempt < retries && isRetriableScrapeError(err) {
|
||||
if !sleepWithContext(ctx, delay) {
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
|
||||
return nil, lastErr
|
||||
}
|
||||
|
||||
func (w *wrappedScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
return w.inner.DetectChanges(ctx, source, lastHash)
|
||||
}
|
||||
|
||||
// isRetriableScrapeError reports whether err should be retried.
//
// A nil error is never retriable. Otherwise err is retriable when its
// lower-cased message contains one of a fixed set of markers (timeouts,
// dropped connections, EOF, and HTTP 429/500/502/503/504), or when a
// net.Error is found anywhere in its chain.
func isRetriableScrapeError(err error) bool {
	if err == nil {
		return false
	}

	// Matching is done on the message text, case-insensitively.
	msg := strings.ToLower(err.Error())
	for _, marker := range []string{
		"timeout",
		"temporarily unavailable",
		"connection reset",
		"eof",
		"http 429",
		"http 500",
		"http 502",
		"http 503",
		"http 504",
	} {
		if strings.Contains(msg, marker) {
			return true
		}
	}

	// No textual match: fall back to the error's type.
	var netErr net.Error
	return errors.As(err, &netErr)
}
|
||||
|
||||
// sleepWithContext blocks until either d has elapsed or ctx is done. It
// returns true when the timer fired and false when ctx ended the wait. The
// timer is stopped on return in both cases.
func sleepWithContext(ctx context.Context, d time.Duration) bool {
	timer := time.NewTimer(d)
	defer timer.Stop()

	elapsed := false
	select {
	case <-timer.C:
		elapsed = true
	case <-ctx.Done():
		// Woken by the context; elapsed stays false.
	}
	return elapsed
}
|
||||
Reference in New Issue
Block a user