mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
99 lines
2.1 KiB
Go
99 lines
2.1 KiB
Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"net"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// wrappedScraper adds retry and normalization behavior for all scraper implementations.
|
|
type wrappedScraper struct {
|
|
inner Scraper
|
|
}
|
|
|
|
func wrapScraper(inner Scraper) Scraper {
|
|
if inner == nil {
|
|
return nil
|
|
}
|
|
return &wrappedScraper{inner: inner}
|
|
}
|
|
|
|
func (w *wrappedScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
|
if source == nil {
|
|
return nil, fmt.Errorf("source is required")
|
|
}
|
|
|
|
retries := 2
|
|
delay := 300 * time.Millisecond
|
|
|
|
var lastErr error
|
|
for attempt := 0; attempt <= retries; attempt++ {
|
|
docs, err := w.inner.Scrape(ctx, source)
|
|
if err == nil {
|
|
return NormalizeDocuments(docs), nil
|
|
}
|
|
lastErr = err
|
|
|
|
// One fallback: add trailing slash for doc sites when URL path looks page-like.
|
|
if attempt == 0 && strings.HasPrefix(source.URL, "http") && !strings.HasSuffix(source.URL, "/") {
|
|
alt := *source
|
|
alt.URL = source.URL + "/"
|
|
docs, altErr := w.inner.Scrape(ctx, &alt)
|
|
if altErr == nil {
|
|
return NormalizeDocuments(docs), nil
|
|
}
|
|
}
|
|
|
|
if attempt < retries && isRetriableScrapeError(err) {
|
|
if !sleepWithContext(ctx, delay) {
|
|
return nil, ctx.Err()
|
|
}
|
|
continue
|
|
}
|
|
break
|
|
}
|
|
|
|
return nil, lastErr
|
|
}
|
|
|
|
func (w *wrappedScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
|
return w.inner.DetectChanges(ctx, source, lastHash)
|
|
}
|
|
|
|
// isRetriableScrapeError reports whether err is worth retrying.
//
// A nil error is never retriable. Otherwise the error is retriable when its
// lower-cased message contains one of a fixed set of markers (timeouts,
// connection resets, EOF, and HTTP 429/500/502/503/504), or when a net.Error
// is found anywhere in its wrap chain.
func isRetriableScrapeError(err error) bool {
	if err == nil {
		return false
	}

	// Substrings matched against the lower-cased error text.
	transientMarkers := [...]string{
		"timeout",
		"temporarily unavailable",
		"connection reset",
		"eof",
		"http 429",
		"http 500",
		"http 502",
		"http 503",
		"http 504",
	}

	msg := strings.ToLower(err.Error())
	for _, marker := range transientMarkers {
		if strings.Contains(msg, marker) {
			return true
		}
	}

	// No textual hint: fall back to checking for a network-level error.
	var netErr net.Error
	return errors.As(err, &netErr)
}
|
|
|
|
// sleepWithContext blocks until d has elapsed or ctx is done, whichever comes
// first. It returns true when the full duration elapsed and false when the
// context finished first. The timer is always stopped before returning.
func sleepWithContext(ctx context.Context, d time.Duration) bool {
	timer := time.NewTimer(d)
	defer timer.Stop()

	elapsed := false
	select {
	case <-timer.C:
		elapsed = true
	case <-ctx.Done():
		// Context ended first; leave elapsed as false.
	}
	return elapsed
}
|