Files
Devour/internal/scraper/wrapper.go
T
Tomas Dvorak 898a3c303f update
2026-02-24 10:33:59 +01:00

99 lines
2.1 KiB
Go

package scraper
import (
"context"
"errors"
"fmt"
"net"
"strings"
"time"
)
// wrappedScraper decorates another Scraper: its Scrape method adds retry and
// document-normalization behavior on top of the wrapped implementation, while
// the actual work is always delegated to inner.
type wrappedScraper struct {
// inner is the underlying scraper that every call is forwarded to.
inner Scraper
}
// wrapScraper returns inner decorated with the wrappedScraper behavior.
// A nil inner is passed through as a nil Scraper instead of being wrapped,
// so callers can still compare the result against nil.
func wrapScraper(inner Scraper) Scraper {
	if inner != nil {
		return &wrappedScraper{inner: inner}
	}
	return nil
}
// Scrape runs the wrapped scraper against source and returns its documents
// passed through NormalizeDocuments.
//
// The primary URL is tried up to three times in total (one initial call plus
// two retries) with a fixed 300ms pause between attempts. A retry only happens
// when isRetriableScrapeError accepts the error. After the first failed
// attempt, a single fallback request is made with a trailing slash added to
// the URL path, if that variant exists and ctx is still live.
//
// Returns an error when source is nil, ctx.Err() when ctx ends during a retry
// pause, and otherwise the last error reported for the primary URL. The
// fallback's own error is intentionally discarded.
func (w *wrappedScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}

	const (
		retries = 2
		delay   = 300 * time.Millisecond
	)

	var lastErr error
	for attempt := 0; attempt <= retries; attempt++ {
		docs, err := w.inner.Scrape(ctx, source)
		if err == nil {
			return NormalizeDocuments(docs), nil
		}
		lastErr = err

		// One fallback, first attempt only: some doc sites only serve a page
		// under its slash-terminated path. Skip it once ctx is done so no
		// further work is started after cancellation.
		if attempt == 0 && ctx.Err() == nil {
			if altURL, ok := trailingSlashFallbackURL(source.URL); ok {
				alt := *source // shallow copy; the caller's Source is never mutated
				alt.URL = altURL
				if altDocs, altErr := w.inner.Scrape(ctx, &alt); altErr == nil {
					return NormalizeDocuments(altDocs), nil
				}
			}
		}

		if attempt == retries || !isRetriableScrapeError(err) {
			break
		}
		if !sleepWithContext(ctx, delay) {
			return nil, ctx.Err()
		}
	}
	return nil, lastErr
}

// trailingSlashFallbackURL returns rawURL with "/" appended to its path
// portion, keeping any query string or fragment in place. ok is false when
// rawURL does not start with "http" or its path portion already ends in "/".
func trailingSlashFallbackURL(rawURL string) (altURL string, ok bool) {
	if !strings.HasPrefix(rawURL, "http") {
		return "", false
	}
	// Everything from the first '?' or '#' onward is query/fragment and must
	// stay after the inserted slash.
	path, suffix := rawURL, ""
	if i := strings.IndexAny(rawURL, "?#"); i >= 0 {
		path, suffix = rawURL[:i], rawURL[i:]
	}
	if strings.HasSuffix(path, "/") {
		return "", false
	}
	return path + "/" + suffix, true
}
// DetectChanges forwards the call to the wrapped scraper and returns its
// results untouched; no retry or normalization is applied here.
func (w *wrappedScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	changed, hash, err := w.inner.DetectChanges(ctx, source, lastHash)
	return changed, hash, err
}
// isRetriableScrapeError reports whether err is worth retrying.
//
// An error qualifies when its lower-cased message contains one of a fixed set
// of substrings (timeouts, resets, EOF, HTTP 429/500/502/503/504), or when any
// error in its chain satisfies net.Error. A nil error is never retriable.
func isRetriableScrapeError(err error) bool {
	if err == nil {
		return false
	}

	// Substrings are compared against the lower-cased message, so they must
	// themselves be lower case.
	markers := [...]string{
		"timeout",
		"temporarily unavailable",
		"connection reset",
		"eof",
		"http 429",
		"http 500",
		"http 502",
		"http 503",
		"http 504",
	}

	msg := strings.ToLower(err.Error())
	for _, marker := range markers {
		if strings.Contains(msg, marker) {
			return true
		}
	}

	// No textual match: fall back to checking the error chain for net.Error.
	var asNet net.Error
	return errors.As(err, &asNet)
}
// sleepWithContext blocks until d has elapsed or ctx is done, whichever comes
// first. It returns true when the full duration elapsed and false when the
// wait was cut short by ctx.
func sleepWithContext(ctx context.Context, d time.Duration) bool {
	timer := time.NewTimer(d)
	// Release the timer on the early-exit path as well.
	defer timer.Stop()

	done := ctx.Done()
	select {
	case <-timer.C:
		return true
	case <-done:
		return false
	}
}