package scraper

import (
	"context"
	"errors"
	"fmt"
	"net"
	"strings"
	"time"
)

// wrappedScraper decorates another Scraper: Scrape gains retries, a one-time
// trailing-slash fallback, and a NormalizeDocuments pass over the results.
type wrappedScraper struct {
	inner Scraper
}

// wrapScraper returns inner wrapped in a wrappedScraper. A nil inner yields a
// nil Scraper (an untyped nil, so a plain `== nil` check by the caller works).
func wrapScraper(inner Scraper) Scraper {
	if inner == nil {
		return nil
	}
	return &wrappedScraper{inner: inner}
}

// Scrape calls the inner scraper for source, making up to three attempts in
// total (one initial attempt plus two retries) with a fixed 300ms pause
// between attempts. Only errors accepted by isRetriableScrapeError are
// retried; any other error ends the loop immediately.
//
// After the first failed attempt, and only then, a single fallback request is
// made with a trailing slash added to the URL path (see trailingSlashURL).
// The fallback's own error is intentionally discarded: if it fails, the error
// from the regular attempt is the one that is reported.
//
// Successful results are passed through NormalizeDocuments before being
// returned. A nil source is rejected with an error. If ctx is done while
// waiting between attempts, ctx.Err() is returned; otherwise the error from
// the last regular attempt is returned.
func (w *wrappedScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}

	const (
		retries = 2
		delay   = 300 * time.Millisecond
	)

	var lastErr error
	for attempt := 0; attempt <= retries; attempt++ {
		docs, err := w.inner.Scrape(ctx, source)
		if err == nil {
			return NormalizeDocuments(docs), nil
		}
		lastErr = err

		// One fallback on the first failure only. Skip it when the context is
		// already done: the extra request could not succeed and would only
		// delay returning to the caller.
		if attempt == 0 && ctx.Err() == nil {
			if altURL, ok := trailingSlashURL(source.URL); ok {
				alt := *source // shallow copy so the caller's Source is not modified
				alt.URL = altURL
				if altDocs, altErr := w.inner.Scrape(ctx, &alt); altErr == nil {
					return NormalizeDocuments(altDocs), nil
				}
			}
		}

		if attempt == retries || !isRetriableScrapeError(err) {
			break
		}
		if !sleepWithContext(ctx, delay) {
			return nil, ctx.Err()
		}
	}
	return nil, lastErr
}

// trailingSlashURL returns rawURL with a "/" appended to the part that
// precedes any query string or fragment, and true. It returns "", false when
// rawURL does not start with "http" or when that leading part already ends in
// "/".
//
// The slash is inserted before the first '?' or '#' rather than appended to
// the whole string, so "https://h/docs?page=2" becomes
// "https://h/docs/?page=2" instead of the corrupted "https://h/docs?page=2/".
func trailingSlashURL(rawURL string) (string, bool) {
	if !strings.HasPrefix(rawURL, "http") {
		return "", false
	}
	base, rest := rawURL, ""
	if i := strings.IndexAny(rawURL, "?#"); i >= 0 {
		base, rest = rawURL[:i], rawURL[i:]
	}
	if strings.HasSuffix(base, "/") {
		return "", false
	}
	return base + "/" + rest, true
}

// DetectChanges forwards to the inner scraper unchanged; no retry or
// fallback logic is applied here.
func (w *wrappedScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	return w.inner.DetectChanges(ctx, source, lastHash)
}

// isRetriableScrapeError reports whether err looks transient. It returns true
// when the lower-cased error message contains one of a fixed set of markers
// (timeouts, connection resets, EOF, HTTP 429/500/502/503/504), or when any
// error in err's chain implements net.Error. A nil err is never retriable.
//
// The message check is a plain substring match, so it depends on the wording
// used by whatever produced the error.
func isRetriableScrapeError(err error) bool {
	if err == nil {
		return false
	}

	msg := strings.ToLower(err.Error())
	markers := [...]string{
		"timeout",
		"temporarily unavailable",
		"connection reset",
		"eof",
		"http 429",
		"http 500",
		"http 502",
		"http 503",
		"http 504",
	}
	for _, marker := range markers {
		if strings.Contains(msg, marker) {
			return true
		}
	}

	var netErr net.Error
	return errors.As(err, &netErr)
}

// sleepWithContext blocks for d or until ctx is done, whichever happens
// first. It returns true if the full duration elapsed and false if ctx was
// done first. The timer is always stopped before returning.
func sleepWithContext(ctx context.Context, d time.Duration) bool {
	timer := time.NewTimer(d)
	defer timer.Stop()

	select {
	case <-ctx.Done():
		return false
	case <-timer.C:
		return true
	}
}