mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
i dont like commits
This commit is contained in:
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
+ "errors"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"path"
|
||||
@@ -32,7 +33,7 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
scheduled := make(map[string]bool)
|
||||
contentHashes := make(map[string]bool)
|
||||
var mu sync.Mutex
|
||||
- var scrapeErrors []string
|
||||
+ var scrapeErrors []error
|
||||
|
||||
// Parse base URL for domain restrictions
|
||||
baseURL, err := url.Parse(source.URL)
|
||||
@@ -102,7 +103,7 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
}
|
||||
mu.Lock()
|
||||
if len(scrapeErrors) < 20 {
|
||||
- scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err))
|
||||
+ scrapeErrors = append(scrapeErrors, fmt.Errorf("%s: %w", reqURL, err))
|
||||
}
|
||||
mu.Unlock()
|
||||
})
|
||||
@@ -236,7 +237,7 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
}
|
||||
mu.Lock()
|
||||
if len(scrapeErrors) < 20 {
|
||||
- scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err))
|
||||
+ scrapeErrors = append(scrapeErrors, fmt.Errorf("%s: %w", absoluteURL, err))
|
||||
}
|
||||
mu.Unlock()
|
||||
}
|
||||
@@ -256,7 +257,7 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
|
||||
if len(documents) == 0 {
|
||||
if len(scrapeErrors) > 0 {
|
||||
- return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; "))
|
||||
+ return nil, fmt.Errorf("web scrape failed: %w", errors.Join(scrapeErrors...))
|
||||
}
|
||||
return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user