I don't like commits

This commit is contained in:
Tomas Dvorak
2026-02-24 12:10:13 +01:00
parent 898a3c303f
commit 1d72a1cc01
109 changed files with 43586 additions and 8484 deletions
+1 -23
View File
@@ -5,7 +5,6 @@ import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"strings"
"time"
@@ -70,28 +69,7 @@ func (s *AstroDocsScraper) DetectChanges(ctx context.Context, source *Source, la
}
// fetchPage retrieves the page at url and returns its body as a string.
//
// The request is performed by the shared fetchExternalPage helper using this
// scraper's HTTP client and configured User-Agent, so every docs scraper
// handles requests and HTTP errors the same way.
func (s *AstroDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}
func (s *AstroDocsScraper) generateHash(content string) string {
+1 -23
View File
@@ -5,7 +5,6 @@ import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"strings"
"time"
@@ -75,28 +74,7 @@ func (s *CloudflareDocsScraper) DetectChanges(ctx context.Context, source *Sourc
}
// fetchPage retrieves the page at url and returns its body as a string.
//
// The request is performed by the shared fetchExternalPage helper using this
// scraper's HTTP client and configured User-Agent, so every docs scraper
// handles requests and HTTP errors the same way.
func (s *CloudflareDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}
func (s *CloudflareDocsScraper) generateHash(content string) string {
+1 -23
View File
@@ -5,7 +5,6 @@ import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"strings"
"time"
@@ -70,28 +69,7 @@ func (s *DockerDocsScraper) DetectChanges(ctx context.Context, source *Source, l
}
// fetchPage retrieves the page at url and returns its body as a string.
//
// The request is performed by the shared fetchExternalPage helper using this
// scraper's HTTP client and configured User-Agent, so every docs scraper
// handles requests and HTTP errors the same way.
func (s *DockerDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}
func (s *DockerDocsScraper) generateHash(content string) string {
+1 -23
View File
@@ -6,7 +6,6 @@ import (
"encoding/hex"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"
@@ -91,28 +90,7 @@ func (s *GoDocsScraper) DetectChanges(ctx context.Context, source *Source, lastH
}
// fetchPage retrieves the page at url and returns its body as a string.
//
// The request is performed by the shared fetchExternalPage helper using this
// scraper's HTTP client and configured User-Agent, so every docs scraper
// handles requests and HTTP errors the same way.
func (s *GoDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}
func (s *GoDocsScraper) generateHash(content string) string {
+1 -23
View File
@@ -5,7 +5,6 @@ import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"strings"
"time"
@@ -85,28 +84,7 @@ func (s *JavaDocsScraper) DetectChanges(ctx context.Context, source *Source, las
}
// fetchPage retrieves the page at url and returns its body as a string.
//
// The request is performed by the shared fetchExternalPage helper using this
// scraper's HTTP client and configured User-Agent, so every docs scraper
// handles requests and HTTP errors the same way.
func (s *JavaDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}
func (s *JavaDocsScraper) generateHash(content string) string {
+1 -23
View File
@@ -5,7 +5,6 @@ import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"strings"
"time"
@@ -80,28 +79,7 @@ func (s *MCPDocsScraper) DetectChanges(ctx context.Context, source *Source, last
}
// fetchPage retrieves the page at url and returns its body as a string.
//
// The request is performed by the shared fetchExternalPage helper using this
// scraper's HTTP client and configured User-Agent, so every docs scraper
// handles requests and HTTP errors the same way.
func (s *MCPDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}
func (s *MCPDocsScraper) generateHash(content string) string {
+1 -23
View File
@@ -5,7 +5,6 @@ import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"strings"
"time"
@@ -90,28 +89,7 @@ func (s *NuxtDocsScraper) DetectChanges(ctx context.Context, source *Source, las
}
// fetchPage retrieves the page at url and returns its body as a string.
//
// The request is performed by the shared fetchExternalPage helper using this
// scraper's HTTP client and configured User-Agent, so every docs scraper
// handles requests and HTTP errors the same way.
func (s *NuxtDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}
func (s *NuxtDocsScraper) generateHash(content string) string {
+1 -23
View File
@@ -5,7 +5,6 @@ import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"strings"
"time"
@@ -100,28 +99,7 @@ func (s *PythonDocsScraper) DetectChanges(ctx context.Context, source *Source, l
}
// fetchPage retrieves the page at url and returns its body as a string.
//
// The request is performed by the shared fetchExternalPage helper using this
// scraper's HTTP client and configured User-Agent, so every docs scraper
// handles requests and HTTP errors the same way.
func (s *PythonDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}
func (s *PythonDocsScraper) generateHash(content string) string {
+1 -23
View File
@@ -5,7 +5,6 @@ import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"strings"
"time"
@@ -80,28 +79,7 @@ func (s *ReactDocsScraper) DetectChanges(ctx context.Context, source *Source, la
}
// fetchPage retrieves the page at url and returns its body as a string.
//
// The request is performed by the shared fetchExternalPage helper using this
// scraper's HTTP client and configured User-Agent, so every docs scraper
// handles requests and HTTP errors the same way.
func (s *ReactDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}
func (s *ReactDocsScraper) generateHash(content string) string {
+1 -23
View File
@@ -6,7 +6,6 @@ import (
"encoding/hex"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"
@@ -106,28 +105,7 @@ func (s *RustDocsScraper) DetectChanges(ctx context.Context, source *Source, las
}
// fetchPage retrieves the page at url and returns its body as a string.
//
// The request is performed by the shared fetchExternalPage helper using this
// scraper's HTTP client and configured User-Agent, so every docs scraper
// handles requests and HTTP errors the same way.
func (s *RustDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}
func (s *RustDocsScraper) generateHash(content string) string {
+1 -23
View File
@@ -5,7 +5,6 @@ import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"strings"
"time"
@@ -80,28 +79,7 @@ func (s *SpringDocsScraper) DetectChanges(ctx context.Context, source *Source, l
}
// fetchPage retrieves the page at url and returns its body as a string.
//
// The request is performed by the shared fetchExternalPage helper using this
// scraper's HTTP client and configured User-Agent, so every docs scraper
// handles requests and HTTP errors the same way.
func (s *SpringDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}
func (s *SpringDocsScraper) generateHash(content string) string {
+1 -23
View File
@@ -5,7 +5,6 @@ import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"strings"
"time"
@@ -85,28 +84,7 @@ func (s *TSDocsScraper) DetectChanges(ctx context.Context, source *Source, lastH
}
// fetchPage retrieves the page at url and returns its body as a string.
//
// The request is performed by the shared fetchExternalPage helper using this
// scraper's HTTP client and configured User-Agent, so every docs scraper
// handles requests and HTTP errors the same way.
func (s *TSDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}
func (s *TSDocsScraper) generateHash(content string) string {
+65
View File
@@ -1,8 +1,14 @@
package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"net/url"
"strings"
basescraper "github.com/yourorg/devour/internal/scraper"
)
@@ -19,3 +25,62 @@ func generateDocID(urlStr string) string {
hash := sha256.Sum256([]byte(urlStr))
return hex.EncodeToString(hash[:12])
}
func fetchExternalPage(ctx context.Context, client *http.Client, userAgent, targetURL string) (string, error) {
req, err := http.NewRequestWithContext(ctx, "GET", targetURL, nil)
if err != nil {
return "", err
}
req.Header.Set("User-Agent", userAgent)
resp, err := client.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
snippet, readErr := readErrorSnippet(resp.Body)
if readErr != nil {
return "", fmt.Errorf("GET %s returned HTTP %d and body read failed: %w", summarizeURL(targetURL), resp.StatusCode, readErr)
}
return "", fmt.Errorf("GET %s returned HTTP %d: %s", summarizeURL(targetURL), resp.StatusCode, snippet)
}
body, err := io.ReadAll(resp.Body)
if err != nil {
return "", err
}
return string(body), nil
}
// readErrorSnippet reads at most 512 bytes from body and returns them as a
// whitespace-trimmed string suitable for embedding in an error message.
//
// It returns "<empty body>" when nothing but whitespace was read, and the
// read error (with an empty string) when reading fails.
func readErrorSnippet(body io.Reader) (string, error) {
	const maxErrorBodyBytes = 512

	data, err := io.ReadAll(io.LimitReader(body, maxErrorBodyBytes))
	if err != nil {
		return "", err
	}

	// The byte cap can cut a multi-byte rune in half; replace any invalid
	// sequence with U+FFFD so the message is always valid UTF-8.
	msg := strings.TrimSpace(strings.ToValidUTF8(string(data), "\uFFFD"))
	if msg == "" {
		return "<empty body>", nil
	}
	return msg, nil
}
// summarizeURL shortens rawURL to "host/path" for use in messages.
//
// Scheme, user info, query string and fragment are left out, and an empty
// path is shown as "/". Input that does not parse, or that has no host, is
// returned unchanged.
func summarizeURL(rawURL string) string {
	u, parseErr := url.Parse(rawURL)
	if parseErr != nil {
		return rawURL
	}
	if u.Host == "" {
		return rawURL
	}

	if escaped := u.EscapedPath(); escaped != "" {
		return u.Host + escaped
	}
	return u.Host + "/"
}
+1 -23
View File
@@ -5,7 +5,6 @@ import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"strings"
"time"
@@ -85,28 +84,7 @@ func (s *VueDocsScraper) DetectChanges(ctx context.Context, source *Source, last
}
// fetchPage retrieves the page at url and returns its body as a string.
//
// The request is performed by the shared fetchExternalPage helper using this
// scraper's HTTP client and configured User-Agent, so every docs scraper
// handles requests and HTTP errors the same way.
func (s *VueDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}
func (s *VueDocsScraper) generateHash(content string) string {
+7 -7
View File
@@ -28,12 +28,12 @@ func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document
repoURL, repoName, err := s.resolveRepo(source)
if err != nil {
return nil, err
return nil, fmt.Errorf("resolve github repository: %w", err)
}
tmpDir, err := os.MkdirTemp("", "devour-github-*")
if err != nil {
return nil, err
return nil, fmt.Errorf("create temporary clone directory: %w", err)
}
defer os.RemoveAll(tmpDir)
@@ -81,14 +81,14 @@ func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document
local := NewLocalScraper(s.config)
docs, err := local.Scrape(ctx, localSource)
if err != nil {
return nil, err
return nil, fmt.Errorf("scrape repository docs: %w", err)
}
if len(docs) == 0 && len(source.Include) == 0 {
// Sparse patterns did not match this repository layout; retry full checkout.
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
docs, err = local.Scrape(ctx, localSource)
if err != nil {
return nil, err
return nil, fmt.Errorf("scrape repository docs after sparse fallback: %w", err)
}
}
@@ -129,7 +129,7 @@ func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastH
}
_, repoName, err := s.resolveRepo(source)
if err != nil {
return false, "", err
return false, "", fmt.Errorf("resolve github repository: %w", err)
}
remote := "https://github.com/" + strings.TrimSuffix(repoName, ".git") + ".git"
@@ -141,7 +141,7 @@ func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastH
cmd := exec.CommandContext(ctx, "git", "ls-remote", remote, branch)
output, err := cmd.Output()
if err != nil {
return false, "", err
return false, "", fmt.Errorf("run git ls-remote for %s (%s): %w", remote, branch, err)
}
line := strings.TrimSpace(string(output))
if line == "" {
@@ -169,7 +169,7 @@ func (s *GitHubScraper) resolveRepo(source *Source) (repoURL string, repoName st
u, err := url.Parse(raw)
if err != nil {
return "", "", err
return "", "", fmt.Errorf("parse github url %q: %w", raw, err)
}
if !strings.Contains(strings.ToLower(u.Host), "github.com") {
return "", "", fmt.Errorf("not a github url: %s", raw)
+19 -7
View File
@@ -4,8 +4,10 @@ import (
"context"
"crypto/sha256"
"encoding/hex"
"errors"
"fmt"
"io/fs"
"log"
"os"
"path/filepath"
"regexp"
@@ -44,14 +46,15 @@ func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document,
info, err := os.Stat(root)
if err != nil {
return nil, err
return nil, fmt.Errorf("stat local source root %q: %w", root, err)
}
docs := make([]*Document, 0)
nonFatalErrors := make([]error, 0)
if !info.IsDir() {
doc, err := s.fileToDocument(root, source)
if err != nil {
return nil, err
return nil, fmt.Errorf("convert local source file %q: %w", root, err)
}
return []*Document{doc}, nil
}
@@ -89,13 +92,22 @@ func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document,
doc, err := s.fileToDocument(path, source)
if err != nil {
if len(nonFatalErrors) < 20 {
nonFatalErrors = append(nonFatalErrors, fmt.Errorf("%s: %w", path, err))
}
return nil
}
docs = append(docs, doc)
return nil
})
if err != nil {
return nil, err
return nil, fmt.Errorf("walk local source root %q: %w", root, err)
}
if len(nonFatalErrors) > 0 {
log.Printf("local scraper skipped %d files due to conversion errors (sample: %v)", len(nonFatalErrors), nonFatalErrors[0])
if len(docs) == 0 {
return nil, fmt.Errorf("local scrape failed while converting files: %w", errors.Join(nonFatalErrors...))
}
}
return docs, nil
@@ -118,7 +130,7 @@ func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHa
h := sha256.New()
err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
return fmt.Errorf("walk local source path %q: %w", path, err)
}
if d.IsDir() {
name := d.Name()
@@ -133,13 +145,13 @@ func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHa
info, infoErr := d.Info()
if infoErr != nil {
return infoErr
return fmt.Errorf("stat local source file %q: %w", path, infoErr)
}
fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
return nil
})
if err != nil {
return false, "", err
return false, "", fmt.Errorf("walk local source root %q for change detection: %w", root, err)
}
hash := hex.EncodeToString(h.Sum(nil))
@@ -149,7 +161,7 @@ func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHa
func (s *LocalScraper) fileToDocument(path string, source *Source) (*Document, error) {
b, err := os.ReadFile(path)
if err != nil {
return nil, err
return nil, fmt.Errorf("read local source file %q: %w", path, err)
}
ext := strings.ToLower(filepath.Ext(path))
+4 -3
View File
@@ -5,6 +5,7 @@ import (
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
@@ -81,7 +82,7 @@ func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Doc
docs := make([]*Document, 0, limit)
seen := make(map[string]bool)
var scrapeErrors []string
var scrapeErrors []error
for i, result := range results {
if ctx.Err() != nil {
@@ -109,7 +110,7 @@ func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Doc
})
if err != nil {
if len(scrapeErrors) < 20 {
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", resultURL, err))
scrapeErrors = append(scrapeErrors, fmt.Errorf("%s: %w", resultURL, err))
}
continue
}
@@ -140,7 +141,7 @@ func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Doc
if len(docs) == 0 {
if len(scrapeErrors) > 0 {
return nil, fmt.Errorf("local search returned results but page scraping failed: %s", strings.Join(scrapeErrors, "; "))
return nil, fmt.Errorf("local search returned results but page scraping failed: %w", errors.Join(scrapeErrors...))
}
return nil, fmt.Errorf("local search yielded no usable results for query %q", query)
}
+7 -7
View File
@@ -42,12 +42,12 @@ func (s *OpenAPIScraper) Scrape(ctx context.Context, source *Source) ([]*Documen
raw, specURL, err := s.readSpec(ctx, source)
if err != nil {
return nil, err
return nil, fmt.Errorf("read openapi spec: %w", err)
}
spec, err := parseOpenAPISpec(raw)
if err != nil {
return nil, err
return nil, fmt.Errorf("parse openapi spec %q: %w", specURL, err)
}
docs := make([]*Document, 0)
@@ -138,7 +138,7 @@ func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte,
if strings.HasPrefix(rawPath, "http://") || strings.HasPrefix(rawPath, "https://") {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawPath, nil)
if err != nil {
return nil, "", err
return nil, "", fmt.Errorf("build openapi fetch request: %w", err)
}
if s.config != nil && strings.TrimSpace(s.config.UserAgent) != "" {
req.Header.Set("User-Agent", s.config.UserAgent)
@@ -146,7 +146,7 @@ func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte,
resp, err := s.client.Do(req)
if err != nil {
return nil, "", err
return nil, "", fmt.Errorf("fetch openapi spec from %s: %w", rawPath, err)
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
@@ -154,14 +154,14 @@ func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte,
}
body, err := io.ReadAll(io.LimitReader(resp.Body, 10<<20))
if err != nil {
return nil, "", err
return nil, "", fmt.Errorf("read openapi response body from %s: %w", rawPath, err)
}
return body, rawPath, nil
}
b, err := os.ReadFile(rawPath)
if err != nil {
return nil, "", err
return nil, "", fmt.Errorf("read openapi file %q: %w", rawPath, err)
}
return b, "file://" + rawPath, nil
}
@@ -214,7 +214,7 @@ func parseOpenAPISpec(raw []byte) (*openAPISpec, error) {
var spec openAPISpec
if err := json.Unmarshal(raw, &spec); err != nil {
if yamlErr := yaml.Unmarshal(raw, &spec); yamlErr != nil {
return nil, fmt.Errorf("invalid openapi content: %w", err)
return nil, fmt.Errorf("invalid openapi content (json: %v; yaml: %w)", err, yamlErr)
}
}
+5 -4
View File
@@ -4,6 +4,7 @@ import (
"context"
"crypto/sha256"
"encoding/hex"
"errors"
"fmt"
"net/url"
"path"
@@ -32,7 +33,7 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
scheduled := make(map[string]bool)
contentHashes := make(map[string]bool)
var mu sync.Mutex
var scrapeErrors []string
var scrapeErrors []error
// Parse base URL for domain restrictions
baseURL, err := url.Parse(source.URL)
@@ -102,7 +103,7 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
}
mu.Lock()
if len(scrapeErrors) < 20 {
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err))
scrapeErrors = append(scrapeErrors, fmt.Errorf("%s: %w", reqURL, err))
}
mu.Unlock()
})
@@ -236,7 +237,7 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
}
mu.Lock()
if len(scrapeErrors) < 20 {
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err))
scrapeErrors = append(scrapeErrors, fmt.Errorf("%s: %w", absoluteURL, err))
}
mu.Unlock()
}
@@ -256,7 +257,7 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
if len(documents) == 0 {
if len(scrapeErrors) > 0 {
return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; "))
return nil, fmt.Errorf("web scrape failed: %w", errors.Join(scrapeErrors...))
}
return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
}