mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
I don't like commits
This commit is contained in:
+1
-23
@@ -5,7 +5,6 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -70,28 +69,7 @@ func (s *AstroDocsScraper) DetectChanges(ctx context.Context, source *Source, la
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) generateHash(content string) string {
|
||||
|
||||
+1
-23
@@ -5,7 +5,6 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -75,28 +74,7 @@ func (s *CloudflareDocsScraper) DetectChanges(ctx context.Context, source *Sourc
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) generateHash(content string) string {
|
||||
|
||||
+1
-23
@@ -5,7 +5,6 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -70,28 +69,7 @@ func (s *DockerDocsScraper) DetectChanges(ctx context.Context, source *Source, l
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) generateHash(content string) string {
|
||||
|
||||
Vendored
+1
-23
@@ -6,7 +6,6 @@ import (
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -91,28 +90,7 @@ func (s *GoDocsScraper) DetectChanges(ctx context.Context, source *Source, lastH
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) generateHash(content string) string {
|
||||
|
||||
+1
-23
@@ -5,7 +5,6 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -85,28 +84,7 @@ func (s *JavaDocsScraper) DetectChanges(ctx context.Context, source *Source, las
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) generateHash(content string) string {
|
||||
|
||||
Vendored
+1
-23
@@ -5,7 +5,6 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -80,28 +79,7 @@ func (s *MCPDocsScraper) DetectChanges(ctx context.Context, source *Source, last
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) generateHash(content string) string {
|
||||
|
||||
+1
-23
@@ -5,7 +5,6 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -90,28 +89,7 @@ func (s *NuxtDocsScraper) DetectChanges(ctx context.Context, source *Source, las
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) generateHash(content string) string {
|
||||
|
||||
+1
-23
@@ -5,7 +5,6 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -100,28 +99,7 @@ func (s *PythonDocsScraper) DetectChanges(ctx context.Context, source *Source, l
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) generateHash(content string) string {
|
||||
|
||||
+1
-23
@@ -5,7 +5,6 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -80,28 +79,7 @@ func (s *ReactDocsScraper) DetectChanges(ctx context.Context, source *Source, la
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) generateHash(content string) string {
|
||||
|
||||
+1
-23
@@ -6,7 +6,6 @@ import (
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -106,28 +105,7 @@ func (s *RustDocsScraper) DetectChanges(ctx context.Context, source *Source, las
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) generateHash(content string) string {
|
||||
|
||||
+1
-23
@@ -5,7 +5,6 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -80,28 +79,7 @@ func (s *SpringDocsScraper) DetectChanges(ctx context.Context, source *Source, l
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) generateHash(content string) string {
|
||||
|
||||
Vendored
+1
-23
@@ -5,7 +5,6 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -85,28 +84,7 @@ func (s *TSDocsScraper) DetectChanges(ctx context.Context, source *Source, lastH
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) generateHash(content string) string {
|
||||
|
||||
Vendored
+65
@@ -1,8 +1,14 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
basescraper "github.com/yourorg/devour/internal/scraper"
|
||||
)
|
||||
@@ -19,3 +25,62 @@ func generateDocID(urlStr string) string {
|
||||
hash := sha256.Sum256([]byte(urlStr))
|
||||
return hex.EncodeToString(hash[:12])
|
||||
}
|
||||
|
||||
func fetchExternalPage(ctx context.Context, client *http.Client, userAgent, targetURL string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", targetURL, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", userAgent)
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
snippet, readErr := readErrorSnippet(resp.Body)
|
||||
if readErr != nil {
|
||||
return "", fmt.Errorf("GET %s returned HTTP %d and body read failed: %w", summarizeURL(targetURL), resp.StatusCode, readErr)
|
||||
}
|
||||
return "", fmt.Errorf("GET %s returned HTTP %d: %s", summarizeURL(targetURL), resp.StatusCode, snippet)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
// readErrorSnippet reads at most maxErrorBodyBytes from body and returns it as
// a compact, single-line excerpt suitable for embedding in an error message.
//
// Bytes that are not valid UTF-8 (including a multi-byte character cut in half
// by the size limit) are dropped, and every run of whitespace, newlines
// included, is collapsed to a single space. A body with no visible text yields
// the placeholder "<empty body>". A read failure is returned unchanged together
// with an empty string.
func readErrorSnippet(body io.Reader) (string, error) {
	const maxErrorBodyBytes = 512

	data, err := io.ReadAll(io.LimitReader(body, maxErrorBodyBytes))
	if err != nil {
		return "", err
	}

	// The byte limit can split a multi-byte rune; strip the invalid remainder
	// so the excerpt stays valid UTF-8.
	text := strings.ToValidUTF8(string(data), "")

	// strings.Fields trims both ends and splits on any whitespace run, which
	// keeps HTML or multi-line error pages on one line of the error message.
	msg := strings.Join(strings.Fields(text), " ")
	if msg == "" {
		return "<empty body>", nil
	}

	return msg, nil
}
|
||||
|
||||
// summarizeURL shortens rawURL to "host/path" for use in messages, leaving out
// the scheme, query string and fragment. The path is kept in its escaped form
// and defaults to "/" when the URL has none. Input that cannot be parsed, or
// that has no host component, is returned unchanged.
func summarizeURL(rawURL string) string {
	u, parseErr := url.Parse(rawURL)
	if parseErr != nil {
		return rawURL
	}
	if u.Host == "" {
		return rawURL
	}

	if escaped := u.EscapedPath(); escaped != "" {
		return u.Host + escaped
	}
	return u.Host + "/"
}
|
||||
|
||||
Vendored
+1
-23
@@ -5,7 +5,6 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -85,28 +84,7 @@ func (s *VueDocsScraper) DetectChanges(ctx context.Context, source *Source, last
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) generateHash(content string) string {
|
||||
|
||||
@@ -28,12 +28,12 @@ func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document
|
||||
|
||||
repoURL, repoName, err := s.resolveRepo(source)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("resolve github repository: %w", err)
|
||||
}
|
||||
|
||||
tmpDir, err := os.MkdirTemp("", "devour-github-*")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("create temporary clone directory: %w", err)
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
@@ -81,14 +81,14 @@ func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document
|
||||
local := NewLocalScraper(s.config)
|
||||
docs, err := local.Scrape(ctx, localSource)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("scrape repository docs: %w", err)
|
||||
}
|
||||
if len(docs) == 0 && len(source.Include) == 0 {
|
||||
// Sparse patterns did not match this repository layout; retry full checkout.
|
||||
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
|
||||
docs, err = local.Scrape(ctx, localSource)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("scrape repository docs after sparse fallback: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -129,7 +129,7 @@ func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastH
|
||||
}
|
||||
_, repoName, err := s.resolveRepo(source)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
return false, "", fmt.Errorf("resolve github repository: %w", err)
|
||||
}
|
||||
|
||||
remote := "https://github.com/" + strings.TrimSuffix(repoName, ".git") + ".git"
|
||||
@@ -141,7 +141,7 @@ func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastH
|
||||
cmd := exec.CommandContext(ctx, "git", "ls-remote", remote, branch)
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
return false, "", fmt.Errorf("run git ls-remote for %s (%s): %w", remote, branch, err)
|
||||
}
|
||||
line := strings.TrimSpace(string(output))
|
||||
if line == "" {
|
||||
@@ -169,7 +169,7 @@ func (s *GitHubScraper) resolveRepo(source *Source) (repoURL string, repoName st
|
||||
|
||||
u, err := url.Parse(raw)
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
return "", "", fmt.Errorf("parse github url %q: %w", raw, err)
|
||||
}
|
||||
if !strings.Contains(strings.ToLower(u.Host), "github.com") {
|
||||
return "", "", fmt.Errorf("not a github url: %s", raw)
|
||||
|
||||
@@ -4,8 +4,10 @@ import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
@@ -44,14 +46,15 @@ func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document,
|
||||
|
||||
info, err := os.Stat(root)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("stat local source root %q: %w", root, err)
|
||||
}
|
||||
|
||||
docs := make([]*Document, 0)
|
||||
nonFatalErrors := make([]error, 0)
|
||||
if !info.IsDir() {
|
||||
doc, err := s.fileToDocument(root, source)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("convert local source file %q: %w", root, err)
|
||||
}
|
||||
return []*Document{doc}, nil
|
||||
}
|
||||
@@ -89,13 +92,22 @@ func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document,
|
||||
|
||||
doc, err := s.fileToDocument(path, source)
|
||||
if err != nil {
|
||||
if len(nonFatalErrors) < 20 {
|
||||
nonFatalErrors = append(nonFatalErrors, fmt.Errorf("%s: %w", path, err))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
docs = append(docs, doc)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("walk local source root %q: %w", root, err)
|
||||
}
|
||||
if len(nonFatalErrors) > 0 {
|
||||
log.Printf("local scraper skipped %d files due to conversion errors (sample: %v)", len(nonFatalErrors), nonFatalErrors[0])
|
||||
if len(docs) == 0 {
|
||||
return nil, fmt.Errorf("local scrape failed while converting files: %w", errors.Join(nonFatalErrors...))
|
||||
}
|
||||
}
|
||||
|
||||
return docs, nil
|
||||
@@ -118,7 +130,7 @@ func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHa
|
||||
h := sha256.New()
|
||||
err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("walk local source path %q: %w", path, err)
|
||||
}
|
||||
if d.IsDir() {
|
||||
name := d.Name()
|
||||
@@ -133,13 +145,13 @@ func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHa
|
||||
|
||||
info, infoErr := d.Info()
|
||||
if infoErr != nil {
|
||||
return infoErr
|
||||
return fmt.Errorf("stat local source file %q: %w", path, infoErr)
|
||||
}
|
||||
fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
return false, "", fmt.Errorf("walk local source root %q for change detection: %w", root, err)
|
||||
}
|
||||
|
||||
hash := hex.EncodeToString(h.Sum(nil))
|
||||
@@ -149,7 +161,7 @@ func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHa
|
||||
func (s *LocalScraper) fileToDocument(path string, source *Source) (*Document, error) {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("read local source file %q: %w", path, err)
|
||||
}
|
||||
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
@@ -81,7 +82,7 @@ func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Doc
|
||||
|
||||
docs := make([]*Document, 0, limit)
|
||||
seen := make(map[string]bool)
|
||||
var scrapeErrors []string
|
||||
var scrapeErrors []error
|
||||
|
||||
for i, result := range results {
|
||||
if ctx.Err() != nil {
|
||||
@@ -109,7 +110,7 @@ func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Doc
|
||||
})
|
||||
if err != nil {
|
||||
if len(scrapeErrors) < 20 {
|
||||
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", resultURL, err))
|
||||
scrapeErrors = append(scrapeErrors, fmt.Errorf("%s: %w", resultURL, err))
|
||||
}
|
||||
continue
|
||||
}
|
||||
@@ -140,7 +141,7 @@ func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Doc
|
||||
|
||||
if len(docs) == 0 {
|
||||
if len(scrapeErrors) > 0 {
|
||||
return nil, fmt.Errorf("local search returned results but page scraping failed: %s", strings.Join(scrapeErrors, "; "))
|
||||
return nil, fmt.Errorf("local search returned results but page scraping failed: %w", errors.Join(scrapeErrors...))
|
||||
}
|
||||
return nil, fmt.Errorf("local search yielded no usable results for query %q", query)
|
||||
}
|
||||
|
||||
@@ -42,12 +42,12 @@ func (s *OpenAPIScraper) Scrape(ctx context.Context, source *Source) ([]*Documen
|
||||
|
||||
raw, specURL, err := s.readSpec(ctx, source)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("read openapi spec: %w", err)
|
||||
}
|
||||
|
||||
spec, err := parseOpenAPISpec(raw)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("parse openapi spec %q: %w", specURL, err)
|
||||
}
|
||||
|
||||
docs := make([]*Document, 0)
|
||||
@@ -138,7 +138,7 @@ func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte,
|
||||
if strings.HasPrefix(rawPath, "http://") || strings.HasPrefix(rawPath, "https://") {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawPath, nil)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
return nil, "", fmt.Errorf("build openapi fetch request: %w", err)
|
||||
}
|
||||
if s.config != nil && strings.TrimSpace(s.config.UserAgent) != "" {
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
@@ -146,7 +146,7 @@ func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte,
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
return nil, "", fmt.Errorf("fetch openapi spec from %s: %w", rawPath, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
@@ -154,14 +154,14 @@ func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte,
|
||||
}
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 10<<20))
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
return nil, "", fmt.Errorf("read openapi response body from %s: %w", rawPath, err)
|
||||
}
|
||||
return body, rawPath, nil
|
||||
}
|
||||
|
||||
b, err := os.ReadFile(rawPath)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
return nil, "", fmt.Errorf("read openapi file %q: %w", rawPath, err)
|
||||
}
|
||||
return b, "file://" + rawPath, nil
|
||||
}
|
||||
@@ -214,7 +214,7 @@ func parseOpenAPISpec(raw []byte) (*openAPISpec, error) {
|
||||
var spec openAPISpec
|
||||
if err := json.Unmarshal(raw, &spec); err != nil {
|
||||
if yamlErr := yaml.Unmarshal(raw, &spec); yamlErr != nil {
|
||||
return nil, fmt.Errorf("invalid openapi content: %w", err)
|
||||
return nil, fmt.Errorf("invalid openapi content (json: %v; yaml: %w)", err, yamlErr)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"path"
|
||||
@@ -32,7 +33,7 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
scheduled := make(map[string]bool)
|
||||
contentHashes := make(map[string]bool)
|
||||
var mu sync.Mutex
|
||||
var scrapeErrors []string
|
||||
var scrapeErrors []error
|
||||
|
||||
// Parse base URL for domain restrictions
|
||||
baseURL, err := url.Parse(source.URL)
|
||||
@@ -102,7 +103,7 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
}
|
||||
mu.Lock()
|
||||
if len(scrapeErrors) < 20 {
|
||||
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err))
|
||||
scrapeErrors = append(scrapeErrors, fmt.Errorf("%s: %w", reqURL, err))
|
||||
}
|
||||
mu.Unlock()
|
||||
})
|
||||
@@ -236,7 +237,7 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
}
|
||||
mu.Lock()
|
||||
if len(scrapeErrors) < 20 {
|
||||
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err))
|
||||
scrapeErrors = append(scrapeErrors, fmt.Errorf("%s: %w", absoluteURL, err))
|
||||
}
|
||||
mu.Unlock()
|
||||
}
|
||||
@@ -256,7 +257,7 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
|
||||
if len(documents) == 0 {
|
||||
if len(scrapeErrors) > 0 {
|
||||
return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; "))
|
||||
return nil, fmt.Errorf("web scrape failed: %w", errors.Join(scrapeErrors...))
|
||||
}
|
||||
return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user