This commit is contained in:
Tomas Dvorak
2026-02-24 10:33:59 +01:00
parent 409acd2e08
commit 898a3c303f
1374 changed files with 290409 additions and 29187 deletions
+45
View File
@@ -0,0 +1,45 @@
package scraper
import basescraper "github.com/yourorg/devour/internal/scraper"
// init registers a factory for every documentation scraper implemented in
// this package by calling basescraper.RegisterScraper once per source type.
func init() {
	register := basescraper.RegisterScraper

	register(basescraper.SourceTypeGoDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewGoDocsScraper(cfg)
	})
	register(basescraper.SourceTypeRustDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewRustDocsScraper(cfg)
	})
	register(basescraper.SourceTypePythonDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewPythonDocsScraper(cfg)
	})
	register(basescraper.SourceTypeJavaDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewJavaDocsScraper(cfg)
	})
	register(basescraper.SourceTypeSpringDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewSpringDocsScraper(cfg)
	})
	register(basescraper.SourceTypeTSDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewTSDocsScraper(cfg)
	})
	register(basescraper.SourceTypeReactDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewReactDocsScraper(cfg)
	})
	register(basescraper.SourceTypeVueDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewVueDocsScraper(cfg)
	})
	register(basescraper.SourceTypeNuxtDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewNuxtDocsScraper(cfg)
	})
	register(basescraper.SourceTypeMCPDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewMCPDocsScraper(cfg)
	})
	register(basescraper.SourceTypeDockerDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewDockerDocsScraper(cfg)
	})
	register(basescraper.SourceTypeCloudflareDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewCloudflareDocsScraper(cfg)
	})
	register(basescraper.SourceTypeAstroDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewAstroDocsScraper(cfg)
	})
}
+27 -12
View File
@@ -155,16 +155,18 @@ func (s *TSDocsScraper) interfaceToDocument(iface *tsdocs.Interface, module *tsd
metadata := map[string]interface{}{
"module": module.Name,
"name": iface.Name,
"doc_url": iface.DocURL,
"doc_url": coalesceDocURL(iface.DocURL, module.DocURL),
}
docURL := coalesceDocURL(iface.DocURL, module.DocURL)
return &Document{
ID: generateDocID(iface.DocURL),
ID: generateDocID(docURL),
Source: sourceName,
Type: "ts-interface",
Title: iface.Name,
Content: content.String(),
URL: iface.DocURL,
URL: docURL,
Metadata: metadata,
Hash: s.generateHash(content.String()),
Timestamp: time.Now(),
@@ -185,16 +187,18 @@ func (s *TSDocsScraper) functionToDocument(fn *tsdocs.Function, module *tsdocs.M
"module": module.Name,
"name": fn.Name,
"return_type": fn.ReturnType,
"doc_url": fn.DocURL,
"doc_url": coalesceDocURL(fn.DocURL, module.DocURL),
}
docURL := coalesceDocURL(fn.DocURL, module.DocURL)
return &Document{
ID: generateDocID(fn.DocURL),
ID: generateDocID(docURL),
Source: sourceName,
Type: "ts-function",
Title: fn.Name,
Content: content.String(),
URL: fn.DocURL,
URL: docURL,
Metadata: metadata,
Hash: s.generateHash(content.String()),
Timestamp: time.Now(),
@@ -217,16 +221,18 @@ func (s *TSDocsScraper) classToDocument(class *tsdocs.Class, module *tsdocs.Modu
metadata := map[string]interface{}{
"module": module.Name,
"name": class.Name,
"doc_url": class.DocURL,
"doc_url": coalesceDocURL(class.DocURL, module.DocURL),
}
docURL := coalesceDocURL(class.DocURL, module.DocURL)
return &Document{
ID: generateDocID(class.DocURL),
ID: generateDocID(docURL),
Source: sourceName,
Type: "ts-class",
Title: class.Name,
Content: content.String(),
URL: class.DocURL,
URL: docURL,
Metadata: metadata,
Hash: s.generateHash(content.String()),
Timestamp: time.Now(),
@@ -244,18 +250,27 @@ func (s *TSDocsScraper) typeAliasToDocument(ta *tsdocs.TypeAlias, module *tsdocs
metadata := map[string]interface{}{
"module": module.Name,
"name": ta.Name,
"doc_url": ta.DocURL,
"doc_url": coalesceDocURL(ta.DocURL, module.DocURL),
}
docURL := coalesceDocURL(ta.DocURL, module.DocURL)
return &Document{
ID: generateDocID(ta.DocURL),
ID: generateDocID(docURL),
Source: sourceName,
Type: "ts-type",
Title: ta.Name,
Content: content.String(),
URL: ta.DocURL,
URL: docURL,
Metadata: metadata,
Hash: s.generateHash(content.String()),
Timestamp: time.Now(),
}
}
// coalesceDocURL returns primary unless it is empty or whitespace-only, in
// which case fallback is returned instead. primary is returned untrimmed.
func coalesceDocURL(primary, fallback string) string {
	if strings.TrimSpace(primary) == "" {
		return fallback
	}
	return primary
}
+65
View File
@@ -0,0 +1,65 @@
package scraper
import (
"testing"
"github.com/yourorg/devour/pkg/tsdocs"
)
// TestTSDocsSubDocsFallbackToModuleURL checks that interface, function, class
// and type-alias documents built from entries with an empty DocURL use the
// module's DocURL for both Document.URL and the "doc_url" metadata entry, and
// that each document keeps its expected type.
func TestTSDocsSubDocsFallbackToModuleURL(t *testing.T) {
	scraper := &TSDocsScraper{}
	mod := &tsdocs.Module{
		Name:   "Module",
		DocURL: "https://www.typescriptlang.org/docs/handbook/2/basic-types.html",
	}

	tests := []struct {
		name     string
		wantType string
		makeDoc  func() *Document
	}{
		{
			name:     "interface",
			wantType: "ts-interface",
			makeDoc: func() *Document {
				return scraper.interfaceToDocument(&tsdocs.Interface{Name: "User", DocURL: ""}, mod, "ts")
			},
		},
		{
			name:     "function",
			wantType: "ts-function",
			makeDoc: func() *Document {
				return scraper.functionToDocument(&tsdocs.Function{Name: "parse", DocURL: ""}, mod, "ts")
			},
		},
		{
			name:     "class",
			wantType: "ts-class",
			makeDoc: func() *Document {
				return scraper.classToDocument(&tsdocs.Class{Name: "Service", DocURL: ""}, mod, "ts")
			},
		},
		{
			name:     "type alias",
			wantType: "ts-type",
			makeDoc: func() *Document {
				return scraper.typeAliasToDocument(&tsdocs.TypeAlias{Name: "ID", Type: "string", DocURL: ""}, mod, "ts")
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := tt.makeDoc()

			if got.URL != mod.DocURL {
				t.Fatalf("expected fallback URL %q, got %q", mod.DocURL, got.URL)
			}
			if metaURL := got.Metadata["doc_url"]; metaURL != mod.DocURL {
				t.Fatalf("expected metadata doc_url %q, got %#v", mod.DocURL, metaURL)
			}
			if got.Type != tt.wantType {
				t.Fatalf("expected doc type %q, got %q", tt.wantType, got.Type)
			}
		})
	}
}
+21
View File
@@ -0,0 +1,21 @@
package scraper
import (
"crypto/sha256"
"encoding/hex"
basescraper "github.com/yourorg/devour/internal/scraper"
)
// SourceType is an alias for basescraper.SourceType, usable unqualified in this package.
type SourceType = basescraper.SourceType

// Source is an alias for basescraper.Source.
type Source = basescraper.Source

// Document is an alias for basescraper.Document.
type Document = basescraper.Document

// Config is an alias for basescraper.Config.
type Config = basescraper.Config
// generateDocID derives a document identifier from urlStr: the first 12 bytes
// of its SHA-256 digest, hex-encoded (24 lowercase hexadecimal characters).
func generateDocID(urlStr string) string {
	digest := sha256.Sum256([]byte(urlStr))
	prefix := digest[:12]
	return hex.EncodeToString(prefix)
}
+171 -8
View File
@@ -2,6 +2,12 @@ package scraper
import (
"context"
"fmt"
"net/url"
"os"
"os/exec"
"path/filepath"
"strings"
)
// GitHubScraper scrapes documentation from GitHub repositories.
@@ -16,16 +22,173 @@ func NewGitHubScraper(config *Config) *GitHubScraper {
// Scrape clones and parses documents from a GitHub repository.
//
// The repository is cloned with depth 1, a blob-less filter and sparse
// checkout into a temporary directory that is removed before Scrape returns.
// The checkout is scanned with a LocalScraper, and every resulting document
// is re-pointed at its github.com blob URL and tagged with repository
// metadata ("repo", "repo_url", "source_type").
//
// When source.Include is empty, a sparse checkout limited to common docs
// directories is attempted first; if that produces no documents, sparse
// checkout is disabled and the scan is repeated.
//
// It returns an error when source is nil, the repository cannot be resolved,
// the temporary directory cannot be created, the clone fails, or the local
// scan fails.
func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}

	repoURL, repoName, err := s.resolveRepo(source)
	if err != nil {
		return nil, err
	}

	tmpDir, err := os.MkdirTemp("", "devour-github-*")
	if err != nil {
		return nil, fmt.Errorf("creating clone directory: %w", err)
	}
	defer os.RemoveAll(tmpDir)

	branch := strings.TrimSpace(source.Branch)

	cloneArgs := []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse"}
	if branch != "" {
		cloneArgs = append(cloneArgs, "--branch", branch)
	}
	cloneArgs = append(cloneArgs, repoURL, tmpDir)

	if output, err := exec.CommandContext(ctx, "git", cloneArgs...).CombinedOutput(); err != nil {
		return nil, fmt.Errorf("git clone failed: %w (%s)", err, strings.TrimSpace(string(output)))
	}

	// disableSparse switches the clone back to a full checkout. It is
	// best-effort, so its error is deliberately ignored.
	disableSparse := func() {
		_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
	}

	if len(source.Include) == 0 {
		// Try sparse checkout for common docs locations to reduce clone and parse cost.
		sparse := exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "set", "--skip-checks",
			"docs", "doc", "src/routes", "website/docs", "packages/*/docs")
		if err := sparse.Run(); err != nil {
			disableSparse()
		}
	} else {
		// Caller-supplied include patterns may match anywhere, so check out everything.
		disableSparse()
	}

	localSource := &Source{
		Name:     coalesce(source.Name, repoName),
		Type:     SourceTypeLocal,
		Path:     tmpDir,
		Include:  append([]string(nil), source.Include...),
		Exclude:  append([]string(nil), source.Exclude...),
		Schedule: source.Schedule,
	}
	if len(localSource.Include) == 0 {
		localSource.Include = []string{
			`(?i)(^|/)readme\.md$`,
			`(?i)(^|/)docs?/`,
			`(?i)\.md$`,
			`(?i)\.mdx$`,
		}
	}

	local := NewLocalScraper(s.config)
	docs, err := local.Scrape(ctx, localSource)
	if err != nil {
		return nil, err
	}
	if len(docs) == 0 && len(source.Include) == 0 {
		// Sparse patterns did not match this repository layout; retry full checkout.
		disableSparse()
		docs, err = local.Scrape(ctx, localSource)
		if err != nil {
			return nil, err
		}
	}

	// The ref used in blob URLs is the same for every document.
	branchForURL := branch
	if branchForURL == "" {
		branchForURL = "HEAD"
	}

	for _, doc := range docs {
		if doc == nil {
			continue
		}
		if doc.Metadata == nil {
			doc.Metadata = map[string]interface{}{}
		}
		// Replace the temporary on-disk path with a repo-relative path and
		// derive the public URL and ID from it.
		if rawPath, ok := doc.Metadata["path"].(string); ok {
			if relPath, relErr := filepath.Rel(tmpDir, rawPath); relErr == nil {
				relPath = strings.TrimPrefix(filepath.ToSlash(relPath), "./")
				if relPath != "" && relPath != "." {
					doc.URL = fmt.Sprintf("https://github.com/%s/blob/%s/%s", repoName, branchForURL, relPath)
					doc.ID = generateDocID(doc.URL)
					doc.Metadata["path"] = relPath
				}
			}
		}
		doc.Type = "github-document"
		doc.Metadata["repo"] = repoName
		doc.Metadata["repo_url"] = repoURL
		doc.Metadata["source_type"] = "github"
	}

	return docs, nil
}
// DetectChanges checks if the repository has new commits.
//
// It runs `git ls-remote` against the same remote URL that Scrape clones,
// for source.Branch (or HEAD when no branch is set), and compares the first
// reported commit hash with lastHash. It returns whether the hash differs
// and the current hash.
//
// It returns an error when source is nil, the repository cannot be resolved,
// git fails, or git prints nothing for the requested ref.
func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("source is required")
	}

	repoURL, _, err := s.resolveRepo(source)
	if err != nil {
		return false, "", err
	}

	ref := strings.TrimSpace(source.Branch)
	if ref == "" {
		ref = "HEAD"
	}

	output, err := exec.CommandContext(ctx, "git", "ls-remote", repoURL, ref).Output()
	if err != nil {
		return false, "", fmt.Errorf("git ls-remote %s %s: %w", repoURL, ref, err)
	}

	// Each output line is "<hash>\t<ref>"; the first field of the first line
	// is the hash being tracked.
	fields := strings.Fields(string(output))
	if len(fields) == 0 {
		return false, "", fmt.Errorf("empty ls-remote output")
	}

	hash := fields[0]
	return hash != lastHash, hash, nil
}
// resolveRepo determines the clone URL and the "owner/name" repository
// identifier for source.
//
// source.Repo takes precedence and is used as given after trimming
// whitespace, surrounding slashes and a trailing ".git". Otherwise
// source.URL is parsed: its host must be github.com or a subdomain of it,
// and the first two path segments are taken as owner and repository name.
//
// It returns an error when neither field is set, the URL cannot be parsed,
// the host is not a GitHub host, or the path lacks an owner or repository.
func (s *GitHubScraper) resolveRepo(source *Source) (repoURL string, repoName string, err error) {
	if strings.TrimSpace(source.Repo) != "" {
		repoName = strings.Trim(strings.TrimSpace(source.Repo), "/")
		repoName = strings.TrimSuffix(repoName, ".git")
		return "https://github.com/" + repoName + ".git", repoName, nil
	}

	raw := strings.TrimSpace(source.URL)
	if raw == "" {
		return "", "", fmt.Errorf("github source requires repo or url")
	}

	u, err := url.Parse(raw)
	if err != nil {
		return "", "", err
	}

	// Compare the hostname exactly (or as a parent domain). A substring match
	// would also accept hosts such as "evilgithub.com" or "github.com.evil.io".
	host := strings.ToLower(u.Hostname())
	if host != "github.com" && !strings.HasSuffix(host, ".github.com") {
		return "", "", fmt.Errorf("not a github url: %s", raw)
	}

	parts := strings.Split(strings.Trim(u.Path, "/"), "/")
	if len(parts) < 2 {
		return "", "", fmt.Errorf("invalid github repo url: %s", raw)
	}
	owner, name := parts[0], strings.TrimSuffix(parts[1], ".git")
	if owner == "" || name == "" {
		return "", "", fmt.Errorf("invalid github repo url: %s", raw)
	}

	repoName = owner + "/" + name
	repoURL = "https://github.com/" + repoName + ".git"
	return repoURL, repoName, nil
}
// coalesce picks a display name: primary when it is not blank, otherwise the
// last path element of fallback when that is not blank, otherwise "github".
func coalesce(primary, fallback string) string {
	switch {
	case strings.TrimSpace(primary) != "":
		return primary
	case strings.TrimSpace(fallback) != "":
		return filepath.Base(fallback)
	default:
		return "github"
	}
}
+227 -8
View File
@@ -2,6 +2,20 @@ package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"io/fs"
"os"
"path/filepath"
"regexp"
"strings"
"time"
)
var (
	// reLocalBlankLines matches a run of three or more consecutive newlines.
	reLocalBlankLines = regexp.MustCompile(`\n{3,}`)
	// reFrontMatterBlock matches a block at the very start of the text that
	// opens with a "---" line and ends with the next "---" line, including
	// that closing line's newline.
	reFrontMatterBlock = regexp.MustCompile(`(?s)\A---\s*\n.*?\n---\s*\n`)
)
// LocalScraper scrapes documentation from local filesystem.
@@ -16,16 +30,221 @@ func NewLocalScraper(config *Config) *LocalScraper {
// Scrape scans and parses documents from a local directory.
//
// The root is source.Path, falling back to source.URL. When the root is a
// single file it is converted directly. Otherwise the tree is walked:
// hidden directories and node_modules, vendor, dist and build directories
// below the root are skipped, files are filtered by the include/exclude
// patterns (matched against the slash-separated path relative to the root)
// and by isDocumentationFile, and each remaining file is converted to a
// Document. Files that cannot be converted are skipped.
//
// It returns an error when source is nil, no root is configured, the root
// cannot be stat'ed, the walk fails, or ctx is cancelled during the walk.
func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}

	root := strings.TrimSpace(source.Path)
	if root == "" {
		root = strings.TrimSpace(source.URL)
	}
	if root == "" {
		return nil, fmt.Errorf("path or url is required for local source")
	}

	info, err := os.Stat(root)
	if err != nil {
		return nil, err
	}

	if !info.IsDir() {
		doc, err := s.fileToDocument(root, source)
		if err != nil {
			return nil, err
		}
		return []*Document{doc}, nil
	}

	docs := make([]*Document, 0)
	// A WebScraper is created only to reuse its include/exclude matching.
	web := NewWebScraper(s.config)

	err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if err := ctx.Err(); err != nil {
			return err
		}

		if d.IsDir() {
			// Never filter the root itself: a root such as "." or ".docs"
			// would otherwise match the hidden-directory rule and the whole
			// scan would be skipped.
			if path == root {
				return nil
			}
			name := d.Name()
			if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
				return filepath.SkipDir
			}
			return nil
		}

		relPath := path
		if rel, relErr := filepath.Rel(root, path); relErr == nil {
			relPath = rel
		}
		if !web.shouldInclude(filepath.ToSlash(relPath), source.Include, source.Exclude) {
			return nil
		}
		if !isDocumentationFile(path) {
			return nil
		}

		doc, err := s.fileToDocument(path, source)
		if err != nil {
			// Best-effort: a file that cannot be converted must not abort the scan.
			return nil
		}
		docs = append(docs, doc)
		return nil
	})
	if err != nil {
		return nil, err
	}

	return docs, nil
}
// DetectChanges checks if files have been modified.
//
// It walks the same root as Scrape (source.Path, falling back to source.URL)
// and hashes the path, size and modification time of every file accepted by
// isDocumentationFile, skipping hidden directories and node_modules, vendor,
// dist and build directories below the root. File contents are not read.
// It returns whether the resulting hash differs from lastHash, and the hash.
//
// It returns an error when source is nil, no root is configured, the walk
// fails, or ctx is cancelled during the walk.
func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("source is required")
	}

	root := strings.TrimSpace(source.Path)
	if root == "" {
		root = strings.TrimSpace(source.URL)
	}
	if root == "" {
		return false, "", fmt.Errorf("path or url is required for local source")
	}

	h := sha256.New()
	err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if err := ctx.Err(); err != nil {
			return err
		}

		if d.IsDir() {
			// Never filter the root itself: a root such as "." or ".docs"
			// would otherwise match the hidden-directory rule and nothing
			// would be hashed.
			if path == root {
				return nil
			}
			name := d.Name()
			if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
				return filepath.SkipDir
			}
			return nil
		}
		if !isDocumentationFile(path) {
			return nil
		}

		info, infoErr := d.Info()
		if infoErr != nil {
			return infoErr
		}
		fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
		return nil
	})
	if err != nil {
		return false, "", err
	}

	hash := hex.EncodeToString(h.Sum(nil))
	return hash != lastHash, hash, nil
}
// fileToDocument reads the file at path and wraps it in a Document.
//
// The content is normalized with normalizeLocalContent; a file whose
// normalized content is empty yields an "empty file" error. The title is the
// file name without its extension, the URL is a "file://" URI built from the
// path, the hash is the SHA-256 of the raw bytes, and the document type is
// chosen from the lower-cased extension. The document's source name is
// source.Name, or the name of the file's parent directory when that is blank.
func (s *LocalScraper) fileToDocument(path string, source *Source) (*Document, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}

	ext := strings.ToLower(filepath.Ext(path))
	body := normalizeLocalContent(string(data), ext)
	if body == "" {
		return nil, fmt.Errorf("empty file")
	}

	var docType string
	switch ext {
	case ".md", ".mdx":
		docType = "local-markdown"
	case ".txt":
		docType = "local-text"
	case ".json", ".yaml", ".yml":
		docType = "local-data"
	case ".go", ".js", ".ts", ".tsx", ".py", ".java", ".rs", ".rb", ".php":
		docType = "local-code"
	default:
		docType = "local-document"
	}

	sourceName := source.Name
	if strings.TrimSpace(sourceName) == "" {
		sourceName = filepath.Base(filepath.Dir(path))
	}

	sum := sha256.Sum256(data)
	uri := "file://" + filepath.ToSlash(path)

	return &Document{
		ID:      generateDocID(uri),
		Source:  sourceName,
		Type:    docType,
		Title:   strings.TrimSuffix(filepath.Base(path), filepath.Ext(path)),
		Content: body,
		URL:     uri,
		Metadata: map[string]interface{}{
			"path": path,
			"size": len(data),
		},
		Hash:      hex.EncodeToString(sum[:]),
		Timestamp: time.Now(),
	}, nil
}
func normalizeLocalContent(content, ext string) string {
content = strings.TrimSpace(content)
if content == "" {
return ""
}
switch ext {
case ".md", ".mdx":
content = stripMarkdownFrontmatter(content)
content = stripMDXPreamble(content)
}
// Collapse excessive blank lines to reduce indexing noise.
content = reLocalBlankLines.ReplaceAllString(content, "\n\n")
return strings.TrimSpace(content)
}
// stripMarkdownFrontmatter removes a leading "---"-delimited front matter
// block from content. Content that does not start with a "---" line is
// returned unchanged.
func stripMarkdownFrontmatter(content string) string {
	opensWithFence := strings.HasPrefix(content, "---\n") || strings.HasPrefix(content, "---\r\n")
	if opensWithFence {
		return reFrontMatterBlock.ReplaceAllString(content, "")
	}
	return content
}
// stripMDXPreamble drops the leading lines of content that are blank or that
// start (after trimming) with "import " or "export ", and returns everything
// from the first other line onwards.
func stripMDXPreamble(content string) string {
	lines := strings.Split(content, "\n")

	start := 0
	for ; start < len(lines); start++ {
		trimmed := strings.TrimSpace(lines[start])
		blank := trimmed == ""
		moduleStmt := strings.HasPrefix(trimmed, "import ") || strings.HasPrefix(trimmed, "export ")
		if !blank && !moduleStmt {
			break
		}
	}

	return strings.Join(lines[start:], "\n")
}
// isDocumentationFile reports whether path has one of the file extensions
// this scraper ingests. The comparison is case-insensitive.
func isDocumentationFile(path string) bool {
	switch strings.ToLower(filepath.Ext(path)) {
	case ".md", ".mdx", ".txt", ".rst", ".adoc", ".html":
		// Prose and markup.
		return true
	case ".json", ".yaml", ".yml":
		// Structured data.
		return true
	case ".go", ".py", ".js", ".ts", ".tsx", ".java", ".rs", ".rb", ".php":
		// Source code.
		return true
	}
	return false
}
+102
View File
@@ -0,0 +1,102 @@
package scraper
import (
"context"
"os"
"path/filepath"
"strings"
"testing"
"time"
)
// TestLocalScraperScrapeDirectory writes a Markdown file, a text file and a
// binary file into a temporary directory and expects Scrape to return at
// least two documents.
func TestLocalScraperScrapeDirectory(t *testing.T) {
	dir := t.TempDir()

	fixtures := []struct {
		name string
		data []byte
	}{
		{"README.md", []byte("# Demo\n\nhello docs")},
		{"notes.txt", []byte("notes")},
		{"bin.bin", []byte{0x00, 0x01}},
	}
	for _, f := range fixtures {
		if err := os.WriteFile(filepath.Join(dir, f.name), f.data, 0o644); err != nil {
			t.Fatal(err)
		}
	}

	scraper := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
	src := &Source{Name: "local", Type: SourceTypeLocal, Path: dir}

	docs, err := scraper.Scrape(context.Background(), src)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := len(docs); n < 2 {
		t.Fatalf("expected at least 2 docs, got %d", n)
	}
}
// TestLocalScraperDetectChanges verifies that the first DetectChanges call
// reports a change with a non-empty hash, and that rewriting the file makes a
// second call report a change with a different hash.
func TestLocalScraperDetectChanges(t *testing.T) {
	tmp := t.TempDir()
	file := filepath.Join(tmp, "README.md")
	if err := os.WriteFile(file, []byte("v1"), 0o644); err != nil {
		t.Fatal(err)
	}

	s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
	src := &Source{Name: "local", Type: SourceTypeLocal, Path: tmp}

	changed, hash1, err := s.DetectChanges(context.Background(), src, "")
	if err != nil {
		t.Fatal(err)
	}
	if !changed || hash1 == "" {
		t.Fatalf("expected first detect to change with non-empty hash, changed=%v hash=%q", changed, hash1)
	}

	if err := os.WriteFile(file, []byte("v2"), 0o644); err != nil {
		t.Fatal(err)
	}
	// Both versions are two bytes long, so only the modification time can
	// distinguish them. Set it explicitly instead of sleeping a few
	// milliseconds, which is flaky on filesystems with coarse timestamps.
	bumped := time.Now().Add(time.Hour)
	if err := os.Chtimes(file, bumped, bumped); err != nil {
		t.Fatal(err)
	}

	changed, hash2, err := s.DetectChanges(context.Background(), src, hash1)
	if err != nil {
		t.Fatal(err)
	}
	if !changed {
		t.Fatal("expected change after file update")
	}
	if hash1 == hash2 {
		t.Fatal("expected hash to change")
	}
}
// TestLocalScraper_StripsFrontmatterAndMDXPreamble converts an MDX file that
// starts with front matter and import/export lines and checks that neither
// survives in the document content while the Markdown body does.
func TestLocalScraper_StripsFrontmatterAndMDXPreamble(t *testing.T) {
	const mdx = `---
title: My Doc
slug: /my-doc
---
import { Component } from "x"
export const meta = {}
# Heading
Actual documentation body.
`
	dir := t.TempDir()
	mdxPath := filepath.Join(dir, "doc.mdx")
	if err := os.WriteFile(mdxPath, []byte(mdx), 0o644); err != nil {
		t.Fatal(err)
	}

	scraper := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
	doc, err := scraper.fileToDocument(mdxPath, &Source{Name: "local", Type: SourceTypeLocal, Path: dir})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// The first failing check aborts the test, in this order.
	switch got := doc.Content; {
	case strings.Contains(got, "slug: /my-doc"):
		t.Fatalf("expected frontmatter to be stripped, got: %q", got)
	case strings.Contains(got, "import { Component }"):
		t.Fatalf("expected MDX preamble to be stripped, got: %q", got)
	case !strings.Contains(got, "Actual documentation body."):
		t.Fatalf("expected markdown body in content, got: %q", got)
	}
}
+402
View File
@@ -0,0 +1,402 @@
package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"sort"
"strconv"
"strings"
"time"
)
const (
	// defaultLocalSearchLimit is the result limit used when the requested
	// limit is zero or negative.
	defaultLocalSearchLimit = 8
	// maxLocalSearchLimit is the largest result limit that is honoured.
	maxLocalSearchLimit = 50
	// maxSearchResponseBytes bounds how much of a search API response body is read.
	maxSearchResponseBytes = 2 << 20 // 2 MiB
)
// LocalSearchScraper scrapes docs from result URLs returned by a local search API.
type LocalSearchScraper struct {
	// config is this scraper's own copy of the caller's configuration with
	// defaults applied for the user agent and timeout.
	config *Config
	// client sends the search API requests.
	client *http.Client
	// web scrapes the individual result pages and supplies include/exclude matching.
	web *WebScraper
}
// localSearchResult is one entry decoded from a search API response.
type localSearchResult struct {
	// URL is the result link, read from the "url", "link" or "href" field.
	URL string
	// Title is read from the "title" or "name" field.
	Title string
	// Snippet is read from the "content", "snippet", "description" or "text" field.
	Snippet string
	// Engine is read from the "engine" or "source" field.
	Engine string
	// Score is read from the "score" or "relevance" field; 0 when neither is usable.
	Score float64
}
// NewLocalSearchScraper creates a scraper backed by a self-hosted search API.
//
// config is copied, so the caller's value is never modified, and may be nil.
// An empty UserAgent and a non-positive Timeout are replaced with defaults.
// The embedded web scraper gets its own copy of the configuration with
// Concurrency and MaxDepth both set to 1.
func NewLocalSearchScraper(config *Config) *LocalSearchScraper {
	var base Config
	if config != nil {
		base = *config
	}
	if base.UserAgent == "" {
		base.UserAgent = "Devour/1.0 (Local Search Scraper)"
	}
	if base.Timeout <= 0 {
		base.Timeout = 30 * time.Second
	}

	pageConfig := base
	pageConfig.Concurrency = 1
	pageConfig.MaxDepth = 1

	scraper := &LocalSearchScraper{
		config: &base,
		client: &http.Client{Timeout: base.Timeout},
		web:    NewWebScraper(&pageConfig),
	}
	return scraper
}
// Scrape queries a local search API and scrapes the returned URLs.
//
// Each result URL has its fragment removed and is then de-duplicated and
// filtered by source.Domains and the include/exclude patterns before being
// scraped with the embedded web scraper. Every scraped document is annotated
// with search metadata: the API URL, the query, the result's rank and, when
// present, its engine, snippet and score. A document without a title takes
// the search result's title.
//
// Pages that fail to scrape are skipped. An error is returned when source or
// its URL or query is missing, the search itself fails, ctx is cancelled, or
// no document at all could be produced.
func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	switch {
	case source == nil:
		return nil, fmt.Errorf("source is required")
	case strings.TrimSpace(source.URL) == "":
		return nil, fmt.Errorf("search API URL is required")
	}

	query := strings.TrimSpace(source.Query)
	if query == "" {
		return nil, fmt.Errorf("search query is required for localsearch sources (set source.query or --search-query)")
	}

	limit := clampLocalSearchLimit(source.ResultLimit)
	results, err := s.search(ctx, source, query, limit)
	if err != nil {
		return nil, err
	}

	var (
		docs     = make([]*Document, 0, limit)
		visited  = make(map[string]bool)
		failures []string
	)

	for idx, hit := range results {
		if err := ctx.Err(); err != nil {
			return nil, err
		}

		target := stripURLFragment(hit.URL)
		switch {
		case target == "", visited[target]:
			continue
		case !domainAllowed(target, source.Domains):
			continue
		case !s.web.shouldInclude(target, source.Include, source.Exclude):
			continue
		}
		visited[target] = true

		pages, err := s.web.Scrape(ctx, &Source{
			Name:    source.Name,
			Type:    SourceTypeWeb,
			URL:     target,
			Include: source.Include,
			Exclude: source.Exclude,
		})
		if err != nil {
			// Remember at most 20 failures for the final error message.
			if len(failures) < 20 {
				failures = append(failures, fmt.Sprintf("%s: %v", target, err))
			}
			continue
		}

		fallbackTitle := strings.TrimSpace(hit.Title)
		for _, doc := range pages {
			if doc.Metadata == nil {
				doc.Metadata = make(map[string]interface{})
			}
			meta := doc.Metadata
			meta["search_api"] = source.URL
			meta["search_query"] = query
			meta["search_rank"] = idx + 1
			if hit.Engine != "" {
				meta["search_engine"] = hit.Engine
			}
			if hit.Snippet != "" {
				meta["search_snippet"] = hit.Snippet
			}
			if hit.Score != 0 {
				meta["search_score"] = hit.Score
			}
			if strings.TrimSpace(doc.Title) == "" && fallbackTitle != "" {
				doc.Title = fallbackTitle
			}
			docs = append(docs, doc)
		}
	}

	if len(docs) > 0 {
		return docs, nil
	}
	if len(failures) > 0 {
		return nil, fmt.Errorf("local search returned results but page scraping failed: %s", strings.Join(failures, "; "))
	}
	return nil, fmt.Errorf("local search yielded no usable results for query %q", query)
}
// DetectChanges checks if top search results changed.
//
// It runs the search, keeps the results that pass the same domain and
// include/exclude filters as Scrape, and hashes the sorted list of
// "url|title|engine|score" signatures with SHA-256. It returns whether that
// hash differs from lastHash, and the hash itself.
func (s *LocalSearchScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("source is required")
	}

	query := strings.TrimSpace(source.Query)
	if query == "" {
		return false, "", fmt.Errorf("search query is required for localsearch sources")
	}

	results, err := s.search(ctx, source, query, clampLocalSearchLimit(source.ResultLimit))
	if err != nil {
		return false, "", err
	}

	signatures := make([]string, 0, len(results))
	for _, hit := range results {
		target := stripURLFragment(hit.URL)
		usable := target != "" &&
			domainAllowed(target, source.Domains) &&
			s.web.shouldInclude(target, source.Include, source.Exclude)
		if !usable {
			continue
		}
		signatures = append(signatures, fmt.Sprintf("%s|%s|%s|%.6f", target, hit.Title, hit.Engine, hit.Score))
	}
	// Sort so the hash does not depend on result order.
	sort.Strings(signatures)

	digest := sha256.Sum256([]byte(strings.Join(signatures, "\n")))
	current := hex.EncodeToString(digest[:])
	return current != lastHash, current, nil
}
// search sends one GET request to the search API configured in source.URL
// and returns at most limit decoded results.
//
// It returns an error when the URL is invalid, the request fails, the API
// answers with a non-2xx status, the body exceeds maxSearchResponseBytes,
// the body cannot be decoded, or the API returns no results.
func (s *LocalSearchScraper) search(ctx context.Context, source *Source, query string, limit int) ([]localSearchResult, error) {
	searchURL, err := buildLocalSearchURL(source.URL, query, limit)
	if err != nil {
		return nil, err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to build search request: %w", err)
	}
	req.Header.Set("User-Agent", s.config.UserAgent)
	req.Header.Set("Accept", "application/json")

	resp, err := s.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("search API request failed: %w", err)
	}
	defer resp.Body.Close()

	// Read one byte past the cap so an oversized body can be reported as
	// such instead of being truncated and failing later as "invalid JSON".
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxSearchResponseBytes+1))
	if err != nil {
		return nil, fmt.Errorf("failed reading search API response: %w", err)
	}

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		msg := strings.TrimSpace(string(body))
		if len(msg) > 200 {
			// Cutting at a byte offset can split a multi-byte character;
			// drop any invalid bytes so the message stays valid UTF-8.
			msg = strings.ToValidUTF8(msg[:200], "")
		}
		return nil, fmt.Errorf("search API returned HTTP %d: %s", resp.StatusCode, msg)
	}

	if len(body) > maxSearchResponseBytes {
		return nil, fmt.Errorf("search API response exceeds %d bytes", maxSearchResponseBytes)
	}

	results, err := decodeLocalSearchResults(body)
	if err != nil {
		return nil, err
	}
	if len(results) == 0 {
		return nil, fmt.Errorf("search API returned no results")
	}
	if len(results) > limit {
		results = results[:limit]
	}
	return results, nil
}
// buildLocalSearchURL returns rawURL with the search parameters applied:
// "q" is always set to query, while "format" (default "json") and "limit"
// (default: the clamped limit) are only set when the URL does not already
// carry a non-empty value for them. rawURL must be an absolute URL.
func buildLocalSearchURL(rawURL, query string, limit int) (string, error) {
	endpoint, err := url.Parse(strings.TrimSpace(rawURL))
	if err != nil {
		return "", fmt.Errorf("invalid search API URL: %w", err)
	}
	if endpoint.Scheme == "" || endpoint.Host == "" {
		return "", fmt.Errorf("invalid search API URL: expected absolute URL, got %q", rawURL)
	}

	values := endpoint.Query()
	values.Set("q", query)
	if current := values.Get("format"); current == "" {
		values.Set("format", "json")
	}
	if current := values.Get("limit"); current == "" {
		values.Set("limit", strconv.Itoa(clampLocalSearchLimit(limit)))
	}

	endpoint.RawQuery = values.Encode()
	return endpoint.String(), nil
}
// decodeLocalSearchResults parses a search API response body. The body must
// be a JSON object with a "results" array; array entries that are not
// objects or that have no URL are dropped. String fields of each result are
// whitespace-trimmed.
func decodeLocalSearchResults(body []byte) ([]localSearchResult, error) {
	var payload map[string]interface{}
	if err := json.Unmarshal(body, &payload); err != nil {
		return nil, fmt.Errorf("search API returned invalid JSON: %w", err)
	}

	field, present := payload["results"]
	if !present {
		return nil, fmt.Errorf("search API response missing results field")
	}
	entries, isArray := field.([]interface{})
	if !isArray {
		return nil, fmt.Errorf("search API results field is not an array")
	}

	decoded := make([]localSearchResult, 0, len(entries))
	for _, entry := range entries {
		obj, isObject := entry.(map[string]interface{})
		if !isObject {
			continue
		}

		link := strings.TrimSpace(pickString(obj, "url", "link", "href"))
		if link == "" {
			continue
		}

		var hit localSearchResult
		hit.URL = link
		hit.Title = strings.TrimSpace(pickString(obj, "title", "name"))
		hit.Snippet = strings.TrimSpace(pickString(obj, "content", "snippet", "description", "text"))
		hit.Engine = strings.TrimSpace(pickString(obj, "engine", "source"))
		hit.Score = pickFloat(obj, "score", "relevance")
		decoded = append(decoded, hit)
	}
	return decoded, nil
}
// pickString returns the value stored under the first of keys that holds a
// non-blank string (or a number, which is formatted as a string). Keys that
// are missing, hold an unsupported type, or hold an empty or
// whitespace-only string are skipped so that later keys can act as
// fallbacks. The value is returned untrimmed; "" is returned when no key
// qualifies.
func pickString(record map[string]interface{}, keys ...string) string {
	for _, key := range keys {
		value, ok := record[key]
		if !ok {
			continue
		}

		var text string
		switch v := value.(type) {
		case string:
			text = v
		case json.Number:
			text = v.String()
		case float64:
			text = strconv.FormatFloat(v, 'f', -1, 64)
		case int:
			text = strconv.Itoa(v)
		default:
			continue
		}

		// A blank value is treated like a missing one; otherwise an empty
		// "url" would hide a populated "link" or "href".
		if strings.TrimSpace(text) != "" {
			return text
		}
	}
	return ""
}
// pickFloat returns the numeric value stored under the first of keys that
// can be read as a float64. Numeric types are converted directly;
// json.Number and string values are parsed (strings are trimmed first). Keys
// that are missing, hold another type, or fail to parse are skipped. It
// returns 0 when no key yields a number.
func pickFloat(record map[string]interface{}, keys ...string) float64 {
	for _, key := range keys {
		raw, present := record[key]
		if !present {
			continue
		}

		switch num := raw.(type) {
		case float64:
			return num
		case float32:
			return float64(num)
		case int:
			return float64(num)
		case int64:
			return float64(num)
		case json.Number:
			if parsed, err := num.Float64(); err == nil {
				return parsed
			}
		case string:
			if parsed, err := strconv.ParseFloat(strings.TrimSpace(num), 64); err == nil {
				return parsed
			}
		}
	}
	return 0
}
// clampLocalSearchLimit normalizes a requested result limit: zero or
// negative values become defaultLocalSearchLimit and values above
// maxLocalSearchLimit are reduced to maxLocalSearchLimit.
func clampLocalSearchLimit(limit int) int {
	switch {
	case limit <= 0:
		return defaultLocalSearchLimit
	case limit > maxLocalSearchLimit:
		return maxLocalSearchLimit
	default:
		return limit
	}
}
// stripURLFragment returns raw, whitespace-trimmed, without its "#fragment"
// part. When raw cannot be parsed as a URL the trimmed input is returned
// as is.
func stripURLFragment(raw string) string {
	trimmed := strings.TrimSpace(raw)

	parsed, err := url.Parse(trimmed)
	if err != nil {
		return trimmed
	}

	parsed.Fragment = ""
	return parsed.String()
}
// domainAllowed reports whether the host of raw is permitted by
// allowedDomains. An empty list permits everything. Otherwise the host must
// equal one of the normalized domains or be a subdomain of it. URLs that
// cannot be parsed or that have no host are rejected.
func domainAllowed(raw string, allowedDomains []string) bool {
	if len(allowedDomains) == 0 {
		return true
	}

	parsed, err := url.Parse(raw)
	if err != nil {
		return false
	}
	host := strings.ToLower(strings.TrimSpace(parsed.Hostname()))
	if host == "" {
		return false
	}

	for _, entry := range allowedDomains {
		allowed := normalizeDomain(entry)
		switch {
		case allowed == "":
			// Blank entries never match.
		case host == allowed, strings.HasSuffix(host, "."+allowed):
			return true
		}
	}
	return false
}
// normalizeDomain lower-cases and trims a configured domain entry. An entry
// written as a URL (containing "://") is reduced to its hostname when it
// parses; any other entry only loses a single leading dot.
func normalizeDomain(raw string) string {
	domain := strings.ToLower(strings.TrimSpace(raw))

	switch {
	case domain == "":
		return ""
	case strings.Contains(domain, "://"):
		if parsed, err := url.Parse(domain); err == nil {
			return strings.ToLower(parsed.Hostname())
		}
	}

	return strings.TrimPrefix(domain, ".")
}
+226
View File
@@ -0,0 +1,226 @@
package scraper
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"net/url"
"strings"
"testing"
"time"
)
// TestLocalSearchScraperScrape runs the scraper against a test server that
// serves one search result and the page it points to, and checks that the
// resulting document carries the page URL and the search metadata.
func TestLocalSearchScraperScrape(t *testing.T) {
	mux := http.NewServeMux()
	baseURL := ""

	mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
		// Handlers run on the server's goroutines, where t.Fatalf must not be
		// called; record the failure with t.Errorf and fail the request.
		if got := r.URL.Query().Get("q"); got != "go http client" {
			t.Errorf("expected query go http client, got %q", got)
			http.Error(w, "unexpected query", http.StatusBadRequest)
			return
		}
		if got := r.URL.Query().Get("format"); got != "json" {
			t.Errorf("expected format=json, got %q", got)
			http.Error(w, "unexpected format", http.StatusBadRequest)
			return
		}
		_ = json.NewEncoder(w).Encode(map[string]interface{}{
			"results": []map[string]interface{}{
				{
					"url":     baseURL + "/docs/http-client",
					"title":   "HTTP Client Guide",
					"content": "How to build an HTTP client in Go",
					"engine":  "searxng",
					"score":   0.99,
				},
			},
		})
	})
	mux.HandleFunc("/docs/http-client", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(`<html><head><title>HTTP Client Guide</title></head><body><main>` + strings.Repeat("http client docs ", 30) + `</main></body></html>`))
	})

	srv := httptest.NewServer(mux)
	defer srv.Close()
	// The handler reads baseURL only once a request arrives, after this assignment.
	baseURL = srv.URL

	s := NewLocalSearchScraper(&Config{
		UserAgent: "DevourTest/1.0",
		Timeout:   2 * time.Second,
	})

	docs, err := s.Scrape(context.Background(), &Source{
		Name:        "local-search",
		Type:        SourceTypeLocalSearch,
		URL:         srv.URL + "/search",
		Query:       "go http client",
		ResultLimit: 5,
	})
	if err != nil {
		t.Fatalf("unexpected scrape error: %v", err)
	}
	if len(docs) == 0 {
		t.Fatal("expected at least one document")
	}

	doc := docs[0]
	if doc.URL != srv.URL+"/docs/http-client" {
		t.Fatalf("unexpected document URL: %q", doc.URL)
	}
	if doc.Metadata["search_query"] != "go http client" {
		t.Fatalf("expected metadata search_query, got %v", doc.Metadata["search_query"])
	}
	if doc.Metadata["search_engine"] != "searxng" {
		t.Fatalf("expected metadata search_engine=searxng, got %v", doc.Metadata["search_engine"])
	}
}
// TestLocalSearchScraperDomainFilter serves one search hit on the test
// server's own host and one on example.com, restricts the source to the test
// server's hostname, and expects every returned document to be on that host.
func TestLocalSearchScraperDomainFilter(t *testing.T) {
	router := http.NewServeMux()
	// Filled in after the server starts; read only while serving requests.
	baseURL := ""

	router.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
		inScope := map[string]interface{}{
			"url":   baseURL + "/docs/in-scope",
			"title": "In Scope",
		}
		outOfScope := map[string]interface{}{
			"url":   "https://example.com/out-of-scope",
			"title": "Out Scope",
		}
		payload := map[string]interface{}{
			"results": []map[string]interface{}{inScope, outOfScope},
		}
		_ = json.NewEncoder(w).Encode(payload)
	})
	router.HandleFunc("/docs/in-scope", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		page := `<html><head><title>In Scope</title></head><body><main>` + strings.Repeat("scoped docs ", 30) + `</main></body></html>`
		_, _ = w.Write([]byte(page))
	})

	server := httptest.NewServer(router)
	defer server.Close()
	baseURL = server.URL

	serverURL, err := url.Parse(server.URL)
	if err != nil {
		t.Fatalf("failed to parse server URL: %v", err)
	}
	allowedHost := serverURL.Hostname()

	scraper := NewLocalSearchScraper(&Config{
		UserAgent: "DevourTest/1.0",
		Timeout:   2 * time.Second,
	})
	source := &Source{
		Name:        "local-search",
		Type:        SourceTypeLocalSearch,
		URL:         server.URL + "/search",
		Query:       "scope test",
		ResultLimit: 10,
		Domains:     []string{allowedHost},
	}

	docs, err := scraper.Scrape(context.Background(), source)
	if err != nil {
		t.Fatalf("unexpected scrape error: %v", err)
	}
	if len(docs) == 0 {
		t.Fatal("expected at least one in-scope document")
	}

	for i := range docs {
		rawURL := docs[i].URL
		docURL, parseErr := url.Parse(rawURL)
		if parseErr != nil {
			t.Fatalf("invalid doc URL %q: %v", rawURL, parseErr)
		}
		if docURL.Hostname() != allowedHost {
			t.Fatalf("expected only in-scope domain, got %q", rawURL)
		}
	}
}
func TestLocalSearchScraperRequiresQuery(t *testing.T) {
s := NewLocalSearchScraper(&Config{
UserAgent: "DevourTest/1.0",
Timeout: 2 * time.Second,
})
_, err := s.Scrape(context.Background(), &Source{
Name: "local-search",
Type: SourceTypeLocalSearch,
URL: "http://127.0.0.1:8080/search",
})
if err == nil {
t.Fatal("expected error when query is missing")
}
if !strings.Contains(err.Error(), "query") {
t.Fatalf("unexpected error: %v", err)
}
}
// TestLocalSearchScraperDetectChanges calls DetectChanges three times: with no
// previous hash (must report a change and a hash), with that hash while the
// search output is the same (must report no change and the same hash), and
// again after the search output changes (must report a change and a new hash).
func TestLocalSearchScraperDetectChanges(t *testing.T) {
	router := http.NewServeMux()
	// Both values are read by the handler at request time; resultPath is
	// switched between calls to alter the search output.
	baseURL := ""
	resultPath := "/docs/one"

	router.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
		hit := map[string]interface{}{
			"url":   baseURL + resultPath,
			"title": "Versioned",
			"score": 1.0,
		}
		_ = json.NewEncoder(w).Encode(map[string]interface{}{
			"results": []map[string]interface{}{hit},
		})
	})

	server := httptest.NewServer(router)
	defer server.Close()
	baseURL = server.URL

	scraper := NewLocalSearchScraper(&Config{
		UserAgent: "DevourTest/1.0",
		Timeout:   2 * time.Second,
	})
	source := &Source{
		Name:        "local-search",
		Type:        SourceTypeLocalSearch,
		URL:         server.URL + "/search",
		Query:       "version test",
		ResultLimit: 3,
	}
	ctx := context.Background()

	// First call: no previous hash.
	firstChanged, firstHash, err := scraper.DetectChanges(ctx, source, "")
	if err != nil {
		t.Fatalf("unexpected detect changes error: %v", err)
	}
	if !firstChanged {
		t.Fatal("expected first detect changes call to report changed")
	}
	if firstHash == "" {
		t.Fatal("expected non-empty hash")
	}

	// Second call: same search output, previous hash supplied.
	secondChanged, secondHash, err := scraper.DetectChanges(ctx, source, firstHash)
	if err != nil {
		t.Fatalf("unexpected detect changes error: %v", err)
	}
	if secondChanged {
		t.Fatal("expected unchanged results with identical hash")
	}
	if secondHash != firstHash {
		t.Fatalf("expected identical hash, got %q and %q", firstHash, secondHash)
	}

	// Third call: the search endpoint now returns a different result URL.
	resultPath = "/docs/two"
	thirdChanged, thirdHash, err := scraper.DetectChanges(ctx, source, firstHash)
	if err != nil {
		t.Fatalf("unexpected detect changes error: %v", err)
	}
	if !thirdChanged {
		t.Fatal("expected changed results after search output changed")
	}
	if thirdHash == firstHash {
		t.Fatal("expected hash to change")
	}
}
+88
View File
@@ -0,0 +1,88 @@
package scraper
import (
"net/url"
"path"
"regexp"
"strings"
)
// Precompiled patterns shared by the title and content normalizers.
var (
	// titleNoiseRe matches, case-insensitively, the whole words "deprecated"
	// and "added in goN" / "added in goN.M".
	titleNoiseRe = regexp.MustCompile(`(?i)\b(added in go[0-9]+(\.[0-9]+)?|deprecated)\b`)
	// titleSpaceRe matches a run of one or more whitespace characters.
	titleSpaceRe = regexp.MustCompile(`\s+`)
	// contentSpaceRe matches spaces/tabs that sit directly before a newline.
	contentSpaceRe = regexp.MustCompile(`[ \t]+\n`)
	// multiNewlineRe matches three or more consecutive newlines.
	multiNewlineRe = regexp.MustCompile(`\n{3,}`)
	// nonPrintableTitle matches a single control character.
	nonPrintableTitle = regexp.MustCompile(`[[:cntrl:]]`)
)
// NormalizeDocuments runs NormalizeDocument on every element of docs, in
// place, and returns the same slice for convenient chaining.
func NormalizeDocuments(docs []*Document) []*Document {
	for i := range docs {
		NormalizeDocument(docs[i])
	}
	return docs
}
// NormalizeDocument cleans up a scraped document in place: URL and Type are
// trimmed, Title and Content are normalized, and an empty title is replaced by
// one inferred from the (trimmed) URL. A nil document is left untouched.
func NormalizeDocument(doc *Document) {
	if doc == nil {
		return
	}

	doc.Content = normalizeContent(doc.Content)
	doc.Type = strings.TrimSpace(doc.Type)
	doc.URL = strings.TrimSpace(doc.URL)

	if cleaned := normalizeTitle(doc.Title); cleaned != "" {
		doc.Title = cleaned
	} else {
		// Nothing usable survived title cleanup; derive a title from the URL.
		doc.Title = inferTitleFromURL(doc.URL)
	}
}
// normalizeTitle strips decoration from a scraped title: pilcrow marks,
// underscores and control characters become spaces, the phrases matched by
// titleNoiseRe are removed, whitespace runs collapse to a single space, and
// punctuation left dangling at either end is trimmed.
func normalizeTitle(title string) string {
	cleaned := strings.NewReplacer("¶", " ", "_", " ").Replace(title)
	cleaned = nonPrintableTitle.ReplaceAllString(cleaned, " ")
	cleaned = titleNoiseRe.ReplaceAllString(cleaned, " ")

	cleaned = strings.TrimSpace(cleaned)
	cleaned = titleSpaceRe.ReplaceAllString(cleaned, " ")

	// Cleanup can leave punctuation stranded at the start or end of the title;
	// drop it, then drop any whitespace that the punctuation was hiding.
	cleaned = strings.Trim(cleaned, "-:.,;")
	return strings.TrimSpace(cleaned)
}
// normalizeContent converts CRLF line endings to LF, trims surrounding
// whitespace, removes spaces/tabs at the end of lines, and collapses runs of
// three or more newlines into a single blank line.
func normalizeContent(content string) string {
	unified := strings.TrimSpace(strings.ReplaceAll(content, "\r\n", "\n"))
	withoutTrailing := contentSpaceRe.ReplaceAllString(unified, "\n")
	return multiNewlineRe.ReplaceAllString(withoutTrailing, "\n\n")
}
// inferTitleFromURL derives a human-readable title from the last path segment
// of rawURL: a ".html" suffix is dropped and dashes/underscores become spaces.
// When the URL has no usable path segment the host is returned instead, and
// "Documentation" is used when nothing else is available or the URL is invalid.
func inferTitleFromURL(rawURL string) string {
	const fallback = "Documentation"

	if rawURL == "" {
		return fallback
	}
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return fallback
	}

	segment := path.Base(strings.Trim(parsed.Path, "/"))
	switch segment {
	case "", ".", "/":
		// No path segment to work with; fall back to the host when present.
		if parsed.Host != "" {
			return parsed.Host
		}
		return fallback
	}

	segment = strings.TrimSuffix(segment, ".html")
	segment = strings.NewReplacer("-", " ", "_", " ").Replace(segment)
	segment = titleSpaceRe.ReplaceAllString(strings.TrimSpace(segment), " ")
	if segment == "" {
		return fallback
	}
	return segment
}
+33
View File
@@ -0,0 +1,33 @@
package scraper
import "testing"
// TestNormalizeDocument_TitleCleanup checks that title noise (pilcrow,
// "deprecated", "added in goX.Y") is removed and that trailing spaces and
// extra blank lines are squeezed out of the content.
func TestNormalizeDocument_TitleCleanup(t *testing.T) {
	const (
		wantTitle   = "http.type CloseNotifier"
		wantContent = "line 1\n\nline 2"
	)

	doc := &Document{
		Title:   "http.type CloseNotifier ¶ deprecated added in go1.1",
		Content: "line 1 \n\n\nline 2",
		URL:     "https://pkg.go.dev/net/http#CloseNotifier",
	}
	NormalizeDocument(doc)

	if got := doc.Title; got != wantTitle {
		t.Fatalf("unexpected normalized title: %q", got)
	}
	if got := doc.Content; got != wantContent {
		t.Fatalf("unexpected normalized content: %q", got)
	}
}
// TestNormalizeDocument_InferTitle checks that a document without a title
// receives one derived from the last path segment of its URL.
func TestNormalizeDocument_InferTitle(t *testing.T) {
	untitled := &Document{
		Title: "",
		URL:   "https://kotlinlang.org/docs/regex.html",
	}

	NormalizeDocument(untitled)

	if got := untitled.Title; got != "regex" {
		t.Fatalf("expected inferred title 'regex', got %q", got)
	}
}
+316 -9
View File
@@ -2,30 +2,337 @@ package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"sort"
"strings"
"time"
"gopkg.in/yaml.v3"
)
// OpenAPIScraper parses OpenAPI/Swagger specifications.
type OpenAPIScraper struct {
	// config is the configuration passed to NewOpenAPIScraper; it may be nil.
	config *Config
	// client performs the HTTP requests used to download remote specs.
	client *http.Client
}
// NewOpenAPIScraper creates a new OpenAPI scraper.
//
// The scraper's HTTP client uses config.Timeout when it is positive and a
// 30-second timeout otherwise (including when config is nil).
func NewOpenAPIScraper(config *Config) *OpenAPIScraper {
	// The stale pre-change "return &OpenAPIScraper{config: config}" that used
	// to sit here made the client setup unreachable and left client nil.
	timeout := 30 * time.Second
	if config != nil && config.Timeout > 0 {
		timeout = config.Timeout
	}
	return &OpenAPIScraper{
		config: config,
		client: &http.Client{Timeout: timeout},
	}
}
// Scrape fetches and parses an OpenAPI specification.
//
// The result starts with one "openapi-spec" document summarising the whole
// specification, followed by one "openapi-operation" document per path/method
// pair. Paths and methods are visited in sorted order so the output order is
// deterministic. An error is returned when source is nil, the specification
// cannot be read, or it cannot be parsed.
func (s *OpenAPIScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	// The stale stub ("TODO ... return nil, nil") that used to sit here made
	// the implementation below unreachable.
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}

	raw, specURL, err := s.readSpec(ctx, source)
	if err != nil {
		return nil, err
	}
	spec, err := parseOpenAPISpec(raw)
	if err != nil {
		return nil, err
	}

	sourceName := coalesceSourceName(source.Name, "openapi")

	// Overview document for the specification as a whole; its hash covers the
	// raw spec bytes.
	docs := make([]*Document, 0, len(spec.Paths)+1)
	docs = append(docs, &Document{
		ID:      generateDocID(specURL + "#openapi"),
		Source:  sourceName,
		Type:    "openapi-spec",
		Title:   spec.Info.Title,
		Content: buildMainSpecContent(spec),
		URL:     specURL,
		Metadata: map[string]interface{}{
			"openapi": spec.Version,
			"servers": spec.Servers,
		},
		Hash:      hashBytes(raw),
		Timestamp: time.Now(),
	})

	// Map iteration order is random; sort the paths for stable output.
	paths := make([]string, 0, len(spec.Paths))
	for p := range spec.Paths {
		paths = append(paths, p)
	}
	sort.Strings(paths)

	for _, p := range paths {
		opMap := spec.Paths[p]

		methods := make([]string, 0, len(opMap))
		for m := range opMap {
			methods = append(methods, strings.ToUpper(m))
		}
		sort.Strings(methods)

		for _, method := range methods {
			// Operations are looked up by the lower-cased method name.
			op := opMap[strings.ToLower(method)]
			if op == nil {
				continue
			}

			title := strings.TrimSpace(op.Summary)
			if title == "" {
				title = fmt.Sprintf("%s %s", method, p)
			}
			content := buildOperationContent(method, p, op)
			// Each operation gets its own URL by adding a fragment built from
			// the method and the sanitized path.
			docURL := fmt.Sprintf("%s#%s-%s", specURL, strings.ToLower(method), sanitizeFragment(p))

			docs = append(docs, &Document{
				ID:      generateDocID(docURL),
				Source:  sourceName,
				Type:    "openapi-operation",
				Title:   title,
				Content: content,
				URL:     docURL,
				Metadata: map[string]interface{}{
					"method":       method,
					"path":         p,
					"operation_id": op.OperationID,
				},
				Hash:      hashString(content),
				Timestamp: time.Now(),
			})
		}
	}

	return docs, nil
}
// DetectChanges checks if the spec has been updated.
//
// It reads the specification, hashes its raw bytes and reports whether that
// hash differs from lastHash. The freshly computed hash is always returned so
// the caller can store it for the next comparison.
func (s *OpenAPIScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	// The stale stub ("return false, \"\", nil") that used to sit here made
	// the hash comparison below unreachable.
	if source == nil {
		return false, "", fmt.Errorf("source is required")
	}

	raw, _, err := s.readSpec(ctx, source)
	if err != nil {
		return false, "", err
	}

	hash := hashBytes(raw)
	return hash != lastHash, hash, nil
}
// readSpec loads the raw specification referenced by source.
//
// source.URL is preferred and source.Path is used when URL is blank. Values
// starting with "http://" or "https://" are downloaded with the scraper's HTTP
// client; anything else is read from the local filesystem. The second return
// value is the location the spec was read from (the URL itself, or "file://"
// followed by the path).
//
// Remote specs larger than 10 MiB are rejected instead of being silently
// truncated, because a truncated body would either fail to parse with a
// misleading error or produce a hash that ignores the tail of the document.
func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte, string, error) {
	const maxSpecBytes = 10 << 20

	rawPath := strings.TrimSpace(source.URL)
	if rawPath == "" {
		rawPath = strings.TrimSpace(source.Path)
	}
	if rawPath == "" {
		return nil, "", fmt.Errorf("openapi source requires url or path")
	}

	if !strings.HasPrefix(rawPath, "http://") && !strings.HasPrefix(rawPath, "https://") {
		b, err := os.ReadFile(rawPath)
		if err != nil {
			return nil, "", fmt.Errorf("reading openapi spec: %w", err)
		}
		return b, "file://" + rawPath, nil
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawPath, nil)
	if err != nil {
		return nil, "", fmt.Errorf("building openapi request: %w", err)
	}
	if s.config != nil && strings.TrimSpace(s.config.UserAgent) != "" {
		req.Header.Set("User-Agent", s.config.UserAgent)
	}

	resp, err := s.client.Do(req)
	if err != nil {
		return nil, "", fmt.Errorf("fetching openapi spec: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return nil, "", fmt.Errorf("openapi fetch failed: HTTP %d", resp.StatusCode)
	}

	// Read one byte past the limit so an oversized body can be told apart from
	// one that is exactly at the limit.
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxSpecBytes+1))
	if err != nil {
		return nil, "", fmt.Errorf("reading openapi response: %w", err)
	}
	if len(body) > maxSpecBytes {
		return nil, "", fmt.Errorf("openapi spec exceeds %d bytes", maxSpecBytes)
	}
	return body, rawPath, nil
}
// openAPISpec is the subset of an OpenAPI/Swagger document that the scraper
// decodes, from either JSON or YAML.
type openAPISpec struct {
	// Version is the value of the top-level "openapi" key.
	Version string `json:"openapi" yaml:"openapi"`
	// Swagger is the value of the top-level "swagger" key; parseOpenAPISpec
	// copies it into Version when "openapi" is blank.
	Swagger string        `json:"swagger" yaml:"swagger"`
	Info    openAPIInfo   `json:"info" yaml:"info"`
	Servers []openAPIServer `json:"servers" yaml:"servers"`
	// Paths maps a path template to the operations declared under it.
	Paths map[string]pathItems `json:"paths" yaml:"paths"`
}

// openAPIInfo holds the title, version and description from the "info" object.
type openAPIInfo struct {
	Title       string `json:"title" yaml:"title"`
	Version     string `json:"version" yaml:"version"`
	Description string `json:"description" yaml:"description"`
}

// openAPIServer is one entry of the "servers" list.
type openAPIServer struct {
	URL         string `json:"url" yaml:"url"`
	Description string `json:"description" yaml:"description"`
}

// pathItems maps the keys found under a single path to their operations.
// NOTE(review): every key under a path is decoded as an operation, so
// non-operation keys (for example a path-level "parameters" list) presumably
// fail to decode — confirm against real-world specs.
type pathItems map[string]*openAPIOperation

// openAPIOperation is the decoded form of one operation under a path.
type openAPIOperation struct {
	Summary     string                 `json:"summary" yaml:"summary"`
	Description string                 `json:"description" yaml:"description"`
	OperationID string                 `json:"operationId" yaml:"operationId"`
	Parameters  []openAPIParameter     `json:"parameters" yaml:"parameters"`
	Responses   map[string]response    `json:"responses" yaml:"responses"`
	RequestBody map[string]any         `json:"requestBody" yaml:"requestBody"`
	Tags        []string               `json:"tags" yaml:"tags"`
	Deprecated  bool                   `json:"deprecated" yaml:"deprecated"`
	Security    []map[string][]string  `json:"security" yaml:"security"`
}

// openAPIParameter describes one parameter of an operation.
type openAPIParameter struct {
	Name        string `json:"name" yaml:"name"`
	In          string `json:"in" yaml:"in"`
	Description string `json:"description" yaml:"description"`
	Required    bool   `json:"required" yaml:"required"`
}

// response keeps only the description of a response entry.
type response struct {
	Description string `json:"description" yaml:"description"`
}
// parseOpenAPISpec decodes raw as JSON and, if that fails, as YAML.
//
// Empty (or whitespace-only) input is rejected up front: it is not valid JSON
// but decodes without error as YAML, which would otherwise yield an empty
// specification instead of an error. When both decoders fail, the returned
// error wraps the JSON error and also reports the YAML error, since a YAML
// document always produces an unhelpful JSON syntax error.
//
// After decoding, a blank title defaults to "OpenAPI Specification", a blank
// "openapi" version falls back to the "swagger" value, and Paths is never nil.
func parseOpenAPISpec(raw []byte) (*openAPISpec, error) {
	if strings.TrimSpace(string(raw)) == "" {
		return nil, fmt.Errorf("invalid openapi content: empty document")
	}

	var spec openAPISpec
	if jsonErr := json.Unmarshal(raw, &spec); jsonErr != nil {
		// A failed JSON decode may have partially filled spec; start clean.
		spec = openAPISpec{}
		if yamlErr := yaml.Unmarshal(raw, &spec); yamlErr != nil {
			return nil, fmt.Errorf("invalid openapi content: %w (yaml: %v)", jsonErr, yamlErr)
		}
	}

	if strings.TrimSpace(spec.Info.Title) == "" {
		spec.Info.Title = "OpenAPI Specification"
	}
	if strings.TrimSpace(spec.Version) == "" {
		spec.Version = spec.Swagger
	}
	if spec.Paths == nil {
		spec.Paths = map[string]pathItems{}
	}
	return &spec, nil
}
// buildMainSpecContent renders the Markdown overview of a specification: a
// heading with the title, a bullet list (API version, OpenAPI version, number
// of paths), the description when present, and a "Servers" section when the
// spec lists any servers.
func buildMainSpecContent(spec *openAPISpec) string {
	var out strings.Builder

	out.WriteString("# " + spec.Info.Title + "\n\n")
	if v := spec.Info.Version; v != "" {
		out.WriteString("- API Version: " + v + "\n")
	}
	if v := spec.Version; v != "" {
		out.WriteString("- OpenAPI: " + v + "\n")
	}
	fmt.Fprintf(&out, "- Paths: %d\n", len(spec.Paths))

	if desc := spec.Info.Description; desc != "" {
		out.WriteString("\n" + strings.TrimSpace(desc) + "\n")
	}

	if len(spec.Servers) == 0 {
		return out.String()
	}
	out.WriteString("\n## Servers\n")
	for i := range spec.Servers {
		server := spec.Servers[i]
		line := "- " + server.URL
		if server.Description != "" {
			line += " - " + server.Description
		}
		out.WriteString(line + "\n")
	}
	return out.String()
}
// buildOperationContent renders the Markdown page for a single operation: a
// "METHOD path" heading, the summary and description, a bullet list with the
// operation ID, tags and deprecation flag, then "Parameters" and "Responses"
// sections when the operation declares any. Response codes are listed in
// sorted order.
func buildOperationContent(method, path string, op *openAPIOperation) string {
	var out strings.Builder

	out.WriteString("# " + method + " " + path + "\n\n")
	if op.Summary != "" {
		out.WriteString(strings.TrimSpace(op.Summary) + "\n\n")
	}
	if op.Description != "" {
		out.WriteString(strings.TrimSpace(op.Description) + "\n\n")
	}

	if op.OperationID != "" {
		out.WriteString("- Operation ID: `" + op.OperationID + "`\n")
	}
	if len(op.Tags) > 0 {
		out.WriteString("- Tags: " + strings.Join(op.Tags, ", ") + "\n")
	}
	if op.Deprecated {
		out.WriteString("- Deprecated: true\n")
	}

	if len(op.Parameters) > 0 {
		out.WriteString("\n## Parameters\n")
		for i := range op.Parameters {
			param := op.Parameters[i]
			requirement := "optional"
			if param.Required {
				requirement = "required"
			}
			line := "- `" + param.Name + "` (" + param.In + ", " + requirement + ")"
			if param.Description != "" {
				line += ": " + strings.TrimSpace(param.Description)
			}
			out.WriteString(line + "\n")
		}
	}

	if len(op.Responses) > 0 {
		// Sort the status codes so the section is rendered deterministically.
		statusCodes := make([]string, 0, len(op.Responses))
		for status := range op.Responses {
			statusCodes = append(statusCodes, status)
		}
		sort.Strings(statusCodes)

		out.WriteString("\n## Responses\n")
		for _, status := range statusCodes {
			line := "- `" + status + "`"
			if desc := op.Responses[status].Description; desc != "" {
				line += ": " + strings.TrimSpace(desc)
			}
			out.WriteString(line + "\n")
		}
	}

	return out.String()
}
// sanitizeFragment turns an API path into a URL-fragment-friendly token: the
// path is lower-cased, slashes become dashes, braces are removed and
// leading/trailing dashes are trimmed. An empty result becomes "root".
func sanitizeFragment(path string) string {
	replacer := strings.NewReplacer("/", "-", "{", "", "}", "")
	fragment := strings.Trim(replacer.Replace(strings.ToLower(path)), "-")
	if fragment != "" {
		return fragment
	}
	return "root"
}
// hashBytes returns the SHA-256 digest of b as a lowercase hex string.
func hashBytes(b []byte) string {
	hasher := sha256.New()
	hasher.Write(b)
	return hex.EncodeToString(hasher.Sum(nil))
}
// hashString returns the SHA-256 digest of s as a lowercase hex string.
func hashString(s string) string {
	hasher := sha256.New()
	hasher.Write([]byte(s))
	return hex.EncodeToString(hasher.Sum(nil))
}
// coalesceSourceName returns name unchanged unless it is empty or consists
// only of whitespace, in which case fallback is returned.
func coalesceSourceName(name, fallback string) string {
	if strings.TrimSpace(name) == "" {
		return fallback
	}
	return name
}
+77
View File
@@ -0,0 +1,77 @@
package scraper
import (
"context"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
)
// TestOpenAPIScraperScrape serves a small JSON spec with one operation and
// expects at least two documents, one of which is titled after the
// operation's summary.
func TestOpenAPIScraperScrape(t *testing.T) {
	spec := `{
"openapi": "3.0.0",
"info": {"title": "Pet API", "version": "1.0.0"},
"paths": {
"/pets": {
"get": {
"summary": "List pets",
"operationId": "listPets",
"responses": {"200": {"description": "ok"}}
}
}
}
}`

	serveSpec := func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(spec))
	}
	server := httptest.NewServer(http.HandlerFunc(serveSpec))
	defer server.Close()

	scraper := NewOpenAPIScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
	source := &Source{Name: "pet", Type: SourceTypeOpenAPI, URL: server.URL}

	docs, err := scraper.Scrape(context.Background(), source)
	if err != nil {
		t.Fatalf("unexpected scrape error: %v", err)
	}
	if len(docs) < 2 {
		t.Fatalf("expected at least 2 docs, got %d", len(docs))
	}

	for i := range docs {
		if strings.Contains(docs[i].Title, "List pets") {
			return // the operation document is present
		}
	}
	t.Fatal("expected operation document")
}
// TestOpenAPIScraperDetectChanges expects the first DetectChanges call (no
// previous hash) to report a change with a non-empty hash, and a second call
// with that hash to report no change.
func TestOpenAPIScraperDetectChanges(t *testing.T) {
	spec := `{"openapi":"3.0.0","info":{"title":"API"},"paths":{}}`

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		_, _ = w.Write([]byte(spec))
	}))
	defer server.Close()

	scraper := NewOpenAPIScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
	source := &Source{Name: "api", Type: SourceTypeOpenAPI, URL: server.URL}
	ctx := context.Background()

	firstChanged, firstHash, err := scraper.DetectChanges(ctx, source, "")
	if err != nil {
		t.Fatal(err)
	}
	if !firstChanged || firstHash == "" {
		t.Fatalf("expected changed=true and non-empty hash, changed=%v hash=%q", firstChanged, firstHash)
	}

	secondChanged, _, err := scraper.DetectChanges(ctx, source, firstHash)
	if err != nil {
		t.Fatal(err)
	}
	if secondChanged {
		t.Fatal("expected no changes when hash matches")
	}
}
+1
View File
@@ -5,6 +5,7 @@ func init() {
// Additional scrapers can be registered in their own packages
RegisterScraper(SourceTypeWeb, func(c *Config) Scraper { return NewWebScraper(c) })
RegisterScraper(SourceTypeLocal, func(c *Config) Scraper { return NewLocalScraper(c) })
RegisterScraper(SourceTypeLocalSearch, func(c *Config) Scraper { return NewLocalSearchScraper(c) })
RegisterScraper(SourceTypeGitHub, func(c *Config) Scraper { return NewGitHubScraper(c) })
RegisterScraper(SourceTypeOpenAPI, func(c *Config) Scraper { return NewOpenAPIScraper(c) })
}
@@ -0,0 +1,71 @@
package scraper_test
import (
"testing"
"time"
basescraper "github.com/yourorg/devour/internal/scraper"
_ "github.com/yourorg/devour/internal/scraper/external"
)
// TestLanguageScrapersAreRegistered checks, one subtest per source type, that
// NewScraper returns a non-nil scraper for every documentation source type
// listed below once the external scraper package has been imported.
func TestLanguageScrapersAreRegistered(t *testing.T) {
	cfg := &basescraper.Config{
		UserAgent: "DevourTest/1.0",
		Timeout:   2 * time.Second,
	}

	for _, docType := range []basescraper.SourceType{
		basescraper.SourceTypeGoDocs,
		basescraper.SourceTypeRustDocs,
		basescraper.SourceTypePythonDocs,
		basescraper.SourceTypeJavaDocs,
		basescraper.SourceTypeSpringDocs,
		basescraper.SourceTypeTSDocs,
		basescraper.SourceTypeReactDocs,
		basescraper.SourceTypeVueDocs,
		basescraper.SourceTypeNuxtDocs,
		basescraper.SourceTypeMCPDocs,
		basescraper.SourceTypeDockerDocs,
		basescraper.SourceTypeCloudflareDocs,
		basescraper.SourceTypeAstroDocs,
	} {
		t.Run(string(docType), func(t *testing.T) {
			if basescraper.NewScraper(docType, cfg) == nil {
				t.Fatalf("NewScraper(%q) returned nil; scraper was not registered", docType)
			}
		})
	}
}
// TestDetectSourceType_ForSupportedDocsHosts checks, one subtest per URL, that
// DetectSourceType maps each documentation URL to the expected source type.
func TestDetectSourceType_ForSupportedDocsHosts(t *testing.T) {
	type detectCase struct {
		rawURL string
		want   basescraper.SourceType
	}

	cases := []detectCase{
		{"https://pkg.go.dev/net/http", basescraper.SourceTypeGoDocs},
		{"https://docs.rs/tokio/latest/tokio/", basescraper.SourceTypeRustDocs},
		{"https://docs.python.org/3/library/asyncio.html", basescraper.SourceTypePythonDocs},
		{"https://docs.oracle.com/javase/8/docs/api/java/util/List.html", basescraper.SourceTypeJavaDocs},
		{"https://docs.spring.io/spring-boot/docs/current/reference/htmlsingle/", basescraper.SourceTypeSpringDocs},
		{"https://www.typescriptlang.org/docs/handbook/2/basic-types.html", basescraper.SourceTypeTSDocs},
		{"https://react.dev/reference/react/hooks", basescraper.SourceTypeReactDocs},
		{"https://vuejs.org/guide/introduction.html", basescraper.SourceTypeVueDocs},
		{"https://nuxt.com/docs/guide/directory-structure", basescraper.SourceTypeNuxtDocs},
		{"https://docs.docker.com/compose", basescraper.SourceTypeDockerDocs},
		{"https://hub.docker.com/mcp/server/github", basescraper.SourceTypeMCPDocs},
		{"https://developers.cloudflare.com/workers", basescraper.SourceTypeCloudflareDocs},
		{"https://docs.astro.build/en/guides/components/", basescraper.SourceTypeAstroDocs},
	}

	for _, tc := range cases {
		t.Run(tc.rawURL, func(t *testing.T) {
			if got := basescraper.DetectSourceType(tc.rawURL); got != tc.want {
				t.Fatalf("DetectSourceType(%q) = %q, want %q", tc.rawURL, got, tc.want)
			}
		})
	}
}
+1 -1
View File
@@ -28,7 +28,7 @@ func (r *ScraperRegistry) Register(sourceType SourceType, constructor ScraperCon
// Create creates a scraper instance for sourceType using the registered
// constructor and passes it through wrapScraper before returning it.
// It returns nil when no constructor is registered for sourceType.
func (r *ScraperRegistry) Create(sourceType SourceType, config *Config) Scraper {
	// The stale pre-change "return constructor(config)" that used to sit here
	// made the wrapped return unreachable, so scrapers were handed out
	// without the wrapper.
	if constructor, exists := r.constructors[sourceType]; exists {
		return wrapScraper(constructor(config))
	}
	return nil
}
+18 -9
View File
@@ -17,6 +17,7 @@ const (
SourceTypeGitHub SourceType = "github"
SourceTypeOpenAPI SourceType = "openapi"
SourceTypeLocal SourceType = "local"
SourceTypeLocalSearch SourceType = "localsearch"
SourceTypeGoDocs SourceType = "godocs"
SourceTypeRustDocs SourceType = "rustdocs"
SourceTypePythonDocs SourceType = "pythondocs"
@@ -34,15 +35,18 @@ const (
// Source represents a documentation source to scrape.
//
// Each field is declared exactly once; the pre-change copy of the field list
// that preceded the current one produced duplicate field names.
type Source struct {
	Name string     `yaml:"name"`
	Type SourceType `yaml:"type"`
	URL  string     `yaml:"url,omitempty"`
	// Query, ResultLimit and Domains are set by the local-search tests
	// (search text, a result limit and a list of hostnames).
	// NOTE(review): presumably only the local-search scraper reads them — confirm.
	Query       string   `yaml:"query,omitempty"`
	ResultLimit int      `yaml:"result_limit,omitempty"`
	Domains     []string `yaml:"domains,omitempty"`
	Repo        string   `yaml:"repo,omitempty"`
	Branch      string   `yaml:"branch,omitempty"`
	Path        string   `yaml:"path,omitempty"`
	Include     []string `yaml:"include,omitempty"`
	Exclude     []string `yaml:"exclude,omitempty"`
	Schedule    string   `yaml:"schedule,omitempty"`
}
// Document represents a scraped document.
@@ -113,6 +117,11 @@ func DetectSourceType(input string) SourceType {
}
}
// MCP servers are hosted under Docker Hub paths.
if strings.Contains(input, "hub.docker.com/mcp/") {
return SourceTypeMCPDocs
}
// Check for OpenAPI specs
if strings.HasSuffix(input, ".json") || strings.HasSuffix(input, ".yaml") || strings.HasSuffix(input, ".yml") {
if strings.Contains(strings.ToLower(input), "openapi") || strings.Contains(strings.ToLower(input), "swagger") {
+191 -7
View File
@@ -6,8 +6,10 @@ import (
"encoding/hex"
"fmt"
"net/url"
"path"
"regexp"
"strings"
"sync"
"time"
"github.com/gocolly/colly/v2"
@@ -27,17 +29,42 @@ func NewWebScraper(config *Config) *WebScraper {
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
var documents []*Document
visited := make(map[string]bool)
scheduled := make(map[string]bool)
contentHashes := make(map[string]bool)
var mu sync.Mutex
var scrapeErrors []string
// Parse base URL for domain restrictions
baseURL, err := url.Parse(source.URL)
if err != nil {
return nil, fmt.Errorf("invalid URL: %w", err)
}
allowedDomain := baseURL.Hostname()
if allowedDomain == "" {
allowedDomain = baseURL.Host
}
maxDepth := s.config.MaxDepth
if maxDepth <= 0 {
maxDepth = 2
}
maxPages := s.config.Concurrency * 40
if maxPages < 20 {
maxPages = 20
}
if maxDepth <= 1 && maxPages > 30 {
maxPages = 30
}
if maxPages > 300 {
maxPages = 300
}
scopePrefix := pathScopePrefix(baseURL.Path)
scopeLeaf := pathScopeLeaf(baseURL.Path)
// Create Colly collector
c := colly.NewCollector(
colly.AllowedDomains(baseURL.Host),
colly.MaxDepth(s.config.MaxDepth),
colly.AllowedDomains(allowedDomain),
colly.MaxDepth(maxDepth),
colly.Async(true),
colly.UserAgent(s.config.UserAgent),
)
@@ -65,18 +92,40 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
// Handle errors
c.OnError(func(r *colly.Response, err error) {
fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
errText := strings.ToLower(err.Error())
if strings.Contains(errText, "already visited") {
return
}
reqURL := source.URL
if r != nil && r.Request != nil && r.Request.URL != nil {
reqURL = r.Request.URL.String()
}
mu.Lock()
if len(scrapeErrors) < 20 {
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err))
}
mu.Unlock()
})
// Extract content from pages
c.OnHTML("html", func(e *colly.HTMLElement) {
pageURL := e.Request.URL.String()
if !withinScope(e.Request.URL, baseURL, scopePrefix, scopeLeaf) {
return
}
// Skip if already visited
mu.Lock()
if visited[pageURL] {
mu.Unlock()
return
}
if len(visited) >= maxPages {
mu.Unlock()
return
}
visited[pageURL] = true
mu.Unlock()
// Check include/exclude patterns
if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
@@ -99,6 +148,13 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
// Generate hash for change detection
hash := s.generateHash(content)
mu.Lock()
if contentHashes[hash] {
mu.Unlock()
return
}
contentHashes[hash] = true
mu.Unlock()
// Extract metadata
metadata := map[string]interface{}{
@@ -120,30 +176,74 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
Timestamp: time.Now(),
}
mu.Lock()
documents = append(documents, doc)
mu.Unlock()
})
// Follow links
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
link := e.Attr("href")
absoluteURL := e.Request.AbsoluteURL(link)
// Skip if already visited
if visited[absoluteURL] {
if absoluteURL == "" {
return
}
linkURL, err := url.Parse(absoluteURL)
if err != nil {
return
}
if linkURL.Scheme != "http" && linkURL.Scheme != "https" {
return
}
if !withinScope(linkURL, baseURL, scopePrefix, scopeLeaf) {
return
}
// Skip if already visited
mu.Lock()
if visited[absoluteURL] {
mu.Unlock()
return
}
if len(visited) >= maxPages {
mu.Unlock()
return
}
mu.Unlock()
// Check include/exclude patterns
if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
return
}
mu.Lock()
if scheduled[absoluteURL] {
mu.Unlock()
return
}
if len(scheduled) >= maxPages {
mu.Unlock()
return
}
scheduled[absoluteURL] = true
mu.Unlock()
if err := c.Visit(absoluteURL); err != nil {
fmt.Printf("Error visiting %s: %v\n", absoluteURL, err)
errText := strings.ToLower(err.Error())
if strings.Contains(errText, "already visited") {
return
}
mu.Lock()
if len(scrapeErrors) < 20 {
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err))
}
mu.Unlock()
}
})
// Start scraping
scheduled[source.URL] = true
if err := c.Visit(source.URL); err != nil {
return nil, fmt.Errorf("failed to start scraping: %w", err)
}
@@ -151,6 +251,16 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
// Wait for async scraping to complete
c.Wait()
mu.Lock()
defer mu.Unlock()
if len(documents) == 0 {
if len(scrapeErrors) > 0 {
return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; "))
}
return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
}
return documents, nil
}
@@ -283,6 +393,25 @@ func (s *WebScraper) generateHash(content string) string {
// cleanText removes extra whitespace and normalizes text.
func cleanText(text string) string {
noisePhrases := []string{
"table of contents",
"in this article",
"additional resources",
"feedback",
"collaborate with us on github",
"copyright",
"all rights reserved",
"privacy policy",
"terms of service",
"sign in",
"skip to main content",
"ask learn",
}
for _, phrase := range noisePhrases {
re := regexp.MustCompile(`(?i)` + regexp.QuoteMeta(phrase))
text = re.ReplaceAllString(text, " ")
}
// Replace multiple whitespace with single space
re := regexp.MustCompile(`\s+`)
text = re.ReplaceAllString(text, " ")
@@ -292,3 +421,58 @@ func cleanText(text string) string {
return text
}
// pathScopePrefix returns the path prefix used to scope a crawl that starts at
// rawPath. An empty or root path yields "" (no restriction). Otherwise the
// parent directory of the cleaned path is returned, except when that parent is
// the root, in which case the cleaned path itself is the prefix so that a
// root-level page or section does not widen the scope to the whole site.
func pathScopePrefix(rawPath string) string {
	cleaned := path.Clean(rawPath)
	switch cleaned {
	case ".", "/", "":
		return ""
	}

	// File-like and directory-like last segments are scoped the same way.
	if parent := path.Dir(cleaned); parent != "/" {
		return parent
	}
	return cleaned
}
// pathScopeLeaf returns the last segment of rawPath when that segment contains
// a dot (i.e. looks like a file name), and "" otherwise, including for empty
// and root paths.
func pathScopeLeaf(rawPath string) string {
	cleaned := path.Clean(rawPath)
	switch cleaned {
	case ".", "/", "":
		return ""
	}

	if leaf := path.Base(cleaned); strings.Contains(leaf, ".") {
		return leaf
	}
	return ""
}
// withinScope reports whether target belongs to the crawl scope defined by
// base, prefix and leaf.
//
// The target must be on the same host as base (compared case-insensitively).
// With an empty prefix every path on that host is in scope. Otherwise the
// target path must equal prefix or live underneath it; the match is made on a
// path-segment boundary so that prefix "/docs" does not admit
// "/docs-archive/...". As a fallback, a target whose last path segment equals
// a non-empty leaf is also in scope, whatever directory it is in.
func withinScope(target, base *url.URL, prefix, leaf string) bool {
	if target == nil || base == nil {
		return false
	}
	if !strings.EqualFold(target.Hostname(), base.Hostname()) {
		return false
	}
	if prefix == "" {
		return true
	}

	targetPath := target.Path
	if targetPath == "" {
		targetPath = "/"
	}

	// Compare against the prefix without a trailing slash so "/docs" and
	// "/docs/" describe the same scope.
	scope := strings.TrimSuffix(prefix, "/")
	if targetPath == scope || strings.HasPrefix(targetPath, scope+"/") {
		return true
	}
	return leaf != "" && path.Base(targetPath) == leaf
}
+132
View File
@@ -0,0 +1,132 @@
package scraper
import (
"context"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
)
func TestWebScraper_ReturnsErrorWhenNothingFetched(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
http.NotFound(w, r)
}))
defer srv.Close()
s := NewWebScraper(&Config{
UserAgent: "DevourTest/1.0",
Timeout: 2 * time.Second,
Concurrency: 1,
MaxDepth: 1,
})
_, err := s.Scrape(context.Background(), &Source{
Name: "missing",
Type: SourceTypeWeb,
URL: srv.URL + "/missing",
})
if err == nil {
t.Fatal("expected error when web scrape yields no documents")
}
}
func TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><head><title>Empty</title></head><body><p>tiny</p></body></html>`))
}))
defer srv.Close()
s := NewWebScraper(&Config{
UserAgent: "DevourTest/1.0",
Timeout: 2 * time.Second,
Concurrency: 1,
MaxDepth: 1,
})
_, err := s.Scrape(context.Background(), &Source{
Name: "empty",
Type: SourceTypeWeb,
URL: srv.URL,
})
if err == nil {
t.Fatal("expected error when page has no extractable docs")
}
if !strings.Contains(err.Error(), "extracted no documents") {
t.Fatalf("unexpected error message: %v", err)
}
}
func TestWebScraper_AllowsRedirectedDocumentPath(t *testing.T) {
content := strings.Repeat("ruby docs content ", 30)
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.URL.Path {
case "/core/Regexp.html":
http.Redirect(w, r, "/3.4.1/Regexp.html", http.StatusFound)
case "/3.4.1/Regexp.html":
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><head><title>Regexp</title></head><body><main>` + content + `</main></body></html>`))
default:
http.NotFound(w, r)
}
}))
defer srv.Close()
s := NewWebScraper(&Config{
UserAgent: "DevourTest/1.0",
Timeout: 2 * time.Second,
Concurrency: 1,
MaxDepth: 1,
})
docs, err := s.Scrape(context.Background(), &Source{
Name: "ruby",
Type: SourceTypeWeb,
URL: srv.URL + "/core/Regexp.html",
})
if err != nil {
t.Fatalf("unexpected scrape error: %v", err)
}
if len(docs) == 0 {
t.Fatal("expected redirected page to be scraped")
}
if !strings.Contains(docs[0].URL, "/3.4.1/Regexp.html") {
t.Fatalf("expected final redirected URL, got %q", docs[0].URL)
}
}
// TestWebScraper_GlobalWrapperNormalizesOutput obtains the web scraper through
// NewScraper and expects the title of the scraped page to come back with the
// pilcrow and the word "deprecated" removed.
func TestWebScraper_GlobalWrapperNormalizesOutput(t *testing.T) {
	body := strings.Repeat("docs content ", 20)

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(`<html><head><title>Regex Guide ¶ deprecated</title></head><body><main>` + body + `</main></body></html>`))
	}))
	defer server.Close()

	scraper := NewScraper(SourceTypeWeb, &Config{
		UserAgent:   "DevourTest/1.0",
		Timeout:     2 * time.Second,
		Concurrency: 1,
		MaxDepth:    1,
	})
	if scraper == nil {
		t.Fatal("expected web scraper")
	}

	source := &Source{
		Name: "test",
		Type: SourceTypeWeb,
		URL:  server.URL,
	}
	docs, err := scraper.Scrape(context.Background(), source)
	if err != nil {
		t.Fatalf("unexpected scrape error: %v", err)
	}
	if len(docs) == 0 {
		t.Fatal("expected at least one document")
	}
	if got := docs[0].Title; got != "Regex Guide" {
		t.Fatalf("expected normalized title, got %q", got)
	}
}
+98
View File
@@ -0,0 +1,98 @@
package scraper
import (
"context"
"errors"
"fmt"
"net"
"strings"
"time"
)
// wrappedScraper adds retry and normalization behavior for all scraper implementations.
type wrappedScraper struct {
	// inner is the scraper that Scrape and DetectChanges delegate to.
	inner Scraper
}
// wrapScraper returns inner decorated with a wrappedScraper. A nil inner is
// passed through as nil so callers can keep using a plain nil check.
func wrapScraper(inner Scraper) Scraper {
	if inner != nil {
		return &wrappedScraper{inner: inner}
	}
	return nil
}
// Scrape runs the wrapped scraper against source and returns its documents
// after passing them through NormalizeDocuments.
//
// A failed scrape is handled in two ways:
//   - After the first failure only, URLs that start with "http" and whose path
//     does not already end in "/" are tried once more with a trailing slash
//     added to the path (ahead of any query string or fragment).
//   - Errors accepted by isRetriableScrapeError are retried up to two more
//     times, with a fixed 300ms pause between attempts.
//
// It returns an error when source is nil, ctx.Err() when the context ends
// during a pause, and otherwise the error of the last primary attempt. Errors
// from the trailing-slash fallback are not reported.
func (w *wrappedScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}

	const (
		retries = 2
		delay   = 300 * time.Millisecond
	)

	var lastErr error
	for attempt := 0; attempt <= retries; attempt++ {
		docs, err := w.inner.Scrape(ctx, source)
		if err == nil {
			return NormalizeDocuments(docs), nil
		}
		lastErr = err

		// One fallback, first attempt only: some doc sites only answer on the
		// slash-terminated form of a path. The copy keeps the caller's Source
		// untouched.
		if attempt == 0 {
			if altURL, ok := urlWithTrailingSlash(source.URL); ok {
				alt := *source
				alt.URL = altURL
				if altDocs, altErr := w.inner.Scrape(ctx, &alt); altErr == nil {
					return NormalizeDocuments(altDocs), nil
				}
			}
		}

		if attempt == retries || !isRetriableScrapeError(err) {
			break
		}
		if !sleepWithContext(ctx, delay) {
			return nil, ctx.Err()
		}
	}
	return nil, lastErr
}

// urlWithTrailingSlash returns rawURL with "/" appended to its path component
// and reports whether a variant was produced. The slash is inserted before the
// first '?' or '#', so a query string or fragment is left intact rather than
// having "/" glued onto its end. No variant is produced when rawURL does not
// start with "http" or when its path already ends in "/".
func urlWithTrailingSlash(rawURL string) (string, bool) {
	if !strings.HasPrefix(rawURL, "http") {
		return "", false
	}
	path, rest := rawURL, ""
	if i := strings.IndexAny(rawURL, "?#"); i >= 0 {
		path, rest = rawURL[:i], rawURL[i:]
	}
	if strings.HasSuffix(path, "/") {
		return "", false
	}
	return path + "/" + rest, true
}
// DetectChanges forwards the call to the wrapped scraper and returns its
// results as-is; no retry or normalization is applied here.
func (w *wrappedScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	changed, hash, err := w.inner.DetectChanges(ctx, source, lastHash)
	return changed, hash, err
}
// isRetriableScrapeError reports whether err looks transient enough to retry.
//
// A nil error is never retriable. Otherwise the lower-cased error text is
// searched for a fixed set of markers (timeouts, connection resets, EOF and
// HTTP 429/500/502/503/504); if none is found, the error is still treated as
// retriable when its chain contains a net.Error.
func isRetriableScrapeError(err error) bool {
	if err == nil {
		return false
	}

	msg := strings.ToLower(err.Error())
	markers := []string{
		"timeout",
		"temporarily unavailable",
		"connection reset",
		"eof",
		"http 429",
		"http 500",
		"http 502",
		"http 503",
		"http 504",
	}
	for _, marker := range markers {
		if strings.Contains(msg, marker) {
			return true
		}
	}

	var asNetErr net.Error
	return errors.As(err, &asNetErr)
}
// sleepWithContext blocks for d or until ctx is done, whichever happens first.
// It returns true when the full delay elapsed and false when the context ended
// the wait. The timer is always stopped before returning.
func sleepWithContext(ctx context.Context, d time.Duration) bool {
	timer := time.NewTimer(d)
	defer timer.Stop()

	cancelled := ctx.Done()
	select {
	case <-timer.C:
		return true
	case <-cancelled:
		return false
	}
}
+45
View File
@@ -0,0 +1,45 @@
package scraper
import (
"context"
"fmt"
"testing"
)
// flakyStubScraper is a test double whose Scrape can be made to fail on its
// first invocation and succeed afterwards.
type flakyStubScraper struct {
	// failFirst makes the first Scrape call return an error.
	failFirst bool
	// calls counts how many times Scrape has been invoked.
	calls int
}
// Scrape records the call and, when failFirst is set, fails the very first
// call with "HTTP 503". Every other call returns a single document that echoes
// source.URL and carries a title with a trailing "¶ deprecated" marker.
func (f *flakyStubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	f.calls++

	isFirstCall := f.calls == 1
	if isFirstCall && f.failFirst {
		return nil, fmt.Errorf("HTTP 503")
	}

	doc := &Document{
		Title:   "Example ¶ deprecated",
		Content: "ok",
		URL:     source.URL,
		Type:    "test",
	}
	return []*Document{doc}, nil
}
// DetectChanges always reports a change with the fixed hash "hash" and no
// error; its arguments are ignored.
func (f *flakyStubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	const stubHash = "hash"
	return true, stubHash, nil
}
func TestWrappedScraper_RetriesAndNormalizes(t *testing.T) {
w := wrapScraper(&flakyStubScraper{failFirst: true})
docs, err := w.Scrape(context.Background(), &Source{URL: "https://example.com"})
if err != nil {
t.Fatalf("expected retry to succeed, got error: %v", err)
}
if len(docs) != 1 {
t.Fatalf("expected 1 document, got %d", len(docs))
}
if docs[0].Title != "Example" {
t.Fatalf("expected normalized title, got %q", docs[0].Title)
}
}