mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
+45
@@ -0,0 +1,45 @@
|
||||
package scraper
|
||||
|
||||
import basescraper "github.com/yourorg/devour/internal/scraper"
|
||||
|
||||
func init() {
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeGoDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewGoDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeRustDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewRustDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypePythonDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewPythonDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeJavaDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewJavaDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeSpringDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewSpringDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeTSDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewTSDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeReactDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewReactDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeVueDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewVueDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeNuxtDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewNuxtDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeMCPDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewMCPDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeDockerDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewDockerDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeCloudflareDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewCloudflareDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeAstroDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewAstroDocsScraper(c)
|
||||
})
|
||||
}
|
||||
Vendored
+27
-12
@@ -155,16 +155,18 @@ func (s *TSDocsScraper) interfaceToDocument(iface *tsdocs.Interface, module *tsd
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": iface.Name,
|
||||
"doc_url": iface.DocURL,
|
||||
"doc_url": coalesceDocURL(iface.DocURL, module.DocURL),
|
||||
}
|
||||
|
||||
docURL := coalesceDocURL(iface.DocURL, module.DocURL)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(iface.DocURL),
|
||||
ID: generateDocID(docURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-interface",
|
||||
Title: iface.Name,
|
||||
Content: content.String(),
|
||||
URL: iface.DocURL,
|
||||
URL: docURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
@@ -185,16 +187,18 @@ func (s *TSDocsScraper) functionToDocument(fn *tsdocs.Function, module *tsdocs.M
|
||||
"module": module.Name,
|
||||
"name": fn.Name,
|
||||
"return_type": fn.ReturnType,
|
||||
"doc_url": fn.DocURL,
|
||||
"doc_url": coalesceDocURL(fn.DocURL, module.DocURL),
|
||||
}
|
||||
|
||||
docURL := coalesceDocURL(fn.DocURL, module.DocURL)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fn.DocURL),
|
||||
ID: generateDocID(docURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-function",
|
||||
Title: fn.Name,
|
||||
Content: content.String(),
|
||||
URL: fn.DocURL,
|
||||
URL: docURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
@@ -217,16 +221,18 @@ func (s *TSDocsScraper) classToDocument(class *tsdocs.Class, module *tsdocs.Modu
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": class.Name,
|
||||
"doc_url": class.DocURL,
|
||||
"doc_url": coalesceDocURL(class.DocURL, module.DocURL),
|
||||
}
|
||||
|
||||
docURL := coalesceDocURL(class.DocURL, module.DocURL)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(class.DocURL),
|
||||
ID: generateDocID(docURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-class",
|
||||
Title: class.Name,
|
||||
Content: content.String(),
|
||||
URL: class.DocURL,
|
||||
URL: docURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
@@ -244,18 +250,27 @@ func (s *TSDocsScraper) typeAliasToDocument(ta *tsdocs.TypeAlias, module *tsdocs
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": ta.Name,
|
||||
"doc_url": ta.DocURL,
|
||||
"doc_url": coalesceDocURL(ta.DocURL, module.DocURL),
|
||||
}
|
||||
|
||||
docURL := coalesceDocURL(ta.DocURL, module.DocURL)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(ta.DocURL),
|
||||
ID: generateDocID(docURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-type",
|
||||
Title: ta.Name,
|
||||
Content: content.String(),
|
||||
URL: ta.DocURL,
|
||||
URL: docURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
// coalesceDocURL chooses the URL to attach to a document. It returns primary,
// exactly as given, when primary contains any non-whitespace character, and
// fallback otherwise.
func coalesceDocURL(primary, fallback string) string {
	if len(strings.TrimSpace(primary)) == 0 {
		return fallback
	}
	return primary
}
|
||||
|
||||
+65
@@ -0,0 +1,65 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/yourorg/devour/pkg/tsdocs"
|
||||
)
|
||||
|
||||
func TestTSDocsSubDocsFallbackToModuleURL(t *testing.T) {
|
||||
s := &TSDocsScraper{}
|
||||
module := &tsdocs.Module{
|
||||
Name: "Module",
|
||||
DocURL: "https://www.typescriptlang.org/docs/handbook/2/basic-types.html",
|
||||
}
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
build func() *Document
|
||||
docType string
|
||||
}{
|
||||
{
|
||||
name: "interface",
|
||||
build: func() *Document {
|
||||
return s.interfaceToDocument(&tsdocs.Interface{Name: "User", DocURL: ""}, module, "ts")
|
||||
},
|
||||
docType: "ts-interface",
|
||||
},
|
||||
{
|
||||
name: "function",
|
||||
build: func() *Document {
|
||||
return s.functionToDocument(&tsdocs.Function{Name: "parse", DocURL: ""}, module, "ts")
|
||||
},
|
||||
docType: "ts-function",
|
||||
},
|
||||
{
|
||||
name: "class",
|
||||
build: func() *Document {
|
||||
return s.classToDocument(&tsdocs.Class{Name: "Service", DocURL: ""}, module, "ts")
|
||||
},
|
||||
docType: "ts-class",
|
||||
},
|
||||
{
|
||||
name: "type alias",
|
||||
build: func() *Document {
|
||||
return s.typeAliasToDocument(&tsdocs.TypeAlias{Name: "ID", Type: "string", DocURL: ""}, module, "ts")
|
||||
},
|
||||
docType: "ts-type",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
doc := tc.build()
|
||||
if doc.URL != module.DocURL {
|
||||
t.Fatalf("expected fallback URL %q, got %q", module.DocURL, doc.URL)
|
||||
}
|
||||
if got := doc.Metadata["doc_url"]; got != module.DocURL {
|
||||
t.Fatalf("expected metadata doc_url %q, got %#v", module.DocURL, got)
|
||||
}
|
||||
if doc.Type != tc.docType {
|
||||
t.Fatalf("expected doc type %q, got %q", tc.docType, doc.Type)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Vendored
+21
@@ -0,0 +1,21 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
|
||||
basescraper "github.com/yourorg/devour/internal/scraper"
|
||||
)
|
||||
|
||||
type SourceType = basescraper.SourceType
|
||||
|
||||
type Source = basescraper.Source
|
||||
|
||||
type Document = basescraper.Document
|
||||
|
||||
type Config = basescraper.Config
|
||||
|
||||
// generateDocID derives a document identifier from urlStr: the first 12 bytes
// of the string's SHA-256 digest, rendered as 24 lowercase hex characters.
func generateDocID(urlStr string) string {
	const idBytes = 12

	digest := sha256.Sum256([]byte(urlStr))
	prefix := digest[:idBytes]
	return hex.EncodeToString(prefix)
}
|
||||
+171
-8
@@ -2,6 +2,12 @@ package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// GitHubScraper scrapes documentation from GitHub repositories.
|
||||
@@ -16,16 +22,173 @@ func NewGitHubScraper(config *Config) *GitHubScraper {
|
||||
|
||||
// Scrape clones and parses documents from a GitHub repository.
|
||||
func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
// TODO: Implement GitHub scraping
|
||||
// 1. Clone repository (shallow)
|
||||
// 2. Find markdown files in specified paths
|
||||
// 3. Parse README, docs/, wiki
|
||||
// 4. Extract code structure
|
||||
return nil, nil
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
repoURL, repoName, err := s.resolveRepo(source)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tmpDir, err := os.MkdirTemp("", "devour-github-*")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
cloneArgs := []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse", repoURL, tmpDir}
|
||||
if branch := strings.TrimSpace(source.Branch); branch != "" {
|
||||
cloneArgs = []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse", "--branch", branch, repoURL, tmpDir}
|
||||
}
|
||||
|
||||
cmd := exec.CommandContext(ctx, "git", cloneArgs...)
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("git clone failed: %v (%s)", err, strings.TrimSpace(string(output)))
|
||||
}
|
||||
|
||||
if len(source.Include) == 0 {
|
||||
// Try sparse checkout for common docs locations to reduce clone and parse cost.
|
||||
sparse := exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "set", "--skip-checks",
|
||||
"docs", "doc", "src/routes", "website/docs", "packages/*/docs")
|
||||
if sparseOut, sparseErr := sparse.CombinedOutput(); sparseErr != nil {
|
||||
_ = sparseOut
|
||||
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
|
||||
}
|
||||
} else {
|
||||
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
|
||||
}
|
||||
|
||||
localSource := &Source{
|
||||
Name: coalesce(source.Name, repoName),
|
||||
Type: SourceTypeLocal,
|
||||
Path: tmpDir,
|
||||
Include: append([]string(nil), source.Include...),
|
||||
Exclude: append([]string(nil), source.Exclude...),
|
||||
Schedule: source.Schedule,
|
||||
}
|
||||
|
||||
if len(localSource.Include) == 0 {
|
||||
localSource.Include = []string{
|
||||
`(?i)(^|/)readme\.md$`,
|
||||
`(?i)(^|/)docs?/`,
|
||||
`(?i)\.md$`,
|
||||
`(?i)\.mdx$`,
|
||||
}
|
||||
}
|
||||
|
||||
local := NewLocalScraper(s.config)
|
||||
docs, err := local.Scrape(ctx, localSource)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(docs) == 0 && len(source.Include) == 0 {
|
||||
// Sparse patterns did not match this repository layout; retry full checkout.
|
||||
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
|
||||
docs, err = local.Scrape(ctx, localSource)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
for _, doc := range docs {
|
||||
if doc == nil {
|
||||
continue
|
||||
}
|
||||
branchForURL := strings.TrimSpace(source.Branch)
|
||||
if branchForURL == "" {
|
||||
branchForURL = "HEAD"
|
||||
}
|
||||
if doc.Metadata == nil {
|
||||
doc.Metadata = map[string]interface{}{}
|
||||
}
|
||||
if rawPath, ok := doc.Metadata["path"].(string); ok {
|
||||
if relPath, relErr := filepath.Rel(tmpDir, rawPath); relErr == nil {
|
||||
relPath = filepath.ToSlash(relPath)
|
||||
relPath = strings.TrimPrefix(relPath, "./")
|
||||
if relPath != "" && relPath != "." {
|
||||
doc.URL = fmt.Sprintf("https://github.com/%s/blob/%s/%s", repoName, branchForURL, relPath)
|
||||
doc.ID = generateDocID(doc.URL)
|
||||
doc.Metadata["path"] = relPath
|
||||
}
|
||||
}
|
||||
}
|
||||
doc.Type = "github-document"
|
||||
doc.Metadata["repo"] = repoName
|
||||
doc.Metadata["repo_url"] = repoURL
|
||||
doc.Metadata["source_type"] = "github"
|
||||
}
|
||||
return docs, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if the repository has new commits.
|
||||
func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
// TODO: Check latest commit hash
|
||||
return false, "", nil
|
||||
if source == nil {
|
||||
return false, "", fmt.Errorf("source is required")
|
||||
}
|
||||
_, repoName, err := s.resolveRepo(source)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
remote := "https://github.com/" + strings.TrimSuffix(repoName, ".git") + ".git"
|
||||
branch := strings.TrimSpace(source.Branch)
|
||||
if branch == "" {
|
||||
branch = "HEAD"
|
||||
}
|
||||
|
||||
cmd := exec.CommandContext(ctx, "git", "ls-remote", remote, branch)
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
line := strings.TrimSpace(string(output))
|
||||
if line == "" {
|
||||
return false, "", fmt.Errorf("empty ls-remote output")
|
||||
}
|
||||
parts := strings.Fields(line)
|
||||
if len(parts) == 0 {
|
||||
return false, "", fmt.Errorf("unexpected ls-remote output")
|
||||
}
|
||||
hash := parts[0]
|
||||
return hash != lastHash, hash, nil
|
||||
}
|
||||
|
||||
func (s *GitHubScraper) resolveRepo(source *Source) (repoURL string, repoName string, err error) {
|
||||
if strings.TrimSpace(source.Repo) != "" {
|
||||
repoName = strings.Trim(strings.TrimSpace(source.Repo), "/")
|
||||
repoName = strings.TrimSuffix(repoName, ".git")
|
||||
return "https://github.com/" + repoName + ".git", repoName, nil
|
||||
}
|
||||
|
||||
raw := strings.TrimSpace(source.URL)
|
||||
if raw == "" {
|
||||
return "", "", fmt.Errorf("github source requires repo or url")
|
||||
}
|
||||
|
||||
u, err := url.Parse(raw)
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
}
|
||||
if !strings.Contains(strings.ToLower(u.Host), "github.com") {
|
||||
return "", "", fmt.Errorf("not a github url: %s", raw)
|
||||
}
|
||||
parts := strings.Split(strings.Trim(u.Path, "/"), "/")
|
||||
if len(parts) < 2 {
|
||||
return "", "", fmt.Errorf("invalid github repo url: %s", raw)
|
||||
}
|
||||
repoName = parts[0] + "/" + strings.TrimSuffix(parts[1], ".git")
|
||||
repoURL = "https://github.com/" + repoName + ".git"
|
||||
return repoURL, repoName, nil
|
||||
}
|
||||
|
||||
// coalesce picks a display name. It returns primary unchanged when primary is
// not blank; otherwise the last path element of fallback when fallback is not
// blank; otherwise the literal "github".
func coalesce(primary, fallback string) string {
	switch {
	case strings.TrimSpace(primary) != "":
		return primary
	case strings.TrimSpace(fallback) != "":
		return filepath.Base(fallback)
	default:
		return "github"
	}
}
|
||||
|
||||
+227
-8
@@ -2,6 +2,20 @@ package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var (
	// reLocalBlankLines matches runs of three or more consecutive newlines.
	reLocalBlankLines = regexp.MustCompile(`\n{3,}`)

	// reFrontMatterBlock matches a front-matter block at the very start of
	// the text: an opening "---" line, any number of lines (possibly none),
	// and a closing "---" line. The closing fence may be terminated by a
	// newline or by the end of the text, so a file consisting only of front
	// matter is matched as well. Whitespace following either fence is
	// consumed.
	reFrontMatterBlock = regexp.MustCompile(`(?s)\A---\s*\n(?:.*?\n)?---\s*(?:\n|\z)`)
)
|
||||
|
||||
// LocalScraper scrapes documentation from local filesystem.
|
||||
@@ -16,16 +30,221 @@ func NewLocalScraper(config *Config) *LocalScraper {
|
||||
|
||||
// Scrape scans and parses documents from a local directory.
|
||||
func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
// TODO: Implement local scraping
|
||||
// 1. Walk directory tree
|
||||
// 2. Filter by include/exclude patterns
|
||||
// 3. Parse markdown, text, code files
|
||||
// 4. Extract structure and content
|
||||
return nil, nil
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
root := strings.TrimSpace(source.Path)
|
||||
if root == "" {
|
||||
root = strings.TrimSpace(source.URL)
|
||||
}
|
||||
if root == "" {
|
||||
return nil, fmt.Errorf("path or url is required for local source")
|
||||
}
|
||||
|
||||
info, err := os.Stat(root)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docs := make([]*Document, 0)
|
||||
if !info.IsDir() {
|
||||
doc, err := s.fileToDocument(root, source)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return []*Document{doc}, nil
|
||||
}
|
||||
|
||||
web := NewWebScraper(s.config)
|
||||
err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
default:
|
||||
}
|
||||
|
||||
if d.IsDir() {
|
||||
name := d.Name()
|
||||
if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
relPath := path
|
||||
if rel, relErr := filepath.Rel(root, path); relErr == nil {
|
||||
relPath = rel
|
||||
}
|
||||
normalized := filepath.ToSlash(relPath)
|
||||
if !web.shouldInclude(normalized, source.Include, source.Exclude) {
|
||||
return nil
|
||||
}
|
||||
if !isDocumentationFile(path) {
|
||||
return nil
|
||||
}
|
||||
|
||||
doc, err := s.fileToDocument(path, source)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
docs = append(docs, doc)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return docs, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if files have been modified.
|
||||
func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
// TODO: Check file modification times
|
||||
return false, "", nil
|
||||
if source == nil {
|
||||
return false, "", fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
root := strings.TrimSpace(source.Path)
|
||||
if root == "" {
|
||||
root = strings.TrimSpace(source.URL)
|
||||
}
|
||||
if root == "" {
|
||||
return false, "", fmt.Errorf("path or url is required for local source")
|
||||
}
|
||||
|
||||
h := sha256.New()
|
||||
err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if d.IsDir() {
|
||||
name := d.Name()
|
||||
if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if !isDocumentationFile(path) {
|
||||
return nil
|
||||
}
|
||||
|
||||
info, infoErr := d.Info()
|
||||
if infoErr != nil {
|
||||
return infoErr
|
||||
}
|
||||
fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := hex.EncodeToString(h.Sum(nil))
|
||||
return hash != lastHash, hash, nil
|
||||
}
|
||||
|
||||
func (s *LocalScraper) fileToDocument(path string, source *Source) (*Document, error) {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
content := normalizeLocalContent(string(b), ext)
|
||||
if content == "" {
|
||||
return nil, fmt.Errorf("empty file")
|
||||
}
|
||||
|
||||
title := strings.TrimSuffix(filepath.Base(path), filepath.Ext(path))
|
||||
hash := sha256.Sum256(b)
|
||||
uri := "file://" + filepath.ToSlash(path)
|
||||
|
||||
docType := "local-document"
|
||||
switch ext {
|
||||
case ".md", ".mdx":
|
||||
docType = "local-markdown"
|
||||
case ".txt":
|
||||
docType = "local-text"
|
||||
case ".json", ".yaml", ".yml":
|
||||
docType = "local-data"
|
||||
case ".go", ".js", ".ts", ".tsx", ".py", ".java", ".rs", ".rb", ".php":
|
||||
docType = "local-code"
|
||||
}
|
||||
|
||||
name := source.Name
|
||||
if strings.TrimSpace(name) == "" {
|
||||
name = filepath.Base(filepath.Dir(path))
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(uri),
|
||||
Source: name,
|
||||
Type: docType,
|
||||
Title: title,
|
||||
Content: content,
|
||||
URL: uri,
|
||||
Metadata: map[string]interface{}{
|
||||
"path": path,
|
||||
"size": len(b),
|
||||
},
|
||||
Hash: hex.EncodeToString(hash[:]),
|
||||
Timestamp: time.Now(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func normalizeLocalContent(content, ext string) string {
|
||||
content = strings.TrimSpace(content)
|
||||
if content == "" {
|
||||
return ""
|
||||
}
|
||||
|
||||
switch ext {
|
||||
case ".md", ".mdx":
|
||||
content = stripMarkdownFrontmatter(content)
|
||||
content = stripMDXPreamble(content)
|
||||
}
|
||||
|
||||
// Collapse excessive blank lines to reduce indexing noise.
|
||||
content = reLocalBlankLines.ReplaceAllString(content, "\n\n")
|
||||
return strings.TrimSpace(content)
|
||||
}
|
||||
|
||||
func stripMarkdownFrontmatter(content string) string {
|
||||
if !strings.HasPrefix(content, "---\n") && !strings.HasPrefix(content, "---\r\n") {
|
||||
return content
|
||||
}
|
||||
|
||||
trimmed := reFrontMatterBlock.ReplaceAllString(content, "")
|
||||
return trimmed
|
||||
}
|
||||
|
||||
// stripMDXPreamble drops the leading run of lines that are blank or that,
// after trimming, start with "import " or "export ". Everything from the first
// other line onward is returned unchanged; if no such line exists the result
// is empty.
func stripMDXPreamble(content string) string {
	lines := strings.Split(content, "\n")

	start := len(lines)
	for idx, raw := range lines {
		trimmed := strings.TrimSpace(raw)
		preamble := trimmed == "" ||
			strings.HasPrefix(trimmed, "import ") ||
			strings.HasPrefix(trimmed, "export ")
		if !preamble {
			start = idx
			break
		}
	}

	return strings.Join(lines[start:], "\n")
}
|
||||
|
||||
// isDocumentationFile reports whether path has one of the file extensions this
// scraper indexes. The comparison is case-insensitive.
func isDocumentationFile(path string) bool {
	switch strings.ToLower(filepath.Ext(path)) {
	case ".md", ".mdx", ".txt", ".rst", ".adoc", ".html":
		// Prose and markup.
		return true
	case ".json", ".yaml", ".yml":
		// Structured data.
		return true
	case ".go", ".py", ".js", ".ts", ".tsx", ".java", ".rs", ".rb", ".php":
		// Source code.
		return true
	}
	return false
}
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestLocalScraperScrapeDirectory(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
if err := os.WriteFile(filepath.Join(tmp, "README.md"), []byte("# Demo\n\nhello docs"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "notes.txt"), []byte("notes"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bin.bin"), []byte{0x00, 0x01}, 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
|
||||
docs, err := s.Scrape(context.Background(), &Source{Name: "local", Type: SourceTypeLocal, Path: tmp})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(docs) < 2 {
|
||||
t.Fatalf("expected at least 2 docs, got %d", len(docs))
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalScraperDetectChanges(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
file := filepath.Join(tmp, "README.md")
|
||||
if err := os.WriteFile(file, []byte("v1"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
|
||||
src := &Source{Name: "local", Type: SourceTypeLocal, Path: tmp}
|
||||
|
||||
changed, hash1, err := s.DetectChanges(context.Background(), src, "")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !changed || hash1 == "" {
|
||||
t.Fatalf("expected first detect to change with non-empty hash, changed=%v hash=%q", changed, hash1)
|
||||
}
|
||||
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
if err := os.WriteFile(file, []byte("v2"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
changed, hash2, err := s.DetectChanges(context.Background(), src, hash1)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !changed {
|
||||
t.Fatal("expected change after file update")
|
||||
}
|
||||
if hash1 == hash2 {
|
||||
t.Fatal("expected hash to change")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalScraper_StripsFrontmatterAndMDXPreamble(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
path := filepath.Join(tmp, "doc.mdx")
|
||||
content := `---
|
||||
title: My Doc
|
||||
slug: /my-doc
|
||||
---
|
||||
|
||||
import { Component } from "x"
|
||||
export const meta = {}
|
||||
|
||||
# Heading
|
||||
|
||||
Actual documentation body.
|
||||
`
|
||||
if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
|
||||
doc, err := s.fileToDocument(path, &Source{Name: "local", Type: SourceTypeLocal, Path: tmp})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if strings.Contains(doc.Content, "slug: /my-doc") {
|
||||
t.Fatalf("expected frontmatter to be stripped, got: %q", doc.Content)
|
||||
}
|
||||
if strings.Contains(doc.Content, "import { Component }") {
|
||||
t.Fatalf("expected MDX preamble to be stripped, got: %q", doc.Content)
|
||||
}
|
||||
if !strings.Contains(doc.Content, "Actual documentation body.") {
|
||||
t.Fatalf("expected markdown body in content, got: %q", doc.Content)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,402 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Limits applied to local search requests and responses.
const (
	// defaultLocalSearchLimit is the default number of search results.
	defaultLocalSearchLimit = 8

	// maxLocalSearchLimit is the upper bound on the number of search results.
	maxLocalSearchLimit = 50

	// maxSearchResponseBytes caps how much of a search API response body is
	// read (2 MiB).
	maxSearchResponseBytes = 2 * 1024 * 1024
)
|
||||
|
||||
// LocalSearchScraper scrapes docs from result URLs returned by a local search API.
|
||||
type LocalSearchScraper struct {
|
||||
config *Config
|
||||
client *http.Client
|
||||
web *WebScraper
|
||||
}
|
||||
|
||||
// localSearchResult is a single hit returned by the search API.
type localSearchResult struct {
	URL     string  // address of the result page
	Title   string  // title reported for the result
	Snippet string  // text excerpt reported for the result
	Engine  string  // name of the engine that produced the result
	Score   float64 // relevance score; zero is treated as "not set" by callers in this file
}
|
||||
|
||||
// NewLocalSearchScraper creates a scraper backed by a self-hosted search API.
|
||||
func NewLocalSearchScraper(config *Config) *LocalSearchScraper {
|
||||
baseConfig := &Config{}
|
||||
if config != nil {
|
||||
*baseConfig = *config
|
||||
}
|
||||
if baseConfig.UserAgent == "" {
|
||||
baseConfig.UserAgent = "Devour/1.0 (Local Search Scraper)"
|
||||
}
|
||||
if baseConfig.Timeout <= 0 {
|
||||
baseConfig.Timeout = 30 * time.Second
|
||||
}
|
||||
|
||||
webConfig := *baseConfig
|
||||
webConfig.Concurrency = 1
|
||||
webConfig.MaxDepth = 1
|
||||
|
||||
return &LocalSearchScraper{
|
||||
config: baseConfig,
|
||||
client: &http.Client{Timeout: baseConfig.Timeout},
|
||||
web: NewWebScraper(&webConfig),
|
||||
}
|
||||
}
|
||||
|
||||
// Scrape queries a local search API and scrapes the returned URLs.
|
||||
func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
if strings.TrimSpace(source.URL) == "" {
|
||||
return nil, fmt.Errorf("search API URL is required")
|
||||
}
|
||||
query := strings.TrimSpace(source.Query)
|
||||
if query == "" {
|
||||
return nil, fmt.Errorf("search query is required for localsearch sources (set source.query or --search-query)")
|
||||
}
|
||||
|
||||
limit := clampLocalSearchLimit(source.ResultLimit)
|
||||
results, err := s.search(ctx, source, query, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docs := make([]*Document, 0, limit)
|
||||
seen := make(map[string]bool)
|
||||
var scrapeErrors []string
|
||||
|
||||
for i, result := range results {
|
||||
if ctx.Err() != nil {
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
|
||||
resultURL := stripURLFragment(result.URL)
|
||||
if resultURL == "" || seen[resultURL] {
|
||||
continue
|
||||
}
|
||||
if !domainAllowed(resultURL, source.Domains) {
|
||||
continue
|
||||
}
|
||||
if !s.web.shouldInclude(resultURL, source.Include, source.Exclude) {
|
||||
continue
|
||||
}
|
||||
seen[resultURL] = true
|
||||
|
||||
pageDocs, err := s.web.Scrape(ctx, &Source{
|
||||
Name: source.Name,
|
||||
Type: SourceTypeWeb,
|
||||
URL: resultURL,
|
||||
Include: source.Include,
|
||||
Exclude: source.Exclude,
|
||||
})
|
||||
if err != nil {
|
||||
if len(scrapeErrors) < 20 {
|
||||
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", resultURL, err))
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
for _, doc := range pageDocs {
|
||||
if doc.Metadata == nil {
|
||||
doc.Metadata = make(map[string]interface{})
|
||||
}
|
||||
doc.Metadata["search_api"] = source.URL
|
||||
doc.Metadata["search_query"] = query
|
||||
doc.Metadata["search_rank"] = i + 1
|
||||
if result.Engine != "" {
|
||||
doc.Metadata["search_engine"] = result.Engine
|
||||
}
|
||||
if result.Snippet != "" {
|
||||
doc.Metadata["search_snippet"] = result.Snippet
|
||||
}
|
||||
if result.Score != 0 {
|
||||
doc.Metadata["search_score"] = result.Score
|
||||
}
|
||||
if strings.TrimSpace(doc.Title) == "" && strings.TrimSpace(result.Title) != "" {
|
||||
doc.Title = strings.TrimSpace(result.Title)
|
||||
}
|
||||
|
||||
docs = append(docs, doc)
|
||||
}
|
||||
}
|
||||
|
||||
if len(docs) == 0 {
|
||||
if len(scrapeErrors) > 0 {
|
||||
return nil, fmt.Errorf("local search returned results but page scraping failed: %s", strings.Join(scrapeErrors, "; "))
|
||||
}
|
||||
return nil, fmt.Errorf("local search yielded no usable results for query %q", query)
|
||||
}
|
||||
|
||||
return docs, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if top search results changed.
|
||||
func (s *LocalSearchScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
if source == nil {
|
||||
return false, "", fmt.Errorf("source is required")
|
||||
}
|
||||
query := strings.TrimSpace(source.Query)
|
||||
if query == "" {
|
||||
return false, "", fmt.Errorf("search query is required for localsearch sources")
|
||||
}
|
||||
|
||||
limit := clampLocalSearchLimit(source.ResultLimit)
|
||||
results, err := s.search(ctx, source, query, limit)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
signatures := make([]string, 0, len(results))
|
||||
for _, result := range results {
|
||||
u := stripURLFragment(result.URL)
|
||||
if u == "" {
|
||||
continue
|
||||
}
|
||||
if !domainAllowed(u, source.Domains) {
|
||||
continue
|
||||
}
|
||||
if !s.web.shouldInclude(u, source.Include, source.Exclude) {
|
||||
continue
|
||||
}
|
||||
signatures = append(signatures, fmt.Sprintf("%s|%s|%s|%.6f", u, result.Title, result.Engine, result.Score))
|
||||
}
|
||||
sort.Strings(signatures)
|
||||
|
||||
hash := sha256.Sum256([]byte(strings.Join(signatures, "\n")))
|
||||
currentHash := hex.EncodeToString(hash[:])
|
||||
return currentHash != lastHash, currentHash, nil
|
||||
}
|
||||
|
||||
func (s *LocalSearchScraper) search(ctx context.Context, source *Source, query string, limit int) ([]localSearchResult, error) {
|
||||
searchURL, err := buildLocalSearchURL(source.URL, query, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to build search request: %w", err)
|
||||
}
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
req.Header.Set("Accept", "application/json")
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("search API request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, maxSearchResponseBytes))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed reading search API response: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
msg := strings.TrimSpace(string(body))
|
||||
if len(msg) > 200 {
|
||||
msg = msg[:200]
|
||||
}
|
||||
return nil, fmt.Errorf("search API returned HTTP %d: %s", resp.StatusCode, msg)
|
||||
}
|
||||
|
||||
results, err := decodeLocalSearchResults(body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(results) == 0 {
|
||||
return nil, fmt.Errorf("search API returned no results")
|
||||
}
|
||||
if len(results) > limit {
|
||||
results = results[:limit]
|
||||
}
|
||||
return results, nil
|
||||
}
|
||||
|
||||
func buildLocalSearchURL(rawURL, query string, limit int) (string, error) {
|
||||
u, err := url.Parse(strings.TrimSpace(rawURL))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("invalid search API URL: %w", err)
|
||||
}
|
||||
if u.Scheme == "" || u.Host == "" {
|
||||
return "", fmt.Errorf("invalid search API URL: expected absolute URL, got %q", rawURL)
|
||||
}
|
||||
|
||||
params := u.Query()
|
||||
params.Set("q", query)
|
||||
if params.Get("format") == "" {
|
||||
params.Set("format", "json")
|
||||
}
|
||||
if params.Get("limit") == "" {
|
||||
params.Set("limit", strconv.Itoa(clampLocalSearchLimit(limit)))
|
||||
}
|
||||
u.RawQuery = params.Encode()
|
||||
|
||||
return u.String(), nil
|
||||
}
|
||||
|
||||
func decodeLocalSearchResults(body []byte) ([]localSearchResult, error) {
|
||||
var payload map[string]interface{}
|
||||
if err := json.Unmarshal(body, &payload); err != nil {
|
||||
return nil, fmt.Errorf("search API returned invalid JSON: %w", err)
|
||||
}
|
||||
|
||||
rawResults, ok := payload["results"]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("search API response missing results field")
|
||||
}
|
||||
|
||||
items, ok := rawResults.([]interface{})
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("search API results field is not an array")
|
||||
}
|
||||
|
||||
results := make([]localSearchResult, 0, len(items))
|
||||
for _, item := range items {
|
||||
record, ok := item.(map[string]interface{})
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
resultURL := pickString(record, "url", "link", "href")
|
||||
if strings.TrimSpace(resultURL) == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
results = append(results, localSearchResult{
|
||||
URL: strings.TrimSpace(resultURL),
|
||||
Title: strings.TrimSpace(pickString(record, "title", "name")),
|
||||
Snippet: strings.TrimSpace(pickString(record, "content", "snippet", "description", "text")),
|
||||
Engine: strings.TrimSpace(pickString(record, "engine", "source")),
|
||||
Score: pickFloat(record, "score", "relevance"),
|
||||
})
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// pickString returns the first usable string value found in record under any
// of keys, tried in order.
//
// String values are returned as-is; numeric values (json.Number, float64,
// int) are formatted as decimal text. Keys that are absent, hold a value of
// another type, or hold a blank (empty or whitespace-only) string are skipped
// so that a later alias can supply the value. It returns "" when no key
// yields a value.
func pickString(record map[string]interface{}, keys ...string) string {
	for _, key := range keys {
		value, ok := record[key]
		if !ok {
			continue
		}
		switch v := value.(type) {
		case string:
			// A blank primary field (e.g. "url": "") must not mask a
			// populated fallback field (e.g. "link").
			if strings.TrimSpace(v) == "" {
				continue
			}
			return v
		case json.Number:
			return v.String()
		case float64:
			return strconv.FormatFloat(v, 'f', -1, 64)
		case int:
			return strconv.Itoa(v)
		}
	}
	return ""
}
|
||||
|
||||
// pickFloat returns the first value in record, looked up under keys in order,
// that can be interpreted as a float64.
//
// Numeric types are converted directly; json.Number and string values are
// parsed (strings are trimmed first). Keys that are absent or whose value
// cannot be converted are skipped. It returns 0 when nothing matches.
func pickFloat(record map[string]interface{}, keys ...string) float64 {
	asFloat := func(raw interface{}) (float64, bool) {
		switch typed := raw.(type) {
		case float64:
			return typed, true
		case float32:
			return float64(typed), true
		case int:
			return float64(typed), true
		case int64:
			return float64(typed), true
		case json.Number:
			parsed, err := typed.Float64()
			return parsed, err == nil
		case string:
			parsed, err := strconv.ParseFloat(strings.TrimSpace(typed), 64)
			return parsed, err == nil
		}
		return 0, false
	}

	for _, key := range keys {
		// A missing key yields a nil value, which asFloat rejects.
		if number, ok := asFloat(record[key]); ok {
			return number
		}
	}
	return 0
}
|
||||
|
||||
func clampLocalSearchLimit(limit int) int {
|
||||
if limit <= 0 {
|
||||
return defaultLocalSearchLimit
|
||||
}
|
||||
if limit > maxLocalSearchLimit {
|
||||
return maxLocalSearchLimit
|
||||
}
|
||||
return limit
|
||||
}
|
||||
|
||||
// stripURLFragment returns raw, trimmed of surrounding whitespace, with any
// "#fragment" removed. If the trimmed input cannot be parsed as a URL it is
// returned unchanged.
func stripURLFragment(raw string) string {
	trimmed := strings.TrimSpace(raw)
	parsed, err := url.Parse(trimmed)
	if err != nil {
		return trimmed
	}
	parsed.Fragment = ""
	return parsed.String()
}
|
||||
|
||||
func domainAllowed(raw string, allowedDomains []string) bool {
|
||||
if len(allowedDomains) == 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
u, err := url.Parse(raw)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
host := strings.ToLower(strings.TrimSpace(u.Hostname()))
|
||||
if host == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, candidate := range allowedDomains {
|
||||
domain := normalizeDomain(candidate)
|
||||
if domain == "" {
|
||||
continue
|
||||
}
|
||||
if host == domain || strings.HasSuffix(host, "."+domain) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// normalizeDomain canonicalizes an allow-list entry into a bare, lower-case
// host name.
//
// Entries containing "://" are parsed as URLs and reduced to their host name;
// if that parse fails, or for plain entries, a single leading "." is removed.
func normalizeDomain(raw string) string {
	cleaned := strings.ToLower(strings.TrimSpace(raw))
	if cleaned == "" {
		return ""
	}
	if strings.Contains(cleaned, "://") {
		if parsed, err := url.Parse(cleaned); err == nil {
			return strings.ToLower(parsed.Hostname())
		}
	}
	return strings.TrimPrefix(cleaned, ".")
}
|
||||
@@ -0,0 +1,226 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"net/url"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestLocalSearchScraperScrape(t *testing.T) {
|
||||
mux := http.NewServeMux()
|
||||
baseURL := ""
|
||||
|
||||
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
|
||||
if got := r.URL.Query().Get("q"); got != "go http client" {
|
||||
t.Fatalf("expected query go http client, got %q", got)
|
||||
}
|
||||
if got := r.URL.Query().Get("format"); got != "json" {
|
||||
t.Fatalf("expected format=json, got %q", got)
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"results": []map[string]interface{}{
|
||||
{
|
||||
"url": baseURL + "/docs/http-client",
|
||||
"title": "HTTP Client Guide",
|
||||
"content": "How to build an HTTP client in Go",
|
||||
"engine": "searxng",
|
||||
"score": 0.99,
|
||||
},
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
mux.HandleFunc("/docs/http-client", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>HTTP Client Guide</title></head><body><main>` + strings.Repeat("http client docs ", 30) + `</main></body></html>`))
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
baseURL = srv.URL
|
||||
|
||||
s := NewLocalSearchScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
})
|
||||
|
||||
docs, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "local-search",
|
||||
Type: SourceTypeLocalSearch,
|
||||
URL: srv.URL + "/search",
|
||||
Query: "go http client",
|
||||
ResultLimit: 5,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
t.Fatal("expected at least one document")
|
||||
}
|
||||
|
||||
doc := docs[0]
|
||||
if doc.URL != srv.URL+"/docs/http-client" {
|
||||
t.Fatalf("unexpected document URL: %q", doc.URL)
|
||||
}
|
||||
if doc.Metadata["search_query"] != "go http client" {
|
||||
t.Fatalf("expected metadata search_query, got %v", doc.Metadata["search_query"])
|
||||
}
|
||||
if doc.Metadata["search_engine"] != "searxng" {
|
||||
t.Fatalf("expected metadata search_engine=searxng, got %v", doc.Metadata["search_engine"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalSearchScraperDomainFilter(t *testing.T) {
|
||||
mux := http.NewServeMux()
|
||||
baseURL := ""
|
||||
|
||||
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
|
||||
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"results": []map[string]interface{}{
|
||||
{
|
||||
"url": baseURL + "/docs/in-scope",
|
||||
"title": "In Scope",
|
||||
},
|
||||
{
|
||||
"url": "https://example.com/out-of-scope",
|
||||
"title": "Out Scope",
|
||||
},
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
mux.HandleFunc("/docs/in-scope", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>In Scope</title></head><body><main>` + strings.Repeat("scoped docs ", 30) + `</main></body></html>`))
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
baseURL = srv.URL
|
||||
|
||||
parsed, err := url.Parse(srv.URL)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse server URL: %v", err)
|
||||
}
|
||||
|
||||
s := NewLocalSearchScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
})
|
||||
|
||||
docs, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "local-search",
|
||||
Type: SourceTypeLocalSearch,
|
||||
URL: srv.URL + "/search",
|
||||
Query: "scope test",
|
||||
ResultLimit: 10,
|
||||
Domains: []string{parsed.Hostname()},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
t.Fatal("expected at least one in-scope document")
|
||||
}
|
||||
for _, doc := range docs {
|
||||
docURL, parseErr := url.Parse(doc.URL)
|
||||
if parseErr != nil {
|
||||
t.Fatalf("invalid doc URL %q: %v", doc.URL, parseErr)
|
||||
}
|
||||
if docURL.Hostname() != parsed.Hostname() {
|
||||
t.Fatalf("expected only in-scope domain, got %q", doc.URL)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalSearchScraperRequiresQuery(t *testing.T) {
|
||||
s := NewLocalSearchScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
})
|
||||
|
||||
_, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "local-search",
|
||||
Type: SourceTypeLocalSearch,
|
||||
URL: "http://127.0.0.1:8080/search",
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected error when query is missing")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "query") {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalSearchScraperDetectChanges(t *testing.T) {
|
||||
mux := http.NewServeMux()
|
||||
baseURL := ""
|
||||
resultPath := "/docs/one"
|
||||
|
||||
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
|
||||
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"results": []map[string]interface{}{
|
||||
{
|
||||
"url": baseURL + resultPath,
|
||||
"title": "Versioned",
|
||||
"score": 1.0,
|
||||
},
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
baseURL = srv.URL
|
||||
|
||||
s := NewLocalSearchScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
})
|
||||
source := &Source{
|
||||
Name: "local-search",
|
||||
Type: SourceTypeLocalSearch,
|
||||
URL: srv.URL + "/search",
|
||||
Query: "version test",
|
||||
ResultLimit: 3,
|
||||
}
|
||||
|
||||
changed, hash1, err := s.DetectChanges(context.Background(), source, "")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected detect changes error: %v", err)
|
||||
}
|
||||
if !changed {
|
||||
t.Fatal("expected first detect changes call to report changed")
|
||||
}
|
||||
if hash1 == "" {
|
||||
t.Fatal("expected non-empty hash")
|
||||
}
|
||||
|
||||
changed, hash2, err := s.DetectChanges(context.Background(), source, hash1)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected detect changes error: %v", err)
|
||||
}
|
||||
if changed {
|
||||
t.Fatal("expected unchanged results with identical hash")
|
||||
}
|
||||
if hash2 != hash1 {
|
||||
t.Fatalf("expected identical hash, got %q and %q", hash1, hash2)
|
||||
}
|
||||
|
||||
resultPath = "/docs/two"
|
||||
changed, hash3, err := s.DetectChanges(context.Background(), source, hash1)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected detect changes error: %v", err)
|
||||
}
|
||||
if !changed {
|
||||
t.Fatal("expected changed results after search output changed")
|
||||
}
|
||||
if hash3 == hash1 {
|
||||
t.Fatal("expected hash to change")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,88 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Patterns used when normalizing document titles and content. They are
// compiled once at package initialization.
var (
	// titleNoiseRe matches "added in goX[.Y]" and "deprecated" markers,
	// case-insensitively, as whole words.
	titleNoiseRe = regexp.MustCompile(`(?i)\b(added in go[0-9]+(\.[0-9]+)?|deprecated)\b`)
	// titleSpaceRe matches any run of whitespace.
	titleSpaceRe = regexp.MustCompile(`\s+`)
	// contentSpaceRe matches spaces/tabs immediately before a newline.
	contentSpaceRe = regexp.MustCompile(`[ \t]+\n`)
	// multiNewlineRe matches three or more consecutive newlines.
	multiNewlineRe = regexp.MustCompile(`\n{3,}`)
	// nonPrintableTitle matches a single control character.
	nonPrintableTitle = regexp.MustCompile(`[[:cntrl:]]`)
)
|
||||
|
||||
// NormalizeDocuments applies normalization to a list of scraped documents.
|
||||
func NormalizeDocuments(docs []*Document) []*Document {
|
||||
for _, doc := range docs {
|
||||
NormalizeDocument(doc)
|
||||
}
|
||||
return docs
|
||||
}
|
||||
|
||||
// NormalizeDocument applies cross-scraper output cleanup.
|
||||
func NormalizeDocument(doc *Document) {
|
||||
if doc == nil {
|
||||
return
|
||||
}
|
||||
|
||||
doc.URL = strings.TrimSpace(doc.URL)
|
||||
doc.Type = strings.TrimSpace(doc.Type)
|
||||
doc.Title = normalizeTitle(doc.Title)
|
||||
doc.Content = normalizeContent(doc.Content)
|
||||
|
||||
if doc.Title == "" {
|
||||
doc.Title = inferTitleFromURL(doc.URL)
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeTitle(title string) string {
|
||||
title = strings.ReplaceAll(title, "¶", " ")
|
||||
title = strings.ReplaceAll(title, "_", " ")
|
||||
title = nonPrintableTitle.ReplaceAllString(title, " ")
|
||||
title = titleNoiseRe.ReplaceAllString(title, " ")
|
||||
title = titleSpaceRe.ReplaceAllString(strings.TrimSpace(title), " ")
|
||||
|
||||
// Remove dangling punctuation if it became a suffix after cleanup.
|
||||
title = strings.TrimSpace(strings.Trim(title, "-:.,;"))
|
||||
return title
|
||||
}
|
||||
|
||||
func normalizeContent(content string) string {
|
||||
content = strings.ReplaceAll(content, "\r\n", "\n")
|
||||
content = strings.TrimSpace(content)
|
||||
content = contentSpaceRe.ReplaceAllString(content, "\n")
|
||||
content = multiNewlineRe.ReplaceAllString(content, "\n\n")
|
||||
return content
|
||||
}
|
||||
|
||||
func inferTitleFromURL(rawURL string) string {
|
||||
if rawURL == "" {
|
||||
return "Documentation"
|
||||
}
|
||||
|
||||
u, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return "Documentation"
|
||||
}
|
||||
|
||||
base := path.Base(strings.Trim(u.Path, "/"))
|
||||
if base == "" || base == "." || base == "/" {
|
||||
if u.Host != "" {
|
||||
return u.Host
|
||||
}
|
||||
return "Documentation"
|
||||
}
|
||||
|
||||
base = strings.TrimSuffix(base, ".html")
|
||||
base = strings.ReplaceAll(base, "-", " ")
|
||||
base = strings.ReplaceAll(base, "_", " ")
|
||||
base = titleSpaceRe.ReplaceAllString(strings.TrimSpace(base), " ")
|
||||
if base == "" {
|
||||
return "Documentation"
|
||||
}
|
||||
return base
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
package scraper
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestNormalizeDocument_TitleCleanup(t *testing.T) {
|
||||
doc := &Document{
|
||||
Title: "http.type CloseNotifier ¶ deprecated added in go1.1",
|
||||
Content: "line 1 \n\n\nline 2",
|
||||
URL: "https://pkg.go.dev/net/http#CloseNotifier",
|
||||
}
|
||||
|
||||
NormalizeDocument(doc)
|
||||
|
||||
if doc.Title != "http.type CloseNotifier" {
|
||||
t.Fatalf("unexpected normalized title: %q", doc.Title)
|
||||
}
|
||||
if doc.Content != "line 1\n\nline 2" {
|
||||
t.Fatalf("unexpected normalized content: %q", doc.Content)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeDocument_InferTitle(t *testing.T) {
|
||||
doc := &Document{
|
||||
Title: "",
|
||||
URL: "https://kotlinlang.org/docs/regex.html",
|
||||
}
|
||||
|
||||
NormalizeDocument(doc)
|
||||
|
||||
if doc.Title != "regex" {
|
||||
t.Fatalf("expected inferred title 'regex', got %q", doc.Title)
|
||||
}
|
||||
}
|
||||
+316
-9
@@ -2,30 +2,337 @@ package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// OpenAPIScraper parses OpenAPI/Swagger specifications.
|
||||
type OpenAPIScraper struct {
|
||||
config *Config
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
// NewOpenAPIScraper creates a new OpenAPI scraper.
|
||||
func NewOpenAPIScraper(config *Config) *OpenAPIScraper {
|
||||
return &OpenAPIScraper{config: config}
|
||||
timeout := 30 * time.Second
|
||||
if config != nil && config.Timeout > 0 {
|
||||
timeout = config.Timeout
|
||||
}
|
||||
return &OpenAPIScraper{
|
||||
config: config,
|
||||
client: &http.Client{Timeout: timeout},
|
||||
}
|
||||
}
|
||||
|
||||
// Scrape fetches and parses an OpenAPI specification.
|
||||
func (s *OpenAPIScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
// TODO: Implement OpenAPI parsing
|
||||
// 1. Fetch spec from URL
|
||||
// 2. Parse endpoints, schemas, descriptions
|
||||
// 3. Create documents per endpoint
|
||||
// 4. Include authentication, parameters
|
||||
return nil, nil
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
raw, specURL, err := s.readSpec(ctx, source)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
spec, err := parseOpenAPISpec(raw)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docs := make([]*Document, 0)
|
||||
mainContent := buildMainSpecContent(spec)
|
||||
docs = append(docs, &Document{
|
||||
ID: generateDocID(specURL + "#openapi"),
|
||||
Source: coalesceSourceName(source.Name, "openapi"),
|
||||
Type: "openapi-spec",
|
||||
Title: spec.Info.Title,
|
||||
Content: mainContent,
|
||||
URL: specURL,
|
||||
Metadata: map[string]interface{}{
|
||||
"openapi": spec.Version,
|
||||
"servers": spec.Servers,
|
||||
},
|
||||
Hash: hashBytes(raw),
|
||||
Timestamp: time.Now(),
|
||||
})
|
||||
|
||||
paths := make([]string, 0, len(spec.Paths))
|
||||
for path := range spec.Paths {
|
||||
paths = append(paths, path)
|
||||
}
|
||||
sort.Strings(paths)
|
||||
|
||||
for _, p := range paths {
|
||||
opMap := spec.Paths[p]
|
||||
methods := make([]string, 0, len(opMap))
|
||||
for m := range opMap {
|
||||
methods = append(methods, strings.ToUpper(m))
|
||||
}
|
||||
sort.Strings(methods)
|
||||
|
||||
for _, method := range methods {
|
||||
op := opMap[strings.ToLower(method)]
|
||||
if op == nil {
|
||||
continue
|
||||
}
|
||||
title := strings.TrimSpace(op.Summary)
|
||||
if title == "" {
|
||||
title = fmt.Sprintf("%s %s", method, p)
|
||||
}
|
||||
content := buildOperationContent(method, p, op)
|
||||
docURL := fmt.Sprintf("%s#%s-%s", specURL, strings.ToLower(method), sanitizeFragment(p))
|
||||
docs = append(docs, &Document{
|
||||
ID: generateDocID(docURL),
|
||||
Source: coalesceSourceName(source.Name, "openapi"),
|
||||
Type: "openapi-operation",
|
||||
Title: title,
|
||||
Content: content,
|
||||
URL: docURL,
|
||||
Metadata: map[string]interface{}{
|
||||
"method": method,
|
||||
"path": p,
|
||||
"operation_id": op.OperationID,
|
||||
},
|
||||
Hash: hashString(content),
|
||||
Timestamp: time.Now(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return docs, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if the spec has been updated.
|
||||
func (s *OpenAPIScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
// TODO: Check spec content hash
|
||||
return false, "", nil
|
||||
if source == nil {
|
||||
return false, "", fmt.Errorf("source is required")
|
||||
}
|
||||
raw, _, err := s.readSpec(ctx, source)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
hash := hashBytes(raw)
|
||||
return hash != lastHash, hash, nil
|
||||
}
|
||||
|
||||
func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte, string, error) {
|
||||
rawPath := strings.TrimSpace(source.URL)
|
||||
if rawPath == "" {
|
||||
rawPath = strings.TrimSpace(source.Path)
|
||||
}
|
||||
if rawPath == "" {
|
||||
return nil, "", fmt.Errorf("openapi source requires url or path")
|
||||
}
|
||||
|
||||
if strings.HasPrefix(rawPath, "http://") || strings.HasPrefix(rawPath, "https://") {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawPath, nil)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
if s.config != nil && strings.TrimSpace(s.config.UserAgent) != "" {
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
}
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return nil, "", fmt.Errorf("openapi fetch failed: HTTP %d", resp.StatusCode)
|
||||
}
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 10<<20))
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
return body, rawPath, nil
|
||||
}
|
||||
|
||||
b, err := os.ReadFile(rawPath)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
return b, "file://" + rawPath, nil
|
||||
}
|
||||
|
||||
// Minimal model of an OpenAPI/Swagger document. Only the fields used to
// build documents are declared; all other keys are ignored when decoding.
type (
	// openAPISpec is the document root.
	openAPISpec struct {
		Version string               `json:"openapi" yaml:"openapi"`
		Swagger string               `json:"swagger" yaml:"swagger"`
		Info    openAPIInfo          `json:"info" yaml:"info"`
		Servers []openAPIServer      `json:"servers" yaml:"servers"`
		Paths   map[string]pathItems `json:"paths" yaml:"paths"`
	}

	// openAPIInfo is the "info" object.
	openAPIInfo struct {
		Title       string `json:"title" yaml:"title"`
		Version     string `json:"version" yaml:"version"`
		Description string `json:"description" yaml:"description"`
	}

	// openAPIServer is one entry of the "servers" list.
	openAPIServer struct {
		URL         string `json:"url" yaml:"url"`
		Description string `json:"description" yaml:"description"`
	}

	// pathItems maps the keys of a path entry (HTTP methods such as "get")
	// to operations.
	// NOTE(review): every key under a path is decoded as an operation, so
	// path-level keys such as "parameters" presumably fail to decode; confirm.
	pathItems map[string]*openAPIOperation

	// openAPIOperation describes a single operation on a path.
	openAPIOperation struct {
		Summary     string                `json:"summary" yaml:"summary"`
		Description string                `json:"description" yaml:"description"`
		OperationID string                `json:"operationId" yaml:"operationId"`
		Parameters  []openAPIParameter    `json:"parameters" yaml:"parameters"`
		Responses   map[string]response   `json:"responses" yaml:"responses"`
		RequestBody map[string]any        `json:"requestBody" yaml:"requestBody"`
		Tags        []string              `json:"tags" yaml:"tags"`
		Deprecated  bool                  `json:"deprecated" yaml:"deprecated"`
		Security    []map[string][]string `json:"security" yaml:"security"`
	}

	// openAPIParameter is one entry of an operation's "parameters" list.
	openAPIParameter struct {
		Name        string `json:"name" yaml:"name"`
		In          string `json:"in" yaml:"in"`
		Description string `json:"description" yaml:"description"`
		Required    bool   `json:"required" yaml:"required"`
	}

	// response is one entry of an operation's "responses" map, keyed by
	// status code.
	response struct {
		Description string `json:"description" yaml:"description"`
	}
)
|
||||
|
||||
func parseOpenAPISpec(raw []byte) (*openAPISpec, error) {
|
||||
var spec openAPISpec
|
||||
if err := json.Unmarshal(raw, &spec); err != nil {
|
||||
if yamlErr := yaml.Unmarshal(raw, &spec); yamlErr != nil {
|
||||
return nil, fmt.Errorf("invalid openapi content: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
if strings.TrimSpace(spec.Info.Title) == "" {
|
||||
spec.Info.Title = "OpenAPI Specification"
|
||||
}
|
||||
if strings.TrimSpace(spec.Version) == "" {
|
||||
spec.Version = spec.Swagger
|
||||
}
|
||||
if spec.Paths == nil {
|
||||
spec.Paths = map[string]pathItems{}
|
||||
}
|
||||
|
||||
return &spec, nil
|
||||
}
|
||||
|
||||
func buildMainSpecContent(spec *openAPISpec) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "# %s\n\n", spec.Info.Title)
|
||||
if spec.Info.Version != "" {
|
||||
fmt.Fprintf(&b, "- API Version: %s\n", spec.Info.Version)
|
||||
}
|
||||
if spec.Version != "" {
|
||||
fmt.Fprintf(&b, "- OpenAPI: %s\n", spec.Version)
|
||||
}
|
||||
fmt.Fprintf(&b, "- Paths: %d\n", len(spec.Paths))
|
||||
if spec.Info.Description != "" {
|
||||
fmt.Fprintf(&b, "\n%s\n", strings.TrimSpace(spec.Info.Description))
|
||||
}
|
||||
if len(spec.Servers) > 0 {
|
||||
fmt.Fprintf(&b, "\n## Servers\n")
|
||||
for _, s := range spec.Servers {
|
||||
fmt.Fprintf(&b, "- %s", s.URL)
|
||||
if s.Description != "" {
|
||||
fmt.Fprintf(&b, " - %s", s.Description)
|
||||
}
|
||||
fmt.Fprintln(&b)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func buildOperationContent(method, path string, op *openAPIOperation) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "# %s %s\n\n", method, path)
|
||||
if op.Summary != "" {
|
||||
fmt.Fprintf(&b, "%s\n\n", strings.TrimSpace(op.Summary))
|
||||
}
|
||||
if op.Description != "" {
|
||||
fmt.Fprintf(&b, "%s\n\n", strings.TrimSpace(op.Description))
|
||||
}
|
||||
if op.OperationID != "" {
|
||||
fmt.Fprintf(&b, "- Operation ID: `%s`\n", op.OperationID)
|
||||
}
|
||||
if len(op.Tags) > 0 {
|
||||
fmt.Fprintf(&b, "- Tags: %s\n", strings.Join(op.Tags, ", "))
|
||||
}
|
||||
if op.Deprecated {
|
||||
fmt.Fprintln(&b, "- Deprecated: true")
|
||||
}
|
||||
if len(op.Parameters) > 0 {
|
||||
fmt.Fprintln(&b, "\n## Parameters")
|
||||
for _, p := range op.Parameters {
|
||||
req := "optional"
|
||||
if p.Required {
|
||||
req = "required"
|
||||
}
|
||||
fmt.Fprintf(&b, "- `%s` (%s, %s)", p.Name, p.In, req)
|
||||
if p.Description != "" {
|
||||
fmt.Fprintf(&b, ": %s", strings.TrimSpace(p.Description))
|
||||
}
|
||||
fmt.Fprintln(&b)
|
||||
}
|
||||
}
|
||||
if len(op.Responses) > 0 {
|
||||
codes := make([]string, 0, len(op.Responses))
|
||||
for code := range op.Responses {
|
||||
codes = append(codes, code)
|
||||
}
|
||||
sort.Strings(codes)
|
||||
fmt.Fprintln(&b, "\n## Responses")
|
||||
for _, code := range codes {
|
||||
resp := op.Responses[code]
|
||||
fmt.Fprintf(&b, "- `%s`", code)
|
||||
if resp.Description != "" {
|
||||
fmt.Fprintf(&b, ": %s", strings.TrimSpace(resp.Description))
|
||||
}
|
||||
fmt.Fprintln(&b)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// sanitizeFragment turns an API path into a URL-fragment slug: lower-cased,
// "/" replaced by "-", braces removed, and leading/trailing dashes trimmed.
// An empty result becomes "root".
func sanitizeFragment(path string) string {
	slug := strings.NewReplacer("/", "-", "{", "", "}", "").Replace(strings.ToLower(path))
	if slug = strings.Trim(slug, "-"); slug != "" {
		return slug
	}
	return "root"
}
|
||||
|
||||
// hashBytes returns the lower-case hex encoding of the SHA-256 digest of b.
func hashBytes(b []byte) string {
	hasher := sha256.New()
	hasher.Write(b)
	return hex.EncodeToString(hasher.Sum(nil))
}
|
||||
|
||||
// hashString returns the lower-case hex encoding of the SHA-256 digest of s.
func hashString(s string) string {
	hasher := sha256.New()
	hasher.Write([]byte(s))
	return hex.EncodeToString(hasher.Sum(nil))
}
|
||||
|
||||
// coalesceSourceName returns name unchanged unless it is blank (empty or
// whitespace-only), in which case fallback is returned.
func coalesceSourceName(name, fallback string) string {
	if strings.TrimSpace(name) == "" {
		return fallback
	}
	return name
}
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestOpenAPIScraperScrape(t *testing.T) {
|
||||
spec := `{
|
||||
"openapi": "3.0.0",
|
||||
"info": {"title": "Pet API", "version": "1.0.0"},
|
||||
"paths": {
|
||||
"/pets": {
|
||||
"get": {
|
||||
"summary": "List pets",
|
||||
"operationId": "listPets",
|
||||
"responses": {"200": {"description": "ok"}}
|
||||
}
|
||||
}
|
||||
}
|
||||
}`
|
||||
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte(spec))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewOpenAPIScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
|
||||
docs, err := s.Scrape(context.Background(), &Source{Name: "pet", Type: SourceTypeOpenAPI, URL: srv.URL})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) < 2 {
|
||||
t.Fatalf("expected at least 2 docs, got %d", len(docs))
|
||||
}
|
||||
foundOp := false
|
||||
for _, d := range docs {
|
||||
if strings.Contains(d.Title, "List pets") {
|
||||
foundOp = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundOp {
|
||||
t.Fatal("expected operation document")
|
||||
}
|
||||
}
|
||||
|
||||
func TestOpenAPIScraperDetectChanges(t *testing.T) {
|
||||
spec := `{"openapi":"3.0.0","info":{"title":"API"},"paths":{}}`
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte(spec))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewOpenAPIScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
|
||||
src := &Source{Name: "api", Type: SourceTypeOpenAPI, URL: srv.URL}
|
||||
changed, hash1, err := s.DetectChanges(context.Background(), src, "")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !changed || hash1 == "" {
|
||||
t.Fatalf("expected changed=true and non-empty hash, changed=%v hash=%q", changed, hash1)
|
||||
}
|
||||
|
||||
changed, _, err = s.DetectChanges(context.Background(), src, hash1)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if changed {
|
||||
t.Fatal("expected no changes when hash matches")
|
||||
}
|
||||
}
|
||||
@@ -5,6 +5,7 @@ func init() {
|
||||
// Additional scrapers can be registered in their own packages
|
||||
RegisterScraper(SourceTypeWeb, func(c *Config) Scraper { return NewWebScraper(c) })
|
||||
RegisterScraper(SourceTypeLocal, func(c *Config) Scraper { return NewLocalScraper(c) })
|
||||
RegisterScraper(SourceTypeLocalSearch, func(c *Config) Scraper { return NewLocalSearchScraper(c) })
|
||||
RegisterScraper(SourceTypeGitHub, func(c *Config) Scraper { return NewGitHubScraper(c) })
|
||||
RegisterScraper(SourceTypeOpenAPI, func(c *Config) Scraper { return NewOpenAPIScraper(c) })
|
||||
}
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
package scraper_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
basescraper "github.com/yourorg/devour/internal/scraper"
|
||||
_ "github.com/yourorg/devour/internal/scraper/external"
|
||||
)
|
||||
|
||||
func TestLanguageScrapersAreRegistered(t *testing.T) {
|
||||
config := &basescraper.Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
}
|
||||
|
||||
supportedDocTypes := []basescraper.SourceType{
|
||||
basescraper.SourceTypeGoDocs,
|
||||
basescraper.SourceTypeRustDocs,
|
||||
basescraper.SourceTypePythonDocs,
|
||||
basescraper.SourceTypeJavaDocs,
|
||||
basescraper.SourceTypeSpringDocs,
|
||||
basescraper.SourceTypeTSDocs,
|
||||
basescraper.SourceTypeReactDocs,
|
||||
basescraper.SourceTypeVueDocs,
|
||||
basescraper.SourceTypeNuxtDocs,
|
||||
basescraper.SourceTypeMCPDocs,
|
||||
basescraper.SourceTypeDockerDocs,
|
||||
basescraper.SourceTypeCloudflareDocs,
|
||||
basescraper.SourceTypeAstroDocs,
|
||||
}
|
||||
|
||||
for _, sourceType := range supportedDocTypes {
|
||||
t.Run(string(sourceType), func(t *testing.T) {
|
||||
s := basescraper.NewScraper(sourceType, config)
|
||||
if s == nil {
|
||||
t.Fatalf("NewScraper(%q) returned nil; scraper was not registered", sourceType)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetectSourceType_ForSupportedDocsHosts(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
expected basescraper.SourceType
|
||||
}{
|
||||
{"https://pkg.go.dev/net/http", basescraper.SourceTypeGoDocs},
|
||||
{"https://docs.rs/tokio/latest/tokio/", basescraper.SourceTypeRustDocs},
|
||||
{"https://docs.python.org/3/library/asyncio.html", basescraper.SourceTypePythonDocs},
|
||||
{"https://docs.oracle.com/javase/8/docs/api/java/util/List.html", basescraper.SourceTypeJavaDocs},
|
||||
{"https://docs.spring.io/spring-boot/docs/current/reference/htmlsingle/", basescraper.SourceTypeSpringDocs},
|
||||
{"https://www.typescriptlang.org/docs/handbook/2/basic-types.html", basescraper.SourceTypeTSDocs},
|
||||
{"https://react.dev/reference/react/hooks", basescraper.SourceTypeReactDocs},
|
||||
{"https://vuejs.org/guide/introduction.html", basescraper.SourceTypeVueDocs},
|
||||
{"https://nuxt.com/docs/guide/directory-structure", basescraper.SourceTypeNuxtDocs},
|
||||
{"https://docs.docker.com/compose", basescraper.SourceTypeDockerDocs},
|
||||
{"https://hub.docker.com/mcp/server/github", basescraper.SourceTypeMCPDocs},
|
||||
{"https://developers.cloudflare.com/workers", basescraper.SourceTypeCloudflareDocs},
|
||||
{"https://docs.astro.build/en/guides/components/", basescraper.SourceTypeAstroDocs},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.input, func(t *testing.T) {
|
||||
got := basescraper.DetectSourceType(tt.input)
|
||||
if got != tt.expected {
|
||||
t.Fatalf("DetectSourceType(%q) = %q, want %q", tt.input, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -28,7 +28,7 @@ func (r *ScraperRegistry) Register(sourceType SourceType, constructor ScraperCon
|
||||
// Create creates a scraper instance
|
||||
func (r *ScraperRegistry) Create(sourceType SourceType, config *Config) Scraper {
|
||||
if constructor, exists := r.constructors[sourceType]; exists {
|
||||
return constructor(config)
|
||||
return wrapScraper(constructor(config))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ const (
|
||||
SourceTypeGitHub SourceType = "github"
|
||||
SourceTypeOpenAPI SourceType = "openapi"
|
||||
SourceTypeLocal SourceType = "local"
|
||||
SourceTypeLocalSearch SourceType = "localsearch"
|
||||
SourceTypeGoDocs SourceType = "godocs"
|
||||
SourceTypeRustDocs SourceType = "rustdocs"
|
||||
SourceTypePythonDocs SourceType = "pythondocs"
|
||||
@@ -34,15 +35,18 @@ const (
|
||||
|
||||
// Source represents a documentation source to scrape.
|
||||
type Source struct {
|
||||
Name string `yaml:"name"`
|
||||
Type SourceType `yaml:"type"`
|
||||
URL string `yaml:"url,omitempty"`
|
||||
Repo string `yaml:"repo,omitempty"`
|
||||
Branch string `yaml:"branch,omitempty"`
|
||||
Path string `yaml:"path,omitempty"`
|
||||
Include []string `yaml:"include,omitempty"`
|
||||
Exclude []string `yaml:"exclude,omitempty"`
|
||||
Schedule string `yaml:"schedule,omitempty"`
|
||||
Name string `yaml:"name"`
|
||||
Type SourceType `yaml:"type"`
|
||||
URL string `yaml:"url,omitempty"`
|
||||
Query string `yaml:"query,omitempty"`
|
||||
ResultLimit int `yaml:"result_limit,omitempty"`
|
||||
Domains []string `yaml:"domains,omitempty"`
|
||||
Repo string `yaml:"repo,omitempty"`
|
||||
Branch string `yaml:"branch,omitempty"`
|
||||
Path string `yaml:"path,omitempty"`
|
||||
Include []string `yaml:"include,omitempty"`
|
||||
Exclude []string `yaml:"exclude,omitempty"`
|
||||
Schedule string `yaml:"schedule,omitempty"`
|
||||
}
|
||||
|
||||
// Document represents a scraped document.
|
||||
@@ -113,6 +117,11 @@ func DetectSourceType(input string) SourceType {
|
||||
}
|
||||
}
|
||||
|
||||
// MCP servers are hosted under Docker Hub paths.
|
||||
if strings.Contains(input, "hub.docker.com/mcp/") {
|
||||
return SourceTypeMCPDocs
|
||||
}
|
||||
|
||||
// Check for OpenAPI specs
|
||||
if strings.HasSuffix(input, ".json") || strings.HasSuffix(input, ".yaml") || strings.HasSuffix(input, ".yml") {
|
||||
if strings.Contains(strings.ToLower(input), "openapi") || strings.Contains(strings.ToLower(input), "swagger") {
|
||||
|
||||
+191
-7
@@ -6,8 +6,10 @@ import (
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/gocolly/colly/v2"
|
||||
@@ -27,17 +29,42 @@ func NewWebScraper(config *Config) *WebScraper {
|
||||
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
visited := make(map[string]bool)
|
||||
scheduled := make(map[string]bool)
|
||||
contentHashes := make(map[string]bool)
|
||||
var mu sync.Mutex
|
||||
var scrapeErrors []string
|
||||
|
||||
// Parse base URL for domain restrictions
|
||||
baseURL, err := url.Parse(source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid URL: %w", err)
|
||||
}
|
||||
allowedDomain := baseURL.Hostname()
|
||||
if allowedDomain == "" {
|
||||
allowedDomain = baseURL.Host
|
||||
}
|
||||
|
||||
maxDepth := s.config.MaxDepth
|
||||
if maxDepth <= 0 {
|
||||
maxDepth = 2
|
||||
}
|
||||
maxPages := s.config.Concurrency * 40
|
||||
if maxPages < 20 {
|
||||
maxPages = 20
|
||||
}
|
||||
if maxDepth <= 1 && maxPages > 30 {
|
||||
maxPages = 30
|
||||
}
|
||||
if maxPages > 300 {
|
||||
maxPages = 300
|
||||
}
|
||||
scopePrefix := pathScopePrefix(baseURL.Path)
|
||||
scopeLeaf := pathScopeLeaf(baseURL.Path)
|
||||
|
||||
// Create Colly collector
|
||||
c := colly.NewCollector(
|
||||
colly.AllowedDomains(baseURL.Host),
|
||||
colly.MaxDepth(s.config.MaxDepth),
|
||||
colly.AllowedDomains(allowedDomain),
|
||||
colly.MaxDepth(maxDepth),
|
||||
colly.Async(true),
|
||||
colly.UserAgent(s.config.UserAgent),
|
||||
)
|
||||
@@ -65,18 +92,40 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
|
||||
// Handle errors
|
||||
c.OnError(func(r *colly.Response, err error) {
|
||||
fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
|
||||
errText := strings.ToLower(err.Error())
|
||||
if strings.Contains(errText, "already visited") {
|
||||
return
|
||||
}
|
||||
reqURL := source.URL
|
||||
if r != nil && r.Request != nil && r.Request.URL != nil {
|
||||
reqURL = r.Request.URL.String()
|
||||
}
|
||||
mu.Lock()
|
||||
if len(scrapeErrors) < 20 {
|
||||
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err))
|
||||
}
|
||||
mu.Unlock()
|
||||
})
|
||||
|
||||
// Extract content from pages
|
||||
c.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
pageURL := e.Request.URL.String()
|
||||
if !withinScope(e.Request.URL, baseURL, scopePrefix, scopeLeaf) {
|
||||
return
|
||||
}
|
||||
|
||||
// Skip if already visited
|
||||
mu.Lock()
|
||||
if visited[pageURL] {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
if len(visited) >= maxPages {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
visited[pageURL] = true
|
||||
mu.Unlock()
|
||||
|
||||
// Check include/exclude patterns
|
||||
if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
|
||||
@@ -99,6 +148,13 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
|
||||
// Generate hash for change detection
|
||||
hash := s.generateHash(content)
|
||||
mu.Lock()
|
||||
if contentHashes[hash] {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
contentHashes[hash] = true
|
||||
mu.Unlock()
|
||||
|
||||
// Extract metadata
|
||||
metadata := map[string]interface{}{
|
||||
@@ -120,30 +176,74 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
documents = append(documents, doc)
|
||||
mu.Unlock()
|
||||
})
|
||||
|
||||
// Follow links
|
||||
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||
link := e.Attr("href")
|
||||
absoluteURL := e.Request.AbsoluteURL(link)
|
||||
|
||||
// Skip if already visited
|
||||
if visited[absoluteURL] {
|
||||
if absoluteURL == "" {
|
||||
return
|
||||
}
|
||||
|
||||
linkURL, err := url.Parse(absoluteURL)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if linkURL.Scheme != "http" && linkURL.Scheme != "https" {
|
||||
return
|
||||
}
|
||||
if !withinScope(linkURL, baseURL, scopePrefix, scopeLeaf) {
|
||||
return
|
||||
}
|
||||
|
||||
// Skip if already visited
|
||||
mu.Lock()
|
||||
if visited[absoluteURL] {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
if len(visited) >= maxPages {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
mu.Unlock()
|
||||
|
||||
// Check include/exclude patterns
|
||||
if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
|
||||
return
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
if scheduled[absoluteURL] {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
if len(scheduled) >= maxPages {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
scheduled[absoluteURL] = true
|
||||
mu.Unlock()
|
||||
|
||||
if err := c.Visit(absoluteURL); err != nil {
|
||||
fmt.Printf("Error visiting %s: %v\n", absoluteURL, err)
|
||||
errText := strings.ToLower(err.Error())
|
||||
if strings.Contains(errText, "already visited") {
|
||||
return
|
||||
}
|
||||
mu.Lock()
|
||||
if len(scrapeErrors) < 20 {
|
||||
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err))
|
||||
}
|
||||
mu.Unlock()
|
||||
}
|
||||
})
|
||||
|
||||
// Start scraping
|
||||
scheduled[source.URL] = true
|
||||
if err := c.Visit(source.URL); err != nil {
|
||||
return nil, fmt.Errorf("failed to start scraping: %w", err)
|
||||
}
|
||||
@@ -151,6 +251,16 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
// Wait for async scraping to complete
|
||||
c.Wait()
|
||||
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
|
||||
if len(documents) == 0 {
|
||||
if len(scrapeErrors) > 0 {
|
||||
return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; "))
|
||||
}
|
||||
return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
@@ -283,6 +393,25 @@ func (s *WebScraper) generateHash(content string) string {
|
||||
|
||||
// cleanText removes extra whitespace and normalizes text.
|
||||
func cleanText(text string) string {
|
||||
noisePhrases := []string{
|
||||
"table of contents",
|
||||
"in this article",
|
||||
"additional resources",
|
||||
"feedback",
|
||||
"collaborate with us on github",
|
||||
"copyright",
|
||||
"all rights reserved",
|
||||
"privacy policy",
|
||||
"terms of service",
|
||||
"sign in",
|
||||
"skip to main content",
|
||||
"ask learn",
|
||||
}
|
||||
for _, phrase := range noisePhrases {
|
||||
re := regexp.MustCompile(`(?i)` + regexp.QuoteMeta(phrase))
|
||||
text = re.ReplaceAllString(text, " ")
|
||||
}
|
||||
|
||||
// Replace multiple whitespace with single space
|
||||
re := regexp.MustCompile(`\s+`)
|
||||
text = re.ReplaceAllString(text, " ")
|
||||
@@ -292,3 +421,58 @@ func cleanText(text string) string {
|
||||
|
||||
return text
|
||||
}
|
||||
|
||||
// pathScopePrefix derives the path prefix used to keep a crawl near its
// starting URL.
//
// The path is normalized with path.Clean first. An empty or root path yields
// "" (no path restriction). A single-segment path such as "/intro.html" or
// "/docs" is returned as-is, so the crawl stays scoped to that path instead of
// widening to the whole site. Any deeper path yields its parent directory,
// e.g. "/docs/guide/intro.html" -> "/docs/guide".
func pathScopePrefix(rawPath string) string {
	clean := path.Clean(rawPath)
	// path.Clean never returns "", it returns "." for an empty path.
	if clean == "." || clean == "/" {
		return ""
	}

	// Whether or not the last segment looks like a file name, the scope is the
	// parent directory, unless that parent is the site root.
	if dir := path.Dir(clean); dir != "/" {
		return dir
	}
	return clean
}
|
||||
|
||||
// pathScopeLeaf returns the final segment of rawPath when that segment
// contains a dot (for example "Regexp.html"), and "" otherwise. Empty and
// root paths always yield "".
func pathScopeLeaf(rawPath string) string {
	switch cleaned := path.Clean(rawPath); cleaned {
	case ".", "/", "":
		return ""
	default:
		if segment := path.Base(cleaned); strings.Contains(segment, ".") {
			return segment
		}
		return ""
	}
}
|
||||
|
||||
// withinScope reports whether target may be crawled given the starting URL
// base, the path prefix computed for it, and an optional leaf file name.
//
// A target is in scope when its hostname equals base's hostname
// (case-insensitively) and one of the following holds:
//   - prefix is empty (no path restriction);
//   - the target path equals prefix or lies beneath it on a path-segment
//     boundary, so "/docs" admits "/docs/x" but not "/docs-old/x";
//   - leaf is non-empty and equals the last segment of the target path.
//
// A nil target or base is never in scope.
func withinScope(target, base *url.URL, prefix, leaf string) bool {
	if target == nil || base == nil {
		return false
	}
	if !strings.EqualFold(target.Hostname(), base.Hostname()) {
		return false
	}
	if prefix == "" {
		return true
	}

	targetPath := target.Path
	if targetPath == "" {
		targetPath = "/"
	}

	// Compare on a segment boundary; a bare HasPrefix would let sibling
	// directories that merely share leading characters slip into scope.
	if targetPath == prefix || strings.HasPrefix(targetPath, strings.TrimSuffix(prefix, "/")+"/") {
		return true
	}
	return leaf != "" && path.Base(targetPath) == leaf
}
|
||||
|
||||
@@ -0,0 +1,132 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestWebScraper_ReturnsErrorWhenNothingFetched(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.NotFound(w, r)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewWebScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
Concurrency: 1,
|
||||
MaxDepth: 1,
|
||||
})
|
||||
|
||||
_, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "missing",
|
||||
Type: SourceTypeWeb,
|
||||
URL: srv.URL + "/missing",
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected error when web scrape yields no documents")
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>Empty</title></head><body><p>tiny</p></body></html>`))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewWebScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
Concurrency: 1,
|
||||
MaxDepth: 1,
|
||||
})
|
||||
|
||||
_, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "empty",
|
||||
Type: SourceTypeWeb,
|
||||
URL: srv.URL,
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected error when page has no extractable docs")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "extracted no documents") {
|
||||
t.Fatalf("unexpected error message: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebScraper_AllowsRedirectedDocumentPath(t *testing.T) {
|
||||
content := strings.Repeat("ruby docs content ", 30)
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch r.URL.Path {
|
||||
case "/core/Regexp.html":
|
||||
http.Redirect(w, r, "/3.4.1/Regexp.html", http.StatusFound)
|
||||
case "/3.4.1/Regexp.html":
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>Regexp</title></head><body><main>` + content + `</main></body></html>`))
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewWebScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
Concurrency: 1,
|
||||
MaxDepth: 1,
|
||||
})
|
||||
|
||||
docs, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "ruby",
|
||||
Type: SourceTypeWeb,
|
||||
URL: srv.URL + "/core/Regexp.html",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
t.Fatal("expected redirected page to be scraped")
|
||||
}
|
||||
if !strings.Contains(docs[0].URL, "/3.4.1/Regexp.html") {
|
||||
t.Fatalf("expected final redirected URL, got %q", docs[0].URL)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebScraper_GlobalWrapperNormalizesOutput(t *testing.T) {
|
||||
content := strings.Repeat("docs content ", 20)
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>Regex Guide ¶ deprecated</title></head><body><main>` + content + `</main></body></html>`))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewScraper(SourceTypeWeb, &Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
Concurrency: 1,
|
||||
MaxDepth: 1,
|
||||
})
|
||||
if s == nil {
|
||||
t.Fatal("expected web scraper")
|
||||
}
|
||||
|
||||
docs, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "test",
|
||||
Type: SourceTypeWeb,
|
||||
URL: srv.URL,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
t.Fatal("expected at least one document")
|
||||
}
|
||||
if docs[0].Title != "Regex Guide" {
|
||||
t.Fatalf("expected normalized title, got %q", docs[0].Title)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,98 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// wrappedScraper adds retry and normalization behavior for all scraper implementations.
|
||||
type wrappedScraper struct {
|
||||
inner Scraper
|
||||
}
|
||||
|
||||
func wrapScraper(inner Scraper) Scraper {
|
||||
if inner == nil {
|
||||
return nil
|
||||
}
|
||||
return &wrappedScraper{inner: inner}
|
||||
}
|
||||
|
||||
func (w *wrappedScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
retries := 2
|
||||
delay := 300 * time.Millisecond
|
||||
|
||||
var lastErr error
|
||||
for attempt := 0; attempt <= retries; attempt++ {
|
||||
docs, err := w.inner.Scrape(ctx, source)
|
||||
if err == nil {
|
||||
return NormalizeDocuments(docs), nil
|
||||
}
|
||||
lastErr = err
|
||||
|
||||
// One fallback: add trailing slash for doc sites when URL path looks page-like.
|
||||
if attempt == 0 && strings.HasPrefix(source.URL, "http") && !strings.HasSuffix(source.URL, "/") {
|
||||
alt := *source
|
||||
alt.URL = source.URL + "/"
|
||||
docs, altErr := w.inner.Scrape(ctx, &alt)
|
||||
if altErr == nil {
|
||||
return NormalizeDocuments(docs), nil
|
||||
}
|
||||
}
|
||||
|
||||
if attempt < retries && isRetriableScrapeError(err) {
|
||||
if !sleepWithContext(ctx, delay) {
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
|
||||
return nil, lastErr
|
||||
}
|
||||
|
||||
func (w *wrappedScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
return w.inner.DetectChanges(ctx, source, lastHash)
|
||||
}
|
||||
|
||||
// isRetriableScrapeError reports whether err is worth retrying.
//
// An error qualifies when its lower-cased message contains one of the known
// transient markers (timeouts, resets, EOF, HTTP 429/500/502/503/504), or when
// any error in its chain implements net.Error. A nil error is not retriable.
func isRetriableScrapeError(err error) bool {
	if err == nil {
		return false
	}

	message := strings.ToLower(err.Error())
	for _, marker := range []string{
		"timeout",
		"temporarily unavailable",
		"connection reset",
		"eof",
		"http 429",
		"http 500",
		"http 502",
		"http 503",
		"http 504",
	} {
		if strings.Contains(message, marker) {
			return true
		}
	}

	// No textual marker matched: fall back to the error's type.
	var asNetErr net.Error
	return errors.As(err, &asNetErr)
}
|
||||
|
||||
// sleepWithContext pauses for d unless ctx finishes first. It returns true
// when the full duration elapsed and false when ctx was done before that.
func sleepWithContext(ctx context.Context, d time.Duration) bool {
	timer := time.NewTimer(d)
	// Release the timer promptly if the context wins the race.
	defer timer.Stop()

	select {
	case <-timer.C:
		return true
	case <-ctx.Done():
		return false
	}
}
|
||||
@@ -0,0 +1,45 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// flakyStubScraper is a test double whose first Scrape call can be made to
// fail, for exercising the retry path of wrappedScraper.
type flakyStubScraper struct {
	// calls counts how many times Scrape has been invoked.
	calls int
	// failFirst makes the first Scrape call return an error when true.
	failFirst bool
}
|
||||
|
||||
func (f *flakyStubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
f.calls++
|
||||
if f.failFirst && f.calls == 1 {
|
||||
return nil, fmt.Errorf("HTTP 503")
|
||||
}
|
||||
return []*Document{
|
||||
{
|
||||
Title: "Example ¶ deprecated",
|
||||
Content: "ok",
|
||||
URL: source.URL,
|
||||
Type: "test",
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (f *flakyStubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
return true, "hash", nil
|
||||
}
|
||||
|
||||
func TestWrappedScraper_RetriesAndNormalizes(t *testing.T) {
|
||||
w := wrapScraper(&flakyStubScraper{failFirst: true})
|
||||
docs, err := w.Scrape(context.Background(), &Source{URL: "https://example.com"})
|
||||
if err != nil {
|
||||
t.Fatalf("expected retry to succeed, got error: %v", err)
|
||||
}
|
||||
if len(docs) != 1 {
|
||||
t.Fatalf("expected 1 document, got %d", len(docs))
|
||||
}
|
||||
if docs[0].Title != "Example" {
|
||||
t.Fatalf("expected normalized title, got %q", docs[0].Title)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user