update

2026-07-29 07:33:48 +00:00 · 2026-02-24 10:33:59 +01:00
parent 409acd2e08
commit 898a3c303f
1374 changed files with 290408 additions and 29186 deletions
@@ -0,0 +1,45 @@
+package scraper
+
+import basescraper "github.com/yourorg/devour/internal/scraper"
+
+// init calls basescraper.RegisterScraper once per documentation source type,
+// handing it a factory that builds the matching scraper from a Config.
+func init() {
+	register := basescraper.RegisterScraper
+
+	register(basescraper.SourceTypeGoDocs, func(cfg *basescraper.Config) basescraper.Scraper { return NewGoDocsScraper(cfg) })
+	register(basescraper.SourceTypeRustDocs, func(cfg *basescraper.Config) basescraper.Scraper { return NewRustDocsScraper(cfg) })
+	register(basescraper.SourceTypePythonDocs, func(cfg *basescraper.Config) basescraper.Scraper { return NewPythonDocsScraper(cfg) })
+	register(basescraper.SourceTypeJavaDocs, func(cfg *basescraper.Config) basescraper.Scraper { return NewJavaDocsScraper(cfg) })
+	register(basescraper.SourceTypeSpringDocs, func(cfg *basescraper.Config) basescraper.Scraper { return NewSpringDocsScraper(cfg) })
+	register(basescraper.SourceTypeTSDocs, func(cfg *basescraper.Config) basescraper.Scraper { return NewTSDocsScraper(cfg) })
+	register(basescraper.SourceTypeReactDocs, func(cfg *basescraper.Config) basescraper.Scraper { return NewReactDocsScraper(cfg) })
+	register(basescraper.SourceTypeVueDocs, func(cfg *basescraper.Config) basescraper.Scraper { return NewVueDocsScraper(cfg) })
+	register(basescraper.SourceTypeNuxtDocs, func(cfg *basescraper.Config) basescraper.Scraper { return NewNuxtDocsScraper(cfg) })
+	register(basescraper.SourceTypeMCPDocs, func(cfg *basescraper.Config) basescraper.Scraper { return NewMCPDocsScraper(cfg) })
+	register(basescraper.SourceTypeDockerDocs, func(cfg *basescraper.Config) basescraper.Scraper { return NewDockerDocsScraper(cfg) })
+	register(basescraper.SourceTypeCloudflareDocs, func(cfg *basescraper.Config) basescraper.Scraper { return NewCloudflareDocsScraper(cfg) })
+	register(basescraper.SourceTypeAstroDocs, func(cfg *basescraper.Config) basescraper.Scraper { return NewAstroDocsScraper(cfg) })
+}
@@ -155,16 +155,18 @@ func (s *TSDocsScraper) interfaceToDocument(iface *tsdocs.Interface, module *tsd
 	metadata := map[string]interface{}{
 		"module":  module.Name,
 		"name":    iface.Name,
-		"doc_url": iface.DocURL,
+		"doc_url": coalesceDocURL(iface.DocURL, module.DocURL),
 	}

+	docURL := coalesceDocURL(iface.DocURL, module.DocURL)
+
 	return &Document{
-		ID:        generateDocID(iface.DocURL),
+		ID:        generateDocID(docURL),
 		Source:    sourceName,
 		Type:      "ts-interface",
 		Title:     iface.Name,
 		Content:   content.String(),
-		URL:       iface.DocURL,
+		URL:       docURL,
 		Metadata:  metadata,
 		Hash:      s.generateHash(content.String()),
 		Timestamp: time.Now(),
@@ -185,16 +187,18 @@ func (s *TSDocsScraper) functionToDocument(fn *tsdocs.Function, module *tsdocs.M
 		"module":      module.Name,
 		"name":        fn.Name,
 		"return_type": fn.ReturnType,
-		"doc_url":     fn.DocURL,
+		"doc_url":     coalesceDocURL(fn.DocURL, module.DocURL),
 	}

+	docURL := coalesceDocURL(fn.DocURL, module.DocURL)
+
 	return &Document{
-		ID:        generateDocID(fn.DocURL),
+		ID:        generateDocID(docURL),
 		Source:    sourceName,
 		Type:      "ts-function",
 		Title:     fn.Name,
 		Content:   content.String(),
-		URL:       fn.DocURL,
+		URL:       docURL,
 		Metadata:  metadata,
 		Hash:      s.generateHash(content.String()),
 		Timestamp: time.Now(),
@@ -217,16 +221,18 @@ func (s *TSDocsScraper) classToDocument(class *tsdocs.Class, module *tsdocs.Modu
 	metadata := map[string]interface{}{
 		"module":  module.Name,
 		"name":    class.Name,
-		"doc_url": class.DocURL,
+		"doc_url": coalesceDocURL(class.DocURL, module.DocURL),
 	}

+	docURL := coalesceDocURL(class.DocURL, module.DocURL)
+
 	return &Document{
-		ID:        generateDocID(class.DocURL),
+		ID:        generateDocID(docURL),
 		Source:    sourceName,
 		Type:      "ts-class",
 		Title:     class.Name,
 		Content:   content.String(),
-		URL:       class.DocURL,
+		URL:       docURL,
 		Metadata:  metadata,
 		Hash:      s.generateHash(content.String()),
 		Timestamp: time.Now(),
@@ -244,18 +250,27 @@ func (s *TSDocsScraper) typeAliasToDocument(ta *tsdocs.TypeAlias, module *tsdocs
 	metadata := map[string]interface{}{
 		"module":  module.Name,
 		"name":    ta.Name,
-		"doc_url": ta.DocURL,
+		"doc_url": coalesceDocURL(ta.DocURL, module.DocURL),
 	}

+	docURL := coalesceDocURL(ta.DocURL, module.DocURL)
+
 	return &Document{
-		ID:        generateDocID(ta.DocURL),
+		ID:        generateDocID(docURL),
 		Source:    sourceName,
 		Type:      "ts-type",
 		Title:     ta.Name,
 		Content:   content.String(),
-		URL:       ta.DocURL,
+		URL:       docURL,
 		Metadata:  metadata,
 		Hash:      s.generateHash(content.String()),
 		Timestamp: time.Now(),
 	}
 }
+
+// coalesceDocURL returns fallback when primary is empty or whitespace-only,
+// and primary otherwise. The selected value is returned unmodified.
+func coalesceDocURL(primary, fallback string) string {
+	if blank := strings.TrimSpace(primary) == ""; blank {
+		return fallback
+	}
+	return primary
+}
@@ -0,0 +1,65 @@
+package scraper
+
+import (
+	"testing"
+
+	"github.com/yourorg/devour/pkg/tsdocs"
+)
+
+// TestTSDocsSubDocsFallbackToModuleURL checks that interface, function, class
+// and type-alias documents built from entries with an empty DocURL take the
+// module's DocURL for their URL, their metadata doc_url and their derived ID.
+func TestTSDocsSubDocsFallbackToModuleURL(t *testing.T) {
+	s := &TSDocsScraper{}
+	module := &tsdocs.Module{
+		Name:   "Module",
+		DocURL: "https://www.typescriptlang.org/docs/handbook/2/basic-types.html",
+	}
+
+	cases := []struct {
+		name    string
+		build   func() *Document
+		docType string
+	}{
+		{
+			name: "interface",
+			build: func() *Document {
+				return s.interfaceToDocument(&tsdocs.Interface{Name: "User", DocURL: ""}, module, "ts")
+			},
+			docType: "ts-interface",
+		},
+		{
+			name: "function",
+			build: func() *Document {
+				return s.functionToDocument(&tsdocs.Function{Name: "parse", DocURL: ""}, module, "ts")
+			},
+			docType: "ts-function",
+		},
+		{
+			name: "class",
+			build: func() *Document {
+				return s.classToDocument(&tsdocs.Class{Name: "Service", DocURL: ""}, module, "ts")
+			},
+			docType: "ts-class",
+		},
+		{
+			name: "type alias",
+			build: func() *Document {
+				return s.typeAliasToDocument(&tsdocs.TypeAlias{Name: "ID", Type: "string", DocURL: ""}, module, "ts")
+			},
+			docType: "ts-type",
+		},
+	}
+
+	// The ID is derived from the document URL, so it must follow the fallback too.
+	wantID := generateDocID(module.DocURL)
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			doc := tc.build()
+			if doc == nil {
+				t.Fatal("expected a document, got nil")
+			}
+			if doc.URL != module.DocURL {
+				t.Fatalf("expected fallback URL %q, got %q", module.DocURL, doc.URL)
+			}
+			if got := doc.Metadata["doc_url"]; got != module.DocURL {
+				t.Fatalf("expected metadata doc_url %q, got %#v", module.DocURL, got)
+			}
+			if doc.ID != wantID {
+				t.Fatalf("expected ID %q derived from fallback URL, got %q", wantID, doc.ID)
+			}
+			if doc.Type != tc.docType {
+				t.Fatalf("expected doc type %q, got %q", tc.docType, doc.Type)
+			}
+		})
+	}
+}
@@ -0,0 +1,21 @@
+package scraper
+
+import (
+	"crypto/sha256"
+	"encoding/hex"
+
+	basescraper "github.com/yourorg/devour/internal/scraper"
+)
+
+// SourceType is an alias for basescraper.SourceType, so code in this package
+// can refer to the type without the import qualifier.
+type SourceType = basescraper.SourceType
+
+// Source is an alias for basescraper.Source.
+type Source = basescraper.Source
+
+// Document is an alias for basescraper.Document.
+type Document = basescraper.Document
+
+// Config is an alias for basescraper.Config.
+type Config = basescraper.Config
+
+func generateDocID(urlStr string) string {
+	hash := sha256.Sum256([]byte(urlStr))
+	return hex.EncodeToString(hash[:12])
+}
@@ -2,6 +2,12 @@ package scraper

 import (
 	"context"
+	"fmt"
+	"net/url"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
 )

 // GitHubScraper scrapes documentation from GitHub repositories.
@@ -16,16 +22,173 @@ func NewGitHubScraper(config *Config) *GitHubScraper {

 // Scrape clones and parses documents from a GitHub repository.
 func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
-	// TODO: Implement GitHub scraping
-	// 1. Clone repository (shallow)
-	// 2. Find markdown files in specified paths
-	// 3. Parse README, docs/, wiki
-	// 4. Extract code structure
-	return nil, nil
+	if source == nil {
+		return nil, fmt.Errorf("source is required")
+	}
+
+	repoURL, repoName, err := s.resolveRepo(source)
+	if err != nil {
+		return nil, err
+	}
+
+	// The clone lives in a throwaway directory that is removed on return.
+	tmpDir, err := os.MkdirTemp("", "devour-github-*")
+	if err != nil {
+		return nil, fmt.Errorf("creating clone directory: %w", err)
+	}
+	defer os.RemoveAll(tmpDir)
+
+	// Shallow, blob-less, sparse clone; --branch is added only when requested.
+	branch := strings.TrimSpace(source.Branch)
+	cloneArgs := []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse"}
+	if branch != "" {
+		cloneArgs = append(cloneArgs, "--branch", branch)
+	}
+	cloneArgs = append(cloneArgs, repoURL, tmpDir)
+
+	output, err := exec.CommandContext(ctx, "git", cloneArgs...).CombinedOutput()
+	if err != nil {
+		// %w keeps the underlying error reachable through errors.Is / errors.As.
+		return nil, fmt.Errorf("git clone failed: %w (%s)", err, strings.TrimSpace(string(output)))
+	}
+
+	// disableSparse switches the clone to a full checkout. It is best effort:
+	// its error is deliberately ignored and scraping continues with whatever
+	// files are present.
+	disableSparse := func() {
+		_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
+	}
+
+	useDefaultLayout := len(source.Include) == 0
+	if useDefaultLayout {
+		// Try sparse checkout for common docs locations to reduce clone and parse cost.
+		sparse := exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "set", "--skip-checks",
+			"docs", "doc", "src/routes", "website/docs", "packages/*/docs")
+		if sparseErr := sparse.Run(); sparseErr != nil {
+			disableSparse()
+		}
+	} else {
+		// Caller-supplied include patterns may point anywhere in the tree.
+		disableSparse()
+	}
+
+	// Delegate the actual parsing to the local scraper, pointed at the clone.
+	// Include/Exclude are copied so the caller's slices are never aliased.
+	localSource := &Source{
+		Name:     coalesce(source.Name, repoName),
+		Type:     SourceTypeLocal,
+		Path:     tmpDir,
+		Include:  append([]string(nil), source.Include...),
+		Exclude:  append([]string(nil), source.Exclude...),
+		Schedule: source.Schedule,
+	}
+	if useDefaultLayout {
+		localSource.Include = []string{
+			`(?i)(^|/)readme\.md$`,
+			`(?i)(^|/)docs?/`,
+			`(?i)\.md$`,
+			`(?i)\.mdx$`,
+		}
+	}
+
+	local := NewLocalScraper(s.config)
+	docs, err := local.Scrape(ctx, localSource)
+	if err != nil {
+		return nil, err
+	}
+	if len(docs) == 0 && useDefaultLayout {
+		// Sparse patterns did not match this repository layout; retry full checkout.
+		disableSparse()
+		docs, err = local.Scrape(ctx, localSource)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	// Blob URLs use the requested branch, or HEAD when none was given.
+	branchForURL := branch
+	if branchForURL == "" {
+		branchForURL = "HEAD"
+	}
+
+	for _, doc := range docs {
+		if doc == nil {
+			continue
+		}
+		if doc.Metadata == nil {
+			doc.Metadata = map[string]interface{}{}
+		}
+		// Rewrite the temp-dir file path into a repository-relative path and a
+		// github.com blob URL; the ID is re-derived from that URL.
+		if rawPath, ok := doc.Metadata["path"].(string); ok {
+			if relPath, relErr := filepath.Rel(tmpDir, rawPath); relErr == nil {
+				relPath = strings.TrimPrefix(filepath.ToSlash(relPath), "./")
+				if relPath != "" && relPath != "." {
+					doc.URL = fmt.Sprintf("https://github.com/%s/blob/%s/%s", repoName, branchForURL, relPath)
+					doc.ID = generateDocID(doc.URL)
+					doc.Metadata["path"] = relPath
+				}
+			}
+		}
+		doc.Type = "github-document"
+		doc.Metadata["repo"] = repoName
+		doc.Metadata["repo_url"] = repoURL
+		doc.Metadata["source_type"] = "github"
+	}
+	return docs, nil
 }

 // DetectChanges checks if the repository has new commits.
 func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
-	// TODO: Check latest commit hash
-	return false, "", nil
+	if source == nil {
+		return false, "", fmt.Errorf("source is required")
+	}
+	// Use the same remote URL that Scrape clones from.
+	repoURL, _, err := s.resolveRepo(source)
+	if err != nil {
+		return false, "", err
+	}
+
+	branch := strings.TrimSpace(source.Branch)
+	if branch == "" {
+		branch = "HEAD"
+	}
+
+	// "--" ends option parsing, so a branch value that begins with "-" is
+	// treated as a ref pattern rather than as a git option.
+	cmd := exec.CommandContext(ctx, "git", "ls-remote", "--", repoURL, branch)
+	output, err := cmd.Output()
+	if err != nil {
+		return false, "", fmt.Errorf("git ls-remote %s: %w", repoURL, err)
+	}
+
+	// The first whitespace-separated field of the output is the object hash of
+	// the first matching ref.
+	fields := strings.Fields(string(output))
+	if len(fields) == 0 {
+		return false, "", fmt.Errorf("empty ls-remote output")
+	}
+	hash := fields[0]
+	return hash != lastHash, hash, nil
+}
+
+// resolveRepo derives the clone URL and the "owner/name" slug for source.
+//
+// source.Repo, when non-blank, is used as the slug directly (surrounding
+// slashes and a trailing ".git" are removed). Otherwise source.URL must be a
+// URL whose host is github.com or a subdomain of it, and whose path starts
+// with "/owner/name"; any further path segments are ignored.
+//
+// The returned repoURL is always "https://github.com/<owner>/<name>.git".
+func (s *GitHubScraper) resolveRepo(source *Source) (repoURL string, repoName string, err error) {
+	if strings.TrimSpace(source.Repo) != "" {
+		repoName = strings.Trim(strings.TrimSpace(source.Repo), "/")
+		repoName = strings.TrimSuffix(repoName, ".git")
+		return "https://github.com/" + repoName + ".git", repoName, nil
+	}
+
+	raw := strings.TrimSpace(source.URL)
+	if raw == "" {
+		return "", "", fmt.Errorf("github source requires repo or url")
+	}
+
+	u, err := url.Parse(raw)
+	if err != nil {
+		return "", "", err
+	}
+	// Compare the hostname exactly (or as a parent domain). A substring match
+	// would also accept hosts such as "evilgithub.com" or
+	// "github.com.example.org".
+	host := strings.ToLower(u.Hostname())
+	if host != "github.com" && !strings.HasSuffix(host, ".github.com") {
+		return "", "", fmt.Errorf("not a github url: %s", raw)
+	}
+	parts := strings.Split(strings.Trim(u.Path, "/"), "/")
+	if len(parts) < 2 {
+		return "", "", fmt.Errorf("invalid github repo url: %s", raw)
+	}
+	owner, name := parts[0], strings.TrimSuffix(parts[1], ".git")
+	if owner == "" || name == "" {
+		return "", "", fmt.Errorf("invalid github repo url: %s", raw)
+	}
+	repoName = owner + "/" + name
+	repoURL = "https://github.com/" + repoName + ".git"
+	return repoURL, repoName, nil
+}
+
+// coalesce picks a source name: primary when it is not blank, otherwise the
+// last path element of fallback, otherwise the literal "github".
+func coalesce(primary, fallback string) string {
+	switch {
+	case strings.TrimSpace(primary) != "":
+		return primary
+	case strings.TrimSpace(fallback) != "":
+		return filepath.Base(fallback)
+	default:
+		return "github"
+	}
 }
@@ -2,6 +2,20 @@ package scraper

 import (
 	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strings"
+	"time"
+)
+
+var (
+	// reLocalBlankLines matches runs of three or more consecutive newlines.
+	reLocalBlankLines  = regexp.MustCompile(`\n{3,}`)
+	// reFrontMatterBlock matches a block delimited by "---" lines that sits at
+	// the very start of the text, including the newline after the closing line.
+	reFrontMatterBlock = regexp.MustCompile(`(?s)\A---\s*\n.*?\n---\s*\n`)
 )

 // LocalScraper scrapes documentation from local filesystem.
@@ -16,16 +30,221 @@ func NewLocalScraper(config *Config) *LocalScraper {

 // Scrape scans and parses documents from a local directory.
 func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
-	// TODO: Implement local scraping
-	// 1. Walk directory tree
-	// 2. Filter by include/exclude patterns
-	// 3. Parse markdown, text, code files
-	// 4. Extract structure and content
-	return nil, nil
+	if source == nil {
+		return nil, fmt.Errorf("source is required")
+	}
+
+	// source.Path wins; source.URL is only consulted when Path is blank.
+	root := strings.TrimSpace(source.Path)
+	if root == "" {
+		root = strings.TrimSpace(source.URL)
+	}
+	if root == "" {
+		return nil, fmt.Errorf("path or url is required for local source")
+	}
+
+	info, err := os.Stat(root)
+	if err != nil {
+		return nil, err
+	}
+
+	if !info.IsDir() {
+		// A single file is converted directly, without include/exclude or
+		// extension filtering.
+		doc, err := s.fileToDocument(root, source)
+		if err != nil {
+			return nil, err
+		}
+		return []*Document{doc}, nil
+	}
+
+	docs := make([]*Document, 0)
+	web := NewWebScraper(s.config)
+	err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
+		if d.IsDir() {
+			// The skip list applies to nested directories only. Applying it to
+			// the root would silently yield nothing for a source rooted at
+			// ".", "build", "dist", "vendor" or a dot-directory.
+			if path == root {
+				return nil
+			}
+			name := d.Name()
+			if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
+				return filepath.SkipDir
+			}
+			return nil
+		}
+
+		// Include/exclude patterns are matched against the slash-separated
+		// path relative to root.
+		relPath := path
+		if rel, relErr := filepath.Rel(root, path); relErr == nil {
+			relPath = rel
+		}
+		normalized := filepath.ToSlash(relPath)
+		if !web.shouldInclude(normalized, source.Include, source.Exclude) {
+			return nil
+		}
+		if !isDocumentationFile(path) {
+			return nil
+		}
+
+		doc, err := s.fileToDocument(path, source)
+		if err != nil {
+			// Best effort: a file that cannot be read or is empty is skipped
+			// rather than failing the whole walk.
+			return nil
+		}
+		docs = append(docs, doc)
+		return nil
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	return docs, nil
 }

 // DetectChanges checks if files have been modified.
 func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
-	// TODO: Check file modification times
-	return false, "", nil
+	if source == nil {
+		return false, "", fmt.Errorf("source is required")
+	}
+
+	// source.Path wins; source.URL is only consulted when Path is blank.
+	root := strings.TrimSpace(source.Path)
+	if root == "" {
+		root = strings.TrimSpace(source.URL)
+	}
+	if root == "" {
+		return false, "", fmt.Errorf("path or url is required for local source")
+	}
+
+	// The fingerprint is a SHA-256 over "path|size|mtime" lines, one per
+	// documentation file, in walk order. File contents are not read.
+	h := sha256.New()
+	err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+		// Stop promptly when the caller cancels.
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
+		if d.IsDir() {
+			// The skip list applies to nested directories only; the root is
+			// always walked, whatever it is called.
+			if path == root {
+				return nil
+			}
+			name := d.Name()
+			if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
+				return filepath.SkipDir
+			}
+			return nil
+		}
+		if !isDocumentationFile(path) {
+			return nil
+		}
+
+		info, infoErr := d.Info()
+		if infoErr != nil {
+			return infoErr
+		}
+		fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
+		return nil
+	})
+	if err != nil {
+		return false, "", err
+	}
+
+	hash := hex.EncodeToString(h.Sum(nil))
+	return hash != lastHash, hash, nil
+}
+
+func (s *LocalScraper) fileToDocument(path string, source *Source) (*Document, error) {
+	b, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+
+	ext := strings.ToLower(filepath.Ext(path))
+	content := normalizeLocalContent(string(b), ext)
+	if content == "" {
+		return nil, fmt.Errorf("empty file")
+	}
+
+	title := strings.TrimSuffix(filepath.Base(path), filepath.Ext(path))
+	hash := sha256.Sum256(b)
+	uri := "file://" + filepath.ToSlash(path)
+
+	docType := "local-document"
+	switch ext {
+	case ".md", ".mdx":
+		docType = "local-markdown"
+	case ".txt":
+		docType = "local-text"
+	case ".json", ".yaml", ".yml":
+		docType = "local-data"
+	case ".go", ".js", ".ts", ".tsx", ".py", ".java", ".rs", ".rb", ".php":
+		docType = "local-code"
+	}
+
+	name := source.Name
+	if strings.TrimSpace(name) == "" {
+		name = filepath.Base(filepath.Dir(path))
+	}
+
+	return &Document{
+		ID:      generateDocID(uri),
+		Source:  name,
+		Type:    docType,
+		Title:   title,
+		Content: content,
+		URL:     uri,
+		Metadata: map[string]interface{}{
+			"path": path,
+			"size": len(b),
+		},
+		Hash:      hex.EncodeToString(hash[:]),
+		Timestamp: time.Now(),
+	}, nil
+}
+
+// normalizeLocalContent trims content and, for ".md" and ".mdx" files, strips
+// a leading front-matter block and any leading import/export lines. Runs of
+// three or more newlines are then collapsed to two. Blank input yields "".
+func normalizeLocalContent(content, ext string) string {
+	text := strings.TrimSpace(content)
+	if text == "" {
+		return ""
+	}
+
+	if ext == ".md" || ext == ".mdx" {
+		text = stripMDXPreamble(stripMarkdownFrontmatter(text))
+	}
+
+	// Collapse excessive blank lines to reduce indexing noise.
+	collapsed := reLocalBlankLines.ReplaceAllString(text, "\n\n")
+	return strings.TrimSpace(collapsed)
+}
+
+// stripMarkdownFrontmatter removes a front-matter block from the start of
+// content. Content that does not begin with a "---" line is returned as is.
+func stripMarkdownFrontmatter(content string) string {
+	opensWithFence := strings.HasPrefix(content, "---\n") || strings.HasPrefix(content, "---\r\n")
+	if opensWithFence {
+		return reFrontMatterBlock.ReplaceAllString(content, "")
+	}
+	return content
+}
+
+// stripMDXPreamble drops the leading run of blank lines and lines that start
+// with "import " or "export " (after trimming), returning everything from the
+// first other line onwards. If every line is skippable the result is "".
+func stripMDXPreamble(content string) string {
+	lines := strings.Split(content, "\n")
+	start := len(lines)
+	for idx, raw := range lines {
+		trimmed := strings.TrimSpace(raw)
+		skippable := trimmed == "" ||
+			strings.HasPrefix(trimmed, "import ") ||
+			strings.HasPrefix(trimmed, "export ")
+		if !skippable {
+			start = idx
+			break
+		}
+	}
+	return strings.Join(lines[start:], "\n")
+}
+
+// isDocumentationFile reports whether path has one of the extensions that the
+// local scraper indexes. The comparison is case-insensitive.
+func isDocumentationFile(path string) bool {
+	switch strings.ToLower(filepath.Ext(path)) {
+	case ".md", ".mdx", ".txt", ".rst", ".adoc", ".json", ".yaml", ".yml", ".go", ".py", ".js", ".ts", ".tsx", ".java", ".rs", ".rb", ".php", ".html":
+		return true
+	}
+	return false
 }
@@ -0,0 +1,102 @@
+package scraper
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+)
+
+// TestLocalScraperScrapeDirectory scrapes a temp directory holding a markdown
+// file, a text file and a binary file, and expects at least two documents.
+func TestLocalScraperScrapeDirectory(t *testing.T) {
+	tmp := t.TempDir()
+	fixtures := []struct {
+		name string
+		data []byte
+	}{
+		{"README.md", []byte("# Demo\n\nhello docs")},
+		{"notes.txt", []byte("notes")},
+		{"bin.bin", []byte{0x00, 0x01}},
+	}
+	for _, f := range fixtures {
+		if err := os.WriteFile(filepath.Join(tmp, f.name), f.data, 0o644); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	scraper := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
+	src := &Source{Name: "local", Type: SourceTypeLocal, Path: tmp}
+
+	docs, err := scraper.Scrape(context.Background(), src)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if got := len(docs); got < 2 {
+		t.Fatalf("expected at least 2 docs, got %d", got)
+	}
+}
+
+// TestLocalScraperDetectChanges checks that the first DetectChanges call
+// reports a change with a non-empty hash, and that rewriting the file yields
+// a different hash on the next call.
+func TestLocalScraperDetectChanges(t *testing.T) {
+	tmp := t.TempDir()
+	file := filepath.Join(tmp, "README.md")
+	if err := os.WriteFile(file, []byte("v1"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
+	src := &Source{Name: "local", Type: SourceTypeLocal, Path: tmp}
+
+	changed, hash1, err := s.DetectChanges(context.Background(), src, "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !changed || hash1 == "" {
+		t.Fatalf("expected first detect to change with non-empty hash, changed=%v hash=%q", changed, hash1)
+	}
+
+	if err := os.WriteFile(file, []byte("v2"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	// "v1" and "v2" have the same size, so only the modification time can
+	// distinguish them. Set it explicitly instead of sleeping and hoping the
+	// filesystem's timestamp granularity is fine enough.
+	bumped := time.Now().Add(2 * time.Second)
+	if err := os.Chtimes(file, bumped, bumped); err != nil {
+		t.Fatal(err)
+	}
+
+	changed, hash2, err := s.DetectChanges(context.Background(), src, hash1)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !changed {
+		t.Fatal("expected change after file update")
+	}
+	if hash1 == hash2 {
+		t.Fatal("expected hash to change")
+	}
+}
+
+// TestLocalScraper_StripsFrontmatterAndMDXPreamble checks that an .mdx file's
+// front matter and leading import/export lines are removed, so the document
+// content starts at the first heading and keeps the body text.
+func TestLocalScraper_StripsFrontmatterAndMDXPreamble(t *testing.T) {
+	tmp := t.TempDir()
+	path := filepath.Join(tmp, "doc.mdx")
+	content := `---
+title: My Doc
+slug: /my-doc
+---
+
+import { Component } from "x"
+export const meta = {}
+
+# Heading
+
+Actual documentation body.
+`
+	if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
+	doc, err := s.fileToDocument(path, &Source{Name: "local", Type: SourceTypeLocal, Path: tmp})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if strings.Contains(doc.Content, "slug: /my-doc") {
+		t.Fatalf("expected frontmatter to be stripped, got: %q", doc.Content)
+	}
+	if strings.Contains(doc.Content, "import { Component }") {
+		t.Fatalf("expected MDX preamble to be stripped, got: %q", doc.Content)
+	}
+	if strings.Contains(doc.Content, "export const meta") {
+		t.Fatalf("expected MDX export line to be stripped, got: %q", doc.Content)
+	}
+	if !strings.HasPrefix(doc.Content, "# Heading") {
+		t.Fatalf("expected content to start at the heading, got: %q", doc.Content)
+	}
+	if !strings.Contains(doc.Content, "Actual documentation body.") {
+		t.Fatalf("expected markdown body in content, got: %q", doc.Content)
+	}
+}
@@ -0,0 +1,402 @@
+package scraper
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"sort"
+	"strconv"
+	"strings"
+	"time"
+)
+
+const (
+	// defaultLocalSearchLimit is the result limit used when the configured
+	// limit is zero or negative.
+	defaultLocalSearchLimit = 8
+	// maxLocalSearchLimit is the upper bound applied to any configured limit.
+	maxLocalSearchLimit     = 50
+	// maxSearchResponseBytes caps how much of a search API response body is read.
+	maxSearchResponseBytes  = 2 << 20 // 2MB
+)
+
+// LocalSearchScraper scrapes docs from result URLs returned by a local search API.
+type LocalSearchScraper struct {
+	// config is the scraper's own copy of the configuration; its UserAgent is
+	// sent with search API requests.
+	config *Config
+	// client performs the search API requests.
+	client *http.Client
+	// web fetches each result page and evaluates include/exclude patterns.
+	web    *WebScraper
+}
+
+// localSearchResult is one entry decoded from the search API's "results" array.
+type localSearchResult struct {
+	// URL is the result's target, read from "url", "link" or "href".
+	URL     string
+	// Title is read from "title" or "name".
+	Title   string
+	// Snippet is read from "content", "snippet", "description" or "text".
+	Snippet string
+	// Engine is read from "engine" or "source".
+	Engine  string
+	// Score is read from "score" or "relevance"; 0 when absent or unparsable.
+	Score   float64
+}
+
+// NewLocalSearchScraper creates a scraper backed by a self-hosted search API.
+func NewLocalSearchScraper(config *Config) *LocalSearchScraper {
+	baseConfig := &Config{}
+	if config != nil {
+		*baseConfig = *config
+	}
+	if baseConfig.UserAgent == "" {
+		baseConfig.UserAgent = "Devour/1.0 (Local Search Scraper)"
+	}
+	if baseConfig.Timeout <= 0 {
+		baseConfig.Timeout = 30 * time.Second
+	}
+
+	webConfig := *baseConfig
+	webConfig.Concurrency = 1
+	webConfig.MaxDepth = 1
+
+	return &LocalSearchScraper{
+		config: baseConfig,
+		client: &http.Client{Timeout: baseConfig.Timeout},
+		web:    NewWebScraper(&webConfig),
+	}
+}
+
+// Scrape queries a local search API and scrapes the returned URLs.
+//
+// source.URL is the search endpoint and source.Query the search text; both
+// are required. Each result URL has its fragment removed and is then
+// de-duplicated and filtered by source.Domains and source.Include/Exclude
+// before the page is fetched with the embedded web scraper. Every resulting
+// document is annotated with search_* metadata. Pages that fail to scrape are
+// skipped; an error is returned only when no document could be produced or
+// the context is done.
+func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
+	if source == nil {
+		return nil, fmt.Errorf("source is required")
+	}
+	if strings.TrimSpace(source.URL) == "" {
+		return nil, fmt.Errorf("search API URL is required")
+	}
+	query := strings.TrimSpace(source.Query)
+	if query == "" {
+		return nil, fmt.Errorf("search query is required for localsearch sources (set source.query or --search-query)")
+	}
+
+	limit := clampLocalSearchLimit(source.ResultLimit)
+	results, err := s.search(ctx, source, query, limit)
+	if err != nil {
+		return nil, err
+	}
+
+	docs := make([]*Document, 0, limit)
+	seen := make(map[string]bool)
+	var scrapeErrors []string
+
+	for i, result := range results {
+		if ctx.Err() != nil {
+			return nil, ctx.Err()
+		}
+
+		resultURL := stripURLFragment(result.URL)
+		if resultURL == "" || seen[resultURL] {
+			continue
+		}
+		if !domainAllowed(resultURL, source.Domains) {
+			continue
+		}
+		if !s.web.shouldInclude(resultURL, source.Include, source.Exclude) {
+			continue
+		}
+		// Mark as seen only after the filters pass, and before scraping so a
+		// failing page is not retried when the same URL appears again.
+		seen[resultURL] = true
+
+		pageDocs, err := s.web.Scrape(ctx, &Source{
+			Name:    source.Name,
+			Type:    SourceTypeWeb,
+			URL:     resultURL,
+			Include: source.Include,
+			Exclude: source.Exclude,
+		})
+		if err != nil {
+			// Keep at most 20 messages so the final error stays readable.
+			if len(scrapeErrors) < 20 {
+				scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", resultURL, err))
+			}
+			continue
+		}
+
+		for _, doc := range pageDocs {
+			// Guard against nil entries, as the GitHub scraper does, instead
+			// of panicking on the Metadata access below.
+			if doc == nil {
+				continue
+			}
+			if doc.Metadata == nil {
+				doc.Metadata = make(map[string]interface{})
+			}
+			doc.Metadata["search_api"] = source.URL
+			doc.Metadata["search_query"] = query
+			// Rank is the 1-based position in the search response.
+			doc.Metadata["search_rank"] = i + 1
+			if result.Engine != "" {
+				doc.Metadata["search_engine"] = result.Engine
+			}
+			if result.Snippet != "" {
+				doc.Metadata["search_snippet"] = result.Snippet
+			}
+			if result.Score != 0 {
+				doc.Metadata["search_score"] = result.Score
+			}
+			// Fall back to the search result's title when the page has none.
+			if strings.TrimSpace(doc.Title) == "" && strings.TrimSpace(result.Title) != "" {
+				doc.Title = strings.TrimSpace(result.Title)
+			}
+
+			docs = append(docs, doc)
+		}
+	}
+
+	if len(docs) == 0 {
+		if len(scrapeErrors) > 0 {
+			return nil, fmt.Errorf("local search returned results but page scraping failed: %s", strings.Join(scrapeErrors, "; "))
+		}
+		return nil, fmt.Errorf("local search yielded no usable results for query %q", query)
+	}
+
+	return docs, nil
+}
+
+// DetectChanges checks if top search results changed.
+//
+// It runs the search, keeps the results that pass the domain and
+// include/exclude filters, and hashes their sorted "url|title|engine|score"
+// signatures with SHA-256. The hex digest is returned along with whether it
+// differs from lastHash.
+func (s *LocalSearchScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
+	if source == nil {
+		return false, "", fmt.Errorf("source is required")
+	}
+	query := strings.TrimSpace(source.Query)
+	if query == "" {
+		return false, "", fmt.Errorf("search query is required for localsearch sources")
+	}
+
+	results, err := s.search(ctx, source, query, clampLocalSearchLimit(source.ResultLimit))
+	if err != nil {
+		return false, "", err
+	}
+
+	signatures := make([]string, 0, len(results))
+	for _, result := range results {
+		target := stripURLFragment(result.URL)
+		if target == "" ||
+			!domainAllowed(target, source.Domains) ||
+			!s.web.shouldInclude(target, source.Include, source.Exclude) {
+			continue
+		}
+		signature := fmt.Sprintf("%s|%s|%s|%.6f", target, result.Title, result.Engine, result.Score)
+		signatures = append(signatures, signature)
+	}
+	// Sorting makes the digest independent of the order results arrive in.
+	sort.Strings(signatures)
+
+	digest := sha256.Sum256([]byte(strings.Join(signatures, "\n")))
+	currentHash := hex.EncodeToString(digest[:])
+	return currentHash != lastHash, currentHash, nil
+}
+
+// search sends a GET request to the search API at source.URL and returns at
+// most limit decoded results.
+//
+// At most maxSearchResponseBytes of the response body are read. A non-2xx
+// status, a body that cannot be decoded, and an empty result list are all
+// reported as errors.
+func (s *LocalSearchScraper) search(ctx context.Context, source *Source, query string, limit int) ([]localSearchResult, error) {
+	searchURL, err := buildLocalSearchURL(source.URL, query, limit)
+	if err != nil {
+		return nil, err
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to build search request: %w", err)
+	}
+	req.Header.Set("User-Agent", s.config.UserAgent)
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := s.client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("search API request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	// The body is read before the status check so that an error response can
+	// be quoted in the returned error.
+	body, err := io.ReadAll(io.LimitReader(resp.Body, maxSearchResponseBytes))
+	if err != nil {
+		return nil, fmt.Errorf("failed reading search API response: %w", err)
+	}
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		msg := strings.TrimSpace(string(body))
+		if len(msg) > 200 {
+			// Cutting at a byte offset can split a multi-byte character; drop
+			// any invalid bytes so the error message stays valid UTF-8.
+			msg = strings.ToValidUTF8(msg[:200], "")
+		}
+		return nil, fmt.Errorf("search API returned HTTP %d: %s", resp.StatusCode, msg)
+	}
+
+	results, err := decodeLocalSearchResults(body)
+	if err != nil {
+		return nil, err
+	}
+	if len(results) == 0 {
+		return nil, fmt.Errorf("search API returned no results")
+	}
+	// The API may ignore the requested limit, so enforce it here as well.
+	if len(results) > limit {
+		results = results[:limit]
+	}
+	return results, nil
+}
+
+// buildLocalSearchURL returns rawURL with the search parameters applied.
+//
+// "q" is always set to query. "format" (default "json") and "limit" (default
+// the clamped limit) are only set when rawURL does not already carry a
+// non-empty value for them. rawURL must be absolute (scheme and host).
+func buildLocalSearchURL(rawURL, query string, limit int) (string, error) {
+	endpoint, err := url.Parse(strings.TrimSpace(rawURL))
+	if err != nil {
+		return "", fmt.Errorf("invalid search API URL: %w", err)
+	}
+	if endpoint.Scheme == "" || endpoint.Host == "" {
+		return "", fmt.Errorf("invalid search API URL: expected absolute URL, got %q", rawURL)
+	}
+
+	values := endpoint.Query()
+	values.Set("q", query)
+	defaults := [...][2]string{
+		{"format", "json"},
+		{"limit", strconv.Itoa(clampLocalSearchLimit(limit))},
+	}
+	for _, kv := range defaults {
+		if values.Get(kv[0]) == "" {
+			values.Set(kv[0], kv[1])
+		}
+	}
+	endpoint.RawQuery = values.Encode()
+
+	return endpoint.String(), nil
+}
+
+// decodeLocalSearchResults parses a search API response body.
+//
+// The body must be a JSON object with a "results" array. Array entries that
+// are not objects, or that carry no URL under "url", "link" or "href", are
+// skipped. All string fields are whitespace-trimmed.
+func decodeLocalSearchResults(body []byte) ([]localSearchResult, error) {
+	var payload map[string]interface{}
+	if err := json.Unmarshal(body, &payload); err != nil {
+		return nil, fmt.Errorf("search API returned invalid JSON: %w", err)
+	}
+
+	rawResults, present := payload["results"]
+	if !present {
+		return nil, fmt.Errorf("search API response missing results field")
+	}
+	items, isArray := rawResults.([]interface{})
+	if !isArray {
+		return nil, fmt.Errorf("search API results field is not an array")
+	}
+
+	trimmed := func(record map[string]interface{}, keys ...string) string {
+		return strings.TrimSpace(pickString(record, keys...))
+	}
+
+	results := make([]localSearchResult, 0, len(items))
+	for _, item := range items {
+		record, isObject := item.(map[string]interface{})
+		if !isObject {
+			continue
+		}
+
+		target := trimmed(record, "url", "link", "href")
+		if target == "" {
+			continue
+		}
+
+		entry := localSearchResult{
+			URL:     target,
+			Title:   trimmed(record, "title", "name"),
+			Snippet: trimmed(record, "content", "snippet", "description", "text"),
+			Engine:  trimmed(record, "engine", "source"),
+			Score:   pickFloat(record, "score", "relevance"),
+		}
+		results = append(results, entry)
+	}
+
+	return results, nil
+}
+
+// pickString returns the value stored under the first of keys that holds a
+// usable string. Numeric values are formatted as decimal text. Keys that are
+// missing, hold an unsupported type, or hold an empty or whitespace-only
+// string are skipped, so later keys can act as fallbacks. It returns "" when
+// no key yields a value.
+func pickString(record map[string]interface{}, keys ...string) string {
+	for _, key := range keys {
+		value, ok := record[key]
+		if !ok {
+			continue
+		}
+		switch v := value.(type) {
+		case string:
+			// A blank value must not block the fallback keys, e.g.
+			// {"url": "", "link": "https://..."}.
+			if strings.TrimSpace(v) != "" {
+				return v
+			}
+		case json.Number:
+			return v.String()
+		case float64:
+			return strconv.FormatFloat(v, 'f', -1, 64)
+		case int:
+			return strconv.Itoa(v)
+		}
+	}
+	return ""
+}
+
+// pickFloat returns the first value among keys that can be read as a float64.
+// Numeric types are converted directly; json.Number and string values are
+// parsed. Keys that are missing or whose value cannot be converted are
+// skipped. It returns 0 when nothing matches.
+func pickFloat(record map[string]interface{}, keys ...string) float64 {
+	convert := func(value interface{}) (float64, bool) {
+		switch v := value.(type) {
+		case float64:
+			return v, true
+		case float32:
+			return float64(v), true
+		case int:
+			return float64(v), true
+		case int64:
+			return float64(v), true
+		case json.Number:
+			f, err := v.Float64()
+			return f, err == nil
+		case string:
+			f, err := strconv.ParseFloat(strings.TrimSpace(v), 64)
+			return f, err == nil
+		}
+		return 0, false
+	}
+
+	for _, key := range keys {
+		value, present := record[key]
+		if !present {
+			continue
+		}
+		if f, ok := convert(value); ok {
+			return f
+		}
+	}
+	return 0
+}
+
+// clampLocalSearchLimit maps limit into [1, maxLocalSearchLimit]: zero and
+// negative values become defaultLocalSearchLimit, and values above the
+// maximum are capped at it.
+func clampLocalSearchLimit(limit int) int {
+	switch {
+	case limit <= 0:
+		return defaultLocalSearchLimit
+	case limit > maxLocalSearchLimit:
+		return maxLocalSearchLimit
+	default:
+		return limit
+	}
+}
+
+// stripURLFragment returns raw, whitespace-trimmed, without its "#fragment"
+// part. Input that cannot be parsed as a URL is returned trimmed but
+// otherwise untouched.
+func stripURLFragment(raw string) string {
+	trimmed := strings.TrimSpace(raw)
+	parsed, err := url.Parse(trimmed)
+	if err != nil {
+		return trimmed
+	}
+	parsed.Fragment = ""
+	return parsed.String()
+}
+
+// domainAllowed reports whether raw's hostname equals, or is a subdomain of,
+// one of allowedDomains. An empty allow-list permits every URL; a URL that
+// cannot be parsed or has no hostname is rejected. Matching is
+// case-insensitive.
+func domainAllowed(raw string, allowedDomains []string) bool {
+	if len(allowedDomains) == 0 {
+		return true
+	}
+
+	parsed, err := url.Parse(raw)
+	if err != nil {
+		return false
+	}
+	host := strings.ToLower(strings.TrimSpace(parsed.Hostname()))
+	if host == "" {
+		return false
+	}
+
+	for _, candidate := range allowedDomains {
+		domain := normalizeDomain(candidate)
+		switch {
+		case domain == "":
+			// Blank allow-list entries never match.
+		case host == domain, strings.HasSuffix(host, "."+domain):
+			return true
+		}
+	}
+	return false
+}
+
+// normalizeDomain lower-cases and trims an allow-list entry. An entry written
+// as a URL (containing "://") is reduced to its hostname; otherwise a single
+// leading "." is removed. Blank input yields "".
+func normalizeDomain(raw string) string {
+	entry := strings.ToLower(strings.TrimSpace(raw))
+	if entry == "" {
+		return ""
+	}
+	if strings.Contains(entry, "://") {
+		if parsed, err := url.Parse(entry); err == nil {
+			return strings.ToLower(parsed.Hostname())
+		}
+	}
+	return strings.TrimPrefix(entry, ".")
+}
@@ -0,0 +1,226 @@
+package scraper
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"net/url"
+	"strings"
+	"testing"
+	"time"
+)
+
+// TestLocalSearchScraperScrape runs a scrape against an in-process server that
+// serves both the search endpoint and the single result page, then checks the
+// returned document's URL and search metadata.
+func TestLocalSearchScraperScrape(t *testing.T) {
+	mux := http.NewServeMux()
+	baseURL := ""
+
+	mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
+		// Handlers run on the server's goroutines, where t.Fatalf must not be
+		// called. Record the failure with t.Errorf and answer with an error
+		// status so the scrape itself fails visibly.
+		if got := r.URL.Query().Get("q"); got != "go http client" {
+			t.Errorf("expected query go http client, got %q", got)
+			http.Error(w, "unexpected query", http.StatusBadRequest)
+			return
+		}
+		if got := r.URL.Query().Get("format"); got != "json" {
+			t.Errorf("expected format=json, got %q", got)
+			http.Error(w, "unexpected format", http.StatusBadRequest)
+			return
+		}
+		err := json.NewEncoder(w).Encode(map[string]interface{}{
+			"results": []map[string]interface{}{
+				{
+					"url":     baseURL + "/docs/http-client",
+					"title":   "HTTP Client Guide",
+					"content": "How to build an HTTP client in Go",
+					"engine":  "searxng",
+					"score":   0.99,
+				},
+			},
+		})
+		if err != nil {
+			t.Errorf("encoding search response: %v", err)
+		}
+	})
+
+	mux.HandleFunc("/docs/http-client", func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		// Best-effort write: a failure here surfaces as a scrape error below.
+		_, _ = w.Write([]byte(`<html><head><title>HTTP Client Guide</title></head><body><main>` + strings.Repeat("http client docs ", 30) + `</main></body></html>`))
+	})
+
+	srv := httptest.NewServer(mux)
+	defer srv.Close()
+	// The handlers read baseURL only once requests arrive, i.e. after this.
+	baseURL = srv.URL
+
+	s := NewLocalSearchScraper(&Config{
+		UserAgent: "DevourTest/1.0",
+		Timeout:   2 * time.Second,
+	})
+
+	docs, err := s.Scrape(context.Background(), &Source{
+		Name:        "local-search",
+		Type:        SourceTypeLocalSearch,
+		URL:         srv.URL + "/search",
+		Query:       "go http client",
+		ResultLimit: 5,
+	})
+	if err != nil {
+		t.Fatalf("unexpected scrape error: %v", err)
+	}
+	if len(docs) == 0 {
+		t.Fatal("expected at least one document")
+	}
+
+	doc := docs[0]
+	if doc.URL != srv.URL+"/docs/http-client" {
+		t.Fatalf("unexpected document URL: %q", doc.URL)
+	}
+	if doc.Metadata["search_query"] != "go http client" {
+		t.Fatalf("expected metadata search_query, got %v", doc.Metadata["search_query"])
+	}
+	if doc.Metadata["search_engine"] != "searxng" {
+		t.Fatalf("expected metadata search_engine=searxng, got %v", doc.Metadata["search_engine"])
+	}
+}
+
+// TestLocalSearchScraperDomainFilter serves one result on the test server's
+// host and one on example.com, restricts the source to the test host, and
+// verifies that every returned document stays on that host.
+func TestLocalSearchScraperDomainFilter(t *testing.T) {
+	var baseURL string
+	mux := http.NewServeMux()
+
+	mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
+		results := []map[string]interface{}{
+			{"url": baseURL + "/docs/in-scope", "title": "In Scope"},
+			{"url": "https://example.com/out-of-scope", "title": "Out Scope"},
+		}
+		_ = json.NewEncoder(w).Encode(map[string]interface{}{"results": results})
+	})
+
+	mux.HandleFunc("/docs/in-scope", func(w http.ResponseWriter, r *http.Request) {
+		page := `<html><head><title>In Scope</title></head><body><main>` + strings.Repeat("scoped docs ", 30) + `</main></body></html>`
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write([]byte(page))
+	})
+
+	srv := httptest.NewServer(mux)
+	defer srv.Close()
+	baseURL = srv.URL
+
+	serverURL, err := url.Parse(srv.URL)
+	if err != nil {
+		t.Fatalf("failed to parse server URL: %v", err)
+	}
+	wantHost := serverURL.Hostname()
+
+	scraper := NewLocalSearchScraper(&Config{
+		UserAgent: "DevourTest/1.0",
+		Timeout:   2 * time.Second,
+	})
+	source := &Source{
+		Name:        "local-search",
+		Type:        SourceTypeLocalSearch,
+		URL:         srv.URL + "/search",
+		Query:       "scope test",
+		ResultLimit: 10,
+		Domains:     []string{wantHost},
+	}
+
+	docs, err := scraper.Scrape(context.Background(), source)
+	if err != nil {
+		t.Fatalf("unexpected scrape error: %v", err)
+	}
+	if len(docs) == 0 {
+		t.Fatal("expected at least one in-scope document")
+	}
+	for i := range docs {
+		got, parseErr := url.Parse(docs[i].URL)
+		if parseErr != nil {
+			t.Fatalf("invalid doc URL %q: %v", docs[i].URL, parseErr)
+		}
+		if got.Hostname() != wantHost {
+			t.Fatalf("expected only in-scope domain, got %q", docs[i].URL)
+		}
+	}
+}
+
+// TestLocalSearchScraperRequiresQuery checks that scraping a source with no
+// Query fails with an error that mentions "query".
+func TestLocalSearchScraperRequiresQuery(t *testing.T) {
+	scraper := NewLocalSearchScraper(&Config{
+		UserAgent: "DevourTest/1.0",
+		Timeout:   2 * time.Second,
+	})
+	source := &Source{
+		Name: "local-search",
+		Type: SourceTypeLocalSearch,
+		URL:  "http://127.0.0.1:8080/search",
+	}
+
+	_, err := scraper.Scrape(context.Background(), source)
+	switch {
+	case err == nil:
+		t.Fatal("expected error when query is missing")
+	case !strings.Contains(err.Error(), "query"):
+		t.Fatalf("unexpected error: %v", err)
+	}
+}
+
+// TestLocalSearchScraperDetectChanges calls DetectChanges three times: with no
+// previous hash (must report a change), with the hash just returned (must
+// report none), and after the search result URL changes (must report a change
+// with a different hash).
+func TestLocalSearchScraperDetectChanges(t *testing.T) {
+	var baseURL string
+	resultPath := "/docs/one"
+	mux := http.NewServeMux()
+
+	mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
+		results := []map[string]interface{}{
+			{"url": baseURL + resultPath, "title": "Versioned", "score": 1.0},
+		}
+		_ = json.NewEncoder(w).Encode(map[string]interface{}{"results": results})
+	})
+
+	srv := httptest.NewServer(mux)
+	defer srv.Close()
+	baseURL = srv.URL
+
+	scraper := NewLocalSearchScraper(&Config{
+		UserAgent: "DevourTest/1.0",
+		Timeout:   2 * time.Second,
+	})
+	src := &Source{
+		Name:        "local-search",
+		Type:        SourceTypeLocalSearch,
+		URL:         srv.URL + "/search",
+		Query:       "version test",
+		ResultLimit: 3,
+	}
+	ctx := context.Background()
+
+	// First call: no previous hash.
+	changed, hash1, err := scraper.DetectChanges(ctx, src, "")
+	if err != nil {
+		t.Fatalf("unexpected detect changes error: %v", err)
+	}
+	if !changed {
+		t.Fatal("expected first detect changes call to report changed")
+	}
+	if hash1 == "" {
+		t.Fatal("expected non-empty hash")
+	}
+
+	// Second call: same search output, previous hash supplied.
+	changed, hash2, err := scraper.DetectChanges(ctx, src, hash1)
+	if err != nil {
+		t.Fatalf("unexpected detect changes error: %v", err)
+	}
+	if changed {
+		t.Fatal("expected unchanged results with identical hash")
+	}
+	if hash2 != hash1 {
+		t.Fatalf("expected identical hash, got %q and %q", hash1, hash2)
+	}
+
+	// Third call: the search endpoint now returns a different result URL.
+	resultPath = "/docs/two"
+	changed, hash3, err := scraper.DetectChanges(ctx, src, hash1)
+	if err != nil {
+		t.Fatalf("unexpected detect changes error: %v", err)
+	}
+	if !changed {
+		t.Fatal("expected changed results after search output changed")
+	}
+	if hash3 == hash1 {
+		t.Fatal("expected hash to change")
+	}
+}
@@ -0,0 +1,88 @@
+package scraper
+
+import (
+	"net/url"
+	"path"
+	"regexp"
+	"strings"
+)
+
+// Patterns used by the normalization helpers in this file; compiled once at
+// package initialization.
+var (
+	// titleNoiseRe matches, case-insensitively and on word boundaries, either
+	// "added in go<N>" / "added in go<N>.<M>" or the word "deprecated".
+	titleNoiseRe      = regexp.MustCompile(`(?i)\b(added in go[0-9]+(\.[0-9]+)?|deprecated)\b`)
+	// titleSpaceRe matches a run of one or more whitespace characters.
+	titleSpaceRe      = regexp.MustCompile(`\s+`)
+	// contentSpaceRe matches spaces or tabs that sit directly before a newline.
+	contentSpaceRe    = regexp.MustCompile(`[ \t]+\n`)
+	// multiNewlineRe matches three or more consecutive newlines.
+	multiNewlineRe    = regexp.MustCompile(`\n{3,}`)
+	// nonPrintableTitle matches a single control character ([[:cntrl:]]).
+	nonPrintableTitle = regexp.MustCompile(`[[:cntrl:]]`)
+)
+
+// NormalizeDocuments runs NormalizeDocument on every element of docs, mutating
+// the documents in place, and returns the same slice for convenience.
+func NormalizeDocuments(docs []*Document) []*Document {
+	for i := range docs {
+		NormalizeDocument(docs[i])
+	}
+	return docs
+}
+
+// NormalizeDocument cleans a scraped document in place: URL and Type are
+// trimmed, Title and Content are normalized, and an empty Title is replaced by
+// one inferred from the trimmed URL. A nil doc is ignored.
+func NormalizeDocument(doc *Document) {
+	if doc == nil {
+		return
+	}
+
+	cleanURL := strings.TrimSpace(doc.URL)
+	title := normalizeTitle(doc.Title)
+	if title == "" {
+		title = inferTitleFromURL(cleanURL)
+	}
+
+	doc.URL = cleanURL
+	doc.Type = strings.TrimSpace(doc.Type)
+	doc.Title = title
+	doc.Content = normalizeContent(doc.Content)
+}
+
+// normalizeTitle strips decoration from a scraped heading. Pilcrows,
+// underscores, control characters and the phrases matched by titleNoiseRe
+// become spaces; whitespace runs collapse to one space; and any of "-:.,;"
+// left at either end of the result is trimmed away.
+func normalizeTitle(title string) string {
+	cleaned := title
+	for _, marker := range []string{"¶", "_"} {
+		cleaned = strings.ReplaceAll(cleaned, marker, " ")
+	}
+	cleaned = nonPrintableTitle.ReplaceAllString(cleaned, " ")
+	cleaned = titleNoiseRe.ReplaceAllString(cleaned, " ")
+	cleaned = strings.TrimSpace(cleaned)
+	cleaned = titleSpaceRe.ReplaceAllString(cleaned, " ")
+
+	// Punctuation can end up dangling once the noise is gone; Trim removes it
+	// from both ends, then any space it exposed is dropped.
+	cleaned = strings.Trim(cleaned, "-:.,;")
+	return strings.TrimSpace(cleaned)
+}
+
+// normalizeContent tidies body text: CRLF becomes LF, surrounding whitespace
+// is trimmed, spaces and tabs before a newline are removed, and runs of three
+// or more newlines shrink to exactly two.
+func normalizeContent(content string) string {
+	normalized := strings.TrimSpace(strings.ReplaceAll(content, "\r\n", "\n"))
+	normalized = contentSpaceRe.ReplaceAllString(normalized, "\n")
+	return multiNewlineRe.ReplaceAllString(normalized, "\n\n")
+}
+
+// inferTitleFromURL derives a fallback title from a document URL. The last
+// path segment is used, minus a ".html" suffix and with "-" and "_" turned
+// into spaces. When the path has no usable segment the host is returned, and
+// "Documentation" is the last resort (also used for empty or unparsable URLs).
+func inferTitleFromURL(rawURL string) string {
+	const fallback = "Documentation"
+	if rawURL == "" {
+		return fallback
+	}
+	parsed, err := url.Parse(rawURL)
+	if err != nil {
+		return fallback
+	}
+
+	segment := path.Base(strings.Trim(parsed.Path, "/"))
+	switch segment {
+	case "", ".", "/":
+		// No path segment to work with: prefer the host if there is one.
+		if parsed.Host != "" {
+			return parsed.Host
+		}
+		return fallback
+	}
+
+	words := strings.TrimSuffix(segment, ".html")
+	for _, sep := range []string{"-", "_"} {
+		words = strings.ReplaceAll(words, sep, " ")
+	}
+	words = titleSpaceRe.ReplaceAllString(strings.TrimSpace(words), " ")
+	if words == "" {
+		return fallback
+	}
+	return words
+}
@@ -0,0 +1,33 @@
+package scraper
+
+import "testing"
+
+// TestNormalizeDocument_TitleCleanup checks that title noise (pilcrow,
+// "deprecated", "added in goX.Y") is removed and that content whitespace is
+// collapsed.
+func TestNormalizeDocument_TitleCleanup(t *testing.T) {
+	const (
+		wantTitle   = "http.type CloseNotifier"
+		wantContent = "line 1\n\nline 2"
+	)
+	doc := &Document{
+		URL:     "https://pkg.go.dev/net/http#CloseNotifier",
+		Title:   "http.type CloseNotifier ¶ deprecated added in go1.1",
+		Content: "line 1  \n\n\nline 2",
+	}
+
+	NormalizeDocument(doc)
+
+	if doc.Title != wantTitle {
+		t.Fatalf("unexpected normalized title: %q", doc.Title)
+	}
+	if doc.Content != wantContent {
+		t.Fatalf("unexpected normalized content: %q", doc.Content)
+	}
+}
+
+// TestNormalizeDocument_InferTitle checks that an empty title is filled in
+// from the last URL path segment with its ".html" suffix removed.
+func TestNormalizeDocument_InferTitle(t *testing.T) {
+	doc := &Document{URL: "https://kotlinlang.org/docs/regex.html", Title: ""}
+
+	NormalizeDocument(doc)
+
+	if got := doc.Title; got != "regex" {
+		t.Fatalf("expected inferred title 'regex', got %q", got)
+	}
+}
@@ -2,30 +2,337 @@ package scraper

 import (
 	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"sort"
+	"strings"
+	"time"
+
+	"gopkg.in/yaml.v3"
 )

 // OpenAPIScraper parses OpenAPI/Swagger specifications.
 type OpenAPIScraper struct {
 	config *Config
+	// client issues the HTTP GET that fetches specs given as http(s) URLs.
+	client *http.Client
 }

 // NewOpenAPIScraper creates a new OpenAPI scraper.
 func NewOpenAPIScraper(config *Config) *OpenAPIScraper {
-	return &OpenAPIScraper{config: config}
+	// Requests time out after 30s unless the config supplies a positive
+	// timeout of its own. A nil config is accepted.
+	client := &http.Client{Timeout: 30 * time.Second}
+	if config != nil && config.Timeout > 0 {
+		client.Timeout = config.Timeout
+	}
+	return &OpenAPIScraper{config: config, client: client}
 }

 // Scrape fetches and parses an OpenAPI specification.
+//
+// The result starts with one "openapi-spec" document summarizing the whole
+// spec, followed by one "openapi-operation" document per path/method pair,
+// ordered by path and then by method key. An error is returned when source is
+// nil, the spec cannot be read, or it cannot be parsed.
 func (s *OpenAPIScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
-	// TODO: Implement OpenAPI parsing
-	// 1. Fetch spec from URL
-	// 2. Parse endpoints, schemas, descriptions
-	// 3. Create documents per endpoint
-	// 4. Include authentication, parameters
-	return nil, nil
+	if source == nil {
+		return nil, fmt.Errorf("source is required")
+	}
+
+	raw, specURL, err := s.readSpec(ctx, source)
+	if err != nil {
+		return nil, err
+	}
+
+	spec, err := parseOpenAPISpec(raw)
+	if err != nil {
+		return nil, err
+	}
+
+	sourceName := coalesceSourceName(source.Name, "openapi")
+	// Every document produced by this run carries the same scrape time.
+	scrapedAt := time.Now()
+
+	docs := make([]*Document, 0, 1+len(spec.Paths))
+	docs = append(docs, &Document{
+		ID:      generateDocID(specURL + "#openapi"),
+		Source:  sourceName,
+		Type:    "openapi-spec",
+		Title:   spec.Info.Title,
+		Content: buildMainSpecContent(spec),
+		URL:     specURL,
+		Metadata: map[string]interface{}{
+			"openapi": spec.Version,
+			"servers": spec.Servers,
+		},
+		Hash:      hashBytes(raw),
+		Timestamp: scrapedAt,
+	})
+
+	// Map iteration order is random; sort so the output is deterministic.
+	paths := make([]string, 0, len(spec.Paths))
+	for p := range spec.Paths {
+		paths = append(paths, p)
+	}
+	sort.Strings(paths)
+
+	for _, p := range paths {
+		opMap := spec.Paths[p]
+
+		// Iterate over the keys exactly as they appear in the spec. Looking an
+		// operation up by a re-cased key would silently drop entries whose key
+		// is not already lower-case (e.g. "GET").
+		keys := make([]string, 0, len(opMap))
+		for key := range opMap {
+			keys = append(keys, key)
+		}
+		sort.Strings(keys)
+
+		for _, key := range keys {
+			op := opMap[key]
+			if op == nil {
+				continue
+			}
+			method := strings.ToUpper(key)
+			title := strings.TrimSpace(op.Summary)
+			if title == "" {
+				title = fmt.Sprintf("%s %s", method, p)
+			}
+			content := buildOperationContent(method, p, op)
+			docURL := fmt.Sprintf("%s#%s-%s", specURL, strings.ToLower(method), sanitizeFragment(p))
+			docs = append(docs, &Document{
+				ID:      generateDocID(docURL),
+				Source:  sourceName,
+				Type:    "openapi-operation",
+				Title:   title,
+				Content: content,
+				URL:     docURL,
+				Metadata: map[string]interface{}{
+					"method":       method,
+					"path":         p,
+					"operation_id": op.OperationID,
+				},
+				Hash:      hashString(content),
+				Timestamp: scrapedAt,
+			})
+		}
+	}
+
+	return docs, nil
 }

 // DetectChanges checks if the spec has been updated.
 func (s *OpenAPIScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
-	// TODO: Check spec content hash
-	return false, "", nil
+	if source == nil {
+		return false, "", fmt.Errorf("source is required")
+	}
+
+	// The change signal is the hash of the raw spec bytes as read; the spec is
+	// not parsed here.
+	content, _, readErr := s.readSpec(ctx, source)
+	if readErr != nil {
+		return false, "", readErr
+	}
+	currentHash := hashBytes(content)
+	changed := currentHash != lastHash
+	return changed, currentHash, nil
+}
+
+// readSpec loads the raw spec bytes for source and returns them together with
+// the URL used to identify the spec in generated documents.
+//
+// source.URL is preferred and source.Path is the fallback. A location that
+// starts with http:// or https:// is fetched with the scraper's HTTP client
+// (sending the configured User-Agent, if any); anything else is read as a
+// local file and reported as "file://" + location. Non-2xx responses and
+// bodies larger than 10 MiB are errors.
+func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte, string, error) {
+	// Remote specs above this size are rejected instead of being silently
+	// truncated, which would yield a corrupt parse or a hash of partial data.
+	const maxSpecBytes = 10 << 20
+
+	location := strings.TrimSpace(source.URL)
+	if location == "" {
+		location = strings.TrimSpace(source.Path)
+	}
+	if location == "" {
+		return nil, "", fmt.Errorf("openapi source requires url or path")
+	}
+
+	if !strings.HasPrefix(location, "http://") && !strings.HasPrefix(location, "https://") {
+		data, err := os.ReadFile(location)
+		if err != nil {
+			return nil, "", fmt.Errorf("reading openapi spec %q: %w", location, err)
+		}
+		return data, "file://" + location, nil
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, location, nil)
+	if err != nil {
+		return nil, "", fmt.Errorf("building openapi request for %q: %w", location, err)
+	}
+	if s.config != nil && strings.TrimSpace(s.config.UserAgent) != "" {
+		req.Header.Set("User-Agent", s.config.UserAgent)
+	}
+
+	resp, err := s.client.Do(req)
+	if err != nil {
+		return nil, "", fmt.Errorf("fetching openapi spec %q: %w", location, err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return nil, "", fmt.Errorf("openapi fetch failed: HTTP %d", resp.StatusCode)
+	}
+
+	// Read one byte past the cap so an oversized body can be told apart from
+	// one that is exactly at the limit.
+	body, err := io.ReadAll(io.LimitReader(resp.Body, maxSpecBytes+1))
+	if err != nil {
+		return nil, "", fmt.Errorf("reading openapi spec %q: %w", location, err)
+	}
+	if len(body) > maxSpecBytes {
+		return nil, "", fmt.Errorf("openapi spec %q exceeds %d bytes", location, maxSpecBytes)
+	}
+	return body, location, nil
+}
+
+// openAPISpec is the subset of an OpenAPI 3.x / Swagger 2.0 document that the
+// scraper reads.
+type openAPISpec struct {
+	Version string               `json:"openapi" yaml:"openapi"`
+	Swagger string               `json:"swagger" yaml:"swagger"`
+	Info    openAPIInfo          `json:"info" yaml:"info"`
+	Servers []openAPIServer      `json:"servers" yaml:"servers"`
+	Paths   map[string]pathItems `json:"paths" yaml:"paths"`
+}
+
+// openAPIInfo mirrors the spec's "info" object.
+type openAPIInfo struct {
+	Title       string `json:"title" yaml:"title"`
+	Version     string `json:"version" yaml:"version"`
+	Description string `json:"description" yaml:"description"`
+}
+
+// openAPIServer mirrors one entry of the spec's "servers" array.
+type openAPIServer struct {
+	URL         string `json:"url" yaml:"url"`
+	Description string `json:"description" yaml:"description"`
+}
+
+// pathItems holds the operations of one path, keyed by the method key exactly
+// as written in the spec. Path-item fields that are not operations
+// ("parameters", "summary", "servers", "$ref", "x-*" extensions, ...) are
+// dropped while decoding; see UnmarshalJSON and UnmarshalYAML.
+type pathItems map[string]*openAPIOperation
+
+// isOperationKey reports whether key, compared case-insensitively, is one of
+// the HTTP method names a path item may use for an operation.
+func isOperationKey(key string) bool {
+	switch strings.ToLower(key) {
+	case "get", "put", "post", "delete", "options", "head", "patch", "trace":
+		return true
+	}
+	return false
+}
+
+// UnmarshalJSON decodes a path item, keeping only HTTP-method keys. Without
+// this, a non-operation field such as a path-level "parameters" array would be
+// decoded as an operation and make the whole spec fail to parse.
+func (p *pathItems) UnmarshalJSON(data []byte) error {
+	var fields map[string]json.RawMessage
+	if err := json.Unmarshal(data, &fields); err != nil {
+		return err
+	}
+	items := make(pathItems, len(fields))
+	for key, rawOp := range fields {
+		if !isOperationKey(key) {
+			continue
+		}
+		// Decoding into a pointer keeps a JSON null as a nil operation.
+		var op *openAPIOperation
+		if err := json.Unmarshal(rawOp, &op); err != nil {
+			return fmt.Errorf("path item %q: %w", key, err)
+		}
+		items[key] = op
+	}
+	*p = items
+	return nil
+}
+
+// UnmarshalYAML is the YAML counterpart of UnmarshalJSON: it walks the mapping
+// node's key/value pairs and decodes only the HTTP-method entries.
+func (p *pathItems) UnmarshalYAML(node *yaml.Node) error {
+	if node.Kind != yaml.MappingNode {
+		return fmt.Errorf("path item must be a mapping (line %d)", node.Line)
+	}
+	items := make(pathItems, len(node.Content)/2)
+	// A mapping node stores its entries as alternating key and value nodes.
+	for i := 0; i+1 < len(node.Content); i += 2 {
+		key := node.Content[i].Value
+		if !isOperationKey(key) {
+			continue
+		}
+		var op *openAPIOperation
+		if err := node.Content[i+1].Decode(&op); err != nil {
+			return fmt.Errorf("path item %q: %w", key, err)
+		}
+		items[key] = op
+	}
+	*p = items
+	return nil
+}
+
+// openAPIOperation mirrors the fields of an operation object that are decoded
+// from the spec.
+type openAPIOperation struct {
+	Summary     string                `json:"summary" yaml:"summary"`
+	Description string                `json:"description" yaml:"description"`
+	OperationID string                `json:"operationId" yaml:"operationId"`
+	Parameters  []openAPIParameter    `json:"parameters" yaml:"parameters"`
+	Responses   map[string]response   `json:"responses" yaml:"responses"`
+	RequestBody map[string]any        `json:"requestBody" yaml:"requestBody"`
+	Tags        []string              `json:"tags" yaml:"tags"`
+	Deprecated  bool                  `json:"deprecated" yaml:"deprecated"`
+	Security    []map[string][]string `json:"security" yaml:"security"`
+}
+
+// openAPIParameter mirrors one operation parameter.
+type openAPIParameter struct {
+	Name        string `json:"name" yaml:"name"`
+	In          string `json:"in" yaml:"in"`
+	Description string `json:"description" yaml:"description"`
+	Required    bool   `json:"required" yaml:"required"`
+}
+
+// response mirrors one entry of an operation's "responses" map; only the
+// description is decoded.
+type response struct {
+	Description string `json:"description" yaml:"description"`
+}
+
+// parseOpenAPISpec decodes raw as JSON and, if that fails, as YAML. After a
+// successful decode it fills in defaults: a missing title becomes
+// "OpenAPI Specification", a missing "openapi" version falls back to the
+// "swagger" field, and a missing "paths" object becomes an empty map.
+func parseOpenAPISpec(raw []byte) (*openAPISpec, error) {
+	var spec openAPISpec
+	if jsonErr := json.Unmarshal(raw, &spec); jsonErr != nil {
+		// A failed JSON pass may have filled part of the struct; start the
+		// YAML pass from a clean value so nothing stale leaks through.
+		spec = openAPISpec{}
+		if yamlErr := yaml.Unmarshal(raw, &spec); yamlErr != nil {
+			// Report both failures: for YAML input the JSON error alone says
+			// nothing about what is actually wrong.
+			return nil, fmt.Errorf("invalid openapi content: %w (yaml: %v)", jsonErr, yamlErr)
+		}
+	}
+
+	if strings.TrimSpace(spec.Info.Title) == "" {
+		spec.Info.Title = "OpenAPI Specification"
+	}
+	if strings.TrimSpace(spec.Version) == "" {
+		spec.Version = spec.Swagger
+	}
+	if spec.Paths == nil {
+		spec.Paths = map[string]pathItems{}
+	}
+
+	return &spec, nil
+}
+
+// buildMainSpecContent renders the spec-level summary as Markdown: a title
+// heading, bullet lines for the API version, OpenAPI version and path count,
+// the trimmed description, and a "Servers" section when servers are declared.
+func buildMainSpecContent(spec *openAPISpec) string {
+	var out strings.Builder
+	out.WriteString("# " + spec.Info.Title + "\n\n")
+	if v := spec.Info.Version; v != "" {
+		out.WriteString("- API Version: " + v + "\n")
+	}
+	if v := spec.Version; v != "" {
+		out.WriteString("- OpenAPI: " + v + "\n")
+	}
+	fmt.Fprintf(&out, "- Paths: %d\n", len(spec.Paths))
+	if desc := spec.Info.Description; desc != "" {
+		out.WriteString("\n" + strings.TrimSpace(desc) + "\n")
+	}
+
+	if len(spec.Servers) == 0 {
+		return out.String()
+	}
+	out.WriteString("\n## Servers\n")
+	for i := range spec.Servers {
+		server := spec.Servers[i]
+		out.WriteString("- " + server.URL)
+		if server.Description != "" {
+			out.WriteString(" - " + server.Description)
+		}
+		out.WriteByte('\n')
+	}
+	return out.String()
+}
+
+// buildOperationContent renders one operation as Markdown: a "METHOD path"
+// heading, the trimmed summary and description, bullet lines for the
+// operation ID, tags and deprecation flag, then "Parameters" and "Responses"
+// sections when present. Responses are listed in sorted status-code order.
+func buildOperationContent(method, path string, op *openAPIOperation) string {
+	var out strings.Builder
+	out.WriteString("# " + method + " " + path + "\n\n")
+
+	for _, paragraph := range []string{op.Summary, op.Description} {
+		if paragraph != "" {
+			out.WriteString(strings.TrimSpace(paragraph) + "\n\n")
+		}
+	}
+
+	if op.OperationID != "" {
+		out.WriteString("- Operation ID: `" + op.OperationID + "`\n")
+	}
+	if len(op.Tags) > 0 {
+		out.WriteString("- Tags: " + strings.Join(op.Tags, ", ") + "\n")
+	}
+	if op.Deprecated {
+		out.WriteString("- Deprecated: true\n")
+	}
+
+	if len(op.Parameters) > 0 {
+		out.WriteString("\n## Parameters\n")
+		for i := range op.Parameters {
+			param := op.Parameters[i]
+			presence := "optional"
+			if param.Required {
+				presence = "required"
+			}
+			out.WriteString("- `" + param.Name + "` (" + param.In + ", " + presence + ")")
+			if param.Description != "" {
+				out.WriteString(": " + strings.TrimSpace(param.Description))
+			}
+			out.WriteByte('\n')
+		}
+	}
+
+	if len(op.Responses) > 0 {
+		// Sort the status codes so the rendered text is deterministic.
+		statuses := make([]string, 0, len(op.Responses))
+		for status := range op.Responses {
+			statuses = append(statuses, status)
+		}
+		sort.Strings(statuses)
+
+		out.WriteString("\n## Responses\n")
+		for _, status := range statuses {
+			out.WriteString("- `" + status + "`")
+			if desc := op.Responses[status].Description; desc != "" {
+				out.WriteString(": " + strings.TrimSpace(desc))
+			}
+			out.WriteByte('\n')
+		}
+	}
+	return out.String()
+}
+
+// sanitizeFragment turns an API path into a URL-fragment slug: the path is
+// lower-cased, "/" becomes "-", braces are removed and surrounding dashes are
+// trimmed. A path that reduces to nothing yields "root".
+func sanitizeFragment(path string) string {
+	slug := strings.ReplaceAll(strings.ToLower(path), "/", "-")
+	for _, brace := range []string{"{", "}"} {
+		slug = strings.ReplaceAll(slug, brace, "")
+	}
+	if slug = strings.Trim(slug, "-"); slug == "" {
+		return "root"
+	}
+	return slug
+}
+
+// hashBytes returns the lower-case hex encoding of the SHA-256 digest of b.
+func hashBytes(b []byte) string {
+	digest := sha256.New()
+	// hash.Hash.Write never returns an error.
+	_, _ = digest.Write(b)
+	return hex.EncodeToString(digest.Sum(nil))
+}
+
+// hashString returns the lower-case hex encoding of the SHA-256 digest of the
+// bytes of s.
+func hashString(s string) string {
+	digest := sha256.Sum256([]byte(s))
+	encoded := make([]byte, hex.EncodedLen(len(digest)))
+	hex.Encode(encoded, digest[:])
+	return string(encoded)
+}
+
+// coalesceSourceName returns name unchanged unless it is empty or only
+// whitespace, in which case fallback is returned.
+func coalesceSourceName(name, fallback string) string {
+	if strings.TrimSpace(name) == "" {
+		return fallback
+	}
+	return name
 }
@@ -0,0 +1,77 @@
+package scraper
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+)
+
+// TestOpenAPIScraperScrape serves a one-operation JSON spec and expects at
+// least two documents back, one of them titled after the operation summary.
+func TestOpenAPIScraperScrape(t *testing.T) {
+	const spec = `{
+  "openapi": "3.0.0",
+  "info": {"title": "Pet API", "version": "1.0.0"},
+  "paths": {
+    "/pets": {
+      "get": {
+        "summary": "List pets",
+        "operationId": "listPets",
+        "responses": {"200": {"description": "ok"}}
+      }
+    }
+  }
+}`
+
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(spec))
+	}
+	srv := httptest.NewServer(http.HandlerFunc(handler))
+	defer srv.Close()
+
+	scraper := NewOpenAPIScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
+	source := &Source{Name: "pet", Type: SourceTypeOpenAPI, URL: srv.URL}
+
+	docs, err := scraper.Scrape(context.Background(), source)
+	if err != nil {
+		t.Fatalf("unexpected scrape error: %v", err)
+	}
+	if len(docs) < 2 {
+		t.Fatalf("expected at least 2 docs, got %d", len(docs))
+	}
+
+	for i := range docs {
+		if strings.Contains(docs[i].Title, "List pets") {
+			return
+		}
+	}
+	t.Fatal("expected operation document")
+}
+
+// TestOpenAPIScraperDetectChanges checks that the first DetectChanges call
+// reports a change with a non-empty hash, and that a second call passing that
+// hash reports no change.
+func TestOpenAPIScraperDetectChanges(t *testing.T) {
+	const spec = `{"openapi":"3.0.0","info":{"title":"API"},"paths":{}}`
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write([]byte(spec))
+	}
+	srv := httptest.NewServer(http.HandlerFunc(handler))
+	defer srv.Close()
+
+	scraper := NewOpenAPIScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
+	source := &Source{Name: "api", Type: SourceTypeOpenAPI, URL: srv.URL}
+	ctx := context.Background()
+
+	changed, hash1, err := scraper.DetectChanges(ctx, source, "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !changed || hash1 == "" {
+		t.Fatalf("expected changed=true and non-empty hash, changed=%v hash=%q", changed, hash1)
+	}
+
+	changed, _, err = scraper.DetectChanges(ctx, source, hash1)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if changed {
+		t.Fatal("expected no changes when hash matches")
+	}
+}
@@ -5,6 +5,7 @@ func init() {
 	// Additional scrapers can be registered in their own packages
 	RegisterScraper(SourceTypeWeb, func(c *Config) Scraper { return NewWebScraper(c) })
 	RegisterScraper(SourceTypeLocal, func(c *Config) Scraper { return NewLocalScraper(c) })
+	RegisterScraper(SourceTypeLocalSearch, func(c *Config) Scraper { return NewLocalSearchScraper(c) })
 	RegisterScraper(SourceTypeGitHub, func(c *Config) Scraper { return NewGitHubScraper(c) })
 	RegisterScraper(SourceTypeOpenAPI, func(c *Config) Scraper { return NewOpenAPIScraper(c) })
 }
@@ -0,0 +1,71 @@
+package scraper_test
+
+import (
+	"testing"
+	"time"
+
+	basescraper "github.com/yourorg/devour/internal/scraper"
+	_ "github.com/yourorg/devour/internal/scraper/external"
+)
+
+// TestLanguageScrapersAreRegistered verifies that NewScraper returns a
+// non-nil scraper for every documentation source type listed below, one
+// subtest per type.
+func TestLanguageScrapersAreRegistered(t *testing.T) {
+	cfg := &basescraper.Config{
+		Timeout:   2 * time.Second,
+		UserAgent: "DevourTest/1.0",
+	}
+
+	for _, docType := range []basescraper.SourceType{
+		basescraper.SourceTypeGoDocs,
+		basescraper.SourceTypeRustDocs,
+		basescraper.SourceTypePythonDocs,
+		basescraper.SourceTypeJavaDocs,
+		basescraper.SourceTypeSpringDocs,
+		basescraper.SourceTypeTSDocs,
+		basescraper.SourceTypeReactDocs,
+		basescraper.SourceTypeVueDocs,
+		basescraper.SourceTypeNuxtDocs,
+		basescraper.SourceTypeMCPDocs,
+		basescraper.SourceTypeDockerDocs,
+		basescraper.SourceTypeCloudflareDocs,
+		basescraper.SourceTypeAstroDocs,
+	} {
+		t.Run(string(docType), func(t *testing.T) {
+			if basescraper.NewScraper(docType, cfg) == nil {
+				t.Fatalf("NewScraper(%q) returned nil; scraper was not registered", docType)
+			}
+		})
+	}
+}
+
+// TestDetectSourceType_ForSupportedDocsHosts checks that DetectSourceType
+// maps each sample documentation URL to its expected source type, one subtest
+// per URL.
+func TestDetectSourceType_ForSupportedDocsHosts(t *testing.T) {
+	type detectCase struct {
+		rawURL string
+		want   basescraper.SourceType
+	}
+	cases := []detectCase{
+		{rawURL: "https://pkg.go.dev/net/http", want: basescraper.SourceTypeGoDocs},
+		{rawURL: "https://docs.rs/tokio/latest/tokio/", want: basescraper.SourceTypeRustDocs},
+		{rawURL: "https://docs.python.org/3/library/asyncio.html", want: basescraper.SourceTypePythonDocs},
+		{rawURL: "https://docs.oracle.com/javase/8/docs/api/java/util/List.html", want: basescraper.SourceTypeJavaDocs},
+		{rawURL: "https://docs.spring.io/spring-boot/docs/current/reference/htmlsingle/", want: basescraper.SourceTypeSpringDocs},
+		{rawURL: "https://www.typescriptlang.org/docs/handbook/2/basic-types.html", want: basescraper.SourceTypeTSDocs},
+		{rawURL: "https://react.dev/reference/react/hooks", want: basescraper.SourceTypeReactDocs},
+		{rawURL: "https://vuejs.org/guide/introduction.html", want: basescraper.SourceTypeVueDocs},
+		{rawURL: "https://nuxt.com/docs/guide/directory-structure", want: basescraper.SourceTypeNuxtDocs},
+		{rawURL: "https://docs.docker.com/compose", want: basescraper.SourceTypeDockerDocs},
+		{rawURL: "https://hub.docker.com/mcp/server/github", want: basescraper.SourceTypeMCPDocs},
+		{rawURL: "https://developers.cloudflare.com/workers", want: basescraper.SourceTypeCloudflareDocs},
+		{rawURL: "https://docs.astro.build/en/guides/components/", want: basescraper.SourceTypeAstroDocs},
+	}
+
+	for i := range cases {
+		tc := cases[i]
+		t.Run(tc.rawURL, func(t *testing.T) {
+			if got := basescraper.DetectSourceType(tc.rawURL); got != tc.want {
+				t.Fatalf("DetectSourceType(%q) = %q, want %q", tc.rawURL, got, tc.want)
+			}
+		})
+	}
+}
@@ -28,7 +28,7 @@ func (r *ScraperRegistry) Register(sourceType SourceType, constructor ScraperCon
 // Create creates a scraper instance
 func (r *ScraperRegistry) Create(sourceType SourceType, config *Config) Scraper {
 	if constructor, exists := r.constructors[sourceType]; exists {
-		return constructor(config)
+		// Every constructed scraper is passed through wrapScraper before it is
+		// handed out. NOTE(review): confirm wrapScraper copes with a nil result
+		// from constructor, since a nil Scraper is how "not registered" is
+		// signalled below.
+		return wrapScraper(constructor(config))
 	}
 	return nil
 }
@@ -17,6 +17,7 @@ const (
 	SourceTypeGitHub         SourceType = "github"
 	SourceTypeOpenAPI        SourceType = "openapi"
 	SourceTypeLocal          SourceType = "local"
+	SourceTypeLocalSearch    SourceType = "localsearch"
 	SourceTypeGoDocs         SourceType = "godocs"
 	SourceTypeRustDocs       SourceType = "rustdocs"
 	SourceTypePythonDocs     SourceType = "pythondocs"
@@ -34,15 +35,18 @@ const (

 // Source represents a documentation source to scrape.
 type Source struct {
-	Name     string     `yaml:"name"`
-	Type     SourceType `yaml:"type"`
-	URL      string     `yaml:"url,omitempty"`
-	Repo     string     `yaml:"repo,omitempty"`
-	Branch   string     `yaml:"branch,omitempty"`
-	Path     string     `yaml:"path,omitempty"`
-	Include  []string   `yaml:"include,omitempty"`
-	Exclude  []string   `yaml:"exclude,omitempty"`
-	Schedule string     `yaml:"schedule,omitempty"`
+	Name        string     `yaml:"name"`
+	Type        SourceType `yaml:"type"`
+	URL         string     `yaml:"url,omitempty"`
+	// Query is the search text for localsearch sources; the local-search tests
+	// expect it to reach the search endpoint as the "q" parameter.
+	Query       string     `yaml:"query,omitempty"`
+	// ResultLimit is the requested number of search results.
+	// NOTE(review): presumably normalized by clampLocalSearchLimit — confirm.
+	ResultLimit int        `yaml:"result_limit,omitempty"`
+	// Domains optionally restricts search results to the listed hosts; the
+	// domain-filter test expects out-of-list results to be dropped.
+	Domains     []string   `yaml:"domains,omitempty"`
+	Repo        string     `yaml:"repo,omitempty"`
+	Branch      string     `yaml:"branch,omitempty"`
+	Path        string     `yaml:"path,omitempty"`
+	Include     []string   `yaml:"include,omitempty"`
+	Exclude     []string   `yaml:"exclude,omitempty"`
+	Schedule    string     `yaml:"schedule,omitempty"`
 }

 // Document represents a scraped document.
@@ -113,6 +117,11 @@ func DetectSourceType(input string) SourceType {
 		}
 	}

+	// MCP servers are hosted under Docker Hub paths.
+	if strings.Contains(input, "hub.docker.com/mcp/") {
+		return SourceTypeMCPDocs
+	}
+
 	// Check for OpenAPI specs
 	if strings.HasSuffix(input, ".json") || strings.HasSuffix(input, ".yaml") || strings.HasSuffix(input, ".yml") {
 		if strings.Contains(strings.ToLower(input), "openapi") || strings.Contains(strings.ToLower(input), "swagger") {
@@ -6,8 +6,10 @@ import (
 	"encoding/hex"
 	"fmt"
 	"net/url"
+	"path"
 	"regexp"
 	"strings"
+	"sync"
 	"time"

 	"github.com/gocolly/colly/v2"
@@ -27,17 +29,42 @@ func NewWebScraper(config *Config) *WebScraper {
 func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
 	var documents []*Document
 	visited := make(map[string]bool)
+	scheduled := make(map[string]bool)
+	contentHashes := make(map[string]bool)
+	var mu sync.Mutex
+	var scrapeErrors []string

 	// Parse base URL for domain restrictions
 	baseURL, err := url.Parse(source.URL)
 	if err != nil {
 		return nil, fmt.Errorf("invalid URL: %w", err)
 	}
+	allowedDomain := baseURL.Hostname()
+	if allowedDomain == "" {
+		allowedDomain = baseURL.Host
+	}
+
+	maxDepth := s.config.MaxDepth
+	if maxDepth <= 0 {
+		maxDepth = 2
+	}
+	maxPages := s.config.Concurrency * 40
+	if maxPages < 20 {
+		maxPages = 20
+	}
+	if maxDepth <= 1 && maxPages > 30 {
+		maxPages = 30
+	}
+	if maxPages > 300 {
+		maxPages = 300
+	}
+	scopePrefix := pathScopePrefix(baseURL.Path)
+	scopeLeaf := pathScopeLeaf(baseURL.Path)

 	// Create Colly collector
 	c := colly.NewCollector(
-		colly.AllowedDomains(baseURL.Host),
-		colly.MaxDepth(s.config.MaxDepth),
+		colly.AllowedDomains(allowedDomain),
+		colly.MaxDepth(maxDepth),
 		colly.Async(true),
 		colly.UserAgent(s.config.UserAgent),
 	)
@@ -65,18 +92,40 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e

 	// Handle errors
 	c.OnError(func(r *colly.Response, err error) {
-		fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
+		errText := strings.ToLower(err.Error())
+		if strings.Contains(errText, "already visited") {
+			return
+		}
+		reqURL := source.URL
+		if r != nil && r.Request != nil && r.Request.URL != nil {
+			reqURL = r.Request.URL.String()
+		}
+		mu.Lock()
+		if len(scrapeErrors) < 20 {
+			scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err))
+		}
+		mu.Unlock()
 	})

 	// Extract content from pages
 	c.OnHTML("html", func(e *colly.HTMLElement) {
 		pageURL := e.Request.URL.String()
+		if !withinScope(e.Request.URL, baseURL, scopePrefix, scopeLeaf) {
+			return
+		}

 		// Skip if already visited
+		mu.Lock()
 		if visited[pageURL] {
+			mu.Unlock()
+			return
+		}
+		if len(visited) >= maxPages {
+			mu.Unlock()
 			return
 		}
 		visited[pageURL] = true
+		mu.Unlock()

 		// Check include/exclude patterns
 		if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
@@ -99,6 +148,13 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e

 		// Generate hash for change detection
 		hash := s.generateHash(content)
+		mu.Lock()
+		if contentHashes[hash] {
+			mu.Unlock()
+			return
+		}
+		contentHashes[hash] = true
+		mu.Unlock()

 		// Extract metadata
 		metadata := map[string]interface{}{
@@ -120,30 +176,74 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
 			Timestamp: time.Now(),
 		}

+		mu.Lock()
 		documents = append(documents, doc)
+		mu.Unlock()
 	})

 	// Follow links
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
 		link := e.Attr("href")
 		absoluteURL := e.Request.AbsoluteURL(link)
-
-		// Skip if already visited
-		if visited[absoluteURL] {
+		if absoluteURL == "" {
 			return
 		}

+		linkURL, err := url.Parse(absoluteURL)
+		if err != nil {
+			return
+		}
+		if linkURL.Scheme != "http" && linkURL.Scheme != "https" {
+			return
+		}
+		if !withinScope(linkURL, baseURL, scopePrefix, scopeLeaf) {
+			return
+		}
+
+		// Skip if already visited
+		mu.Lock()
+		if visited[absoluteURL] {
+			mu.Unlock()
+			return
+		}
+		if len(visited) >= maxPages {
+			mu.Unlock()
+			return
+		}
+		mu.Unlock()
+
 		// Check include/exclude patterns
 		if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
 			return
 		}

+		mu.Lock()
+		if scheduled[absoluteURL] {
+			mu.Unlock()
+			return
+		}
+		if len(scheduled) >= maxPages {
+			mu.Unlock()
+			return
+		}
+		scheduled[absoluteURL] = true
+		mu.Unlock()
+
 		if err := c.Visit(absoluteURL); err != nil {
-			fmt.Printf("Error visiting %s: %v\n", absoluteURL, err)
+			errText := strings.ToLower(err.Error())
+			if strings.Contains(errText, "already visited") {
+				return
+			}
+			mu.Lock()
+			if len(scrapeErrors) < 20 {
+				scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err))
+			}
+			mu.Unlock()
 		}
 	})

 	// Start scraping
+	scheduled[source.URL] = true
 	if err := c.Visit(source.URL); err != nil {
 		return nil, fmt.Errorf("failed to start scraping: %w", err)
 	}
@@ -151,6 +251,16 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
 	// Wait for async scraping to complete
 	c.Wait()

+	mu.Lock()
+	defer mu.Unlock()
+
+	if len(documents) == 0 {
+		if len(scrapeErrors) > 0 {
+			return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; "))
+		}
+		return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
+	}
+
 	return documents, nil
 }

@@ -283,6 +393,25 @@ func (s *WebScraper) generateHash(content string) string {

 // cleanText removes extra whitespace and normalizes text.
 func cleanText(text string) string {
+	noisePhrases := []string{
+		"table of contents",
+		"in this article",
+		"additional resources",
+		"feedback",
+		"collaborate with us on github",
+		"copyright",
+		"all rights reserved",
+		"privacy policy",
+		"terms of service",
+		"sign in",
+		"skip to main content",
+		"ask learn",
+	}
+	for _, phrase := range noisePhrases {
+		re := regexp.MustCompile(`(?i)` + regexp.QuoteMeta(phrase))
+		text = re.ReplaceAllString(text, " ")
+	}
+
 	// Replace multiple whitespace with single space
 	re := regexp.MustCompile(`\s+`)
 	text = re.ReplaceAllString(text, " ")
@@ -292,3 +421,58 @@ func cleanText(text string) string {

 	return text
 }
+
+// pathScopePrefix derives the path prefix that bounds a crawl started at
+// rawPath.
+//
+// The path is normalized with path.Clean first, so trailing slashes and dot
+// segments are ignored. An empty or root path yields "", which withinScope
+// treats as "no path restriction". Otherwise the parent directory of the
+// cleaned path is returned, unless that parent is the root: a top-level
+// entry such as "/docs" or "/index.html" is returned unchanged so the crawl
+// stays scoped to it instead of widening to the whole host.
+func pathScopePrefix(rawPath string) string {
+	clean := path.Clean(rawPath)
+	// path.Clean never returns "", so "." covers empty input as well.
+	if clean == "." || clean == "/" {
+		return ""
+	}
+
+	// File-like ("/a/page.html") and directory-like ("/a/section") last
+	// segments are scoped the same way, so no extension check is needed.
+	if dir := path.Dir(clean); dir != "/" {
+		return dir
+	}
+	return clean
+}
+
+// pathScopeLeaf returns the final segment of rawPath when that segment looks
+// like a file name, i.e. it contains a dot (for example "Regexp.html").
+// It returns "" for empty or root paths and for paths whose final segment
+// has no dot.
+func pathScopeLeaf(rawPath string) string {
+	switch cleaned := path.Clean(rawPath); cleaned {
+	case ".", "/":
+		// path.Clean maps "" to ".", so this also covers empty input.
+		return ""
+	default:
+		if name := path.Base(cleaned); strings.IndexByte(name, '.') >= 0 {
+			return name
+		}
+		return ""
+	}
+}
+
+// withinScope reports whether target may be crawled for a scrape rooted at
+// base.
+//
+// target must be on the same host as base (compared case-insensitively,
+// ignoring the port). If prefix is empty every path on that host is in
+// scope. Otherwise the target path must equal prefix or lie beneath it on a
+// path-segment boundary, so prefix "/docs" admits "/docs" and "/docs/api"
+// but not "/docs-old" or "/docsearch". In addition, a target whose final
+// path segment equals a non-empty leaf is accepted regardless of its
+// directory. A nil target or base is never in scope.
+func withinScope(target, base *url.URL, prefix, leaf string) bool {
+	if target == nil || base == nil {
+		return false
+	}
+	if !strings.EqualFold(target.Hostname(), base.Hostname()) {
+		return false
+	}
+
+	// Normalize so "/docs/" and "/docs" describe the same scope; a bare "/"
+	// collapses to "" and therefore means the whole host.
+	prefix = strings.TrimSuffix(prefix, "/")
+	if prefix == "" {
+		return true
+	}
+
+	targetPath := target.Path
+	if targetPath == "" {
+		targetPath = "/"
+	}
+	// Match on a segment boundary: a raw string prefix test would let
+	// sibling paths that merely share leading characters leak into scope.
+	if targetPath == prefix || strings.HasPrefix(targetPath, prefix+"/") {
+		return true
+	}
+	return leaf != "" && path.Base(targetPath) == leaf
+}
@@ -0,0 +1,132 @@
+package scraper
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+)
+
+// TestWebScraper_ReturnsErrorWhenNothingFetched checks that Scrape reports an
+// error, rather than an empty success, when the server answers every request
+// with 404.
+func TestWebScraper_ReturnsErrorWhenNothingFetched(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(http.NotFound))
+	defer server.Close()
+
+	cfg := &Config{
+		UserAgent:   "DevourTest/1.0",
+		Timeout:     2 * time.Second,
+		Concurrency: 1,
+		MaxDepth:    1,
+	}
+	src := &Source{
+		Name: "missing",
+		Type: SourceTypeWeb,
+		URL:  server.URL + "/missing",
+	}
+
+	if _, err := NewWebScraper(cfg).Scrape(context.Background(), src); err == nil {
+		t.Fatal("expected error when web scrape yields no documents")
+	}
+}
+
+// TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent checks that a
+// page which is served successfully but produces no documents makes Scrape
+// fail with an "extracted no documents" error.
+func TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent(t *testing.T) {
+	const page = `<html><head><title>Empty</title></head><body><p>tiny</p></body></html>`
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write([]byte(page))
+	}))
+	defer server.Close()
+
+	ws := NewWebScraper(&Config{
+		UserAgent:   "DevourTest/1.0",
+		Timeout:     2 * time.Second,
+		Concurrency: 1,
+		MaxDepth:    1,
+	})
+	src := &Source{
+		Name: "empty",
+		Type: SourceTypeWeb,
+		URL:  server.URL,
+	}
+
+	_, err := ws.Scrape(context.Background(), src)
+	switch {
+	case err == nil:
+		t.Fatal("expected error when page has no extractable docs")
+	case !strings.Contains(err.Error(), "extracted no documents"):
+		t.Fatalf("unexpected error message: %v", err)
+	}
+}
+
+// TestWebScraper_AllowsRedirectedDocumentPath checks that a start URL which
+// redirects to the same file name under a different directory is still
+// scraped, and that the resulting document carries the redirected URL.
+func TestWebScraper_AllowsRedirectedDocumentPath(t *testing.T) {
+	const (
+		startPath = "/core/Regexp.html"
+		finalPath = "/3.4.1/Regexp.html"
+	)
+	page := `<html><head><title>Regexp</title></head><body><main>` +
+		strings.Repeat("ruby docs content ", 30) +
+		`</main></body></html>`
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == startPath {
+			http.Redirect(w, r, finalPath, http.StatusFound)
+			return
+		}
+		if r.URL.Path != finalPath {
+			http.NotFound(w, r)
+			return
+		}
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write([]byte(page))
+	}))
+	defer server.Close()
+
+	ws := NewWebScraper(&Config{
+		UserAgent:   "DevourTest/1.0",
+		Timeout:     2 * time.Second,
+		Concurrency: 1,
+		MaxDepth:    1,
+	})
+	src := &Source{
+		Name: "ruby",
+		Type: SourceTypeWeb,
+		URL:  server.URL + startPath,
+	}
+
+	docs, err := ws.Scrape(context.Background(), src)
+	if err != nil {
+		t.Fatalf("unexpected scrape error: %v", err)
+	}
+	if len(docs) == 0 {
+		t.Fatal("expected redirected page to be scraped")
+	}
+	if got := docs[0].URL; !strings.Contains(got, finalPath) {
+		t.Fatalf("expected final redirected URL, got %q", got)
+	}
+}
+
+// TestWebScraper_GlobalWrapperNormalizesOutput checks that a web scraper
+// obtained through NewScraper returns documents with a normalized title:
+// the page title "Regex Guide ¶ deprecated" must come back as "Regex Guide".
+func TestWebScraper_GlobalWrapperNormalizesOutput(t *testing.T) {
+	page := `<html><head><title>Regex Guide ¶ deprecated</title></head><body><main>` +
+		strings.Repeat("docs content ", 20) +
+		`</main></body></html>`
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write([]byte(page))
+	}))
+	defer server.Close()
+
+	cfg := &Config{
+		UserAgent:   "DevourTest/1.0",
+		Timeout:     2 * time.Second,
+		Concurrency: 1,
+		MaxDepth:    1,
+	}
+	wrapped := NewScraper(SourceTypeWeb, cfg)
+	if wrapped == nil {
+		t.Fatal("expected web scraper")
+	}
+
+	src := &Source{
+		Name: "test",
+		Type: SourceTypeWeb,
+		URL:  server.URL,
+	}
+	docs, err := wrapped.Scrape(context.Background(), src)
+	if err != nil {
+		t.Fatalf("unexpected scrape error: %v", err)
+	}
+	if len(docs) == 0 {
+		t.Fatal("expected at least one document")
+	}
+	if title := docs[0].Title; title != "Regex Guide" {
+		t.Fatalf("expected normalized title, got %q", title)
+	}
+}
@@ -0,0 +1,98 @@
+package scraper
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net"
+	"strings"
+	"time"
+)
+
+// wrappedScraper decorates another Scraper: its Scrape method retries failed
+// attempts and passes successful results through NormalizeDocuments, while
+// DetectChanges is forwarded to the wrapped scraper unchanged.
+type wrappedScraper struct {
+	// inner is the scraper being decorated; wrapScraper never stores a nil
+	// interface value here.
+	inner Scraper
+}
+
+// wrapScraper decorates inner with the retry and normalization behavior of
+// wrappedScraper. A nil inner yields a nil Scraper rather than a wrapper
+// around nothing.
+func wrapScraper(inner Scraper) Scraper {
+	if inner != nil {
+		return &wrappedScraper{inner: inner}
+	}
+	return nil
+}
+
+// Scrape runs the wrapped scraper and returns its documents after passing
+// them through NormalizeDocuments.
+//
+// On failure it makes up to two further attempts, 300ms apart, as long as
+// isRetriableScrapeError accepts the error. After the first failure only, it
+// also tries once with a trailing slash appended to the URL (see
+// trailingSlashVariant). If every attempt fails, the error of the last
+// attempt against the original URL is returned. If ctx is cancelled or
+// expires, no further attempt is started and ctx.Err() is returned. A nil
+// source is rejected up front.
+func (w *wrappedScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
+	if source == nil {
+		return nil, fmt.Errorf("source is required")
+	}
+
+	const (
+		retries = 2
+		delay   = 300 * time.Millisecond
+	)
+
+	var lastErr error
+	for attempt := 0; attempt <= retries; attempt++ {
+		docs, err := w.inner.Scrape(ctx, source)
+		if err == nil {
+			return NormalizeDocuments(docs), nil
+		}
+		lastErr = err
+
+		// A cancelled or expired context makes every further attempt,
+		// including the fallback below, pointless.
+		if ctxErr := ctx.Err(); ctxErr != nil {
+			return nil, ctxErr
+		}
+
+		// One fallback: some doc sites only serve the slash-terminated form.
+		if attempt == 0 {
+			if altURL, ok := trailingSlashVariant(source.URL); ok {
+				alt := *source // shallow copy so the caller's Source is untouched
+				alt.URL = altURL
+				// The fallback's own error is dropped on purpose: the caller
+				// asked for source.URL, so that attempt's error is reported.
+				if altDocs, altErr := w.inner.Scrape(ctx, &alt); altErr == nil {
+					return NormalizeDocuments(altDocs), nil
+				}
+			}
+		}
+
+		if attempt == retries || !isRetriableScrapeError(err) {
+			break
+		}
+		if !sleepWithContext(ctx, delay) {
+			return nil, ctx.Err()
+		}
+	}
+
+	return nil, lastErr
+}
+
+// trailingSlashVariant returns rawURL with "/" appended and true when that
+// variant is worth trying: the URL starts with "http", does not already end
+// in "/", and has no query or fragment. With a query or fragment present the
+// slash would be appended to that component instead of the path, producing
+// a different, usually broken, URL.
+func trailingSlashVariant(rawURL string) (string, bool) {
+	switch {
+	case !strings.HasPrefix(rawURL, "http"):
+		return "", false
+	case strings.HasSuffix(rawURL, "/"):
+		return "", false
+	case strings.ContainsAny(rawURL, "?#"):
+		return "", false
+	}
+	return rawURL + "/", true
+}
+
+// DetectChanges forwards the call to the wrapped scraper and returns its
+// results as-is; no retry or normalization is applied here.
+func (w *wrappedScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
+	return w.inner.DetectChanges(ctx, source, lastHash)
+}
+
+// isRetriableScrapeError reports whether err is worth another scrape
+// attempt. It returns true when the lowercased error text contains one of a
+// fixed set of markers (timeouts, connection resets, EOF, HTTP 429/500/502/
+// 503/504) or when the error chain contains a net.Error. A nil error is
+// never retriable.
+func isRetriableScrapeError(err error) bool {
+	if err == nil {
+		return false
+	}
+
+	// Markers are lowercase because they are matched against lowercased text.
+	transientMarkers := [...]string{
+		"timeout",
+		"temporarily unavailable",
+		"connection reset",
+		"eof",
+		"http 429",
+		"http 500",
+		"http 502",
+		"http 503",
+		"http 504",
+	}
+
+	msg := strings.ToLower(err.Error())
+	for _, marker := range transientMarkers {
+		if strings.Contains(msg, marker) {
+			return true
+		}
+	}
+
+	var netErr net.Error
+	return errors.As(err, &netErr)
+}
+
+// sleepWithContext blocks until d has elapsed or ctx is done, whichever
+// happens first. It returns true when the full delay elapsed and false when
+// ctx ended the wait.
+func sleepWithContext(ctx context.Context, d time.Duration) bool {
+	timer := time.NewTimer(d)
+	defer timer.Stop()
+
+	select {
+	case <-timer.C:
+		return true
+	case <-ctx.Done():
+		return false
+	}
+}
@@ -0,0 +1,45 @@
+package scraper
+
+import (
+	"context"
+	"fmt"
+	"testing"
+)
+
+// flakyStubScraper is a test double whose Scrape can be made to fail on its
+// first call and which counts how often Scrape was invoked.
+type flakyStubScraper struct {
+	// failFirst makes the first Scrape call return an error when true.
+	failFirst bool
+	// calls is the number of Scrape calls received so far.
+	calls     int
+}
+
+// Scrape counts the call and, when failFirst is set, fails the very first
+// one with "HTTP 503". Every other call returns a single document whose URL
+// echoes source.URL.
+func (f *flakyStubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
+	f.calls++
+
+	firstCall := f.calls == 1
+	if firstCall && f.failFirst {
+		return nil, fmt.Errorf("HTTP 503")
+	}
+
+	doc := &Document{
+		Title:   "Example ¶ deprecated",
+		Content: "ok",
+		URL:     source.URL,
+		Type:    "test",
+	}
+	return []*Document{doc}, nil
+}
+
+// DetectChanges always reports a change with the fixed hash "hash" and no
+// error; it ignores all of its arguments.
+func (f *flakyStubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
+	return true, "hash", nil
+}
+
+// TestWrappedScraper_RetriesAndNormalizes verifies that a retriable first
+// failure is followed by a second attempt and that the returned document
+// title is normalized from "Example ¶ deprecated" to "Example".
+func TestWrappedScraper_RetriesAndNormalizes(t *testing.T) {
+	stub := &flakyStubScraper{failFirst: true}
+	w := wrapScraper(stub)
+
+	// The URL already ends in "/" so the wrapper's trailing-slash fallback is
+	// skipped; the second call can therefore only come from the retry loop.
+	docs, err := w.Scrape(context.Background(), &Source{URL: "https://example.com/"})
+	if err != nil {
+		t.Fatalf("expected retry to succeed, got error: %v", err)
+	}
+	if stub.calls != 2 {
+		t.Fatalf("expected 2 scrape calls, got %d", stub.calls)
+	}
+	if len(docs) != 1 {
+		t.Fatalf("expected 1 document, got %d", len(docs))
+	}
+	if docs[0].Title != "Example" {
+		t.Fatalf("expected normalized title, got %q", docs[0].Title)
+	}
+}