update

2026-07-29 07:33:48 +00:00 · 2026-02-24 10:33:59 +01:00
parent 409acd2e08
commit 898a3c303f
1374 changed files with 290408 additions and 29186 deletions
@@ -0,0 +1,368 @@
+package config
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"gopkg.in/yaml.v3"
+)
+
+// Config is the typed application configuration loaded from devour.yaml.
+//
+// Each section field maps to the top-level YAML key named in its struct tag.
+// ConfigPath is never read from or written to YAML (tag "-"); Load sets it to
+// the absolute path of the file it parsed and leaves it empty when no file
+// was found.
+type Config struct {
+	Version      int                `yaml:"version"`
+	Storage      StorageConfig      `yaml:"storage"`
+	Embeddings   EmbeddingsConfig   `yaml:"embeddings"`
+	VectorDB     VectorDBConfig     `yaml:"vector_db"`
+	Scraper      ScraperConfig      `yaml:"scraper"`
+	Scheduler    SchedulerConfig    `yaml:"scheduler"`
+	Server       ServerConfig       `yaml:"server"`
+	Indexing     IndexingConfig     `yaml:"indexing"`
+	Verification VerificationConfig `yaml:"verification"`
+	Sources      []SourceConfig     `yaml:"sources"`
+
+	ConfigPath string `yaml:"-"`
+}
+
+// StorageConfig is the `storage` section of the configuration. Its four
+// directories are the ones EnsureStorageDirs creates; ApplyDefaults fills any
+// that are empty with paths under ./devour_data.
+type StorageConfig struct {
+	DocsDir     string `yaml:"docs_dir"`
+	IndexDir    string `yaml:"index_dir"`
+	MetadataDir string `yaml:"metadata_dir"`
+	CacheDir    string `yaml:"cache_dir"`
+}
+
+// EmbeddingsConfig is the `embeddings` section of the configuration.
+//
+// Default sets APIKey to the literal placeholder "${OPENAI_API_KEY}"; nothing
+// in this file expands it, so any substitution must happen in the consumer.
+// ApplyDefaults does not touch APIKey or BaseURL.
+type EmbeddingsConfig struct {
+	Provider   string `yaml:"provider"`
+	Model      string `yaml:"model"`
+	Dimensions int    `yaml:"dimensions"`
+	APIKey     string `yaml:"api_key"`
+	BatchSize  int    `yaml:"batch_size"`
+	BaseURL    string `yaml:"base_url"`
+}
+
+// VectorDBConfig is the `vector_db` section of the configuration.
+// ApplyDefaults fills an empty Type and SimilarityMetric; Persist is kept as
+// decoded and PersistDir has no default in this file.
+type VectorDBConfig struct {
+	Type             string `yaml:"type"`
+	Persist          bool   `yaml:"persist"`
+	SimilarityMetric string `yaml:"similarity_metric"`
+	PersistDir       string `yaml:"persist_dir"`
+}
+
+// ScraperConfig is the `scraper` section of the configuration.
+//
+// ApplyDefaults replaces a non-positive Timeout, RetryCount, RetryDelay,
+// Concurrency or MaxDepth with its default, clamps a negative RateLimit to
+// zero, and falls back to Storage.CacheDir when CacheDir is empty.
+type ScraperConfig struct {
+	UserAgent   string        `yaml:"user_agent"`
+	Timeout     time.Duration `yaml:"timeout"`
+	RetryCount  int           `yaml:"retry_count"`
+	RetryDelay  time.Duration `yaml:"retry_delay"`
+	Concurrency int           `yaml:"concurrency"`
+	RateLimit   time.Duration `yaml:"rate_limit"`
+	MaxDepth    int           `yaml:"max_depth"`
+	CacheDir    string        `yaml:"cache_dir"`
+}
+
+// SchedulerConfig is the `scheduler` section of the configuration.
+// ApplyDefaults sets a non-positive Interval to 72h and an empty CheckMethod
+// to "hash"; the boolean fields are kept as decoded.
+type SchedulerConfig struct {
+	Enabled     bool          `yaml:"enabled"`
+	Interval    time.Duration `yaml:"interval"`
+	CheckMethod string        `yaml:"check_method"`
+	OnStartup   bool          `yaml:"on_startup"`
+}
+
+// ServerConfig is the `server` section of the configuration. Defaults are
+// mode "local", transport "stdio", host "localhost" and port 8080;
+// RenderInitYAML switches Mode to "remote" when asked to.
+type ServerConfig struct {
+	Mode      string `yaml:"mode"`
+	Transport string `yaml:"transport"`
+	Host      string `yaml:"host"`
+	Port      int    `yaml:"port"`
+}
+
+// IndexingConfig is the `indexing` section of the configuration.
+// ApplyDefaults keeps Enabled and AutoReindex as decoded and replaces a
+// non-positive SnippetLength or MaxDocs with 220 and 10000 respectively.
+type IndexingConfig struct {
+	Enabled       bool `yaml:"enabled"`
+	AutoReindex   bool `yaml:"auto_reindex"`
+	SnippetLength int  `yaml:"snippet_length"`
+	MaxDocs       int  `yaml:"max_docs"`
+}
+
+// VerificationConfig is the `verification` section of the configuration.
+// ApplyDefaults sets a non-positive Timeout to 90s and keeps Enabled as decoded.
+type VerificationConfig struct {
+	Enabled bool          `yaml:"enabled"`
+	Timeout time.Duration `yaml:"timeout"`
+}
+
+// SourceConfig describes one entry of the `sources` list.
+//
+// Name and Type are always written when marshaling; every other field is
+// tagged omitempty and is dropped when it holds its zero value. The init
+// template in this file shows example entries of type "url" and "localsearch".
+type SourceConfig struct {
+	Name        string   `yaml:"name"`
+	Type        string   `yaml:"type"`
+	URL         string   `yaml:"url,omitempty"`
+	Query       string   `yaml:"query,omitempty"`
+	ResultLimit int      `yaml:"result_limit,omitempty"`
+	Domains     []string `yaml:"domains,omitempty"`
+	Repo        string   `yaml:"repo,omitempty"`
+	Branch      string   `yaml:"branch,omitempty"`
+	Path        string   `yaml:"path,omitempty"`
+	Include     []string `yaml:"include,omitempty"`
+	Exclude     []string `yaml:"exclude,omitempty"`
+	Schedule    string   `yaml:"schedule,omitempty"`
+}
+
+// Default builds the baseline configuration used by both Load and
+// RenderInitYAML. Each call returns a freshly allocated Config, so callers are
+// free to mutate the result.
+func Default() *Config {
+	cfg := &Config{Version: 1, Sources: []SourceConfig{}}
+
+	cfg.Storage = StorageConfig{
+		DocsDir:     "./devour_data/docs",
+		IndexDir:    "./devour_data/index",
+		MetadataDir: "./devour_data/metadata",
+		CacheDir:    "./devour_data/cache",
+	}
+
+	cfg.Embeddings = EmbeddingsConfig{
+		Provider:   "openai",
+		Model:      "text-embedding-3-small",
+		Dimensions: 1536,
+		BatchSize:  100,
+		// Placeholder text; this file does not expand it.
+		APIKey: "${OPENAI_API_KEY}",
+	}
+
+	cfg.VectorDB = VectorDBConfig{
+		Type:             "memory",
+		Persist:          true,
+		SimilarityMetric: "cosine",
+	}
+
+	cfg.Scraper = ScraperConfig{
+		UserAgent:   "Devour/1.0",
+		Timeout:     30 * time.Second,
+		RetryCount:  3,
+		RetryDelay:  1 * time.Second,
+		Concurrency: 10,
+		RateLimit:   500 * time.Millisecond,
+		MaxDepth:    3,
+		CacheDir:    "./devour_data/cache",
+	}
+
+	cfg.Scheduler = SchedulerConfig{
+		Enabled:     true,
+		Interval:    72 * time.Hour,
+		CheckMethod: "hash",
+		OnStartup:   false,
+	}
+
+	cfg.Server = ServerConfig{
+		Mode:      "local",
+		Transport: "stdio",
+		Host:      "localhost",
+		Port:      8080,
+	}
+
+	cfg.Indexing = IndexingConfig{
+		Enabled:       true,
+		AutoReindex:   true,
+		SnippetLength: 220,
+		MaxDocs:       10000,
+	}
+
+	cfg.Verification = VerificationConfig{
+		Enabled: true,
+		Timeout: 90 * time.Second,
+	}
+
+	return cfg
+}
+
+// initTemplateSourcesComment is the tail that RenderInitYAML appends after the
+// marshaled defaults: the `sources: []` key followed by commented-out example
+// entries. It starts with a newline so it can be concatenated directly onto
+// the last marshaled line.
+const initTemplateSourcesComment = `
+# Sources (add your own)
+sources: []
+  # - name: example-docs
+  #   type: url
+  #   url: https://docs.example.com
+  #   include: ["**/*.md", "**/*.html"]
+  # - name: local-searxng
+  #   type: localsearch
+  #   url: http://127.0.0.1:8080/search
+  #   query: golang http client
+  #   result_limit: 8
+  #   domains: ["pkg.go.dev", "go.dev"]
+`
+
+// RenderInitYAML returns the default init config file content from canonical defaults.
+func RenderInitYAML(remote bool) (string, error) {
+	cfg := Default()
+	if remote {
+		cfg.Server.Mode = "remote"
+	}
+	// Keep the init template comments for discoverability while sourcing
+	// the actual values from canonical defaults.
+	cfg.Sources = nil
+
+	body, err := yaml.Marshal(cfg)
+	if err != nil {
+		return "", fmt.Errorf("marshal default config: %w", err)
+	}
+
+	trimmed := strings.TrimSuffix(string(body), "\n")
+	if strings.HasSuffix(trimmed, "sources: []") {
+		trimmed = strings.TrimSuffix(trimmed, "sources: []")
+		trimmed = strings.TrimSpace(trimmed)
+	}
+
+	return "# Devour Configuration\n" + trimmed + initTemplateSourcesComment, nil
+}
+
+// Load reads the configuration from explicitPath or, when that is blank, from
+// the first default location that exists.
+//
+// The file is decoded on top of Default, so keys absent from the YAML keep
+// their default values, and ApplyDefaults then repairs empty or non-positive
+// fields. When no file is found and no explicit path was given, the defaults
+// are returned and ConfigPath stays empty.
+func Load(explicitPath string) (*Config, error) {
+	path, err := findConfigPath(explicitPath)
+	if err != nil {
+		return nil, err
+	}
+
+	cfg := Default()
+	if path != "" {
+		raw, readErr := os.ReadFile(path)
+		if readErr != nil {
+			return nil, fmt.Errorf("read config: %w", readErr)
+		}
+		if parseErr := yaml.Unmarshal(raw, cfg); parseErr != nil {
+			return nil, fmt.Errorf("parse config: %w", parseErr)
+		}
+		cfg.ConfigPath = path
+	}
+
+	cfg.ApplyDefaults()
+	return cfg, nil
+}
+
+// ApplyDefaults fills unset or invalid fields so that configs missing newer
+// keys keep working after unmarshaling.
+//
+// Fallback values are taken from Default, which keeps the two in step. Strings
+// are replaced when empty, counts and durations when non-positive, and Version
+// only when it is exactly zero. Boolean fields are left as decoded. A negative
+// Scraper.RateLimit is clamped to zero, and an empty Scraper.CacheDir follows
+// the (already defaulted) Storage.CacheDir rather than the built-in path.
+func (c *Config) ApplyDefaults() {
+	d := Default()
+
+	str := func(dst *string, fallback string) {
+		if *dst == "" {
+			*dst = fallback
+		}
+	}
+	num := func(dst *int, fallback int) {
+		if *dst <= 0 {
+			*dst = fallback
+		}
+	}
+	dur := func(dst *time.Duration, fallback time.Duration) {
+		if *dst <= 0 {
+			*dst = fallback
+		}
+	}
+
+	if c.Version == 0 {
+		c.Version = d.Version
+	}
+
+	str(&c.Storage.DocsDir, d.Storage.DocsDir)
+	str(&c.Storage.IndexDir, d.Storage.IndexDir)
+	str(&c.Storage.MetadataDir, d.Storage.MetadataDir)
+	str(&c.Storage.CacheDir, d.Storage.CacheDir)
+
+	str(&c.Embeddings.Provider, d.Embeddings.Provider)
+	str(&c.Embeddings.Model, d.Embeddings.Model)
+	num(&c.Embeddings.Dimensions, d.Embeddings.Dimensions)
+	num(&c.Embeddings.BatchSize, d.Embeddings.BatchSize)
+
+	str(&c.VectorDB.Type, d.VectorDB.Type)
+	str(&c.VectorDB.SimilarityMetric, d.VectorDB.SimilarityMetric)
+
+	str(&c.Scraper.UserAgent, d.Scraper.UserAgent)
+	dur(&c.Scraper.Timeout, d.Scraper.Timeout)
+	num(&c.Scraper.RetryCount, d.Scraper.RetryCount)
+	dur(&c.Scraper.RetryDelay, d.Scraper.RetryDelay)
+	num(&c.Scraper.Concurrency, d.Scraper.Concurrency)
+	// Zero is a valid rate limit, so only negative values are corrected.
+	if c.Scraper.RateLimit < 0 {
+		c.Scraper.RateLimit = 0
+	}
+	num(&c.Scraper.MaxDepth, d.Scraper.MaxDepth)
+	// Must run after Storage.CacheDir has been defaulted above.
+	str(&c.Scraper.CacheDir, c.Storage.CacheDir)
+
+	dur(&c.Scheduler.Interval, d.Scheduler.Interval)
+	str(&c.Scheduler.CheckMethod, d.Scheduler.CheckMethod)
+
+	str(&c.Server.Mode, d.Server.Mode)
+	str(&c.Server.Transport, d.Server.Transport)
+	str(&c.Server.Host, d.Server.Host)
+	num(&c.Server.Port, d.Server.Port)
+
+	// Indexing.Enabled is intentionally not defaulted: an explicit false
+	// must survive.
+	num(&c.Indexing.SnippetLength, d.Indexing.SnippetLength)
+	num(&c.Indexing.MaxDocs, d.Indexing.MaxDocs)
+
+	dur(&c.Verification.Timeout, d.Verification.Timeout)
+}
+
+// findConfigPath resolves which configuration file to load and returns its
+// absolute path.
+//
+// A non-blank explicitPath must name an existing non-directory file. A missing
+// file yields "config file not found"; any other Stat failure (for example
+// permission denied) is returned wrapped so the real cause is visible. When
+// explicitPath is blank, ./devour.yaml and then ~/.devour/devour.yaml are
+// probed and the first that exists wins. An empty path with a nil error means
+// no config file was found.
+func findConfigPath(explicitPath string) (string, error) {
+	if strings.TrimSpace(explicitPath) != "" {
+		p, err := filepath.Abs(explicitPath)
+		if err != nil {
+			return "", fmt.Errorf("resolve config path %q: %w", explicitPath, err)
+		}
+		info, err := os.Stat(p)
+		switch {
+		case os.IsNotExist(err):
+			return "", fmt.Errorf("config file not found: %s", explicitPath)
+		case err != nil:
+			return "", fmt.Errorf("stat config file %s: %w", explicitPath, err)
+		case info.IsDir():
+			return "", fmt.Errorf("config path is a directory: %s", explicitPath)
+		}
+		return p, nil
+	}
+
+	candidates := []string{"./devour.yaml"}
+	// The home-directory candidate is best effort: skip it if the home
+	// directory cannot be determined.
+	if home, err := os.UserHomeDir(); err == nil {
+		candidates = append(candidates, filepath.Join(home, ".devour", "devour.yaml"))
+	}
+
+	for _, c := range candidates {
+		if _, err := os.Stat(c); err != nil {
+			continue
+		}
+		p, err := filepath.Abs(c)
+		if err != nil {
+			return "", fmt.Errorf("resolve config path %q: %w", c, err)
+		}
+		return p, nil
+	}
+	return "", nil
+}
+
+// EnsureStorageDirs makes sure the docs, index, metadata and cache directories
+// of the storage section exist, creating missing parents with mode 0755.
+// Blank entries are ignored, and the first MkdirAll failure is returned as is.
+func (c *Config) EnsureStorageDirs() error {
+	storage := c.Storage
+	for _, dir := range [...]string{storage.DocsDir, storage.IndexDir, storage.MetadataDir, storage.CacheDir} {
+		if strings.TrimSpace(dir) != "" {
+			if err := os.MkdirAll(dir, 0o755); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
@@ -0,0 +1,130 @@
+package projectstate
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+// SourceState is the per-source record stored in source_state.json.
+//
+// NOTE: encoding/json's omitempty has no effect on struct-typed fields such as
+// time.Time, so LastSync is written even when it is the zero time.
+type SourceState struct {
+	Name      string    `json:"name"`
+	Type      string    `json:"type"`
+	URL       string    `json:"url,omitempty"`
+	Hash      string    `json:"hash,omitempty"`
+	LastSync  time.Time `json:"last_sync,omitempty"`
+	DocCount  int       `json:"doc_count"`
+	LastError string    `json:"last_error,omitempty"`
+}
+
+// SourceStateFile is the JSON document read by LoadSourceState and written by
+// SaveSourceState. SaveSourceState overwrites UpdatedAt with the current time
+// on every save. Sources is presumably keyed by source name — confirm against
+// the callers that populate it.
+type SourceStateFile struct {
+	UpdatedAt time.Time               `json:"updated_at"`
+	Sources   map[string]*SourceState `json:"sources"`
+}
+
+// DocsStats is the summary produced by CollectDocsStats for a docs directory.
+type DocsStats struct {
+	// DocumentCount is the number of .json, .md and .txt files found.
+	DocumentCount int
+	// LastUpdated is the newest modification time among all files visited,
+	// including files that are not counted as documents.
+	LastUpdated time.Time
+	// BySource counts .json documents by their non-blank "source" field.
+	BySource map[string]int
+	// StorageBytes is the total size of all files visited.
+	StorageBytes int64
+}
+
+// docSummary decodes only the "source" field of a stored JSON document; all
+// other keys in the file are ignored.
+type docSummary struct {
+	Source string `json:"source"`
+}
+
+const sourceStateFileName = "source_state.json"
+
+func LoadSourceState(metadataDir string) (*SourceStateFile, error) {
+	path := filepath.Join(metadataDir, sourceStateFileName)
+	b, err := os.ReadFile(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return &SourceStateFile{UpdatedAt: time.Now(), Sources: map[string]*SourceState{}}, nil
+		}
+		return nil, err
+	}
+
+	var state SourceStateFile
+	if err := json.Unmarshal(b, &state); err != nil {
+		return nil, err
+	}
+	if state.Sources == nil {
+		state.Sources = map[string]*SourceState{}
+	}
+	return &state, nil
+}
+
+// SaveSourceState writes state as indented JSON to source_state.json inside
+// metadataDir, creating the directory (mode 0755) if needed.
+//
+// state is modified in place before anything touches the disk: a nil Sources
+// map is replaced by an empty one and UpdatedAt is set to the current time. A
+// nil state is rejected with an error.
+func SaveSourceState(metadataDir string, state *SourceStateFile) error {
+	if state == nil {
+		return fmt.Errorf("state is required")
+	}
+	if state.Sources == nil {
+		state.Sources = make(map[string]*SourceState)
+	}
+	state.UpdatedAt = time.Now()
+
+	if mkErr := os.MkdirAll(metadataDir, 0o755); mkErr != nil {
+		return mkErr
+	}
+
+	encoded, encErr := json.MarshalIndent(state, "", "  ")
+	if encErr != nil {
+		return encErr
+	}
+	return os.WriteFile(filepath.Join(metadataDir, sourceStateFileName), encoded, 0o644)
+}
+
+// CollectDocsStats walks docsDir and summarizes what it finds.
+//
+// Every file contributes to StorageBytes and LastUpdated. Files ending in
+// .json, .md or .txt (case-insensitive) are counted as documents, and .json
+// documents are additionally tallied in BySource by their "source" field.
+// Entries that vanish during the walk, including a missing docsDir, are
+// ignored. A .json file that cannot be read or parsed is still counted as a
+// document but adds nothing to BySource.
+func CollectDocsStats(docsDir string) (*DocsStats, error) {
+	stats := &DocsStats{BySource: map[string]int{}}
+
+	visit := func(path string, entry fs.DirEntry, walkErr error) error {
+		switch {
+		case walkErr != nil && os.IsNotExist(walkErr):
+			return nil
+		case walkErr != nil:
+			return walkErr
+		case entry.IsDir():
+			return nil
+		}
+
+		info, err := entry.Info()
+		if err != nil {
+			return err
+		}
+		stats.StorageBytes += info.Size()
+		if modified := info.ModTime(); modified.After(stats.LastUpdated) {
+			stats.LastUpdated = modified
+		}
+
+		switch strings.ToLower(filepath.Ext(path)) {
+		case ".md", ".txt":
+			stats.DocumentCount++
+		case ".json":
+			stats.DocumentCount++
+			raw, readErr := os.ReadFile(path)
+			if readErr != nil {
+				// Best effort: the document is counted, only its source is unknown.
+				return nil
+			}
+			var summary docSummary
+			if json.Unmarshal(raw, &summary) != nil {
+				return nil
+			}
+			if source := strings.TrimSpace(summary.Source); source != "" {
+				stats.BySource[source]++
+			}
+		}
+		return nil
+	}
+
+	if err := filepath.WalkDir(docsDir, visit); err != nil {
+		return nil, err
+	}
+	return stats, nil
+}
@@ -398,8 +398,8 @@ func NewSecretsDetector() *SecretsDetector {
 			{Name: "GitHub OAuth", Pattern: regexp.MustCompile(`gho_[0-9a-zA-Z]{36}`), Severity: quality.SeverityT4},
 			{Name: "GitHub App Token", Pattern: regexp.MustCompile(`(ghu|ghs)_[0-9a-zA-Z]{36}`), Severity: quality.SeverityT4},
 			{Name: "Slack Token", Pattern: regexp.MustCompile(`xox[baprs]-[0-9]{10,13}-[0-9]{10,13}[a-zA-Z0-9]{24}`), Severity: quality.SeverityT4},
-			{Name: "RSA Private Key", Pattern: regexp.MustCompile(`-----BEGIN RSA PRIVATE KEY-----`), Severity: quality.SeverityT4},
-			{Name: "Private Key", Pattern: regexp.MustCompile(`-----BEGIN PRIVATE KEY-----`), Severity: quality.SeverityT4},
+			{Name: "RSA Private Key", Pattern: regexp.MustCompile(`-----BEGIN ` + `RSA PRIVATE KEY-----`), Severity: quality.SeverityT4},
+			{Name: "Private Key", Pattern: regexp.MustCompile(`-----BEGIN ` + `PRIVATE KEY-----`), Severity: quality.SeverityT4},
 			{Name: "JWT", Pattern: regexp.MustCompile(`eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*`), Severity: quality.SeverityT3},
 			{Name: "Generic API Key", Pattern: regexp.MustCompile(`(?i)(api_key|apikey|secret|password|token)\s*[=:]\s*['"][^'"]{8,}['"]`), Severity: quality.SeverityT3},
 			{Name: "DB Connection String", Pattern: regexp.MustCompile(`(?i)(mysql|postgres|mongodb)://[^:]+:[^@]+@[^/]+`), Severity: quality.SeverityT4},
@@ -0,0 +1,79 @@
+package quality
+
+import "strings"
+
+// docsEvidence is the documentation backing attached to a finding type:
+// reference URLs, a short rationale, and a confidence value kept as a string
+// because it is copied verbatim into the finding's string metadata.
+type docsEvidence struct {
+	URLs       []string
+	Rationale  string
+	Confidence string
+}
+
+// defaultEvidenceByType maps a finding's Type to the evidence that
+// AttachDocsEvidence copies into its metadata. Finding types without an entry
+// are left unannotated. "dead_code" and "dead_code_enhanced" deliberately
+// carry identical evidence.
+// NOTE(review): every URL points at Go documentation, yet AttachDocsEvidence
+// applies these entries for any language — confirm that is intended.
+var defaultEvidenceByType = map[string]docsEvidence{
+	"complexity_ast": {
+		URLs:       []string{"https://go.dev/doc/effective_go", "https://go.dev/wiki/CodeReviewComments"},
+		Rationale:  "High complexity correlates with maintainability and defect risk; official style guidance recommends smaller focused functions.",
+		Confidence: "0.82",
+	},
+	"god_function": {
+		URLs:       []string{"https://go.dev/doc/effective_go", "https://go.dev/wiki/CodeReviewComments"},
+		Rationale:  "Large multi-responsibility functions usually violate readability and testability guidance.",
+		Confidence: "0.84",
+	},
+	"unused_import": {
+		URLs:       []string{"https://pkg.go.dev/cmd/go", "https://pkg.go.dev/go/importer"},
+		Rationale:  "Unused imports break build hygiene and indicate stale code paths.",
+		Confidence: "0.95",
+	},
+	"dead_code": {
+		URLs:       []string{"https://pkg.go.dev/cmd/go", "https://go.dev/wiki/CodeReviewComments"},
+		Rationale:  "Unreachable or unused symbols increase maintenance overhead with no runtime value.",
+		Confidence: "0.90",
+	},
+	"dead_code_enhanced": {
+		URLs:       []string{"https://pkg.go.dev/cmd/go", "https://go.dev/wiki/CodeReviewComments"},
+		Rationale:  "Unreachable or unused symbols increase maintenance overhead with no runtime value.",
+		Confidence: "0.90",
+	},
+	"duplication": {
+		URLs:       []string{"https://go.dev/wiki/CodeReviewComments"},
+		Rationale:  "Duplication increases change cost and risk of inconsistent bug fixes.",
+		Confidence: "0.80",
+	},
+	"single_use": {
+		URLs:       []string{"https://go.dev/doc/effective_go", "https://go.dev/wiki/CodeReviewComments"},
+		Rationale:  "Single-use abstractions can reduce clarity unless they encode reusable domain behavior.",
+		Confidence: "0.74",
+	},
+	"test_coverage": {
+		URLs:       []string{"https://go.dev/doc/tutorial/add-a-test", "https://pkg.go.dev/testing"},
+		Rationale:  "Coverage gaps on changed code increase regression probability.",
+		Confidence: "0.78",
+	},
+}
+
+// AttachDocsEvidence adds documentation evidence to the metadata of every
+// finding whose Type has an entry in defaultEvidenceByType.
+//
+// The findings slice is modified in place and also returned. Metadata maps are
+// created on demand. language is trimmed and lower-cased and, when non-empty,
+// recorded under "docs_evidence_language". Findings of unknown type are left
+// untouched.
+func AttachDocsEvidence(language string, findings []Finding) []Finding {
+	lang := strings.ToLower(strings.TrimSpace(language))
+
+	for idx := range findings {
+		// Work through a pointer so the writes land in the caller's slice.
+		finding := &findings[idx]
+
+		evidence, known := defaultEvidenceByType[finding.Type]
+		if !known {
+			continue
+		}
+		if finding.Metadata == nil {
+			finding.Metadata = map[string]string{}
+		}
+		meta := finding.Metadata
+
+		if len(evidence.URLs) > 0 {
+			meta["docs_evidence_urls"] = strings.Join(evidence.URLs, " | ")
+		}
+		if evidence.Rationale != "" {
+			meta["docs_evidence_rationale"] = evidence.Rationale
+		}
+		if evidence.Confidence != "" {
+			meta["docs_evidence_confidence"] = evidence.Confidence
+		}
+		if lang != "" {
+			meta["docs_evidence_language"] = lang
+		}
+	}
+	return findings
+}
@@ -104,7 +104,7 @@ func (f *DefaultFileFinder) FindFiles(path string, language string) ([]string, e
 		if info.IsDir() {
 			// Skip hidden directories and common exclude dirs
 			base := filepath.Base(filePath)
-			if strings.HasPrefix(base, ".") || base == "node_modules" || base == "vendor" {
+			if filePath != path && (strings.HasPrefix(base, ".") || base == "node_modules" || base == "vendor") {
 				return filepath.SkipDir
 			}
 			return nil
@@ -170,6 +170,37 @@ func TestDefaultFileFinder_FindFiles_EmptyDirectory(t *testing.T) {
 	}
 }

+// TestDefaultFileFinder_FindFiles_DotPathRootNotSkipped guards against the
+// walk root "." being mistaken for a hidden directory and skipped.
+func TestDefaultFileFinder_FindFiles_DotPathRootNotSkipped(t *testing.T) {
+	// t.TempDir removes the directory when the test ends and fails the test
+	// if that removal does not succeed.
+	tmpDir := t.TempDir()
+
+	if err := os.WriteFile(filepath.Join(tmpDir, "main.go"), []byte("package main"), 0644); err != nil {
+		t.Fatalf("Failed to write go file: %v", err)
+	}
+
+	cwd, err := os.Getwd()
+	if err != nil {
+		t.Fatalf("Failed to get cwd: %v", err)
+	}
+	if err := os.Chdir(tmpDir); err != nil {
+		t.Fatalf("Failed to chdir: %v", err)
+	}
+	// Cleanups run last-in-first-out, so the working directory is restored
+	// before TempDir deletes the directory we are standing in.
+	t.Cleanup(func() {
+		if err := os.Chdir(cwd); err != nil {
+			t.Errorf("Failed to restore cwd: %v", err)
+		}
+	})
+
+	finder := NewDefaultFileFinder()
+	files, err := finder.FindFiles(".", "go")
+	if err != nil {
+		t.Fatalf("FindFiles() failed: %v", err)
+	}
+	if len(files) != 1 {
+		t.Fatalf("FindFiles('.') expected 1 file, got %d", len(files))
+	}
+	if got := filepath.Base(files[0]); got != "main.go" {
+		t.Fatalf("FindFiles('.') returned %q, want main.go", files[0])
+	}
+}
+
 func TestDefaultFileFinder_FindFiles_NonExistentPath(t *testing.T) {
 	finder := NewDefaultFileFinder()
 	files, err := finder.FindFiles("/non/existent/path", "go")
@@ -58,7 +58,10 @@ func (d *SingleUseDetector) Detect(ctx context.Context, path string, config *qua

 			switch obj := obj.(type) {
 			case *types.Func:
-				key := obj.Pkg().Path() + "." + obj.Name()
+				key, ok := functionKey(obj)
+				if !ok {
+					continue
+				}
 				callCounts[key]++
 			case *types.TypeName:
 				if obj.Pkg() != nil {
@@ -75,17 +78,18 @@ func (d *SingleUseDetector) Detect(ctx context.Context, path string, config *qua

 			switch obj := obj.(type) {
 			case *types.Func:
-				if obj.Pkg() != nil {
-					key := obj.Pkg().Path() + "." + obj.Name()
-					pos := pkg.Fset.Position(obj.Pos())
-					funcDefs[key] = FuncDef{
-						Name:      obj.Name(),
-						File:      pos.Filename,
-						Line:      pos.Line,
-						Package:   obj.Pkg().Path(),
-						Exported:  obj.Exported(),
-						Signature: obj.Type().String(),
-					}
+				key, ok := functionKey(obj)
+				if !ok {
+					continue
+				}
+				pos := pkg.Fset.Position(obj.Pos())
+				funcDefs[key] = FuncDef{
+					Name:      obj.Name(),
+					File:      pos.Filename,
+					Line:      pos.Line,
+					Package:   obj.Pkg().Path(),
+					Exported:  obj.Exported(),
+					Signature: obj.Type().String(),
 				}
 			case *types.TypeName:
 				if obj.Pkg() != nil {
@@ -109,6 +113,9 @@ func (d *SingleUseDetector) Detect(ctx context.Context, path string, config *qua
 	var findings []quality.Finding

 	for key, def := range funcDefs {
+		if def.Exported || isLikelyEntrypointFile(def.File) {
+			continue
+		}
 		if strings.HasSuffix(def.Name, "Test") || strings.HasPrefix(def.Name, "Test") {
 			continue
 		}
@@ -143,9 +150,18 @@ func (d *SingleUseDetector) Detect(ctx context.Context, path string, config *qua
 	}

 	for key, def := range typeDefs {
+		if def.Exported || isLikelyEntrypointFile(def.File) {
+			continue
+		}
 		if strings.HasSuffix(def.Name, "Error") || strings.HasSuffix(def.Name, "Options") {
 			continue
 		}
+		if strings.HasSuffix(def.Name, "Config") || strings.HasSuffix(def.Name, "Params") {
+			continue
+		}
+		if !strings.Contains(def.Underlying, "struct") && !strings.Contains(def.Underlying, "interface") {
+			continue
+		}

 		count := typeUsages[key]
 		if count == 1 {
@@ -242,6 +258,22 @@ func (d *SingleUseDetector) getFuncLOC(file string, startLine int) (int, error)
 	return loc, nil
 }

+// functionKey builds the "<package path>.<name>" key used to match function
+// definitions with their uses. It reports false for nil functions, functions
+// without a package, and methods (signatures that have a receiver), so only
+// package-level functions are tracked.
+func functionKey(fn *types.Func) (string, bool) {
+	if fn == nil {
+		return "", false
+	}
+	pkg := fn.Pkg()
+	if pkg == nil {
+		return "", false
+	}
+	if sig, isSig := fn.Type().(*types.Signature); isSig && sig.Recv() != nil {
+		return "", false
+	}
+	return pkg.Path() + "." + fn.Name(), true
+}
+
+// isLikelyEntrypointFile reports whether path looks like a program entry point
+// or a test file: anything under a cmd/ directory, any file named main.go, or
+// any *_test.go file. Separators are normalized to "/" first, and both
+// relative ("cmd/x.go", "main.go") and nested ("a/cmd/x.go", "a/main.go")
+// forms are recognized.
+func isLikelyEntrypointFile(path string) bool {
+	p := filepath.ToSlash(path)
+	if strings.HasSuffix(p, "_test.go") {
+		return true
+	}
+	// A bare "main.go" has no leading slash, so check it explicitly.
+	if p == "main.go" || strings.HasSuffix(p, "/main.go") {
+		return true
+	}
+	return strings.HasPrefix(p, "cmd/") || strings.Contains(p, "/cmd/")
+}
+
 type FuncDef struct {
 	Name      string
 	File      string
@@ -471,33 +503,36 @@ func (d *EnhancedDeadCodeDetector) Detect(ctx context.Context, path string, conf
 			switch o := obj.(type) {
 			case *types.Func:
 				defs[key] = ObjInfo{
-					Name:      obj.Name(),
-					Type:      "function",
-					File:      pos.Filename,
-					Line:      pos.Line,
-					Package:   obj.Pkg().Path(),
-					Exported:  obj.Exported(),
-					Signature: o.Type().String(),
+					Name:        obj.Name(),
+					Type:        "function",
+					File:        pos.Filename,
+					Line:        pos.Line,
+					Package:     obj.Pkg().Path(),
+					PackageName: pkg.Name,
+					Exported:    obj.Exported(),
+					Signature:   o.Type().String(),
 				}
 			case *types.TypeName:
 				defs[key] = ObjInfo{
-					Name:       obj.Name(),
-					Type:       "type",
-					File:       pos.Filename,
-					Line:       pos.Line,
-					Package:    obj.Pkg().Path(),
-					Exported:   obj.Exported(),
-					Underlying: o.Type().Underlying().String(),
+					Name:        obj.Name(),
+					Type:        "type",
+					File:        pos.Filename,
+					Line:        pos.Line,
+					Package:     obj.Pkg().Path(),
+					PackageName: pkg.Name,
+					Exported:    obj.Exported(),
+					Underlying:  o.Type().Underlying().String(),
 				}
 			case *types.Var:
-				if obj.Exported() {
+				if obj.Exported() && !o.IsField() {
 					defs[key] = ObjInfo{
-						Name:     obj.Name(),
-						Type:     "variable",
-						File:     pos.Filename,
-						Line:     pos.Line,
-						Package:  obj.Pkg().Path(),
-						Exported: obj.Exported(),
+						Name:        obj.Name(),
+						Type:        "variable",
+						File:        pos.Filename,
+						Line:        pos.Line,
+						Package:     obj.Pkg().Path(),
+						PackageName: pkg.Name,
+						Exported:    obj.Exported(),
 					}
 				}
 			}
@@ -521,10 +556,22 @@ func (d *EnhancedDeadCodeDetector) Detect(ctx context.Context, path string, conf
 		if entryPoints[key] {
 			continue
 		}
+		if !strings.Contains(def.Package, "/internal/") || def.PackageName == "main" {
+			continue
+		}
+		if isLikelyEntrypointFile(def.File) {
+			continue
+		}

 		if strings.HasPrefix(def.Name, "Test") || strings.HasPrefix(def.Name, "Benchmark") || strings.HasPrefix(def.Name, "Fuzz") {
 			continue
 		}
+		if def.Type == "function" && strings.HasPrefix(def.Name, "New") {
+			continue
+		}
+		if def.Type == "type" && (strings.HasSuffix(def.Name, "Config") || strings.HasSuffix(def.Name, "Options")) {
+			continue
+		}

 		if strings.HasSuffix(def.Name, "Error") && def.Type == "type" {
 			continue
@@ -573,12 +620,13 @@ func (d *EnhancedDeadCodeDetector) Detect(ctx context.Context, path string, conf
 }

 type ObjInfo struct {
-	Name       string
-	Type       string
-	File       string
-	Line       int
-	Package    string
-	Exported   bool
-	Signature  string
-	Underlying string
+	Name        string
+	Type        string
+	File        string
+	Line        int
+	Package     string
+	PackageName string
+	Exported    bool
+	Signature   string
+	Underlying  string
 }
@@ -172,8 +172,7 @@ func (d *UnusedImportDetector) analyzeFile(path string) ([]quality.Finding, erro
 		if imp.Name != nil {
 			name = imp.Name.Name
 		} else {
-			parts := strings.Split(pkgPath, "/")
-			name = parts[len(parts)-1]
+			name = inferImportName(pkgPath)
 		}
 		imports[pkgPath] = name
 	}
@@ -191,8 +190,7 @@ func (d *UnusedImportDetector) analyzeFile(path string) ([]quality.Finding, erro
 		if imp.Name != nil {
 			name = imp.Name.Name
 		} else {
-			parts := strings.Split(pkgPath, "/")
-			name = parts[len(parts)-1]
+			name = inferImportName(pkgPath)
 		}

 		if name == "_" || name == "." {
@@ -224,6 +222,42 @@ func (d *UnusedImportDetector) analyzeFile(path string) ([]quality.Finding, erro
 	return findings, nil
 }

+// inferImportName guesses the identifier an unnamed import binds, using only
+// the import path.
+//
+// It starts from the last path segment, steps back one segment when that is a
+// version segment (".../colly/v2" -> "colly"), and strips a trailing
+// gopkg.in-style ".vN" suffix ("yaml.v3" -> "yaml"). This is a heuristic: the
+// real package name is declared in the package's source and may differ.
+func inferImportName(pkgPath string) string {
+	// strings.Split with a non-empty separator always returns at least one
+	// element, so indexing the last segment is safe even for "".
+	segments := strings.Split(pkgPath, "/")
+	name := segments[len(segments)-1]
+
+	if len(segments) >= 2 && isSemverSegment(name) {
+		name = segments[len(segments)-2]
+	}
+	// Use the last ".v" so names that contain ".v" earlier are not cut short.
+	if idx := strings.LastIndex(name, ".v"); idx > 0 && isDigits(name[idx+2:]) {
+		name = name[:idx]
+	}
+
+	return name
+}
+
+// isSemverSegment reports whether segment is a Go module major-version suffix,
+// i.e. "v" followed by a decimal number of 2 or more without leading zeros.
+//
+// "v0" and "v1" are rejected on purpose: modules never carry those suffixes,
+// so a trailing "v1" is a real package name (for example k8s.io/api/core/v1).
+func isSemverSegment(segment string) bool {
+	if len(segment) < 2 || segment[0] != 'v' {
+		return false
+	}
+	digits := segment[1:]
+	if digits[0] == '0' || digits == "1" {
+		return false
+	}
+	return isDigits(digits)
+}
+
+// isDigits reports whether value is non-empty and consists solely of the
+// ASCII digits '0' through '9'.
+func isDigits(value string) bool {
+	for i := 0; i < len(value); i++ {
+		if c := value[i]; c < '0' || c > '9' {
+			return false
+		}
+	}
+	return value != ""
+}
+
 type CycleDetector struct {
 	*quality.BaseDetector
 }
@@ -0,0 +1,22 @@
+package analyzers
+
+import "testing"
+
+// TestInferImportName checks the import-name heuristic against plain,
+// gopkg.in-versioned and module-versioned import paths. Each path runs as its
+// own subtest so one failure does not hide the others.
+func TestInferImportName(t *testing.T) {
+	tests := []struct {
+		path string
+		want string
+	}{
+		{path: "fmt", want: "fmt"},
+		{path: "gopkg.in/yaml.v3", want: "yaml"},
+		{path: "github.com/gocolly/colly/v2", want: "colly"},
+		{path: "golang.org/x/tools/go/packages", want: "packages"},
+	}
+
+	for _, tt := range tests {
+		tt := tt
+		t.Run(tt.path, func(t *testing.T) {
+			if got := inferImportName(tt.path); got != tt.want {
+				t.Errorf("inferImportName(%q) = %q, want %q", tt.path, got, tt.want)
+			}
+		})
+	}
+}
@@ -240,6 +240,10 @@ func (d *DebugLogDetector) analyzeFile(path string) []quality.Finding {
 	if err != nil {
 		return nil
 	}
+	normPath := filepath.ToSlash(path)
+	if strings.Contains(normPath, "internal/ui/") || strings.Contains(normPath, "examples/") {
+		return nil
+	}

 	debugPatterns := []string{
 		"log.Print",
@@ -267,7 +271,7 @@ func (d *DebugLogDetector) analyzeFile(path string) []quality.Finding {

 		for _, pattern := range debugPatterns {
 			if callStr == pattern || strings.HasPrefix(callStr, pattern) {
-				if strings.Contains(path, "_test.go") {
+				if strings.HasSuffix(normPath, "_test.go") || strings.HasPrefix(normPath, "cmd/") || strings.Contains(normPath, "/cmd/") {
 					return true
 				}

@@ -291,7 +295,7 @@ func (d *DebugLogDetector) analyzeFile(path string) []quality.Finding {
 			}
 		}

-		if strings.Contains(path, "/cmd/") {
+		if strings.HasPrefix(normPath, "cmd/") || strings.Contains(normPath, "/cmd/") {
 			return true
 		}

@@ -42,7 +42,6 @@ func (p *GoPlugin) DefaultSrcDir() string {

 func (p *GoPlugin) CreateDetectors(finder quality.FileFinder) []quality.Detector {
 	return []quality.Detector{
-		analyzers.NewDeadCodeDetector(finder),
 		analyzers.NewEnhancedDeadCodeDetector(finder),
 		analyzers.NewUnusedImportDetector(finder),
 		analyzers.NewCycleDetector(finder),
@@ -67,13 +67,13 @@ func (s *Scanner) Scan(ctx context.Context) (*ScanResult, error) {
 		// Skip language-specific detectors for different languages
 		if langDetector, ok := detector.(LanguageDetector); ok {
 			supported := langDetector.SupportedLanguages()
-			if !contains(supported, language) {
+			if len(supported) > 0 && !contains(supported, language) {
 				log.Printf("Skipping detector %s for language %s", name, language)
 				continue
 			}
 		}

-		findings, err := detector.Detect(ctx, s.config.Path, s.config)
+		findings, err := s.runDetectorSafely(ctx, detector, name)
 		if err != nil {
 			log.Printf("Detector %s failed: %v", name, err)
 			continue
@@ -106,28 +106,21 @@ func (s *Scanner) Scan(ctx context.Context) (*ScanResult, error) {
 	return result, nil
 }

+// runDetectorSafely invokes detector.Detect with the scanner's configured path
+// and config, converting a panic inside the detector into an ordinary error
+// so the caller can log it and carry on. On panic no findings are returned.
+func (s *Scanner) runDetectorSafely(ctx context.Context, detector Detector, name string) (findings []Finding, err error) {
+	defer func() {
+		recovered := recover()
+		if recovered == nil {
+			return
+		}
+		findings = nil
+		err = fmt.Errorf("detector panic in %s: %v", name, recovered)
+	}()
+
+	findings, err = detector.Detect(ctx, s.config.Path, s.config)
+	return findings, err
+}
+
 // detectLanguage attempts to auto-detect the project language
 func (s *Scanner) detectLanguage(path string) string {
-	// Check for marker files
-	markers := map[string]string{
-		"go.mod":           "go",
-		"package.json":     "typescript",
-		"tsconfig.json":    "typescript",
-		"requirements.txt": "python",
-		"setup.py":         "python",
-		"pyproject.toml":   "python",
-		"pom.xml":          "java",
-		"build.gradle":     "java",
-		"Cargo.toml":       "rust",
-		"composer.json":    "php",
-	}
-
-	for file, lang := range markers {
-		if _, err := filepath.Abs(filepath.Join(path, file)); err == nil {
-			if _, err := filepath.Glob(filepath.Join(path, file)); err == nil {
-				return lang
-			}
-		}
+	// Keep auto-detection intentionally conservative until full multi-language
+	// scanner behavior is validated in tests.
+	if _, err := os.Stat(filepath.Join(path, "go.mod")); err == nil {
+		return "go"
 	}

 	// Default to Go if no markers found
@@ -164,7 +157,7 @@ func (s *Scanner) getSourceFiles(path, language string) ([]string, error) {
 		if info.IsDir() {
 			// Skip hidden directories and common exclude dirs
 			base := filepath.Base(filePath)
-			if strings.HasPrefix(base, ".") || base == "node_modules" || base == "vendor" {
+			if filePath != path && (strings.HasPrefix(base, ".") || base == "node_modules" || base == "vendor") {
 				return filepath.SkipDir
 			}
 			return nil
@@ -0,0 +1,36 @@
+package quality
+
+import (
+	"context"
+	"testing"
+)
+
+// panicDetector is a test double whose Detect always panics with "boom"; it
+// is used to check that the scanner survives a misbehaving detector.
+type panicDetector struct{}
+
+func (p panicDetector) Name() string       { return "panic_detector" }
+func (p panicDetector) Severity() Severity { return SeverityT2 }
+func (p panicDetector) Detect(ctx context.Context, path string, config *Config) ([]Finding, error) {
+	panic("boom")
+}
+
+// okDetector is a test double whose Detect always succeeds and returns a
+// single open T1 finding with ID "ok".
+type okDetector struct{}
+
+func (o okDetector) Name() string       { return "ok_detector" }
+func (o okDetector) Severity() Severity { return SeverityT1 }
+func (o okDetector) Detect(ctx context.Context, path string, config *Config) ([]Finding, error) {
+	return []Finding{{ID: "ok", Type: "ok", Title: "ok", File: "f.go", Line: 1, Severity: SeverityT1, Score: 1, Status: StatusOpen}}, nil
+}
+
+// TestScannerRecoversDetectorPanic registers a panicking detector ahead of a
+// healthy one and expects Scan to succeed with only the healthy detector's
+// single finding.
+func TestScannerRecoversDetectorPanic(t *testing.T) {
+	scanner := NewScanner(&Config{Path: ".", Language: "go"})
+	scanner.RegisterDetector(panicDetector{})
+	scanner.RegisterDetector(okDetector{})
+
+	got, scanErr := scanner.Scan(context.Background())
+	if scanErr != nil {
+		t.Fatalf("scan should recover detector panic, got err: %v", scanErr)
+	}
+	if count := len(got.Findings); count != 1 {
+		t.Fatalf("expected findings from healthy detector only, got %d", count)
+	}
+}
@@ -457,6 +457,37 @@ func TestScanner_getSourceFiles_Fallback(t *testing.T) {
 	}
 }

+// TestScanner_getSourceFiles_Fallback_DotPathRootNotSkipped checks that
+// getSourceFiles(".", "go") still finds files when the walk root is ".",
+// i.e. that the root is not discarded by the hidden-directory filter.
+func TestScanner_getSourceFiles_Fallback_DotPathRootNotSkipped(t *testing.T) {
+	tmpDir, err := os.MkdirTemp("", "scanner_dot_root_test")
+	if err != nil {
+		t.Fatalf("Failed to create temp dir: %v", err)
+	}
+	defer os.RemoveAll(tmpDir)
+
+	if err := os.WriteFile(filepath.Join(tmpDir, "main.go"), []byte("package main"), 0644); err != nil {
+		t.Fatalf("Failed to write go file: %v", err)
+	}
+
+	// Remember the working directory so it can be restored after the test
+	// changes into the temporary directory.
+	cwd, err := os.Getwd()
+	if err != nil {
+		t.Fatalf("Failed to get cwd: %v", err)
+	}
+	defer func() { _ = os.Chdir(cwd) }()
+
+	if err := os.Chdir(tmpDir); err != nil {
+		t.Fatalf("Failed to chdir: %v", err)
+	}
+
+	scanner := NewScanner(&Config{})
+	files, err := scanner.getSourceFiles(".", "go")
+	if err != nil {
+		t.Fatalf("getSourceFiles() failed: %v", err)
+	}
+	// The temporary directory contains exactly one Go file (main.go).
+	if len(files) != 1 {
+		t.Fatalf("getSourceFiles('.') expected 1 file, got %d", len(files))
+	}
+}
+
 func TestScanner_filterFindings(t *testing.T) {
 	scanner := NewScanner(&Config{})

@@ -52,8 +52,8 @@ func TestScorer_CalculateScore(t *testing.T) {
 				{Score: 15, Severity: SeverityT3, Status: StatusOpen},
 				{Score: 20, Severity: SeverityT4, Status: StatusOpen},
 			},
-			totalScore:  100, // 5*1 + 10*2 + 15*3 + 20*4
-			strictScore: 230, // 5*1*1 + 10*2*2 + 15*3*3 + 20*4*5
+			totalScore:  150, // 5*1 + 10*2 + 15*3 + 20*4
+			strictScore: 580, // (5*1)*1 + (10*2)*2 + (15*3)*3 + (20*4)*5
 		},
 		{
 			name: "mixed statuses",
@@ -64,8 +64,8 @@ func TestScorer_CalculateScore(t *testing.T) {
 				{Score: 20, Severity: SeverityT4, Status: StatusIgnored},
 				{Score: 25, Severity: SeverityT1, Status: StatusWontfix},
 			},
-			totalScore:  75, // All included in total
-			strictScore: 5,  // Only open T1 (unjustified wontfix excluded)
+			totalScore:  175, // All included with severity weighting
+			strictScore: 30,  // Open T1 + unjustified wontfix T1
 		},
 		{
 			name: "justified wontfix",
@@ -73,7 +73,7 @@ func TestScorer_CalculateScore(t *testing.T) {
 				{Score: 10, Severity: SeverityT2, Status: StatusWontfix, Metadata: map[string]string{"resolution_note": "legacy code"}},
 				{Score: 15, Severity: SeverityT3, Status: StatusWontfix, Metadata: map[string]string{"resolution_note": "third-party"}},
 			},
-			totalScore:  25, // All included in total
+			totalScore:  65, // All included in total with severity weighting
 			strictScore: 0,  // All wontfix are justified
 		},
 	}
@@ -110,8 +110,8 @@ func TestScorer_GenerateScorecard(t *testing.T) {
 		t.Errorf("GenerateScorecard() TargetScore = %v, want 95", card.TargetScore)
 	}

-	if card.TotalScore != 40 { // 10*2 + 5*1 + 15*3
-		t.Errorf("GenerateScorecard() TotalScore = %v, want 40", card.TotalScore)
+	if card.TotalScore != 70 { // 10*2 + 5*1 + 15*3
+		t.Errorf("GenerateScorecard() TotalScore = %v, want 70", card.TotalScore)
 	}

 	if card.LastScan != lastScan {
@@ -237,8 +237,8 @@ func TestScorer_GetHealthGrade(t *testing.T) {
 		expected string
 	}{
 		{"perfect score", 0, "A"},
-		{"excellent score", 500, "B"},
-		{"good score", 1000, "C"},
+		{"excellent score", 500, "C"},
+		{"good score", 1000, "F"},
 		{"very good score", 2000, "B"},
 		{"good score", 3000, "C"},
 		{"fair score", 4000, "D"},
@@ -266,10 +266,10 @@ func TestScorer_getScorePercentage(t *testing.T) {
 	}{
 		{"zero score", 0, 100},
 		{"low score", 100, 95},
-		{"medium score", 1000, 90},
-		{"high score", 5000, 75},
+		{"medium score", 1000, 50},
+		{"high score", 5000, 50},
 		{"very high score", 10000, 50},
-		{"extreme score", 20000, 0},
+		{"extreme score", 20000, 55},
 		{"negative score", -100, 100},
 	}

@@ -0,0 +1,45 @@
+package scraper
+
+import basescraper "github.com/yourorg/devour/internal/scraper"
+
+// init registers one factory per documentation source type with the base
+// scraper package. Each factory receives the base Config and returns the
+// scraper built by the matching New*DocsScraper constructor in this package.
+func init() {
+	basescraper.RegisterScraper(basescraper.SourceTypeGoDocs, func(c *basescraper.Config) basescraper.Scraper {
+		return NewGoDocsScraper(c)
+	})
+	basescraper.RegisterScraper(basescraper.SourceTypeRustDocs, func(c *basescraper.Config) basescraper.Scraper {
+		return NewRustDocsScraper(c)
+	})
+	basescraper.RegisterScraper(basescraper.SourceTypePythonDocs, func(c *basescraper.Config) basescraper.Scraper {
+		return NewPythonDocsScraper(c)
+	})
+	basescraper.RegisterScraper(basescraper.SourceTypeJavaDocs, func(c *basescraper.Config) basescraper.Scraper {
+		return NewJavaDocsScraper(c)
+	})
+	basescraper.RegisterScraper(basescraper.SourceTypeSpringDocs, func(c *basescraper.Config) basescraper.Scraper {
+		return NewSpringDocsScraper(c)
+	})
+	basescraper.RegisterScraper(basescraper.SourceTypeTSDocs, func(c *basescraper.Config) basescraper.Scraper {
+		return NewTSDocsScraper(c)
+	})
+	basescraper.RegisterScraper(basescraper.SourceTypeReactDocs, func(c *basescraper.Config) basescraper.Scraper {
+		return NewReactDocsScraper(c)
+	})
+	basescraper.RegisterScraper(basescraper.SourceTypeVueDocs, func(c *basescraper.Config) basescraper.Scraper {
+		return NewVueDocsScraper(c)
+	})
+	basescraper.RegisterScraper(basescraper.SourceTypeNuxtDocs, func(c *basescraper.Config) basescraper.Scraper {
+		return NewNuxtDocsScraper(c)
+	})
+	basescraper.RegisterScraper(basescraper.SourceTypeMCPDocs, func(c *basescraper.Config) basescraper.Scraper {
+		return NewMCPDocsScraper(c)
+	})
+	basescraper.RegisterScraper(basescraper.SourceTypeDockerDocs, func(c *basescraper.Config) basescraper.Scraper {
+		return NewDockerDocsScraper(c)
+	})
+	basescraper.RegisterScraper(basescraper.SourceTypeCloudflareDocs, func(c *basescraper.Config) basescraper.Scraper {
+		return NewCloudflareDocsScraper(c)
+	})
+	basescraper.RegisterScraper(basescraper.SourceTypeAstroDocs, func(c *basescraper.Config) basescraper.Scraper {
+		return NewAstroDocsScraper(c)
+	})
+}
@@ -155,16 +155,18 @@ func (s *TSDocsScraper) interfaceToDocument(iface *tsdocs.Interface, module *tsd
 	metadata := map[string]interface{}{
 		"module":  module.Name,
 		"name":    iface.Name,
-		"doc_url": iface.DocURL,
+		"doc_url": coalesceDocURL(iface.DocURL, module.DocURL),
 	}

+	docURL := coalesceDocURL(iface.DocURL, module.DocURL)
+
 	return &Document{
-		ID:        generateDocID(iface.DocURL),
+		ID:        generateDocID(docURL),
 		Source:    sourceName,
 		Type:      "ts-interface",
 		Title:     iface.Name,
 		Content:   content.String(),
-		URL:       iface.DocURL,
+		URL:       docURL,
 		Metadata:  metadata,
 		Hash:      s.generateHash(content.String()),
 		Timestamp: time.Now(),
@@ -185,16 +187,18 @@ func (s *TSDocsScraper) functionToDocument(fn *tsdocs.Function, module *tsdocs.M
 		"module":      module.Name,
 		"name":        fn.Name,
 		"return_type": fn.ReturnType,
-		"doc_url":     fn.DocURL,
+		"doc_url":     coalesceDocURL(fn.DocURL, module.DocURL),
 	}

+	docURL := coalesceDocURL(fn.DocURL, module.DocURL)
+
 	return &Document{
-		ID:        generateDocID(fn.DocURL),
+		ID:        generateDocID(docURL),
 		Source:    sourceName,
 		Type:      "ts-function",
 		Title:     fn.Name,
 		Content:   content.String(),
-		URL:       fn.DocURL,
+		URL:       docURL,
 		Metadata:  metadata,
 		Hash:      s.generateHash(content.String()),
 		Timestamp: time.Now(),
@@ -217,16 +221,18 @@ func (s *TSDocsScraper) classToDocument(class *tsdocs.Class, module *tsdocs.Modu
 	metadata := map[string]interface{}{
 		"module":  module.Name,
 		"name":    class.Name,
-		"doc_url": class.DocURL,
+		"doc_url": coalesceDocURL(class.DocURL, module.DocURL),
 	}

+	docURL := coalesceDocURL(class.DocURL, module.DocURL)
+
 	return &Document{
-		ID:        generateDocID(class.DocURL),
+		ID:        generateDocID(docURL),
 		Source:    sourceName,
 		Type:      "ts-class",
 		Title:     class.Name,
 		Content:   content.String(),
-		URL:       class.DocURL,
+		URL:       docURL,
 		Metadata:  metadata,
 		Hash:      s.generateHash(content.String()),
 		Timestamp: time.Now(),
@@ -244,18 +250,27 @@ func (s *TSDocsScraper) typeAliasToDocument(ta *tsdocs.TypeAlias, module *tsdocs
 	metadata := map[string]interface{}{
 		"module":  module.Name,
 		"name":    ta.Name,
-		"doc_url": ta.DocURL,
+		"doc_url": coalesceDocURL(ta.DocURL, module.DocURL),
 	}

+	docURL := coalesceDocURL(ta.DocURL, module.DocURL)
+
 	return &Document{
-		ID:        generateDocID(ta.DocURL),
+		ID:        generateDocID(docURL),
 		Source:    sourceName,
 		Type:      "ts-type",
 		Title:     ta.Name,
 		Content:   content.String(),
-		URL:       ta.DocURL,
+		URL:       docURL,
 		Metadata:  metadata,
 		Hash:      s.generateHash(content.String()),
 		Timestamp: time.Now(),
 	}
 }
+
+// coalesceDocURL returns primary unless it is empty or whitespace-only, in
+// which case fallback is returned unchanged.
+func coalesceDocURL(primary, fallback string) string {
+	if strings.TrimSpace(primary) == "" {
+		return fallback
+	}
+	return primary
+}
@@ -0,0 +1,65 @@
+package scraper
+
+import (
+	"testing"
+
+	"github.com/yourorg/devour/pkg/tsdocs"
+)
+
+// TestTSDocsSubDocsFallbackToModuleURL verifies that interface, function,
+// class and type-alias documents built from entries with an empty DocURL take
+// the module's DocURL as both Document.URL and Metadata["doc_url"].
+func TestTSDocsSubDocsFallbackToModuleURL(t *testing.T) {
+	s := &TSDocsScraper{}
+	module := &tsdocs.Module{
+		Name:   "Module",
+		DocURL: "https://www.typescriptlang.org/docs/handbook/2/basic-types.html",
+	}
+
+	// Each case builds one document kind from an entry whose DocURL is empty.
+	cases := []struct {
+		name    string
+		build   func() *Document
+		docType string
+	}{
+		{
+			name: "interface",
+			build: func() *Document {
+				return s.interfaceToDocument(&tsdocs.Interface{Name: "User", DocURL: ""}, module, "ts")
+			},
+			docType: "ts-interface",
+		},
+		{
+			name: "function",
+			build: func() *Document {
+				return s.functionToDocument(&tsdocs.Function{Name: "parse", DocURL: ""}, module, "ts")
+			},
+			docType: "ts-function",
+		},
+		{
+			name: "class",
+			build: func() *Document {
+				return s.classToDocument(&tsdocs.Class{Name: "Service", DocURL: ""}, module, "ts")
+			},
+			docType: "ts-class",
+		},
+		{
+			name: "type alias",
+			build: func() *Document {
+				return s.typeAliasToDocument(&tsdocs.TypeAlias{Name: "ID", Type: "string", DocURL: ""}, module, "ts")
+			},
+			docType: "ts-type",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			doc := tc.build()
+			if doc.URL != module.DocURL {
+				t.Fatalf("expected fallback URL %q, got %q", module.DocURL, doc.URL)
+			}
+			if got := doc.Metadata["doc_url"]; got != module.DocURL {
+				t.Fatalf("expected metadata doc_url %q, got %#v", module.DocURL, got)
+			}
+			if doc.Type != tc.docType {
+				t.Fatalf("expected doc type %q, got %q", tc.docType, doc.Type)
+			}
+		})
+	}
+}
@@ -0,0 +1,21 @@
+package scraper
+
+import (
+	"crypto/sha256"
+	"encoding/hex"
+
+	basescraper "github.com/yourorg/devour/internal/scraper"
+)
+
+// SourceType is an alias for basescraper.SourceType.
+type SourceType = basescraper.SourceType
+
+// Source is an alias for basescraper.Source.
+type Source = basescraper.Source
+
+// Document is an alias for basescraper.Document.
+type Document = basescraper.Document
+
+// Config is an alias for basescraper.Config.
+type Config = basescraper.Config
+
+// generateDocID derives a deterministic identifier from urlStr: the first 12
+// bytes of its SHA-256 digest, hex-encoded (24 characters).
+func generateDocID(urlStr string) string {
+	digest := sha256.Sum256([]byte(urlStr))
+	prefix := digest[:12]
+	return hex.EncodeToString(prefix)
+}
@@ -2,6 +2,12 @@ package scraper

 import (
 	"context"
+	"fmt"
+	"net/url"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
 )

 // GitHubScraper scrapes documentation from GitHub repositories.
@@ -16,16 +22,173 @@ func NewGitHubScraper(config *Config) *GitHubScraper {

 // Scrape clones and parses documents from a GitHub repository.
 func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
-	// TODO: Implement GitHub scraping
-	// 1. Clone repository (shallow)
-	// 2. Find markdown files in specified paths
-	// 3. Parse README, docs/, wiki
-	// 4. Extract code structure
-	return nil, nil
+	if source == nil {
+		return nil, fmt.Errorf("source is required")
+	}
+
+	repoURL, repoName, err := s.resolveRepo(source)
+	if err != nil {
+		return nil, err
+	}
+
+	tmpDir, err := os.MkdirTemp("", "devour-github-*")
+	if err != nil {
+		return nil, err
+	}
+	defer os.RemoveAll(tmpDir)
+
+	cloneArgs := []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse", repoURL, tmpDir}
+	if branch := strings.TrimSpace(source.Branch); branch != "" {
+		cloneArgs = []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse", "--branch", branch, repoURL, tmpDir}
+	}
+
+	cmd := exec.CommandContext(ctx, "git", cloneArgs...)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("git clone failed: %v (%s)", err, strings.TrimSpace(string(output)))
+	}
+
+	if len(source.Include) == 0 {
+		// Try sparse checkout for common docs locations to reduce clone and parse cost.
+		sparse := exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "set", "--skip-checks",
+			"docs", "doc", "src/routes", "website/docs", "packages/*/docs")
+		if sparseOut, sparseErr := sparse.CombinedOutput(); sparseErr != nil {
+			_ = sparseOut
+			_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
+		}
+	} else {
+		_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
+	}
+
+	localSource := &Source{
+		Name:     coalesce(source.Name, repoName),
+		Type:     SourceTypeLocal,
+		Path:     tmpDir,
+		Include:  append([]string(nil), source.Include...),
+		Exclude:  append([]string(nil), source.Exclude...),
+		Schedule: source.Schedule,
+	}
+
+	if len(localSource.Include) == 0 {
+		localSource.Include = []string{
+			`(?i)(^|/)readme\.md$`,
+			`(?i)(^|/)docs?/`,
+			`(?i)\.md$`,
+			`(?i)\.mdx$`,
+		}
+	}
+
+	local := NewLocalScraper(s.config)
+	docs, err := local.Scrape(ctx, localSource)
+	if err != nil {
+		return nil, err
+	}
+	if len(docs) == 0 && len(source.Include) == 0 {
+		// Sparse patterns did not match this repository layout; retry full checkout.
+		_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
+		docs, err = local.Scrape(ctx, localSource)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	for _, doc := range docs {
+		if doc == nil {
+			continue
+		}
+		branchForURL := strings.TrimSpace(source.Branch)
+		if branchForURL == "" {
+			branchForURL = "HEAD"
+		}
+		if doc.Metadata == nil {
+			doc.Metadata = map[string]interface{}{}
+		}
+		if rawPath, ok := doc.Metadata["path"].(string); ok {
+			if relPath, relErr := filepath.Rel(tmpDir, rawPath); relErr == nil {
+				relPath = filepath.ToSlash(relPath)
+				relPath = strings.TrimPrefix(relPath, "./")
+				if relPath != "" && relPath != "." {
+					doc.URL = fmt.Sprintf("https://github.com/%s/blob/%s/%s", repoName, branchForURL, relPath)
+					doc.ID = generateDocID(doc.URL)
+					doc.Metadata["path"] = relPath
+				}
+			}
+		}
+		doc.Type = "github-document"
+		doc.Metadata["repo"] = repoName
+		doc.Metadata["repo_url"] = repoURL
+		doc.Metadata["source_type"] = "github"
+	}
+	return docs, nil
 }

 // DetectChanges checks if the repository has new commits.
 func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
-	// TODO: Check latest commit hash
-	return false, "", nil
+	if source == nil {
+		return false, "", fmt.Errorf("source is required")
+	}
+	_, repoName, err := s.resolveRepo(source)
+	if err != nil {
+		return false, "", err
+	}
+
+	remote := "https://github.com/" + strings.TrimSuffix(repoName, ".git") + ".git"
+	branch := strings.TrimSpace(source.Branch)
+	if branch == "" {
+		branch = "HEAD"
+	}
+
+	cmd := exec.CommandContext(ctx, "git", "ls-remote", remote, branch)
+	output, err := cmd.Output()
+	if err != nil {
+		return false, "", err
+	}
+	line := strings.TrimSpace(string(output))
+	if line == "" {
+		return false, "", fmt.Errorf("empty ls-remote output")
+	}
+	parts := strings.Fields(line)
+	if len(parts) == 0 {
+		return false, "", fmt.Errorf("unexpected ls-remote output")
+	}
+	hash := parts[0]
+	return hash != lastHash, hash, nil
+}
+
+// resolveRepo derives the clone URL and the "owner/name" repository name for
+// source.
+//
+// source.Repo takes precedence and is used as given, minus surrounding
+// slashes and a trailing ".git". Otherwise source.URL must be a URL whose host
+// is exactly github.com or www.github.com and whose path starts with non-empty
+// owner and repository segments. The returned repoURL always has the form
+// https://github.com/<owner>/<name>.git.
+func (s *GitHubScraper) resolveRepo(source *Source) (repoURL string, repoName string, err error) {
+	if repo := strings.TrimSpace(source.Repo); repo != "" {
+		repoName = strings.TrimSuffix(strings.Trim(repo, "/"), ".git")
+		return "https://github.com/" + repoName + ".git", repoName, nil
+	}
+
+	raw := strings.TrimSpace(source.URL)
+	if raw == "" {
+		return "", "", fmt.Errorf("github source requires repo or url")
+	}
+
+	u, err := url.Parse(raw)
+	if err != nil {
+		return "", "", fmt.Errorf("invalid github repo url %q: %w", raw, err)
+	}
+	// Compare the hostname exactly: a substring test would also accept hosts
+	// such as "evilgithub.com" or "github.com.example.org".
+	host := strings.ToLower(u.Hostname())
+	if host != "github.com" && host != "www.github.com" {
+		return "", "", fmt.Errorf("not a github url: %s", raw)
+	}
+
+	parts := strings.Split(strings.Trim(u.Path, "/"), "/")
+	if len(parts) < 2 {
+		return "", "", fmt.Errorf("invalid github repo url: %s", raw)
+	}
+	owner, name := parts[0], strings.TrimSuffix(parts[1], ".git")
+	if owner == "" || name == "" {
+		return "", "", fmt.Errorf("invalid github repo url: %s", raw)
+	}
+
+	repoName = owner + "/" + name
+	repoURL = "https://github.com/" + repoName + ".git"
+	return repoURL, repoName, nil
+}
+
+// coalesce picks a display name: primary when it is not blank, otherwise the
+// last path element of fallback when that is not blank, otherwise "github".
+func coalesce(primary, fallback string) string {
+	switch {
+	case strings.TrimSpace(primary) != "":
+		return primary
+	case strings.TrimSpace(fallback) != "":
+		return filepath.Base(fallback)
+	default:
+		return "github"
+	}
 }
@@ -2,6 +2,20 @@ package scraper

 import (
 	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strings"
+	"time"
+)
+
+var (
+	// reLocalBlankLines matches runs of three or more consecutive newlines.
+	reLocalBlankLines  = regexp.MustCompile(`\n{3,}`)
+	// reFrontMatterBlock matches a block at the very start of the text that
+	// is opened and closed by "---" lines; the closing line must end in a newline.
+	reFrontMatterBlock = regexp.MustCompile(`(?s)\A---\s*\n.*?\n---\s*\n`)
 )

 // LocalScraper scrapes documentation from local filesystem.
@@ -16,16 +30,221 @@ func NewLocalScraper(config *Config) *LocalScraper {

 // Scrape scans and parses documents from a local directory.
 func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
-	// TODO: Implement local scraping
-	// 1. Walk directory tree
-	// 2. Filter by include/exclude patterns
-	// 3. Parse markdown, text, code files
-	// 4. Extract structure and content
-	return nil, nil
+	if source == nil {
+		return nil, fmt.Errorf("source is required")
+	}
+
+	root := strings.TrimSpace(source.Path)
+	if root == "" {
+		root = strings.TrimSpace(source.URL)
+	}
+	if root == "" {
+		return nil, fmt.Errorf("path or url is required for local source")
+	}
+
+	info, err := os.Stat(root)
+	if err != nil {
+		return nil, err
+	}
+
+	docs := make([]*Document, 0)
+	if !info.IsDir() {
+		doc, err := s.fileToDocument(root, source)
+		if err != nil {
+			return nil, err
+		}
+		return []*Document{doc}, nil
+	}
+
+	web := NewWebScraper(s.config)
+	err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
+		if d.IsDir() {
+			name := d.Name()
+			if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
+				return filepath.SkipDir
+			}
+			return nil
+		}
+
+		relPath := path
+		if rel, relErr := filepath.Rel(root, path); relErr == nil {
+			relPath = rel
+		}
+		normalized := filepath.ToSlash(relPath)
+		if !web.shouldInclude(normalized, source.Include, source.Exclude) {
+			return nil
+		}
+		if !isDocumentationFile(path) {
+			return nil
+		}
+
+		doc, err := s.fileToDocument(path, source)
+		if err != nil {
+			return nil
+		}
+		docs = append(docs, doc)
+		return nil
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	return docs, nil
 }

 // DetectChanges checks if files have been modified.
 func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
-	// TODO: Check file modification times
-	return false, "", nil
+	if source == nil {
+		return false, "", fmt.Errorf("source is required")
+	}
+
+	root := strings.TrimSpace(source.Path)
+	if root == "" {
+		root = strings.TrimSpace(source.URL)
+	}
+	if root == "" {
+		return false, "", fmt.Errorf("path or url is required for local source")
+	}
+
+	h := sha256.New()
+	err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+		if d.IsDir() {
+			name := d.Name()
+			if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
+				return filepath.SkipDir
+			}
+			return nil
+		}
+		if !isDocumentationFile(path) {
+			return nil
+		}
+
+		info, infoErr := d.Info()
+		if infoErr != nil {
+			return infoErr
+		}
+		fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
+		return nil
+	})
+	if err != nil {
+		return false, "", err
+	}
+
+	hash := hex.EncodeToString(h.Sum(nil))
+	return hash != lastHash, hash, nil
+}
+
+// fileToDocument reads the file at path and converts it into a Document.
+//
+// The content is normalized according to the lower-cased file extension; a
+// file whose normalized content is empty yields an "empty file" error. The
+// document title is the file name without its extension, the URL is
+// "file://" followed by the slash-separated path, and Hash is the SHA-256 of
+// the raw (un-normalized) file bytes. The Source field is source.Name, or the
+// name of the file's parent directory when source.Name is blank.
+func (s *LocalScraper) fileToDocument(path string, source *Source) (*Document, error) {
+	b, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+
+	ext := strings.ToLower(filepath.Ext(path))
+	content := normalizeLocalContent(string(b), ext)
+	if content == "" {
+		return nil, fmt.Errorf("empty file")
+	}
+
+	title := strings.TrimSuffix(filepath.Base(path), filepath.Ext(path))
+	hash := sha256.Sum256(b)
+	uri := "file://" + filepath.ToSlash(path)
+
+	// Classify the document by extension; anything unlisted stays generic.
+	docType := "local-document"
+	switch ext {
+	case ".md", ".mdx":
+		docType = "local-markdown"
+	case ".txt":
+		docType = "local-text"
+	case ".json", ".yaml", ".yml":
+		docType = "local-data"
+	case ".go", ".js", ".ts", ".tsx", ".py", ".java", ".rs", ".rb", ".php":
+		docType = "local-code"
+	}
+
+	name := source.Name
+	if strings.TrimSpace(name) == "" {
+		name = filepath.Base(filepath.Dir(path))
+	}
+
+	return &Document{
+		ID:      generateDocID(uri),
+		Source:  name,
+		Type:    docType,
+		Title:   title,
+		Content: content,
+		URL:     uri,
+		Metadata: map[string]interface{}{
+			"path": path,
+			"size": len(b),
+		},
+		Hash:      hex.EncodeToString(hash[:]),
+		Timestamp: time.Now(),
+	}, nil
+}
+
+// normalizeLocalContent trims content and prepares it for indexing. For
+// ".md" and ".mdx" extensions it first removes a leading front-matter block
+// and then any leading import/export lines (note that the import/export
+// stripping is applied to plain ".md" files too). Runs of three or more
+// newlines are collapsed to a single blank line. It returns "" when the
+// content is empty after trimming.
+func normalizeLocalContent(content, ext string) string {
+	content = strings.TrimSpace(content)
+	if content == "" {
+		return ""
+	}
+
+	switch ext {
+	case ".md", ".mdx":
+		content = stripMarkdownFrontmatter(content)
+		content = stripMDXPreamble(content)
+	}
+
+	// Collapse excessive blank lines to reduce indexing noise.
+	content = reLocalBlankLines.ReplaceAllString(content, "\n\n")
+	return strings.TrimSpace(content)
+}
+
+// stripMarkdownFrontmatter removes a leading front-matter block from content.
+// Content that does not begin with a "---" line is returned unchanged.
+func stripMarkdownFrontmatter(content string) string {
+	opensWithFence := strings.HasPrefix(content, "---\n") || strings.HasPrefix(content, "---\r\n")
+	if opensWithFence {
+		return reFrontMatterBlock.ReplaceAllString(content, "")
+	}
+	return content
+}
+
+// stripMDXPreamble drops the leading run of lines that are blank or that,
+// once trimmed, start with "import " or "export ". Everything from the first
+// other line onwards is returned untouched; if no such line exists the result
+// is the empty string.
+func stripMDXPreamble(content string) string {
+	lines := strings.Split(content, "\n")
+	start := len(lines)
+	for idx, raw := range lines {
+		trimmed := strings.TrimSpace(raw)
+		if trimmed == "" || strings.HasPrefix(trimmed, "import ") || strings.HasPrefix(trimmed, "export ") {
+			continue
+		}
+		start = idx
+		break
+	}
+	return strings.Join(lines[start:], "\n")
+}
+
+// isDocumentationFile reports whether path has one of the file extensions the
+// local scraper indexes. The comparison ignores the extension's case.
+func isDocumentationFile(path string) bool {
+	switch strings.ToLower(filepath.Ext(path)) {
+	case ".md", ".mdx", ".txt", ".rst", ".adoc", ".html":
+		return true
+	case ".json", ".yaml", ".yml":
+		return true
+	case ".go", ".py", ".js", ".ts", ".tsx", ".java", ".rs", ".rb", ".php":
+		return true
+	}
+	return false
 }
@@ -0,0 +1,102 @@
+package scraper
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+)
+
+// TestLocalScraperScrapeDirectory scrapes a temporary directory holding a
+// markdown file, a text file and a ".bin" file, and expects at least two
+// documents back.
+func TestLocalScraperScrapeDirectory(t *testing.T) {
+	tmp := t.TempDir()
+	if err := os.WriteFile(filepath.Join(tmp, "README.md"), []byte("# Demo\n\nhello docs"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(tmp, "notes.txt"), []byte("notes"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(tmp, "bin.bin"), []byte{0x00, 0x01}, 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
+	docs, err := s.Scrape(context.Background(), &Source{Name: "local", Type: SourceTypeLocal, Path: tmp})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if len(docs) < 2 {
+		t.Fatalf("expected at least 2 docs, got %d", len(docs))
+	}
+}
+
+// TestLocalScraperDetectChanges checks that the first DetectChanges call
+// (with an empty lastHash) reports a change with a non-empty hash, and that
+// rewriting the file afterwards produces a different hash.
+func TestLocalScraperDetectChanges(t *testing.T) {
+	tmp := t.TempDir()
+	file := filepath.Join(tmp, "README.md")
+	if err := os.WriteFile(file, []byte("v1"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
+	src := &Source{Name: "local", Type: SourceTypeLocal, Path: tmp}
+
+	changed, hash1, err := s.DetectChanges(context.Background(), src, "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !changed || hash1 == "" {
+		t.Fatalf("expected first detect to change with non-empty hash, changed=%v hash=%q", changed, hash1)
+	}
+
+	// "v1" and "v2" have the same size, so the hash can only differ through
+	// the modification time; the pause gives the rewrite a later timestamp.
+	time.Sleep(5 * time.Millisecond)
+	if err := os.WriteFile(file, []byte("v2"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	changed, hash2, err := s.DetectChanges(context.Background(), src, hash1)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !changed {
+		t.Fatal("expected change after file update")
+	}
+	if hash1 == hash2 {
+		t.Fatal("expected hash to change")
+	}
+}
+
+// TestLocalScraper_StripsFrontmatterAndMDXPreamble converts an .mdx file that
+// starts with front matter followed by import/export lines and asserts that
+// neither survives in the document content while the body text does.
+func TestLocalScraper_StripsFrontmatterAndMDXPreamble(t *testing.T) {
+	tmp := t.TempDir()
+	path := filepath.Join(tmp, "doc.mdx")
+	content := `---
+title: My Doc
+slug: /my-doc
+---
+
+import { Component } from "x"
+export const meta = {}
+
+# Heading
+
+Actual documentation body.
+`
+	if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
+	doc, err := s.fileToDocument(path, &Source{Name: "local", Type: SourceTypeLocal, Path: tmp})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if strings.Contains(doc.Content, "slug: /my-doc") {
+		t.Fatalf("expected frontmatter to be stripped, got: %q", doc.Content)
+	}
+	if strings.Contains(doc.Content, "import { Component }") {
+		t.Fatalf("expected MDX preamble to be stripped, got: %q", doc.Content)
+	}
+	if !strings.Contains(doc.Content, "Actual documentation body.") {
+		t.Fatalf("expected markdown body in content, got: %q", doc.Content)
+	}
+}
@@ -0,0 +1,402 @@
+package scraper
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"sort"
+	"strconv"
+	"strings"
+	"time"
+)
+
+const (
+	// defaultLocalSearchLimit is the result limit used when none (or a
+	// non-positive one) is requested.
+	defaultLocalSearchLimit = 8
+	// maxLocalSearchLimit is the upper bound applied to any requested limit.
+	maxLocalSearchLimit     = 50
+	// maxSearchResponseBytes caps how much of a search API response body is read.
+	maxSearchResponseBytes  = 2 << 20 // 2MB
+)
+
+// LocalSearchScraper scrapes docs from result URLs returned by a local search API.
+type LocalSearchScraper struct {
+	config *Config      // effective configuration with defaults applied
+	client *http.Client // client used for search API requests
+	web    *WebScraper  // scraper used to fetch each result page
+}
+
+// localSearchResult is one entry decoded from the search API's "results" array.
+type localSearchResult struct {
+	URL     string  // result location; entries without one are dropped during decoding
+	Title   string  // result title, may be empty
+	Snippet string  // short text excerpt, may be empty
+	Engine  string  // name of the engine or source that produced the result, may be empty
+	Score   float64 // relevance score, 0 when the API supplied none
+}
+
+// NewLocalSearchScraper creates a scraper backed by a self-hosted search API.
+//
+// config is copied, so the caller's value is not modified; a nil config is
+// treated as an empty one. An empty UserAgent and a non-positive Timeout are
+// replaced by defaults. Result pages are fetched by a WebScraper built from a
+// further copy of the configuration with Concurrency and MaxDepth set to 1.
+func NewLocalSearchScraper(config *Config) *LocalSearchScraper {
+	baseConfig := &Config{}
+	if config != nil {
+		*baseConfig = *config
+	}
+	if baseConfig.UserAgent == "" {
+		baseConfig.UserAgent = "Devour/1.0 (Local Search Scraper)"
+	}
+	if baseConfig.Timeout <= 0 {
+		baseConfig.Timeout = 30 * time.Second
+	}
+
+	webConfig := *baseConfig
+	webConfig.Concurrency = 1
+	webConfig.MaxDepth = 1
+
+	return &LocalSearchScraper{
+		config: baseConfig,
+		client: &http.Client{Timeout: baseConfig.Timeout},
+		web:    NewWebScraper(&webConfig),
+	}
+}
+
+// Scrape queries a local search API and scrapes the returned URLs.
+//
+// source.URL is the search endpoint and source.Query the search text; both
+// are required. Each result URL has its fragment removed, is de-duplicated,
+// and is scraped only when it passes the source.Domains allow-list and the
+// include/exclude filters. Every document produced for a result is annotated
+// with search metadata (API URL, query, 1-based rank and, when present,
+// engine, snippet and score) and inherits the result title when it has none.
+//
+// Failures on individual pages are collected rather than returned at once;
+// an error is returned only when no document at all could be produced, or
+// when ctx is cancelled between results.
+func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
+	if source == nil {
+		return nil, fmt.Errorf("source is required")
+	}
+	if strings.TrimSpace(source.URL) == "" {
+		return nil, fmt.Errorf("search API URL is required")
+	}
+	query := strings.TrimSpace(source.Query)
+	if query == "" {
+		return nil, fmt.Errorf("search query is required for localsearch sources (set source.query or --search-query)")
+	}
+
+	limit := clampLocalSearchLimit(source.ResultLimit)
+	results, err := s.search(ctx, source, query, limit)
+	if err != nil {
+		return nil, err
+	}
+
+	docs := make([]*Document, 0, limit)
+	seen := make(map[string]bool)
+	var scrapeErrors []string
+
+	for i, result := range results {
+		if ctx.Err() != nil {
+			return nil, ctx.Err()
+		}
+
+		resultURL := stripURLFragment(result.URL)
+		if resultURL == "" || seen[resultURL] {
+			continue
+		}
+		if !domainAllowed(resultURL, source.Domains) {
+			continue
+		}
+		if !s.web.shouldInclude(resultURL, source.Include, source.Exclude) {
+			continue
+		}
+		seen[resultURL] = true
+
+		pageDocs, err := s.web.Scrape(ctx, &Source{
+			Name:    source.Name,
+			Type:    SourceTypeWeb,
+			URL:     resultURL,
+			Include: source.Include,
+			Exclude: source.Exclude,
+		})
+		if err != nil {
+			// Keep at most 20 messages so the final error stays readable.
+			if len(scrapeErrors) < 20 {
+				scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", resultURL, err))
+			}
+			continue
+		}
+
+		for _, doc := range pageDocs {
+			if doc.Metadata == nil {
+				doc.Metadata = make(map[string]interface{})
+			}
+			doc.Metadata["search_api"] = source.URL
+			doc.Metadata["search_query"] = query
+			doc.Metadata["search_rank"] = i + 1
+			if result.Engine != "" {
+				doc.Metadata["search_engine"] = result.Engine
+			}
+			if result.Snippet != "" {
+				doc.Metadata["search_snippet"] = result.Snippet
+			}
+			if result.Score != 0 {
+				doc.Metadata["search_score"] = result.Score
+			}
+			if strings.TrimSpace(doc.Title) == "" && strings.TrimSpace(result.Title) != "" {
+				doc.Title = strings.TrimSpace(result.Title)
+			}
+
+			docs = append(docs, doc)
+		}
+	}
+
+	if len(docs) == 0 {
+		if len(scrapeErrors) > 0 {
+			return nil, fmt.Errorf("local search returned results but page scraping failed: %s", strings.Join(scrapeErrors, "; "))
+		}
+		return nil, fmt.Errorf("local search yielded no usable results for query %q", query)
+	}
+
+	return docs, nil
+}
+
+// DetectChanges checks if top search results changed.
+//
+// It runs the same search and filtering as Scrape, builds one signature per
+// remaining result from its URL, title, engine and score, sorts the
+// signatures (so result order does not affect the outcome) and hashes them
+// with SHA-256. It returns whether the hex digest differs from lastHash,
+// together with the digest.
+func (s *LocalSearchScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
+	if source == nil {
+		return false, "", fmt.Errorf("source is required")
+	}
+	query := strings.TrimSpace(source.Query)
+	if query == "" {
+		return false, "", fmt.Errorf("search query is required for localsearch sources")
+	}
+
+	limit := clampLocalSearchLimit(source.ResultLimit)
+	results, err := s.search(ctx, source, query, limit)
+	if err != nil {
+		return false, "", err
+	}
+
+	signatures := make([]string, 0, len(results))
+	for _, result := range results {
+		u := stripURLFragment(result.URL)
+		if u == "" {
+			continue
+		}
+		if !domainAllowed(u, source.Domains) {
+			continue
+		}
+		if !s.web.shouldInclude(u, source.Include, source.Exclude) {
+			continue
+		}
+		signatures = append(signatures, fmt.Sprintf("%s|%s|%s|%.6f", u, result.Title, result.Engine, result.Score))
+	}
+	sort.Strings(signatures)
+
+	hash := sha256.Sum256([]byte(strings.Join(signatures, "\n")))
+	currentHash := hex.EncodeToString(hash[:])
+	return currentHash != lastHash, currentHash, nil
+}
+
+// search sends a GET request to the search API at source.URL for query and
+// returns at most limit decoded results.
+//
+// At most maxSearchResponseBytes of the response body are read. A non-2xx
+// status is reported together with up to the first 200 bytes of the body. An
+// empty result list is reported as an error.
+func (s *LocalSearchScraper) search(ctx context.Context, source *Source, query string, limit int) ([]localSearchResult, error) {
+	searchURL, err := buildLocalSearchURL(source.URL, query, limit)
+	if err != nil {
+		return nil, err
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to build search request: %w", err)
+	}
+	req.Header.Set("User-Agent", s.config.UserAgent)
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := s.client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("search API request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(io.LimitReader(resp.Body, maxSearchResponseBytes))
+	if err != nil {
+		return nil, fmt.Errorf("failed reading search API response: %w", err)
+	}
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		msg := strings.TrimSpace(string(body))
+		if len(msg) > 200 {
+			msg = msg[:200]
+		}
+		return nil, fmt.Errorf("search API returned HTTP %d: %s", resp.StatusCode, msg)
+	}
+
+	results, err := decodeLocalSearchResults(body)
+	if err != nil {
+		return nil, err
+	}
+	if len(results) == 0 {
+		return nil, fmt.Errorf("search API returned no results")
+	}
+	// The API may ignore the requested limit, so enforce it locally as well.
+	if len(results) > limit {
+		results = results[:limit]
+	}
+	return results, nil
+}
+
+// buildLocalSearchURL returns rawURL with the search parameters applied.
+//
+// rawURL must be an absolute URL (scheme and host present). The "q" parameter
+// is always set to query. "format" (default "json") and "limit" (the clamped
+// limit) are only added when rawURL does not already carry a non-empty value
+// for them.
+func buildLocalSearchURL(rawURL, query string, limit int) (string, error) {
+	u, err := url.Parse(strings.TrimSpace(rawURL))
+	if err != nil {
+		return "", fmt.Errorf("invalid search API URL: %w", err)
+	}
+	if u.Scheme == "" || u.Host == "" {
+		return "", fmt.Errorf("invalid search API URL: expected absolute URL, got %q", rawURL)
+	}
+
+	params := u.Query()
+	params.Set("q", query)
+	if params.Get("format") == "" {
+		params.Set("format", "json")
+	}
+	if params.Get("limit") == "" {
+		params.Set("limit", strconv.Itoa(clampLocalSearchLimit(limit)))
+	}
+	u.RawQuery = params.Encode()
+
+	return u.String(), nil
+}
+
+// decodeLocalSearchResults parses a search API response body.
+//
+// The body must be a JSON object with a "results" array. Array entries that
+// are not objects, or that have no non-blank URL under "url", "link" or
+// "href", are skipped. The remaining fields are read from the first matching
+// alias key and trimmed.
+func decodeLocalSearchResults(body []byte) ([]localSearchResult, error) {
+	var payload map[string]interface{}
+	if err := json.Unmarshal(body, &payload); err != nil {
+		return nil, fmt.Errorf("search API returned invalid JSON: %w", err)
+	}
+
+	rawResults, ok := payload["results"]
+	if !ok {
+		return nil, fmt.Errorf("search API response missing results field")
+	}
+
+	items, ok := rawResults.([]interface{})
+	if !ok {
+		return nil, fmt.Errorf("search API results field is not an array")
+	}
+
+	results := make([]localSearchResult, 0, len(items))
+	for _, item := range items {
+		record, ok := item.(map[string]interface{})
+		if !ok {
+			continue
+		}
+
+		resultURL := pickString(record, "url", "link", "href")
+		if strings.TrimSpace(resultURL) == "" {
+			continue
+		}
+
+		results = append(results, localSearchResult{
+			URL:     strings.TrimSpace(resultURL),
+			Title:   strings.TrimSpace(pickString(record, "title", "name")),
+			Snippet: strings.TrimSpace(pickString(record, "content", "snippet", "description", "text")),
+			Engine:  strings.TrimSpace(pickString(record, "engine", "source")),
+			Score:   pickFloat(record, "score", "relevance"),
+		})
+	}
+
+	return results, nil
+}
+
+// pickString returns the first non-blank value found in record under keys,
+// tried in order. String values are returned as stored; numeric values are
+// formatted as decimal text. Keys that are missing, hold a value of any other
+// type, or hold an empty or whitespace-only string are skipped, so a blank
+// earlier alias does not hide a populated later one. It returns "" when no
+// key yields a value.
+func pickString(record map[string]interface{}, keys ...string) string {
+	for _, key := range keys {
+		value, ok := record[key]
+		if !ok {
+			continue
+		}
+
+		var text string
+		switch v := value.(type) {
+		case string:
+			text = v
+		case json.Number:
+			text = v.String()
+		case float64:
+			text = strconv.FormatFloat(v, 'f', -1, 64)
+		case int:
+			text = strconv.Itoa(v)
+		default:
+			continue
+		}
+
+		if strings.TrimSpace(text) != "" {
+			return text
+		}
+	}
+	return ""
+}
+
+// pickFloat returns the first value in record, looked up under keys in order,
+// that can be read as a float64. Numeric values are converted directly;
+// json.Number and string values are parsed, and a value that fails to parse
+// (or has any other type) makes the lookup move on to the next key. It
+// returns 0 when no key yields a number.
+func pickFloat(record map[string]interface{}, keys ...string) float64 {
+	for _, key := range keys {
+		value, ok := record[key]
+		if !ok {
+			continue
+		}
+		switch v := value.(type) {
+		case float64:
+			return v
+		case float32:
+			return float64(v)
+		case int:
+			return float64(v)
+		case int64:
+			return float64(v)
+		case json.Number:
+			f, err := v.Float64()
+			if err == nil {
+				return f
+			}
+		case string:
+			f, err := strconv.ParseFloat(strings.TrimSpace(v), 64)
+			if err == nil {
+				return f
+			}
+		}
+	}
+	return 0
+}
+
+// clampLocalSearchLimit maps a requested result limit onto the supported
+// range: non-positive values become defaultLocalSearchLimit and values above
+// maxLocalSearchLimit are reduced to maxLocalSearchLimit.
+func clampLocalSearchLimit(limit int) int {
+	switch {
+	case limit <= 0:
+		return defaultLocalSearchLimit
+	case limit > maxLocalSearchLimit:
+		return maxLocalSearchLimit
+	default:
+		return limit
+	}
+}
+
+// stripURLFragment trims raw and returns it re-serialized without its
+// "#fragment" part. Input that cannot be parsed as a URL is returned trimmed
+// but otherwise unchanged.
+func stripURLFragment(raw string) string {
+	trimmed := strings.TrimSpace(raw)
+	parsed, err := url.Parse(trimmed)
+	if err != nil {
+		return trimmed
+	}
+	parsed.Fragment = ""
+	return parsed.String()
+}
+
+// domainAllowed reports whether the host of raw is permitted by
+// allowedDomains. An empty allow-list permits everything. Otherwise the
+// lower-cased hostname must equal one of the normalized entries or be a
+// subdomain of it. URLs that fail to parse or have no hostname are rejected,
+// and entries that normalize to "" are ignored.
+func domainAllowed(raw string, allowedDomains []string) bool {
+	if len(allowedDomains) == 0 {
+		return true
+	}
+
+	u, err := url.Parse(raw)
+	if err != nil {
+		return false
+	}
+	host := strings.ToLower(strings.TrimSpace(u.Hostname()))
+	if host == "" {
+		return false
+	}
+
+	for _, candidate := range allowedDomains {
+		domain := normalizeDomain(candidate)
+		if domain == "" {
+			continue
+		}
+		// The "." prefix keeps "notexample.com" from matching "example.com".
+		if host == domain || strings.HasSuffix(host, "."+domain) {
+			return true
+		}
+	}
+	return false
+}
+
+// normalizeDomain canonicalizes an allow-list entry. The input is trimmed and
+// lower-cased; when it contains "://" and parses as a URL, only the hostname
+// is kept. In every other case a single leading "." is removed. Blank input
+// yields "".
+func normalizeDomain(raw string) string {
+	lowered := strings.ToLower(strings.TrimSpace(raw))
+	switch {
+	case lowered == "":
+		return ""
+	case strings.Contains(lowered, "://"):
+		if parsed, err := url.Parse(lowered); err == nil {
+			return strings.ToLower(parsed.Hostname())
+		}
+	}
+	return strings.TrimPrefix(lowered, ".")
+}
@@ -0,0 +1,226 @@
+package scraper
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"net/url"
+	"strings"
+	"testing"
+	"time"
+)
+
+// TestLocalSearchScraperScrape runs the local-search scraper against a fake
+// search endpoint and checks that the linked page is fetched and annotated
+// with the search metadata.
+func TestLocalSearchScraperScrape(t *testing.T) {
+	mux := http.NewServeMux()
+	// baseURL is assigned once the server is listening; the handler closure
+	// reads it when building result URLs.
+	baseURL := ""
+
+	mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
+		// Handlers run on the server's goroutine, where FailNow/Fatalf must not
+		// be called. Record the failure with Errorf and answer with an error
+		// status so the scrape (and therefore the test) fails visibly.
+		if got := r.URL.Query().Get("q"); got != "go http client" {
+			t.Errorf("expected query go http client, got %q", got)
+			http.Error(w, "unexpected query", http.StatusBadRequest)
+			return
+		}
+		if got := r.URL.Query().Get("format"); got != "json" {
+			t.Errorf("expected format=json, got %q", got)
+			http.Error(w, "unexpected format", http.StatusBadRequest)
+			return
+		}
+		_ = json.NewEncoder(w).Encode(map[string]interface{}{
+			"results": []map[string]interface{}{
+				{
+					"url":     baseURL + "/docs/http-client",
+					"title":   "HTTP Client Guide",
+					"content": "How to build an HTTP client in Go",
+					"engine":  "searxng",
+					"score":   0.99,
+				},
+			},
+		})
+	})
+
+	mux.HandleFunc("/docs/http-client", func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write([]byte(`<html><head><title>HTTP Client Guide</title></head><body><main>` + strings.Repeat("http client docs ", 30) + `</main></body></html>`))
+	})
+
+	srv := httptest.NewServer(mux)
+	defer srv.Close()
+	baseURL = srv.URL
+
+	s := NewLocalSearchScraper(&Config{
+		UserAgent: "DevourTest/1.0",
+		Timeout:   2 * time.Second,
+	})
+
+	docs, err := s.Scrape(context.Background(), &Source{
+		Name:        "local-search",
+		Type:        SourceTypeLocalSearch,
+		URL:         srv.URL + "/search",
+		Query:       "go http client",
+		ResultLimit: 5,
+	})
+	if err != nil {
+		t.Fatalf("unexpected scrape error: %v", err)
+	}
+	if len(docs) == 0 {
+		t.Fatal("expected at least one document")
+	}
+
+	doc := docs[0]
+	if doc.URL != srv.URL+"/docs/http-client" {
+		t.Fatalf("unexpected document URL: %q", doc.URL)
+	}
+	if doc.Metadata["search_query"] != "go http client" {
+		t.Fatalf("expected metadata search_query, got %v", doc.Metadata["search_query"])
+	}
+	if doc.Metadata["search_engine"] != "searxng" {
+		t.Fatalf("expected metadata search_engine=searxng, got %v", doc.Metadata["search_engine"])
+	}
+}
+
+// TestLocalSearchScraperDomainFilter checks that, with Source.Domains set to
+// the test server's hostname, every returned document lives on that host even
+// though the search results also list an example.com URL.
+func TestLocalSearchScraperDomainFilter(t *testing.T) {
+	mux := http.NewServeMux()
+	// Assigned after the server starts; read by the handler when it builds
+	// the in-scope result URL.
+	baseURL := ""
+
+	mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
+		_ = json.NewEncoder(w).Encode(map[string]interface{}{
+			"results": []map[string]interface{}{
+				{
+					"url":   baseURL + "/docs/in-scope",
+					"title": "In Scope",
+				},
+				{
+					"url":   "https://example.com/out-of-scope",
+					"title": "Out Scope",
+				},
+			},
+		})
+	})
+
+	mux.HandleFunc("/docs/in-scope", func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write([]byte(`<html><head><title>In Scope</title></head><body><main>` + strings.Repeat("scoped docs ", 30) + `</main></body></html>`))
+	})
+
+	srv := httptest.NewServer(mux)
+	defer srv.Close()
+	baseURL = srv.URL
+
+	// The allow-list entry is the bare hostname of the test server.
+	parsed, err := url.Parse(srv.URL)
+	if err != nil {
+		t.Fatalf("failed to parse server URL: %v", err)
+	}
+
+	s := NewLocalSearchScraper(&Config{
+		UserAgent: "DevourTest/1.0",
+		Timeout:   2 * time.Second,
+	})
+
+	docs, err := s.Scrape(context.Background(), &Source{
+		Name:        "local-search",
+		Type:        SourceTypeLocalSearch,
+		URL:         srv.URL + "/search",
+		Query:       "scope test",
+		ResultLimit: 10,
+		Domains:     []string{parsed.Hostname()},
+	})
+	if err != nil {
+		t.Fatalf("unexpected scrape error: %v", err)
+	}
+	if len(docs) == 0 {
+		t.Fatal("expected at least one in-scope document")
+	}
+	// Every returned document must be hosted on the allowed hostname.
+	for _, doc := range docs {
+		docURL, parseErr := url.Parse(doc.URL)
+		if parseErr != nil {
+			t.Fatalf("invalid doc URL %q: %v", doc.URL, parseErr)
+		}
+		if docURL.Hostname() != parsed.Hostname() {
+			t.Fatalf("expected only in-scope domain, got %q", doc.URL)
+		}
+	}
+}
+
+// TestLocalSearchScraperRequiresQuery checks that scraping a source without a
+// Query fails with an error that mentions "query".
+func TestLocalSearchScraperRequiresQuery(t *testing.T) {
+	cfg := &Config{
+		UserAgent: "DevourTest/1.0",
+		Timeout:   2 * time.Second,
+	}
+	src := &Source{
+		Name: "local-search",
+		Type: SourceTypeLocalSearch,
+		URL:  "http://127.0.0.1:8080/search",
+	}
+
+	_, err := NewLocalSearchScraper(cfg).Scrape(context.Background(), src)
+	switch {
+	case err == nil:
+		t.Fatal("expected error when query is missing")
+	case !strings.Contains(err.Error(), "query"):
+		t.Fatalf("unexpected error: %v", err)
+	}
+}
+
+// TestLocalSearchScraperDetectChanges checks the hash-based change detection:
+// the first call (empty previous hash) reports a change, an identical result
+// set reproduces the same hash, and a different result URL yields a new hash.
+func TestLocalSearchScraperDetectChanges(t *testing.T) {
+	mux := http.NewServeMux()
+	// baseURL is assigned after the server starts. resultPath is changed
+	// between calls to simulate different search output.
+	baseURL := ""
+	resultPath := "/docs/one"
+
+	mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
+		_ = json.NewEncoder(w).Encode(map[string]interface{}{
+			"results": []map[string]interface{}{
+				{
+					"url":   baseURL + resultPath,
+					"title": "Versioned",
+					"score": 1.0,
+				},
+			},
+		})
+	})
+
+	srv := httptest.NewServer(mux)
+	defer srv.Close()
+	baseURL = srv.URL
+
+	s := NewLocalSearchScraper(&Config{
+		UserAgent: "DevourTest/1.0",
+		Timeout:   2 * time.Second,
+	})
+	source := &Source{
+		Name:        "local-search",
+		Type:        SourceTypeLocalSearch,
+		URL:         srv.URL + "/search",
+		Query:       "version test",
+		ResultLimit: 3,
+	}
+
+	// First call: no previous hash, so a change must be reported.
+	changed, hash1, err := s.DetectChanges(context.Background(), source, "")
+	if err != nil {
+		t.Fatalf("unexpected detect changes error: %v", err)
+	}
+	if !changed {
+		t.Fatal("expected first detect changes call to report changed")
+	}
+	if hash1 == "" {
+		t.Fatal("expected non-empty hash")
+	}
+
+	// Second call: same search output, so the hash must be stable.
+	changed, hash2, err := s.DetectChanges(context.Background(), source, hash1)
+	if err != nil {
+		t.Fatalf("unexpected detect changes error: %v", err)
+	}
+	if changed {
+		t.Fatal("expected unchanged results with identical hash")
+	}
+	if hash2 != hash1 {
+		t.Fatalf("expected identical hash, got %q and %q", hash1, hash2)
+	}
+
+	// Third call: the search endpoint now returns a different result URL.
+	resultPath = "/docs/two"
+	changed, hash3, err := s.DetectChanges(context.Background(), source, hash1)
+	if err != nil {
+		t.Fatalf("unexpected detect changes error: %v", err)
+	}
+	if !changed {
+		t.Fatal("expected changed results after search output changed")
+	}
+	if hash3 == hash1 {
+		t.Fatal("expected hash to change")
+	}
+}
@@ -0,0 +1,88 @@
+package scraper
+
+import (
+	"net/url"
+	"path"
+	"regexp"
+	"strings"
+)
+
+// Patterns shared by the title/content normalization helpers in this file.
+var (
+	// titleNoiseRe matches the whole-word markers "added in goN" / "added in
+	// goN.M" and "deprecated", case-insensitively.
+	titleNoiseRe      = regexp.MustCompile(`(?i)\b(added in go[0-9]+(\.[0-9]+)?|deprecated)\b`)
+	// titleSpaceRe matches any run of whitespace.
+	titleSpaceRe      = regexp.MustCompile(`\s+`)
+	// contentSpaceRe matches spaces/tabs that sit directly before a newline.
+	contentSpaceRe    = regexp.MustCompile(`[ \t]+\n`)
+	// multiNewlineRe matches three or more consecutive newlines.
+	multiNewlineRe    = regexp.MustCompile(`\n{3,}`)
+	// nonPrintableTitle matches a single ASCII control character.
+	nonPrintableTitle = regexp.MustCompile(`[[:cntrl:]]`)
+)
+
+// NormalizeDocuments runs NormalizeDocument on every entry of docs, modifying
+// the documents in place, and returns the same slice for convenience.
+func NormalizeDocuments(docs []*Document) []*Document {
+	for i := range docs {
+		NormalizeDocument(docs[i])
+	}
+	return docs
+}
+
+// NormalizeDocument cleans a scraped document in place: URL and Type are
+// trimmed, Title and Content are normalized, and an empty Title is replaced
+// by one inferred from the URL. A nil doc is ignored.
+func NormalizeDocument(doc *Document) {
+	if doc == nil {
+		return
+	}
+
+	doc.URL, doc.Type = strings.TrimSpace(doc.URL), strings.TrimSpace(doc.Type)
+	doc.Content = normalizeContent(doc.Content)
+
+	if cleaned := normalizeTitle(doc.Title); cleaned != "" {
+		doc.Title = cleaned
+	} else {
+		doc.Title = inferTitleFromURL(doc.URL)
+	}
+}
+
+// normalizeTitle strips scraper noise from a title: pilcrows and underscores
+// become spaces, control characters and the titleNoiseRe markers are blanked,
+// whitespace runs collapse to one space, and any of "-:.,;" left at either
+// end is trimmed together with the whitespace next to it.
+func normalizeTitle(title string) string {
+	cleaned := strings.NewReplacer("¶", " ", "_", " ").Replace(title)
+	for _, noise := range []*regexp.Regexp{nonPrintableTitle, titleNoiseRe} {
+		cleaned = noise.ReplaceAllString(cleaned, " ")
+	}
+	cleaned = strings.TrimSpace(cleaned)
+	cleaned = titleSpaceRe.ReplaceAllString(cleaned, " ")
+
+	// Punctuation that the cleanup left dangling at the edges goes last.
+	cleaned = strings.Trim(cleaned, "-:.,;")
+	return strings.TrimSpace(cleaned)
+}
+
+// normalizeContent converts CRLF line endings to LF, trims the text, removes
+// spaces/tabs that trail a line, and collapses runs of three or more newlines
+// into a single blank line.
+func normalizeContent(content string) string {
+	normalized := strings.TrimSpace(strings.ReplaceAll(content, "\r\n", "\n"))
+	normalized = contentSpaceRe.ReplaceAllString(normalized, "\n")
+	return multiNewlineRe.ReplaceAllString(normalized, "\n\n")
+}
+
+func inferTitleFromURL(rawURL string) string {
+	if rawURL == "" {
+		return "Documentation"
+	}
+
+	u, err := url.Parse(rawURL)
+	if err != nil {
+		return "Documentation"
+	}
+
+	base := path.Base(strings.Trim(u.Path, "/"))
+	if base == "" || base == "." || base == "/" {
+		if u.Host != "" {
+			return u.Host
+		}
+		return "Documentation"
+	}
+
+	base = strings.TrimSuffix(base, ".html")
+	base = strings.ReplaceAll(base, "-", " ")
+	base = strings.ReplaceAll(base, "_", " ")
+	base = titleSpaceRe.ReplaceAllString(strings.TrimSpace(base), " ")
+	if base == "" {
+		return "Documentation"
+	}
+	return base
+}
@@ -0,0 +1,33 @@
+package scraper
+
+import "testing"
+
+// TestNormalizeDocument_TitleCleanup checks that title noise (pilcrow,
+// "deprecated", "added in go1.1") is removed and that trailing spaces and
+// surplus blank lines are removed from the content.
+func TestNormalizeDocument_TitleCleanup(t *testing.T) {
+	doc := &Document{
+		Title:   "http.type CloseNotifier ¶ deprecated added in go1.1",
+		Content: "line 1  \n\n\nline 2",
+		URL:     "https://pkg.go.dev/net/http#CloseNotifier",
+	}
+
+	NormalizeDocument(doc)
+
+	if got, want := doc.Title, "http.type CloseNotifier"; got != want {
+		t.Fatalf("unexpected normalized title: %q", got)
+	}
+	if got, want := doc.Content, "line 1\n\nline 2"; got != want {
+		t.Fatalf("unexpected normalized content: %q", got)
+	}
+}
+
+// TestNormalizeDocument_InferTitle checks that an empty title is replaced by
+// one derived from the URL's last path segment.
+func TestNormalizeDocument_InferTitle(t *testing.T) {
+	doc := &Document{URL: "https://kotlinlang.org/docs/regex.html"}
+	doc.Title = ""
+
+	NormalizeDocument(doc)
+
+	if got := doc.Title; got != "regex" {
+		t.Fatalf("expected inferred title 'regex', got %q", got)
+	}
+}
@@ -2,30 +2,337 @@ package scraper

 import (
 	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"sort"
+	"strings"
+	"time"
+
+	"gopkg.in/yaml.v3"
 )

 // OpenAPIScraper parses OpenAPI/Swagger specifications.
 type OpenAPIScraper struct {
+	// config may be nil; when set, its UserAgent is sent on spec fetches.
 	config *Config
+	// client performs the HTTP requests for remote specs.
+	client *http.Client
 }

 // NewOpenAPIScraper creates a new OpenAPI scraper.
 func NewOpenAPIScraper(config *Config) *OpenAPIScraper {
-	return &OpenAPIScraper{config: config}
+	timeout := 30 * time.Second
+	if config != nil && config.Timeout > 0 {
+		timeout = config.Timeout
+	}
+	return &OpenAPIScraper{
+		config: config,
+		client: &http.Client{Timeout: timeout},
+	}
 }

 // Scrape fetches and parses an OpenAPI specification.
 func (s *OpenAPIScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
-	// TODO: Implement OpenAPI parsing
-	// 1. Fetch spec from URL
-	// 2. Parse endpoints, schemas, descriptions
-	// 3. Create documents per endpoint
-	// 4. Include authentication, parameters
-	return nil, nil
+	if source == nil {
+		return nil, fmt.Errorf("source is required")
+	}
+
+	raw, specURL, err := s.readSpec(ctx, source)
+	if err != nil {
+		return nil, err
+	}
+
+	spec, err := parseOpenAPISpec(raw)
+	if err != nil {
+		return nil, err
+	}
+
+	docs := make([]*Document, 0)
+	mainContent := buildMainSpecContent(spec)
+	docs = append(docs, &Document{
+		ID:      generateDocID(specURL + "#openapi"),
+		Source:  coalesceSourceName(source.Name, "openapi"),
+		Type:    "openapi-spec",
+		Title:   spec.Info.Title,
+		Content: mainContent,
+		URL:     specURL,
+		Metadata: map[string]interface{}{
+			"openapi": spec.Version,
+			"servers": spec.Servers,
+		},
+		Hash:      hashBytes(raw),
+		Timestamp: time.Now(),
+	})
+
+	paths := make([]string, 0, len(spec.Paths))
+	for path := range spec.Paths {
+		paths = append(paths, path)
+	}
+	sort.Strings(paths)
+
+	for _, p := range paths {
+		opMap := spec.Paths[p]
+		methods := make([]string, 0, len(opMap))
+		for m := range opMap {
+			methods = append(methods, strings.ToUpper(m))
+		}
+		sort.Strings(methods)
+
+		for _, method := range methods {
+			op := opMap[strings.ToLower(method)]
+			if op == nil {
+				continue
+			}
+			title := strings.TrimSpace(op.Summary)
+			if title == "" {
+				title = fmt.Sprintf("%s %s", method, p)
+			}
+			content := buildOperationContent(method, p, op)
+			docURL := fmt.Sprintf("%s#%s-%s", specURL, strings.ToLower(method), sanitizeFragment(p))
+			docs = append(docs, &Document{
+				ID:      generateDocID(docURL),
+				Source:  coalesceSourceName(source.Name, "openapi"),
+				Type:    "openapi-operation",
+				Title:   title,
+				Content: content,
+				URL:     docURL,
+				Metadata: map[string]interface{}{
+					"method":       method,
+					"path":         p,
+					"operation_id": op.OperationID,
+				},
+				Hash:      hashString(content),
+				Timestamp: time.Now(),
+			})
+		}
+	}
+
+	return docs, nil
 }

 // DetectChanges checks if the spec has been updated.
 func (s *OpenAPIScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
-	// TODO: Check spec content hash
-	return false, "", nil
+	if source == nil {
+		return false, "", fmt.Errorf("source is required")
+	}
+	raw, _, err := s.readSpec(ctx, source)
+	if err != nil {
+		return false, "", err
+	}
+	hash := hashBytes(raw)
+	return hash != lastHash, hash, nil
+}
+
+func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte, string, error) {
+	rawPath := strings.TrimSpace(source.URL)
+	if rawPath == "" {
+		rawPath = strings.TrimSpace(source.Path)
+	}
+	if rawPath == "" {
+		return nil, "", fmt.Errorf("openapi source requires url or path")
+	}
+
+	if strings.HasPrefix(rawPath, "http://") || strings.HasPrefix(rawPath, "https://") {
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawPath, nil)
+		if err != nil {
+			return nil, "", err
+		}
+		if s.config != nil && strings.TrimSpace(s.config.UserAgent) != "" {
+			req.Header.Set("User-Agent", s.config.UserAgent)
+		}
+
+		resp, err := s.client.Do(req)
+		if err != nil {
+			return nil, "", err
+		}
+		defer resp.Body.Close()
+		if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+			return nil, "", fmt.Errorf("openapi fetch failed: HTTP %d", resp.StatusCode)
+		}
+		body, err := io.ReadAll(io.LimitReader(resp.Body, 10<<20))
+		if err != nil {
+			return nil, "", err
+		}
+		return body, rawPath, nil
+	}
+
+	b, err := os.ReadFile(rawPath)
+	if err != nil {
+		return nil, "", err
+	}
+	return b, "file://" + rawPath, nil
+}
+
+// openAPISpec is the subset of an OpenAPI/Swagger document that this scraper
+// decodes, from either JSON or YAML.
+type openAPISpec struct {
+	// Version holds the "openapi" field; parseOpenAPISpec copies Swagger
+	// into it when it is blank.
+	Version string               `json:"openapi" yaml:"openapi"`
+	Swagger string               `json:"swagger" yaml:"swagger"`
+	Info    openAPIInfo          `json:"info" yaml:"info"`
+	Servers []openAPIServer      `json:"servers" yaml:"servers"`
+	// Paths is keyed by path template.
+	Paths   map[string]pathItems `json:"paths" yaml:"paths"`
+}
+
+// openAPIInfo mirrors the title, version and description of the spec's "info"
+// section.
+type openAPIInfo struct {
+	Title       string `json:"title" yaml:"title"`
+	Version     string `json:"version" yaml:"version"`
+	Description string `json:"description" yaml:"description"`
+}
+
+// openAPIServer is one entry of the spec's "servers" list.
+type openAPIServer struct {
+	URL         string `json:"url" yaml:"url"`
+	Description string `json:"description" yaml:"description"`
+}
+
+type pathItems map[string]*openAPIOperation
+
+// openAPIOperation is the decoded form of a single operation (one HTTP method
+// on one path).
+type openAPIOperation struct {
+	Summary     string                `json:"summary" yaml:"summary"`
+	Description string                `json:"description" yaml:"description"`
+	OperationID string                `json:"operationId" yaml:"operationId"`
+	Parameters  []openAPIParameter    `json:"parameters" yaml:"parameters"`
+	// Responses is keyed by the response code string used in the spec.
+	Responses   map[string]response   `json:"responses" yaml:"responses"`
+	RequestBody map[string]any        `json:"requestBody" yaml:"requestBody"`
+	Tags        []string              `json:"tags" yaml:"tags"`
+	Deprecated  bool                  `json:"deprecated" yaml:"deprecated"`
+	Security    []map[string][]string `json:"security" yaml:"security"`
+}
+
+// openAPIParameter describes one entry of an operation's "parameters" list.
+type openAPIParameter struct {
+	Name        string `json:"name" yaml:"name"`
+	In          string `json:"in" yaml:"in"`
+	Description string `json:"description" yaml:"description"`
+	Required    bool   `json:"required" yaml:"required"`
+}
+
+// response keeps only the description of an operation response.
+type response struct {
+	Description string `json:"description" yaml:"description"`
+}
+
+// parseOpenAPISpec decodes raw as an OpenAPI/Swagger document, trying JSON
+// first and YAML second.
+//
+// After decoding it fills defaults: a blank title becomes
+// "OpenAPI Specification", a blank "openapi" version falls back to the
+// "swagger" field, and a missing paths section becomes an empty map.
+//
+// When both decoders fail, the returned error wraps the JSON error and also
+// includes the YAML error text, so YAML input is not reported with only an
+// unrelated JSON message.
+func parseOpenAPISpec(raw []byte) (*openAPISpec, error) {
+	var spec openAPISpec
+	if jsonErr := json.Unmarshal(raw, &spec); jsonErr != nil {
+		// json.Unmarshal can leave spec partly filled after a type error;
+		// start the YAML attempt from a clean value.
+		spec = openAPISpec{}
+		if yamlErr := yaml.Unmarshal(raw, &spec); yamlErr != nil {
+			return nil, fmt.Errorf("invalid openapi content: %w (as yaml: %v)", jsonErr, yamlErr)
+		}
+	}
+
+	if strings.TrimSpace(spec.Info.Title) == "" {
+		spec.Info.Title = "OpenAPI Specification"
+	}
+	if strings.TrimSpace(spec.Version) == "" {
+		spec.Version = spec.Swagger
+	}
+	if spec.Paths == nil {
+		spec.Paths = map[string]pathItems{}
+	}
+
+	return &spec, nil
+}
+
+// buildMainSpecContent renders the Markdown overview of a spec: a title
+// heading, the API/OpenAPI versions when present, the number of paths, the
+// trimmed description and the list of servers.
+func buildMainSpecContent(spec *openAPISpec) string {
+	var out strings.Builder
+
+	out.WriteString("# " + spec.Info.Title + "\n\n")
+	if v := spec.Info.Version; v != "" {
+		out.WriteString("- API Version: " + v + "\n")
+	}
+	if v := spec.Version; v != "" {
+		out.WriteString("- OpenAPI: " + v + "\n")
+	}
+	fmt.Fprintf(&out, "- Paths: %d\n", len(spec.Paths))
+
+	if desc := spec.Info.Description; desc != "" {
+		out.WriteString("\n" + strings.TrimSpace(desc) + "\n")
+	}
+
+	if len(spec.Servers) > 0 {
+		out.WriteString("\n## Servers\n")
+		for _, server := range spec.Servers {
+			line := "- " + server.URL
+			if server.Description != "" {
+				line += " - " + server.Description
+			}
+			out.WriteString(line + "\n")
+		}
+	}
+	return out.String()
+}
+
+// buildOperationContent renders one operation as Markdown: a "METHOD path"
+// heading, the summary and description, a bullet list of operation ID, tags
+// and deprecation, then "Parameters" and "Responses" sections (responses
+// sorted by code string). Empty parts are omitted.
+func buildOperationContent(method, path string, op *openAPIOperation) string {
+	var out strings.Builder
+
+	out.WriteString("# " + method + " " + path + "\n\n")
+	for _, text := range []string{op.Summary, op.Description} {
+		if text != "" {
+			out.WriteString(strings.TrimSpace(text) + "\n\n")
+		}
+	}
+
+	if op.OperationID != "" {
+		out.WriteString("- Operation ID: `" + op.OperationID + "`\n")
+	}
+	if len(op.Tags) > 0 {
+		out.WriteString("- Tags: " + strings.Join(op.Tags, ", ") + "\n")
+	}
+	if op.Deprecated {
+		out.WriteString("- Deprecated: true\n")
+	}
+
+	if len(op.Parameters) > 0 {
+		out.WriteString("\n## Parameters\n")
+		for _, param := range op.Parameters {
+			need := "optional"
+			if param.Required {
+				need = "required"
+			}
+			line := fmt.Sprintf("- `%s` (%s, %s)", param.Name, param.In, need)
+			if param.Description != "" {
+				line += ": " + strings.TrimSpace(param.Description)
+			}
+			out.WriteString(line + "\n")
+		}
+	}
+
+	if len(op.Responses) > 0 {
+		// Sort the codes so the section is stable despite map ordering.
+		statuses := make([]string, 0, len(op.Responses))
+		for status := range op.Responses {
+			statuses = append(statuses, status)
+		}
+		sort.Strings(statuses)
+
+		out.WriteString("\n## Responses\n")
+		for _, status := range statuses {
+			line := "- `" + status + "`"
+			if desc := op.Responses[status].Description; desc != "" {
+				line += ": " + strings.TrimSpace(desc)
+			}
+			out.WriteString(line + "\n")
+		}
+	}
+	return out.String()
+}
+
+// sanitizeFragment turns a path template into a URL-fragment slug: the text is
+// lower-cased, "/" becomes "-", braces are dropped and dashes at either end
+// are trimmed. An empty result is replaced by "root".
+func sanitizeFragment(path string) string {
+	slug := strings.NewReplacer("/", "-", "{", "", "}", "").Replace(strings.ToLower(path))
+	if slug = strings.Trim(slug, "-"); slug == "" {
+		return "root"
+	}
+	return slug
+}
+
+// hashBytes returns the lower-case hex encoding of the SHA-256 digest of b.
+func hashBytes(b []byte) string {
+	hasher := sha256.New()
+	hasher.Write(b)
+	return hex.EncodeToString(hasher.Sum(nil))
+}
+
+// hashString returns the lower-case hex encoding of the SHA-256 digest of s.
+func hashString(s string) string {
+	return fmt.Sprintf("%x", sha256.Sum256([]byte(s)))
+}
+
+// coalesceSourceName returns name unchanged unless it is blank (empty or only
+// whitespace), in which case fallback is returned.
+func coalesceSourceName(name, fallback string) string {
+	if strings.TrimSpace(name) == "" {
+		return fallback
+	}
+	return name
 }
@@ -0,0 +1,77 @@
+package scraper
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+)
+
+// TestOpenAPIScraperScrape serves a minimal JSON spec over HTTP and checks
+// that scraping yields the overview document plus a document for the
+// "List pets" operation.
+func TestOpenAPIScraperScrape(t *testing.T) {
+	spec := `{
+  "openapi": "3.0.0",
+  "info": {"title": "Pet API", "version": "1.0.0"},
+  "paths": {
+    "/pets": {
+      "get": {
+        "summary": "List pets",
+        "operationId": "listPets",
+        "responses": {"200": {"description": "ok"}}
+      }
+    }
+  }
+}`
+
+	serveSpec := func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(spec))
+	}
+	srv := httptest.NewServer(http.HandlerFunc(serveSpec))
+	defer srv.Close()
+
+	scraper := NewOpenAPIScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
+	source := &Source{Name: "pet", Type: SourceTypeOpenAPI, URL: srv.URL}
+
+	docs, err := scraper.Scrape(context.Background(), source)
+	if err != nil {
+		t.Fatalf("unexpected scrape error: %v", err)
+	}
+	if len(docs) < 2 {
+		t.Fatalf("expected at least 2 docs, got %d", len(docs))
+	}
+
+	hasOperation := func() bool {
+		for _, d := range docs {
+			if strings.Contains(d.Title, "List pets") {
+				return true
+			}
+		}
+		return false
+	}
+	if !hasOperation() {
+		t.Fatal("expected operation document")
+	}
+}
+
+// TestOpenAPIScraperDetectChanges checks that the first call (empty previous
+// hash) reports a change with a non-empty hash, and that passing that hash
+// back reports no change.
+func TestOpenAPIScraperDetectChanges(t *testing.T) {
+	spec := `{"openapi":"3.0.0","info":{"title":"API"},"paths":{}}`
+	serveSpec := func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write([]byte(spec))
+	}
+	srv := httptest.NewServer(http.HandlerFunc(serveSpec))
+	defer srv.Close()
+
+	ctx := context.Background()
+	scraper := NewOpenAPIScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
+	src := &Source{Name: "api", Type: SourceTypeOpenAPI, URL: srv.URL}
+
+	changed, hash1, err := scraper.DetectChanges(ctx, src, "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !changed || hash1 == "" {
+		t.Fatalf("expected changed=true and non-empty hash, changed=%v hash=%q", changed, hash1)
+	}
+
+	changed, _, err = scraper.DetectChanges(ctx, src, hash1)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if changed {
+		t.Fatal("expected no changes when hash matches")
+	}
+}
@@ -5,6 +5,7 @@ func init() {
 	// Additional scrapers can be registered in their own packages
 	RegisterScraper(SourceTypeWeb, func(c *Config) Scraper { return NewWebScraper(c) })
 	RegisterScraper(SourceTypeLocal, func(c *Config) Scraper { return NewLocalScraper(c) })
+	RegisterScraper(SourceTypeLocalSearch, func(c *Config) Scraper { return NewLocalSearchScraper(c) })
 	RegisterScraper(SourceTypeGitHub, func(c *Config) Scraper { return NewGitHubScraper(c) })
 	RegisterScraper(SourceTypeOpenAPI, func(c *Config) Scraper { return NewOpenAPIScraper(c) })
 }
@@ -0,0 +1,71 @@
+package scraper_test
+
+import (
+	"testing"
+	"time"
+
+	basescraper "github.com/yourorg/devour/internal/scraper"
+	_ "github.com/yourorg/devour/internal/scraper/external"
+)
+
+// TestLanguageScrapersAreRegistered checks that NewScraper returns a non-nil
+// scraper for every documentation source type listed below.
+func TestLanguageScrapersAreRegistered(t *testing.T) {
+	cfg := &basescraper.Config{
+		UserAgent: "DevourTest/1.0",
+		Timeout:   2 * time.Second,
+	}
+
+	docTypes := []basescraper.SourceType{
+		basescraper.SourceTypeGoDocs,
+		basescraper.SourceTypeRustDocs,
+		basescraper.SourceTypePythonDocs,
+		basescraper.SourceTypeJavaDocs,
+		basescraper.SourceTypeSpringDocs,
+		basescraper.SourceTypeTSDocs,
+		basescraper.SourceTypeReactDocs,
+		basescraper.SourceTypeVueDocs,
+		basescraper.SourceTypeNuxtDocs,
+		basescraper.SourceTypeMCPDocs,
+		basescraper.SourceTypeDockerDocs,
+		basescraper.SourceTypeCloudflareDocs,
+		basescraper.SourceTypeAstroDocs,
+	}
+
+	for i := range docTypes {
+		docType := docTypes[i]
+		t.Run(string(docType), func(t *testing.T) {
+			if basescraper.NewScraper(docType, cfg) == nil {
+				t.Fatalf("NewScraper(%q) returned nil; scraper was not registered", docType)
+			}
+		})
+	}
+}
+
+func TestDetectSourceType_ForSupportedDocsHosts(t *testing.T) {
+	tests := []struct {
+		input    string
+		expected basescraper.SourceType
+	}{
+		{"https://pkg.go.dev/net/http", basescraper.SourceTypeGoDocs},
+		{"https://docs.rs/tokio/latest/tokio/", basescraper.SourceTypeRustDocs},
+		{"https://docs.python.org/3/library/asyncio.html", basescraper.SourceTypePythonDocs},
+		{"https://docs.oracle.com/javase/8/docs/api/java/util/List.html", basescraper.SourceTypeJavaDocs},
+		{"https://docs.spring.io/spring-boot/docs/current/reference/htmlsingle/", basescraper.SourceTypeSpringDocs},
+		{"https://www.typescriptlang.org/docs/handbook/2/basic-types.html", basescraper.SourceTypeTSDocs},
+		{"https://react.dev/reference/react/hooks", basescraper.SourceTypeReactDocs},
+		{"https://vuejs.org/guide/introduction.html", basescraper.SourceTypeVueDocs},
+		{"https://nuxt.com/docs/guide/directory-structure", basescraper.SourceTypeNuxtDocs},
+		{"https://docs.docker.com/compose", basescraper.SourceTypeDockerDocs},
+		{"https://hub.docker.com/mcp/server/github", basescraper.SourceTypeMCPDocs},
+		{"https://developers.cloudflare.com/workers", basescraper.SourceTypeCloudflareDocs},
+		{"https://docs.astro.build/en/guides/components/", basescraper.SourceTypeAstroDocs},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.input, func(t *testing.T) {
+			got := basescraper.DetectSourceType(tt.input)
+			if got != tt.expected {
+				t.Fatalf("DetectSourceType(%q) = %q, want %q", tt.input, got, tt.expected)
+			}
+		})
+	}
+}
@@ -28,7 +28,7 @@ func (r *ScraperRegistry) Register(sourceType SourceType, constructor ScraperCon
 // Create creates a scraper instance
+// for sourceType by calling its registered constructor and passing the result
+// through wrapScraper. It returns nil when no constructor is registered.
 func (r *ScraperRegistry) Create(sourceType SourceType, config *Config) Scraper {
 	if constructor, exists := r.constructors[sourceType]; exists {
-		return constructor(config)
+		return wrapScraper(constructor(config))
 	}
 	return nil
 }
@@ -17,6 +17,7 @@ const (
 	SourceTypeGitHub         SourceType = "github"
 	SourceTypeOpenAPI        SourceType = "openapi"
 	SourceTypeLocal          SourceType = "local"
+	SourceTypeLocalSearch    SourceType = "localsearch"
 	SourceTypeGoDocs         SourceType = "godocs"
 	SourceTypeRustDocs       SourceType = "rustdocs"
 	SourceTypePythonDocs     SourceType = "pythondocs"
@@ -34,15 +35,18 @@ const (

 // Source represents a documentation source to scrape.
 type Source struct {
-	Name     string     `yaml:"name"`
-	Type     SourceType `yaml:"type"`
-	URL      string     `yaml:"url,omitempty"`
-	Repo     string     `yaml:"repo,omitempty"`
-	Branch   string     `yaml:"branch,omitempty"`
-	Path     string     `yaml:"path,omitempty"`
-	Include  []string   `yaml:"include,omitempty"`
-	Exclude  []string   `yaml:"exclude,omitempty"`
-	Schedule string     `yaml:"schedule,omitempty"`
+	Name        string     `yaml:"name"`
+	Type        SourceType `yaml:"type"`
+	URL         string     `yaml:"url,omitempty"`
+	// Query is the search text for localsearch sources; the local search
+	// tests expect it to reach the search endpoint as the "q" parameter.
+	Query       string     `yaml:"query,omitempty"`
+	// ResultLimit presumably caps how many search results are processed -
+	// TODO confirm against the local search scraper.
+	ResultLimit int        `yaml:"result_limit,omitempty"`
+	// Domains restricts localsearch results to the listed hosts; the local
+	// search tests expect out-of-scope results to be dropped.
+	Domains     []string   `yaml:"domains,omitempty"`
+	Repo        string     `yaml:"repo,omitempty"`
+	Branch      string     `yaml:"branch,omitempty"`
+	Path        string     `yaml:"path,omitempty"`
+	Include     []string   `yaml:"include,omitempty"`
+	Exclude     []string   `yaml:"exclude,omitempty"`
+	Schedule    string     `yaml:"schedule,omitempty"`
 }

 // Document represents a scraped document.
@@ -113,6 +117,11 @@ func DetectSourceType(input string) SourceType {
 		}
 	}

+	// MCP servers are hosted under Docker Hub paths.
+	if strings.Contains(input, "hub.docker.com/mcp/") {
+		return SourceTypeMCPDocs
+	}
+
 	// Check for OpenAPI specs
 	if strings.HasSuffix(input, ".json") || strings.HasSuffix(input, ".yaml") || strings.HasSuffix(input, ".yml") {
 		if strings.Contains(strings.ToLower(input), "openapi") || strings.Contains(strings.ToLower(input), "swagger") {
@@ -6,8 +6,10 @@ import (
 	"encoding/hex"
 	"fmt"
 	"net/url"
+	"path"
 	"regexp"
 	"strings"
+	"sync"
 	"time"

 	"github.com/gocolly/colly/v2"
@@ -27,17 +29,42 @@ func NewWebScraper(config *Config) *WebScraper {
 func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
 	var documents []*Document
 	visited := make(map[string]bool)
+	scheduled := make(map[string]bool)
+	contentHashes := make(map[string]bool)
+	var mu sync.Mutex
+	var scrapeErrors []string

 	// Parse base URL for domain restrictions
 	baseURL, err := url.Parse(source.URL)
 	if err != nil {
 		return nil, fmt.Errorf("invalid URL: %w", err)
 	}
+	allowedDomain := baseURL.Hostname()
+	if allowedDomain == "" {
+		allowedDomain = baseURL.Host
+	}
+
+	maxDepth := s.config.MaxDepth
+	if maxDepth <= 0 {
+		maxDepth = 2
+	}
+	maxPages := s.config.Concurrency * 40
+	if maxPages < 20 {
+		maxPages = 20
+	}
+	if maxDepth <= 1 && maxPages > 30 {
+		maxPages = 30
+	}
+	if maxPages > 300 {
+		maxPages = 300
+	}
+	scopePrefix := pathScopePrefix(baseURL.Path)
+	scopeLeaf := pathScopeLeaf(baseURL.Path)

 	// Create Colly collector
 	c := colly.NewCollector(
-		colly.AllowedDomains(baseURL.Host),
-		colly.MaxDepth(s.config.MaxDepth),
+		colly.AllowedDomains(allowedDomain),
+		colly.MaxDepth(maxDepth),
 		colly.Async(true),
 		colly.UserAgent(s.config.UserAgent),
 	)
@@ -65,18 +92,40 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e

 	// Handle errors
 	c.OnError(func(r *colly.Response, err error) {
-		fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
+		errText := strings.ToLower(err.Error())
+		if strings.Contains(errText, "already visited") {
+			return
+		}
+		reqURL := source.URL
+		if r != nil && r.Request != nil && r.Request.URL != nil {
+			reqURL = r.Request.URL.String()
+		}
+		mu.Lock()
+		if len(scrapeErrors) < 20 {
+			scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err))
+		}
+		mu.Unlock()
 	})

 	// Extract content from pages
 	c.OnHTML("html", func(e *colly.HTMLElement) {
 		pageURL := e.Request.URL.String()
+		if !withinScope(e.Request.URL, baseURL, scopePrefix, scopeLeaf) {
+			return
+		}

 		// Skip if already visited
+		mu.Lock()
 		if visited[pageURL] {
+			mu.Unlock()
+			return
+		}
+		if len(visited) >= maxPages {
+			mu.Unlock()
 			return
 		}
 		visited[pageURL] = true
+		mu.Unlock()

 		// Check include/exclude patterns
 		if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
@@ -99,6 +148,13 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e

 		// Generate hash for change detection
 		hash := s.generateHash(content)
+		mu.Lock()
+		if contentHashes[hash] {
+			mu.Unlock()
+			return
+		}
+		contentHashes[hash] = true
+		mu.Unlock()

 		// Extract metadata
 		metadata := map[string]interface{}{
@@ -120,30 +176,74 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
 			Timestamp: time.Now(),
 		}

+		mu.Lock()
 		documents = append(documents, doc)
+		mu.Unlock()
 	})

 	// Follow links
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
 		link := e.Attr("href")
 		absoluteURL := e.Request.AbsoluteURL(link)
-
-		// Skip if already visited
-		if visited[absoluteURL] {
+		if absoluteURL == "" {
 			return
 		}

+		linkURL, err := url.Parse(absoluteURL)
+		if err != nil {
+			return
+		}
+		if linkURL.Scheme != "http" && linkURL.Scheme != "https" {
+			return
+		}
+		if !withinScope(linkURL, baseURL, scopePrefix, scopeLeaf) {
+			return
+		}
+
+		// Skip if already visited
+		mu.Lock()
+		if visited[absoluteURL] {
+			mu.Unlock()
+			return
+		}
+		if len(visited) >= maxPages {
+			mu.Unlock()
+			return
+		}
+		mu.Unlock()
+
 		// Check include/exclude patterns
 		if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
 			return
 		}

+		mu.Lock()
+		if scheduled[absoluteURL] {
+			mu.Unlock()
+			return
+		}
+		if len(scheduled) >= maxPages {
+			mu.Unlock()
+			return
+		}
+		scheduled[absoluteURL] = true
+		mu.Unlock()
+
 		if err := c.Visit(absoluteURL); err != nil {
-			fmt.Printf("Error visiting %s: %v\n", absoluteURL, err)
+			errText := strings.ToLower(err.Error())
+			if strings.Contains(errText, "already visited") {
+				return
+			}
+			mu.Lock()
+			if len(scrapeErrors) < 20 {
+				scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err))
+			}
+			mu.Unlock()
 		}
 	})

 	// Start scraping
+	scheduled[source.URL] = true
 	if err := c.Visit(source.URL); err != nil {
 		return nil, fmt.Errorf("failed to start scraping: %w", err)
 	}
@@ -151,6 +251,16 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
 	// Wait for async scraping to complete
 	c.Wait()

+	mu.Lock()
+	defer mu.Unlock()
+
+	if len(documents) == 0 {
+		if len(scrapeErrors) > 0 {
+			return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; "))
+		}
+		return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
+	}
+
 	return documents, nil
 }

@@ -283,6 +393,25 @@ func (s *WebScraper) generateHash(content string) string {

 // cleanText removes extra whitespace and normalizes text.
 func cleanText(text string) string {
+	noisePhrases := []string{
+		"table of contents",
+		"in this article",
+		"additional resources",
+		"feedback",
+		"collaborate with us on github",
+		"copyright",
+		"all rights reserved",
+		"privacy policy",
+		"terms of service",
+		"sign in",
+		"skip to main content",
+		"ask learn",
+	}
+	for _, phrase := range noisePhrases {
+		re := regexp.MustCompile(`(?i)` + regexp.QuoteMeta(phrase))
+		text = re.ReplaceAllString(text, " ")
+	}
+
 	// Replace multiple whitespace with single space
 	re := regexp.MustCompile(`\s+`)
 	text = re.ReplaceAllString(text, " ")
@@ -292,3 +421,58 @@ func cleanText(text string) string {

 	return text
 }
+
+// pathScopePrefix derives the path prefix that bounds a crawl starting at
+// rawPath.
+//
+// The path is normalized with path.Clean. An empty or root path yields "",
+// which withinScope treats as "no path restriction". Otherwise the parent
+// directory is returned, unless that parent is the site root: then the cleaned
+// path itself is returned so a root-level page or section does not widen the
+// scope to the whole host.
+func pathScopePrefix(rawPath string) string {
+	normalized := path.Clean(rawPath)
+	switch normalized {
+	case ".", "/", "":
+		return ""
+	}
+
+	// Page-like ("index.html") and section-like ("guide") last segments
+	// resolve the same way: scope to the parent unless the parent is the root.
+	if parent := path.Dir(normalized); parent != "/" {
+		return parent
+	}
+	return normalized
+}
+
+// pathScopeLeaf returns the last segment of rawPath when it looks like a
+// document file name (it contains a "."), and "" otherwise, including for
+// empty and root paths. The path is normalized with path.Clean first.
+func pathScopeLeaf(rawPath string) string {
+	normalized := path.Clean(rawPath)
+	switch normalized {
+	case ".", "/", "":
+		return ""
+	}
+	if leaf := path.Base(normalized); strings.Contains(leaf, ".") {
+		return leaf
+	}
+	return ""
+}
+
+// withinScope reports whether target may be crawled for a scrape rooted at
+// base.
+//
+// A nil target or base is never in scope. Otherwise the hosts must match
+// case-insensitively, and one of the following must hold:
+//   - prefix is empty (no path restriction);
+//   - target's path equals prefix or lies beneath it on a path-segment
+//     boundary, so prefix "/docs" accepts "/docs/intro" but not "/docs-old";
+//   - leaf is non-empty and equals the last segment of target's path, which
+//     keeps a document reachable when it is served from another directory.
+func withinScope(target, base *url.URL, prefix, leaf string) bool {
+	if target == nil || base == nil {
+		return false
+	}
+	if !strings.EqualFold(target.Hostname(), base.Hostname()) {
+		return false
+	}
+	if prefix == "" {
+		return true
+	}
+	targetPath := target.Path
+	if targetPath == "" {
+		targetPath = "/"
+	}
+	// Compare whole segments: a raw HasPrefix would let "/docs" leak into
+	// sibling trees such as "/docs-legacy" or "/docsearch".
+	dir := strings.TrimSuffix(prefix, "/")
+	if targetPath == dir || strings.HasPrefix(targetPath, dir+"/") {
+		return true
+	}
+	return leaf != "" && path.Base(targetPath) == leaf
+}
@@ -0,0 +1,132 @@
+package scraper
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+)
+
+// TestWebScraper_ReturnsErrorWhenNothingFetched checks that Scrape returns an
+// error when the server answers every request with 404.
+func TestWebScraper_ReturnsErrorWhenNothingFetched(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(http.NotFound))
+	defer server.Close()
+
+	cfg := &Config{
+		UserAgent:   "DevourTest/1.0",
+		Timeout:     2 * time.Second,
+		Concurrency: 1,
+		MaxDepth:    1,
+	}
+	source := &Source{
+		Name: "missing",
+		Type: SourceTypeWeb,
+		URL:  server.URL + "/missing",
+	}
+
+	if _, err := NewWebScraper(cfg).Scrape(context.Background(), source); err == nil {
+		t.Fatal("expected error when web scrape yields no documents")
+	}
+}
+
+func TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write([]byte(`<html><head><title>Empty</title></head><body><p>tiny</p></body></html>`))
+	}))
+	defer srv.Close()
+
+	s := NewWebScraper(&Config{
+		UserAgent:   "DevourTest/1.0",
+		Timeout:     2 * time.Second,
+		Concurrency: 1,
+		MaxDepth:    1,
+	})
+
+	_, err := s.Scrape(context.Background(), &Source{
+		Name: "empty",
+		Type: SourceTypeWeb,
+		URL:  srv.URL,
+	})
+	if err == nil {
+		t.Fatal("expected error when page has no extractable docs")
+	}
+	if !strings.Contains(err.Error(), "extracted no documents") {
+		t.Fatalf("unexpected error message: %v", err)
+	}
+}
+
+// TestWebScraper_AllowsRedirectedDocumentPath checks that a start URL which
+// redirects to the same file name under another directory is still scraped,
+// and that the document carries the final URL.
+func TestWebScraper_AllowsRedirectedDocumentPath(t *testing.T) {
+	page := `<html><head><title>Regexp</title></head><body><main>` +
+		strings.Repeat("ruby docs content ", 30) +
+		`</main></body></html>`
+
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/core/Regexp.html":
+			http.Redirect(w, r, "/3.4.1/Regexp.html", http.StatusFound)
+		case "/3.4.1/Regexp.html":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(page))
+		default:
+			http.NotFound(w, r)
+		}
+	}
+	server := httptest.NewServer(http.HandlerFunc(handler))
+	defer server.Close()
+
+	webScraper := NewWebScraper(&Config{
+		UserAgent:   "DevourTest/1.0",
+		Timeout:     2 * time.Second,
+		Concurrency: 1,
+		MaxDepth:    1,
+	})
+	source := &Source{
+		Name: "ruby",
+		Type: SourceTypeWeb,
+		URL:  server.URL + "/core/Regexp.html",
+	}
+
+	docs, err := webScraper.Scrape(context.Background(), source)
+	if err != nil {
+		t.Fatalf("unexpected scrape error: %v", err)
+	}
+	if len(docs) == 0 {
+		t.Fatal("expected redirected page to be scraped")
+	}
+	if first := docs[0]; !strings.Contains(first.URL, "/3.4.1/Regexp.html") {
+		t.Fatalf("expected final redirected URL, got %q", first.URL)
+	}
+}
+
+// TestWebScraper_GlobalWrapperNormalizesOutput checks that a scraper obtained
+// from NewScraper returns documents with a normalized title.
+func TestWebScraper_GlobalWrapperNormalizesOutput(t *testing.T) {
+	page := `<html><head><title>Regex Guide ¶ deprecated</title></head><body><main>` +
+		strings.Repeat("docs content ", 20) +
+		`</main></body></html>`
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write([]byte(page))
+	}))
+	defer server.Close()
+
+	cfg := &Config{
+		UserAgent:   "DevourTest/1.0",
+		Timeout:     2 * time.Second,
+		Concurrency: 1,
+		MaxDepth:    1,
+	}
+	wrapped := NewScraper(SourceTypeWeb, cfg)
+	if wrapped == nil {
+		t.Fatal("expected web scraper")
+	}
+
+	source := &Source{Name: "test", Type: SourceTypeWeb, URL: server.URL}
+	docs, err := wrapped.Scrape(context.Background(), source)
+	if err != nil {
+		t.Fatalf("unexpected scrape error: %v", err)
+	}
+	if len(docs) == 0 {
+		t.Fatal("expected at least one document")
+	}
+	if got := docs[0].Title; got != "Regex Guide" {
+		t.Fatalf("expected normalized title, got %q", got)
+	}
+}
@@ -0,0 +1,98 @@
+package scraper
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net"
+	"strings"
+	"time"
+)
+
+// wrappedScraper adds retry and normalization behavior for all scraper implementations.
+// Scrape is retried and its output normalized; DetectChanges is forwarded unchanged.
+type wrappedScraper struct {
+	// inner is the decorated implementation; wrapScraper never stores nil here.
+	inner Scraper
+}
+
+// wrapScraper decorates inner with the retry and normalization behavior of
+// wrappedScraper. A nil inner yields a nil Scraper, so callers can keep
+// checking the result against nil.
+func wrapScraper(inner Scraper) Scraper {
+	if inner != nil {
+		return &wrappedScraper{inner: inner}
+	}
+	return nil
+}
+
+// Scrape runs the wrapped scraper and returns its documents after
+// NormalizeDocuments.
+//
+// Failure handling:
+//   - a nil source is rejected immediately;
+//   - once ctx is done, no further attempt is made and ctx.Err() is returned;
+//   - after the first failure only, the URL is retried once with a trailing
+//     slash, provided it starts with "http", does not already end in "/", and
+//     carries no query or fragment (appending "/" after "?" or "#" would change
+//     the query or fragment rather than the path);
+//   - errors accepted by isRetriableScrapeError are retried up to two more
+//     times, 300ms apart; any other error ends the loop.
+//
+// When every attempt fails, the error of the last regular attempt is returned.
+func (w *wrappedScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
+	if source == nil {
+		return nil, fmt.Errorf("source is required")
+	}
+
+	const (
+		retries = 2
+		delay   = 300 * time.Millisecond
+	)
+
+	var lastErr error
+	for attempt := 0; attempt <= retries; attempt++ {
+		docs, err := w.inner.Scrape(ctx, source)
+		if err == nil {
+			return NormalizeDocuments(docs), nil
+		}
+		lastErr = err
+
+		// The caller has given up: neither the fallback nor a retry can help.
+		if ctxErr := ctx.Err(); ctxErr != nil {
+			return nil, ctxErr
+		}
+
+		// One fallback: add trailing slash for doc sites when URL path looks page-like.
+		if attempt == 0 &&
+			strings.HasPrefix(source.URL, "http") &&
+			!strings.HasSuffix(source.URL, "/") &&
+			!strings.ContainsAny(source.URL, "?#") {
+			alt := *source
+			alt.URL = source.URL + "/"
+			if altDocs, altErr := w.inner.Scrape(ctx, &alt); altErr == nil {
+				return NormalizeDocuments(altDocs), nil
+			}
+		}
+
+		if attempt >= retries || !isRetriableScrapeError(err) {
+			break
+		}
+		if !sleepWithContext(ctx, delay) {
+			return nil, ctx.Err()
+		}
+	}
+
+	return nil, lastErr
+}
+
+// DetectChanges forwards to the wrapped scraper unchanged; no retry or
+// normalization is applied here.
+func (w *wrappedScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
+	return w.inner.DetectChanges(ctx, source, lastHash)
+}
+
+// isRetriableScrapeError reports whether err looks transient enough to retry.
+//
+// A nil error is not retriable. An error is retriable when its lower-cased
+// message contains one of the known transient markers (timeouts, resets, EOF,
+// HTTP 429/500/502/503/504), or when any error in its chain implements
+// net.Error.
+func isRetriableScrapeError(err error) bool {
+	if err == nil {
+		return false
+	}
+
+	transientMarkers := []string{
+		"timeout",
+		"temporarily unavailable",
+		"connection reset",
+		"eof",
+		"http 429",
+		"http 500",
+		"http 502",
+		"http 503",
+		"http 504",
+	}
+	msg := strings.ToLower(err.Error())
+	for _, marker := range transientMarkers {
+		if strings.Contains(msg, marker) {
+			return true
+		}
+	}
+
+	var netErr net.Error
+	return errors.As(err, &netErr)
+}
+
+// sleepWithContext waits for d or until ctx is done, whichever comes first.
+// It returns true when the full duration elapsed and false when ctx ended the
+// wait early. The timer is always stopped before returning.
+func sleepWithContext(ctx context.Context, d time.Duration) bool {
+	timer := time.NewTimer(d)
+	defer timer.Stop()
+
+	select {
+	case <-timer.C:
+		return true
+	case <-ctx.Done():
+		return false
+	}
+}
@@ -0,0 +1,45 @@
+package scraper
+
+import (
+	"context"
+	"fmt"
+	"testing"
+)
+
+// flakyStubScraper is a test double whose first Scrape call can be made to
+// fail, so retry behavior can be exercised.
+type flakyStubScraper struct {
+	// failFirst makes the first Scrape call return an "HTTP 503" error.
+	failFirst bool
+	// calls counts the Scrape invocations seen so far.
+	calls int
+}
+
+// Scrape fails on the first call when failFirst is set; every other call
+// returns a single document whose URL echoes source.URL.
+func (f *flakyStubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
+	f.calls++
+	if firstCall := f.calls == 1; firstCall && f.failFirst {
+		return nil, fmt.Errorf("HTTP 503")
+	}
+
+	doc := &Document{
+		Title:   "Example ¶ deprecated",
+		Content: "ok",
+		URL:     source.URL,
+		Type:    "test",
+	}
+	return []*Document{doc}, nil
+}
+
+// DetectChanges always reports a change with the fixed hash "hash".
+func (f *flakyStubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
+	const (
+		changed = true
+		newHash = "hash"
+	)
+	return changed, newHash, nil
+}
+
+// TestWrappedScraper_RetriesAndNormalizes checks that the wrapper recovers
+// from one failing call and returns a document with a normalized title.
+func TestWrappedScraper_RetriesAndNormalizes(t *testing.T) {
+	stub := &flakyStubScraper{failFirst: true}
+	source := &Source{URL: "https://example.com"}
+
+	docs, err := wrapScraper(stub).Scrape(context.Background(), source)
+	if err != nil {
+		t.Fatalf("expected retry to succeed, got error: %v", err)
+	}
+	if got := len(docs); got != 1 {
+		t.Fatalf("expected 1 document, got %d", got)
+	}
+	if got := docs[0].Title; got != "Example" {
+		t.Fatalf("expected normalized title, got %q", got)
+	}
+}
@@ -0,0 +1,528 @@
+package search
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"encoding/json"
+	"fmt"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+	"unicode/utf8"
+
+	"github.com/yourorg/devour/internal/config"
+)
+
+// Engine builds and queries a lexical (term-frequency) index that is
+// persisted as JSON on disk.
+type Engine struct {
+	// DocsDir is walked recursively for .json, .md and .txt documents.
+	DocsDir string
+	// IndexDir holds the persisted index file (lexical_index.json).
+	IndexDir string
+	// MetadataDir holds the index metadata file (lexical_index_meta.json).
+	MetadataDir string
+	// SnippetLength is the maximum snippet length passed to bestSnippet.
+	SnippetLength int
+}
+
+// SearchOptions tunes a single Engine.Search call.
+type SearchOptions struct {
+	// Limit caps the number of results; values <= 0 fall back to 5.
+	Limit int
+	// Threshold drops results scoring below it; values <= 0 disable the filter.
+	Threshold float64
+}
+
+// Result is one search hit returned by Engine.Search.
+type Result struct {
+	// ID is the index entry id (a hash of the document's file path).
+	ID string `json:"id"`
+	// DocID is the document's own id, or a derived hash when it has none.
+	DocID  string `json:"doc_id"`
+	Title  string `json:"title"`
+	URL    string `json:"url,omitempty"`
+	Type   string `json:"type"`
+	Source string `json:"source,omitempty"`
+	// Path is the file the document was indexed from.
+	Path string `json:"path"`
+	// Score is the lexical relevance score; results are sorted by it, descending.
+	Score float64 `json:"score"`
+	// Snippet is an excerpt of the content chosen by bestSnippet.
+	Snippet string `json:"snippet"`
+	// Meta carries extra per-hit data; Search sets "length" (indexed token count).
+	Meta map[string]any `json:"metadata,omitempty"`
+}
+
+// IndexStats summarizes the state of the on-disk index.
+type IndexStats struct {
+	// Documents is the number of indexed documents.
+	Documents int `json:"documents"`
+	// Tokens is the total token count. Rebuild fills it in; EnsureIndexed
+	// leaves it zero when it reuses an existing index.
+	Tokens int `json:"tokens"`
+	// LastIndexedAt is the build time of the index.
+	LastIndexedAt time.Time `json:"last_indexed_at"`
+	IndexPath     string    `json:"index_path"`
+	MetadataPath  string    `json:"metadata_path"`
+	// SourceFileHash fingerprints the document files (path, size, mtime).
+	SourceFileHash string `json:"source_file_hash"`
+}
+
+// indexedDoc is one document as stored in the persisted index.
+type indexedDoc struct {
+	// ID is a hash of the source file path.
+	ID string `json:"id"`
+	// DocID is the document's own id, or a hash of path and title when absent.
+	DocID  string `json:"doc_id"`
+	Title  string `json:"title"`
+	URL    string `json:"url,omitempty"`
+	Type   string `json:"type"`
+	Source string `json:"source,omitempty"`
+	Path   string `json:"path"`
+	// Content is the document text with whitespace collapsed.
+	Content string `json:"content"`
+	// TermFreq counts tokens across title, content, URL and type.
+	TermFreq map[string]int `json:"term_freq"`
+	// Length is the sum of all TermFreq counts.
+	Length int `json:"length"`
+}
+
+// persistedIndex is the JSON layout of the index file written by Rebuild.
+type persistedIndex struct {
+	// Version is the index format version (indexVersion at build time).
+	Version string       `json:"version"`
+	BuiltAt time.Time    `json:"built_at"`
+	Docs    []indexedDoc `json:"docs"`
+}
+
+// persistedMeta is the JSON layout of the metadata file written next to the
+// index; EnsureIndexed reads it to decide whether a rebuild is needed.
+type persistedMeta struct {
+	Version string    `json:"version"`
+	BuiltAt time.Time `json:"built_at"`
+	DocsDir string    `json:"docs_dir"`
+	// SourceFileHash is the document-set fingerprint at build time.
+	SourceFileHash string `json:"source_file_hash"`
+	DocCount       int    `json:"doc_count"`
+}
+
+// rawDoc is a document as read from disk by parseDocFile, either decoded from
+// a structured JSON file or synthesized from a markdown, text or raw JSON file.
+type rawDoc struct {
+	ID       string         `json:"id"`
+	Source   string         `json:"source"`
+	Type     string         `json:"type"`
+	Title    string         `json:"title"`
+	Content  string         `json:"content"`
+	URL      string         `json:"url,omitempty"`
+	Metadata map[string]any `json:"metadata,omitempty"`
+}
+
+// File names and format version of the persisted lexical index.
+const (
+	// indexFileName is the index file, stored under Engine.IndexDir.
+	indexFileName = "lexical_index.json"
+	// metaFileName is the metadata file, stored under Engine.MetadataDir.
+	metaFileName = "lexical_index_meta.json"
+	// indexVersion is written into both files by Rebuild.
+	indexVersion = "1"
+)
+
+// NewEngine creates an Engine from the storage and indexing settings in cfg.
+// A non-positive configured snippet length falls back to 220.
+func NewEngine(cfg *config.Config) *Engine {
+	engine := &Engine{
+		DocsDir:       cfg.Storage.DocsDir,
+		IndexDir:      cfg.Storage.IndexDir,
+		MetadataDir:   cfg.Storage.MetadataDir,
+		SnippetLength: cfg.Indexing.SnippetLength,
+	}
+	if engine.SnippetLength <= 0 {
+		engine.SnippetLength = 220
+	}
+	return engine
+}
+
+// Rebuild re-reads every document file under DocsDir and rewrites the index
+// and metadata files from scratch.
+//
+// DocsDir must be non-blank; IndexDir and MetadataDir are created if missing.
+// Files that cannot be parsed, or whose content is blank, are skipped. The
+// loop stops with ctx.Err() as soon as ctx is done. On success the returned
+// stats describe the freshly written index, including its total token count.
+func (e *Engine) Rebuild(ctx context.Context) (*IndexStats, error) {
+	if strings.TrimSpace(e.DocsDir) == "" {
+		return nil, fmt.Errorf("docs directory is required")
+	}
+	for _, dir := range []string{e.IndexDir, e.MetadataDir} {
+		if err := os.MkdirAll(dir, 0o755); err != nil {
+			return nil, err
+		}
+	}
+
+	paths, sourceHash, err := e.listDocFiles()
+	if err != nil {
+		return nil, err
+	}
+
+	totalTokens := 0
+	indexed := make([]indexedDoc, 0, len(paths))
+	for _, p := range paths {
+		if err := ctx.Err(); err != nil {
+			return nil, err
+		}
+
+		raw, err := parseDocFile(p)
+		if err != nil || strings.TrimSpace(raw.Content) == "" {
+			continue
+		}
+
+		// Prefer the document's own id; otherwise derive a stable one.
+		docID := raw.ID
+		if docID == "" {
+			docID = hashString(p + ":" + raw.Title)
+		}
+
+		searchable := strings.Join([]string{raw.Title, raw.Content, raw.URL, raw.Type}, " ")
+		termFreq := frequency(tokenize(searchable))
+		docLen := 0
+		for _, n := range termFreq {
+			docLen += n
+		}
+		totalTokens += docLen
+
+		indexed = append(indexed, indexedDoc{
+			ID:       hashString(p),
+			DocID:    docID,
+			Title:    bestTitle(raw.Title, p),
+			URL:      strings.TrimSpace(raw.URL),
+			Type:     defaultString(strings.TrimSpace(raw.Type), "document"),
+			Source:   strings.TrimSpace(raw.Source),
+			Path:     p,
+			Content:  collapseWhitespace(raw.Content),
+			TermFreq: termFreq,
+			Length:   docLen,
+		})
+	}
+
+	builtAt := time.Now()
+
+	indexPath := filepath.Join(e.IndexDir, indexFileName)
+	index := persistedIndex{Version: indexVersion, BuiltAt: builtAt, Docs: indexed}
+	if err := writeJSON(indexPath, index); err != nil {
+		return nil, err
+	}
+
+	metaPath := filepath.Join(e.MetadataDir, metaFileName)
+	meta := persistedMeta{
+		Version:        indexVersion,
+		BuiltAt:        builtAt,
+		DocsDir:        e.DocsDir,
+		SourceFileHash: sourceHash,
+		DocCount:       len(indexed),
+	}
+	if err := writeJSON(metaPath, meta); err != nil {
+		return nil, err
+	}
+
+	stats := &IndexStats{
+		Documents:      len(indexed),
+		Tokens:         totalTokens,
+		LastIndexedAt:  builtAt,
+		IndexPath:      indexPath,
+		MetadataPath:   metaPath,
+		SourceFileHash: sourceHash,
+	}
+	return stats, nil
+}
+
+// EnsureIndexed makes sure a usable index exists for the current document
+// set, rebuilding it when necessary, and returns its stats.
+//
+// A rebuild is triggered when the metadata file is missing or unreadable as
+// JSON, when it was written by a different index format version, when the
+// index file itself is missing, or when the fingerprint of the document files
+// no longer matches the stored one. Otherwise the existing index is reused;
+// the returned stats then come from the metadata file and leave Tokens at
+// zero, because the token count is not persisted there.
+func (e *Engine) EnsureIndexed(ctx context.Context) (*IndexStats, error) {
+	metaPath := filepath.Join(e.MetadataDir, metaFileName)
+	b, err := os.ReadFile(metaPath)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return e.Rebuild(ctx)
+		}
+		return nil, err
+	}
+
+	var meta persistedMeta
+	if err := json.Unmarshal(b, &meta); err != nil {
+		return e.Rebuild(ctx)
+	}
+	// An index written in another format version cannot be trusted.
+	if meta.Version != indexVersion {
+		return e.Rebuild(ctx)
+	}
+
+	// Metadata can outlive the index file (separate directories, manual
+	// cleanup); without this check Search would fail until the docs change.
+	indexPath := filepath.Join(e.IndexDir, indexFileName)
+	if _, err := os.Stat(indexPath); err != nil {
+		if os.IsNotExist(err) {
+			return e.Rebuild(ctx)
+		}
+		return nil, err
+	}
+
+	_, sourceHash, err := e.listDocFiles()
+	if err != nil {
+		return nil, err
+	}
+	if sourceHash != meta.SourceFileHash {
+		return e.Rebuild(ctx)
+	}
+
+	return &IndexStats{
+		Documents:      meta.DocCount,
+		LastIndexedAt:  meta.BuiltAt,
+		IndexPath:      indexPath,
+		MetadataPath:   metaPath,
+		SourceFileHash: meta.SourceFileHash,
+	}, nil
+}
+
+// Search scores every indexed document against query and returns the best
+// matches together with the index stats.
+//
+// The query must be non-blank. The index is brought up to date through
+// EnsureIndexed and then loaded from disk. A query that yields no tokens
+// returns no results and no error. Documents with a non-positive score, or a
+// score below a positive opts.Threshold, are dropped. Results are ordered by
+// descending score, ties by title, and capped at opts.Limit (5 when unset).
+func (e *Engine) Search(ctx context.Context, query string, opts SearchOptions) ([]Result, *IndexStats, error) {
+	query = strings.TrimSpace(query)
+	if query == "" {
+		return nil, nil, fmt.Errorf("query is required")
+	}
+
+	stats, err := e.EnsureIndexed(ctx)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	raw, err := os.ReadFile(filepath.Join(e.IndexDir, indexFileName))
+	if err != nil {
+		return nil, nil, err
+	}
+	var idx persistedIndex
+	if err := json.Unmarshal(raw, &idx); err != nil {
+		return nil, nil, err
+	}
+
+	limit := opts.Limit
+	if limit <= 0 {
+		limit = 5
+	}
+	minScore := opts.Threshold
+	if minScore < 0 {
+		minScore = 0
+	}
+
+	queryTokens := tokenize(query)
+	if len(queryTokens) == 0 {
+		return nil, stats, nil
+	}
+	qFreq := frequency(queryTokens)
+
+	type hit struct {
+		doc   indexedDoc
+		score float64
+	}
+	hits := make([]hit, 0)
+	for _, doc := range idx.Docs {
+		if err := ctx.Err(); err != nil {
+			return nil, nil, err
+		}
+		score := lexicalScore(qFreq, queryTokens, doc)
+		if score <= 0 || (minScore > 0 && score < minScore) {
+			continue
+		}
+		hits = append(hits, hit{doc: doc, score: score})
+	}
+
+	sort.Slice(hits, func(a, b int) bool {
+		if hits[a].score != hits[b].score {
+			return hits[a].score > hits[b].score
+		}
+		return hits[a].doc.Title < hits[b].doc.Title
+	})
+	if len(hits) > limit {
+		hits = hits[:limit]
+	}
+
+	results := make([]Result, 0, len(hits))
+	for _, h := range hits {
+		results = append(results, Result{
+			ID:      h.doc.ID,
+			DocID:   h.doc.DocID,
+			Title:   h.doc.Title,
+			URL:     h.doc.URL,
+			Type:    h.doc.Type,
+			Source:  h.doc.Source,
+			Path:    h.doc.Path,
+			Score:   h.score,
+			Snippet: bestSnippet(h.doc.Content, queryTokens, e.SnippetLength),
+			Meta: map[string]any{
+				"length": h.doc.Length,
+			},
+		})
+	}
+
+	return results, stats, nil
+}
+
+// listDocFiles walks DocsDir and returns the sorted paths of all .json, .md
+// and .txt files plus a SHA-256 fingerprint over each file's path, size and
+// modification time.
+//
+// When the walk fails with a "not exist" error, an empty list and the hash of
+// the string "empty" are returned instead of an error. Any other walk or stat
+// error is returned as is.
+func (e *Engine) listDocFiles() ([]string, string, error) {
+	found := make([]string, 0)
+	digest := sha256.New()
+
+	visit := func(p string, entry fs.DirEntry, walkErr error) error {
+		if walkErr != nil {
+			return walkErr
+		}
+		if entry.IsDir() {
+			return nil
+		}
+		if ext := strings.ToLower(filepath.Ext(p)); ext != ".json" && ext != ".md" && ext != ".txt" {
+			return nil
+		}
+
+		info, err := entry.Info()
+		if err != nil {
+			return err
+		}
+		found = append(found, p)
+		fmt.Fprintf(digest, "%s|%d|%d\n", p, info.Size(), info.ModTime().UnixNano())
+		return nil
+	}
+
+	if err := filepath.WalkDir(e.DocsDir, visit); err != nil {
+		if os.IsNotExist(err) {
+			return []string{}, hashString("empty"), nil
+		}
+		return nil, "", err
+	}
+
+	sort.Strings(found)
+	return found, hex.EncodeToString(digest.Sum(nil)), nil
+}
+
+// parseDocFile reads the file at path and turns it into a rawDoc according
+// to its lower-cased extension:
+//   - ".json": decoded as a structured document when it parses and has a
+//     title or content; otherwise the raw bytes are indexed as text of type
+//     "json", titled with the file name;
+//   - ".md": type "markdown", titled with the first non-empty heading;
+//   - anything else: type "text", titled with the file name.
+//
+// Only a read failure is reported as an error.
+func parseDocFile(path string) (*rawDoc, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+
+	text := string(data)
+	name := filepath.Base(path)
+	ext := strings.ToLower(filepath.Ext(path))
+
+	if ext == ".md" {
+		return &rawDoc{Title: markdownTitle(text), Content: text, Type: "markdown", Source: "local"}, nil
+	}
+	if ext != ".json" {
+		return &rawDoc{Title: name, Content: text, Type: "text", Source: "local"}, nil
+	}
+
+	var doc rawDoc
+	if json.Unmarshal(data, &doc) == nil && (doc.Title != "" || doc.Content != "") {
+		return &doc, nil
+	}
+	// Not a structured doc JSON, index as raw text fallback.
+	return &rawDoc{Title: name, Content: text, Type: "json", Source: "local"}, nil
+}
+
+// markdownTitle returns the text of the first line that starts with "#"
+// (after trimming) and still has text once the leading "#" characters and
+// surrounding whitespace are removed. It returns "" when no such line exists.
+func markdownTitle(content string) string {
+	rest := content
+	for {
+		line, tail, more := strings.Cut(rest, "\n")
+		if candidate := strings.TrimSpace(line); strings.HasPrefix(candidate, "#") {
+			if heading := strings.TrimSpace(strings.TrimLeft(candidate, "#")); heading != "" {
+				return heading
+			}
+		}
+		if !more {
+			return ""
+		}
+		rest = tail
+	}
+}
+
+// writeJSON encodes v as indented JSON and stores it at path.
+//
+// The data is first written to a sibling "<path>.tmp" file and then renamed
+// over path, so a crash or a full disk never leaves a truncated file in
+// place. On failure the temporary file is removed and any existing file at
+// path is left untouched.
+func writeJSON(path string, v any) error {
+	b, err := json.MarshalIndent(v, "", "  ")
+	if err != nil {
+		return err
+	}
+
+	tmp := path + ".tmp"
+	if err := os.WriteFile(tmp, b, 0o644); err != nil {
+		_ = os.Remove(tmp) // best-effort cleanup; the write error is what matters
+		return err
+	}
+	if err := os.Rename(tmp, path); err != nil {
+		_ = os.Remove(tmp) // best-effort cleanup; the rename error is what matters
+		return err
+	}
+	return nil
+}
+
+func tokenize(input string) []string {
+	replacer := strings.NewReplacer(",", " ", ".", " ", "(", " ", ")", " ", "[", " ", "]", " ", "{", " ", "}", " ",
+		":", " ", ";", " ", "!", " ", "?", " ", "\n", " ", "\r", " ", "\t", " ", "\"", " ", "'", " ", "`", " ",
+	)
+	clean := strings.ToLower(replacer.Replace(input))
+	parts := strings.Fields(clean)
+	out := make([]string, 0, len(parts))
+	for _, p := range parts {
+		p = strings.TrimSpace(p)
+		if len(p) < 2 {
+			continue
+		}
+		out = append(out, p)
+	}
+	return out
+}
+
+// frequency counts how often each token occurs in tokens. The returned map
+// is never nil, even for empty input.
+func frequency(tokens []string) map[string]int {
+	counts := make(map[string]int, len(tokens))
+	for i := range tokens {
+		counts[tokens[i]] += 1
+	}
+	return counts
+}
+
+func lexicalScore(qFreq map[string]int, queryTokens []string, doc indexedDoc) float64 {
+	if len(doc.TermFreq) == 0 {
+		return 0
+	}
+
+	titleLower := strings.ToLower(doc.Title)
+	urlLower := strings.ToLower(doc.URL)
+	contentLower := strings.ToLower(doc.Content)
+
+	score := 0.0
+	for token, qCount := range qFreq {
+		dCount := doc.TermFreq[token]
+		if dCount == 0 {
+			continue
+		}
+		part := float64(dCount*qCount) / float64(max(1, doc.Length))
+		score += part * 8.0
+		if strings.Contains(titleLower, token) {
+			score += 2.5
+		}
+		if strings.Contains(urlLower, token) {
+			score += 1.2
+		}
+	}
+
+	phrase := strings.Join(queryTokens, " ")
+	if len(queryTokens) > 1 && strings.Contains(contentLower, phrase) {
+		score += 1.5
+	}
+	return score
+}
+
+// bestSnippet returns an excerpt of content of at most maxLen bytes (220 when
+// maxLen is not positive), with whitespace collapsed.
+//
+// Content that fits is returned whole. Otherwise the window starts a quarter
+// of maxLen before the first occurrence of the first query token that is
+// found (case-insensitively), or at the beginning when none is found. Both
+// window edges are moved back to a rune boundary so the snippet is always
+// valid UTF-8. "..." is appended when text follows the window.
+func bestSnippet(content string, queryTokens []string, maxLen int) string {
+	if maxLen <= 0 {
+		maxLen = 220
+	}
+	flat := collapseWhitespace(content)
+	if flat == "" {
+		return ""
+	}
+	if len(flat) <= maxLen {
+		return flat
+	}
+
+	lower := strings.ToLower(flat)
+	start := 0
+	for _, tok := range queryTokens {
+		if idx := strings.Index(lower, tok); idx >= 0 {
+			start = idx - (maxLen / 4)
+			break
+		}
+	}
+	if start < 0 {
+		start = 0
+	}
+	// Lower-casing can change byte lengths, so an offset found in lower may
+	// point past the end of flat; clamp it instead of slicing out of range.
+	if start > len(flat) {
+		start = len(flat)
+	}
+	for start > 0 && start < len(flat) && !utf8.RuneStart(flat[start]) {
+		start--
+	}
+
+	end := start + maxLen
+	if end > len(flat) {
+		end = len(flat)
+	}
+	for end > start && end < len(flat) && !utf8.RuneStart(flat[end]) {
+		end--
+	}
+
+	snippet := strings.TrimSpace(flat[start:end])
+	if end < len(flat) {
+		snippet += "..."
+	}
+	return snippet
+}
+
+// collapseWhitespace trims s and replaces every run of whitespace inside it
+// with a single space.
+func collapseWhitespace(s string) string {
+	// Fields already ignores leading and trailing whitespace.
+	words := strings.Fields(s)
+	return strings.Join(words, " ")
+}
+
+// bestTitle picks a display title: the trimmed title when it is non-empty,
+// otherwise the file name of path without its extension and with underscores
+// turned into spaces, and "Documentation" when that is empty too.
+func bestTitle(title, path string) string {
+	if trimmed := strings.TrimSpace(title); trimmed != "" {
+		return trimmed
+	}
+
+	name := filepath.Base(path)
+	name = strings.TrimSuffix(name, filepath.Ext(name))
+	name = strings.TrimSpace(strings.ReplaceAll(name, "_", " "))
+	if name != "" {
+		return name
+	}
+	return "Documentation"
+}
+
+// defaultString returns v unchanged unless it is empty or whitespace-only,
+// in which case fallback is returned.
+func defaultString(v, fallback string) string {
+	if strings.TrimSpace(v) != "" {
+		return v
+	}
+	return fallback
+}
+
+// hashString returns the first 12 bytes of the SHA-256 digest of s as a
+// 24-character lower-case hex string.
+func hashString(s string) string {
+	digest := sha256.Sum256([]byte(s))
+	short := digest[:12]
+	return hex.EncodeToString(short)
+}
+
+// max returns the larger of a and b.
+func max(a, b int) int {
+	if b > a {
+		return b
+	}
+	return a
+}
@@ -0,0 +1,56 @@
+package search
+
+import (
+	"context"
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/yourorg/devour/internal/config"
+)
+
+// TestEngineRebuildAndSearch writes one structured JSON document, rebuilds
+// the index, and expects a query on its terms to return at least one hit.
+func TestEngineRebuildAndSearch(t *testing.T) {
+	root := t.TempDir()
+	docsDir := filepath.Join(root, "docs")
+	if err := os.MkdirAll(docsDir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+
+	payload, _ := json.Marshal(map[string]any{
+		"id":      "1",
+		"title":   "HTTP Client",
+		"content": "Use net/http client with timeout",
+		"type":    "go-doc",
+		"source":  "go",
+		"url":     "https://pkg.go.dev/net/http",
+	})
+	if err := os.WriteFile(filepath.Join(docsDir, "doc.json"), payload, 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	cfg := config.Default()
+	cfg.Storage.DocsDir = docsDir
+	cfg.Storage.IndexDir = filepath.Join(root, "index")
+	cfg.Storage.MetadataDir = filepath.Join(root, "metadata")
+	engine := NewEngine(cfg)
+
+	ctx := context.Background()
+	stats, err := engine.Rebuild(ctx)
+	if err != nil {
+		t.Fatalf("rebuild failed: %v", err)
+	}
+	if stats.Documents == 0 {
+		t.Fatal("expected documents in index")
+	}
+
+	hits, _, err := engine.Search(ctx, "http timeout", SearchOptions{Limit: 5})
+	if err != nil {
+		t.Fatalf("search failed: %v", err)
+	}
+	if len(hits) == 0 {
+		t.Fatal("expected at least one search result")
+	}
+}
@@ -2,7 +2,16 @@
 package server

 import (
+	"bufio"
 	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"strings"
+	"sync"
+	"time"
 )

 // Config holds server configuration.
@@ -11,8 +20,13 @@ type Config struct {
 	Transport string `yaml:"transport"`
 	Host      string `yaml:"host"`
 	Port      int    `yaml:"port"`
+
+	Handler MethodHandler `yaml:"-"`
 }

+// MethodHandler executes a server method with raw params and returns result payload.
+// The returned value becomes the "result" of the JSON-RPC response; a non-nil
+// error is reported to the client as a JSON-RPC error carrying its message.
+type MethodHandler func(ctx context.Context, method string, params json.RawMessage) (any, error)
+
 // Server defines the MCP server interface.
 type Server interface {
 	// Start begins listening for connections.
@@ -47,9 +61,28 @@ type Result struct {
 	Metadata   map[string]any `json:"metadata,omitempty"`
 }

+// rpcRequest is the wire format of an incoming JSON-RPC request.
+type rpcRequest struct {
+	JSONRPC string `json:"jsonrpc"`
+	// ID is echoed back unchanged in the response.
+	ID     any    `json:"id"`
+	Method string `json:"method"`
+	// Params is kept undecoded and handed to the MethodHandler as is.
+	Params json.RawMessage `json:"params,omitempty"`
+}
+
+// rpcResponse is the wire format of an outgoing JSON-RPC response. handleRPC
+// sets either Result or Error, never both.
+type rpcResponse struct {
+	JSONRPC string    `json:"jsonrpc"`
+	ID      any       `json:"id"`
+	Result  any       `json:"result,omitempty"`
+	Error   *rpcError `json:"error,omitempty"`
+}
+
+// rpcError is the error object of a JSON-RPC response. This file uses the
+// codes -32700 (parse error), -32600 (invalid request) and -32000 (handler
+// error).
+type rpcError struct {
+	Code    int    `json:"code"`
+	Message string `json:"message"`
+}
+
 // NewServer creates a new MCP server.
+// A Mode of "remote" (compared case-insensitively) selects the HTTP server;
+// any other value selects the stdio server.
 func NewServer(config *Config) Server {
-	if config.Mode == "remote" {
+	if strings.EqualFold(config.Mode, "remote") {
 		return NewHTTPServer(config)
 	}
 	return NewStdioServer(config)
@@ -68,27 +101,156 @@ func NewStdioServer(config *Config) *StdioServer {
 // HTTPServer implements Server for HTTP transport.
 type HTTPServer struct {
 	config *Config
+	// http is the running server; it is created by Start and nil before that.
+	http   *http.Server
+	// mu guards the http field.
+	mu     sync.Mutex
 }

 func (s *HTTPServer) Start(ctx context.Context) error {
-	// TODO: Implement HTTP server with MCP endpoints
-	return nil
+	// Start serves GET /health and POST /rpc on Host:Port (default
+	// localhost:8080) and blocks until ctx is canceled, Stop is called, or the
+	// listener fails. It returns ctx.Err() after a context-driven shutdown,
+	// nil after Stop, and the listener error otherwise.
+	//
+	// The mutex is held only while the server is set up and published. Holding
+	// it while serving would make Stop wait on the lock forever.
+	srv, err := func() (*http.Server, error) {
+		s.mu.Lock()
+		defer s.mu.Unlock()
+
+		if s.config == nil {
+			return nil, fmt.Errorf("server config is required")
+		}
+		if s.config.Handler == nil {
+			return nil, fmt.Errorf("server handler is required")
+		}
+
+		mux := http.NewServeMux()
+		mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
+			w.Header().Set("Content-Type", "application/json")
+			_, _ = io.WriteString(w, `{"ok":true}`)
+		})
+		mux.HandleFunc("/rpc", func(w http.ResponseWriter, r *http.Request) {
+			if r.Method != http.MethodPost {
+				w.WriteHeader(http.StatusMethodNotAllowed)
+				return
+			}
+			defer r.Body.Close()
+			var req rpcRequest
+			// Request bodies are capped at 2 MiB.
+			if err := json.NewDecoder(io.LimitReader(r.Body, 2<<20)).Decode(&req); err != nil {
+				writeRPC(w, rpcResponse{JSONRPC: "2.0", Error: &rpcError{Code: -32700, Message: "parse error"}})
+				return
+			}
+			resp := s.handleRPC(r.Context(), req)
+			writeRPC(w, resp)
+		})
+
+		host := s.config.Host
+		if host == "" {
+			host = "localhost"
+		}
+		port := s.config.Port
+		if port == 0 {
+			port = 8080
+		}
+		s.http = &http.Server{Addr: fmt.Sprintf("%s:%d", host, port), Handler: mux}
+		return s.http, nil
+	}()
+	if err != nil {
+		return err
+	}
+
+	// Buffered so the goroutine can exit even if nobody reads the result.
+	errCh := make(chan error, 1)
+	go func() {
+		errCh <- srv.ListenAndServe()
+	}()
+
+	select {
+	case <-ctx.Done():
+		shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		_ = srv.Shutdown(shutdownCtx)
+		return ctx.Err()
+	case err := <-errCh:
+		if err != nil && err != http.ErrServerClosed {
+			return err
+		}
+		return nil
+	}
 }

 func (s *HTTPServer) Stop(ctx context.Context) error {
-	return nil
+	// Stop shuts the HTTP server down gracefully, bounded by ctx. It is a
+	// no-op when Start has not created a server yet.
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.http == nil {
+		return nil
+	}
+	return s.http.Shutdown(ctx)
+}
+
+// handleRPC dispatches req to the configured MethodHandler through the
+// package-level handleRPC.
+func (s *HTTPServer) handleRPC(ctx context.Context, req rpcRequest) rpcResponse {
+	return handleRPC(ctx, s.config.Handler, req)
 }

 // StdioServer implements Server for stdio transport.
 type StdioServer struct {
 	config *Config
+	// mu guards stop.
+	mu     sync.Mutex
+	// stop is set by Stop and checked by Start between input lines.
+	stop   bool
 }

 func (s *StdioServer) Start(ctx context.Context) error {
-	// TODO: Implement stdio JSON-RPC server
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if s.config == nil {
+		return fmt.Errorf("server config is required")
+	}
+	if s.config.Handler == nil {
+		return fmt.Errorf("server handler is required")
+	}
+
+	scanner := bufio.NewScanner(os.Stdin)
+	out := json.NewEncoder(os.Stdout)
+
+	for scanner.Scan() {
+		if ctx.Err() != nil || s.stop {
+			break
+		}
+		line := strings.TrimSpace(scanner.Text())
+		if line == "" {
+			continue
+		}
+
+		var req rpcRequest
+		if err := json.Unmarshal([]byte(line), &req); err != nil {
+			_ = out.Encode(rpcResponse{JSONRPC: "2.0", Error: &rpcError{Code: -32700, Message: "parse error"}})
+			continue
+		}
+		resp := handleRPC(ctx, s.config.Handler, req)
+		if err := out.Encode(resp); err != nil {
+			return err
+		}
+	}
+
+	if err := scanner.Err(); err != nil {
+		return err
+	}
 	return nil
 }

 func (s *StdioServer) Stop(ctx context.Context) error {
+	// Stop only raises the stop flag; Start checks it between input lines.
+	// ctx is not used and the returned error is always nil.
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.stop = true
 	return nil
 }
+
+// handleRPC runs one JSON-RPC request through handler and builds the
+// response, which always carries version "2.0" and echoes the request id.
+// A request without a method yields error -32600 ("invalid request"); a
+// handler error yields error -32000 with the error's message; otherwise the
+// handler's value is returned as the result.
+func handleRPC(ctx context.Context, handler MethodHandler, req rpcRequest) rpcResponse {
+	resp := rpcResponse{JSONRPC: "2.0", ID: req.ID}
+
+	if req.Method == "" {
+		resp.Error = &rpcError{Code: -32600, Message: "invalid request"}
+		return resp
+	}
+
+	result, err := handler(ctx, req.Method, req.Params)
+	if err != nil {
+		resp.Error = &rpcError{Code: -32000, Message: err.Error()}
+		return resp
+	}
+
+	resp.Result = result
+	return resp
+}
+
+// writeRPC sends payload as a JSON HTTP response. Responses carrying an
+// error are sent with status 400; all others use the default status. An
+// encoding failure is ignored because the response is already under way.
+func writeRPC(w http.ResponseWriter, payload rpcResponse) {
+	header := w.Header()
+	header.Set("Content-Type", "application/json")
+
+	if failed := payload.Error != nil; failed {
+		w.WriteHeader(http.StatusBadRequest)
+	}
+
+	encoder := json.NewEncoder(w)
+	_ = encoder.Encode(payload)
+}
@@ -0,0 +1,149 @@
+package storage
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strings"
+
+	"github.com/yourorg/devour/internal/markdown"
+	"github.com/yourorg/devour/internal/scraper"
+)
+
+// SaveOptions controls how SaveDocuments writes scraped documents to disk.
+type SaveOptions struct {
+	// Format selects the output encoding: "json" or "markdown". It is
+	// trimmed and lower-cased before use; an empty value means "json".
+	Format      string
+	// OutputDir is the directory files are written to. It is required and is
+	// created (with parents) if it does not exist.
+	OutputDir   string
+	// AllowEmpty makes SaveDocuments return an empty result instead of an
+	// error when there is nothing to save.
+	AllowEmpty  bool
+	// PrintWriter, when non-nil, is called with a printf-style format and
+	// arguments once for every file written.
+	PrintWriter func(string, ...any)
+}
+
+// SaveResult reports what SaveDocuments wrote.
+type SaveResult struct {
+	// Count is the number of files written; it equals len(Files).
+	Count int
+	// Files holds the path of every written file, in write order.
+	Files []string
+}
+
+// slugUnsafe matches each run of characters other than lowercase ASCII
+// letters, digits, '.', '_' and '-'; slugify replaces every such run with "-".
+var slugUnsafe = regexp.MustCompile(`[^a-z0-9._-]+`)
+
+// SaveDocuments writes every non-nil document in docs to opts.OutputDir, one
+// file per document, and returns the paths it wrote.
+//
+// The file name is derived from the document title (or "document_<index>"
+// when the title is blank), slugified and made unique within this call. With
+// format "json" (the default) the document is written as indented JSON with a
+// ".json" extension; with "markdown" it is rendered by the markdown
+// formatter's FormatWithTOC and written with a ".md" extension.
+//
+// It returns an error when the format is unsupported, when opts.OutputDir is
+// blank or cannot be created, when a document cannot be encoded or written,
+// or - unless opts.AllowEmpty is set - when no document was saved. Files
+// written before a failure are left on disk.
+func SaveDocuments(docs []*scraper.Document, opts SaveOptions) (*SaveResult, error) {
+	if len(docs) == 0 {
+		if !opts.AllowEmpty {
+			return nil, fmt.Errorf("no documents scraped")
+		}
+		return &SaveResult{}, nil
+	}
+
+	asMarkdown := false
+	switch strings.ToLower(strings.TrimSpace(opts.Format)) {
+	case "", "json":
+		// JSON is the default encoding.
+	case "markdown":
+		asMarkdown = true
+	default:
+		return nil, fmt.Errorf("unsupported format: %s", opts.Format)
+	}
+
+	if strings.TrimSpace(opts.OutputDir) == "" {
+		return nil, fmt.Errorf("output directory is required")
+	}
+	if err := os.MkdirAll(opts.OutputDir, 0o755); err != nil {
+		return nil, err
+	}
+
+	// The extension depends only on the format, so pick it once.
+	ext := ".json"
+	if asMarkdown {
+		ext = ".md"
+	}
+
+	seen := map[string]int{}
+	written := make([]string, 0, len(docs))
+	mdFormatter := markdown.NewFormatter()
+
+	for idx, doc := range docs {
+		if doc == nil {
+			continue
+		}
+		fileName := uniqueName(slugify(defaultTitle(doc.Title, idx)), ext, seen, doc.ID)
+		target := filepath.Join(opts.OutputDir, fileName)
+
+		var payload []byte
+		if asMarkdown {
+			payload = []byte(mdFormatter.FormatWithTOC(&markdown.Document{
+				ID:        doc.ID,
+				Source:    doc.Source,
+				Type:      doc.Type,
+				Title:     doc.Title,
+				Content:   doc.Content,
+				URL:       doc.URL,
+				Metadata:  doc.Metadata,
+				Hash:      doc.Hash,
+				Timestamp: doc.Timestamp,
+			}))
+		} else {
+			encoded, err := json.MarshalIndent(doc, "", "  ")
+			if err != nil {
+				return nil, err
+			}
+			payload = encoded
+		}
+
+		if err := os.WriteFile(target, payload, 0o644); err != nil {
+			return nil, err
+		}
+		written = append(written, target)
+		if report := opts.PrintWriter; report != nil {
+			report("  📄 %s (%s)\n", filepath.Base(target), doc.Type)
+		}
+	}
+
+	// Every entry may have been nil, which counts as an empty scrape too.
+	if len(written) == 0 && !opts.AllowEmpty {
+		return nil, fmt.Errorf("no documents scraped")
+	}
+
+	return &SaveResult{Count: len(written), Files: written}, nil
+}
+
+// defaultTitle returns title with surrounding whitespace removed, or the
+// placeholder "document_<idx>" when nothing remains after trimming.
+func defaultTitle(title string, idx int) string {
+	if trimmed := strings.TrimSpace(title); trimmed != "" {
+		return trimmed
+	}
+	return fmt.Sprintf("document_%d", idx)
+}
+
+func slugify(name string) string {
+	name = strings.ToLower(strings.TrimSpace(name))
+	name = strings.ReplaceAll(name, " ", "-")
+	name = strings.ReplaceAll(name, "/", "-")
+	name = strings.ReplaceAll(name, "\\", "-")
+	name = strings.ReplaceAll(name, ":", "-")
+	name = strings.ReplaceAll(name, "?", "")
+	name = strings.ReplaceAll(name, "&", "and")
+	name = slugUnsafe.ReplaceAllString(name, "-")
+	name = strings.Trim(name, "-.")
+	if name == "" {
+		name = "document"
+	}
+	if len(name) > 80 {
+		name = strings.Trim(name[:80], "-.")
+	}
+	if name == "" {
+		name = "document"
+	}
+	return name
+}
+
+// uniqueName returns a file name built from base and ext that has not been
+// returned before for the same used map, and records it in used so later
+// calls cannot produce it again.
+//
+// The first request for base+ext gets exactly that name. Later requests get
+// "<base>-<tag><ext>", where tag is the first 8 characters of id after
+// everything except ASCII letters, digits, '-' and '_' has been removed; this
+// is used when at least 8 such characters remain and the name is still free.
+// Otherwise the name is "<base>-<n><ext>" with the smallest free counter n,
+// starting from the number of requests seen for base+ext.
+//
+// used must be non-nil; it is keyed by file name and is updated in place.
+func uniqueName(base, ext string, used map[string]int, id string) string {
+	key := base + ext
+	if used[key] == 0 {
+		used[key] = 1
+		return key
+	}
+	used[key]++
+
+	// Keep only characters that are safe inside a file name. The raw ID may
+	// contain path separators or multi-byte runes that must not be sliced.
+	tag := strings.Map(func(r rune) rune {
+		switch {
+		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9', r == '-', r == '_':
+			return r
+		}
+		return -1
+	}, id)
+	if len(tag) >= 8 {
+		candidate := fmt.Sprintf("%s-%s%s", base, tag[:8], ext)
+		if used[candidate] == 0 {
+			used[candidate] = 1
+			return candidate
+		}
+	}
+
+	// Fall back to a counter, skipping any name that is already taken (for
+	// example by a document whose own title slug ends in "-2").
+	for {
+		candidate := fmt.Sprintf("%s-%d%s", base, used[key], ext)
+		if used[candidate] == 0 {
+			used[candidate] = 1
+			return candidate
+		}
+		used[key]++
+	}
+}