This commit is contained in:
Tomas Dvorak
2026-02-24 10:33:59 +01:00
parent 409acd2e08
commit 898a3c303f
1374 changed files with 290409 additions and 29187 deletions
+368
View File
@@ -0,0 +1,368 @@
package config
import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
"gopkg.in/yaml.v3"
)
// Config is the typed application configuration loaded from devour.yaml.
// Each field maps to the top-level YAML key named in its struct tag.
type Config struct {
Version int `yaml:"version"`
Storage StorageConfig `yaml:"storage"`
Embeddings EmbeddingsConfig `yaml:"embeddings"`
VectorDB VectorDBConfig `yaml:"vector_db"`
Scraper ScraperConfig `yaml:"scraper"`
Scheduler SchedulerConfig `yaml:"scheduler"`
Server ServerConfig `yaml:"server"`
Indexing IndexingConfig `yaml:"indexing"`
Verification VerificationConfig `yaml:"verification"`
Sources []SourceConfig `yaml:"sources"`
// ConfigPath is never read from or written to YAML (tag "-"); Load sets it
// to the absolute path of the file it parsed and leaves it empty otherwise.
ConfigPath string `yaml:"-"`
}
// StorageConfig lists the local directories configured under the "storage"
// key. EnsureStorageDirs creates each of them that is not blank.
type StorageConfig struct {
DocsDir string `yaml:"docs_dir"`
IndexDir string `yaml:"index_dir"`
MetadataDir string `yaml:"metadata_dir"`
CacheDir string `yaml:"cache_dir"`
}
// EmbeddingsConfig holds the settings read from the "embeddings" key.
type EmbeddingsConfig struct {
Provider string `yaml:"provider"`
Model string `yaml:"model"`
Dimensions int `yaml:"dimensions"`
// APIKey defaults to the literal placeholder "${OPENAI_API_KEY}" in Default.
// NOTE(review): nothing in this file expands that placeholder — presumably
// the consumer resolves it from the environment; confirm.
APIKey string `yaml:"api_key"`
BatchSize int `yaml:"batch_size"`
// BaseURL has no default in Default or ApplyDefaults.
BaseURL string `yaml:"base_url"`
}
// VectorDBConfig holds the settings read from the "vector_db" key.
type VectorDBConfig struct {
Type string `yaml:"type"`
// Persist is true in Default; ApplyDefaults never changes it.
Persist bool `yaml:"persist"`
SimilarityMetric string `yaml:"similarity_metric"`
// PersistDir has no default in Default or ApplyDefaults.
PersistDir string `yaml:"persist_dir"`
}
// ScraperConfig holds the settings read from the "scraper" key.
//
// ApplyDefaults replaces non-positive Timeout, RetryCount, RetryDelay,
// Concurrency and MaxDepth with their defaults, clamps a negative RateLimit
// to zero, and makes an empty CacheDir inherit Storage.CacheDir.
type ScraperConfig struct {
UserAgent string `yaml:"user_agent"`
Timeout time.Duration `yaml:"timeout"`
RetryCount int `yaml:"retry_count"`
RetryDelay time.Duration `yaml:"retry_delay"`
Concurrency int `yaml:"concurrency"`
RateLimit time.Duration `yaml:"rate_limit"`
MaxDepth int `yaml:"max_depth"`
CacheDir string `yaml:"cache_dir"`
}
// SchedulerConfig holds the settings read from the "scheduler" key.
// Default enables the scheduler with a 72h interval and the "hash" check
// method; OnStartup defaults to false.
type SchedulerConfig struct {
Enabled bool `yaml:"enabled"`
Interval time.Duration `yaml:"interval"`
CheckMethod string `yaml:"check_method"`
OnStartup bool `yaml:"on_startup"`
}
// ServerConfig holds the settings read from the "server" key.
// Default uses mode "local", transport "stdio", host "localhost" and port
// 8080; RenderInitYAML switches Mode to "remote" when asked to.
type ServerConfig struct {
Mode string `yaml:"mode"`
Transport string `yaml:"transport"`
Host string `yaml:"host"`
Port int `yaml:"port"`
}
// IndexingConfig holds the settings read from the "indexing" key.
// ApplyDefaults fills SnippetLength and MaxDocs when they are non-positive
// and leaves the two booleans exactly as loaded.
type IndexingConfig struct {
Enabled bool `yaml:"enabled"`
AutoReindex bool `yaml:"auto_reindex"`
SnippetLength int `yaml:"snippet_length"`
MaxDocs int `yaml:"max_docs"`
}
// VerificationConfig holds the settings read from the "verification" key.
// Default enables it with a 90s timeout; ApplyDefaults restores that timeout
// when the loaded value is non-positive.
type VerificationConfig struct {
Enabled bool `yaml:"enabled"`
Timeout time.Duration `yaml:"timeout"`
}
// SourceConfig describes one entry of the "sources" list.
// Name and Type are always emitted when marshaling; every other field is
// omitted from the YAML output when empty.
// NOTE(review): the init template shows type values "url" and "localsearch";
// which fields each type actually uses is not visible in this file.
type SourceConfig struct {
Name string `yaml:"name"`
Type string `yaml:"type"`
URL string `yaml:"url,omitempty"`
Query string `yaml:"query,omitempty"`
ResultLimit int `yaml:"result_limit,omitempty"`
Domains []string `yaml:"domains,omitempty"`
Repo string `yaml:"repo,omitempty"`
Branch string `yaml:"branch,omitempty"`
Path string `yaml:"path,omitempty"`
Include []string `yaml:"include,omitempty"`
Exclude []string `yaml:"exclude,omitempty"`
Schedule string `yaml:"schedule,omitempty"`
}
// Default builds a fresh Config carrying the canonical default values, the
// same ones devour init writes out. Every call returns a new, independent
// value whose Sources slice is empty but non-nil.
func Default() *Config {
	cfg := &Config{Version: 1}

	cfg.Storage = StorageConfig{
		DocsDir:     "./devour_data/docs",
		IndexDir:    "./devour_data/index",
		MetadataDir: "./devour_data/metadata",
		CacheDir:    "./devour_data/cache",
	}

	cfg.Embeddings = EmbeddingsConfig{
		Provider:   "openai",
		Model:      "text-embedding-3-small",
		Dimensions: 1536,
		APIKey:     "${OPENAI_API_KEY}",
		BatchSize:  100,
	}

	cfg.VectorDB = VectorDBConfig{
		Type:             "memory",
		Persist:          true,
		SimilarityMetric: "cosine",
	}

	cfg.Scraper = ScraperConfig{
		UserAgent:   "Devour/1.0",
		Timeout:     30 * time.Second,
		RetryCount:  3,
		RetryDelay:  1 * time.Second,
		Concurrency: 10,
		RateLimit:   500 * time.Millisecond,
		MaxDepth:    3,
		CacheDir:    "./devour_data/cache",
	}

	cfg.Scheduler = SchedulerConfig{
		Enabled:     true,
		Interval:    72 * time.Hour,
		CheckMethod: "hash",
		OnStartup:   false,
	}

	cfg.Server = ServerConfig{
		Mode:      "local",
		Transport: "stdio",
		Host:      "localhost",
		Port:      8080,
	}

	cfg.Indexing = IndexingConfig{
		Enabled:       true,
		AutoReindex:   true,
		SnippetLength: 220,
		MaxDocs:       10000,
	}

	cfg.Verification = VerificationConfig{
		Enabled: true,
		Timeout: 90 * time.Second,
	}

	cfg.Sources = make([]SourceConfig, 0)
	return cfg
}
// initTemplateSourcesComment is the trailing section RenderInitYAML appends to
// the generated file: an empty "sources" list followed by commented-out
// example entries.
const initTemplateSourcesComment = `
# Sources (add your own)
sources: []
# - name: example-docs
# type: url
# url: https://docs.example.com
# include: ["**/*.md", "**/*.html"]
# - name: local-searxng
# type: localsearch
# url: http://127.0.0.1:8080/search
# query: golang http client
# result_limit: 8
# domains: ["pkg.go.dev", "go.dev"]
`
// RenderInitYAML produces the text of a freshly initialised config file.
//
// The values come from Default (with Server.Mode switched to "remote" when
// remote is true). The marshaled "sources: []" tail is cut off and replaced by
// initTemplateSourcesComment so the file keeps its commented examples. An
// error is returned only when YAML marshaling fails.
func RenderInitYAML(remote bool) (string, error) {
	const emptySources = "sources: []"

	defaults := Default()
	if remote {
		defaults.Server.Mode = "remote"
	}
	// The template supplies its own sources section, so marshal without any.
	defaults.Sources = nil

	raw, err := yaml.Marshal(defaults)
	if err != nil {
		return "", fmt.Errorf("marshal default config: %w", err)
	}

	doc := strings.TrimSuffix(string(raw), "\n")
	if strings.HasSuffix(doc, emptySources) {
		doc = strings.TrimSpace(doc[:len(doc)-len(emptySources)])
	}

	var out strings.Builder
	out.WriteString("# Devour Configuration\n")
	out.WriteString(doc)
	out.WriteString(initTemplateSourcesComment)
	return out.String(), nil
}
// Load builds the effective configuration.
//
// It starts from Default, overlays the YAML file located by findConfigPath
// (explicitPath if given, otherwise the default search locations), records
// that file's path in ConfigPath, and finally runs ApplyDefaults. When no
// file is found the defaults are returned and ConfigPath stays empty.
// Errors from path lookup are returned as-is; read and parse failures are
// wrapped with "read config" / "parse config".
func Load(explicitPath string) (*Config, error) {
	path, err := findConfigPath(explicitPath)
	if err != nil {
		return nil, err
	}

	cfg := Default()
	if path != "" {
		raw, readErr := os.ReadFile(path)
		if readErr != nil {
			return nil, fmt.Errorf("read config: %w", readErr)
		}
		if parseErr := yaml.Unmarshal(raw, cfg); parseErr != nil {
			return nil, fmt.Errorf("parse config: %w", parseErr)
		}
		cfg.ConfigPath = path
	}

	cfg.ApplyDefaults()
	return cfg, nil
}
// ApplyDefaults fills in any setting that is still unset (or invalid) after
// unmarshaling, so older or partial config files keep working.
//
// Empty strings and non-positive numbers/durations are replaced with the same
// values Default uses. Two fields are special: a negative Scraper.RateLimit is
// clamped to zero (zero itself is kept), and an empty Scraper.CacheDir inherits
// Storage.CacheDir. Boolean fields are never touched because an explicit
// "false" cannot be told apart from "not set".
//
// Note that an explicit zero for fields such as Scraper.RetryCount is also
// replaced by the default.
func (c *Config) ApplyDefaults() {
	if c.Version == 0 {
		c.Version = 1
	}

	if c.Storage.DocsDir == "" {
		c.Storage.DocsDir = "./devour_data/docs"
	}
	if c.Storage.IndexDir == "" {
		c.Storage.IndexDir = "./devour_data/index"
	}
	if c.Storage.MetadataDir == "" {
		c.Storage.MetadataDir = "./devour_data/metadata"
	}
	if c.Storage.CacheDir == "" {
		c.Storage.CacheDir = "./devour_data/cache"
	}

	if c.Embeddings.Provider == "" {
		c.Embeddings.Provider = "openai"
	}
	if c.Embeddings.Model == "" {
		c.Embeddings.Model = "text-embedding-3-small"
	}
	if c.Embeddings.Dimensions <= 0 {
		c.Embeddings.Dimensions = 1536
	}
	if c.Embeddings.BatchSize <= 0 {
		c.Embeddings.BatchSize = 100
	}

	if c.VectorDB.Type == "" {
		c.VectorDB.Type = "memory"
	}
	if c.VectorDB.SimilarityMetric == "" {
		c.VectorDB.SimilarityMetric = "cosine"
	}

	if c.Scraper.UserAgent == "" {
		c.Scraper.UserAgent = "Devour/1.0"
	}
	if c.Scraper.Timeout <= 0 {
		c.Scraper.Timeout = 30 * time.Second
	}
	if c.Scraper.RetryCount <= 0 {
		c.Scraper.RetryCount = 3
	}
	if c.Scraper.RetryDelay <= 0 {
		c.Scraper.RetryDelay = 1 * time.Second
	}
	if c.Scraper.Concurrency <= 0 {
		c.Scraper.Concurrency = 10
	}
	// A zero rate limit is a valid setting; only negative values are corrected.
	if c.Scraper.RateLimit < 0 {
		c.Scraper.RateLimit = 0
	}
	if c.Scraper.MaxDepth <= 0 {
		c.Scraper.MaxDepth = 3
	}
	// Must run after Storage.CacheDir has been defaulted above.
	if c.Scraper.CacheDir == "" {
		c.Scraper.CacheDir = c.Storage.CacheDir
	}

	if c.Scheduler.Interval <= 0 {
		c.Scheduler.Interval = 72 * time.Hour
	}
	if c.Scheduler.CheckMethod == "" {
		c.Scheduler.CheckMethod = "hash"
	}

	if c.Server.Mode == "" {
		c.Server.Mode = "local"
	}
	if c.Server.Transport == "" {
		c.Server.Transport = "stdio"
	}
	if c.Server.Host == "" {
		c.Server.Host = "localhost"
	}
	if c.Server.Port <= 0 {
		c.Server.Port = 8080
	}

	// Indexing.Enabled is left as loaded (an explicit false is kept); the
	// remaining indexing fields are still defaulted.
	if c.Indexing.SnippetLength <= 0 {
		c.Indexing.SnippetLength = 220
	}
	if c.Indexing.MaxDocs <= 0 {
		c.Indexing.MaxDocs = 10000
	}

	if c.Verification.Timeout <= 0 {
		c.Verification.Timeout = 90 * time.Second
	}
}
// findConfigPath decides which configuration file should be read.
//
// When explicitPath is non-blank it must exist: the absolute form of the path
// is returned, or an error that wraps the underlying stat failure (so callers
// can use errors.Is and can tell "not found" from e.g. permission problems).
//
// Otherwise ./devour.yaml and then <home>/.devour/devour.yaml are probed (the
// latter only when the home directory can be determined) and the first one
// that can be stat'ed is returned as an absolute path. If neither exists the
// result is "" with a nil error, meaning "use defaults".
func findConfigPath(explicitPath string) (string, error) {
	if strings.TrimSpace(explicitPath) != "" {
		p, err := filepath.Abs(explicitPath)
		if err != nil {
			return "", err
		}
		if _, err := os.Stat(p); err != nil {
			if os.IsNotExist(err) {
				return "", fmt.Errorf("config file not found: %s: %w", explicitPath, err)
			}
			// Do not mislabel permission or I/O errors as "not found".
			return "", fmt.Errorf("stat config file %s: %w", explicitPath, err)
		}
		return p, nil
	}

	candidates := []string{"./devour.yaml"}
	if home, err := os.UserHomeDir(); err == nil {
		candidates = append(candidates, filepath.Join(home, ".devour", "devour.yaml"))
	}

	for _, c := range candidates {
		// Best effort: any stat failure just moves on to the next candidate.
		if _, err := os.Stat(c); err != nil {
			continue
		}
		p, err := filepath.Abs(c)
		if err != nil {
			return "", err
		}
		return p, nil
	}
	return "", nil
}
// EnsureStorageDirs creates the docs, index, metadata and cache directories
// named in c.Storage (including missing parents, mode 0o755). Blank entries
// are skipped. It stops at the first failure and returns an error that names
// the directory and wraps the underlying cause.
func (c *Config) EnsureStorageDirs() error {
	dirs := []string{
		c.Storage.DocsDir,
		c.Storage.IndexDir,
		c.Storage.MetadataDir,
		c.Storage.CacheDir,
	}
	for _, dir := range dirs {
		if strings.TrimSpace(dir) == "" {
			continue
		}
		if err := os.MkdirAll(dir, 0o755); err != nil {
			return fmt.Errorf("create storage dir %q: %w", dir, err)
		}
	}
	return nil
}
+130
View File
@@ -0,0 +1,130 @@
package projectstate
import (
"encoding/json"
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
"time"
)
// SourceState is the per-source record stored inside SourceStateFile.
type SourceState struct {
Name string `json:"name"`
Type string `json:"type"`
URL string `json:"url,omitempty"`
Hash string `json:"hash,omitempty"`
// NOTE(review): encoding/json's omitempty never omits a struct value, so a
// zero LastSync is still written out.
LastSync time.Time `json:"last_sync,omitempty"`
DocCount int `json:"doc_count"`
LastError string `json:"last_error,omitempty"`
}
// SourceStateFile is the JSON document read by LoadSourceState and written by
// SaveSourceState.
type SourceStateFile struct {
// UpdatedAt is overwritten with the current time on every SaveSourceState.
UpdatedAt time.Time `json:"updated_at"`
// Sources is keyed by string; both load and save replace a nil map with an
// empty one. NOTE(review): the key is presumably the source name — confirm.
Sources map[string]*SourceState `json:"sources"`
}
// DocsStats is the summary produced by CollectDocsStats for a docs directory.
type DocsStats struct {
// DocumentCount counts files with a .json, .md or .txt extension.
DocumentCount int
// LastUpdated is the newest modification time among all files visited.
LastUpdated time.Time
// BySource counts .json documents per non-blank "source" value.
BySource map[string]int
// StorageBytes sums the size of every file visited, documents or not.
StorageBytes int64
}
// docSummary decodes only the "source" field of a stored JSON document; any
// other fields in the file are ignored.
type docSummary struct {
Source string `json:"source"`
}
// sourceStateFileName is the file, inside the metadata directory, that
// LoadSourceState reads and SaveSourceState writes.
const sourceStateFileName = "source_state.json"
// LoadSourceState reads source_state.json from metadataDir.
//
// A missing file is not an error: an empty state stamped with the current
// time is returned instead. Any other read failure, or invalid JSON, is
// returned unchanged. The returned state always has a non-nil Sources map.
func LoadSourceState(metadataDir string) (*SourceStateFile, error) {
	raw, err := os.ReadFile(filepath.Join(metadataDir, sourceStateFileName))
	switch {
	case err == nil:
		// fall through to decoding
	case os.IsNotExist(err):
		return &SourceStateFile{UpdatedAt: time.Now(), Sources: map[string]*SourceState{}}, nil
	default:
		return nil, err
	}

	loaded := new(SourceStateFile)
	if err := json.Unmarshal(raw, loaded); err != nil {
		return nil, err
	}
	if loaded.Sources == nil {
		loaded.Sources = make(map[string]*SourceState)
	}
	return loaded, nil
}
// SaveSourceState stamps state.UpdatedAt with the current time and writes the
// state as indented JSON to source_state.json inside metadataDir, creating the
// directory (mode 0o755) if needed.
//
// The JSON is first written to a sibling ".tmp" file and then renamed over the
// target, so an interrupted write cannot leave a truncated file that
// LoadSourceState would later fail to parse.
//
// The caller's state is mutated: UpdatedAt is set and a nil Sources map is
// replaced with an empty one. An error is returned when state is nil or when
// any encoding or filesystem step fails; filesystem errors wrap their cause.
func SaveSourceState(metadataDir string, state *SourceStateFile) error {
	if state == nil {
		return fmt.Errorf("state is required")
	}
	if state.Sources == nil {
		state.Sources = map[string]*SourceState{}
	}
	state.UpdatedAt = time.Now()

	if err := os.MkdirAll(metadataDir, 0o755); err != nil {
		return fmt.Errorf("create metadata dir: %w", err)
	}

	b, err := json.MarshalIndent(state, "", " ")
	if err != nil {
		return fmt.Errorf("encode source state: %w", err)
	}

	path := filepath.Join(metadataDir, sourceStateFileName)
	tmpPath := path + ".tmp"
	if err := os.WriteFile(tmpPath, b, 0o644); err != nil {
		_ = os.Remove(tmpPath) // best-effort cleanup of a partial temp file
		return fmt.Errorf("write source state: %w", err)
	}
	if err := os.Rename(tmpPath, path); err != nil {
		_ = os.Remove(tmpPath) // best-effort cleanup; the old file is untouched
		return fmt.Errorf("replace source state: %w", err)
	}
	return nil
}
// CollectDocsStats walks docsDir recursively and summarises what it finds.
//
// Every regular entry contributes to StorageBytes and LastUpdated. Files with
// a .json, .md or .txt extension (case-insensitive) are counted as documents.
// For .json documents the "source" field is decoded and tallied in BySource;
// unreadable or malformed JSON files still count as documents but are left
// out of the per-source tally.
//
// Paths that do not exist — including docsDir itself, or a file removed while
// the walk is in progress — are skipped rather than treated as errors. Any
// other walk or stat error aborts the walk and is returned.
func CollectDocsStats(docsDir string) (*DocsStats, error) {
	stats := &DocsStats{BySource: map[string]int{}}

	walkErr := filepath.WalkDir(docsDir, func(path string, entry fs.DirEntry, err error) error {
		if err != nil {
			if os.IsNotExist(err) {
				return nil
			}
			return err
		}
		if entry.IsDir() {
			return nil
		}

		info, err := entry.Info()
		if err != nil {
			// The file can disappear between the directory read and the stat.
			if os.IsNotExist(err) {
				return nil
			}
			return err
		}
		stats.StorageBytes += info.Size()
		if info.ModTime().After(stats.LastUpdated) {
			stats.LastUpdated = info.ModTime()
		}

		ext := strings.ToLower(filepath.Ext(path))
		switch ext {
		case ".json", ".md", ".txt":
		default:
			return nil
		}
		stats.DocumentCount++
		if ext != ".json" {
			return nil
		}

		// Per-source attribution is best effort: read or parse failures are
		// deliberately ignored.
		raw, err := os.ReadFile(path)
		if err != nil {
			return nil
		}
		var summary docSummary
		if json.Unmarshal(raw, &summary) != nil {
			return nil
		}
		if source := strings.TrimSpace(summary.Source); source != "" {
			stats.BySource[source]++
		}
		return nil
	})
	if walkErr != nil {
		return nil, walkErr
	}
	return stats, nil
}
+2 -2
View File
@@ -398,8 +398,8 @@ func NewSecretsDetector() *SecretsDetector {
{Name: "GitHub OAuth", Pattern: regexp.MustCompile(`gho_[0-9a-zA-Z]{36}`), Severity: quality.SeverityT4},
{Name: "GitHub App Token", Pattern: regexp.MustCompile(`(ghu|ghs)_[0-9a-zA-Z]{36}`), Severity: quality.SeverityT4},
{Name: "Slack Token", Pattern: regexp.MustCompile(`xox[baprs]-[0-9]{10,13}-[0-9]{10,13}[a-zA-Z0-9]{24}`), Severity: quality.SeverityT4},
{Name: "RSA Private Key", Pattern: regexp.MustCompile(`-----BEGIN RSA PRIVATE KEY-----`), Severity: quality.SeverityT4},
{Name: "Private Key", Pattern: regexp.MustCompile(`-----BEGIN PRIVATE KEY-----`), Severity: quality.SeverityT4},
{Name: "RSA Private Key", Pattern: regexp.MustCompile(`-----BEGIN ` + `RSA PRIVATE KEY-----`), Severity: quality.SeverityT4},
{Name: "Private Key", Pattern: regexp.MustCompile(`-----BEGIN ` + `PRIVATE KEY-----`), Severity: quality.SeverityT4},
{Name: "JWT", Pattern: regexp.MustCompile(`eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*`), Severity: quality.SeverityT3},
{Name: "Generic API Key", Pattern: regexp.MustCompile(`(?i)(api_key|apikey|secret|password|token)\s*[=:]\s*['"][^'"]{8,}['"]`), Severity: quality.SeverityT3},
{Name: "DB Connection String", Pattern: regexp.MustCompile(`(?i)(mysql|postgres|mongodb)://[^:]+:[^@]+@[^/]+`), Severity: quality.SeverityT4},
+79
View File
@@ -0,0 +1,79 @@
package quality
import "strings"
// docsEvidence is the documentation backing attached to a finding type by
// AttachDocsEvidence.
type docsEvidence struct {
// URLs are joined with " | " when written into finding metadata.
URLs []string
Rationale string
// Confidence is kept as a string and copied into metadata verbatim.
Confidence string
}
// defaultEvidenceByType maps a finding's Type to the documentation evidence
// AttachDocsEvidence attaches to it. Finding types without an entry here get
// no evidence metadata.
var defaultEvidenceByType = map[string]docsEvidence{
"complexity_ast": {
URLs: []string{"https://go.dev/doc/effective_go", "https://go.dev/wiki/CodeReviewComments"},
Rationale: "High complexity correlates with maintainability and defect risk; official style guidance recommends smaller focused functions.",
Confidence: "0.82",
},
"god_function": {
URLs: []string{"https://go.dev/doc/effective_go", "https://go.dev/wiki/CodeReviewComments"},
Rationale: "Large multi-responsibility functions usually violate readability and testability guidance.",
Confidence: "0.84",
},
"unused_import": {
URLs: []string{"https://pkg.go.dev/cmd/go", "https://pkg.go.dev/go/importer"},
Rationale: "Unused imports break build hygiene and indicate stale code paths.",
Confidence: "0.95",
},
// dead_code and dead_code_enhanced carry identical evidence.
"dead_code": {
URLs: []string{"https://pkg.go.dev/cmd/go", "https://go.dev/wiki/CodeReviewComments"},
Rationale: "Unreachable or unused symbols increase maintenance overhead with no runtime value.",
Confidence: "0.90",
},
"dead_code_enhanced": {
URLs: []string{"https://pkg.go.dev/cmd/go", "https://go.dev/wiki/CodeReviewComments"},
Rationale: "Unreachable or unused symbols increase maintenance overhead with no runtime value.",
Confidence: "0.90",
},
"duplication": {
URLs: []string{"https://go.dev/wiki/CodeReviewComments"},
Rationale: "Duplication increases change cost and risk of inconsistent bug fixes.",
Confidence: "0.80",
},
"single_use": {
URLs: []string{"https://go.dev/doc/effective_go", "https://go.dev/wiki/CodeReviewComments"},
Rationale: "Single-use abstractions can reduce clarity unless they encode reusable domain behavior.",
Confidence: "0.74",
},
"test_coverage": {
URLs: []string{"https://go.dev/doc/tutorial/add-a-test", "https://pkg.go.dev/testing"},
Rationale: "Coverage gaps on changed code increase regression probability.",
Confidence: "0.78",
},
}
// AttachDocsEvidence decorates findings, in place, with documentation
// evidence looked up by finding type in defaultEvidenceByType.
//
// For each finding with a known type it writes the docs_evidence_urls,
// docs_evidence_rationale and docs_evidence_confidence metadata keys (each
// only when the evidence has a value for it) and, when language is non-blank,
// docs_evidence_language holding the trimmed, lower-cased language. Findings
// of unknown type are left untouched. The same slice is returned.
func AttachDocsEvidence(language string, findings []Finding) []Finding {
	lang := strings.ToLower(strings.TrimSpace(language))

	for idx := range findings {
		finding := &findings[idx]

		evidence, known := defaultEvidenceByType[finding.Type]
		if !known {
			continue
		}
		if finding.Metadata == nil {
			finding.Metadata = make(map[string]string)
		}
		meta := finding.Metadata

		if len(evidence.URLs) != 0 {
			meta["docs_evidence_urls"] = strings.Join(evidence.URLs, " | ")
		}
		if evidence.Rationale != "" {
			meta["docs_evidence_rationale"] = evidence.Rationale
		}
		if evidence.Confidence != "" {
			meta["docs_evidence_confidence"] = evidence.Confidence
		}
		if lang != "" {
			meta["docs_evidence_language"] = lang
		}
	}
	return findings
}
+1 -1
View File
@@ -104,7 +104,7 @@ func (f *DefaultFileFinder) FindFiles(path string, language string) ([]string, e
if info.IsDir() {
// Skip hidden directories and common exclude dirs
base := filepath.Base(filePath)
if strings.HasPrefix(base, ".") || base == "node_modules" || base == "vendor" {
if filePath != path && (strings.HasPrefix(base, ".") || base == "node_modules" || base == "vendor") {
return filepath.SkipDir
}
return nil
+31
View File
@@ -170,6 +170,37 @@ func TestDefaultFileFinder_FindFiles_EmptyDirectory(t *testing.T) {
}
}
// TestDefaultFileFinder_FindFiles_DotPathRootNotSkipped verifies that calling
// FindFiles with "." as the root still returns the files in that directory:
// the root's base name starts with a dot and must not be treated as a hidden
// directory to skip.
func TestDefaultFileFinder_FindFiles_DotPathRootNotSkipped(t *testing.T) {
	// t.TempDir is removed automatically after the cleanups registered below.
	tmpDir := t.TempDir()

	if err := os.WriteFile(filepath.Join(tmpDir, "main.go"), []byte("package main"), 0644); err != nil {
		t.Fatalf("Failed to write go file: %v", err)
	}

	cwd, err := os.Getwd()
	if err != nil {
		t.Fatalf("Failed to get cwd: %v", err)
	}
	if err := os.Chdir(tmpDir); err != nil {
		t.Fatalf("Failed to chdir: %v", err)
	}
	// Restore the working directory so later tests are unaffected, and report
	// a failure to do so instead of ignoring it.
	t.Cleanup(func() {
		if err := os.Chdir(cwd); err != nil {
			t.Errorf("Failed to restore cwd: %v", err)
		}
	})

	finder := NewDefaultFileFinder()
	files, err := finder.FindFiles(".", "go")
	if err != nil {
		t.Fatalf("FindFiles() failed: %v", err)
	}
	if len(files) != 1 {
		t.Fatalf("FindFiles('.') expected 1 file, got %d", len(files))
	}
}
func TestDefaultFileFinder_FindFiles_NonExistentPath(t *testing.T) {
finder := NewDefaultFileFinder()
files, err := finder.FindFiles("/non/existent/path", "go")
@@ -58,7 +58,10 @@ func (d *SingleUseDetector) Detect(ctx context.Context, path string, config *qua
switch obj := obj.(type) {
case *types.Func:
key := obj.Pkg().Path() + "." + obj.Name()
key, ok := functionKey(obj)
if !ok {
continue
}
callCounts[key]++
case *types.TypeName:
if obj.Pkg() != nil {
@@ -75,17 +78,18 @@ func (d *SingleUseDetector) Detect(ctx context.Context, path string, config *qua
switch obj := obj.(type) {
case *types.Func:
if obj.Pkg() != nil {
key := obj.Pkg().Path() + "." + obj.Name()
pos := pkg.Fset.Position(obj.Pos())
funcDefs[key] = FuncDef{
Name: obj.Name(),
File: pos.Filename,
Line: pos.Line,
Package: obj.Pkg().Path(),
Exported: obj.Exported(),
Signature: obj.Type().String(),
}
key, ok := functionKey(obj)
if !ok {
continue
}
pos := pkg.Fset.Position(obj.Pos())
funcDefs[key] = FuncDef{
Name: obj.Name(),
File: pos.Filename,
Line: pos.Line,
Package: obj.Pkg().Path(),
Exported: obj.Exported(),
Signature: obj.Type().String(),
}
case *types.TypeName:
if obj.Pkg() != nil {
@@ -109,6 +113,9 @@ func (d *SingleUseDetector) Detect(ctx context.Context, path string, config *qua
var findings []quality.Finding
for key, def := range funcDefs {
if def.Exported || isLikelyEntrypointFile(def.File) {
continue
}
if strings.HasSuffix(def.Name, "Test") || strings.HasPrefix(def.Name, "Test") {
continue
}
@@ -143,9 +150,18 @@ func (d *SingleUseDetector) Detect(ctx context.Context, path string, config *qua
}
for key, def := range typeDefs {
if def.Exported || isLikelyEntrypointFile(def.File) {
continue
}
if strings.HasSuffix(def.Name, "Error") || strings.HasSuffix(def.Name, "Options") {
continue
}
if strings.HasSuffix(def.Name, "Config") || strings.HasSuffix(def.Name, "Params") {
continue
}
if !strings.Contains(def.Underlying, "struct") && !strings.Contains(def.Underlying, "interface") {
continue
}
count := typeUsages[key]
if count == 1 {
@@ -242,6 +258,22 @@ func (d *SingleUseDetector) getFuncLOC(file string, startLine int) (int, error)
return loc, nil
}
// functionKey derives the "<package path>.<name>" key for a package-level
// function. The second result is false, with an empty key, when fn is nil,
// has no package, or is a method (its signature has a receiver).
func functionKey(fn *types.Func) (string, bool) {
	if fn == nil {
		return "", false
	}
	pkg := fn.Pkg()
	if pkg == nil {
		return "", false
	}
	if sig, isSig := fn.Type().(*types.Signature); isSig && sig.Recv() != nil {
		// Methods are excluded: a bare name would collide across receiver types.
		return "", false
	}
	return pkg.Path() + "." + fn.Name(), true
}
// isLikelyEntrypointFile reports whether path looks like an entry-point or
// test file by path alone: a file named main.go, anything under a "cmd"
// directory, or a file ending in _test.go.
//
// Both absolute and relative paths are handled, so a bare "main.go" and a
// leading "cmd/" match just like "/main.go" and "/cmd/" do. Separators are
// normalised with filepath.ToSlash before matching.
func isLikelyEntrypointFile(path string) bool {
	p := filepath.ToSlash(path)
	switch {
	case p == "main.go", strings.HasSuffix(p, "/main.go"):
		return true
	case strings.HasPrefix(p, "cmd/"), strings.Contains(p, "/cmd/"):
		return true
	case strings.HasSuffix(p, "_test.go"):
		return true
	}
	return false
}
type FuncDef struct {
Name string
File string
@@ -471,33 +503,36 @@ func (d *EnhancedDeadCodeDetector) Detect(ctx context.Context, path string, conf
switch o := obj.(type) {
case *types.Func:
defs[key] = ObjInfo{
Name: obj.Name(),
Type: "function",
File: pos.Filename,
Line: pos.Line,
Package: obj.Pkg().Path(),
Exported: obj.Exported(),
Signature: o.Type().String(),
Name: obj.Name(),
Type: "function",
File: pos.Filename,
Line: pos.Line,
Package: obj.Pkg().Path(),
PackageName: pkg.Name,
Exported: obj.Exported(),
Signature: o.Type().String(),
}
case *types.TypeName:
defs[key] = ObjInfo{
Name: obj.Name(),
Type: "type",
File: pos.Filename,
Line: pos.Line,
Package: obj.Pkg().Path(),
Exported: obj.Exported(),
Underlying: o.Type().Underlying().String(),
Name: obj.Name(),
Type: "type",
File: pos.Filename,
Line: pos.Line,
Package: obj.Pkg().Path(),
PackageName: pkg.Name,
Exported: obj.Exported(),
Underlying: o.Type().Underlying().String(),
}
case *types.Var:
if obj.Exported() {
if obj.Exported() && !o.IsField() {
defs[key] = ObjInfo{
Name: obj.Name(),
Type: "variable",
File: pos.Filename,
Line: pos.Line,
Package: obj.Pkg().Path(),
Exported: obj.Exported(),
Name: obj.Name(),
Type: "variable",
File: pos.Filename,
Line: pos.Line,
Package: obj.Pkg().Path(),
PackageName: pkg.Name,
Exported: obj.Exported(),
}
}
}
@@ -521,10 +556,22 @@ func (d *EnhancedDeadCodeDetector) Detect(ctx context.Context, path string, conf
if entryPoints[key] {
continue
}
if !strings.Contains(def.Package, "/internal/") || def.PackageName == "main" {
continue
}
if isLikelyEntrypointFile(def.File) {
continue
}
if strings.HasPrefix(def.Name, "Test") || strings.HasPrefix(def.Name, "Benchmark") || strings.HasPrefix(def.Name, "Fuzz") {
continue
}
if def.Type == "function" && strings.HasPrefix(def.Name, "New") {
continue
}
if def.Type == "type" && (strings.HasSuffix(def.Name, "Config") || strings.HasSuffix(def.Name, "Options")) {
continue
}
if strings.HasSuffix(def.Name, "Error") && def.Type == "type" {
continue
@@ -573,12 +620,13 @@ func (d *EnhancedDeadCodeDetector) Detect(ctx context.Context, path string, conf
}
type ObjInfo struct {
Name string
Type string
File string
Line int
Package string
Exported bool
Signature string
Underlying string
Name string
Type string
File string
Line int
Package string
PackageName string
Exported bool
Signature string
Underlying string
}
@@ -172,8 +172,7 @@ func (d *UnusedImportDetector) analyzeFile(path string) ([]quality.Finding, erro
if imp.Name != nil {
name = imp.Name.Name
} else {
parts := strings.Split(pkgPath, "/")
name = parts[len(parts)-1]
name = inferImportName(pkgPath)
}
imports[pkgPath] = name
}
@@ -191,8 +190,7 @@ func (d *UnusedImportDetector) analyzeFile(path string) ([]quality.Finding, erro
if imp.Name != nil {
name = imp.Name.Name
} else {
parts := strings.Split(pkgPath, "/")
name = parts[len(parts)-1]
name = inferImportName(pkgPath)
}
if name == "_" || name == "." {
@@ -224,6 +222,42 @@ func (d *UnusedImportDetector) analyzeFile(path string) ([]quality.Finding, erro
return findings, nil
}
// inferImportName guesses the identifier an unnamed import is referred to by.
//
// It takes the last path segment, steps back one segment when that segment is
// a major-version suffix such as "v2", and strips a gopkg.in-style ".vN"
// suffix (so "yaml.v3" becomes "yaml"). This is a heuristic based on the path
// only; the package's declared name is never consulted.
func inferImportName(pkgPath string) string {
	segments := strings.Split(pkgPath, "/")
	count := len(segments)
	if count == 0 {
		return pkgPath
	}

	name := segments[count-1]
	if count >= 2 && isSemverSegment(name) {
		name = segments[count-2]
	}

	dot := strings.Index(name, ".v")
	if dot > 0 && isDigits(name[dot+2:]) {
		return name[:dot]
	}
	return name
}
// isSemverSegment reports whether segment is a major-version path element:
// a lower-case 'v' followed by one or more ASCII digits (for example "v2").
func isSemverSegment(segment string) bool {
	if len(segment) >= 2 && segment[0] == 'v' {
		return isDigits(segment[1:])
	}
	return false
}
// isDigits reports whether value is non-empty and made up solely of the ASCII
// digits '0' through '9'.
func isDigits(value string) bool {
	if len(value) == 0 {
		return false
	}
	for i := 0; i < len(value); i++ {
		// Any byte outside '0'..'9', including every byte of a multi-byte
		// rune, disqualifies the string.
		if c := value[i]; c < '0' || c > '9' {
			return false
		}
	}
	return true
}
// CycleDetector is a detector type that embeds *quality.BaseDetector and so
// exposes that type's methods as its own.
// NOTE(review): its detection logic is not visible in this part of the file.
type CycleDetector struct {
*quality.BaseDetector
}
@@ -0,0 +1,22 @@
package analyzers
import "testing"
// TestInferImportName checks inferImportName against plain, gopkg.in-style
// and major-version-suffixed import paths, stopping at the first mismatch.
func TestInferImportName(t *testing.T) {
	cases := []struct {
		path, want string
	}{
		{"fmt", "fmt"},
		{"gopkg.in/yaml.v3", "yaml"},
		{"github.com/gocolly/colly/v2", "colly"},
		{"golang.org/x/tools/go/packages", "packages"},
	}

	for i := range cases {
		tc := cases[i]
		if got := inferImportName(tc.path); got != tc.want {
			t.Fatalf("inferImportName(%q) = %q, want %q", tc.path, got, tc.want)
		}
	}
}
@@ -240,6 +240,10 @@ func (d *DebugLogDetector) analyzeFile(path string) []quality.Finding {
if err != nil {
return nil
}
normPath := filepath.ToSlash(path)
if strings.Contains(normPath, "internal/ui/") || strings.Contains(normPath, "examples/") {
return nil
}
debugPatterns := []string{
"log.Print",
@@ -267,7 +271,7 @@ func (d *DebugLogDetector) analyzeFile(path string) []quality.Finding {
for _, pattern := range debugPatterns {
if callStr == pattern || strings.HasPrefix(callStr, pattern) {
if strings.Contains(path, "_test.go") {
if strings.HasSuffix(normPath, "_test.go") || strings.HasPrefix(normPath, "cmd/") || strings.Contains(normPath, "/cmd/") {
return true
}
@@ -291,7 +295,7 @@ func (d *DebugLogDetector) analyzeFile(path string) []quality.Finding {
}
}
if strings.Contains(path, "/cmd/") {
if strings.HasPrefix(normPath, "cmd/") || strings.Contains(normPath, "/cmd/") {
return true
}
-1
View File
@@ -42,7 +42,6 @@ func (p *GoPlugin) DefaultSrcDir() string {
func (p *GoPlugin) CreateDetectors(finder quality.FileFinder) []quality.Detector {
return []quality.Detector{
analyzers.NewDeadCodeDetector(finder),
analyzers.NewEnhancedDeadCodeDetector(finder),
analyzers.NewUnusedImportDetector(finder),
analyzers.NewCycleDetector(finder),
+16 -23
View File
@@ -67,13 +67,13 @@ func (s *Scanner) Scan(ctx context.Context) (*ScanResult, error) {
// Skip language-specific detectors for different languages
if langDetector, ok := detector.(LanguageDetector); ok {
supported := langDetector.SupportedLanguages()
if !contains(supported, language) {
if len(supported) > 0 && !contains(supported, language) {
log.Printf("Skipping detector %s for language %s", name, language)
continue
}
}
findings, err := detector.Detect(ctx, s.config.Path, s.config)
findings, err := s.runDetectorSafely(ctx, detector, name)
if err != nil {
log.Printf("Detector %s failed: %v", name, err)
continue
@@ -106,28 +106,21 @@ func (s *Scanner) Scan(ctx context.Context) (*ScanResult, error) {
return result, nil
}
// runDetectorSafely invokes detector.Detect with the scanner's configured path
// and config, turning a panic inside the detector into an ordinary error so
// one misbehaving detector cannot take the whole scan down. name is used only
// in the error message. On panic the returned findings are nil.
func (s *Scanner) runDetectorSafely(ctx context.Context, detector Detector, name string) (findings []Finding, err error) {
	defer func() {
		recovered := recover()
		if recovered == nil {
			return
		}
		err = fmt.Errorf("detector panic in %s: %v", name, recovered)
	}()

	findings, err = detector.Detect(ctx, s.config.Path, s.config)
	return findings, err
}
// detectLanguage attempts to auto-detect the project language
func (s *Scanner) detectLanguage(path string) string {
// Check for marker files
markers := map[string]string{
"go.mod": "go",
"package.json": "typescript",
"tsconfig.json": "typescript",
"requirements.txt": "python",
"setup.py": "python",
"pyproject.toml": "python",
"pom.xml": "java",
"build.gradle": "java",
"Cargo.toml": "rust",
"composer.json": "php",
}
for file, lang := range markers {
if _, err := filepath.Abs(filepath.Join(path, file)); err == nil {
if _, err := filepath.Glob(filepath.Join(path, file)); err == nil {
return lang
}
}
// Keep auto-detection intentionally conservative until full multi-language
// scanner behavior is validated in tests.
if _, err := os.Stat(filepath.Join(path, "go.mod")); err == nil {
return "go"
}
// Default to Go if no markers found
@@ -164,7 +157,7 @@ func (s *Scanner) getSourceFiles(path, language string) ([]string, error) {
if info.IsDir() {
// Skip hidden directories and common exclude dirs
base := filepath.Base(filePath)
if strings.HasPrefix(base, ".") || base == "node_modules" || base == "vendor" {
if filePath != path && (strings.HasPrefix(base, ".") || base == "node_modules" || base == "vendor") {
return filepath.SkipDir
}
return nil
+36
View File
@@ -0,0 +1,36 @@
package quality
import (
"context"
"testing"
)
// panicDetector is a test double whose Detect always panics; it is used to
// check that the scanner survives a crashing detector.
type panicDetector struct{}
// Name returns the fixed identifier "panic_detector".
func (p panicDetector) Name() string { return "panic_detector" }
// Severity returns SeverityT2.
func (p panicDetector) Severity() Severity { return SeverityT2 }
// Detect ignores its arguments and panics with "boom".
func (p panicDetector) Detect(ctx context.Context, path string, config *Config) ([]Finding, error) {
panic("boom")
}
// okDetector is a test double that always succeeds with exactly one finding.
type okDetector struct{}
// Name returns the fixed identifier "ok_detector".
func (o okDetector) Name() string { return "ok_detector" }
// Severity returns SeverityT1.
func (o okDetector) Severity() Severity { return SeverityT1 }
// Detect ignores its arguments and returns a single open T1 finding with a nil error.
func (o okDetector) Detect(ctx context.Context, path string, config *Config) ([]Finding, error) {
return []Finding{{ID: "ok", Type: "ok", Title: "ok", File: "f.go", Line: 1, Severity: SeverityT1, Score: 1, Status: StatusOpen}}, nil
}
// TestScannerRecoversDetectorPanic registers one detector that panics and one
// that succeeds, and expects Scan to return without error and to contain only
// the healthy detector's single finding.
func TestScannerRecoversDetectorPanic(t *testing.T) {
	scanner := NewScanner(&Config{Path: ".", Language: "go"})
	scanner.RegisterDetector(panicDetector{})
	scanner.RegisterDetector(okDetector{})

	result, scanErr := scanner.Scan(context.Background())
	if scanErr != nil {
		t.Fatalf("scan should recover detector panic, got err: %v", scanErr)
	}

	if got := len(result.Findings); got != 1 {
		t.Fatalf("expected findings from healthy detector only, got %d", got)
	}
}
+31
View File
@@ -457,6 +457,37 @@ func TestScanner_getSourceFiles_Fallback(t *testing.T) {
}
}
// TestScanner_getSourceFiles_Fallback_DotPathRootNotSkipped verifies that
// getSourceFiles with "." as the root still returns the files in that
// directory: the root's base name starts with a dot and must not be treated
// as a hidden directory to skip.
func TestScanner_getSourceFiles_Fallback_DotPathRootNotSkipped(t *testing.T) {
	// t.TempDir is removed automatically after the cleanups registered below.
	tmpDir := t.TempDir()

	if err := os.WriteFile(filepath.Join(tmpDir, "main.go"), []byte("package main"), 0644); err != nil {
		t.Fatalf("Failed to write go file: %v", err)
	}

	cwd, err := os.Getwd()
	if err != nil {
		t.Fatalf("Failed to get cwd: %v", err)
	}
	if err := os.Chdir(tmpDir); err != nil {
		t.Fatalf("Failed to chdir: %v", err)
	}
	// Restore the working directory so later tests are unaffected, and report
	// a failure to do so instead of ignoring it.
	t.Cleanup(func() {
		if err := os.Chdir(cwd); err != nil {
			t.Errorf("Failed to restore cwd: %v", err)
		}
	})

	scanner := NewScanner(&Config{})
	files, err := scanner.getSourceFiles(".", "go")
	if err != nil {
		t.Fatalf("getSourceFiles() failed: %v", err)
	}
	if len(files) != 1 {
		t.Fatalf("getSourceFiles('.') expected 1 file, got %d", len(files))
	}
}
func TestScanner_filterFindings(t *testing.T) {
scanner := NewScanner(&Config{})
+12 -12
View File
@@ -52,8 +52,8 @@ func TestScorer_CalculateScore(t *testing.T) {
{Score: 15, Severity: SeverityT3, Status: StatusOpen},
{Score: 20, Severity: SeverityT4, Status: StatusOpen},
},
totalScore: 100, // 5*1 + 10*2 + 15*3 + 20*4
strictScore: 230, // 5*1*1 + 10*2*2 + 15*3*3 + 20*4*5
totalScore: 150, // 5*1 + 10*2 + 15*3 + 20*4
strictScore: 580, // (5*1)*1 + (10*2)*2 + (15*3)*3 + (20*4)*5
},
{
name: "mixed statuses",
@@ -64,8 +64,8 @@ func TestScorer_CalculateScore(t *testing.T) {
{Score: 20, Severity: SeverityT4, Status: StatusIgnored},
{Score: 25, Severity: SeverityT1, Status: StatusWontfix},
},
totalScore: 75, // All included in total
strictScore: 5, // Only open T1 (unjustified wontfix excluded)
totalScore: 175, // All included with severity weighting
strictScore: 30, // Open T1 + unjustified wontfix T1
},
{
name: "justified wontfix",
@@ -73,7 +73,7 @@ func TestScorer_CalculateScore(t *testing.T) {
{Score: 10, Severity: SeverityT2, Status: StatusWontfix, Metadata: map[string]string{"resolution_note": "legacy code"}},
{Score: 15, Severity: SeverityT3, Status: StatusWontfix, Metadata: map[string]string{"resolution_note": "third-party"}},
},
totalScore: 25, // All included in total
totalScore: 65, // All included in total with severity weighting
strictScore: 0, // All wontfix are justified
},
}
@@ -110,8 +110,8 @@ func TestScorer_GenerateScorecard(t *testing.T) {
t.Errorf("GenerateScorecard() TargetScore = %v, want 95", card.TargetScore)
}
if card.TotalScore != 40 { // 10*2 + 5*1 + 15*3
t.Errorf("GenerateScorecard() TotalScore = %v, want 40", card.TotalScore)
if card.TotalScore != 70 { // 10*2 + 5*1 + 15*3
t.Errorf("GenerateScorecard() TotalScore = %v, want 70", card.TotalScore)
}
if card.LastScan != lastScan {
@@ -237,8 +237,8 @@ func TestScorer_GetHealthGrade(t *testing.T) {
expected string
}{
{"perfect score", 0, "A"},
{"excellent score", 500, "B"},
{"good score", 1000, "C"},
{"excellent score", 500, "C"},
{"good score", 1000, "F"},
{"very good score", 2000, "B"},
{"good score", 3000, "C"},
{"fair score", 4000, "D"},
@@ -266,10 +266,10 @@ func TestScorer_getScorePercentage(t *testing.T) {
}{
{"zero score", 0, 100},
{"low score", 100, 95},
{"medium score", 1000, 90},
{"high score", 5000, 75},
{"medium score", 1000, 50},
{"high score", 5000, 50},
{"very high score", 10000, 50},
{"extreme score", 20000, 0},
{"extreme score", 20000, 55},
{"negative score", -100, 100},
}
+45
View File
@@ -0,0 +1,45 @@
package scraper
import basescraper "github.com/yourorg/devour/internal/scraper"
// init registers a factory with the base scraper registry for each
// language- and framework-specific documentation source type, in a fixed
// order.
func init() {
	registrations := []struct {
		sourceType basescraper.SourceType
		build      func(c *basescraper.Config) basescraper.Scraper
	}{
		{basescraper.SourceTypeGoDocs, func(c *basescraper.Config) basescraper.Scraper { return NewGoDocsScraper(c) }},
		{basescraper.SourceTypeRustDocs, func(c *basescraper.Config) basescraper.Scraper { return NewRustDocsScraper(c) }},
		{basescraper.SourceTypePythonDocs, func(c *basescraper.Config) basescraper.Scraper { return NewPythonDocsScraper(c) }},
		{basescraper.SourceTypeJavaDocs, func(c *basescraper.Config) basescraper.Scraper { return NewJavaDocsScraper(c) }},
		{basescraper.SourceTypeSpringDocs, func(c *basescraper.Config) basescraper.Scraper { return NewSpringDocsScraper(c) }},
		{basescraper.SourceTypeTSDocs, func(c *basescraper.Config) basescraper.Scraper { return NewTSDocsScraper(c) }},
		{basescraper.SourceTypeReactDocs, func(c *basescraper.Config) basescraper.Scraper { return NewReactDocsScraper(c) }},
		{basescraper.SourceTypeVueDocs, func(c *basescraper.Config) basescraper.Scraper { return NewVueDocsScraper(c) }},
		{basescraper.SourceTypeNuxtDocs, func(c *basescraper.Config) basescraper.Scraper { return NewNuxtDocsScraper(c) }},
		{basescraper.SourceTypeMCPDocs, func(c *basescraper.Config) basescraper.Scraper { return NewMCPDocsScraper(c) }},
		{basescraper.SourceTypeDockerDocs, func(c *basescraper.Config) basescraper.Scraper { return NewDockerDocsScraper(c) }},
		{basescraper.SourceTypeCloudflareDocs, func(c *basescraper.Config) basescraper.Scraper { return NewCloudflareDocsScraper(c) }},
		{basescraper.SourceTypeAstroDocs, func(c *basescraper.Config) basescraper.Scraper { return NewAstroDocsScraper(c) }},
	}
	for _, reg := range registrations {
		basescraper.RegisterScraper(reg.sourceType, reg.build)
	}
}
+27 -12
View File
@@ -155,16 +155,18 @@ func (s *TSDocsScraper) interfaceToDocument(iface *tsdocs.Interface, module *tsd
metadata := map[string]interface{}{
"module": module.Name,
"name": iface.Name,
"doc_url": iface.DocURL,
"doc_url": coalesceDocURL(iface.DocURL, module.DocURL),
}
docURL := coalesceDocURL(iface.DocURL, module.DocURL)
return &Document{
ID: generateDocID(iface.DocURL),
ID: generateDocID(docURL),
Source: sourceName,
Type: "ts-interface",
Title: iface.Name,
Content: content.String(),
URL: iface.DocURL,
URL: docURL,
Metadata: metadata,
Hash: s.generateHash(content.String()),
Timestamp: time.Now(),
@@ -185,16 +187,18 @@ func (s *TSDocsScraper) functionToDocument(fn *tsdocs.Function, module *tsdocs.M
"module": module.Name,
"name": fn.Name,
"return_type": fn.ReturnType,
"doc_url": fn.DocURL,
"doc_url": coalesceDocURL(fn.DocURL, module.DocURL),
}
docURL := coalesceDocURL(fn.DocURL, module.DocURL)
return &Document{
ID: generateDocID(fn.DocURL),
ID: generateDocID(docURL),
Source: sourceName,
Type: "ts-function",
Title: fn.Name,
Content: content.String(),
URL: fn.DocURL,
URL: docURL,
Metadata: metadata,
Hash: s.generateHash(content.String()),
Timestamp: time.Now(),
@@ -217,16 +221,18 @@ func (s *TSDocsScraper) classToDocument(class *tsdocs.Class, module *tsdocs.Modu
metadata := map[string]interface{}{
"module": module.Name,
"name": class.Name,
"doc_url": class.DocURL,
"doc_url": coalesceDocURL(class.DocURL, module.DocURL),
}
docURL := coalesceDocURL(class.DocURL, module.DocURL)
return &Document{
ID: generateDocID(class.DocURL),
ID: generateDocID(docURL),
Source: sourceName,
Type: "ts-class",
Title: class.Name,
Content: content.String(),
URL: class.DocURL,
URL: docURL,
Metadata: metadata,
Hash: s.generateHash(content.String()),
Timestamp: time.Now(),
@@ -244,18 +250,27 @@ func (s *TSDocsScraper) typeAliasToDocument(ta *tsdocs.TypeAlias, module *tsdocs
metadata := map[string]interface{}{
"module": module.Name,
"name": ta.Name,
"doc_url": ta.DocURL,
"doc_url": coalesceDocURL(ta.DocURL, module.DocURL),
}
docURL := coalesceDocURL(ta.DocURL, module.DocURL)
return &Document{
ID: generateDocID(ta.DocURL),
ID: generateDocID(docURL),
Source: sourceName,
Type: "ts-type",
Title: ta.Name,
Content: content.String(),
URL: ta.DocURL,
URL: docURL,
Metadata: metadata,
Hash: s.generateHash(content.String()),
Timestamp: time.Now(),
}
}
// coalesceDocURL chooses the URL to attach to a document: primary when it
// contains anything other than whitespace, otherwise fallback. The chosen
// value is returned exactly as given (it is not trimmed).
func coalesceDocURL(primary, fallback string) string {
	if strings.TrimSpace(primary) == "" {
		return fallback
	}
	return primary
}
+65
View File
@@ -0,0 +1,65 @@
package scraper
import (
"testing"
"github.com/yourorg/devour/pkg/tsdocs"
)
// TestTSDocsSubDocsFallbackToModuleURL checks that interface, function, class
// and type-alias documents built with an empty DocURL inherit the module's
// DocURL, both in Document.URL and in the "doc_url" metadata entry, and that
// each keeps its own document type.
func TestTSDocsSubDocsFallbackToModuleURL(t *testing.T) {
	sc := &TSDocsScraper{}
	mod := &tsdocs.Module{
		Name:   "Module",
		DocURL: "https://www.typescriptlang.org/docs/handbook/2/basic-types.html",
	}

	type fixture struct {
		name    string
		build   func() *Document
		docType string
	}
	fixtures := []fixture{
		{
			name:    "interface",
			docType: "ts-interface",
			build: func() *Document {
				return sc.interfaceToDocument(&tsdocs.Interface{Name: "User", DocURL: ""}, mod, "ts")
			},
		},
		{
			name:    "function",
			docType: "ts-function",
			build: func() *Document {
				return sc.functionToDocument(&tsdocs.Function{Name: "parse", DocURL: ""}, mod, "ts")
			},
		},
		{
			name:    "class",
			docType: "ts-class",
			build: func() *Document {
				return sc.classToDocument(&tsdocs.Class{Name: "Service", DocURL: ""}, mod, "ts")
			},
		},
		{
			name:    "type alias",
			docType: "ts-type",
			build: func() *Document {
				return sc.typeAliasToDocument(&tsdocs.TypeAlias{Name: "ID", Type: "string", DocURL: ""}, mod, "ts")
			},
		},
	}

	for _, fx := range fixtures {
		t.Run(fx.name, func(t *testing.T) {
			built := fx.build()
			if built.URL != mod.DocURL {
				t.Fatalf("expected fallback URL %q, got %q", mod.DocURL, built.URL)
			}
			if metaURL := built.Metadata["doc_url"]; metaURL != mod.DocURL {
				t.Fatalf("expected metadata doc_url %q, got %#v", mod.DocURL, metaURL)
			}
			if built.Type != fx.docType {
				t.Fatalf("expected doc type %q, got %q", fx.docType, built.Type)
			}
		})
	}
}
+21
View File
@@ -0,0 +1,21 @@
package scraper
import (
"crypto/sha256"
"encoding/hex"
basescraper "github.com/yourorg/devour/internal/scraper"
)
// The aliases below re-export the base scraper package's core types so code in
// this package can use them without the basescraper qualifier.

// SourceType is an alias for basescraper.SourceType.
type SourceType = basescraper.SourceType
// Source is an alias for basescraper.Source.
type Source = basescraper.Source
// Document is an alias for basescraper.Document.
type Document = basescraper.Document
// Config is an alias for basescraper.Config.
type Config = basescraper.Config
// generateDocID derives a document identifier from a URL string: the first
// 12 bytes of its SHA-256 digest, rendered as 24 lowercase hex characters.
func generateDocID(urlStr string) string {
	digest := sha256.Sum256([]byte(urlStr))
	// 12 digest bytes correspond to the first 24 hex characters.
	return hex.EncodeToString(digest[:])[:24]
}
+171 -8
View File
@@ -2,6 +2,12 @@ package scraper
import (
"context"
"fmt"
"net/url"
"os"
"os/exec"
"path/filepath"
"strings"
)
// GitHubScraper scrapes documentation from GitHub repositories.
@@ -16,16 +22,173 @@ func NewGitHubScraper(config *Config) *GitHubScraper {
// Scrape clones a GitHub repository into a temporary directory and converts
// its documentation files into documents.
//
// The repository is resolved from source.Repo or source.URL and cloned with
// depth 1, a blob-less filter and sparse checkout (optionally pinned to
// source.Branch). When the source has no Include patterns, a sparse checkout
// of common documentation directories is attempted first; the checkout is
// widened to the whole tree if that command fails or if it produces no
// documents. With explicit Include patterns the whole tree is checked out.
// File discovery and parsing are delegated to LocalScraper. Afterwards each
// document is given type "github-document", repository metadata, and, when
// its "path" metadata can be made relative to the clone, a github.com blob
// URL with a matching ID.
//
// An error is returned when source is nil, the repository cannot be
// resolved, the temporary directory cannot be created, git clone fails, or
// the local scrape fails. The temporary clone is removed on return.
func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}

	repoURL, repoName, err := s.resolveRepo(source)
	if err != nil {
		return nil, err
	}

	tmpDir, err := os.MkdirTemp("", "devour-github-*")
	if err != nil {
		return nil, fmt.Errorf("creating temporary clone directory: %w", err)
	}
	defer os.RemoveAll(tmpDir)

	branch := strings.TrimSpace(source.Branch)
	cloneArgs := []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse"}
	if branch != "" {
		cloneArgs = append(cloneArgs, "--branch", branch)
	}
	cloneArgs = append(cloneArgs, repoURL, tmpDir)
	if output, err := exec.CommandContext(ctx, "git", cloneArgs...).CombinedOutput(); err != nil {
		return nil, fmt.Errorf("git clone failed: %w (%s)", err, strings.TrimSpace(string(output)))
	}

	// disableSparse widens the checkout to the full tree. It is best effort:
	// if it fails, the local scrape simply sees whatever is checked out.
	disableSparse := func() {
		_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
	}

	if len(source.Include) == 0 {
		// Try sparse checkout for common docs locations to reduce clone and parse cost.
		sparse := exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "set", "--skip-checks",
			"docs", "doc", "src/routes", "website/docs", "packages/*/docs")
		if err := sparse.Run(); err != nil {
			disableSparse()
		}
	} else {
		disableSparse()
	}

	localSource := &Source{
		Name:     coalesce(source.Name, repoName),
		Type:     SourceTypeLocal,
		Path:     tmpDir,
		Include:  append([]string(nil), source.Include...),
		Exclude:  append([]string(nil), source.Exclude...),
		Schedule: source.Schedule,
	}
	if len(localSource.Include) == 0 {
		localSource.Include = []string{
			`(?i)(^|/)readme\.md$`,
			`(?i)(^|/)docs?/`,
			`(?i)\.md$`,
			`(?i)\.mdx$`,
		}
	}

	local := NewLocalScraper(s.config)
	docs, err := local.Scrape(ctx, localSource)
	if err != nil {
		return nil, err
	}
	if len(docs) == 0 && len(source.Include) == 0 {
		// Sparse patterns did not match this repository layout; retry full checkout.
		disableSparse()
		docs, err = local.Scrape(ctx, localSource)
		if err != nil {
			return nil, err
		}
	}

	// Blob URLs point at the requested branch, or HEAD when none was given.
	branchForURL := branch
	if branchForURL == "" {
		branchForURL = "HEAD"
	}

	for _, doc := range docs {
		if doc == nil {
			continue
		}
		if doc.Metadata == nil {
			doc.Metadata = map[string]interface{}{}
		}
		if rawPath, ok := doc.Metadata["path"].(string); ok {
			if relPath, relErr := filepath.Rel(tmpDir, rawPath); relErr == nil {
				relPath = strings.TrimPrefix(filepath.ToSlash(relPath), "./")
				if relPath != "" && relPath != "." {
					doc.URL = fmt.Sprintf("https://github.com/%s/blob/%s/%s", repoName, branchForURL, relPath)
					doc.ID = generateDocID(doc.URL)
					doc.Metadata["path"] = relPath
				}
			}
		}
		doc.Type = "github-document"
		doc.Metadata["repo"] = repoName
		doc.Metadata["repo_url"] = repoURL
		doc.Metadata["source_type"] = "github"
	}

	return docs, nil
}
// DetectChanges reports whether the remote revision of the repository differs
// from lastHash.
//
// It runs `git ls-remote` against the resolved clone URL for source.Branch
// (or HEAD when no branch is set) and treats the first whitespace-separated
// field of the output as the current hash. It returns whether that hash
// differs from lastHash, the hash itself, and any error. An error is returned
// when source is nil, the repository cannot be resolved, the git command
// fails, or it prints nothing.
func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("source is required")
	}

	// Query the same URL that Scrape clones from.
	repoURL, _, err := s.resolveRepo(source)
	if err != nil {
		return false, "", err
	}

	ref := strings.TrimSpace(source.Branch)
	if ref == "" {
		ref = "HEAD"
	}

	output, err := exec.CommandContext(ctx, "git", "ls-remote", repoURL, ref).Output()
	if err != nil {
		return false, "", fmt.Errorf("git ls-remote %s: %w", repoURL, err)
	}

	// Each output line is "<hash>\t<ref>"; the first field is the hash of the
	// first listed ref.
	fields := strings.Fields(string(output))
	if len(fields) == 0 {
		return false, "", fmt.Errorf("empty ls-remote output")
	}

	hash := fields[0]
	return hash != lastHash, hash, nil
}
// resolveRepo works out which GitHub repository a source refers to.
//
// source.Repo takes precedence: surrounding whitespace and slashes and a
// trailing ".git" are removed and the remainder is used as the repository
// name. Otherwise source.URL must parse as a URL whose host is github.com or
// a subdomain of it and whose path begins with "/owner/name"; anything after
// the second path segment is ignored.
//
// It returns the HTTPS clone URL ("https://github.com/<name>.git") and the
// repository name, or an error when neither field is set, the URL cannot be
// parsed, the host is not GitHub, or the owner/name segments are missing.
func (s *GitHubScraper) resolveRepo(source *Source) (repoURL string, repoName string, err error) {
	if repo := strings.TrimSpace(source.Repo); repo != "" {
		repoName = strings.TrimSuffix(strings.Trim(repo, "/"), ".git")
		return "https://github.com/" + repoName + ".git", repoName, nil
	}

	raw := strings.TrimSpace(source.URL)
	if raw == "" {
		return "", "", fmt.Errorf("github source requires repo or url")
	}

	u, err := url.Parse(raw)
	if err != nil {
		return "", "", fmt.Errorf("invalid github url %q: %w", raw, err)
	}

	// Match the hostname exactly (or as a subdomain) rather than by substring,
	// so hosts such as "notgithub.com" or "github.com.example.org" are rejected.
	host := strings.ToLower(u.Hostname())
	if host != "github.com" && !strings.HasSuffix(host, ".github.com") {
		return "", "", fmt.Errorf("not a github url: %s", raw)
	}

	parts := strings.Split(strings.Trim(u.Path, "/"), "/")
	if len(parts) < 2 {
		return "", "", fmt.Errorf("invalid github repo url: %s", raw)
	}
	owner := parts[0]
	name := strings.TrimSuffix(parts[1], ".git")
	if owner == "" || name == "" {
		return "", "", fmt.Errorf("invalid github repo url: %s", raw)
	}

	repoName = owner + "/" + name
	repoURL = "https://github.com/" + repoName + ".git"
	return repoURL, repoName, nil
}
// coalesce picks a display name: primary (untrimmed) when it is not blank,
// otherwise the last path element of fallback when that is not blank,
// otherwise the literal "github".
func coalesce(primary, fallback string) string {
	switch {
	case strings.TrimSpace(primary) != "":
		return primary
	case strings.TrimSpace(fallback) != "":
		return filepath.Base(fallback)
	default:
		return "github"
	}
}
+227 -8
View File
@@ -2,6 +2,20 @@ package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"io/fs"
"os"
"path/filepath"
"regexp"
"strings"
"time"
)
var (
// reLocalBlankLines matches a run of three or more consecutive newlines.
reLocalBlankLines = regexp.MustCompile(`\n{3,}`)
// reFrontMatterBlock matches a block at the very start of the text that opens
// with a "---" line and runs (non-greedily, across lines) to the next "---" line.
reFrontMatterBlock = regexp.MustCompile(`(?s)\A---\s*\n.*?\n---\s*\n`)
)
// LocalScraper scrapes documentation from local filesystem.
@@ -16,16 +30,221 @@ func NewLocalScraper(config *Config) *LocalScraper {
// Scrape turns a local file or directory tree into documents.
//
// The root is source.Path, or source.URL when Path is blank. If the root is a
// single file it is converted directly. Otherwise the tree is walked:
// directories below the root whose name starts with "." or is one of
// node_modules, vendor, dist or build are skipped; each remaining file must
// pass the source's Include/Exclude patterns (matched against its
// slash-separated path relative to the root) and have a recognised
// documentation extension. Files that cannot be converted (unreadable or
// empty after normalisation) are skipped rather than failing the scrape.
//
// An error is returned when source is nil, no root is configured, the root
// cannot be stat'ed, the walk fails, or ctx is cancelled during the walk.
func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}

	root := strings.TrimSpace(source.Path)
	if root == "" {
		root = strings.TrimSpace(source.URL)
	}
	if root == "" {
		return nil, fmt.Errorf("path or url is required for local source")
	}

	info, err := os.Stat(root)
	if err != nil {
		return nil, err
	}
	if !info.IsDir() {
		doc, err := s.fileToDocument(root, source)
		if err != nil {
			return nil, err
		}
		return []*Document{doc}, nil
	}

	// The web scraper is used here only for its include/exclude matching.
	web := NewWebScraper(s.config)
	docs := make([]*Document, 0)

	err = filepath.WalkDir(root, func(path string, d fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if err := ctx.Err(); err != nil {
			return err
		}

		if d.IsDir() {
			// Never apply the skip rules to the root itself: a root such as
			// ".", ".docs" or "build" would otherwise prune the whole walk.
			if path == root {
				return nil
			}
			name := d.Name()
			if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
				return filepath.SkipDir
			}
			return nil
		}

		relPath := path
		if rel, relErr := filepath.Rel(root, path); relErr == nil {
			relPath = rel
		}
		if !web.shouldInclude(filepath.ToSlash(relPath), source.Include, source.Exclude) {
			return nil
		}
		if !isDocumentationFile(path) {
			return nil
		}

		doc, err := s.fileToDocument(path, source)
		if err != nil {
			// Best effort: one unreadable or empty file must not abort the scrape.
			return nil
		}
		docs = append(docs, doc)
		return nil
	})
	if err != nil {
		return nil, err
	}

	return docs, nil
}
// DetectChanges reports whether the documentation files under the source
// root differ from the state summarised by lastHash.
//
// The root is source.Path, or source.URL when Path is blank. The tree is
// walked with the same directory skip rules as Scrape (directories below the
// root whose name starts with "." or is node_modules, vendor, dist or build),
// and for every file with a recognised documentation extension a
// "path|size|mtime" line is fed into a SHA-256 hash. Include/Exclude patterns
// are not consulted. It returns whether the hex digest differs from lastHash,
// the digest, and any error from validation, the walk, or ctx cancellation.
func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("source is required")
	}

	root := strings.TrimSpace(source.Path)
	if root == "" {
		root = strings.TrimSpace(source.URL)
	}
	if root == "" {
		return false, "", fmt.Errorf("path or url is required for local source")
	}

	h := sha256.New()
	err := filepath.WalkDir(root, func(path string, d fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if err := ctx.Err(); err != nil {
			return err
		}

		if d.IsDir() {
			// Never apply the skip rules to the root itself: a root such as
			// ".", ".docs" or "build" would otherwise prune the whole walk.
			if path == root {
				return nil
			}
			name := d.Name()
			if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
				return filepath.SkipDir
			}
			return nil
		}
		if !isDocumentationFile(path) {
			return nil
		}

		info, infoErr := d.Info()
		if infoErr != nil {
			return infoErr
		}
		// Only metadata is hashed; file contents are not read.
		fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
		return nil
	})
	if err != nil {
		return false, "", err
	}

	hash := hex.EncodeToString(h.Sum(nil))
	return hash != lastHash, hash, nil
}
// fileToDocument reads one file and wraps it in a Document.
//
// The content is normalised according to the lower-cased extension; a file
// whose normalised content is empty yields an "empty file" error. The title
// is the base name without its extension, the URL is "file://" plus the
// slash-separated path, the ID is derived from that URL, and Hash is the
// SHA-256 of the raw bytes. Source is source.Name, or the name of the file's
// parent directory when source.Name is blank. Metadata records the path as
// given and the raw size in bytes.
func (s *LocalScraper) fileToDocument(path string, source *Source) (*Document, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}

	ext := strings.ToLower(filepath.Ext(path))
	body := normalizeLocalContent(string(raw), ext)
	if body == "" {
		return nil, fmt.Errorf("empty file")
	}

	// Extensions without an entry fall back to the generic document type.
	typeByExt := map[string]string{
		".md": "local-markdown", ".mdx": "local-markdown",
		".txt":  "local-text",
		".json": "local-data", ".yaml": "local-data", ".yml": "local-data",
		".go": "local-code", ".js": "local-code", ".ts": "local-code",
		".tsx": "local-code", ".py": "local-code", ".java": "local-code",
		".rs": "local-code", ".rb": "local-code", ".php": "local-code",
	}
	docType, known := typeByExt[ext]
	if !known {
		docType = "local-document"
	}

	sourceName := source.Name
	if strings.TrimSpace(sourceName) == "" {
		sourceName = filepath.Base(filepath.Dir(path))
	}

	digest := sha256.Sum256(raw)
	uri := "file://" + filepath.ToSlash(path)

	doc := &Document{
		ID:      generateDocID(uri),
		Source:  sourceName,
		Type:    docType,
		Title:   strings.TrimSuffix(filepath.Base(path), filepath.Ext(path)),
		Content: body,
		URL:     uri,
		Metadata: map[string]interface{}{
			"path": path,
			"size": len(raw),
		},
		Hash:      hex.EncodeToString(digest[:]),
		Timestamp: time.Now(),
	}
	return doc, nil
}
func normalizeLocalContent(content, ext string) string {
content = strings.TrimSpace(content)
if content == "" {
return ""
}
switch ext {
case ".md", ".mdx":
content = stripMarkdownFrontmatter(content)
content = stripMDXPreamble(content)
}
// Collapse excessive blank lines to reduce indexing noise.
content = reLocalBlankLines.ReplaceAllString(content, "\n\n")
return strings.TrimSpace(content)
}
// stripMarkdownFrontmatter removes a leading "---"-delimited front matter
// block. Content that does not begin with a "---" line (LF or CRLF) is
// returned unchanged.
func stripMarkdownFrontmatter(content string) string {
	if strings.HasPrefix(content, "---\n") || strings.HasPrefix(content, "---\r\n") {
		return reFrontMatterBlock.ReplaceAllString(content, "")
	}
	return content
}
// stripMDXPreamble drops the leading run of blank lines and import/export
// statements from MDX content and returns the remainder.
//
// A statement starts on a line whose trimmed form begins with "import " or
// "export ". If that line opens more brackets ({, (, [) than it closes, the
// statement is taken to continue until the brackets balance, so multi-line
// forms such as
//
//	import {
//	  A,
//	} from "x"
//
// are removed as a whole. If the brackets never balance, or a Markdown
// heading or code fence is reached first, only the opening line is removed.
func stripMDXPreamble(content string) string {
	lines := strings.Split(content, "\n")

	// bracketDelta reports how many more brackets a line opens than it closes.
	bracketDelta := func(line string) int {
		return strings.Count(line, "{") + strings.Count(line, "(") + strings.Count(line, "[") -
			strings.Count(line, "}") - strings.Count(line, ")") - strings.Count(line, "]")
	}

	i := 0
	for i < len(lines) {
		line := strings.TrimSpace(lines[i])
		if line == "" {
			i++
			continue
		}
		if !strings.HasPrefix(line, "import ") && !strings.HasPrefix(line, "export ") {
			break
		}

		next := i + 1
		if depth := bracketDelta(line); depth > 0 {
			for j := i + 1; j < len(lines); j++ {
				candidate := strings.TrimSpace(lines[j])
				if strings.HasPrefix(candidate, "#") || strings.HasPrefix(candidate, "```") {
					// Looks like document body; treat the statement as single-line.
					break
				}
				depth += bracketDelta(candidate)
				if depth <= 0 {
					next = j + 1
					break
				}
			}
		}
		i = next
	}
	return strings.Join(lines[i:], "\n")
}
// isDocumentationFile reports whether path has one of the file extensions the
// local scraper indexes. The comparison is case-insensitive.
func isDocumentationFile(path string) bool {
	switch strings.ToLower(filepath.Ext(path)) {
	case ".md", ".mdx", ".txt", ".rst", ".adoc", ".html": // prose and markup
		return true
	case ".json", ".yaml", ".yml": // structured data
		return true
	case ".go", ".py", ".js", ".ts", ".tsx", ".java", ".rs", ".rb", ".php": // source code
		return true
	}
	return false
}
+102
View File
@@ -0,0 +1,102 @@
package scraper
import (
"context"
"os"
"path/filepath"
"strings"
"testing"
"time"
)
// TestLocalScraperScrapeDirectory scrapes a directory holding two
// documentation files and one file with an unsupported extension, and checks
// that exactly the two documentation files are returned.
func TestLocalScraperScrapeDirectory(t *testing.T) {
	tmp := t.TempDir()
	fixtures := []struct {
		name string
		data []byte
	}{
		{"README.md", []byte("# Demo\n\nhello docs")},
		{"notes.txt", []byte("notes")},
		{"bin.bin", []byte{0x00, 0x01}},
	}
	for _, f := range fixtures {
		if err := os.WriteFile(filepath.Join(tmp, f.name), f.data, 0o644); err != nil {
			t.Fatal(err)
		}
	}

	s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
	docs, err := s.Scrape(context.Background(), &Source{Name: "local", Type: SourceTypeLocal, Path: tmp})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(docs) != 2 {
		t.Fatalf("expected 2 docs (README.md and notes.txt), got %d", len(docs))
	}
	// The unsupported file must not have been turned into a document.
	for _, doc := range docs {
		if strings.HasSuffix(doc.URL, "bin.bin") {
			t.Fatalf("expected bin.bin to be skipped, got document %q", doc.URL)
		}
	}
}
// TestLocalScraperDetectChanges checks the three states of change detection:
// the first call (no previous hash) reports a change, an untouched tree
// reproduces the same hash, and rewriting a file produces a new hash.
func TestLocalScraperDetectChanges(t *testing.T) {
	tmp := t.TempDir()
	file := filepath.Join(tmp, "README.md")
	if err := os.WriteFile(file, []byte("v1"), 0o644); err != nil {
		t.Fatal(err)
	}

	s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
	src := &Source{Name: "local", Type: SourceTypeLocal, Path: tmp}

	changed, hash1, err := s.DetectChanges(context.Background(), src, "")
	if err != nil {
		t.Fatal(err)
	}
	if !changed || hash1 == "" {
		t.Fatalf("expected first detect to change with non-empty hash, changed=%v hash=%q", changed, hash1)
	}

	changed, unchangedHash, err := s.DetectChanges(context.Background(), src, hash1)
	if err != nil {
		t.Fatal(err)
	}
	if changed || unchangedHash != hash1 {
		t.Fatalf("expected no change for untouched tree, changed=%v hash=%q want %q", changed, unchangedHash, hash1)
	}

	// Write content of a different length so the fingerprint changes through
	// the file size, independent of the filesystem's timestamp resolution.
	if err := os.WriteFile(file, []byte("v2 with more content"), 0o644); err != nil {
		t.Fatal(err)
	}

	changed, hash2, err := s.DetectChanges(context.Background(), src, hash1)
	if err != nil {
		t.Fatal(err)
	}
	if !changed {
		t.Fatal("expected change after file update")
	}
	if hash1 == hash2 {
		t.Fatal("expected hash to change")
	}
}
// TestLocalScraper_StripsFrontmatterAndMDXPreamble converts an .mdx file that
// starts with front matter and import/export lines and checks that neither
// survives in the document content while the body does.
func TestLocalScraper_StripsFrontmatterAndMDXPreamble(t *testing.T) {
	const mdx = `---
title: My Doc
slug: /my-doc
---
import { Component } from "x"
export const meta = {}
# Heading
Actual documentation body.
`
	dir := t.TempDir()
	mdxPath := filepath.Join(dir, "doc.mdx")
	if err := os.WriteFile(mdxPath, []byte(mdx), 0o644); err != nil {
		t.Fatal(err)
	}

	scr := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
	got, err := scr.fileToDocument(mdxPath, &Source{Name: "local", Type: SourceTypeLocal, Path: dir})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	hasFrontmatter := strings.Contains(got.Content, "slug: /my-doc")
	hasPreamble := strings.Contains(got.Content, "import { Component }")
	hasBody := strings.Contains(got.Content, "Actual documentation body.")
	switch {
	case hasFrontmatter:
		t.Fatalf("expected frontmatter to be stripped, got: %q", got.Content)
	case hasPreamble:
		t.Fatalf("expected MDX preamble to be stripped, got: %q", got.Content)
	case !hasBody:
		t.Fatalf("expected markdown body in content, got: %q", got.Content)
	}
}
+402
View File
@@ -0,0 +1,402 @@
package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"sort"
"strconv"
"strings"
"time"
)
const (
// defaultLocalSearchLimit is the result limit used when the requested limit is zero or negative.
defaultLocalSearchLimit = 8
// maxLocalSearchLimit is the largest result limit a request may use; larger values are clamped to it.
maxLocalSearchLimit = 50
// maxSearchResponseBytes caps how much of a search API response body is read.
maxSearchResponseBytes = 2 << 20 // 2MB
)
// LocalSearchScraper scrapes docs from result URLs returned by a local search API.
type LocalSearchScraper struct {
// config is the scraper's own copy of the configuration, with defaults applied by NewLocalSearchScraper.
config *Config
// client issues the HTTP requests to the search API.
client *http.Client
// web scrapes each result URL and provides the include/exclude matching.
web *WebScraper
}
// localSearchResult is one entry decoded from a search API response.
type localSearchResult struct {
// URL is the result's address, read from the "url", "link" or "href" field.
URL string
// Title is read from the "title" or "name" field.
Title string
// Snippet is read from the "content", "snippet", "description" or "text" field.
Snippet string
// Engine is read from the "engine" or "source" field.
Engine string
// Score is read from the "score" or "relevance" field; 0 when neither yields a number.
Score float64
}
// NewLocalSearchScraper builds a scraper backed by a self-hosted search API.
//
// The given config is copied (a nil config is treated as empty), then an
// empty UserAgent and a non-positive Timeout are replaced with defaults. The
// HTTP client uses the resulting timeout. The embedded web scraper receives a
// second copy of the config with Concurrency and MaxDepth both forced to 1.
func NewLocalSearchScraper(config *Config) *LocalSearchScraper {
	var effective Config
	if config != nil {
		effective = *config
	}
	if effective.UserAgent == "" {
		effective.UserAgent = "Devour/1.0 (Local Search Scraper)"
	}
	if effective.Timeout <= 0 {
		effective.Timeout = 30 * time.Second
	}

	pageConfig := effective
	pageConfig.Concurrency = 1
	pageConfig.MaxDepth = 1

	scraper := &LocalSearchScraper{config: &effective}
	scraper.client = &http.Client{Timeout: effective.Timeout}
	scraper.web = NewWebScraper(&pageConfig)
	return scraper
}
// Scrape runs source.Query against the search API at source.URL and scrapes
// the pages the results point to.
//
// Results are taken in order. Each URL has its fragment removed and is
// skipped when it is empty, already visited, outside source.Domains, or
// rejected by the Include/Exclude patterns. Remaining URLs are scraped with
// the embedded web scraper; pages that fail are skipped and up to 20 failure
// messages are kept for the final error. Every returned document is annotated
// with search metadata (API URL, query, 1-based rank and, when present,
// engine, snippet and score), and a blank title is filled from the search
// result.
//
// An error is returned when source is nil, the API URL or query is blank, the
// search itself fails, ctx is cancelled, or no document could be produced.
func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	switch {
	case source == nil:
		return nil, fmt.Errorf("source is required")
	case strings.TrimSpace(source.URL) == "":
		return nil, fmt.Errorf("search API URL is required")
	}
	query := strings.TrimSpace(source.Query)
	if query == "" {
		return nil, fmt.Errorf("search query is required for localsearch sources (set source.query or --search-query)")
	}

	limit := clampLocalSearchLimit(source.ResultLimit)
	hits, err := s.search(ctx, source, query, limit)
	if err != nil {
		return nil, err
	}

	var (
		docs     = make([]*Document, 0, limit)
		visited  = make(map[string]bool)
		failures []string
	)
	for idx, hit := range hits {
		if cancelErr := ctx.Err(); cancelErr != nil {
			return nil, cancelErr
		}

		target := stripURLFragment(hit.URL)
		skip := target == "" || visited[target] ||
			!domainAllowed(target, source.Domains) ||
			!s.web.shouldInclude(target, source.Include, source.Exclude)
		if skip {
			continue
		}
		visited[target] = true

		pageSource := &Source{
			Name:    source.Name,
			Type:    SourceTypeWeb,
			URL:     target,
			Include: source.Include,
			Exclude: source.Exclude,
		}
		pages, scrapeErr := s.web.Scrape(ctx, pageSource)
		if scrapeErr != nil {
			// Keep only a bounded number of messages for the final error.
			if len(failures) < 20 {
				failures = append(failures, fmt.Sprintf("%s: %v", target, scrapeErr))
			}
			continue
		}

		for _, page := range pages {
			if page.Metadata == nil {
				page.Metadata = make(map[string]interface{})
			}
			meta := page.Metadata
			meta["search_api"] = source.URL
			meta["search_query"] = query
			meta["search_rank"] = idx + 1
			if hit.Engine != "" {
				meta["search_engine"] = hit.Engine
			}
			if hit.Snippet != "" {
				meta["search_snippet"] = hit.Snippet
			}
			if hit.Score != 0 {
				meta["search_score"] = hit.Score
			}
			if fallbackTitle := strings.TrimSpace(hit.Title); strings.TrimSpace(page.Title) == "" && fallbackTitle != "" {
				page.Title = fallbackTitle
			}
			docs = append(docs, page)
		}
	}

	if len(docs) > 0 {
		return docs, nil
	}
	if len(failures) > 0 {
		return nil, fmt.Errorf("local search returned results but page scraping failed: %s", strings.Join(failures, "; "))
	}
	return nil, fmt.Errorf("local search yielded no usable results for query %q", query)
}
// DetectChanges reports whether the filtered search results differ from the
// set summarised by lastHash.
//
// The query is run with the clamped result limit. Each result that has a
// non-empty URL (fragment removed), is within source.Domains and passes the
// Include/Exclude patterns contributes a "url|title|engine|score" signature.
// The signatures are sorted, joined with newlines and hashed with SHA-256. It
// returns whether the hex digest differs from lastHash, the digest, and any
// error from validation or the search.
func (s *LocalSearchScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("source is required")
	}
	query := strings.TrimSpace(source.Query)
	if query == "" {
		return false, "", fmt.Errorf("search query is required for localsearch sources")
	}

	hits, err := s.search(ctx, source, query, clampLocalSearchLimit(source.ResultLimit))
	if err != nil {
		return false, "", err
	}

	sigs := make([]string, 0, len(hits))
	for _, hit := range hits {
		target := stripURLFragment(hit.URL)
		eligible := target != "" &&
			domainAllowed(target, source.Domains) &&
			s.web.shouldInclude(target, source.Include, source.Exclude)
		if !eligible {
			continue
		}
		sigs = append(sigs, fmt.Sprintf("%s|%s|%s|%.6f", target, hit.Title, hit.Engine, hit.Score))
	}

	// Sorting makes the digest independent of result order.
	sort.Strings(sigs)
	sum := sha256.Sum256([]byte(strings.Join(sigs, "\n")))
	current := hex.EncodeToString(sum[:])
	return current != lastHash, current, nil
}
// search sends one GET request to the search API and returns at most limit
// decoded results.
//
// The request carries the configured User-Agent and asks for JSON. At most
// maxSearchResponseBytes of the body are read. A non-2xx status yields an
// error that includes up to 200 bytes of the body; an undecodable or empty
// result list is also an error.
func (s *LocalSearchScraper) search(ctx context.Context, source *Source, query string, limit int) ([]localSearchResult, error) {
	endpoint, err := buildLocalSearchURL(source.URL, query, limit)
	if err != nil {
		return nil, err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to build search request: %w", err)
	}
	req.Header.Set("User-Agent", s.config.UserAgent)
	req.Header.Set("Accept", "application/json")

	resp, err := s.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("search API request failed: %w", err)
	}
	defer resp.Body.Close()

	payload, err := io.ReadAll(io.LimitReader(resp.Body, maxSearchResponseBytes))
	if err != nil {
		return nil, fmt.Errorf("failed reading search API response: %w", err)
	}

	if code := resp.StatusCode; code < 200 || code >= 300 {
		excerpt := strings.TrimSpace(string(payload))
		if len(excerpt) > 200 {
			excerpt = excerpt[:200]
		}
		return nil, fmt.Errorf("search API returned HTTP %d: %s", code, excerpt)
	}

	found, err := decodeLocalSearchResults(payload)
	if err != nil {
		return nil, err
	}
	switch {
	case len(found) == 0:
		return nil, fmt.Errorf("search API returned no results")
	case len(found) > limit:
		found = found[:limit]
	}
	return found, nil
}
// buildLocalSearchURL adds the search parameters to the API URL.
//
// rawURL must be absolute (scheme and host present). "q" is always set to
// query; "format" and "limit" are only filled in ("json" and the clamped
// limit) when the URL does not already carry a non-empty value for them.
func buildLocalSearchURL(rawURL, query string, limit int) (string, error) {
	endpoint, err := url.Parse(strings.TrimSpace(rawURL))
	if err != nil {
		return "", fmt.Errorf("invalid search API URL: %w", err)
	}
	if endpoint.Scheme == "" || endpoint.Host == "" {
		return "", fmt.Errorf("invalid search API URL: expected absolute URL, got %q", rawURL)
	}

	values := endpoint.Query()
	values.Set("q", query)
	defaults := [][2]string{
		{"format", "json"},
		{"limit", strconv.Itoa(clampLocalSearchLimit(limit))},
	}
	for _, kv := range defaults {
		if values.Get(kv[0]) == "" {
			values.Set(kv[0], kv[1])
		}
	}

	endpoint.RawQuery = values.Encode()
	return endpoint.String(), nil
}
// decodeLocalSearchResults parses a search API response body.
//
// The body must be a JSON object with a "results" array. Array entries that
// are not objects, or that have no usable URL under "url", "link" or "href",
// are dropped. The remaining fields are read from their accepted aliases and
// trimmed.
func decodeLocalSearchResults(body []byte) ([]localSearchResult, error) {
	var envelope map[string]interface{}
	if err := json.Unmarshal(body, &envelope); err != nil {
		return nil, fmt.Errorf("search API returned invalid JSON: %w", err)
	}

	field, present := envelope["results"]
	if !present {
		return nil, fmt.Errorf("search API response missing results field")
	}
	entries, isArray := field.([]interface{})
	if !isArray {
		return nil, fmt.Errorf("search API results field is not an array")
	}

	decoded := make([]localSearchResult, 0, len(entries))
	for _, entry := range entries {
		obj, isObject := entry.(map[string]interface{})
		if !isObject {
			continue
		}
		link := strings.TrimSpace(pickString(obj, "url", "link", "href"))
		if link == "" {
			continue
		}
		var hit localSearchResult
		hit.URL = link
		hit.Title = strings.TrimSpace(pickString(obj, "title", "name"))
		hit.Snippet = strings.TrimSpace(pickString(obj, "content", "snippet", "description", "text"))
		hit.Engine = strings.TrimSpace(pickString(obj, "engine", "source"))
		hit.Score = pickFloat(obj, "score", "relevance")
		decoded = append(decoded, hit)
	}
	return decoded, nil
}
// pickString returns the first non-blank textual value found in record under
// the given keys, tried in order.
//
// String values are returned as they are; json.Number, float64 and int values
// are formatted as decimal text. Keys that are missing, hold another type, or
// hold a blank (empty or whitespace-only) string are skipped so that a later
// alias can supply the value. It returns "" when no key yields a value.
func pickString(record map[string]interface{}, keys ...string) string {
	for _, key := range keys {
		value, ok := record[key]
		if !ok {
			continue
		}

		var text string
		switch v := value.(type) {
		case string:
			text = v
		case json.Number:
			text = v.String()
		case float64:
			text = strconv.FormatFloat(v, 'f', -1, 64)
		case int:
			text = strconv.Itoa(v)
		default:
			continue
		}

		// A blank value must not shadow a populated alias further down the list.
		if strings.TrimSpace(text) != "" {
			return text
		}
	}
	return ""
}
// pickFloat returns the first numeric value found in record under the given
// keys, tried in order.
//
// float64, float32, int and int64 values are converted directly; json.Number
// and string values are parsed (strings are trimmed first). A key that is
// missing, holds another type, or fails to parse is skipped. It returns 0
// when no key yields a number.
func pickFloat(record map[string]interface{}, keys ...string) float64 {
	for _, name := range keys {
		raw, present := record[name]
		if !present {
			continue
		}
		switch num := raw.(type) {
		case float64:
			return num
		case float32:
			return float64(num)
		case int:
			return float64(num)
		case int64:
			return float64(num)
		case json.Number:
			if parsed, err := num.Float64(); err == nil {
				return parsed
			}
		case string:
			if parsed, err := strconv.ParseFloat(strings.TrimSpace(num), 64); err == nil {
				return parsed
			}
		}
	}
	return 0
}
// clampLocalSearchLimit maps a requested result limit onto the supported
// range: zero or negative becomes defaultLocalSearchLimit, anything above
// maxLocalSearchLimit becomes maxLocalSearchLimit, other values pass through.
func clampLocalSearchLimit(limit int) int {
	switch {
	case limit <= 0:
		return defaultLocalSearchLimit
	case limit > maxLocalSearchLimit:
		return maxLocalSearchLimit
	default:
		return limit
	}
}
// stripURLFragment returns raw, trimmed, with any "#fragment" removed. When
// the trimmed value cannot be parsed as a URL it is returned as is; otherwise
// the result is the parsed URL re-serialised without its fragment.
func stripURLFragment(raw string) string {
	trimmed := strings.TrimSpace(raw)
	parsed, err := url.Parse(trimmed)
	if err != nil {
		return trimmed
	}
	parsed.Fragment = ""
	return parsed.String()
}
// domainAllowed reports whether the host of raw is covered by allowedDomains.
//
// An empty allow-list permits everything. Otherwise raw must parse and have a
// host, and that host (lower-cased) must equal one of the normalised entries
// or be a subdomain of it. Entries that normalise to "" are ignored.
func domainAllowed(raw string, allowedDomains []string) bool {
	if len(allowedDomains) == 0 {
		return true
	}

	parsed, err := url.Parse(raw)
	if err != nil {
		return false
	}
	host := strings.ToLower(strings.TrimSpace(parsed.Hostname()))
	if host == "" {
		return false
	}

	for _, entry := range allowedDomains {
		switch allowed := normalizeDomain(entry); {
		case allowed == "":
			continue
		case host == allowed, strings.HasSuffix(host, "."+allowed):
			return true
		}
	}
	return false
}
// normalizeDomain reduces an allow-list entry to a bare lower-case hostname.
//
// The input is trimmed and lower-cased. Entries containing "://" are parsed
// as URLs and their hostname is returned; if parsing fails, the entry minus
// one leading "." is returned. Entries without a scheme have one leading "."
// removed and are then parsed as a network-path reference, so a port or path
// ("example.com:8080", "example.com/docs") is dropped. If that parse fails or
// yields no host, the entry minus the leading "." is returned unchanged.
// Blank input yields "".
func normalizeDomain(raw string) string {
	raw = strings.ToLower(strings.TrimSpace(raw))
	if raw == "" {
		return ""
	}

	if strings.Contains(raw, "://") {
		if parsed, err := url.Parse(raw); err == nil {
			return strings.ToLower(parsed.Hostname())
		}
		return strings.TrimPrefix(raw, ".")
	}

	bare := strings.TrimPrefix(raw, ".")
	// Prefixing "//" makes url.Parse treat the entry as host[:port][/path].
	if parsed, err := url.Parse("//" + bare); err == nil {
		if host := parsed.Hostname(); host != "" {
			return host
		}
	}
	return bare
}
+226
View File
@@ -0,0 +1,226 @@
package scraper
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"net/url"
"strings"
"testing"
"time"
)
// TestLocalSearchScraperScrape runs the scraper against a test server that
// serves one search result and the page it points to, and checks the scraped
// document's URL and search metadata.
func TestLocalSearchScraperScrape(t *testing.T) {
	mux := http.NewServeMux()
	baseURL := ""
	mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
		// Handlers run on the server's goroutines, where t.Fatalf must not be
		// called; report with t.Errorf and keep serving.
		if got := r.URL.Query().Get("q"); got != "go http client" {
			t.Errorf("expected query go http client, got %q", got)
		}
		if got := r.URL.Query().Get("format"); got != "json" {
			t.Errorf("expected format=json, got %q", got)
		}
		_ = json.NewEncoder(w).Encode(map[string]interface{}{
			"results": []map[string]interface{}{
				{
					"url":     baseURL + "/docs/http-client",
					"title":   "HTTP Client Guide",
					"content": "How to build an HTTP client in Go",
					"engine":  "searxng",
					"score":   0.99,
				},
			},
		})
	})
	mux.HandleFunc("/docs/http-client", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(`<html><head><title>HTTP Client Guide</title></head><body><main>` + strings.Repeat("http client docs ", 30) + `</main></body></html>`))
	})

	srv := httptest.NewServer(mux)
	defer srv.Close()
	baseURL = srv.URL

	s := NewLocalSearchScraper(&Config{
		UserAgent: "DevourTest/1.0",
		Timeout:   2 * time.Second,
	})
	docs, err := s.Scrape(context.Background(), &Source{
		Name:        "local-search",
		Type:        SourceTypeLocalSearch,
		URL:         srv.URL + "/search",
		Query:       "go http client",
		ResultLimit: 5,
	})
	if err != nil {
		t.Fatalf("unexpected scrape error: %v", err)
	}
	if len(docs) == 0 {
		t.Fatal("expected at least one document")
	}

	doc := docs[0]
	if doc.URL != srv.URL+"/docs/http-client" {
		t.Fatalf("unexpected document URL: %q", doc.URL)
	}
	if doc.Metadata["search_query"] != "go http client" {
		t.Fatalf("expected metadata search_query, got %v", doc.Metadata["search_query"])
	}
	if doc.Metadata["search_engine"] != "searxng" {
		t.Fatalf("expected metadata search_engine=searxng, got %v", doc.Metadata["search_engine"])
	}
}
// TestLocalSearchScraperDomainFilter serves one result on the test server's
// own host and one on example.com, restricts the source to the test server's
// hostname, and checks that only documents from that host come back.
func TestLocalSearchScraperDomainFilter(t *testing.T) {
	var origin string // filled in once the test server is listening

	routes := http.NewServeMux()
	routes.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
		hits := []map[string]interface{}{
			{
				"url":   origin + "/docs/in-scope",
				"title": "In Scope",
			},
			{
				"url":   "https://example.com/out-of-scope",
				"title": "Out Scope",
			},
		}
		_ = json.NewEncoder(w).Encode(map[string]interface{}{"results": hits})
	})
	routes.HandleFunc("/docs/in-scope", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(`<html><head><title>In Scope</title></head><body><main>` + strings.Repeat("scoped docs ", 30) + `</main></body></html>`))
	})

	server := httptest.NewServer(routes)
	defer server.Close()
	origin = server.URL

	serverURL, err := url.Parse(server.URL)
	if err != nil {
		t.Fatalf("failed to parse server URL: %v", err)
	}
	allowedHost := serverURL.Hostname()

	scr := NewLocalSearchScraper(&Config{
		UserAgent: "DevourTest/1.0",
		Timeout:   2 * time.Second,
	})
	src := &Source{
		Name:        "local-search",
		Type:        SourceTypeLocalSearch,
		URL:         server.URL + "/search",
		Query:       "scope test",
		ResultLimit: 10,
		Domains:     []string{allowedHost},
	}
	docs, err := scr.Scrape(context.Background(), src)
	if err != nil {
		t.Fatalf("unexpected scrape error: %v", err)
	}
	if len(docs) == 0 {
		t.Fatal("expected at least one in-scope document")
	}

	for _, doc := range docs {
		docURL, parseErr := url.Parse(doc.URL)
		if parseErr != nil {
			t.Fatalf("invalid doc URL %q: %v", doc.URL, parseErr)
		}
		if docURL.Hostname() != allowedHost {
			t.Fatalf("expected only in-scope domain, got %q", doc.URL)
		}
	}
}
func TestLocalSearchScraperRequiresQuery(t *testing.T) {
s := NewLocalSearchScraper(&Config{
UserAgent: "DevourTest/1.0",
Timeout: 2 * time.Second,
})
_, err := s.Scrape(context.Background(), &Source{
Name: "local-search",
Type: SourceTypeLocalSearch,
URL: "http://127.0.0.1:8080/search",
})
if err == nil {
t.Fatal("expected error when query is missing")
}
if !strings.Contains(err.Error(), "query") {
t.Fatalf("unexpected error: %v", err)
}
}
// TestLocalSearchScraperDetectChanges exercises DetectChanges three times:
// with no previous hash, with the hash it just returned, and again after the
// search endpoint starts returning a different result URL.
func TestLocalSearchScraperDetectChanges(t *testing.T) {
	// Both values are read by the handler on every request, so the test can
	// change what the search endpoint returns between calls.
	origin := ""
	hitPath := "/docs/one"

	router := http.NewServeMux()
	router.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
		hit := map[string]interface{}{
			"url":   origin + hitPath,
			"title": "Versioned",
			"score": 1.0,
		}
		_ = json.NewEncoder(w).Encode(map[string]interface{}{
			"results": []map[string]interface{}{hit},
		})
	})

	server := httptest.NewServer(router)
	defer server.Close()
	origin = server.URL

	scraper := NewLocalSearchScraper(&Config{
		UserAgent: "DevourTest/1.0",
		Timeout:   2 * time.Second,
	})
	source := &Source{
		Name:        "local-search",
		Type:        SourceTypeLocalSearch,
		URL:         server.URL + "/search",
		Query:       "version test",
		ResultLimit: 3,
	}

	// detect runs one DetectChanges call and fails the test on any error.
	detect := func(previous string) (bool, string) {
		t.Helper()
		changed, hash, err := scraper.DetectChanges(context.Background(), source, previous)
		if err != nil {
			t.Fatalf("unexpected detect changes error: %v", err)
		}
		return changed, hash
	}

	changed, hash1 := detect("")
	if !changed {
		t.Fatal("expected first detect changes call to report changed")
	}
	if hash1 == "" {
		t.Fatal("expected non-empty hash")
	}

	changed, hash2 := detect(hash1)
	if changed {
		t.Fatal("expected unchanged results with identical hash")
	}
	if hash2 != hash1 {
		t.Fatalf("expected identical hash, got %q and %q", hash1, hash2)
	}

	hitPath = "/docs/two"
	changed, hash3 := detect(hash1)
	if !changed {
		t.Fatal("expected changed results after search output changed")
	}
	if hash3 == hash1 {
		t.Fatal("expected hash to change")
	}
}
+88
View File
@@ -0,0 +1,88 @@
package scraper
import (
"net/url"
"path"
"regexp"
"strings"
)
// Precompiled patterns used by the title/content normalization helpers.
var (
	// titleNoiseRe matches the whole-word phrases "added in goN[.M]" and
	// "deprecated", case-insensitively.
	titleNoiseRe = regexp.MustCompile(`(?i)\b(added in go[0-9]+(\.[0-9]+)?|deprecated)\b`)
	// titleSpaceRe matches any run of whitespace.
	titleSpaceRe = regexp.MustCompile(`\s+`)
	// contentSpaceRe matches spaces/tabs that directly precede a newline.
	contentSpaceRe = regexp.MustCompile(`[ \t]+\n`)
	// multiNewlineRe matches three or more consecutive newlines.
	multiNewlineRe = regexp.MustCompile(`\n{3,}`)
	// nonPrintableTitle matches a single control character.
	nonPrintableTitle = regexp.MustCompile(`[[:cntrl:]]`)
)
// NormalizeDocuments runs NormalizeDocument on every element of docs in place
// and hands the same slice back to the caller.
func NormalizeDocuments(docs []*Document) []*Document {
	for i := range docs {
		NormalizeDocument(docs[i])
	}
	return docs
}
// NormalizeDocument cleans up a scraped document in place: URL and Type are
// trimmed, Title and Content are normalized, and an empty title is replaced by
// one inferred from the URL. A nil doc is ignored.
func NormalizeDocument(doc *Document) {
	if doc == nil {
		return
	}

	doc.URL, doc.Type = strings.TrimSpace(doc.URL), strings.TrimSpace(doc.Type)
	doc.Content = normalizeContent(doc.Content)

	if cleaned := normalizeTitle(doc.Title); cleaned != "" {
		doc.Title = cleaned
	} else {
		// Nothing usable survived cleanup; fall back to the (trimmed) URL.
		doc.Title = inferTitleFromURL(doc.URL)
	}
}
// normalizeTitle strips pilcrows, underscores, control characters and the
// noise phrases matched by titleNoiseRe, collapses whitespace, and trims
// leading/trailing "-:.,;" punctuation.
func normalizeTitle(title string) string {
	cleaned := strings.NewReplacer("¶", " ", "_", " ").Replace(title)
	cleaned = nonPrintableTitle.ReplaceAllString(cleaned, " ")
	cleaned = titleNoiseRe.ReplaceAllString(cleaned, " ")

	collapsed := titleSpaceRe.ReplaceAllString(strings.TrimSpace(cleaned), " ")

	// Cleanup can leave punctuation dangling at the edges; drop it, then any
	// whitespace it was protecting.
	withoutPunct := strings.Trim(collapsed, "-:.,;")
	return strings.TrimSpace(withoutPunct)
}
// normalizeContent converts CRLF to LF, trims the ends, removes spaces/tabs
// that sit directly before a newline, and squeezes runs of three or more
// newlines down to a single blank line.
func normalizeContent(content string) string {
	unixStyle := strings.TrimSpace(strings.ReplaceAll(content, "\r\n", "\n"))
	noTrailing := contentSpaceRe.ReplaceAllString(unixStyle, "\n")
	return multiNewlineRe.ReplaceAllString(noTrailing, "\n\n")
}
// inferTitleFromURL derives a human-readable title from the last path segment
// of rawURL. It falls back to the host when the path is empty and to
// "Documentation" when nothing usable can be extracted.
func inferTitleFromURL(rawURL string) string {
	const fallback = "Documentation"

	if rawURL == "" {
		return fallback
	}
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return fallback
	}

	segment := path.Base(strings.Trim(parsed.Path, "/"))
	switch segment {
	case "", ".", "/":
		// No path segment to work with; the host is the next best label.
		if parsed.Host == "" {
			return fallback
		}
		return parsed.Host
	}

	segment = strings.TrimSuffix(segment, ".html")
	segment = strings.NewReplacer("-", " ", "_", " ").Replace(segment)
	segment = titleSpaceRe.ReplaceAllString(strings.TrimSpace(segment), " ")
	if segment == "" {
		return fallback
	}
	return segment
}
+33
View File
@@ -0,0 +1,33 @@
package scraper
import "testing"
// TestNormalizeDocument_TitleCleanup checks that noise markers are stripped
// from the title and that trailing whitespace and extra blank lines are
// removed from the content.
func TestNormalizeDocument_TitleCleanup(t *testing.T) {
	const (
		wantTitle   = "http.type CloseNotifier"
		wantContent = "line 1\n\nline 2"
	)

	input := &Document{
		URL:     "https://pkg.go.dev/net/http#CloseNotifier",
		Content: "line 1 \n\n\nline 2",
		Title:   "http.type CloseNotifier ¶ deprecated added in go1.1",
	}
	NormalizeDocument(input)

	if got := input.Title; got != wantTitle {
		t.Fatalf("unexpected normalized title: %q", got)
	}
	if got := input.Content; got != wantContent {
		t.Fatalf("unexpected normalized content: %q", got)
	}
}
// TestNormalizeDocument_InferTitle checks that a document without a title
// gets one derived from the last segment of its URL.
func TestNormalizeDocument_InferTitle(t *testing.T) {
	untitled := &Document{URL: "https://kotlinlang.org/docs/regex.html", Title: ""}

	NormalizeDocument(untitled)

	if got := untitled.Title; got != "regex" {
		t.Fatalf("expected inferred title 'regex', got %q", got)
	}
}
+316 -9
View File
@@ -2,30 +2,337 @@ package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"sort"
"strings"
"time"
"gopkg.in/yaml.v3"
)
// OpenAPIScraper parses OpenAPI/Swagger specifications.
type OpenAPIScraper struct {
	// config carries scraper settings; it may be nil, and users of it in this
	// file check for that before reading fields.
	config *Config
	// client is the HTTP client used to download remote specs.
	client *http.Client
}
// NewOpenAPIScraper creates a new OpenAPI scraper.
//
// The HTTP client uses config.Timeout when config is non-nil and the timeout
// is positive; otherwise it falls back to 30 seconds. config itself may be
// nil.
func NewOpenAPIScraper(config *Config) *OpenAPIScraper {
	timeout := 30 * time.Second
	if config != nil && config.Timeout > 0 {
		timeout = config.Timeout
	}
	// The client must always be set: readSpec calls s.client.Do for remote
	// specs and would dereference a nil client otherwise.
	return &OpenAPIScraper{
		config: config,
		client: &http.Client{Timeout: timeout},
	}
}
// Scrape fetches and parses an OpenAPI specification.
//
// It returns one "openapi-spec" overview document followed by one
// "openapi-operation" document per path/method pair. Paths are visited in
// sorted order and, within a path, methods are ordered by their upper-cased
// name, so the output order is deterministic.
//
// An error is returned when source is nil, when the spec cannot be read, or
// when it cannot be parsed.
func (s *OpenAPIScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}
	raw, specURL, err := s.readSpec(ctx, source)
	if err != nil {
		return nil, err
	}
	spec, err := parseOpenAPISpec(raw)
	if err != nil {
		return nil, err
	}

	sourceName := coalesceSourceName(source.Name, "openapi")

	docs := make([]*Document, 0, 1+len(spec.Paths))
	docs = append(docs, &Document{
		ID:      generateDocID(specURL + "#openapi"),
		Source:  sourceName,
		Type:    "openapi-spec",
		Title:   spec.Info.Title,
		Content: buildMainSpecContent(spec),
		URL:     specURL,
		Metadata: map[string]interface{}{
			"openapi": spec.Version,
			"servers": spec.Servers,
		},
		Hash:      hashBytes(raw),
		Timestamp: time.Now(),
	})

	paths := make([]string, 0, len(spec.Paths))
	for p := range spec.Paths {
		paths = append(paths, p)
	}
	sort.Strings(paths)

	for _, p := range paths {
		opMap := spec.Paths[p]

		// Keep the keys exactly as written in the spec and only upper-case
		// them for display. Looking the operation up under a re-cased key
		// would silently skip entries that are not already lowercase.
		keys := make([]string, 0, len(opMap))
		for k := range opMap {
			keys = append(keys, k)
		}
		sort.Slice(keys, func(i, j int) bool {
			return strings.ToUpper(keys[i]) < strings.ToUpper(keys[j])
		})

		for _, key := range keys {
			op := opMap[key]
			if op == nil {
				continue
			}
			method := strings.ToUpper(key)

			title := strings.TrimSpace(op.Summary)
			if title == "" {
				title = fmt.Sprintf("%s %s", method, p)
			}
			content := buildOperationContent(method, p, op)
			docURL := fmt.Sprintf("%s#%s-%s", specURL, strings.ToLower(method), sanitizeFragment(p))

			docs = append(docs, &Document{
				ID:      generateDocID(docURL),
				Source:  sourceName,
				Type:    "openapi-operation",
				Title:   title,
				Content: content,
				URL:     docURL,
				Metadata: map[string]interface{}{
					"method":       method,
					"path":         p,
					"operation_id": op.OperationID,
				},
				Hash:      hashString(content),
				Timestamp: time.Now(),
			})
		}
	}
	return docs, nil
}
// DetectChanges checks if the spec has been updated.
//
// It re-reads the spec, hashes the raw bytes, and reports whether that hash
// differs from lastHash. The freshly computed hash is always returned so the
// caller can store it for the next comparison. An error is returned when
// source is nil or the spec cannot be read.
func (s *OpenAPIScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("source is required")
	}
	raw, _, err := s.readSpec(ctx, source)
	if err != nil {
		return false, "", err
	}
	hash := hashBytes(raw)
	return hash != lastHash, hash, nil
}
// readSpec loads the raw spec bytes for source and returns them together with
// the location they were read from.
//
// source.URL is preferred over source.Path. Locations starting with http://
// or https:// are fetched over HTTP (body capped at 10 MiB); anything else is
// read from the local filesystem and reported with a "file://" prefix.
func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte, string, error) {
	location := strings.TrimSpace(source.URL)
	if location == "" {
		location = strings.TrimSpace(source.Path)
	}
	if location == "" {
		return nil, "", fmt.Errorf("openapi source requires url or path")
	}

	isRemote := strings.HasPrefix(location, "http://") || strings.HasPrefix(location, "https://")
	if !isRemote {
		data, err := os.ReadFile(location)
		if err != nil {
			return nil, "", err
		}
		return data, "file://" + location, nil
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, location, nil)
	if err != nil {
		return nil, "", err
	}
	if s.config != nil {
		if strings.TrimSpace(s.config.UserAgent) != "" {
			req.Header.Set("User-Agent", s.config.UserAgent)
		}
	}

	resp, err := s.client.Do(req)
	if err != nil {
		return nil, "", err
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code < 200 || code >= 300 {
		return nil, "", fmt.Errorf("openapi fetch failed: HTTP %d", code)
	}

	payload, err := io.ReadAll(io.LimitReader(resp.Body, 10<<20))
	if err != nil {
		return nil, "", err
	}
	return payload, location, nil
}
// openAPISpec is the subset of an OpenAPI/Swagger document decoded by this
// scraper. The same struct is used for both JSON and YAML input.
type openAPISpec struct {
	// Version is the top-level "openapi" field; parseOpenAPISpec copies
	// Swagger into it when it is blank.
	Version string `json:"openapi" yaml:"openapi"`
	// Swagger is the top-level "swagger" field.
	Swagger string `json:"swagger" yaml:"swagger"`
	Info openAPIInfo `json:"info" yaml:"info"`
	Servers []openAPIServer `json:"servers" yaml:"servers"`
	// Paths maps a URL path to its per-method operations.
	Paths map[string]pathItems `json:"paths" yaml:"paths"`
}
// openAPIInfo mirrors the "info" object of a spec: title, API version and
// free-form description.
type openAPIInfo struct {
	Title string `json:"title" yaml:"title"`
	Version string `json:"version" yaml:"version"`
	Description string `json:"description" yaml:"description"`
}
// openAPIServer mirrors one entry of the spec's "servers" list.
type openAPIServer struct {
	URL string `json:"url" yaml:"url"`
	Description string `json:"description" yaml:"description"`
}
// pathItems maps a key of a path object (Scrape treats each key as an HTTP
// method name) to the operation declared under it.
type pathItems map[string]*openAPIOperation
// openAPIOperation is the decoded form of a single operation object.
// Only Summary, Description, OperationID, Tags, Deprecated, Parameters and
// Responses are rendered by buildOperationContent; RequestBody and Security
// are decoded but not rendered there.
type openAPIOperation struct {
	Summary string `json:"summary" yaml:"summary"`
	Description string `json:"description" yaml:"description"`
	OperationID string `json:"operationId" yaml:"operationId"`
	Parameters []openAPIParameter `json:"parameters" yaml:"parameters"`
	// Responses is keyed by the response code string as written in the spec.
	Responses map[string]response `json:"responses" yaml:"responses"`
	RequestBody map[string]any `json:"requestBody" yaml:"requestBody"`
	Tags []string `json:"tags" yaml:"tags"`
	Deprecated bool `json:"deprecated" yaml:"deprecated"`
	Security []map[string][]string `json:"security" yaml:"security"`
}
// openAPIParameter is the decoded form of one operation parameter.
type openAPIParameter struct {
	Name string `json:"name" yaml:"name"`
	// In is the spec's "in" field, rendered verbatim next to the name.
	In string `json:"in" yaml:"in"`
	Description string `json:"description" yaml:"description"`
	Required bool `json:"required" yaml:"required"`
}
// response is the decoded form of one operation response; only its
// description is kept.
type response struct {
	Description string `json:"description" yaml:"description"`
}
// parseOpenAPISpec decodes raw as JSON and, if that fails, as YAML.
//
// On success the returned spec is normalized: a blank title becomes
// "OpenAPI Specification", a blank Version falls back to the Swagger field,
// and Paths is never nil. When both decoders fail the error wraps the JSON
// error and also includes the YAML error, so YAML input is not reported with
// only a JSON syntax message.
func parseOpenAPISpec(raw []byte) (*openAPISpec, error) {
	var spec openAPISpec
	if err := json.Unmarshal(raw, &spec); err != nil {
		// A failed JSON decode can leave fields partially populated; start
		// the YAML attempt from a clean value.
		spec = openAPISpec{}
		if yamlErr := yaml.Unmarshal(raw, &spec); yamlErr != nil {
			return nil, fmt.Errorf("invalid openapi content: %w (as yaml: %v)", err, yamlErr)
		}
	}
	if strings.TrimSpace(spec.Info.Title) == "" {
		spec.Info.Title = "OpenAPI Specification"
	}
	if strings.TrimSpace(spec.Version) == "" {
		spec.Version = spec.Swagger
	}
	if spec.Paths == nil {
		spec.Paths = map[string]pathItems{}
	}
	return &spec, nil
}
// buildMainSpecContent renders the overview text for a spec: a title heading,
// a bullet list with the API version, OpenAPI version and path count, the
// description, and a "Servers" section when servers are declared.
func buildMainSpecContent(spec *openAPISpec) string {
	var out strings.Builder
	info := spec.Info

	fmt.Fprintf(&out, "# %s\n\n", info.Title)
	if v := info.Version; v != "" {
		fmt.Fprintf(&out, "- API Version: %s\n", v)
	}
	if v := spec.Version; v != "" {
		fmt.Fprintf(&out, "- OpenAPI: %s\n", v)
	}
	fmt.Fprintf(&out, "- Paths: %d\n", len(spec.Paths))

	if info.Description != "" {
		fmt.Fprintf(&out, "\n%s\n", strings.TrimSpace(info.Description))
	}

	if len(spec.Servers) == 0 {
		return out.String()
	}
	out.WriteString("\n## Servers\n")
	for i := range spec.Servers {
		server := spec.Servers[i]
		fmt.Fprintf(&out, "- %s", server.URL)
		if server.Description != "" {
			fmt.Fprintf(&out, " - %s", server.Description)
		}
		out.WriteString("\n")
	}
	return out.String()
}
// buildOperationContent renders the text for one operation: a
// "METHOD path" heading, the summary and description, a bullet list of
// operation id / tags / deprecation, then "Parameters" and "Responses"
// sections. Response codes are listed in sorted order.
func buildOperationContent(method, path string, op *openAPIOperation) string {
	var out strings.Builder

	fmt.Fprintf(&out, "# %s %s\n\n", method, path)
	for _, paragraph := range []string{op.Summary, op.Description} {
		if paragraph != "" {
			fmt.Fprintf(&out, "%s\n\n", strings.TrimSpace(paragraph))
		}
	}

	if op.OperationID != "" {
		fmt.Fprintf(&out, "- Operation ID: `%s`\n", op.OperationID)
	}
	if len(op.Tags) > 0 {
		fmt.Fprintf(&out, "- Tags: %s\n", strings.Join(op.Tags, ", "))
	}
	if op.Deprecated {
		out.WriteString("- Deprecated: true\n")
	}

	if len(op.Parameters) > 0 {
		out.WriteString("\n## Parameters\n")
		for i := range op.Parameters {
			param := op.Parameters[i]
			requirement := "optional"
			if param.Required {
				requirement = "required"
			}
			fmt.Fprintf(&out, "- `%s` (%s, %s)", param.Name, param.In, requirement)
			if param.Description != "" {
				fmt.Fprintf(&out, ": %s", strings.TrimSpace(param.Description))
			}
			out.WriteString("\n")
		}
	}

	if len(op.Responses) > 0 {
		statusCodes := make([]string, 0, len(op.Responses))
		for status := range op.Responses {
			statusCodes = append(statusCodes, status)
		}
		sort.Strings(statusCodes)

		out.WriteString("\n## Responses\n")
		for _, status := range statusCodes {
			fmt.Fprintf(&out, "- `%s`", status)
			if desc := op.Responses[status].Description; desc != "" {
				fmt.Fprintf(&out, ": %s", strings.TrimSpace(desc))
			}
			out.WriteString("\n")
		}
	}
	return out.String()
}
// sanitizeFragment turns an API path into a URL-fragment-friendly slug:
// lower-cased, slashes replaced by dashes, braces removed, and leading or
// trailing dashes trimmed. An empty result becomes "root".
func sanitizeFragment(path string) string {
	slug := strings.NewReplacer("/", "-", "{", "", "}", "").Replace(strings.ToLower(path))
	if slug = strings.Trim(slug, "-"); slug != "" {
		return slug
	}
	return "root"
}
// hashBytes returns the lowercase hex encoding of the SHA-256 digest of b.
func hashBytes(b []byte) string {
	hasher := sha256.New()
	hasher.Write(b) // hash.Hash.Write never returns an error
	return hex.EncodeToString(hasher.Sum(nil))
}
// hashString returns the lowercase hex encoding of the SHA-256 digest of s.
func hashString(s string) string {
	digest := sha256.Sum256([]byte(s))
	encoded := make([]byte, hex.EncodedLen(len(digest)))
	hex.Encode(encoded, digest[:])
	return string(encoded)
}
// coalesceSourceName returns name unchanged unless it is empty or only
// whitespace, in which case fallback is returned instead.
func coalesceSourceName(name, fallback string) string {
	if strings.TrimSpace(name) == "" {
		return fallback
	}
	return name
}
+77
View File
@@ -0,0 +1,77 @@
package scraper
import (
"context"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
)
// TestOpenAPIScraperScrape serves a one-operation JSON spec and expects at
// least two documents back, one of which is titled after the operation
// summary.
func TestOpenAPIScraperScrape(t *testing.T) {
	spec := `{
"openapi": "3.0.0",
"info": {"title": "Pet API", "version": "1.0.0"},
"paths": {
"/pets": {
"get": {
"summary": "List pets",
"operationId": "listPets",
"responses": {"200": {"description": "ok"}}
}
}
}
}`
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(spec))
	}))
	defer server.Close()

	scraper := NewOpenAPIScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
	source := &Source{Name: "pet", Type: SourceTypeOpenAPI, URL: server.URL}

	docs, err := scraper.Scrape(context.Background(), source)
	if err != nil {
		t.Fatalf("unexpected scrape error: %v", err)
	}
	if n := len(docs); n < 2 {
		t.Fatalf("expected at least 2 docs, got %d", n)
	}

	for i := range docs {
		if strings.Contains(docs[i].Title, "List pets") {
			return // operation document found
		}
	}
	t.Fatal("expected operation document")
}
// TestOpenAPIScraperDetectChanges checks that the first call (no previous
// hash) reports a change with a non-empty hash, and that passing that hash
// back reports no change.
func TestOpenAPIScraperDetectChanges(t *testing.T) {
	const body = `{"openapi":"3.0.0","info":{"title":"API"},"paths":{}}`

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		_, _ = w.Write([]byte(body))
	}))
	defer server.Close()

	scraper := NewOpenAPIScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
	source := &Source{Name: "api", Type: SourceTypeOpenAPI, URL: server.URL}
	ctx := context.Background()

	changed, firstHash, err := scraper.DetectChanges(ctx, source, "")
	if err != nil {
		t.Fatal(err)
	}
	if firstHash == "" || !changed {
		t.Fatalf("expected changed=true and non-empty hash, changed=%v hash=%q", changed, firstHash)
	}

	changed, _, err = scraper.DetectChanges(ctx, source, firstHash)
	if err != nil {
		t.Fatal(err)
	}
	if changed {
		t.Fatal("expected no changes when hash matches")
	}
}
+1
View File
@@ -5,6 +5,7 @@ func init() {
// Additional scrapers can be registered in their own packages
RegisterScraper(SourceTypeWeb, func(c *Config) Scraper { return NewWebScraper(c) })
RegisterScraper(SourceTypeLocal, func(c *Config) Scraper { return NewLocalScraper(c) })
RegisterScraper(SourceTypeLocalSearch, func(c *Config) Scraper { return NewLocalSearchScraper(c) })
RegisterScraper(SourceTypeGitHub, func(c *Config) Scraper { return NewGitHubScraper(c) })
RegisterScraper(SourceTypeOpenAPI, func(c *Config) Scraper { return NewOpenAPIScraper(c) })
}
@@ -0,0 +1,71 @@
package scraper_test
import (
"testing"
"time"
basescraper "github.com/yourorg/devour/internal/scraper"
_ "github.com/yourorg/devour/internal/scraper/external"
)
// TestLanguageScrapersAreRegistered asserts that NewScraper returns a non-nil
// scraper for every documentation source type listed below.
func TestLanguageScrapersAreRegistered(t *testing.T) {
	cfg := &basescraper.Config{
		UserAgent: "DevourTest/1.0",
		Timeout:   2 * time.Second,
	}

	for _, docType := range []basescraper.SourceType{
		basescraper.SourceTypeGoDocs,
		basescraper.SourceTypeRustDocs,
		basescraper.SourceTypePythonDocs,
		basescraper.SourceTypeJavaDocs,
		basescraper.SourceTypeSpringDocs,
		basescraper.SourceTypeTSDocs,
		basescraper.SourceTypeReactDocs,
		basescraper.SourceTypeVueDocs,
		basescraper.SourceTypeNuxtDocs,
		basescraper.SourceTypeMCPDocs,
		basescraper.SourceTypeDockerDocs,
		basescraper.SourceTypeCloudflareDocs,
		basescraper.SourceTypeAstroDocs,
	} {
		t.Run(string(docType), func(t *testing.T) {
			if basescraper.NewScraper(docType, cfg) == nil {
				t.Fatalf("NewScraper(%q) returned nil; scraper was not registered", docType)
			}
		})
	}
}
// TestDetectSourceType_ForSupportedDocsHosts checks that DetectSourceType
// maps each well-known documentation URL to its dedicated source type.
func TestDetectSourceType_ForSupportedDocsHosts(t *testing.T) {
	type detectCase struct {
		rawURL string
		want   basescraper.SourceType
	}
	cases := []detectCase{
		{rawURL: "https://pkg.go.dev/net/http", want: basescraper.SourceTypeGoDocs},
		{rawURL: "https://docs.rs/tokio/latest/tokio/", want: basescraper.SourceTypeRustDocs},
		{rawURL: "https://docs.python.org/3/library/asyncio.html", want: basescraper.SourceTypePythonDocs},
		{rawURL: "https://docs.oracle.com/javase/8/docs/api/java/util/List.html", want: basescraper.SourceTypeJavaDocs},
		{rawURL: "https://docs.spring.io/spring-boot/docs/current/reference/htmlsingle/", want: basescraper.SourceTypeSpringDocs},
		{rawURL: "https://www.typescriptlang.org/docs/handbook/2/basic-types.html", want: basescraper.SourceTypeTSDocs},
		{rawURL: "https://react.dev/reference/react/hooks", want: basescraper.SourceTypeReactDocs},
		{rawURL: "https://vuejs.org/guide/introduction.html", want: basescraper.SourceTypeVueDocs},
		{rawURL: "https://nuxt.com/docs/guide/directory-structure", want: basescraper.SourceTypeNuxtDocs},
		{rawURL: "https://docs.docker.com/compose", want: basescraper.SourceTypeDockerDocs},
		{rawURL: "https://hub.docker.com/mcp/server/github", want: basescraper.SourceTypeMCPDocs},
		{rawURL: "https://developers.cloudflare.com/workers", want: basescraper.SourceTypeCloudflareDocs},
		{rawURL: "https://docs.astro.build/en/guides/components/", want: basescraper.SourceTypeAstroDocs},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.rawURL, func(t *testing.T) {
			if got := basescraper.DetectSourceType(tc.rawURL); got != tc.want {
				t.Fatalf("DetectSourceType(%q) = %q, want %q", tc.rawURL, got, tc.want)
			}
		})
	}
}
+1 -1
View File
@@ -28,7 +28,7 @@ func (r *ScraperRegistry) Register(sourceType SourceType, constructor ScraperCon
// Create creates a scraper instance for sourceType and passes it through
// wrapScraper before returning it. It returns nil when no constructor is
// registered for sourceType.
func (r *ScraperRegistry) Create(sourceType SourceType, config *Config) Scraper {
	constructor, exists := r.constructors[sourceType]
	if !exists {
		return nil
	}
	// Always return the wrapped scraper; returning the bare constructor
	// result would bypass the shared wrapper entirely.
	return wrapScraper(constructor(config))
}
+18 -9
View File
@@ -17,6 +17,7 @@ const (
SourceTypeGitHub SourceType = "github"
SourceTypeOpenAPI SourceType = "openapi"
SourceTypeLocal SourceType = "local"
SourceTypeLocalSearch SourceType = "localsearch"
SourceTypeGoDocs SourceType = "godocs"
SourceTypeRustDocs SourceType = "rustdocs"
SourceTypePythonDocs SourceType = "pythondocs"
@@ -34,15 +35,18 @@ const (
// Source represents a documentation source to scrape.
//
// All fields are loaded from YAML. Every field other than Name and Type is
// omitted from the YAML output when empty.
type Source struct {
	Name string     `yaml:"name"`
	Type SourceType `yaml:"type"`
	URL  string     `yaml:"url,omitempty"`
	// Query, ResultLimit and Domains are set by the local-search tests
	// (SourceTypeLocalSearch). NOTE(review): confirm no other source type
	// reads them.
	Query       string   `yaml:"query,omitempty"`
	ResultLimit int      `yaml:"result_limit,omitempty"`
	Domains     []string `yaml:"domains,omitempty"`
	Repo        string   `yaml:"repo,omitempty"`
	Branch      string   `yaml:"branch,omitempty"`
	Path        string   `yaml:"path,omitempty"`
	Include     []string `yaml:"include,omitempty"`
	Exclude     []string `yaml:"exclude,omitempty"`
	Schedule    string   `yaml:"schedule,omitempty"`
}
// Document represents a scraped document.
@@ -113,6 +117,11 @@ func DetectSourceType(input string) SourceType {
}
}
// MCP servers are hosted under Docker Hub paths.
if strings.Contains(input, "hub.docker.com/mcp/") {
return SourceTypeMCPDocs
}
// Check for OpenAPI specs
if strings.HasSuffix(input, ".json") || strings.HasSuffix(input, ".yaml") || strings.HasSuffix(input, ".yml") {
if strings.Contains(strings.ToLower(input), "openapi") || strings.Contains(strings.ToLower(input), "swagger") {
+191 -7
View File
@@ -6,8 +6,10 @@ import (
"encoding/hex"
"fmt"
"net/url"
"path"
"regexp"
"strings"
"sync"
"time"
"github.com/gocolly/colly/v2"
@@ -27,17 +29,42 @@ func NewWebScraper(config *Config) *WebScraper {
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
var documents []*Document
visited := make(map[string]bool)
scheduled := make(map[string]bool)
contentHashes := make(map[string]bool)
var mu sync.Mutex
var scrapeErrors []string
// Parse base URL for domain restrictions
baseURL, err := url.Parse(source.URL)
if err != nil {
return nil, fmt.Errorf("invalid URL: %w", err)
}
allowedDomain := baseURL.Hostname()
if allowedDomain == "" {
allowedDomain = baseURL.Host
}
maxDepth := s.config.MaxDepth
if maxDepth <= 0 {
maxDepth = 2
}
maxPages := s.config.Concurrency * 40
if maxPages < 20 {
maxPages = 20
}
if maxDepth <= 1 && maxPages > 30 {
maxPages = 30
}
if maxPages > 300 {
maxPages = 300
}
scopePrefix := pathScopePrefix(baseURL.Path)
scopeLeaf := pathScopeLeaf(baseURL.Path)
// Create Colly collector
c := colly.NewCollector(
colly.AllowedDomains(baseURL.Host),
colly.MaxDepth(s.config.MaxDepth),
colly.AllowedDomains(allowedDomain),
colly.MaxDepth(maxDepth),
colly.Async(true),
colly.UserAgent(s.config.UserAgent),
)
@@ -65,18 +92,40 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
// Handle errors
c.OnError(func(r *colly.Response, err error) {
fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
errText := strings.ToLower(err.Error())
if strings.Contains(errText, "already visited") {
return
}
reqURL := source.URL
if r != nil && r.Request != nil && r.Request.URL != nil {
reqURL = r.Request.URL.String()
}
mu.Lock()
if len(scrapeErrors) < 20 {
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err))
}
mu.Unlock()
})
// Extract content from pages
c.OnHTML("html", func(e *colly.HTMLElement) {
pageURL := e.Request.URL.String()
if !withinScope(e.Request.URL, baseURL, scopePrefix, scopeLeaf) {
return
}
// Skip if already visited
mu.Lock()
if visited[pageURL] {
mu.Unlock()
return
}
if len(visited) >= maxPages {
mu.Unlock()
return
}
visited[pageURL] = true
mu.Unlock()
// Check include/exclude patterns
if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
@@ -99,6 +148,13 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
// Generate hash for change detection
hash := s.generateHash(content)
mu.Lock()
if contentHashes[hash] {
mu.Unlock()
return
}
contentHashes[hash] = true
mu.Unlock()
// Extract metadata
metadata := map[string]interface{}{
@@ -120,30 +176,74 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
Timestamp: time.Now(),
}
mu.Lock()
documents = append(documents, doc)
mu.Unlock()
})
// Follow links
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
link := e.Attr("href")
absoluteURL := e.Request.AbsoluteURL(link)
// Skip if already visited
if visited[absoluteURL] {
if absoluteURL == "" {
return
}
linkURL, err := url.Parse(absoluteURL)
if err != nil {
return
}
if linkURL.Scheme != "http" && linkURL.Scheme != "https" {
return
}
if !withinScope(linkURL, baseURL, scopePrefix, scopeLeaf) {
return
}
// Skip if already visited
mu.Lock()
if visited[absoluteURL] {
mu.Unlock()
return
}
if len(visited) >= maxPages {
mu.Unlock()
return
}
mu.Unlock()
// Check include/exclude patterns
if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
return
}
mu.Lock()
if scheduled[absoluteURL] {
mu.Unlock()
return
}
if len(scheduled) >= maxPages {
mu.Unlock()
return
}
scheduled[absoluteURL] = true
mu.Unlock()
if err := c.Visit(absoluteURL); err != nil {
fmt.Printf("Error visiting %s: %v\n", absoluteURL, err)
errText := strings.ToLower(err.Error())
if strings.Contains(errText, "already visited") {
return
}
mu.Lock()
if len(scrapeErrors) < 20 {
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err))
}
mu.Unlock()
}
})
// Start scraping
scheduled[source.URL] = true
if err := c.Visit(source.URL); err != nil {
return nil, fmt.Errorf("failed to start scraping: %w", err)
}
@@ -151,6 +251,16 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
// Wait for async scraping to complete
c.Wait()
mu.Lock()
defer mu.Unlock()
if len(documents) == 0 {
if len(scrapeErrors) > 0 {
return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; "))
}
return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
}
return documents, nil
}
@@ -283,6 +393,25 @@ func (s *WebScraper) generateHash(content string) string {
// cleanText removes extra whitespace and normalizes text.
func cleanText(text string) string {
noisePhrases := []string{
"table of contents",
"in this article",
"additional resources",
"feedback",
"collaborate with us on github",
"copyright",
"all rights reserved",
"privacy policy",
"terms of service",
"sign in",
"skip to main content",
"ask learn",
}
for _, phrase := range noisePhrases {
re := regexp.MustCompile(`(?i)` + regexp.QuoteMeta(phrase))
text = re.ReplaceAllString(text, " ")
}
// Replace multiple whitespace with single space
re := regexp.MustCompile(`\s+`)
text = re.ReplaceAllString(text, " ")
@@ -292,3 +421,58 @@ func cleanText(text string) string {
return text
}
// pathScopePrefix derives the path prefix a crawl should stay under, given
// the path of the start URL. It returns "" (no restriction) for an empty or
// root path, the parent directory when there is one below the root, and the
// cleaned path itself when the start path sits directly under the root.
func pathScopePrefix(rawPath string) string {
	cleaned := path.Clean(rawPath)
	switch cleaned {
	case "", ".", "/":
		return ""
	}

	parent := path.Dir(cleaned)
	looksLikeFile := strings.Contains(path.Base(cleaned), ".")
	switch {
	case parent != "/":
		return parent
	case looksLikeFile:
		// Root-level document page: keep crawler scoped to this page path.
		return cleaned
	default:
		// Root-level section: scope to the section itself.
		return cleaned
	}
}
// pathScopeLeaf returns the final path segment of rawPath when that segment
// contains a dot (i.e. looks like a file name), and "" otherwise.
func pathScopeLeaf(rawPath string) string {
	switch cleaned := path.Clean(rawPath); cleaned {
	case "", ".", "/":
		return ""
	default:
		if leaf := path.Base(cleaned); strings.Contains(leaf, ".") {
			return leaf
		}
		return ""
	}
}
// withinScope reports whether target may be crawled given the start URL base.
//
// target must be on the same host as base (case-insensitive). With an empty
// prefix any path on that host is in scope. Otherwise target's path must be
// prefix itself or lie underneath it on a path-segment boundary, or — when
// leaf is non-empty — end in the same file name as leaf (this admits pages
// that were redirected to a different directory). A nil target or base is
// never in scope.
func withinScope(target, base *url.URL, prefix, leaf string) bool {
	if target == nil || base == nil {
		return false
	}
	if !strings.EqualFold(target.Hostname(), base.Hostname()) {
		return false
	}
	if prefix == "" {
		return true
	}

	targetPath := target.Path
	if targetPath == "" {
		targetPath = "/"
	}

	// Compare whole path segments: a plain HasPrefix would also treat
	// "/docs-old/x" as being inside "/docs".
	dirPrefix := strings.TrimSuffix(prefix, "/") + "/"
	if targetPath == prefix || strings.HasPrefix(targetPath, dirPrefix) {
		return true
	}
	return leaf != "" && path.Base(targetPath) == leaf
}
+132
View File
@@ -0,0 +1,132 @@
package scraper
import (
"context"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
)
func TestWebScraper_ReturnsErrorWhenNothingFetched(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
http.NotFound(w, r)
}))
defer srv.Close()
s := NewWebScraper(&Config{
UserAgent: "DevourTest/1.0",
Timeout: 2 * time.Second,
Concurrency: 1,
MaxDepth: 1,
})
_, err := s.Scrape(context.Background(), &Source{
Name: "missing",
Type: SourceTypeWeb,
URL: srv.URL + "/missing",
})
if err == nil {
t.Fatal("expected error when web scrape yields no documents")
}
}
// TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent serves a page
// with almost no body text and expects an "extracted no documents" error.
func TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent(t *testing.T) {
	const tinyPage = `<html><head><title>Empty</title></head><body><p>tiny</p></body></html>`

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(tinyPage))
	}))
	defer server.Close()

	scraper := NewWebScraper(&Config{
		UserAgent:   "DevourTest/1.0",
		Timeout:     2 * time.Second,
		Concurrency: 1,
		MaxDepth:    1,
	})
	source := &Source{Name: "empty", Type: SourceTypeWeb, URL: server.URL}

	_, err := scraper.Scrape(context.Background(), source)
	switch {
	case err == nil:
		t.Fatal("expected error when page has no extractable docs")
	case !strings.Contains(err.Error(), "extracted no documents"):
		t.Fatalf("unexpected error message: %v", err)
	}
}
// TestWebScraper_AllowsRedirectedDocumentPath starts at a page that redirects
// to the same file name under a different directory and expects the
// redirected page to be scraped and reported under its final URL.
func TestWebScraper_AllowsRedirectedDocumentPath(t *testing.T) {
	const (
		startPath = "/core/Regexp.html"
		finalPath = "/3.4.1/Regexp.html"
	)
	body := strings.Repeat("ruby docs content ", 30)

	handler := func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == startPath {
			http.Redirect(w, r, finalPath, http.StatusFound)
			return
		}
		if r.URL.Path == finalPath {
			w.Header().Set("Content-Type", "text/html")
			_, _ = w.Write([]byte(`<html><head><title>Regexp</title></head><body><main>` + body + `</main></body></html>`))
			return
		}
		http.NotFound(w, r)
	}
	server := httptest.NewServer(http.HandlerFunc(handler))
	defer server.Close()

	scraper := NewWebScraper(&Config{
		UserAgent:   "DevourTest/1.0",
		Timeout:     2 * time.Second,
		Concurrency: 1,
		MaxDepth:    1,
	})
	source := &Source{
		Name: "ruby",
		Type: SourceTypeWeb,
		URL:  server.URL + startPath,
	}

	docs, err := scraper.Scrape(context.Background(), source)
	if err != nil {
		t.Fatalf("unexpected scrape error: %v", err)
	}
	if len(docs) == 0 {
		t.Fatal("expected redirected page to be scraped")
	}
	if got := docs[0].URL; !strings.Contains(got, finalPath) {
		t.Fatalf("expected final redirected URL, got %q", got)
	}
}
func TestWebScraper_GlobalWrapperNormalizesOutput(t *testing.T) {
content := strings.Repeat("docs content ", 20)
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><head><title>Regex Guide ¶ deprecated</title></head><body><main>` + content + `</main></body></html>`))
}))
defer srv.Close()
s := NewScraper(SourceTypeWeb, &Config{
UserAgent: "DevourTest/1.0",
Timeout: 2 * time.Second,
Concurrency: 1,
MaxDepth: 1,
})
if s == nil {
t.Fatal("expected web scraper")
}
docs, err := s.Scrape(context.Background(), &Source{
Name: "test",
Type: SourceTypeWeb,
URL: srv.URL,
})
if err != nil {
t.Fatalf("unexpected scrape error: %v", err)
}
if len(docs) == 0 {
t.Fatal("expected at least one document")
}
if docs[0].Title != "Regex Guide" {
t.Fatalf("expected normalized title, got %q", docs[0].Title)
}
}
+98
View File
@@ -0,0 +1,98 @@
package scraper
import (
"context"
"errors"
"fmt"
"net"
"strings"
"time"
)
// wrappedScraper adds retry and normalization behavior for all scraper implementations.
type wrappedScraper struct {
	// inner is the concrete scraper every call is delegated to.
	inner Scraper
}
// wrapScraper decorates inner with the wrappedScraper behavior. A nil inner
// yields nil so callers can keep treating "no scraper" as nil.
func wrapScraper(inner Scraper) Scraper {
	if inner != nil {
		return &wrappedScraper{inner: inner}
	}
	return nil
}
// Scrape delegates to the inner scraper and normalizes the documents it
// returns.
//
// On failure it makes up to two further attempts, 300ms apart, but only while
// the error is classified as retriable. After the very first failure it also
// tries once with a trailing slash appended to an http(s) URL that lacks one.
// If the context is cancelled while waiting between attempts, ctx.Err() is
// returned; otherwise the error from the most recent regular attempt is.
func (w *wrappedScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}

	const (
		maxRetries = 2
		retryDelay = 300 * time.Millisecond
	)

	for attempt := 0; ; attempt++ {
		docs, err := w.inner.Scrape(ctx, source)
		if err == nil {
			return NormalizeDocuments(docs), nil
		}

		// One fallback: add trailing slash for doc sites when URL path looks page-like.
		if attempt == 0 && strings.HasPrefix(source.URL, "http") && !strings.HasSuffix(source.URL, "/") {
			withSlash := *source
			withSlash.URL += "/"
			if altDocs, altErr := w.inner.Scrape(ctx, &withSlash); altErr == nil {
				return NormalizeDocuments(altDocs), nil
			}
		}

		if attempt >= maxRetries || !isRetriableScrapeError(err) {
			return nil, err
		}
		if !sleepWithContext(ctx, retryDelay) {
			return nil, ctx.Err()
		}
	}
}
// DetectChanges forwards to the inner scraper unchanged; no retry or
// normalization is applied here.
func (w *wrappedScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	changed, hash, err := w.inner.DetectChanges(ctx, source, lastHash)
	return changed, hash, err
}
// isRetriableScrapeError reports whether err looks transient. An error
// qualifies when its lower-cased message contains one of the known markers
// (timeouts, connection resets, EOF, HTTP 429/500/502/503/504) or when any
// error in its chain implements net.Error. A nil error is not retriable.
func isRetriableScrapeError(err error) bool {
	if err == nil {
		return false
	}

	msg := strings.ToLower(err.Error())
	for _, marker := range []string{
		"timeout",
		"temporarily unavailable",
		"connection reset",
		"eof",
		"http 429",
		"http 500",
		"http 502",
		"http 503",
		"http 504",
	} {
		if strings.Contains(msg, marker) {
			return true
		}
	}

	var netErr net.Error
	return errors.As(err, &netErr)
}
// sleepWithContext pauses for d unless ctx finishes first. It returns true
// when the full pause elapsed and false when ctx was done before that. The
// timer is always stopped before returning.
func sleepWithContext(ctx context.Context, d time.Duration) bool {
	timer := time.NewTimer(d)
	defer timer.Stop()

	select {
	case <-timer.C:
		return true
	case <-ctx.Done():
		return false
	}
}
+45
View File
@@ -0,0 +1,45 @@
package scraper
import (
"context"
"fmt"
"testing"
)
// flakyStubScraper is a test double whose Scrape can fail exactly once before
// succeeding.
type flakyStubScraper struct {
// failFirst makes the first Scrape call return an "HTTP 503" error.
failFirst bool
// calls counts Scrape invocations.
calls int
}
// Scrape counts the call, fails the very first one with "HTTP 503" when
// failFirst is set, and otherwise returns a single document whose title still
// carries the "¶ deprecated" suffix.
func (f *flakyStubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	f.calls++
	firstCall := f.calls == 1
	if firstCall && f.failFirst {
		return nil, fmt.Errorf("HTTP 503")
	}

	doc := &Document{
		Title:   "Example ¶ deprecated",
		Content: "ok",
		URL:     source.URL,
		Type:    "test",
	}
	return []*Document{doc}, nil
}
// DetectChanges always reports a change with the fixed hash "hash" and no
// error; its arguments are ignored.
func (f *flakyStubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
return true, "hash", nil
}
func TestWrappedScraper_RetriesAndNormalizes(t *testing.T) {
w := wrapScraper(&flakyStubScraper{failFirst: true})
docs, err := w.Scrape(context.Background(), &Source{URL: "https://example.com"})
if err != nil {
t.Fatalf("expected retry to succeed, got error: %v", err)
}
if len(docs) != 1 {
t.Fatalf("expected 1 document, got %d", len(docs))
}
if docs[0].Title != "Example" {
t.Fatalf("expected normalized title, got %q", docs[0].Title)
}
}
+528
View File
@@ -0,0 +1,528 @@
package search
import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"
	"unicode/utf8"

	"github.com/yourorg/devour/internal/config"
)
// Engine builds and queries an on-disk lexical index over the document files
// found under DocsDir.
type Engine struct {
// DocsDir is the root directory walked for .json, .md and .txt documents.
DocsDir string
// IndexDir is the directory holding the index file (indexFileName).
IndexDir string
// MetadataDir is the directory holding the index metadata file (metaFileName).
MetadataDir string
// SnippetLength is the maximum snippet size handed to bestSnippet per result.
SnippetLength int
}
// SearchOptions tunes a single Engine.Search call.
type SearchOptions struct {
// Limit caps the number of results; values <= 0 fall back to 5.
Limit int
// Threshold drops matches scoring below it when > 0; negative values are treated as 0.
Threshold float64
}
// Result is one search hit as returned by Engine.Search.
type Result struct {
// ID is the index entry identifier (a hash of the file path, see Rebuild).
ID string `json:"id"`
// DocID is the document's own id, or a derived hash when the file had none.
DocID string `json:"doc_id"`
Title string `json:"title"`
URL string `json:"url,omitempty"`
Type string `json:"type"`
Source string `json:"source,omitempty"`
// Path is the file the document was indexed from.
Path string `json:"path"`
// Score is the lexicalScore of the document for the query; higher is better.
Score float64 `json:"score"`
// Snippet is an excerpt of the content chosen by bestSnippet.
Snippet string `json:"snippet"`
// Meta carries extra per-result data; Search sets "length" (token count).
Meta map[string]any `json:"metadata,omitempty"`
}
// IndexStats summarizes the state of the persisted index.
type IndexStats struct {
// Documents is the number of indexed documents.
Documents int `json:"documents"`
// Tokens is the total token count; it is filled in by Rebuild only and is
// zero when EnsureIndexed reuses an existing index.
Tokens int `json:"tokens"`
// LastIndexedAt is the build time of the index.
LastIndexedAt time.Time `json:"last_indexed_at"`
IndexPath string `json:"index_path"`
MetadataPath string `json:"metadata_path"`
// SourceFileHash is the fingerprint of the document files computed by listDocFiles.
SourceFileHash string `json:"source_file_hash"`
}
// indexedDoc is one document as stored inside the persisted index.
type indexedDoc struct {
// ID is a hash of the source file path.
ID string `json:"id"`
// DocID is the id found in the file, or a hash of path and title when absent.
DocID string `json:"doc_id"`
Title string `json:"title"`
URL string `json:"url,omitempty"`
Type string `json:"type"`
Source string `json:"source,omitempty"`
Path string `json:"path"`
// Content is the document text with whitespace collapsed.
Content string `json:"content"`
// TermFreq maps each token of title, content, URL and type to its count.
TermFreq map[string]int `json:"term_freq"`
// Length is the sum of all counts in TermFreq.
Length int `json:"length"`
}
// persistedIndex is the JSON layout of the index file written by Rebuild and
// read back by Search.
type persistedIndex struct {
// Version is the index format version (indexVersion at build time).
Version string `json:"version"`
BuiltAt time.Time `json:"built_at"`
Docs []indexedDoc `json:"docs"`
}
// persistedMeta is the JSON layout of the metadata file that EnsureIndexed
// reads to decide whether the index is still current.
type persistedMeta struct {
Version string `json:"version"`
BuiltAt time.Time `json:"built_at"`
DocsDir string `json:"docs_dir"`
// SourceFileHash is the listDocFiles fingerprint the index was built from.
SourceFileHash string `json:"source_file_hash"`
DocCount int `json:"doc_count"`
}
// rawDoc is a document as read from disk by parseDocFile, either decoded from
// a structured JSON file or synthesized from a plain text/markdown file.
type rawDoc struct {
ID string `json:"id"`
Source string `json:"source"`
Type string `json:"type"`
Title string `json:"title"`
Content string `json:"content"`
URL string `json:"url,omitempty"`
Metadata map[string]any `json:"metadata,omitempty"`
}
const (
// indexFileName is the file name of the persisted index inside Engine.IndexDir.
indexFileName = "lexical_index.json"
// metaFileName is the file name of the index metadata inside Engine.MetadataDir.
metaFileName = "lexical_index_meta.json"
// indexVersion is written into both files as their format version.
indexVersion = "1"
)
// NewEngine creates an Engine from the storage and indexing sections of cfg.
// A non-positive configured snippet length is replaced by the default of 220.
// cfg must not be nil.
func NewEngine(cfg *config.Config) *Engine {
	engine := &Engine{
		DocsDir:       cfg.Storage.DocsDir,
		IndexDir:      cfg.Storage.IndexDir,
		MetadataDir:   cfg.Storage.MetadataDir,
		SnippetLength: 220,
	}
	if configured := cfg.Indexing.SnippetLength; configured > 0 {
		engine.SnippetLength = configured
	}
	return engine
}
// Rebuild re-indexes every document file under DocsDir and persists the
// result: the index goes to IndexDir/indexFileName and its metadata to
// MetadataDir/metaFileName (both directories are created when missing).
//
// Files that cannot be parsed or whose content is blank are skipped without
// failing the build. Cancellation of ctx is checked once per file. The
// returned stats describe the freshly written index.
func (e *Engine) Rebuild(ctx context.Context) (*IndexStats, error) {
if strings.TrimSpace(e.DocsDir) == "" {
return nil, fmt.Errorf("docs directory is required")
}
if err := os.MkdirAll(e.IndexDir, 0o755); err != nil {
return nil, err
}
if err := os.MkdirAll(e.MetadataDir, 0o755); err != nil {
return nil, err
}
// sourceHash fingerprints the current set of files; it is stored in the
// metadata so EnsureIndexed can detect changes later.
docFiles, sourceHash, err := e.listDocFiles()
if err != nil {
return nil, err
}
docs := make([]indexedDoc, 0, len(docFiles))
tokenCount := 0
for _, file := range docFiles {
// Non-blocking cancellation check between files.
select {
case <-ctx.Done():
return nil, ctx.Err()
default:
}
rd, err := parseDocFile(file)
if err != nil {
// Best effort: an unreadable file does not abort the rebuild.
continue
}
if strings.TrimSpace(rd.Content) == "" {
continue
}
// Prefer the document's own id; otherwise derive one from path and title.
id := rd.ID
if id == "" {
id = hashString(file + ":" + rd.Title)
}
// Title, URL and type are tokenized together with the content so they are searchable.
termFreq := frequency(tokenize(strings.Join([]string{rd.Title, rd.Content, rd.URL, rd.Type}, " ")))
length := 0
for _, v := range termFreq {
length += v
}
tokenCount += length
docs = append(docs, indexedDoc{
// The entry ID is derived from the file path; DocID keeps the document's id.
ID: hashString(file),
DocID: id,
Title: bestTitle(rd.Title, file),
URL: strings.TrimSpace(rd.URL),
Type: defaultString(strings.TrimSpace(rd.Type), "document"),
Source: strings.TrimSpace(rd.Source),
Path: file,
Content: collapseWhitespace(rd.Content),
TermFreq: termFreq,
Length: length,
})
}
index := persistedIndex{Version: indexVersion, BuiltAt: time.Now(), Docs: docs}
indexPath := filepath.Join(e.IndexDir, indexFileName)
if err := writeJSON(indexPath, index); err != nil {
return nil, err
}
// The metadata is written after the index and shares its build time.
meta := persistedMeta{
Version: indexVersion,
BuiltAt: index.BuiltAt,
DocsDir: e.DocsDir,
SourceFileHash: sourceHash,
DocCount: len(docs),
}
metaPath := filepath.Join(e.MetadataDir, metaFileName)
if err := writeJSON(metaPath, meta); err != nil {
return nil, err
}
return &IndexStats{
Documents: len(docs),
Tokens: tokenCount,
LastIndexedAt: index.BuiltAt,
IndexPath: indexPath,
MetadataPath: metaPath,
SourceFileHash: sourceHash,
}, nil
}
// EnsureIndexed makes sure a usable, current index exists and returns its
// stats, rebuilding only when necessary.
//
// A rebuild is triggered when the metadata file is missing or not valid JSON,
// when it was written for a different indexVersion, when the index file itself
// is missing, or when the fingerprint of the files under DocsDir differs from
// the one recorded at build time. Other I/O errors are returned as-is.
//
// When the existing index is reused the returned stats come from the metadata
// file, so Tokens is left at zero.
func (e *Engine) EnsureIndexed(ctx context.Context) (*IndexStats, error) {
	metaPath := filepath.Join(e.MetadataDir, metaFileName)
	indexPath := filepath.Join(e.IndexDir, indexFileName)

	raw, err := os.ReadFile(metaPath)
	if err != nil {
		if os.IsNotExist(err) {
			return e.Rebuild(ctx)
		}
		return nil, err
	}

	var meta persistedMeta
	if err := json.Unmarshal(raw, &meta); err != nil {
		return e.Rebuild(ctx)
	}
	// An index written by another format version cannot be trusted.
	if meta.Version != indexVersion {
		return e.Rebuild(ctx)
	}
	// The metadata can outlive the index file (e.g. the index directory was
	// cleaned); without this check Search would fail instead of recovering.
	if _, err := os.Stat(indexPath); err != nil {
		if os.IsNotExist(err) {
			return e.Rebuild(ctx)
		}
		return nil, err
	}

	_, sourceHash, err := e.listDocFiles()
	if err != nil {
		return nil, err
	}
	if sourceHash != meta.SourceFileHash {
		return e.Rebuild(ctx)
	}

	return &IndexStats{
		Documents:      meta.DocCount,
		LastIndexedAt:  meta.BuiltAt,
		IndexPath:      indexPath,
		MetadataPath:   metaPath,
		SourceFileHash: meta.SourceFileHash,
	}, nil
}
// Search runs a lexical query against the persisted index and returns the
// best matches together with the index stats.
//
// The index is brought up to date via EnsureIndexed first, then loaded from
// disk on every call. Documents are scored with lexicalScore, filtered by
// opts.Threshold, sorted by descending score (ties by ascending title) and cut
// to opts.Limit. An empty query is an error; a query that yields no tokens
// returns nil results with the stats.
func (e *Engine) Search(ctx context.Context, query string, opts SearchOptions) ([]Result, *IndexStats, error) {
query = strings.TrimSpace(query)
if query == "" {
return nil, nil, fmt.Errorf("query is required")
}
stats, err := e.EnsureIndexed(ctx)
if err != nil {
return nil, nil, err
}
indexPath := filepath.Join(e.IndexDir, indexFileName)
b, err := os.ReadFile(indexPath)
if err != nil {
return nil, nil, err
}
var idx persistedIndex
if err := json.Unmarshal(b, &idx); err != nil {
return nil, nil, err
}
// Normalize options: default limit of 5, no negative threshold.
limit := opts.Limit
if limit <= 0 {
limit = 5
}
threshold := opts.Threshold
if threshold < 0 {
threshold = 0
}
queryTokens := tokenize(query)
if len(queryTokens) == 0 {
// Nothing searchable survived tokenization (e.g. only 1-character words).
return nil, stats, nil
}
qFreq := frequency(queryTokens)
type scored struct {
doc indexedDoc
score float64
}
matches := make([]scored, 0)
for _, doc := range idx.Docs {
// Non-blocking cancellation check between documents.
select {
case <-ctx.Done():
return nil, nil, ctx.Err()
default:
}
score := lexicalScore(qFreq, queryTokens, doc)
if score <= 0 {
continue
}
if threshold > 0 && score < threshold {
continue
}
matches = append(matches, scored{doc: doc, score: score})
}
// Highest score first; equal scores are ordered by title.
sort.Slice(matches, func(i, j int) bool {
if matches[i].score == matches[j].score {
return matches[i].doc.Title < matches[j].doc.Title
}
return matches[i].score > matches[j].score
})
if limit > len(matches) {
limit = len(matches)
}
results := make([]Result, 0, limit)
for i := 0; i < limit; i++ {
d := matches[i].doc
results = append(results, Result{
ID: d.ID,
DocID: d.DocID,
Title: d.Title,
URL: d.URL,
Type: d.Type,
Source: d.Source,
Path: d.Path,
Score: matches[i].score,
Snippet: bestSnippet(d.Content, queryTokens, e.SnippetLength),
Meta: map[string]any{
"length": d.Length,
},
})
}
return results, stats, nil
}
// listDocFiles walks DocsDir and returns the sorted paths of all .json, .md
// and .txt files (extension matched case-insensitively) together with a
// fingerprint of that file set.
//
// The fingerprint is a SHA-256 over "path|size|mtime" lines in walk order, so
// it changes whenever a file is added, removed, resized or touched. When the
// walk fails with a not-exist error the result is an empty list with the fixed
// hash of "empty"; any other walk error is returned.
func (e *Engine) listDocFiles() ([]string, string, error) {
	paths := []string{}
	digest := sha256.New()

	visit := func(path string, entry fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if entry.IsDir() {
			return nil
		}
		if ext := strings.ToLower(filepath.Ext(path)); ext != ".json" && ext != ".md" && ext != ".txt" {
			return nil
		}
		info, err := entry.Info()
		if err != nil {
			return err
		}
		paths = append(paths, path)
		fmt.Fprintf(digest, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
		return nil
	}

	switch err := filepath.WalkDir(e.DocsDir, visit); {
	case err == nil:
		// Walk completed; fall through to the result below.
	case os.IsNotExist(err):
		return []string{}, hashString("empty"), nil
	default:
		return nil, "", err
	}

	sort.Strings(paths)
	return paths, hex.EncodeToString(digest.Sum(nil)), nil
}
// parseDocFile reads the file at path and turns it into a rawDoc based on its
// (case-insensitive) extension:
//
//   - .json: decoded as a structured rawDoc when that yields a title or
//     content; otherwise the raw bytes become the content of a "json" document
//     titled with the file name;
//   - .md: a "markdown" document titled with its first heading (may be empty);
//   - anything else: a "text" document titled with the file name.
//
// Synthesized documents get the source "local". Only read errors are returned.
func parseDocFile(path string) (*rawDoc, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	text := string(data)
	ext := strings.ToLower(filepath.Ext(path))

	if ext == ".md" {
		return &rawDoc{Title: markdownTitle(text), Content: text, Type: "markdown", Source: "local"}, nil
	}
	if ext == ".json" {
		var structured rawDoc
		if json.Unmarshal(data, &structured) == nil && (structured.Title != "" || structured.Content != "") {
			return &structured, nil
		}
		// Not a structured doc JSON, index as raw text fallback.
		return &rawDoc{Title: filepath.Base(path), Content: text, Type: "json", Source: "local"}, nil
	}
	return &rawDoc{Title: filepath.Base(path), Content: text, Type: "text", Source: "local"}, nil
}
// markdownTitle returns the text of the first line that starts with "#" and
// has something left after the leading "#" characters and surrounding
// whitespace are removed. It returns "" when no such line exists.
func markdownTitle(content string) string {
	lines := strings.Split(content, "\n")
	for i := range lines {
		candidate := strings.TrimSpace(lines[i])
		if !strings.HasPrefix(candidate, "#") {
			continue
		}
		if heading := strings.TrimSpace(strings.TrimLeft(candidate, "#")); heading != "" {
			return heading
		}
	}
	return ""
}
// writeJSON encodes v as indented JSON and stores it at path atomically.
//
// The data is first written to a temporary file in the same directory and then
// renamed over path, so a reader (or a crash) never observes a half-written
// file. The temporary file uses a ".tmp" suffix and is removed on any failure.
// The final file is given mode 0644. The target directory must already exist.
func writeJSON(path string, v any) error {
	data, err := json.MarshalIndent(v, "", " ")
	if err != nil {
		return err
	}

	tmp, err := os.CreateTemp(filepath.Dir(path), filepath.Base(path)+".*.tmp")
	if err != nil {
		return err
	}
	tmpName := tmp.Name()
	committed := false
	defer func() {
		if !committed {
			// Best-effort cleanup; the original error is what matters.
			_ = os.Remove(tmpName)
		}
	}()

	if _, err := tmp.Write(data); err != nil {
		_ = tmp.Close()
		return err
	}
	// CreateTemp uses 0600; match the permissions of a regular index file.
	if err := tmp.Chmod(0o644); err != nil {
		_ = tmp.Close()
		return err
	}
	if err := tmp.Close(); err != nil {
		return err
	}
	if err := os.Rename(tmpName, path); err != nil {
		return err
	}
	committed = true
	return nil
}
// tokenSeparators maps punctuation and control whitespace to spaces before
// splitting. It is built once; a strings.Replacer is safe for concurrent use.
var tokenSeparators = strings.NewReplacer(",", " ", ".", " ", "(", " ", ")", " ", "[", " ", "]", " ", "{", " ", "}", " ",
	":", " ", ";", " ", "!", " ", "?", " ", "\n", " ", "\r", " ", "\t", " ", "\"", " ", "'", " ", "`", " ",
)

// tokenize splits input into lower-case search tokens.
//
// The characters handled by tokenSeparators are treated as separators, the
// text is lower-cased and split on whitespace, and tokens shorter than two
// bytes are dropped. Characters such as "/" and "-" are kept, so "net/http"
// stays a single token. The result is never nil.
func tokenize(input string) []string {
	fields := strings.Fields(strings.ToLower(tokenSeparators.Replace(input)))
	tokens := make([]string, 0, len(fields))
	for _, field := range fields {
		if len(field) < 2 {
			continue
		}
		tokens = append(tokens, field)
	}
	return tokens
}
// frequency counts how often each token occurs. The returned map is never
// nil, even for an empty input.
func frequency(tokens []string) map[string]int {
	counts := make(map[string]int, len(tokens))
	for i := range tokens {
		counts[tokens[i]] += 1
	}
	return counts
}
// lexicalScore rates how well doc matches a query; 0 means no match.
//
// For every query term present in the document it adds
// (docCount*queryCount / docLength) * 8, plus 2.5 when the lower-cased title
// contains the term and 1.2 when the lower-cased URL does. A multi-token query
// whose tokens, joined by single spaces, occur verbatim in the lower-cased
// content earns a further 1.5. Documents without term frequencies score 0.
func lexicalScore(qFreq map[string]int, queryTokens []string, doc indexedDoc) float64 {
	if len(doc.TermFreq) == 0 {
		return 0
	}

	title := strings.ToLower(doc.Title)
	link := strings.ToLower(doc.URL)

	var total float64
	for term, inQuery := range qFreq {
		inDoc := doc.TermFreq[term]
		if inDoc == 0 {
			continue
		}
		part := float64(inDoc*inQuery) / float64(max(1, doc.Length))
		total += part * 8.0
		if strings.Contains(title, term) {
			total += 2.5
		}
		if strings.Contains(link, term) {
			total += 1.2
		}
	}

	// Exact phrase bonus; only meaningful for queries of two or more tokens.
	if len(queryTokens) > 1 {
		phrase := strings.Join(queryTokens, " ")
		if strings.Contains(strings.ToLower(doc.Content), phrase) {
			total += 1.5
		}
	}
	return total
}
// bestSnippet returns an excerpt of content of at most maxLen bytes (220 when
// maxLen <= 0), positioned around the first query token that occurs in it.
//
// Whitespace is collapsed first; content that already fits is returned whole.
// Otherwise the window starts maxLen/4 bytes before the first matching token
// (or at the beginning when none matches), and "..." is appended when text
// follows the window. Both window edges are moved onto UTF-8 rune boundaries
// so the snippet never contains a split multi-byte character.
func bestSnippet(content string, queryTokens []string, maxLen int) string {
	if maxLen <= 0 {
		maxLen = 220
	}
	flat := collapseWhitespace(content)
	if len(flat) <= maxLen {
		return flat
	}

	lower := strings.ToLower(flat)
	if len(lower) != len(flat) {
		// Lower-casing changed byte lengths, so offsets found in lower would
		// not line up with flat (and could even exceed it). Match on the
		// original text instead.
		lower = flat
	}

	start := 0
	for _, tok := range queryTokens {
		if idx := strings.Index(lower, tok); idx >= 0 {
			start = idx - (maxLen / 4)
			if start < 0 {
				start = 0
			}
			break
		}
	}
	// Do not begin in the middle of a multi-byte rune.
	for start < len(flat) && !utf8.RuneStart(flat[start]) {
		start++
	}

	end := start + maxLen
	if end >= len(flat) {
		end = len(flat)
	} else {
		// Do not end in the middle of a multi-byte rune either.
		for end > start && !utf8.RuneStart(flat[end]) {
			end--
		}
	}

	snippet := strings.TrimSpace(flat[start:end])
	if end < len(flat) {
		snippet += "..."
	}
	return snippet
}

// collapseWhitespace trims s and replaces every run of whitespace with a
// single space.
func collapseWhitespace(s string) string {
	return strings.Join(strings.Fields(s), " ")
}
// bestTitle picks a display title: the trimmed title when it is non-empty,
// otherwise the file name of path without its extension and with underscores
// turned into spaces, and finally "Documentation" when that is empty too.
func bestTitle(title, path string) string {
	if trimmed := strings.TrimSpace(title); trimmed != "" {
		return trimmed
	}

	name := filepath.Base(path)
	// filepath.Ext always returns a suffix of name, so slicing removes it.
	stem := name[:len(name)-len(filepath.Ext(name))]
	stem = strings.TrimSpace(strings.ReplaceAll(stem, "_", " "))
	if stem != "" {
		return stem
	}
	return "Documentation"
}
// defaultString returns v unchanged unless it is empty or whitespace-only, in
// which case fallback is returned.
func defaultString(v, fallback string) string {
	if strings.TrimSpace(v) != "" {
		return v
	}
	return fallback
}
// hashString returns the first 12 bytes of the SHA-256 digest of s as a
// 24-character lower-case hex string.
func hashString(s string) string {
	digest := sha256.New()
	digest.Write([]byte(s))
	return hex.EncodeToString(digest.Sum(nil)[:12])
}
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
+56
View File
@@ -0,0 +1,56 @@
package search
import (
"context"
"encoding/json"
"os"
"path/filepath"
"testing"
"github.com/yourorg/devour/internal/config"
)
// TestEngineRebuildAndSearch writes one structured JSON document into a
// temporary docs directory, rebuilds the index and verifies that a query
// matching the document returns it as the top hit.
func TestEngineRebuildAndSearch(t *testing.T) {
	tmp := t.TempDir()
	docsDir := filepath.Join(tmp, "docs")
	indexDir := filepath.Join(tmp, "index")
	metaDir := filepath.Join(tmp, "metadata")
	if err := os.MkdirAll(docsDir, 0o755); err != nil {
		t.Fatal(err)
	}

	doc := map[string]any{
		"id":      "1",
		"title":   "HTTP Client",
		"content": "Use net/http client with timeout",
		"type":    "go-doc",
		"source":  "go",
		"url":     "https://pkg.go.dev/net/http",
	}
	b, err := json.Marshal(doc)
	if err != nil {
		t.Fatalf("marshal fixture: %v", err)
	}
	if err := os.WriteFile(filepath.Join(docsDir, "doc.json"), b, 0o644); err != nil {
		t.Fatal(err)
	}

	cfg := config.Default()
	cfg.Storage.DocsDir = docsDir
	cfg.Storage.IndexDir = indexDir
	cfg.Storage.MetadataDir = metaDir
	e := NewEngine(cfg)

	stats, err := e.Rebuild(context.Background())
	if err != nil {
		t.Fatalf("rebuild failed: %v", err)
	}
	if stats.Documents == 0 {
		t.Fatal("expected documents in index")
	}

	results, _, err := e.Search(context.Background(), "http timeout", SearchOptions{Limit: 5})
	if err != nil {
		t.Fatalf("search failed: %v", err)
	}
	if len(results) == 0 {
		t.Fatal("expected at least one search result")
	}
	// The fixture is the only document, so it must be the top hit.
	if results[0].Title != "HTTP Client" {
		t.Fatalf("expected top hit %q, got %q", "HTTP Client", results[0].Title)
	}
}
+167 -5
View File
@@ -2,7 +2,16 @@
package server
import (
"bufio"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"strings"
"sync"
"time"
)
// Config holds server configuration.
@@ -11,8 +20,13 @@ type Config struct {
Transport string `yaml:"transport"`
Host string `yaml:"host"`
Port int `yaml:"port"`
Handler MethodHandler `yaml:"-"`
}
// MethodHandler executes one server method. It receives the method name and
// the still-encoded params of the request; handleRPC places the returned value
// in the response's "result", or turns a non-nil error into a JSON-RPC error
// with code -32000.
type MethodHandler func(ctx context.Context, method string, params json.RawMessage) (any, error)
// Server defines the MCP server interface.
type Server interface {
// Start begins listening for connections.
@@ -47,9 +61,28 @@ type Result struct {
Metadata map[string]any `json:"metadata,omitempty"`
}
// rpcRequest is the wire format of an incoming JSON-RPC request.
type rpcRequest struct {
JSONRPC string `json:"jsonrpc"`
// ID is echoed back verbatim in the response.
ID any `json:"id"`
// Method selects the operation; an empty method is rejected by handleRPC.
Method string `json:"method"`
// Params is kept encoded and handed to the MethodHandler as-is.
Params json.RawMessage `json:"params,omitempty"`
}
// rpcResponse is the wire format of a JSON-RPC response; handleRPC fills
// either Result or Error.
type rpcResponse struct {
JSONRPC string `json:"jsonrpc"`
ID any `json:"id"`
Result any `json:"result,omitempty"`
Error *rpcError `json:"error,omitempty"`
}
// rpcError is the error object of a JSON-RPC response. Codes used in this
// file: -32700 (parse error), -32600 (invalid request), -32000 (handler error).
type rpcError struct {
Code int `json:"code"`
Message string `json:"message"`
}
// NewServer creates a new MCP server.
func NewServer(config *Config) Server {
if config.Mode == "remote" {
if strings.EqualFold(config.Mode, "remote") {
return NewHTTPServer(config)
}
return NewStdioServer(config)
@@ -68,27 +101,156 @@ func NewStdioServer(config *Config) *StdioServer {
// HTTPServer implements Server for HTTP transport.
type HTTPServer struct {
// config supplies host, port and the method handler.
config *Config
// http is the running net/http server; it is set by Start and nil before that.
http *http.Server
// mu is locked by Start and Stop around their use of the http field.
mu sync.Mutex
}
// Start serves the JSON-RPC API over HTTP and blocks until ctx is cancelled,
// Stop shuts the server down, or the listener fails.
//
// Endpoints:
//   - GET  /health returns {"ok":true};
//   - POST /rpc    decodes one JSON-RPC request (body limited to 2 MiB) and
//     answers with the result of the configured handler.
//
// Host defaults to "localhost" and port to 8080. On cancellation the server is
// shut down gracefully (5s budget) and ctx.Err() is returned; a shutdown via
// Stop returns nil. The mutex is held only while publishing the server so that
// Stop is never blocked by a running Start.
func (s *HTTPServer) Start(ctx context.Context) error {
	if s.config == nil {
		return fmt.Errorf("server config is required")
	}
	if s.config.Handler == nil {
		return fmt.Errorf("server handler is required")
	}

	mux := http.NewServeMux()
	mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = io.WriteString(w, `{"ok":true}`)
	})
	mux.HandleFunc("/rpc", func(w http.ResponseWriter, r *http.Request) {
		if r.Method != http.MethodPost {
			w.WriteHeader(http.StatusMethodNotAllowed)
			return
		}
		defer r.Body.Close()
		var req rpcRequest
		if err := json.NewDecoder(io.LimitReader(r.Body, 2<<20)).Decode(&req); err != nil {
			writeRPC(w, rpcResponse{JSONRPC: "2.0", Error: &rpcError{Code: -32700, Message: "parse error"}})
			return
		}
		resp := s.handleRPC(r.Context(), req)
		writeRPC(w, resp)
	})

	host := s.config.Host
	if host == "" {
		host = "localhost"
	}
	port := s.config.Port
	if port == 0 {
		port = 8080
	}

	srv := &http.Server{
		Addr:    fmt.Sprintf("%s:%d", host, port),
		Handler: mux,
		// Bound the time a client may take to send its request headers.
		ReadHeaderTimeout: 10 * time.Second,
	}
	s.mu.Lock()
	s.http = srv
	s.mu.Unlock()

	// Buffered so the goroutine can exit even when nobody reads the result.
	errCh := make(chan error, 1)
	go func() {
		errCh <- srv.ListenAndServe()
	}()

	select {
	case <-ctx.Done():
		shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		_ = srv.Shutdown(shutdownCtx)
		return ctx.Err()
	case err := <-errCh:
		if err != nil && err != http.ErrServerClosed {
			return err
		}
		return nil
	}
}
// Stop gracefully shuts down the HTTP server created by Start, waiting for
// in-flight requests until ctx expires. It is a no-op when Start has not
// created a server yet. The mutex is released before the (potentially slow)
// shutdown begins.
func (s *HTTPServer) Stop(ctx context.Context) error {
	s.mu.Lock()
	srv := s.http
	s.mu.Unlock()

	if srv == nil {
		return nil
	}
	return srv.Shutdown(ctx)
}
// handleRPC dispatches req to the configured handler via the package-level
// handleRPC and returns the resulting response.
func (s *HTTPServer) handleRPC(ctx context.Context, req rpcRequest) rpcResponse {
return handleRPC(ctx, s.config.Handler, req)
}
// StdioServer implements Server for stdio transport.
type StdioServer struct {
// config supplies the method handler.
config *Config
// mu is locked by Start and Stop.
mu sync.Mutex
// stop is set by Stop and checked by Start's read loop.
stop bool
}
// Start runs a line-delimited JSON-RPC loop: every non-blank line read from
// stdin is decoded as one request and its response is written to stdout as one
// JSON line. Lines that are not valid JSON are answered with a -32700 parse
// error.
//
// The loop ends when stdin is exhausted, or, checked after each line is
// read, when ctx is done or Stop has been called; in all of these cases nil
// is returned unless reading failed. A blocked read on stdin is not
// interrupted. Requests of up to 2 MiB per line are accepted. The mutex is
// held only while reading the stop flag so that Stop can run concurrently.
func (s *StdioServer) Start(ctx context.Context) error {
	if s.config == nil {
		return fmt.Errorf("server config is required")
	}
	if s.config.Handler == nil {
		return fmt.Errorf("server handler is required")
	}

	scanner := bufio.NewScanner(os.Stdin)
	// The default 64 KiB token limit would abort the loop on larger requests.
	scanner.Buffer(make([]byte, 0, 64*1024), 2<<20)
	out := json.NewEncoder(os.Stdout)

	for scanner.Scan() {
		s.mu.Lock()
		stopped := s.stop
		s.mu.Unlock()
		if ctx.Err() != nil || stopped {
			break
		}

		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}
		var req rpcRequest
		if err := json.Unmarshal([]byte(line), &req); err != nil {
			// Best effort: a failed error reply must not end the session.
			_ = out.Encode(rpcResponse{JSONRPC: "2.0", Error: &rpcError{Code: -32700, Message: "parse error"}})
			continue
		}
		resp := handleRPC(ctx, s.config.Handler, req)
		if err := out.Encode(resp); err != nil {
			return err
		}
	}
	if err := scanner.Err(); err != nil {
		return err
	}
	return nil
}
// Stop sets the stop flag under the mutex and always returns nil. The flag is
// only examined by Start's read loop; ctx is not used.
func (s *StdioServer) Stop(ctx context.Context) error {
s.mu.Lock()
defer s.mu.Unlock()
s.stop = true
return nil
}
// handleRPC executes one decoded request with handler and builds the
// response. The response always carries version "2.0" and the request's ID.
// A request without a method yields error -32600 ("invalid request"); a
// handler error yields error -32000 with the error text as message; otherwise
// the handler's value becomes the result.
func handleRPC(ctx context.Context, handler MethodHandler, req rpcRequest) rpcResponse {
	resp := rpcResponse{JSONRPC: "2.0", ID: req.ID}

	if req.Method == "" {
		resp.Error = &rpcError{Code: -32600, Message: "invalid request"}
		return resp
	}

	result, err := handler(ctx, req.Method, req.Params)
	if err != nil {
		resp.Error = &rpcError{Code: -32000, Message: err.Error()}
		return resp
	}

	resp.Result = result
	return resp
}
// writeRPC sends payload as a JSON HTTP response. Responses that carry an
// error are sent with status 400; all others use the default status. Encoding
// failures are ignored.
func writeRPC(w http.ResponseWriter, payload rpcResponse) {
	w.Header().Set("Content-Type", "application/json")
	if failed := payload.Error != nil; failed {
		w.WriteHeader(http.StatusBadRequest)
	}
	encoder := json.NewEncoder(w)
	_ = encoder.Encode(payload)
}
+149
View File
@@ -0,0 +1,149 @@
package storage
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
"github.com/yourorg/devour/internal/markdown"
"github.com/yourorg/devour/internal/scraper"
)
// SaveOptions controls how SaveDocuments writes documents to disk.
type SaveOptions struct {
// Format is "json" or "markdown" (case-insensitive); empty means "json".
Format string
// OutputDir is the target directory; it is required and created when missing.
OutputDir string
// AllowEmpty makes an empty document set a successful no-op instead of an error.
AllowEmpty bool
// PrintWriter, when set, is called with a printf-style line for every file written.
PrintWriter func(string, ...any)
}
// SaveResult reports what SaveDocuments wrote.
type SaveResult struct {
// Count is the number of files written (len(Files)).
Count int
// Files lists the paths of the written files in write order.
Files []string
}
// slugUnsafe matches every run of characters other than lower-case ASCII
// letters, digits, '.', '_' and '-'; slugify replaces each run with "-".
var slugUnsafe = regexp.MustCompile(`[^a-z0-9._-]+`)
// SaveDocuments writes each non-nil document to its own file in
// opts.OutputDir, as indented JSON or as markdown with a table of contents,
// and returns the written paths.
//
// File names are derived from the slugified title (or "document_<index>" when
// the title is blank) and disambiguated by uniqueName. An empty input, or one
// consisting only of nil entries, is an error unless opts.AllowEmpty is set.
// The first marshalling or write failure aborts the operation; files written
// before it are left in place.
func SaveDocuments(docs []*scraper.Document, opts SaveOptions) (*SaveResult, error) {
if len(docs) == 0 {
if opts.AllowEmpty {
return &SaveResult{}, nil
}
return nil, fmt.Errorf("no documents scraped")
}
// Normalize and validate the requested format; json is the default.
format := strings.ToLower(strings.TrimSpace(opts.Format))
if format == "" {
format = "json"
}
if format != "json" && format != "markdown" {
return nil, fmt.Errorf("unsupported format: %s", opts.Format)
}
if strings.TrimSpace(opts.OutputDir) == "" {
return nil, fmt.Errorf("output directory is required")
}
if err := os.MkdirAll(opts.OutputDir, 0o755); err != nil {
return nil, err
}
// used tracks file names handed out in this call so repeated titles do not collide.
used := map[string]int{}
files := make([]string, 0, len(docs))
formatter := markdown.NewFormatter()
for i, doc := range docs {
if doc == nil {
continue
}
base := slugify(defaultTitle(doc.Title, i))
ext := ".json"
if format == "markdown" {
ext = ".md"
}
name := uniqueName(base, ext, used, doc.ID)
path := filepath.Join(opts.OutputDir, name)
var b []byte
var err error
if format == "markdown" {
// Copy the scraped document field by field into the markdown package's type.
md := &markdown.Document{
ID: doc.ID,
Source: doc.Source,
Type: doc.Type,
Title: doc.Title,
Content: doc.Content,
URL: doc.URL,
Metadata: doc.Metadata,
Hash: doc.Hash,
Timestamp: doc.Timestamp,
}
b = []byte(formatter.FormatWithTOC(md))
} else {
b, err = json.MarshalIndent(doc, "", " ")
if err != nil {
return nil, err
}
}
if err := os.WriteFile(path, b, 0o644); err != nil {
return nil, err
}
files = append(files, path)
if opts.PrintWriter != nil {
opts.PrintWriter(" 📄 %s (%s)\n", filepath.Base(path), doc.Type)
}
}
// All entries were nil: treat like an empty input.
if len(files) == 0 && !opts.AllowEmpty {
return nil, fmt.Errorf("no documents scraped")
}
return &SaveResult{Count: len(files), Files: files}, nil
}
// defaultTitle returns the trimmed title, or "document_<idx>" when the title
// is empty or whitespace-only.
func defaultTitle(title string, idx int) string {
	if trimmed := strings.TrimSpace(title); trimmed != "" {
		return trimmed
	}
	return fmt.Sprintf("document_%d", idx)
}
// slugify turns name into a file-name-safe slug.
//
// The name is trimmed and lower-cased; spaces, slashes, backslashes and colons
// become "-", "?" is dropped and "&" becomes "and". Every remaining run of
// characters matched by slugUnsafe is replaced with "-", leading and trailing
// "-" and "." are removed, and the result is cut to at most 80 bytes (trimmed
// again). An empty result is replaced by "document".
func slugify(name string) string {
	slug := strings.ToLower(strings.TrimSpace(name))
	// Single-character rules whose outputs never feed another rule, so one
	// pass is equivalent to applying them one after another.
	slug = strings.NewReplacer(
		" ", "-",
		"/", "-",
		"\\", "-",
		":", "-",
		"?", "",
		"&", "and",
	).Replace(slug)
	slug = strings.Trim(slugUnsafe.ReplaceAllString(slug, "-"), "-.")

	if len(slug) > 80 {
		slug = strings.Trim(slug[:80], "-.")
	}
	if slug == "" {
		return "document"
	}
	return slug
}
// uniqueName returns a file name for base+ext that has not been handed out
// before according to used, and records the returned name in used.
//
// The first request for a given base+ext gets exactly that name. Later
// requests get "<base>-<id prefix><ext>" when id provides at least 8 safe
// characters and that name is still free, and otherwise "<base>-<n><ext>",
// where n starts at the number of requests seen for this base+ext and is
// increased until the name is free. used must be non-nil; it maps names to
// counters and is shared across calls for one output directory.
func uniqueName(base, ext string, used map[string]int, id string) string {
	key := base + ext
	if used[key] == 0 {
		used[key] = 1
		return key
	}
	used[key]++

	if prefix := safeIDPrefix(id); prefix != "" {
		candidate := fmt.Sprintf("%s-%s%s", base, prefix, ext)
		if used[candidate] == 0 {
			used[candidate] = 1
			return candidate
		}
	}

	// Numeric fallback; skip suffixes whose names are already taken.
	for n := used[key]; ; n++ {
		candidate := fmt.Sprintf("%s-%d%s", base, n, ext)
		if used[candidate] == 0 {
			used[candidate] = 1
			used[key] = n
			return candidate
		}
	}
}

// safeIDPrefix returns the first 8 file-name-safe characters of id (ASCII
// letters, digits, '.', '_' and '-'), or "" when fewer than 8 remain after
// unsafe characters such as path separators have been removed.
func safeIDPrefix(id string) string {
	clean := strings.Map(func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9',
			r == '.', r == '_', r == '-':
			return r
		}
		return -1
	}, strings.TrimSpace(id))
	if len(clean) < 8 {
		return ""
	}
	return clean[:8]
}