mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
update
This commit is contained in:
@@ -0,0 +1,368 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// Config is the typed application configuration loaded from devour.yaml.
|
||||
type Config struct {
|
||||
Version int `yaml:"version"`
|
||||
Storage StorageConfig `yaml:"storage"`
|
||||
Embeddings EmbeddingsConfig `yaml:"embeddings"`
|
||||
VectorDB VectorDBConfig `yaml:"vector_db"`
|
||||
Scraper ScraperConfig `yaml:"scraper"`
|
||||
Scheduler SchedulerConfig `yaml:"scheduler"`
|
||||
Server ServerConfig `yaml:"server"`
|
||||
Indexing IndexingConfig `yaml:"indexing"`
|
||||
Verification VerificationConfig `yaml:"verification"`
|
||||
Sources []SourceConfig `yaml:"sources"`
|
||||
|
||||
ConfigPath string `yaml:"-"`
|
||||
}
|
||||
|
||||
// StorageConfig is the "storage" section of the configuration: the four
// directories that Config.EnsureStorageDirs creates. Default and
// ApplyDefaults place all of them under ./devour_data when left empty.
type StorageConfig struct {
	DocsDir     string `yaml:"docs_dir"`
	IndexDir    string `yaml:"index_dir"`
	MetadataDir string `yaml:"metadata_dir"`
	CacheDir    string `yaml:"cache_dir"`
}
|
||||
|
||||
// EmbeddingsConfig is the "embeddings" section of the configuration.
//
// Default sets APIKey to the literal placeholder "${OPENAI_API_KEY}"; nothing
// in this file expands that placeholder. BaseURL has no default value.
type EmbeddingsConfig struct {
	Provider   string `yaml:"provider"`
	Model      string `yaml:"model"`
	Dimensions int    `yaml:"dimensions"`
	APIKey     string `yaml:"api_key"`
	BatchSize  int    `yaml:"batch_size"`
	BaseURL    string `yaml:"base_url"`
}
|
||||
|
||||
// VectorDBConfig is the "vector_db" section of the configuration.
// ApplyDefaults fills Type ("memory") and SimilarityMetric ("cosine") when
// they are empty; Persist and PersistDir are left exactly as decoded.
type VectorDBConfig struct {
	Type             string `yaml:"type"`
	Persist          bool   `yaml:"persist"`
	SimilarityMetric string `yaml:"similarity_metric"`
	PersistDir       string `yaml:"persist_dir"`
}
|
||||
|
||||
// ScraperConfig is the "scraper" section of the configuration.
//
// ApplyDefaults replaces non-positive Timeout, RetryCount, RetryDelay,
// Concurrency and MaxDepth with their defaults, clamps a negative RateLimit
// to zero, and falls back to Storage.CacheDir when CacheDir is empty.
type ScraperConfig struct {
	UserAgent   string        `yaml:"user_agent"`
	Timeout     time.Duration `yaml:"timeout"`
	RetryCount  int           `yaml:"retry_count"`
	RetryDelay  time.Duration `yaml:"retry_delay"`
	Concurrency int           `yaml:"concurrency"`
	RateLimit   time.Duration `yaml:"rate_limit"`
	MaxDepth    int           `yaml:"max_depth"`
	CacheDir    string        `yaml:"cache_dir"`
}
|
||||
|
||||
// SchedulerConfig is the "scheduler" section of the configuration.
// ApplyDefaults substitutes 72h for a non-positive Interval and "hash" for an
// empty CheckMethod; the two booleans are kept exactly as decoded.
type SchedulerConfig struct {
	Enabled     bool          `yaml:"enabled"`
	Interval    time.Duration `yaml:"interval"`
	CheckMethod string        `yaml:"check_method"`
	OnStartup   bool          `yaml:"on_startup"`
}
|
||||
|
||||
// ServerConfig is the "server" section of the configuration. The defaults are
// Mode "local", Transport "stdio", Host "localhost" and Port 8080;
// RenderInitYAML switches Mode to "remote" when asked to.
type ServerConfig struct {
	Mode      string `yaml:"mode"`
	Transport string `yaml:"transport"`
	Host      string `yaml:"host"`
	Port      int    `yaml:"port"`
}
|
||||
|
||||
// IndexingConfig is the "indexing" section of the configuration.
// ApplyDefaults never touches the two booleans (an explicit false is kept) but
// replaces non-positive SnippetLength and MaxDocs with 220 and 10000.
type IndexingConfig struct {
	Enabled       bool `yaml:"enabled"`
	AutoReindex   bool `yaml:"auto_reindex"`
	SnippetLength int  `yaml:"snippet_length"`
	MaxDocs       int  `yaml:"max_docs"`
}
|
||||
|
||||
// VerificationConfig is the "verification" section of the configuration.
// ApplyDefaults replaces a non-positive Timeout with 90 seconds.
type VerificationConfig struct {
	Enabled bool          `yaml:"enabled"`
	Timeout time.Duration `yaml:"timeout"`
}
|
||||
|
||||
// SourceConfig is one entry of the top-level "sources" list. Only Name and
// Type are always written to YAML; every other field carries omitempty and is
// left out when it holds its zero value.
type SourceConfig struct {
	Name        string   `yaml:"name"`
	Type        string   `yaml:"type"`
	URL         string   `yaml:"url,omitempty"`
	Query       string   `yaml:"query,omitempty"`
	ResultLimit int      `yaml:"result_limit,omitempty"`
	Domains     []string `yaml:"domains,omitempty"`
	Repo        string   `yaml:"repo,omitempty"`
	Branch      string   `yaml:"branch,omitempty"`
	Path        string   `yaml:"path,omitempty"`
	Include     []string `yaml:"include,omitempty"`
	Exclude     []string `yaml:"exclude,omitempty"`
	Schedule    string   `yaml:"schedule,omitempty"`
}
|
||||
|
||||
// Default returns a default configuration that matches devour init behavior.
|
||||
func Default() *Config {
|
||||
return &Config{
|
||||
Version: 1,
|
||||
Storage: StorageConfig{
|
||||
DocsDir: "./devour_data/docs",
|
||||
IndexDir: "./devour_data/index",
|
||||
MetadataDir: "./devour_data/metadata",
|
||||
CacheDir: "./devour_data/cache",
|
||||
},
|
||||
Embeddings: EmbeddingsConfig{
|
||||
Provider: "openai",
|
||||
Model: "text-embedding-3-small",
|
||||
Dimensions: 1536,
|
||||
BatchSize: 100,
|
||||
APIKey: "${OPENAI_API_KEY}",
|
||||
},
|
||||
VectorDB: VectorDBConfig{
|
||||
Type: "memory",
|
||||
Persist: true,
|
||||
SimilarityMetric: "cosine",
|
||||
},
|
||||
Scraper: ScraperConfig{
|
||||
UserAgent: "Devour/1.0",
|
||||
Timeout: 30 * time.Second,
|
||||
RetryCount: 3,
|
||||
RetryDelay: 1 * time.Second,
|
||||
Concurrency: 10,
|
||||
RateLimit: 500 * time.Millisecond,
|
||||
MaxDepth: 3,
|
||||
CacheDir: "./devour_data/cache",
|
||||
},
|
||||
Scheduler: SchedulerConfig{
|
||||
Enabled: true,
|
||||
Interval: 72 * time.Hour,
|
||||
CheckMethod: "hash",
|
||||
OnStartup: false,
|
||||
},
|
||||
Server: ServerConfig{
|
||||
Mode: "local",
|
||||
Transport: "stdio",
|
||||
Host: "localhost",
|
||||
Port: 8080,
|
||||
},
|
||||
Indexing: IndexingConfig{
|
||||
Enabled: true,
|
||||
AutoReindex: true,
|
||||
SnippetLength: 220,
|
||||
MaxDocs: 10000,
|
||||
},
|
||||
Verification: VerificationConfig{
|
||||
Enabled: true,
|
||||
Timeout: 90 * time.Second,
|
||||
},
|
||||
Sources: []SourceConfig{},
|
||||
}
|
||||
}
|
||||
|
||||
const initTemplateSourcesComment = `
|
||||
# Sources (add your own)
|
||||
sources: []
|
||||
# - name: example-docs
|
||||
# type: url
|
||||
# url: https://docs.example.com
|
||||
# include: ["**/*.md", "**/*.html"]
|
||||
# - name: local-searxng
|
||||
# type: localsearch
|
||||
# url: http://127.0.0.1:8080/search
|
||||
# query: golang http client
|
||||
# result_limit: 8
|
||||
# domains: ["pkg.go.dev", "go.dev"]
|
||||
`
|
||||
|
||||
// RenderInitYAML returns the default init config file content from canonical defaults.
|
||||
func RenderInitYAML(remote bool) (string, error) {
|
||||
cfg := Default()
|
||||
if remote {
|
||||
cfg.Server.Mode = "remote"
|
||||
}
|
||||
// Keep the init template comments for discoverability while sourcing
|
||||
// the actual values from canonical defaults.
|
||||
cfg.Sources = nil
|
||||
|
||||
body, err := yaml.Marshal(cfg)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("marshal default config: %w", err)
|
||||
}
|
||||
|
||||
trimmed := strings.TrimSuffix(string(body), "\n")
|
||||
if strings.HasSuffix(trimmed, "sources: []") {
|
||||
trimmed = strings.TrimSuffix(trimmed, "sources: []")
|
||||
trimmed = strings.TrimSpace(trimmed)
|
||||
}
|
||||
|
||||
return "# Devour Configuration\n" + trimmed + initTemplateSourcesComment, nil
|
||||
}
|
||||
|
||||
// Load loads configuration from an explicit path or the default search paths.
|
||||
func Load(explicitPath string) (*Config, error) {
|
||||
cfg := Default()
|
||||
|
||||
path, err := findConfigPath(explicitPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if path == "" {
|
||||
cfg.ApplyDefaults()
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read config: %w", err)
|
||||
}
|
||||
|
||||
if err := yaml.Unmarshal(b, cfg); err != nil {
|
||||
return nil, fmt.Errorf("parse config: %w", err)
|
||||
}
|
||||
|
||||
cfg.ConfigPath = path
|
||||
cfg.ApplyDefaults()
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
// ApplyDefaults ensures additive backward-compatible defaults after unmarshaling.
|
||||
func (c *Config) ApplyDefaults() {
|
||||
if c.Version == 0 {
|
||||
c.Version = 1
|
||||
}
|
||||
|
||||
if c.Storage.DocsDir == "" {
|
||||
c.Storage.DocsDir = "./devour_data/docs"
|
||||
}
|
||||
if c.Storage.IndexDir == "" {
|
||||
c.Storage.IndexDir = "./devour_data/index"
|
||||
}
|
||||
if c.Storage.MetadataDir == "" {
|
||||
c.Storage.MetadataDir = "./devour_data/metadata"
|
||||
}
|
||||
if c.Storage.CacheDir == "" {
|
||||
c.Storage.CacheDir = "./devour_data/cache"
|
||||
}
|
||||
|
||||
if c.Embeddings.Provider == "" {
|
||||
c.Embeddings.Provider = "openai"
|
||||
}
|
||||
if c.Embeddings.Model == "" {
|
||||
c.Embeddings.Model = "text-embedding-3-small"
|
||||
}
|
||||
if c.Embeddings.Dimensions <= 0 {
|
||||
c.Embeddings.Dimensions = 1536
|
||||
}
|
||||
if c.Embeddings.BatchSize <= 0 {
|
||||
c.Embeddings.BatchSize = 100
|
||||
}
|
||||
|
||||
if c.VectorDB.Type == "" {
|
||||
c.VectorDB.Type = "memory"
|
||||
}
|
||||
if c.VectorDB.SimilarityMetric == "" {
|
||||
c.VectorDB.SimilarityMetric = "cosine"
|
||||
}
|
||||
|
||||
if c.Scraper.UserAgent == "" {
|
||||
c.Scraper.UserAgent = "Devour/1.0"
|
||||
}
|
||||
if c.Scraper.Timeout <= 0 {
|
||||
c.Scraper.Timeout = 30 * time.Second
|
||||
}
|
||||
if c.Scraper.RetryCount <= 0 {
|
||||
c.Scraper.RetryCount = 3
|
||||
}
|
||||
if c.Scraper.RetryDelay <= 0 {
|
||||
c.Scraper.RetryDelay = 1 * time.Second
|
||||
}
|
||||
if c.Scraper.Concurrency <= 0 {
|
||||
c.Scraper.Concurrency = 10
|
||||
}
|
||||
if c.Scraper.RateLimit < 0 {
|
||||
c.Scraper.RateLimit = 0
|
||||
}
|
||||
if c.Scraper.MaxDepth <= 0 {
|
||||
c.Scraper.MaxDepth = 3
|
||||
}
|
||||
if c.Scraper.CacheDir == "" {
|
||||
c.Scraper.CacheDir = c.Storage.CacheDir
|
||||
}
|
||||
|
||||
if c.Scheduler.Interval <= 0 {
|
||||
c.Scheduler.Interval = 72 * time.Hour
|
||||
}
|
||||
if c.Scheduler.CheckMethod == "" {
|
||||
c.Scheduler.CheckMethod = "hash"
|
||||
}
|
||||
|
||||
if c.Server.Mode == "" {
|
||||
c.Server.Mode = "local"
|
||||
}
|
||||
if c.Server.Transport == "" {
|
||||
c.Server.Transport = "stdio"
|
||||
}
|
||||
if c.Server.Host == "" {
|
||||
c.Server.Host = "localhost"
|
||||
}
|
||||
if c.Server.Port <= 0 {
|
||||
c.Server.Port = 8080
|
||||
}
|
||||
|
||||
if !c.Indexing.Enabled {
|
||||
// keep explicit false but initialize defaults for remaining fields
|
||||
}
|
||||
if c.Indexing.SnippetLength <= 0 {
|
||||
c.Indexing.SnippetLength = 220
|
||||
}
|
||||
if c.Indexing.MaxDocs <= 0 {
|
||||
c.Indexing.MaxDocs = 10000
|
||||
}
|
||||
|
||||
if c.Verification.Timeout <= 0 {
|
||||
c.Verification.Timeout = 90 * time.Second
|
||||
}
|
||||
}
|
||||
|
||||
// findConfigPath resolves which configuration file should be loaded and
// returns its absolute path.
//
// When explicitPath is non-blank it must name an existing regular path:
//   - a missing file yields "config file not found: <path>";
//   - any other stat failure (for example permission denied) is reported as
//     such and wraps the underlying error instead of being mislabeled as
//     "not found";
//   - a directory is rejected up front rather than failing later in the read.
//
// When explicitPath is blank, ./devour.yaml and then ~/.devour/devour.yaml
// are probed (the latter only if the home directory can be determined) and
// the first one that exists and is not a directory wins. If none qualifies
// the function returns ("", nil), which callers treat as "use defaults".
func findConfigPath(explicitPath string) (string, error) {
	if strings.TrimSpace(explicitPath) != "" {
		abs, err := filepath.Abs(explicitPath)
		if err != nil {
			return "", fmt.Errorf("resolve config path %q: %w", explicitPath, err)
		}

		info, err := os.Stat(abs)
		switch {
		case os.IsNotExist(err):
			return "", fmt.Errorf("config file not found: %s", explicitPath)
		case err != nil:
			return "", fmt.Errorf("stat config file %s: %w", explicitPath, err)
		case info.IsDir():
			return "", fmt.Errorf("config path is a directory: %s", explicitPath)
		}
		return abs, nil
	}

	candidates := []string{"./devour.yaml"}
	if home, err := os.UserHomeDir(); err == nil {
		candidates = append(candidates, filepath.Join(home, ".devour", "devour.yaml"))
	}

	for _, candidate := range candidates {
		info, err := os.Stat(candidate)
		if err != nil || info.IsDir() {
			// Unreadable, missing, or a directory: try the next location.
			continue
		}
		abs, err := filepath.Abs(candidate)
		if err != nil {
			return "", fmt.Errorf("resolve config path %q: %w", candidate, err)
		}
		return abs, nil
	}
	return "", nil
}
|
||||
|
||||
// EnsureStorageDirs creates required local storage directories.
|
||||
func (c *Config) EnsureStorageDirs() error {
|
||||
dirs := []string{c.Storage.DocsDir, c.Storage.IndexDir, c.Storage.MetadataDir, c.Storage.CacheDir}
|
||||
for _, dir := range dirs {
|
||||
if strings.TrimSpace(dir) == "" {
|
||||
continue
|
||||
}
|
||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,130 @@
|
||||
package projectstate
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// SourceState is the persisted sync record for a single source, stored as one
// value of SourceStateFile.Sources in source_state.json.
type SourceState struct {
	Name string `json:"name"`
	Type string `json:"type"`
	URL  string `json:"url,omitempty"`
	Hash string `json:"hash,omitempty"`
	// NOTE(review): encoding/json's omitempty does not omit struct values,
	// so a zero LastSync is still written to the file.
	LastSync  time.Time `json:"last_sync,omitempty"`
	DocCount  int       `json:"doc_count"`
	LastError string    `json:"last_error,omitempty"`
}
|
||||
|
||||
type SourceStateFile struct {
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
Sources map[string]*SourceState `json:"sources"`
|
||||
}
|
||||
|
||||
// DocsStats is the summary produced by CollectDocsStats for a docs directory.
//
// DocumentCount and BySource only consider .json/.md/.txt files, whereas
// StorageBytes and LastUpdated are accumulated over every regular file that
// the walk visits.
type DocsStats struct {
	DocumentCount int
	LastUpdated   time.Time
	BySource      map[string]int
	StorageBytes  int64
}
|
||||
|
||||
// docSummary is the minimal projection of a stored JSON document that
// CollectDocsStats decodes: only the "source" key is read, all other keys in
// the file are ignored.
type docSummary struct {
	Source string `json:"source"`
}
|
||||
|
||||
const sourceStateFileName = "source_state.json"
|
||||
|
||||
func LoadSourceState(metadataDir string) (*SourceStateFile, error) {
|
||||
path := filepath.Join(metadataDir, sourceStateFileName)
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return &SourceStateFile{UpdatedAt: time.Now(), Sources: map[string]*SourceState{}}, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var state SourceStateFile
|
||||
if err := json.Unmarshal(b, &state); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if state.Sources == nil {
|
||||
state.Sources = map[string]*SourceState{}
|
||||
}
|
||||
return &state, nil
|
||||
}
|
||||
|
||||
func SaveSourceState(metadataDir string, state *SourceStateFile) error {
|
||||
if state == nil {
|
||||
return fmt.Errorf("state is required")
|
||||
}
|
||||
if state.Sources == nil {
|
||||
state.Sources = map[string]*SourceState{}
|
||||
}
|
||||
state.UpdatedAt = time.Now()
|
||||
|
||||
if err := os.MkdirAll(metadataDir, 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
path := filepath.Join(metadataDir, sourceStateFileName)
|
||||
b, err := json.MarshalIndent(state, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(path, b, 0o644)
|
||||
}
|
||||
|
||||
func CollectDocsStats(docsDir string) (*DocsStats, error) {
|
||||
stats := &DocsStats{BySource: map[string]int{}}
|
||||
|
||||
err := filepath.WalkDir(docsDir, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
if d.IsDir() {
|
||||
return nil
|
||||
}
|
||||
|
||||
info, infoErr := d.Info()
|
||||
if infoErr != nil {
|
||||
return infoErr
|
||||
}
|
||||
stats.StorageBytes += info.Size()
|
||||
if info.ModTime().After(stats.LastUpdated) {
|
||||
stats.LastUpdated = info.ModTime()
|
||||
}
|
||||
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
if ext != ".json" && ext != ".md" && ext != ".txt" {
|
||||
return nil
|
||||
}
|
||||
stats.DocumentCount++
|
||||
|
||||
if ext == ".json" {
|
||||
b, readErr := os.ReadFile(path)
|
||||
if readErr != nil {
|
||||
return nil
|
||||
}
|
||||
var d docSummary
|
||||
if err := json.Unmarshal(b, &d); err == nil {
|
||||
source := strings.TrimSpace(d.Source)
|
||||
if source != "" {
|
||||
stats.BySource[source]++
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
@@ -398,8 +398,8 @@ func NewSecretsDetector() *SecretsDetector {
|
||||
{Name: "GitHub OAuth", Pattern: regexp.MustCompile(`gho_[0-9a-zA-Z]{36}`), Severity: quality.SeverityT4},
|
||||
{Name: "GitHub App Token", Pattern: regexp.MustCompile(`(ghu|ghs)_[0-9a-zA-Z]{36}`), Severity: quality.SeverityT4},
|
||||
{Name: "Slack Token", Pattern: regexp.MustCompile(`xox[baprs]-[0-9]{10,13}-[0-9]{10,13}[a-zA-Z0-9]{24}`), Severity: quality.SeverityT4},
|
||||
{Name: "RSA Private Key", Pattern: regexp.MustCompile(`-----BEGIN RSA PRIVATE KEY-----`), Severity: quality.SeverityT4},
|
||||
{Name: "Private Key", Pattern: regexp.MustCompile(`-----BEGIN PRIVATE KEY-----`), Severity: quality.SeverityT4},
|
||||
{Name: "RSA Private Key", Pattern: regexp.MustCompile(`-----BEGIN ` + `RSA PRIVATE KEY-----`), Severity: quality.SeverityT4},
|
||||
{Name: "Private Key", Pattern: regexp.MustCompile(`-----BEGIN ` + `PRIVATE KEY-----`), Severity: quality.SeverityT4},
|
||||
{Name: "JWT", Pattern: regexp.MustCompile(`eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*`), Severity: quality.SeverityT3},
|
||||
{Name: "Generic API Key", Pattern: regexp.MustCompile(`(?i)(api_key|apikey|secret|password|token)\s*[=:]\s*['"][^'"]{8,}['"]`), Severity: quality.SeverityT3},
|
||||
{Name: "DB Connection String", Pattern: regexp.MustCompile(`(?i)(mysql|postgres|mongodb)://[^:]+:[^@]+@[^/]+`), Severity: quality.SeverityT4},
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
package quality
|
||||
|
||||
import "strings"
|
||||
|
||||
// docsEvidence is the documentation backing attached to one finding type.
// All three fields are copied into a finding's string-valued Metadata map by
// AttachDocsEvidence, which is why Confidence is kept as a string.
type docsEvidence struct {
	URLs       []string
	Rationale  string
	Confidence string
}
|
||||
|
||||
var defaultEvidenceByType = map[string]docsEvidence{
|
||||
"complexity_ast": {
|
||||
URLs: []string{"https://go.dev/doc/effective_go", "https://go.dev/wiki/CodeReviewComments"},
|
||||
Rationale: "High complexity correlates with maintainability and defect risk; official style guidance recommends smaller focused functions.",
|
||||
Confidence: "0.82",
|
||||
},
|
||||
"god_function": {
|
||||
URLs: []string{"https://go.dev/doc/effective_go", "https://go.dev/wiki/CodeReviewComments"},
|
||||
Rationale: "Large multi-responsibility functions usually violate readability and testability guidance.",
|
||||
Confidence: "0.84",
|
||||
},
|
||||
"unused_import": {
|
||||
URLs: []string{"https://pkg.go.dev/cmd/go", "https://pkg.go.dev/go/importer"},
|
||||
Rationale: "Unused imports break build hygiene and indicate stale code paths.",
|
||||
Confidence: "0.95",
|
||||
},
|
||||
"dead_code": {
|
||||
URLs: []string{"https://pkg.go.dev/cmd/go", "https://go.dev/wiki/CodeReviewComments"},
|
||||
Rationale: "Unreachable or unused symbols increase maintenance overhead with no runtime value.",
|
||||
Confidence: "0.90",
|
||||
},
|
||||
"dead_code_enhanced": {
|
||||
URLs: []string{"https://pkg.go.dev/cmd/go", "https://go.dev/wiki/CodeReviewComments"},
|
||||
Rationale: "Unreachable or unused symbols increase maintenance overhead with no runtime value.",
|
||||
Confidence: "0.90",
|
||||
},
|
||||
"duplication": {
|
||||
URLs: []string{"https://go.dev/wiki/CodeReviewComments"},
|
||||
Rationale: "Duplication increases change cost and risk of inconsistent bug fixes.",
|
||||
Confidence: "0.80",
|
||||
},
|
||||
"single_use": {
|
||||
URLs: []string{"https://go.dev/doc/effective_go", "https://go.dev/wiki/CodeReviewComments"},
|
||||
Rationale: "Single-use abstractions can reduce clarity unless they encode reusable domain behavior.",
|
||||
Confidence: "0.74",
|
||||
},
|
||||
"test_coverage": {
|
||||
URLs: []string{"https://go.dev/doc/tutorial/add-a-test", "https://pkg.go.dev/testing"},
|
||||
Rationale: "Coverage gaps on changed code increase regression probability.",
|
||||
Confidence: "0.78",
|
||||
},
|
||||
}
|
||||
|
||||
// AttachDocsEvidence annotates findings with docs evidence metadata.
|
||||
func AttachDocsEvidence(language string, findings []Finding) []Finding {
|
||||
language = strings.ToLower(strings.TrimSpace(language))
|
||||
for i := range findings {
|
||||
ev, ok := defaultEvidenceByType[findings[i].Type]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if findings[i].Metadata == nil {
|
||||
findings[i].Metadata = map[string]string{}
|
||||
}
|
||||
if len(ev.URLs) > 0 {
|
||||
findings[i].Metadata["docs_evidence_urls"] = strings.Join(ev.URLs, " | ")
|
||||
}
|
||||
if ev.Rationale != "" {
|
||||
findings[i].Metadata["docs_evidence_rationale"] = ev.Rationale
|
||||
}
|
||||
if ev.Confidence != "" {
|
||||
findings[i].Metadata["docs_evidence_confidence"] = ev.Confidence
|
||||
}
|
||||
if language != "" {
|
||||
findings[i].Metadata["docs_evidence_language"] = language
|
||||
}
|
||||
}
|
||||
return findings
|
||||
}
|
||||
@@ -104,7 +104,7 @@ func (f *DefaultFileFinder) FindFiles(path string, language string) ([]string, e
|
||||
if info.IsDir() {
|
||||
// Skip hidden directories and common exclude dirs
|
||||
base := filepath.Base(filePath)
|
||||
if strings.HasPrefix(base, ".") || base == "node_modules" || base == "vendor" {
|
||||
if filePath != path && (strings.HasPrefix(base, ".") || base == "node_modules" || base == "vendor") {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
|
||||
@@ -170,6 +170,37 @@ func TestDefaultFileFinder_FindFiles_EmptyDirectory(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultFileFinder_FindFiles_DotPathRootNotSkipped(t *testing.T) {
|
||||
tmpDir, err := os.MkdirTemp("", "filefinder_dot_root_test")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create temp dir: %v", err)
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
if err := os.WriteFile(filepath.Join(tmpDir, "main.go"), []byte("package main"), 0644); err != nil {
|
||||
t.Fatalf("Failed to write go file: %v", err)
|
||||
}
|
||||
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to get cwd: %v", err)
|
||||
}
|
||||
defer func() { _ = os.Chdir(cwd) }()
|
||||
|
||||
if err := os.Chdir(tmpDir); err != nil {
|
||||
t.Fatalf("Failed to chdir: %v", err)
|
||||
}
|
||||
|
||||
finder := NewDefaultFileFinder()
|
||||
files, err := finder.FindFiles(".", "go")
|
||||
if err != nil {
|
||||
t.Fatalf("FindFiles() failed: %v", err)
|
||||
}
|
||||
if len(files) != 1 {
|
||||
t.Fatalf("FindFiles('.') expected 1 file, got %d", len(files))
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultFileFinder_FindFiles_NonExistentPath(t *testing.T) {
|
||||
finder := NewDefaultFileFinder()
|
||||
files, err := finder.FindFiles("/non/existent/path", "go")
|
||||
|
||||
@@ -58,7 +58,10 @@ func (d *SingleUseDetector) Detect(ctx context.Context, path string, config *qua
|
||||
|
||||
switch obj := obj.(type) {
|
||||
case *types.Func:
|
||||
key := obj.Pkg().Path() + "." + obj.Name()
|
||||
key, ok := functionKey(obj)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
callCounts[key]++
|
||||
case *types.TypeName:
|
||||
if obj.Pkg() != nil {
|
||||
@@ -75,17 +78,18 @@ func (d *SingleUseDetector) Detect(ctx context.Context, path string, config *qua
|
||||
|
||||
switch obj := obj.(type) {
|
||||
case *types.Func:
|
||||
if obj.Pkg() != nil {
|
||||
key := obj.Pkg().Path() + "." + obj.Name()
|
||||
pos := pkg.Fset.Position(obj.Pos())
|
||||
funcDefs[key] = FuncDef{
|
||||
Name: obj.Name(),
|
||||
File: pos.Filename,
|
||||
Line: pos.Line,
|
||||
Package: obj.Pkg().Path(),
|
||||
Exported: obj.Exported(),
|
||||
Signature: obj.Type().String(),
|
||||
}
|
||||
key, ok := functionKey(obj)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
pos := pkg.Fset.Position(obj.Pos())
|
||||
funcDefs[key] = FuncDef{
|
||||
Name: obj.Name(),
|
||||
File: pos.Filename,
|
||||
Line: pos.Line,
|
||||
Package: obj.Pkg().Path(),
|
||||
Exported: obj.Exported(),
|
||||
Signature: obj.Type().String(),
|
||||
}
|
||||
case *types.TypeName:
|
||||
if obj.Pkg() != nil {
|
||||
@@ -109,6 +113,9 @@ func (d *SingleUseDetector) Detect(ctx context.Context, path string, config *qua
|
||||
var findings []quality.Finding
|
||||
|
||||
for key, def := range funcDefs {
|
||||
if def.Exported || isLikelyEntrypointFile(def.File) {
|
||||
continue
|
||||
}
|
||||
if strings.HasSuffix(def.Name, "Test") || strings.HasPrefix(def.Name, "Test") {
|
||||
continue
|
||||
}
|
||||
@@ -143,9 +150,18 @@ func (d *SingleUseDetector) Detect(ctx context.Context, path string, config *qua
|
||||
}
|
||||
|
||||
for key, def := range typeDefs {
|
||||
if def.Exported || isLikelyEntrypointFile(def.File) {
|
||||
continue
|
||||
}
|
||||
if strings.HasSuffix(def.Name, "Error") || strings.HasSuffix(def.Name, "Options") {
|
||||
continue
|
||||
}
|
||||
if strings.HasSuffix(def.Name, "Config") || strings.HasSuffix(def.Name, "Params") {
|
||||
continue
|
||||
}
|
||||
if !strings.Contains(def.Underlying, "struct") && !strings.Contains(def.Underlying, "interface") {
|
||||
continue
|
||||
}
|
||||
|
||||
count := typeUsages[key]
|
||||
if count == 1 {
|
||||
@@ -242,6 +258,22 @@ func (d *SingleUseDetector) getFuncLOC(file string, startLine int) (int, error)
|
||||
return loc, nil
|
||||
}
|
||||
|
||||
// functionKey returns the map key "<package path>.<name>" used to correlate
// function definitions with their uses.
//
// The second result is false, and the key empty, when fn is nil, when fn has
// no package, or when fn is a method (its signature has a receiver). Methods
// are excluded because a key built from package path and name alone would not
// distinguish same-named methods on different types.
func functionKey(fn *types.Func) (string, bool) {
	if fn == nil {
		return "", false
	}
	pkg := fn.Pkg()
	if pkg == nil {
		return "", false
	}
	if sig, isSig := fn.Type().(*types.Signature); isSig && sig.Recv() != nil {
		return "", false
	}
	return pkg.Path() + "." + fn.Name(), true
}
|
||||
|
||||
// isLikelyEntrypointFile reports whether path looks like a file whose symbols
// should be exempt from single-use / dead-code reporting: anything under a
// cmd/ directory, a file named main.go, or a _test.go file.
//
// The path is normalized to forward slashes first. Both relative and nested
// forms are recognized: "cmd/x.go" as well as "a/cmd/x.go", and a bare
// "main.go" as well as "a/main.go". Names that merely end in "main.go", such
// as "domain.go", do not match.
func isLikelyEntrypointFile(path string) bool {
	p := filepath.ToSlash(path)
	switch {
	case strings.HasPrefix(p, "cmd/"), strings.Contains(p, "/cmd/"):
		return true
	case p == "main.go", strings.HasSuffix(p, "/main.go"):
		return true
	case strings.HasSuffix(p, "_test.go"):
		return true
	}
	return false
}
|
||||
|
||||
type FuncDef struct {
|
||||
Name string
|
||||
File string
|
||||
@@ -471,33 +503,36 @@ func (d *EnhancedDeadCodeDetector) Detect(ctx context.Context, path string, conf
|
||||
switch o := obj.(type) {
|
||||
case *types.Func:
|
||||
defs[key] = ObjInfo{
|
||||
Name: obj.Name(),
|
||||
Type: "function",
|
||||
File: pos.Filename,
|
||||
Line: pos.Line,
|
||||
Package: obj.Pkg().Path(),
|
||||
Exported: obj.Exported(),
|
||||
Signature: o.Type().String(),
|
||||
Name: obj.Name(),
|
||||
Type: "function",
|
||||
File: pos.Filename,
|
||||
Line: pos.Line,
|
||||
Package: obj.Pkg().Path(),
|
||||
PackageName: pkg.Name,
|
||||
Exported: obj.Exported(),
|
||||
Signature: o.Type().String(),
|
||||
}
|
||||
case *types.TypeName:
|
||||
defs[key] = ObjInfo{
|
||||
Name: obj.Name(),
|
||||
Type: "type",
|
||||
File: pos.Filename,
|
||||
Line: pos.Line,
|
||||
Package: obj.Pkg().Path(),
|
||||
Exported: obj.Exported(),
|
||||
Underlying: o.Type().Underlying().String(),
|
||||
Name: obj.Name(),
|
||||
Type: "type",
|
||||
File: pos.Filename,
|
||||
Line: pos.Line,
|
||||
Package: obj.Pkg().Path(),
|
||||
PackageName: pkg.Name,
|
||||
Exported: obj.Exported(),
|
||||
Underlying: o.Type().Underlying().String(),
|
||||
}
|
||||
case *types.Var:
|
||||
if obj.Exported() {
|
||||
if obj.Exported() && !o.IsField() {
|
||||
defs[key] = ObjInfo{
|
||||
Name: obj.Name(),
|
||||
Type: "variable",
|
||||
File: pos.Filename,
|
||||
Line: pos.Line,
|
||||
Package: obj.Pkg().Path(),
|
||||
Exported: obj.Exported(),
|
||||
Name: obj.Name(),
|
||||
Type: "variable",
|
||||
File: pos.Filename,
|
||||
Line: pos.Line,
|
||||
Package: obj.Pkg().Path(),
|
||||
PackageName: pkg.Name,
|
||||
Exported: obj.Exported(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -521,10 +556,22 @@ func (d *EnhancedDeadCodeDetector) Detect(ctx context.Context, path string, conf
|
||||
if entryPoints[key] {
|
||||
continue
|
||||
}
|
||||
if !strings.Contains(def.Package, "/internal/") || def.PackageName == "main" {
|
||||
continue
|
||||
}
|
||||
if isLikelyEntrypointFile(def.File) {
|
||||
continue
|
||||
}
|
||||
|
||||
if strings.HasPrefix(def.Name, "Test") || strings.HasPrefix(def.Name, "Benchmark") || strings.HasPrefix(def.Name, "Fuzz") {
|
||||
continue
|
||||
}
|
||||
if def.Type == "function" && strings.HasPrefix(def.Name, "New") {
|
||||
continue
|
||||
}
|
||||
if def.Type == "type" && (strings.HasSuffix(def.Name, "Config") || strings.HasSuffix(def.Name, "Options")) {
|
||||
continue
|
||||
}
|
||||
|
||||
if strings.HasSuffix(def.Name, "Error") && def.Type == "type" {
|
||||
continue
|
||||
@@ -573,12 +620,13 @@ func (d *EnhancedDeadCodeDetector) Detect(ctx context.Context, path string, conf
|
||||
}
|
||||
|
||||
type ObjInfo struct {
|
||||
Name string
|
||||
Type string
|
||||
File string
|
||||
Line int
|
||||
Package string
|
||||
Exported bool
|
||||
Signature string
|
||||
Underlying string
|
||||
Name string
|
||||
Type string
|
||||
File string
|
||||
Line int
|
||||
Package string
|
||||
PackageName string
|
||||
Exported bool
|
||||
Signature string
|
||||
Underlying string
|
||||
}
|
||||
|
||||
@@ -172,8 +172,7 @@ func (d *UnusedImportDetector) analyzeFile(path string) ([]quality.Finding, erro
|
||||
if imp.Name != nil {
|
||||
name = imp.Name.Name
|
||||
} else {
|
||||
parts := strings.Split(pkgPath, "/")
|
||||
name = parts[len(parts)-1]
|
||||
name = inferImportName(pkgPath)
|
||||
}
|
||||
imports[pkgPath] = name
|
||||
}
|
||||
@@ -191,8 +190,7 @@ func (d *UnusedImportDetector) analyzeFile(path string) ([]quality.Finding, erro
|
||||
if imp.Name != nil {
|
||||
name = imp.Name.Name
|
||||
} else {
|
||||
parts := strings.Split(pkgPath, "/")
|
||||
name = parts[len(parts)-1]
|
||||
name = inferImportName(pkgPath)
|
||||
}
|
||||
|
||||
if name == "_" || name == "." {
|
||||
@@ -224,6 +222,42 @@ func (d *UnusedImportDetector) analyzeFile(path string) ([]quality.Finding, erro
|
||||
return findings, nil
|
||||
}
|
||||
|
||||
// inferImportName guesses the package name an unaliased import of pkgPath
// binds in the importing file. It works purely on the import path text:
//
//   - the last path element is used ("golang.org/x/tools/go/packages" ->
//     "packages");
//   - if the last element is a module major-version suffix ("/v2", "/v3", ...)
//     the element before it is used ("github.com/gocolly/colly/v2" -> "colly");
//   - a gopkg.in style ".vN" suffix is stripped ("gopkg.in/yaml.v3" -> "yaml").
//
// This is a heuristic: a package may declare a name that differs from its
// import path, which cannot be detected without loading the package.
func inferImportName(pkgPath string) string {
	// strings.Split always returns at least one element, so indexing the last
	// element is safe even for an empty path.
	parts := strings.Split(pkgPath, "/")
	last := parts[len(parts)-1]

	if len(parts) >= 2 && isSemverSegment(last) {
		last = parts[len(parts)-2]
	}
	// LastIndex, not Index: only a trailing ".vN" is a version marker.
	if idx := strings.LastIndex(last, ".v"); idx > 0 && isDigits(last[idx+2:]) {
		last = last[:idx]
	}
	return last
}

// isSemverSegment reports whether segment is a Go module major-version suffix:
// "v" followed by a decimal number >= 2 without leading zeros. "v0" and "v1"
// are never used as path suffixes by the module system, so a trailing "v1"
// element (as in "k8s.io/api/core/v1") is an ordinary package directory.
func isSemverSegment(segment string) bool {
	if len(segment) < 2 || segment[0] != 'v' {
		return false
	}
	number := segment[1:]
	if !isDigits(number) {
		return false
	}
	return number[0] != '0' && number != "1"
}

// isDigits reports whether value is non-empty and consists solely of the
// ASCII digits 0-9.
func isDigits(value string) bool {
	if value == "" {
		return false
	}
	for _, r := range value {
		if r < '0' || r > '9' {
			return false
		}
	}
	return true
}
|
||||
|
||||
type CycleDetector struct {
|
||||
*quality.BaseDetector
|
||||
}
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
package analyzers
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestInferImportName(t *testing.T) {
|
||||
tests := []struct {
|
||||
path string
|
||||
want string
|
||||
}{
|
||||
{path: "fmt", want: "fmt"},
|
||||
{path: "gopkg.in/yaml.v3", want: "yaml"},
|
||||
{path: "github.com/gocolly/colly/v2", want: "colly"},
|
||||
{path: "golang.org/x/tools/go/packages", want: "packages"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := inferImportName(tt.path)
|
||||
if got != tt.want {
|
||||
t.Fatalf("inferImportName(%q) = %q, want %q", tt.path, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -240,6 +240,10 @@ func (d *DebugLogDetector) analyzeFile(path string) []quality.Finding {
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
normPath := filepath.ToSlash(path)
|
||||
if strings.Contains(normPath, "internal/ui/") || strings.Contains(normPath, "examples/") {
|
||||
return nil
|
||||
}
|
||||
|
||||
debugPatterns := []string{
|
||||
"log.Print",
|
||||
@@ -267,7 +271,7 @@ func (d *DebugLogDetector) analyzeFile(path string) []quality.Finding {
|
||||
|
||||
for _, pattern := range debugPatterns {
|
||||
if callStr == pattern || strings.HasPrefix(callStr, pattern) {
|
||||
if strings.Contains(path, "_test.go") {
|
||||
if strings.HasSuffix(normPath, "_test.go") || strings.HasPrefix(normPath, "cmd/") || strings.Contains(normPath, "/cmd/") {
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -291,7 +295,7 @@ func (d *DebugLogDetector) analyzeFile(path string) []quality.Finding {
|
||||
}
|
||||
}
|
||||
|
||||
if strings.Contains(path, "/cmd/") {
|
||||
if strings.HasPrefix(normPath, "cmd/") || strings.Contains(normPath, "/cmd/") {
|
||||
return true
|
||||
}
|
||||
|
||||
|
||||
@@ -42,7 +42,6 @@ func (p *GoPlugin) DefaultSrcDir() string {
|
||||
|
||||
func (p *GoPlugin) CreateDetectors(finder quality.FileFinder) []quality.Detector {
|
||||
return []quality.Detector{
|
||||
analyzers.NewDeadCodeDetector(finder),
|
||||
analyzers.NewEnhancedDeadCodeDetector(finder),
|
||||
analyzers.NewUnusedImportDetector(finder),
|
||||
analyzers.NewCycleDetector(finder),
|
||||
|
||||
+16
-23
@@ -67,13 +67,13 @@ func (s *Scanner) Scan(ctx context.Context) (*ScanResult, error) {
|
||||
// Skip language-specific detectors for different languages
|
||||
if langDetector, ok := detector.(LanguageDetector); ok {
|
||||
supported := langDetector.SupportedLanguages()
|
||||
if !contains(supported, language) {
|
||||
if len(supported) > 0 && !contains(supported, language) {
|
||||
log.Printf("Skipping detector %s for language %s", name, language)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
findings, err := detector.Detect(ctx, s.config.Path, s.config)
|
||||
findings, err := s.runDetectorSafely(ctx, detector, name)
|
||||
if err != nil {
|
||||
log.Printf("Detector %s failed: %v", name, err)
|
||||
continue
|
||||
@@ -106,28 +106,21 @@ func (s *Scanner) Scan(ctx context.Context) (*ScanResult, error) {
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (s *Scanner) runDetectorSafely(ctx context.Context, detector Detector, name string) (_ []Finding, err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
err = fmt.Errorf("detector panic in %s: %v", name, r)
|
||||
}
|
||||
}()
|
||||
return detector.Detect(ctx, s.config.Path, s.config)
|
||||
}
|
||||
|
||||
// detectLanguage attempts to auto-detect the project language
|
||||
func (s *Scanner) detectLanguage(path string) string {
|
||||
// Check for marker files
|
||||
markers := map[string]string{
|
||||
"go.mod": "go",
|
||||
"package.json": "typescript",
|
||||
"tsconfig.json": "typescript",
|
||||
"requirements.txt": "python",
|
||||
"setup.py": "python",
|
||||
"pyproject.toml": "python",
|
||||
"pom.xml": "java",
|
||||
"build.gradle": "java",
|
||||
"Cargo.toml": "rust",
|
||||
"composer.json": "php",
|
||||
}
|
||||
|
||||
for file, lang := range markers {
|
||||
if _, err := filepath.Abs(filepath.Join(path, file)); err == nil {
|
||||
if _, err := filepath.Glob(filepath.Join(path, file)); err == nil {
|
||||
return lang
|
||||
}
|
||||
}
|
||||
// Keep auto-detection intentionally conservative until full multi-language
|
||||
// scanner behavior is validated in tests.
|
||||
if _, err := os.Stat(filepath.Join(path, "go.mod")); err == nil {
|
||||
return "go"
|
||||
}
|
||||
|
||||
// Default to Go if no markers found
|
||||
@@ -164,7 +157,7 @@ func (s *Scanner) getSourceFiles(path, language string) ([]string, error) {
|
||||
if info.IsDir() {
|
||||
// Skip hidden directories and common exclude dirs
|
||||
base := filepath.Base(filePath)
|
||||
if strings.HasPrefix(base, ".") || base == "node_modules" || base == "vendor" {
|
||||
if filePath != path && (strings.HasPrefix(base, ".") || base == "node_modules" || base == "vendor") {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
package quality
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
)
|
||||
|
||||
type panicDetector struct{}
|
||||
|
||||
func (p panicDetector) Name() string { return "panic_detector" }
|
||||
func (p panicDetector) Severity() Severity { return SeverityT2 }
|
||||
func (p panicDetector) Detect(ctx context.Context, path string, config *Config) ([]Finding, error) {
|
||||
panic("boom")
|
||||
}
|
||||
|
||||
type okDetector struct{}
|
||||
|
||||
func (o okDetector) Name() string { return "ok_detector" }
|
||||
func (o okDetector) Severity() Severity { return SeverityT1 }
|
||||
func (o okDetector) Detect(ctx context.Context, path string, config *Config) ([]Finding, error) {
|
||||
return []Finding{{ID: "ok", Type: "ok", Title: "ok", File: "f.go", Line: 1, Severity: SeverityT1, Score: 1, Status: StatusOpen}}, nil
|
||||
}
|
||||
|
||||
func TestScannerRecoversDetectorPanic(t *testing.T) {
|
||||
s := NewScanner(&Config{Path: ".", Language: "go"})
|
||||
s.RegisterDetector(panicDetector{})
|
||||
s.RegisterDetector(okDetector{})
|
||||
|
||||
result, err := s.Scan(context.Background())
|
||||
if err != nil {
|
||||
t.Fatalf("scan should recover detector panic, got err: %v", err)
|
||||
}
|
||||
if len(result.Findings) != 1 {
|
||||
t.Fatalf("expected findings from healthy detector only, got %d", len(result.Findings))
|
||||
}
|
||||
}
|
||||
@@ -457,6 +457,37 @@ func TestScanner_getSourceFiles_Fallback(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestScanner_getSourceFiles_Fallback_DotPathRootNotSkipped(t *testing.T) {
|
||||
tmpDir, err := os.MkdirTemp("", "scanner_dot_root_test")
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create temp dir: %v", err)
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
if err := os.WriteFile(filepath.Join(tmpDir, "main.go"), []byte("package main"), 0644); err != nil {
|
||||
t.Fatalf("Failed to write go file: %v", err)
|
||||
}
|
||||
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to get cwd: %v", err)
|
||||
}
|
||||
defer func() { _ = os.Chdir(cwd) }()
|
||||
|
||||
if err := os.Chdir(tmpDir); err != nil {
|
||||
t.Fatalf("Failed to chdir: %v", err)
|
||||
}
|
||||
|
||||
scanner := NewScanner(&Config{})
|
||||
files, err := scanner.getSourceFiles(".", "go")
|
||||
if err != nil {
|
||||
t.Fatalf("getSourceFiles() failed: %v", err)
|
||||
}
|
||||
if len(files) != 1 {
|
||||
t.Fatalf("getSourceFiles('.') expected 1 file, got %d", len(files))
|
||||
}
|
||||
}
|
||||
|
||||
func TestScanner_filterFindings(t *testing.T) {
|
||||
scanner := NewScanner(&Config{})
|
||||
|
||||
|
||||
@@ -52,8 +52,8 @@ func TestScorer_CalculateScore(t *testing.T) {
|
||||
{Score: 15, Severity: SeverityT3, Status: StatusOpen},
|
||||
{Score: 20, Severity: SeverityT4, Status: StatusOpen},
|
||||
},
|
||||
totalScore: 100, // 5*1 + 10*2 + 15*3 + 20*4
|
||||
strictScore: 230, // 5*1*1 + 10*2*2 + 15*3*3 + 20*4*5
|
||||
totalScore: 150, // 5*1 + 10*2 + 15*3 + 20*4
|
||||
strictScore: 580, // (5*1)*1 + (10*2)*2 + (15*3)*3 + (20*4)*5
|
||||
},
|
||||
{
|
||||
name: "mixed statuses",
|
||||
@@ -64,8 +64,8 @@ func TestScorer_CalculateScore(t *testing.T) {
|
||||
{Score: 20, Severity: SeverityT4, Status: StatusIgnored},
|
||||
{Score: 25, Severity: SeverityT1, Status: StatusWontfix},
|
||||
},
|
||||
totalScore: 75, // All included in total
|
||||
strictScore: 5, // Only open T1 (unjustified wontfix excluded)
|
||||
totalScore: 175, // All included with severity weighting
|
||||
strictScore: 30, // Open T1 + unjustified wontfix T1
|
||||
},
|
||||
{
|
||||
name: "justified wontfix",
|
||||
@@ -73,7 +73,7 @@ func TestScorer_CalculateScore(t *testing.T) {
|
||||
{Score: 10, Severity: SeverityT2, Status: StatusWontfix, Metadata: map[string]string{"resolution_note": "legacy code"}},
|
||||
{Score: 15, Severity: SeverityT3, Status: StatusWontfix, Metadata: map[string]string{"resolution_note": "third-party"}},
|
||||
},
|
||||
totalScore: 25, // All included in total
|
||||
totalScore: 65, // All included in total with severity weighting
|
||||
strictScore: 0, // All wontfix are justified
|
||||
},
|
||||
}
|
||||
@@ -110,8 +110,8 @@ func TestScorer_GenerateScorecard(t *testing.T) {
|
||||
t.Errorf("GenerateScorecard() TargetScore = %v, want 95", card.TargetScore)
|
||||
}
|
||||
|
||||
if card.TotalScore != 40 { // 10*2 + 5*1 + 15*3
|
||||
t.Errorf("GenerateScorecard() TotalScore = %v, want 40", card.TotalScore)
|
||||
if card.TotalScore != 70 { // 10*2 + 5*1 + 15*3
|
||||
t.Errorf("GenerateScorecard() TotalScore = %v, want 70", card.TotalScore)
|
||||
}
|
||||
|
||||
if card.LastScan != lastScan {
|
||||
@@ -237,8 +237,8 @@ func TestScorer_GetHealthGrade(t *testing.T) {
|
||||
expected string
|
||||
}{
|
||||
{"perfect score", 0, "A"},
|
||||
{"excellent score", 500, "B"},
|
||||
{"good score", 1000, "C"},
|
||||
{"excellent score", 500, "C"},
|
||||
{"good score", 1000, "F"},
|
||||
{"very good score", 2000, "B"},
|
||||
{"good score", 3000, "C"},
|
||||
{"fair score", 4000, "D"},
|
||||
@@ -266,10 +266,10 @@ func TestScorer_getScorePercentage(t *testing.T) {
|
||||
}{
|
||||
{"zero score", 0, 100},
|
||||
{"low score", 100, 95},
|
||||
{"medium score", 1000, 90},
|
||||
{"high score", 5000, 75},
|
||||
{"medium score", 1000, 50},
|
||||
{"high score", 5000, 50},
|
||||
{"very high score", 10000, 50},
|
||||
{"extreme score", 20000, 0},
|
||||
{"extreme score", 20000, 55},
|
||||
{"negative score", -100, 100},
|
||||
}
|
||||
|
||||
|
||||
+45
@@ -0,0 +1,45 @@
|
||||
package scraper
|
||||
|
||||
import basescraper "github.com/yourorg/devour/internal/scraper"
|
||||
|
||||
func init() {
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeGoDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewGoDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeRustDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewRustDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypePythonDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewPythonDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeJavaDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewJavaDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeSpringDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewSpringDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeTSDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewTSDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeReactDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewReactDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeVueDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewVueDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeNuxtDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewNuxtDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeMCPDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewMCPDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeDockerDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewDockerDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeCloudflareDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewCloudflareDocsScraper(c)
|
||||
})
|
||||
basescraper.RegisterScraper(basescraper.SourceTypeAstroDocs, func(c *basescraper.Config) basescraper.Scraper {
|
||||
return NewAstroDocsScraper(c)
|
||||
})
|
||||
}
|
||||
Vendored
+27
-12
@@ -155,16 +155,18 @@ func (s *TSDocsScraper) interfaceToDocument(iface *tsdocs.Interface, module *tsd
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": iface.Name,
|
||||
"doc_url": iface.DocURL,
|
||||
"doc_url": coalesceDocURL(iface.DocURL, module.DocURL),
|
||||
}
|
||||
|
||||
docURL := coalesceDocURL(iface.DocURL, module.DocURL)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(iface.DocURL),
|
||||
ID: generateDocID(docURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-interface",
|
||||
Title: iface.Name,
|
||||
Content: content.String(),
|
||||
URL: iface.DocURL,
|
||||
URL: docURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
@@ -185,16 +187,18 @@ func (s *TSDocsScraper) functionToDocument(fn *tsdocs.Function, module *tsdocs.M
|
||||
"module": module.Name,
|
||||
"name": fn.Name,
|
||||
"return_type": fn.ReturnType,
|
||||
"doc_url": fn.DocURL,
|
||||
"doc_url": coalesceDocURL(fn.DocURL, module.DocURL),
|
||||
}
|
||||
|
||||
docURL := coalesceDocURL(fn.DocURL, module.DocURL)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fn.DocURL),
|
||||
ID: generateDocID(docURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-function",
|
||||
Title: fn.Name,
|
||||
Content: content.String(),
|
||||
URL: fn.DocURL,
|
||||
URL: docURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
@@ -217,16 +221,18 @@ func (s *TSDocsScraper) classToDocument(class *tsdocs.Class, module *tsdocs.Modu
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": class.Name,
|
||||
"doc_url": class.DocURL,
|
||||
"doc_url": coalesceDocURL(class.DocURL, module.DocURL),
|
||||
}
|
||||
|
||||
docURL := coalesceDocURL(class.DocURL, module.DocURL)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(class.DocURL),
|
||||
ID: generateDocID(docURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-class",
|
||||
Title: class.Name,
|
||||
Content: content.String(),
|
||||
URL: class.DocURL,
|
||||
URL: docURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
@@ -244,18 +250,27 @@ func (s *TSDocsScraper) typeAliasToDocument(ta *tsdocs.TypeAlias, module *tsdocs
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": ta.Name,
|
||||
"doc_url": ta.DocURL,
|
||||
"doc_url": coalesceDocURL(ta.DocURL, module.DocURL),
|
||||
}
|
||||
|
||||
docURL := coalesceDocURL(ta.DocURL, module.DocURL)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(ta.DocURL),
|
||||
ID: generateDocID(docURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-type",
|
||||
Title: ta.Name,
|
||||
Content: content.String(),
|
||||
URL: ta.DocURL,
|
||||
URL: docURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
// coalesceDocURL returns primary unless it is empty or whitespace-only, in
// which case fallback is returned. primary is returned as given (it is not
// trimmed).
func coalesceDocURL(primary, fallback string) string {
	if strings.TrimSpace(primary) == "" {
		return fallback
	}
	return primary
}
|
||||
|
||||
+65
@@ -0,0 +1,65 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/yourorg/devour/pkg/tsdocs"
|
||||
)
|
||||
|
||||
func TestTSDocsSubDocsFallbackToModuleURL(t *testing.T) {
|
||||
s := &TSDocsScraper{}
|
||||
module := &tsdocs.Module{
|
||||
Name: "Module",
|
||||
DocURL: "https://www.typescriptlang.org/docs/handbook/2/basic-types.html",
|
||||
}
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
build func() *Document
|
||||
docType string
|
||||
}{
|
||||
{
|
||||
name: "interface",
|
||||
build: func() *Document {
|
||||
return s.interfaceToDocument(&tsdocs.Interface{Name: "User", DocURL: ""}, module, "ts")
|
||||
},
|
||||
docType: "ts-interface",
|
||||
},
|
||||
{
|
||||
name: "function",
|
||||
build: func() *Document {
|
||||
return s.functionToDocument(&tsdocs.Function{Name: "parse", DocURL: ""}, module, "ts")
|
||||
},
|
||||
docType: "ts-function",
|
||||
},
|
||||
{
|
||||
name: "class",
|
||||
build: func() *Document {
|
||||
return s.classToDocument(&tsdocs.Class{Name: "Service", DocURL: ""}, module, "ts")
|
||||
},
|
||||
docType: "ts-class",
|
||||
},
|
||||
{
|
||||
name: "type alias",
|
||||
build: func() *Document {
|
||||
return s.typeAliasToDocument(&tsdocs.TypeAlias{Name: "ID", Type: "string", DocURL: ""}, module, "ts")
|
||||
},
|
||||
docType: "ts-type",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
doc := tc.build()
|
||||
if doc.URL != module.DocURL {
|
||||
t.Fatalf("expected fallback URL %q, got %q", module.DocURL, doc.URL)
|
||||
}
|
||||
if got := doc.Metadata["doc_url"]; got != module.DocURL {
|
||||
t.Fatalf("expected metadata doc_url %q, got %#v", module.DocURL, got)
|
||||
}
|
||||
if doc.Type != tc.docType {
|
||||
t.Fatalf("expected doc type %q, got %q", tc.docType, doc.Type)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Vendored
+21
@@ -0,0 +1,21 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
|
||||
basescraper "github.com/yourorg/devour/internal/scraper"
|
||||
)
|
||||
|
||||
type SourceType = basescraper.SourceType
|
||||
|
||||
type Source = basescraper.Source
|
||||
|
||||
type Document = basescraper.Document
|
||||
|
||||
type Config = basescraper.Config
|
||||
|
||||
// generateDocID derives a document identifier from urlStr: the first 12 bytes
// of its SHA-256 digest, hex-encoded (24 lowercase hexadecimal characters).
// The same input always yields the same identifier.
func generateDocID(urlStr string) string {
	const idBytes = 12

	digest := sha256.Sum256([]byte(urlStr))
	return hex.EncodeToString(digest[:idBytes])
}
|
||||
+171
-8
@@ -2,6 +2,12 @@ package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// GitHubScraper scrapes documentation from GitHub repositories.
|
||||
@@ -16,16 +22,173 @@ func NewGitHubScraper(config *Config) *GitHubScraper {
|
||||
|
||||
// Scrape clones and parses documents from a GitHub repository.
|
||||
func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
// TODO: Implement GitHub scraping
|
||||
// 1. Clone repository (shallow)
|
||||
// 2. Find markdown files in specified paths
|
||||
// 3. Parse README, docs/, wiki
|
||||
// 4. Extract code structure
|
||||
return nil, nil
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
repoURL, repoName, err := s.resolveRepo(source)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tmpDir, err := os.MkdirTemp("", "devour-github-*")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
cloneArgs := []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse", repoURL, tmpDir}
|
||||
if branch := strings.TrimSpace(source.Branch); branch != "" {
|
||||
cloneArgs = []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse", "--branch", branch, repoURL, tmpDir}
|
||||
}
|
||||
|
||||
cmd := exec.CommandContext(ctx, "git", cloneArgs...)
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("git clone failed: %v (%s)", err, strings.TrimSpace(string(output)))
|
||||
}
|
||||
|
||||
if len(source.Include) == 0 {
|
||||
// Try sparse checkout for common docs locations to reduce clone and parse cost.
|
||||
sparse := exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "set", "--skip-checks",
|
||||
"docs", "doc", "src/routes", "website/docs", "packages/*/docs")
|
||||
if sparseOut, sparseErr := sparse.CombinedOutput(); sparseErr != nil {
|
||||
_ = sparseOut
|
||||
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
|
||||
}
|
||||
} else {
|
||||
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
|
||||
}
|
||||
|
||||
localSource := &Source{
|
||||
Name: coalesce(source.Name, repoName),
|
||||
Type: SourceTypeLocal,
|
||||
Path: tmpDir,
|
||||
Include: append([]string(nil), source.Include...),
|
||||
Exclude: append([]string(nil), source.Exclude...),
|
||||
Schedule: source.Schedule,
|
||||
}
|
||||
|
||||
if len(localSource.Include) == 0 {
|
||||
localSource.Include = []string{
|
||||
`(?i)(^|/)readme\.md$`,
|
||||
`(?i)(^|/)docs?/`,
|
||||
`(?i)\.md$`,
|
||||
`(?i)\.mdx$`,
|
||||
}
|
||||
}
|
||||
|
||||
local := NewLocalScraper(s.config)
|
||||
docs, err := local.Scrape(ctx, localSource)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(docs) == 0 && len(source.Include) == 0 {
|
||||
// Sparse patterns did not match this repository layout; retry full checkout.
|
||||
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
|
||||
docs, err = local.Scrape(ctx, localSource)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
for _, doc := range docs {
|
||||
if doc == nil {
|
||||
continue
|
||||
}
|
||||
branchForURL := strings.TrimSpace(source.Branch)
|
||||
if branchForURL == "" {
|
||||
branchForURL = "HEAD"
|
||||
}
|
||||
if doc.Metadata == nil {
|
||||
doc.Metadata = map[string]interface{}{}
|
||||
}
|
||||
if rawPath, ok := doc.Metadata["path"].(string); ok {
|
||||
if relPath, relErr := filepath.Rel(tmpDir, rawPath); relErr == nil {
|
||||
relPath = filepath.ToSlash(relPath)
|
||||
relPath = strings.TrimPrefix(relPath, "./")
|
||||
if relPath != "" && relPath != "." {
|
||||
doc.URL = fmt.Sprintf("https://github.com/%s/blob/%s/%s", repoName, branchForURL, relPath)
|
||||
doc.ID = generateDocID(doc.URL)
|
||||
doc.Metadata["path"] = relPath
|
||||
}
|
||||
}
|
||||
}
|
||||
doc.Type = "github-document"
|
||||
doc.Metadata["repo"] = repoName
|
||||
doc.Metadata["repo_url"] = repoURL
|
||||
doc.Metadata["source_type"] = "github"
|
||||
}
|
||||
return docs, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if the repository has new commits.
|
||||
func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
// TODO: Check latest commit hash
|
||||
return false, "", nil
|
||||
if source == nil {
|
||||
return false, "", fmt.Errorf("source is required")
|
||||
}
|
||||
_, repoName, err := s.resolveRepo(source)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
remote := "https://github.com/" + strings.TrimSuffix(repoName, ".git") + ".git"
|
||||
branch := strings.TrimSpace(source.Branch)
|
||||
if branch == "" {
|
||||
branch = "HEAD"
|
||||
}
|
||||
|
||||
cmd := exec.CommandContext(ctx, "git", "ls-remote", remote, branch)
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
line := strings.TrimSpace(string(output))
|
||||
if line == "" {
|
||||
return false, "", fmt.Errorf("empty ls-remote output")
|
||||
}
|
||||
parts := strings.Fields(line)
|
||||
if len(parts) == 0 {
|
||||
return false, "", fmt.Errorf("unexpected ls-remote output")
|
||||
}
|
||||
hash := parts[0]
|
||||
return hash != lastHash, hash, nil
|
||||
}
|
||||
|
||||
func (s *GitHubScraper) resolveRepo(source *Source) (repoURL string, repoName string, err error) {
|
||||
if strings.TrimSpace(source.Repo) != "" {
|
||||
repoName = strings.Trim(strings.TrimSpace(source.Repo), "/")
|
||||
repoName = strings.TrimSuffix(repoName, ".git")
|
||||
return "https://github.com/" + repoName + ".git", repoName, nil
|
||||
}
|
||||
|
||||
raw := strings.TrimSpace(source.URL)
|
||||
if raw == "" {
|
||||
return "", "", fmt.Errorf("github source requires repo or url")
|
||||
}
|
||||
|
||||
u, err := url.Parse(raw)
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
}
|
||||
if !strings.Contains(strings.ToLower(u.Host), "github.com") {
|
||||
return "", "", fmt.Errorf("not a github url: %s", raw)
|
||||
}
|
||||
parts := strings.Split(strings.Trim(u.Path, "/"), "/")
|
||||
if len(parts) < 2 {
|
||||
return "", "", fmt.Errorf("invalid github repo url: %s", raw)
|
||||
}
|
||||
repoName = parts[0] + "/" + strings.TrimSuffix(parts[1], ".git")
|
||||
repoURL = "https://github.com/" + repoName + ".git"
|
||||
return repoURL, repoName, nil
|
||||
}
|
||||
|
||||
// coalesce picks a display name: primary (unchanged) when it is not blank,
// otherwise the last path element of fallback when that is not blank,
// otherwise the literal "github".
func coalesce(primary, fallback string) string {
	switch {
	case strings.TrimSpace(primary) != "":
		return primary
	case strings.TrimSpace(fallback) != "":
		// For an "owner/repo" fallback this yields "repo".
		return filepath.Base(fallback)
	default:
		return "github"
	}
}
|
||||
|
||||
+227
-8
@@ -2,6 +2,20 @@ package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// reLocalBlankLines matches a run of three or more consecutive newlines.
var reLocalBlankLines = regexp.MustCompile(`\n{3,}`)

// reFrontMatterBlock matches a block at the very start of the text that is
// opened by a "---" line and closed by the next "---" line, including the
// newline that ends the closing line.
var reFrontMatterBlock = regexp.MustCompile(`(?s)\A---\s*\n.*?\n---\s*\n`)
|
||||
|
||||
// LocalScraper scrapes documentation from local filesystem.
|
||||
@@ -16,16 +30,221 @@ func NewLocalScraper(config *Config) *LocalScraper {
|
||||
|
||||
// Scrape scans and parses documents from a local directory.
|
||||
func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
// TODO: Implement local scraping
|
||||
// 1. Walk directory tree
|
||||
// 2. Filter by include/exclude patterns
|
||||
// 3. Parse markdown, text, code files
|
||||
// 4. Extract structure and content
|
||||
return nil, nil
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
root := strings.TrimSpace(source.Path)
|
||||
if root == "" {
|
||||
root = strings.TrimSpace(source.URL)
|
||||
}
|
||||
if root == "" {
|
||||
return nil, fmt.Errorf("path or url is required for local source")
|
||||
}
|
||||
|
||||
info, err := os.Stat(root)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docs := make([]*Document, 0)
|
||||
if !info.IsDir() {
|
||||
doc, err := s.fileToDocument(root, source)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return []*Document{doc}, nil
|
||||
}
|
||||
|
||||
web := NewWebScraper(s.config)
|
||||
err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
default:
|
||||
}
|
||||
|
||||
if d.IsDir() {
|
||||
name := d.Name()
|
||||
if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
relPath := path
|
||||
if rel, relErr := filepath.Rel(root, path); relErr == nil {
|
||||
relPath = rel
|
||||
}
|
||||
normalized := filepath.ToSlash(relPath)
|
||||
if !web.shouldInclude(normalized, source.Include, source.Exclude) {
|
||||
return nil
|
||||
}
|
||||
if !isDocumentationFile(path) {
|
||||
return nil
|
||||
}
|
||||
|
||||
doc, err := s.fileToDocument(path, source)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
docs = append(docs, doc)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return docs, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if files have been modified.
|
||||
func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
// TODO: Check file modification times
|
||||
return false, "", nil
|
||||
if source == nil {
|
||||
return false, "", fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
root := strings.TrimSpace(source.Path)
|
||||
if root == "" {
|
||||
root = strings.TrimSpace(source.URL)
|
||||
}
|
||||
if root == "" {
|
||||
return false, "", fmt.Errorf("path or url is required for local source")
|
||||
}
|
||||
|
||||
h := sha256.New()
|
||||
err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if d.IsDir() {
|
||||
name := d.Name()
|
||||
if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if !isDocumentationFile(path) {
|
||||
return nil
|
||||
}
|
||||
|
||||
info, infoErr := d.Info()
|
||||
if infoErr != nil {
|
||||
return infoErr
|
||||
}
|
||||
fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := hex.EncodeToString(h.Sum(nil))
|
||||
return hash != lastHash, hash, nil
|
||||
}
|
||||
|
||||
func (s *LocalScraper) fileToDocument(path string, source *Source) (*Document, error) {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
content := normalizeLocalContent(string(b), ext)
|
||||
if content == "" {
|
||||
return nil, fmt.Errorf("empty file")
|
||||
}
|
||||
|
||||
title := strings.TrimSuffix(filepath.Base(path), filepath.Ext(path))
|
||||
hash := sha256.Sum256(b)
|
||||
uri := "file://" + filepath.ToSlash(path)
|
||||
|
||||
docType := "local-document"
|
||||
switch ext {
|
||||
case ".md", ".mdx":
|
||||
docType = "local-markdown"
|
||||
case ".txt":
|
||||
docType = "local-text"
|
||||
case ".json", ".yaml", ".yml":
|
||||
docType = "local-data"
|
||||
case ".go", ".js", ".ts", ".tsx", ".py", ".java", ".rs", ".rb", ".php":
|
||||
docType = "local-code"
|
||||
}
|
||||
|
||||
name := source.Name
|
||||
if strings.TrimSpace(name) == "" {
|
||||
name = filepath.Base(filepath.Dir(path))
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(uri),
|
||||
Source: name,
|
||||
Type: docType,
|
||||
Title: title,
|
||||
Content: content,
|
||||
URL: uri,
|
||||
Metadata: map[string]interface{}{
|
||||
"path": path,
|
||||
"size": len(b),
|
||||
},
|
||||
Hash: hex.EncodeToString(hash[:]),
|
||||
Timestamp: time.Now(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func normalizeLocalContent(content, ext string) string {
|
||||
content = strings.TrimSpace(content)
|
||||
if content == "" {
|
||||
return ""
|
||||
}
|
||||
|
||||
switch ext {
|
||||
case ".md", ".mdx":
|
||||
content = stripMarkdownFrontmatter(content)
|
||||
content = stripMDXPreamble(content)
|
||||
}
|
||||
|
||||
// Collapse excessive blank lines to reduce indexing noise.
|
||||
content = reLocalBlankLines.ReplaceAllString(content, "\n\n")
|
||||
return strings.TrimSpace(content)
|
||||
}
|
||||
|
||||
func stripMarkdownFrontmatter(content string) string {
|
||||
if !strings.HasPrefix(content, "---\n") && !strings.HasPrefix(content, "---\r\n") {
|
||||
return content
|
||||
}
|
||||
|
||||
trimmed := reFrontMatterBlock.ReplaceAllString(content, "")
|
||||
return trimmed
|
||||
}
|
||||
|
||||
// stripMDXPreamble drops the leading run of lines that are blank or that,
// once trimmed, start with "import " or "export ". Everything from the first
// other line onward is returned unchanged; if there is no such line the
// result is "".
func stripMDXPreamble(content string) string {
	lines := strings.Split(content, "\n")

	// start is the index of the first line that belongs to the body.
	start := len(lines)
	for idx, raw := range lines {
		trimmed := strings.TrimSpace(raw)
		isPreamble := trimmed == "" ||
			strings.HasPrefix(trimmed, "import ") ||
			strings.HasPrefix(trimmed, "export ")
		if !isPreamble {
			start = idx
			break
		}
	}

	return strings.Join(lines[start:], "\n")
}
|
||||
|
||||
// isDocumentationFile reports whether path has one of the file extensions
// this scraper indexes. The comparison is case-insensitive.
func isDocumentationFile(path string) bool {
	switch strings.ToLower(filepath.Ext(path)) {
	case ".md", ".mdx", ".txt", ".rst", ".adoc", ".html": // prose and markup
		return true
	case ".json", ".yaml", ".yml": // structured data
		return true
	case ".go", ".py", ".js", ".ts", ".tsx", ".java", ".rs", ".rb", ".php": // source code
		return true
	}
	return false
}
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestLocalScraperScrapeDirectory(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
if err := os.WriteFile(filepath.Join(tmp, "README.md"), []byte("# Demo\n\nhello docs"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "notes.txt"), []byte("notes"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bin.bin"), []byte{0x00, 0x01}, 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
|
||||
docs, err := s.Scrape(context.Background(), &Source{Name: "local", Type: SourceTypeLocal, Path: tmp})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(docs) < 2 {
|
||||
t.Fatalf("expected at least 2 docs, got %d", len(docs))
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalScraperDetectChanges(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
file := filepath.Join(tmp, "README.md")
|
||||
if err := os.WriteFile(file, []byte("v1"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
|
||||
src := &Source{Name: "local", Type: SourceTypeLocal, Path: tmp}
|
||||
|
||||
changed, hash1, err := s.DetectChanges(context.Background(), src, "")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !changed || hash1 == "" {
|
||||
t.Fatalf("expected first detect to change with non-empty hash, changed=%v hash=%q", changed, hash1)
|
||||
}
|
||||
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
if err := os.WriteFile(file, []byte("v2"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
changed, hash2, err := s.DetectChanges(context.Background(), src, hash1)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !changed {
|
||||
t.Fatal("expected change after file update")
|
||||
}
|
||||
if hash1 == hash2 {
|
||||
t.Fatal("expected hash to change")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalScraper_StripsFrontmatterAndMDXPreamble(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
path := filepath.Join(tmp, "doc.mdx")
|
||||
content := `---
|
||||
title: My Doc
|
||||
slug: /my-doc
|
||||
---
|
||||
|
||||
import { Component } from "x"
|
||||
export const meta = {}
|
||||
|
||||
# Heading
|
||||
|
||||
Actual documentation body.
|
||||
`
|
||||
if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
s := NewLocalScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
|
||||
doc, err := s.fileToDocument(path, &Source{Name: "local", Type: SourceTypeLocal, Path: tmp})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if strings.Contains(doc.Content, "slug: /my-doc") {
|
||||
t.Fatalf("expected frontmatter to be stripped, got: %q", doc.Content)
|
||||
}
|
||||
if strings.Contains(doc.Content, "import { Component }") {
|
||||
t.Fatalf("expected MDX preamble to be stripped, got: %q", doc.Content)
|
||||
}
|
||||
if !strings.Contains(doc.Content, "Actual documentation body.") {
|
||||
t.Fatalf("expected markdown body in content, got: %q", doc.Content)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,402 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Limits applied to local search requests and responses.
const (
	// defaultLocalSearchLimit is the result count used when a source does not
	// request a positive limit.
	defaultLocalSearchLimit = 8

	// maxLocalSearchLimit is the largest result count a source may request.
	maxLocalSearchLimit = 50

	// maxSearchResponseBytes caps how much of a search API response body is read (2 MiB).
	maxSearchResponseBytes = 2 * 1024 * 1024
)
|
||||
|
||||
// LocalSearchScraper scrapes docs from result URLs returned by a local search API.
|
||||
type LocalSearchScraper struct {
|
||||
config *Config
|
||||
client *http.Client
|
||||
web *WebScraper
|
||||
}
|
||||
|
||||
// localSearchResult is one normalized entry decoded from a search API response.
type localSearchResult struct {
	// URL is the address of the result page.
	URL string
	// Title is the result title reported by the search API, if any.
	Title string
	// Snippet is the short text excerpt reported for the result, if any.
	Snippet string
	// Engine names the engine or source that produced the result, if reported.
	Engine string
	// Score is the numeric score reported for the result; zero when absent.
	Score float64
}
|
||||
|
||||
// NewLocalSearchScraper creates a scraper backed by a self-hosted search API.
|
||||
func NewLocalSearchScraper(config *Config) *LocalSearchScraper {
|
||||
baseConfig := &Config{}
|
||||
if config != nil {
|
||||
*baseConfig = *config
|
||||
}
|
||||
if baseConfig.UserAgent == "" {
|
||||
baseConfig.UserAgent = "Devour/1.0 (Local Search Scraper)"
|
||||
}
|
||||
if baseConfig.Timeout <= 0 {
|
||||
baseConfig.Timeout = 30 * time.Second
|
||||
}
|
||||
|
||||
webConfig := *baseConfig
|
||||
webConfig.Concurrency = 1
|
||||
webConfig.MaxDepth = 1
|
||||
|
||||
return &LocalSearchScraper{
|
||||
config: baseConfig,
|
||||
client: &http.Client{Timeout: baseConfig.Timeout},
|
||||
web: NewWebScraper(&webConfig),
|
||||
}
|
||||
}
|
||||
|
||||
// Scrape queries a local search API and scrapes the returned URLs.
|
||||
func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
if strings.TrimSpace(source.URL) == "" {
|
||||
return nil, fmt.Errorf("search API URL is required")
|
||||
}
|
||||
query := strings.TrimSpace(source.Query)
|
||||
if query == "" {
|
||||
return nil, fmt.Errorf("search query is required for localsearch sources (set source.query or --search-query)")
|
||||
}
|
||||
|
||||
limit := clampLocalSearchLimit(source.ResultLimit)
|
||||
results, err := s.search(ctx, source, query, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docs := make([]*Document, 0, limit)
|
||||
seen := make(map[string]bool)
|
||||
var scrapeErrors []string
|
||||
|
||||
for i, result := range results {
|
||||
if ctx.Err() != nil {
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
|
||||
resultURL := stripURLFragment(result.URL)
|
||||
if resultURL == "" || seen[resultURL] {
|
||||
continue
|
||||
}
|
||||
if !domainAllowed(resultURL, source.Domains) {
|
||||
continue
|
||||
}
|
||||
if !s.web.shouldInclude(resultURL, source.Include, source.Exclude) {
|
||||
continue
|
||||
}
|
||||
seen[resultURL] = true
|
||||
|
||||
pageDocs, err := s.web.Scrape(ctx, &Source{
|
||||
Name: source.Name,
|
||||
Type: SourceTypeWeb,
|
||||
URL: resultURL,
|
||||
Include: source.Include,
|
||||
Exclude: source.Exclude,
|
||||
})
|
||||
if err != nil {
|
||||
if len(scrapeErrors) < 20 {
|
||||
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", resultURL, err))
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
for _, doc := range pageDocs {
|
||||
if doc.Metadata == nil {
|
||||
doc.Metadata = make(map[string]interface{})
|
||||
}
|
||||
doc.Metadata["search_api"] = source.URL
|
||||
doc.Metadata["search_query"] = query
|
||||
doc.Metadata["search_rank"] = i + 1
|
||||
if result.Engine != "" {
|
||||
doc.Metadata["search_engine"] = result.Engine
|
||||
}
|
||||
if result.Snippet != "" {
|
||||
doc.Metadata["search_snippet"] = result.Snippet
|
||||
}
|
||||
if result.Score != 0 {
|
||||
doc.Metadata["search_score"] = result.Score
|
||||
}
|
||||
if strings.TrimSpace(doc.Title) == "" && strings.TrimSpace(result.Title) != "" {
|
||||
doc.Title = strings.TrimSpace(result.Title)
|
||||
}
|
||||
|
||||
docs = append(docs, doc)
|
||||
}
|
||||
}
|
||||
|
||||
if len(docs) == 0 {
|
||||
if len(scrapeErrors) > 0 {
|
||||
return nil, fmt.Errorf("local search returned results but page scraping failed: %s", strings.Join(scrapeErrors, "; "))
|
||||
}
|
||||
return nil, fmt.Errorf("local search yielded no usable results for query %q", query)
|
||||
}
|
||||
|
||||
return docs, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if top search results changed.
|
||||
func (s *LocalSearchScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
if source == nil {
|
||||
return false, "", fmt.Errorf("source is required")
|
||||
}
|
||||
query := strings.TrimSpace(source.Query)
|
||||
if query == "" {
|
||||
return false, "", fmt.Errorf("search query is required for localsearch sources")
|
||||
}
|
||||
|
||||
limit := clampLocalSearchLimit(source.ResultLimit)
|
||||
results, err := s.search(ctx, source, query, limit)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
signatures := make([]string, 0, len(results))
|
||||
for _, result := range results {
|
||||
u := stripURLFragment(result.URL)
|
||||
if u == "" {
|
||||
continue
|
||||
}
|
||||
if !domainAllowed(u, source.Domains) {
|
||||
continue
|
||||
}
|
||||
if !s.web.shouldInclude(u, source.Include, source.Exclude) {
|
||||
continue
|
||||
}
|
||||
signatures = append(signatures, fmt.Sprintf("%s|%s|%s|%.6f", u, result.Title, result.Engine, result.Score))
|
||||
}
|
||||
sort.Strings(signatures)
|
||||
|
||||
hash := sha256.Sum256([]byte(strings.Join(signatures, "\n")))
|
||||
currentHash := hex.EncodeToString(hash[:])
|
||||
return currentHash != lastHash, currentHash, nil
|
||||
}
|
||||
|
||||
func (s *LocalSearchScraper) search(ctx context.Context, source *Source, query string, limit int) ([]localSearchResult, error) {
|
||||
searchURL, err := buildLocalSearchURL(source.URL, query, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to build search request: %w", err)
|
||||
}
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
req.Header.Set("Accept", "application/json")
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("search API request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, maxSearchResponseBytes))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed reading search API response: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
msg := strings.TrimSpace(string(body))
|
||||
if len(msg) > 200 {
|
||||
msg = msg[:200]
|
||||
}
|
||||
return nil, fmt.Errorf("search API returned HTTP %d: %s", resp.StatusCode, msg)
|
||||
}
|
||||
|
||||
results, err := decodeLocalSearchResults(body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(results) == 0 {
|
||||
return nil, fmt.Errorf("search API returned no results")
|
||||
}
|
||||
if len(results) > limit {
|
||||
results = results[:limit]
|
||||
}
|
||||
return results, nil
|
||||
}
|
||||
|
||||
func buildLocalSearchURL(rawURL, query string, limit int) (string, error) {
|
||||
u, err := url.Parse(strings.TrimSpace(rawURL))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("invalid search API URL: %w", err)
|
||||
}
|
||||
if u.Scheme == "" || u.Host == "" {
|
||||
return "", fmt.Errorf("invalid search API URL: expected absolute URL, got %q", rawURL)
|
||||
}
|
||||
|
||||
params := u.Query()
|
||||
params.Set("q", query)
|
||||
if params.Get("format") == "" {
|
||||
params.Set("format", "json")
|
||||
}
|
||||
if params.Get("limit") == "" {
|
||||
params.Set("limit", strconv.Itoa(clampLocalSearchLimit(limit)))
|
||||
}
|
||||
u.RawQuery = params.Encode()
|
||||
|
||||
return u.String(), nil
|
||||
}
|
||||
|
||||
func decodeLocalSearchResults(body []byte) ([]localSearchResult, error) {
|
||||
var payload map[string]interface{}
|
||||
if err := json.Unmarshal(body, &payload); err != nil {
|
||||
return nil, fmt.Errorf("search API returned invalid JSON: %w", err)
|
||||
}
|
||||
|
||||
rawResults, ok := payload["results"]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("search API response missing results field")
|
||||
}
|
||||
|
||||
items, ok := rawResults.([]interface{})
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("search API results field is not an array")
|
||||
}
|
||||
|
||||
results := make([]localSearchResult, 0, len(items))
|
||||
for _, item := range items {
|
||||
record, ok := item.(map[string]interface{})
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
resultURL := pickString(record, "url", "link", "href")
|
||||
if strings.TrimSpace(resultURL) == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
results = append(results, localSearchResult{
|
||||
URL: strings.TrimSpace(resultURL),
|
||||
Title: strings.TrimSpace(pickString(record, "title", "name")),
|
||||
Snippet: strings.TrimSpace(pickString(record, "content", "snippet", "description", "text")),
|
||||
Engine: strings.TrimSpace(pickString(record, "engine", "source")),
|
||||
Score: pickFloat(record, "score", "relevance"),
|
||||
})
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// pickString returns the value of the first key in keys that holds a usable
// string in record.
//
// Keys are tried in order. Numeric values are rendered as decimal text. Keys
// that are missing, hold an unsupported type (including null), or hold an empty
// or whitespace-only string are skipped so that a later alias can supply the
// value. It returns "" when no key yields a value.
func pickString(record map[string]interface{}, keys ...string) string {
	for _, key := range keys {
		value, ok := record[key]
		if !ok {
			continue
		}
		switch v := value.(type) {
		case string:
			// A blank value carries no information; fall through to the next alias
			// instead of masking it (e.g. {"url": "", "link": "https://..."}).
			if strings.TrimSpace(v) != "" {
				return v
			}
		case json.Number:
			return v.String()
		case float64:
			return strconv.FormatFloat(v, 'f', -1, 64)
		case int:
			return strconv.Itoa(v)
		}
	}
	return ""
}
|
||||
|
||||
// pickFloat returns the first numeric value found in record under keys.
//
// Keys are tried in order. Go numeric types and json.Number are converted
// directly; strings are whitespace-trimmed and parsed as floats. A key that is
// missing, holds an unsupported type, or holds an unparsable value is skipped.
// It returns 0 when no key yields a number.
func pickFloat(record map[string]interface{}, keys ...string) float64 {
	// convert reports the numeric value of raw and whether one could be derived.
	convert := func(raw interface{}) (float64, bool) {
		switch n := raw.(type) {
		case float64:
			return n, true
		case float32:
			return float64(n), true
		case int:
			return float64(n), true
		case int64:
			return float64(n), true
		case json.Number:
			parsed, err := n.Float64()
			return parsed, err == nil
		case string:
			parsed, err := strconv.ParseFloat(strings.TrimSpace(n), 64)
			return parsed, err == nil
		default:
			return 0, false
		}
	}

	for _, key := range keys {
		raw, present := record[key]
		if !present {
			continue
		}
		if number, ok := convert(raw); ok {
			return number
		}
	}
	return 0
}
|
||||
|
||||
func clampLocalSearchLimit(limit int) int {
|
||||
if limit <= 0 {
|
||||
return defaultLocalSearchLimit
|
||||
}
|
||||
if limit > maxLocalSearchLimit {
|
||||
return maxLocalSearchLimit
|
||||
}
|
||||
return limit
|
||||
}
|
||||
|
||||
// stripURLFragment returns raw, whitespace-trimmed, without its "#fragment"
// part. Input that cannot be parsed as a URL is returned trimmed but otherwise
// unchanged.
func stripURLFragment(raw string) string {
	trimmed := strings.TrimSpace(raw)
	parsed, err := url.Parse(trimmed)
	if err == nil {
		parsed.Fragment = ""
		return parsed.String()
	}
	return trimmed
}
|
||||
|
||||
func domainAllowed(raw string, allowedDomains []string) bool {
|
||||
if len(allowedDomains) == 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
u, err := url.Parse(raw)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
host := strings.ToLower(strings.TrimSpace(u.Hostname()))
|
||||
if host == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, candidate := range allowedDomains {
|
||||
domain := normalizeDomain(candidate)
|
||||
if domain == "" {
|
||||
continue
|
||||
}
|
||||
if host == domain || strings.HasSuffix(host, "."+domain) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// normalizeDomain reduces an allow-list entry to a bare lower-case host name
// suitable for comparison with url.URL.Hostname.
//
// Accepted forms: a full URL ("https://example.com/docs"), a bare host
// ("example.com"), a host with port or path ("localhost:8080",
// "example.com/docs"), a leading-dot or wildcard entry (".example.com",
// "*.example.com"), and IPv6 literals with or without brackets. It returns ""
// for blank input.
func normalizeDomain(raw string) string {
	domain := strings.ToLower(strings.TrimSpace(raw))
	if domain == "" {
		return ""
	}
	if strings.Contains(domain, "://") {
		if parsed, err := url.Parse(domain); err == nil {
			return strings.ToLower(parsed.Hostname())
		}
	}

	// Without a scheme, anything from the first '/', '?' or '#' is not part of the host.
	if cut := strings.IndexAny(domain, "/?#"); cut >= 0 {
		domain = domain[:cut]
	}
	domain = strings.TrimPrefix(domain, "*")
	domain = strings.TrimPrefix(domain, ".")

	switch {
	case strings.HasPrefix(domain, "["):
		// Bracketed IPv6 literal, optionally followed by ":port".
		if end := strings.IndexByte(domain, ']'); end > 0 {
			domain = domain[1:end]
		}
	case strings.Count(domain, ":") == 1:
		// Exactly one colon means host:port; more than one is a bare IPv6
		// literal and must be left intact.
		domain = domain[:strings.IndexByte(domain, ':')]
	}
	return domain
}
|
||||
@@ -0,0 +1,226 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"net/url"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestLocalSearchScraperScrape(t *testing.T) {
|
||||
mux := http.NewServeMux()
|
||||
baseURL := ""
|
||||
|
||||
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
|
||||
if got := r.URL.Query().Get("q"); got != "go http client" {
|
||||
t.Fatalf("expected query go http client, got %q", got)
|
||||
}
|
||||
if got := r.URL.Query().Get("format"); got != "json" {
|
||||
t.Fatalf("expected format=json, got %q", got)
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"results": []map[string]interface{}{
|
||||
{
|
||||
"url": baseURL + "/docs/http-client",
|
||||
"title": "HTTP Client Guide",
|
||||
"content": "How to build an HTTP client in Go",
|
||||
"engine": "searxng",
|
||||
"score": 0.99,
|
||||
},
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
mux.HandleFunc("/docs/http-client", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>HTTP Client Guide</title></head><body><main>` + strings.Repeat("http client docs ", 30) + `</main></body></html>`))
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
baseURL = srv.URL
|
||||
|
||||
s := NewLocalSearchScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
})
|
||||
|
||||
docs, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "local-search",
|
||||
Type: SourceTypeLocalSearch,
|
||||
URL: srv.URL + "/search",
|
||||
Query: "go http client",
|
||||
ResultLimit: 5,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
t.Fatal("expected at least one document")
|
||||
}
|
||||
|
||||
doc := docs[0]
|
||||
if doc.URL != srv.URL+"/docs/http-client" {
|
||||
t.Fatalf("unexpected document URL: %q", doc.URL)
|
||||
}
|
||||
if doc.Metadata["search_query"] != "go http client" {
|
||||
t.Fatalf("expected metadata search_query, got %v", doc.Metadata["search_query"])
|
||||
}
|
||||
if doc.Metadata["search_engine"] != "searxng" {
|
||||
t.Fatalf("expected metadata search_engine=searxng, got %v", doc.Metadata["search_engine"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalSearchScraperDomainFilter(t *testing.T) {
|
||||
mux := http.NewServeMux()
|
||||
baseURL := ""
|
||||
|
||||
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
|
||||
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"results": []map[string]interface{}{
|
||||
{
|
||||
"url": baseURL + "/docs/in-scope",
|
||||
"title": "In Scope",
|
||||
},
|
||||
{
|
||||
"url": "https://example.com/out-of-scope",
|
||||
"title": "Out Scope",
|
||||
},
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
mux.HandleFunc("/docs/in-scope", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>In Scope</title></head><body><main>` + strings.Repeat("scoped docs ", 30) + `</main></body></html>`))
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
baseURL = srv.URL
|
||||
|
||||
parsed, err := url.Parse(srv.URL)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse server URL: %v", err)
|
||||
}
|
||||
|
||||
s := NewLocalSearchScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
})
|
||||
|
||||
docs, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "local-search",
|
||||
Type: SourceTypeLocalSearch,
|
||||
URL: srv.URL + "/search",
|
||||
Query: "scope test",
|
||||
ResultLimit: 10,
|
||||
Domains: []string{parsed.Hostname()},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
t.Fatal("expected at least one in-scope document")
|
||||
}
|
||||
for _, doc := range docs {
|
||||
docURL, parseErr := url.Parse(doc.URL)
|
||||
if parseErr != nil {
|
||||
t.Fatalf("invalid doc URL %q: %v", doc.URL, parseErr)
|
||||
}
|
||||
if docURL.Hostname() != parsed.Hostname() {
|
||||
t.Fatalf("expected only in-scope domain, got %q", doc.URL)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalSearchScraperRequiresQuery(t *testing.T) {
|
||||
s := NewLocalSearchScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
})
|
||||
|
||||
_, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "local-search",
|
||||
Type: SourceTypeLocalSearch,
|
||||
URL: "http://127.0.0.1:8080/search",
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected error when query is missing")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "query") {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalSearchScraperDetectChanges(t *testing.T) {
|
||||
mux := http.NewServeMux()
|
||||
baseURL := ""
|
||||
resultPath := "/docs/one"
|
||||
|
||||
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
|
||||
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"results": []map[string]interface{}{
|
||||
{
|
||||
"url": baseURL + resultPath,
|
||||
"title": "Versioned",
|
||||
"score": 1.0,
|
||||
},
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
baseURL = srv.URL
|
||||
|
||||
s := NewLocalSearchScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
})
|
||||
source := &Source{
|
||||
Name: "local-search",
|
||||
Type: SourceTypeLocalSearch,
|
||||
URL: srv.URL + "/search",
|
||||
Query: "version test",
|
||||
ResultLimit: 3,
|
||||
}
|
||||
|
||||
changed, hash1, err := s.DetectChanges(context.Background(), source, "")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected detect changes error: %v", err)
|
||||
}
|
||||
if !changed {
|
||||
t.Fatal("expected first detect changes call to report changed")
|
||||
}
|
||||
if hash1 == "" {
|
||||
t.Fatal("expected non-empty hash")
|
||||
}
|
||||
|
||||
changed, hash2, err := s.DetectChanges(context.Background(), source, hash1)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected detect changes error: %v", err)
|
||||
}
|
||||
if changed {
|
||||
t.Fatal("expected unchanged results with identical hash")
|
||||
}
|
||||
if hash2 != hash1 {
|
||||
t.Fatalf("expected identical hash, got %q and %q", hash1, hash2)
|
||||
}
|
||||
|
||||
resultPath = "/docs/two"
|
||||
changed, hash3, err := s.DetectChanges(context.Background(), source, hash1)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected detect changes error: %v", err)
|
||||
}
|
||||
if !changed {
|
||||
t.Fatal("expected changed results after search output changed")
|
||||
}
|
||||
if hash3 == hash1 {
|
||||
t.Fatal("expected hash to change")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,88 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Patterns used by document normalization; compiled once at package load.

// titleNoiseRe matches "added in goN[.M]" and "deprecated" markers, case-insensitively.
var titleNoiseRe = regexp.MustCompile(`(?i)\b(added in go[0-9]+(\.[0-9]+)?|deprecated)\b`)

// titleSpaceRe matches any run of whitespace.
var titleSpaceRe = regexp.MustCompile(`\s+`)

// contentSpaceRe matches spaces and tabs directly before a newline.
var contentSpaceRe = regexp.MustCompile(`[ \t]+\n`)

// multiNewlineRe matches three or more consecutive newlines.
var multiNewlineRe = regexp.MustCompile(`\n{3,}`)

// nonPrintableTitle matches a single control character.
var nonPrintableTitle = regexp.MustCompile(`[[:cntrl:]]`)
|
||||
|
||||
// NormalizeDocuments applies normalization to a list of scraped documents.
|
||||
func NormalizeDocuments(docs []*Document) []*Document {
|
||||
for _, doc := range docs {
|
||||
NormalizeDocument(doc)
|
||||
}
|
||||
return docs
|
||||
}
|
||||
|
||||
// NormalizeDocument applies cross-scraper output cleanup.
|
||||
func NormalizeDocument(doc *Document) {
|
||||
if doc == nil {
|
||||
return
|
||||
}
|
||||
|
||||
doc.URL = strings.TrimSpace(doc.URL)
|
||||
doc.Type = strings.TrimSpace(doc.Type)
|
||||
doc.Title = normalizeTitle(doc.Title)
|
||||
doc.Content = normalizeContent(doc.Content)
|
||||
|
||||
if doc.Title == "" {
|
||||
doc.Title = inferTitleFromURL(doc.URL)
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeTitle(title string) string {
|
||||
title = strings.ReplaceAll(title, "¶", " ")
|
||||
title = strings.ReplaceAll(title, "_", " ")
|
||||
title = nonPrintableTitle.ReplaceAllString(title, " ")
|
||||
title = titleNoiseRe.ReplaceAllString(title, " ")
|
||||
title = titleSpaceRe.ReplaceAllString(strings.TrimSpace(title), " ")
|
||||
|
||||
// Remove dangling punctuation if it became a suffix after cleanup.
|
||||
title = strings.TrimSpace(strings.Trim(title, "-:.,;"))
|
||||
return title
|
||||
}
|
||||
|
||||
func normalizeContent(content string) string {
|
||||
content = strings.ReplaceAll(content, "\r\n", "\n")
|
||||
content = strings.TrimSpace(content)
|
||||
content = contentSpaceRe.ReplaceAllString(content, "\n")
|
||||
content = multiNewlineRe.ReplaceAllString(content, "\n\n")
|
||||
return content
|
||||
}
|
||||
|
||||
func inferTitleFromURL(rawURL string) string {
|
||||
if rawURL == "" {
|
||||
return "Documentation"
|
||||
}
|
||||
|
||||
u, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return "Documentation"
|
||||
}
|
||||
|
||||
base := path.Base(strings.Trim(u.Path, "/"))
|
||||
if base == "" || base == "." || base == "/" {
|
||||
if u.Host != "" {
|
||||
return u.Host
|
||||
}
|
||||
return "Documentation"
|
||||
}
|
||||
|
||||
base = strings.TrimSuffix(base, ".html")
|
||||
base = strings.ReplaceAll(base, "-", " ")
|
||||
base = strings.ReplaceAll(base, "_", " ")
|
||||
base = titleSpaceRe.ReplaceAllString(strings.TrimSpace(base), " ")
|
||||
if base == "" {
|
||||
return "Documentation"
|
||||
}
|
||||
return base
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
package scraper
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestNormalizeDocument_TitleCleanup(t *testing.T) {
|
||||
doc := &Document{
|
||||
Title: "http.type CloseNotifier ¶ deprecated added in go1.1",
|
||||
Content: "line 1 \n\n\nline 2",
|
||||
URL: "https://pkg.go.dev/net/http#CloseNotifier",
|
||||
}
|
||||
|
||||
NormalizeDocument(doc)
|
||||
|
||||
if doc.Title != "http.type CloseNotifier" {
|
||||
t.Fatalf("unexpected normalized title: %q", doc.Title)
|
||||
}
|
||||
if doc.Content != "line 1\n\nline 2" {
|
||||
t.Fatalf("unexpected normalized content: %q", doc.Content)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeDocument_InferTitle(t *testing.T) {
|
||||
doc := &Document{
|
||||
Title: "",
|
||||
URL: "https://kotlinlang.org/docs/regex.html",
|
||||
}
|
||||
|
||||
NormalizeDocument(doc)
|
||||
|
||||
if doc.Title != "regex" {
|
||||
t.Fatalf("expected inferred title 'regex', got %q", doc.Title)
|
||||
}
|
||||
}
|
||||
+316
-9
@@ -2,30 +2,337 @@ package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// OpenAPIScraper parses OpenAPI/Swagger specifications.
|
||||
type OpenAPIScraper struct {
|
||||
config *Config
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
// NewOpenAPIScraper creates a new OpenAPI scraper.
|
||||
func NewOpenAPIScraper(config *Config) *OpenAPIScraper {
|
||||
return &OpenAPIScraper{config: config}
|
||||
timeout := 30 * time.Second
|
||||
if config != nil && config.Timeout > 0 {
|
||||
timeout = config.Timeout
|
||||
}
|
||||
return &OpenAPIScraper{
|
||||
config: config,
|
||||
client: &http.Client{Timeout: timeout},
|
||||
}
|
||||
}
|
||||
|
||||
// Scrape fetches and parses an OpenAPI specification.
|
||||
func (s *OpenAPIScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
// TODO: Implement OpenAPI parsing
|
||||
// 1. Fetch spec from URL
|
||||
// 2. Parse endpoints, schemas, descriptions
|
||||
// 3. Create documents per endpoint
|
||||
// 4. Include authentication, parameters
|
||||
return nil, nil
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
raw, specURL, err := s.readSpec(ctx, source)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
spec, err := parseOpenAPISpec(raw)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docs := make([]*Document, 0)
|
||||
mainContent := buildMainSpecContent(spec)
|
||||
docs = append(docs, &Document{
|
||||
ID: generateDocID(specURL + "#openapi"),
|
||||
Source: coalesceSourceName(source.Name, "openapi"),
|
||||
Type: "openapi-spec",
|
||||
Title: spec.Info.Title,
|
||||
Content: mainContent,
|
||||
URL: specURL,
|
||||
Metadata: map[string]interface{}{
|
||||
"openapi": spec.Version,
|
||||
"servers": spec.Servers,
|
||||
},
|
||||
Hash: hashBytes(raw),
|
||||
Timestamp: time.Now(),
|
||||
})
|
||||
|
||||
paths := make([]string, 0, len(spec.Paths))
|
||||
for path := range spec.Paths {
|
||||
paths = append(paths, path)
|
||||
}
|
||||
sort.Strings(paths)
|
||||
|
||||
for _, p := range paths {
|
||||
opMap := spec.Paths[p]
|
||||
methods := make([]string, 0, len(opMap))
|
||||
for m := range opMap {
|
||||
methods = append(methods, strings.ToUpper(m))
|
||||
}
|
||||
sort.Strings(methods)
|
||||
|
||||
for _, method := range methods {
|
||||
op := opMap[strings.ToLower(method)]
|
||||
if op == nil {
|
||||
continue
|
||||
}
|
||||
title := strings.TrimSpace(op.Summary)
|
||||
if title == "" {
|
||||
title = fmt.Sprintf("%s %s", method, p)
|
||||
}
|
||||
content := buildOperationContent(method, p, op)
|
||||
docURL := fmt.Sprintf("%s#%s-%s", specURL, strings.ToLower(method), sanitizeFragment(p))
|
||||
docs = append(docs, &Document{
|
||||
ID: generateDocID(docURL),
|
||||
Source: coalesceSourceName(source.Name, "openapi"),
|
||||
Type: "openapi-operation",
|
||||
Title: title,
|
||||
Content: content,
|
||||
URL: docURL,
|
||||
Metadata: map[string]interface{}{
|
||||
"method": method,
|
||||
"path": p,
|
||||
"operation_id": op.OperationID,
|
||||
},
|
||||
Hash: hashString(content),
|
||||
Timestamp: time.Now(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return docs, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if the spec has been updated.
|
||||
func (s *OpenAPIScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
// TODO: Check spec content hash
|
||||
return false, "", nil
|
||||
if source == nil {
|
||||
return false, "", fmt.Errorf("source is required")
|
||||
}
|
||||
raw, _, err := s.readSpec(ctx, source)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
hash := hashBytes(raw)
|
||||
return hash != lastHash, hash, nil
|
||||
}
|
||||
|
||||
func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte, string, error) {
|
||||
rawPath := strings.TrimSpace(source.URL)
|
||||
if rawPath == "" {
|
||||
rawPath = strings.TrimSpace(source.Path)
|
||||
}
|
||||
if rawPath == "" {
|
||||
return nil, "", fmt.Errorf("openapi source requires url or path")
|
||||
}
|
||||
|
||||
if strings.HasPrefix(rawPath, "http://") || strings.HasPrefix(rawPath, "https://") {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawPath, nil)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
if s.config != nil && strings.TrimSpace(s.config.UserAgent) != "" {
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
}
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return nil, "", fmt.Errorf("openapi fetch failed: HTTP %d", resp.StatusCode)
|
||||
}
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 10<<20))
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
return body, rawPath, nil
|
||||
}
|
||||
|
||||
b, err := os.ReadFile(rawPath)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
return b, "file://" + rawPath, nil
|
||||
}
|
||||
|
||||
// openAPISpec is the subset of an OpenAPI/Swagger document that the scraper reads.
type openAPISpec struct {
	// Version holds the "openapi" field.
	Version string `json:"openapi" yaml:"openapi"`
	// Swagger holds the "swagger" field.
	Swagger string          `json:"swagger" yaml:"swagger"`
	Info    openAPIInfo     `json:"info" yaml:"info"`
	Servers []openAPIServer `json:"servers" yaml:"servers"`
	// Paths maps each path template to its operations.
	Paths map[string]pathItems `json:"paths" yaml:"paths"`
}

// openAPIInfo mirrors the fields read from the specification's "info" object.
type openAPIInfo struct {
	Title       string `json:"title" yaml:"title"`
	Version     string `json:"version" yaml:"version"`
	Description string `json:"description" yaml:"description"`
}

// openAPIServer is one entry of the specification's "servers" list.
type openAPIServer struct {
	URL         string `json:"url" yaml:"url"`
	Description string `json:"description" yaml:"description"`
}

// pathItems maps a path item's keys to operations.
// NOTE(review): every key of the path item is decoded as an operation, so
// non-operation keys with non-object values (e.g. a path-level "parameters"
// array) presumably fail decoding — confirm with a real-world spec.
type pathItems map[string]*openAPIOperation

// openAPIOperation holds the fields read from a single operation object.
type openAPIOperation struct {
	Summary     string                `json:"summary" yaml:"summary"`
	Description string                `json:"description" yaml:"description"`
	OperationID string                `json:"operationId" yaml:"operationId"`
	Parameters  []openAPIParameter    `json:"parameters" yaml:"parameters"`
	Responses   map[string]response   `json:"responses" yaml:"responses"`
	RequestBody map[string]any        `json:"requestBody" yaml:"requestBody"`
	Tags        []string              `json:"tags" yaml:"tags"`
	Deprecated  bool                  `json:"deprecated" yaml:"deprecated"`
	Security    []map[string][]string `json:"security" yaml:"security"`
}

// openAPIParameter describes one parameter of an operation.
type openAPIParameter struct {
	Name string `json:"name" yaml:"name"`
	// In holds the parameter's "in" field.
	In          string `json:"in" yaml:"in"`
	Description string `json:"description" yaml:"description"`
	Required    bool   `json:"required" yaml:"required"`
}

// response holds the description of one response entry of an operation.
type response struct {
	Description string `json:"description" yaml:"description"`
}
|
||||
|
||||
func parseOpenAPISpec(raw []byte) (*openAPISpec, error) {
|
||||
var spec openAPISpec
|
||||
if err := json.Unmarshal(raw, &spec); err != nil {
|
||||
if yamlErr := yaml.Unmarshal(raw, &spec); yamlErr != nil {
|
||||
return nil, fmt.Errorf("invalid openapi content: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
if strings.TrimSpace(spec.Info.Title) == "" {
|
||||
spec.Info.Title = "OpenAPI Specification"
|
||||
}
|
||||
if strings.TrimSpace(spec.Version) == "" {
|
||||
spec.Version = spec.Swagger
|
||||
}
|
||||
if spec.Paths == nil {
|
||||
spec.Paths = map[string]pathItems{}
|
||||
}
|
||||
|
||||
return &spec, nil
|
||||
}
|
||||
|
||||
func buildMainSpecContent(spec *openAPISpec) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "# %s\n\n", spec.Info.Title)
|
||||
if spec.Info.Version != "" {
|
||||
fmt.Fprintf(&b, "- API Version: %s\n", spec.Info.Version)
|
||||
}
|
||||
if spec.Version != "" {
|
||||
fmt.Fprintf(&b, "- OpenAPI: %s\n", spec.Version)
|
||||
}
|
||||
fmt.Fprintf(&b, "- Paths: %d\n", len(spec.Paths))
|
||||
if spec.Info.Description != "" {
|
||||
fmt.Fprintf(&b, "\n%s\n", strings.TrimSpace(spec.Info.Description))
|
||||
}
|
||||
if len(spec.Servers) > 0 {
|
||||
fmt.Fprintf(&b, "\n## Servers\n")
|
||||
for _, s := range spec.Servers {
|
||||
fmt.Fprintf(&b, "- %s", s.URL)
|
||||
if s.Description != "" {
|
||||
fmt.Fprintf(&b, " - %s", s.Description)
|
||||
}
|
||||
fmt.Fprintln(&b)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func buildOperationContent(method, path string, op *openAPIOperation) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "# %s %s\n\n", method, path)
|
||||
if op.Summary != "" {
|
||||
fmt.Fprintf(&b, "%s\n\n", strings.TrimSpace(op.Summary))
|
||||
}
|
||||
if op.Description != "" {
|
||||
fmt.Fprintf(&b, "%s\n\n", strings.TrimSpace(op.Description))
|
||||
}
|
||||
if op.OperationID != "" {
|
||||
fmt.Fprintf(&b, "- Operation ID: `%s`\n", op.OperationID)
|
||||
}
|
||||
if len(op.Tags) > 0 {
|
||||
fmt.Fprintf(&b, "- Tags: %s\n", strings.Join(op.Tags, ", "))
|
||||
}
|
||||
if op.Deprecated {
|
||||
fmt.Fprintln(&b, "- Deprecated: true")
|
||||
}
|
||||
if len(op.Parameters) > 0 {
|
||||
fmt.Fprintln(&b, "\n## Parameters")
|
||||
for _, p := range op.Parameters {
|
||||
req := "optional"
|
||||
if p.Required {
|
||||
req = "required"
|
||||
}
|
||||
fmt.Fprintf(&b, "- `%s` (%s, %s)", p.Name, p.In, req)
|
||||
if p.Description != "" {
|
||||
fmt.Fprintf(&b, ": %s", strings.TrimSpace(p.Description))
|
||||
}
|
||||
fmt.Fprintln(&b)
|
||||
}
|
||||
}
|
||||
if len(op.Responses) > 0 {
|
||||
codes := make([]string, 0, len(op.Responses))
|
||||
for code := range op.Responses {
|
||||
codes = append(codes, code)
|
||||
}
|
||||
sort.Strings(codes)
|
||||
fmt.Fprintln(&b, "\n## Responses")
|
||||
for _, code := range codes {
|
||||
resp := op.Responses[code]
|
||||
fmt.Fprintf(&b, "- `%s`", code)
|
||||
if resp.Description != "" {
|
||||
fmt.Fprintf(&b, ": %s", strings.TrimSpace(resp.Description))
|
||||
}
|
||||
fmt.Fprintln(&b)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// sanitizeFragment turns an API path into a lowercase slug: "/" becomes "-",
// curly braces are removed, and leading/trailing dashes are stripped. A path
// that reduces to nothing is reported as "root".
func sanitizeFragment(path string) string {
	slug := strings.NewReplacer("/", "-", "{", "", "}", "").Replace(strings.ToLower(path))
	if slug = strings.Trim(slug, "-"); slug != "" {
		return slug
	}
	return "root"
}
|
||||
|
||||
// hashBytes returns the SHA-256 digest of b as a lowercase hex string.
func hashBytes(b []byte) string {
	hasher := sha256.New()
	hasher.Write(b) // hash.Hash.Write never returns an error
	return hex.EncodeToString(hasher.Sum(nil))
}
|
||||
|
||||
// hashString returns the SHA-256 digest of s as a lowercase hex string.
func hashString(s string) string {
	digest := sha256.Sum256([]byte(s))
	encoded := make([]byte, hex.EncodedLen(len(digest)))
	hex.Encode(encoded, digest[:])
	return string(encoded)
}
|
||||
|
||||
// coalesceSourceName returns name unless it is empty or whitespace-only, in
// which case fallback is returned. A non-blank name is returned as given,
// without trimming.
func coalesceSourceName(name, fallback string) string {
	if len(strings.TrimSpace(name)) == 0 {
		return fallback
	}
	return name
}
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestOpenAPIScraperScrape(t *testing.T) {
|
||||
spec := `{
|
||||
"openapi": "3.0.0",
|
||||
"info": {"title": "Pet API", "version": "1.0.0"},
|
||||
"paths": {
|
||||
"/pets": {
|
||||
"get": {
|
||||
"summary": "List pets",
|
||||
"operationId": "listPets",
|
||||
"responses": {"200": {"description": "ok"}}
|
||||
}
|
||||
}
|
||||
}
|
||||
}`
|
||||
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte(spec))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewOpenAPIScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
|
||||
docs, err := s.Scrape(context.Background(), &Source{Name: "pet", Type: SourceTypeOpenAPI, URL: srv.URL})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) < 2 {
|
||||
t.Fatalf("expected at least 2 docs, got %d", len(docs))
|
||||
}
|
||||
foundOp := false
|
||||
for _, d := range docs {
|
||||
if strings.Contains(d.Title, "List pets") {
|
||||
foundOp = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundOp {
|
||||
t.Fatal("expected operation document")
|
||||
}
|
||||
}
|
||||
|
||||
func TestOpenAPIScraperDetectChanges(t *testing.T) {
|
||||
spec := `{"openapi":"3.0.0","info":{"title":"API"},"paths":{}}`
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte(spec))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewOpenAPIScraper(&Config{Timeout: 2 * time.Second, UserAgent: "DevourTest"})
|
||||
src := &Source{Name: "api", Type: SourceTypeOpenAPI, URL: srv.URL}
|
||||
changed, hash1, err := s.DetectChanges(context.Background(), src, "")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !changed || hash1 == "" {
|
||||
t.Fatalf("expected changed=true and non-empty hash, changed=%v hash=%q", changed, hash1)
|
||||
}
|
||||
|
||||
changed, _, err = s.DetectChanges(context.Background(), src, hash1)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if changed {
|
||||
t.Fatal("expected no changes when hash matches")
|
||||
}
|
||||
}
|
||||
@@ -5,6 +5,7 @@ func init() {
|
||||
// Additional scrapers can be registered in their own packages
|
||||
RegisterScraper(SourceTypeWeb, func(c *Config) Scraper { return NewWebScraper(c) })
|
||||
RegisterScraper(SourceTypeLocal, func(c *Config) Scraper { return NewLocalScraper(c) })
|
||||
RegisterScraper(SourceTypeLocalSearch, func(c *Config) Scraper { return NewLocalSearchScraper(c) })
|
||||
RegisterScraper(SourceTypeGitHub, func(c *Config) Scraper { return NewGitHubScraper(c) })
|
||||
RegisterScraper(SourceTypeOpenAPI, func(c *Config) Scraper { return NewOpenAPIScraper(c) })
|
||||
}
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
package scraper_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
basescraper "github.com/yourorg/devour/internal/scraper"
|
||||
_ "github.com/yourorg/devour/internal/scraper/external"
|
||||
)
|
||||
|
||||
func TestLanguageScrapersAreRegistered(t *testing.T) {
|
||||
config := &basescraper.Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
}
|
||||
|
||||
supportedDocTypes := []basescraper.SourceType{
|
||||
basescraper.SourceTypeGoDocs,
|
||||
basescraper.SourceTypeRustDocs,
|
||||
basescraper.SourceTypePythonDocs,
|
||||
basescraper.SourceTypeJavaDocs,
|
||||
basescraper.SourceTypeSpringDocs,
|
||||
basescraper.SourceTypeTSDocs,
|
||||
basescraper.SourceTypeReactDocs,
|
||||
basescraper.SourceTypeVueDocs,
|
||||
basescraper.SourceTypeNuxtDocs,
|
||||
basescraper.SourceTypeMCPDocs,
|
||||
basescraper.SourceTypeDockerDocs,
|
||||
basescraper.SourceTypeCloudflareDocs,
|
||||
basescraper.SourceTypeAstroDocs,
|
||||
}
|
||||
|
||||
for _, sourceType := range supportedDocTypes {
|
||||
t.Run(string(sourceType), func(t *testing.T) {
|
||||
s := basescraper.NewScraper(sourceType, config)
|
||||
if s == nil {
|
||||
t.Fatalf("NewScraper(%q) returned nil; scraper was not registered", sourceType)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetectSourceType_ForSupportedDocsHosts(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
expected basescraper.SourceType
|
||||
}{
|
||||
{"https://pkg.go.dev/net/http", basescraper.SourceTypeGoDocs},
|
||||
{"https://docs.rs/tokio/latest/tokio/", basescraper.SourceTypeRustDocs},
|
||||
{"https://docs.python.org/3/library/asyncio.html", basescraper.SourceTypePythonDocs},
|
||||
{"https://docs.oracle.com/javase/8/docs/api/java/util/List.html", basescraper.SourceTypeJavaDocs},
|
||||
{"https://docs.spring.io/spring-boot/docs/current/reference/htmlsingle/", basescraper.SourceTypeSpringDocs},
|
||||
{"https://www.typescriptlang.org/docs/handbook/2/basic-types.html", basescraper.SourceTypeTSDocs},
|
||||
{"https://react.dev/reference/react/hooks", basescraper.SourceTypeReactDocs},
|
||||
{"https://vuejs.org/guide/introduction.html", basescraper.SourceTypeVueDocs},
|
||||
{"https://nuxt.com/docs/guide/directory-structure", basescraper.SourceTypeNuxtDocs},
|
||||
{"https://docs.docker.com/compose", basescraper.SourceTypeDockerDocs},
|
||||
{"https://hub.docker.com/mcp/server/github", basescraper.SourceTypeMCPDocs},
|
||||
{"https://developers.cloudflare.com/workers", basescraper.SourceTypeCloudflareDocs},
|
||||
{"https://docs.astro.build/en/guides/components/", basescraper.SourceTypeAstroDocs},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.input, func(t *testing.T) {
|
||||
got := basescraper.DetectSourceType(tt.input)
|
||||
if got != tt.expected {
|
||||
t.Fatalf("DetectSourceType(%q) = %q, want %q", tt.input, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -28,7 +28,7 @@ func (r *ScraperRegistry) Register(sourceType SourceType, constructor ScraperCon
|
||||
// Create creates a scraper instance
|
||||
func (r *ScraperRegistry) Create(sourceType SourceType, config *Config) Scraper {
|
||||
if constructor, exists := r.constructors[sourceType]; exists {
|
||||
return constructor(config)
|
||||
return wrapScraper(constructor(config))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ const (
|
||||
SourceTypeGitHub SourceType = "github"
|
||||
SourceTypeOpenAPI SourceType = "openapi"
|
||||
SourceTypeLocal SourceType = "local"
|
||||
SourceTypeLocalSearch SourceType = "localsearch"
|
||||
SourceTypeGoDocs SourceType = "godocs"
|
||||
SourceTypeRustDocs SourceType = "rustdocs"
|
||||
SourceTypePythonDocs SourceType = "pythondocs"
|
||||
@@ -34,15 +35,18 @@ const (
|
||||
|
||||
// Source represents a documentation source to scrape.
|
||||
type Source struct {
|
||||
Name string `yaml:"name"`
|
||||
Type SourceType `yaml:"type"`
|
||||
URL string `yaml:"url,omitempty"`
|
||||
Repo string `yaml:"repo,omitempty"`
|
||||
Branch string `yaml:"branch,omitempty"`
|
||||
Path string `yaml:"path,omitempty"`
|
||||
Include []string `yaml:"include,omitempty"`
|
||||
Exclude []string `yaml:"exclude,omitempty"`
|
||||
Schedule string `yaml:"schedule,omitempty"`
|
||||
Name string `yaml:"name"`
|
||||
Type SourceType `yaml:"type"`
|
||||
URL string `yaml:"url,omitempty"`
|
||||
Query string `yaml:"query,omitempty"`
|
||||
ResultLimit int `yaml:"result_limit,omitempty"`
|
||||
Domains []string `yaml:"domains,omitempty"`
|
||||
Repo string `yaml:"repo,omitempty"`
|
||||
Branch string `yaml:"branch,omitempty"`
|
||||
Path string `yaml:"path,omitempty"`
|
||||
Include []string `yaml:"include,omitempty"`
|
||||
Exclude []string `yaml:"exclude,omitempty"`
|
||||
Schedule string `yaml:"schedule,omitempty"`
|
||||
}
|
||||
|
||||
// Document represents a scraped document.
|
||||
@@ -113,6 +117,11 @@ func DetectSourceType(input string) SourceType {
|
||||
}
|
||||
}
|
||||
|
||||
// MCP servers are hosted under Docker Hub paths.
|
||||
if strings.Contains(input, "hub.docker.com/mcp/") {
|
||||
return SourceTypeMCPDocs
|
||||
}
|
||||
|
||||
// Check for OpenAPI specs
|
||||
if strings.HasSuffix(input, ".json") || strings.HasSuffix(input, ".yaml") || strings.HasSuffix(input, ".yml") {
|
||||
if strings.Contains(strings.ToLower(input), "openapi") || strings.Contains(strings.ToLower(input), "swagger") {
|
||||
|
||||
+191
-7
@@ -6,8 +6,10 @@ import (
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/gocolly/colly/v2"
|
||||
@@ -27,17 +29,42 @@ func NewWebScraper(config *Config) *WebScraper {
|
||||
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
visited := make(map[string]bool)
|
||||
scheduled := make(map[string]bool)
|
||||
contentHashes := make(map[string]bool)
|
||||
var mu sync.Mutex
|
||||
var scrapeErrors []string
|
||||
|
||||
// Parse base URL for domain restrictions
|
||||
baseURL, err := url.Parse(source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid URL: %w", err)
|
||||
}
|
||||
allowedDomain := baseURL.Hostname()
|
||||
if allowedDomain == "" {
|
||||
allowedDomain = baseURL.Host
|
||||
}
|
||||
|
||||
maxDepth := s.config.MaxDepth
|
||||
if maxDepth <= 0 {
|
||||
maxDepth = 2
|
||||
}
|
||||
maxPages := s.config.Concurrency * 40
|
||||
if maxPages < 20 {
|
||||
maxPages = 20
|
||||
}
|
||||
if maxDepth <= 1 && maxPages > 30 {
|
||||
maxPages = 30
|
||||
}
|
||||
if maxPages > 300 {
|
||||
maxPages = 300
|
||||
}
|
||||
scopePrefix := pathScopePrefix(baseURL.Path)
|
||||
scopeLeaf := pathScopeLeaf(baseURL.Path)
|
||||
|
||||
// Create Colly collector
|
||||
c := colly.NewCollector(
|
||||
colly.AllowedDomains(baseURL.Host),
|
||||
colly.MaxDepth(s.config.MaxDepth),
|
||||
colly.AllowedDomains(allowedDomain),
|
||||
colly.MaxDepth(maxDepth),
|
||||
colly.Async(true),
|
||||
colly.UserAgent(s.config.UserAgent),
|
||||
)
|
||||
@@ -65,18 +92,40 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
|
||||
// Handle errors
|
||||
c.OnError(func(r *colly.Response, err error) {
|
||||
fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
|
||||
errText := strings.ToLower(err.Error())
|
||||
if strings.Contains(errText, "already visited") {
|
||||
return
|
||||
}
|
||||
reqURL := source.URL
|
||||
if r != nil && r.Request != nil && r.Request.URL != nil {
|
||||
reqURL = r.Request.URL.String()
|
||||
}
|
||||
mu.Lock()
|
||||
if len(scrapeErrors) < 20 {
|
||||
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err))
|
||||
}
|
||||
mu.Unlock()
|
||||
})
|
||||
|
||||
// Extract content from pages
|
||||
c.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
pageURL := e.Request.URL.String()
|
||||
if !withinScope(e.Request.URL, baseURL, scopePrefix, scopeLeaf) {
|
||||
return
|
||||
}
|
||||
|
||||
// Skip if already visited
|
||||
mu.Lock()
|
||||
if visited[pageURL] {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
if len(visited) >= maxPages {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
visited[pageURL] = true
|
||||
mu.Unlock()
|
||||
|
||||
// Check include/exclude patterns
|
||||
if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
|
||||
@@ -99,6 +148,13 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
|
||||
// Generate hash for change detection
|
||||
hash := s.generateHash(content)
|
||||
mu.Lock()
|
||||
if contentHashes[hash] {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
contentHashes[hash] = true
|
||||
mu.Unlock()
|
||||
|
||||
// Extract metadata
|
||||
metadata := map[string]interface{}{
|
||||
@@ -120,30 +176,74 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
documents = append(documents, doc)
|
||||
mu.Unlock()
|
||||
})
|
||||
|
||||
// Follow links
|
||||
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||
link := e.Attr("href")
|
||||
absoluteURL := e.Request.AbsoluteURL(link)
|
||||
|
||||
// Skip if already visited
|
||||
if visited[absoluteURL] {
|
||||
if absoluteURL == "" {
|
||||
return
|
||||
}
|
||||
|
||||
linkURL, err := url.Parse(absoluteURL)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if linkURL.Scheme != "http" && linkURL.Scheme != "https" {
|
||||
return
|
||||
}
|
||||
if !withinScope(linkURL, baseURL, scopePrefix, scopeLeaf) {
|
||||
return
|
||||
}
|
||||
|
||||
// Skip if already visited
|
||||
mu.Lock()
|
||||
if visited[absoluteURL] {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
if len(visited) >= maxPages {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
mu.Unlock()
|
||||
|
||||
// Check include/exclude patterns
|
||||
if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
|
||||
return
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
if scheduled[absoluteURL] {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
if len(scheduled) >= maxPages {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
scheduled[absoluteURL] = true
|
||||
mu.Unlock()
|
||||
|
||||
if err := c.Visit(absoluteURL); err != nil {
|
||||
fmt.Printf("Error visiting %s: %v\n", absoluteURL, err)
|
||||
errText := strings.ToLower(err.Error())
|
||||
if strings.Contains(errText, "already visited") {
|
||||
return
|
||||
}
|
||||
mu.Lock()
|
||||
if len(scrapeErrors) < 20 {
|
||||
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err))
|
||||
}
|
||||
mu.Unlock()
|
||||
}
|
||||
})
|
||||
|
||||
// Start scraping
|
||||
scheduled[source.URL] = true
|
||||
if err := c.Visit(source.URL); err != nil {
|
||||
return nil, fmt.Errorf("failed to start scraping: %w", err)
|
||||
}
|
||||
@@ -151,6 +251,16 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
// Wait for async scraping to complete
|
||||
c.Wait()
|
||||
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
|
||||
if len(documents) == 0 {
|
||||
if len(scrapeErrors) > 0 {
|
||||
return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; "))
|
||||
}
|
||||
return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
@@ -283,6 +393,25 @@ func (s *WebScraper) generateHash(content string) string {
|
||||
|
||||
// cleanText removes extra whitespace and normalizes text.
|
||||
func cleanText(text string) string {
|
||||
noisePhrases := []string{
|
||||
"table of contents",
|
||||
"in this article",
|
||||
"additional resources",
|
||||
"feedback",
|
||||
"collaborate with us on github",
|
||||
"copyright",
|
||||
"all rights reserved",
|
||||
"privacy policy",
|
||||
"terms of service",
|
||||
"sign in",
|
||||
"skip to main content",
|
||||
"ask learn",
|
||||
}
|
||||
for _, phrase := range noisePhrases {
|
||||
re := regexp.MustCompile(`(?i)` + regexp.QuoteMeta(phrase))
|
||||
text = re.ReplaceAllString(text, " ")
|
||||
}
|
||||
|
||||
// Replace multiple whitespace with single space
|
||||
re := regexp.MustCompile(`\s+`)
|
||||
text = re.ReplaceAllString(text, " ")
|
||||
@@ -292,3 +421,58 @@ func cleanText(text string) string {
|
||||
|
||||
return text
|
||||
}
|
||||
|
||||
// pathScopePrefix derives the path prefix a crawl started at rawPath should
// stay under.
//
// The path is cleaned first. An empty or root path yields "" (no restriction).
// Otherwise the result is the parent directory of the cleaned path, except for
// root-level entries (parent "/"), which are returned whole so the crawler
// stays scoped to that entry instead of widening to the entire site.
//
// The previous version branched on whether the last segment contained a ".",
// but both branches executed the same statements; they are merged here with no
// change in results. The `clean == ""` comparison was also dropped because
// path.Clean never returns an empty string.
func pathScopePrefix(rawPath string) string {
	clean := path.Clean(rawPath)
	if clean == "." || clean == "/" {
		return ""
	}
	if dir := path.Dir(clean); dir != "/" {
		return dir
	}
	return clean
}
|
||||
|
||||
// pathScopeLeaf returns the final segment of rawPath when that segment looks
// like a file (it contains a "."), and "" otherwise, including for empty and
// root paths.
func pathScopeLeaf(rawPath string) string {
	switch clean := path.Clean(rawPath); clean {
	case ".", "/", "":
		return ""
	default:
		if leaf := path.Base(clean); strings.Contains(leaf, ".") {
			return leaf
		}
		return ""
	}
}
|
||||
|
||||
// withinScope reports whether target may be crawled for a scrape rooted at
// base.
//
// target must be on the same host as base (case-insensitive). With an empty
// prefix every path on that host is in scope. Otherwise target's path must be
// the prefix itself or lie beneath it, or — when leaf is non-empty — its last
// segment must equal leaf (this keeps a redirected copy of the start page in
// scope even if it moved to another directory).
//
// The prefix is matched on whole path segments: "/docs" admits "/docs" and
// "/docs/intro" but not "/docs-old/intro", which a raw string-prefix test
// would wrongly accept.
func withinScope(target, base *url.URL, prefix, leaf string) bool {
	if target == nil || base == nil {
		return false
	}
	if !strings.EqualFold(target.Hostname(), base.Hostname()) {
		return false
	}
	if prefix == "" {
		return true
	}

	targetPath := target.Path
	if targetPath == "" {
		targetPath = "/"
	}

	// Appending "/" to the prefix forces the match to end on a segment
	// boundary; trimming first keeps a prefix that already ends in "/" working.
	if targetPath == prefix || strings.HasPrefix(targetPath, strings.TrimSuffix(prefix, "/")+"/") {
		return true
	}

	segments := strings.Split(strings.TrimRight(targetPath, "/"), "/")
	return leaf != "" && segments[len(segments)-1] == leaf
}
|
||||
|
||||
@@ -0,0 +1,132 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestWebScraper_ReturnsErrorWhenNothingFetched(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.NotFound(w, r)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewWebScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
Concurrency: 1,
|
||||
MaxDepth: 1,
|
||||
})
|
||||
|
||||
_, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "missing",
|
||||
Type: SourceTypeWeb,
|
||||
URL: srv.URL + "/missing",
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected error when web scrape yields no documents")
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>Empty</title></head><body><p>tiny</p></body></html>`))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewWebScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
Concurrency: 1,
|
||||
MaxDepth: 1,
|
||||
})
|
||||
|
||||
_, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "empty",
|
||||
Type: SourceTypeWeb,
|
||||
URL: srv.URL,
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected error when page has no extractable docs")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "extracted no documents") {
|
||||
t.Fatalf("unexpected error message: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebScraper_AllowsRedirectedDocumentPath(t *testing.T) {
|
||||
content := strings.Repeat("ruby docs content ", 30)
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch r.URL.Path {
|
||||
case "/core/Regexp.html":
|
||||
http.Redirect(w, r, "/3.4.1/Regexp.html", http.StatusFound)
|
||||
case "/3.4.1/Regexp.html":
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>Regexp</title></head><body><main>` + content + `</main></body></html>`))
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewWebScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
Concurrency: 1,
|
||||
MaxDepth: 1,
|
||||
})
|
||||
|
||||
docs, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "ruby",
|
||||
Type: SourceTypeWeb,
|
||||
URL: srv.URL + "/core/Regexp.html",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
t.Fatal("expected redirected page to be scraped")
|
||||
}
|
||||
if !strings.Contains(docs[0].URL, "/3.4.1/Regexp.html") {
|
||||
t.Fatalf("expected final redirected URL, got %q", docs[0].URL)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebScraper_GlobalWrapperNormalizesOutput(t *testing.T) {
|
||||
content := strings.Repeat("docs content ", 20)
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>Regex Guide ¶ deprecated</title></head><body><main>` + content + `</main></body></html>`))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewScraper(SourceTypeWeb, &Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
Concurrency: 1,
|
||||
MaxDepth: 1,
|
||||
})
|
||||
if s == nil {
|
||||
t.Fatal("expected web scraper")
|
||||
}
|
||||
|
||||
docs, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "test",
|
||||
Type: SourceTypeWeb,
|
||||
URL: srv.URL,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
t.Fatal("expected at least one document")
|
||||
}
|
||||
if docs[0].Title != "Regex Guide" {
|
||||
t.Fatalf("expected normalized title, got %q", docs[0].Title)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,98 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// wrappedScraper adds retry and normalization behavior for all scraper implementations.
|
||||
type wrappedScraper struct {
|
||||
inner Scraper
|
||||
}
|
||||
|
||||
func wrapScraper(inner Scraper) Scraper {
|
||||
if inner == nil {
|
||||
return nil
|
||||
}
|
||||
return &wrappedScraper{inner: inner}
|
||||
}
|
||||
|
||||
func (w *wrappedScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
retries := 2
|
||||
delay := 300 * time.Millisecond
|
||||
|
||||
var lastErr error
|
||||
for attempt := 0; attempt <= retries; attempt++ {
|
||||
docs, err := w.inner.Scrape(ctx, source)
|
||||
if err == nil {
|
||||
return NormalizeDocuments(docs), nil
|
||||
}
|
||||
lastErr = err
|
||||
|
||||
// One fallback: add trailing slash for doc sites when URL path looks page-like.
|
||||
if attempt == 0 && strings.HasPrefix(source.URL, "http") && !strings.HasSuffix(source.URL, "/") {
|
||||
alt := *source
|
||||
alt.URL = source.URL + "/"
|
||||
docs, altErr := w.inner.Scrape(ctx, &alt)
|
||||
if altErr == nil {
|
||||
return NormalizeDocuments(docs), nil
|
||||
}
|
||||
}
|
||||
|
||||
if attempt < retries && isRetriableScrapeError(err) {
|
||||
if !sleepWithContext(ctx, delay) {
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
|
||||
return nil, lastErr
|
||||
}
|
||||
|
||||
func (w *wrappedScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
return w.inner.DetectChanges(ctx, source, lastHash)
|
||||
}
|
||||
|
||||
// isRetriableScrapeError reports whether err looks transient. It is true when
// the lowercased error text contains one of a fixed set of markers (timeouts,
// resets, EOF, HTTP 429/500/502/503/504) or when the error chain contains a
// net.Error. A nil error is never retriable.
func isRetriableScrapeError(err error) bool {
	if err == nil {
		return false
	}

	msg := strings.ToLower(err.Error())
	for _, marker := range []string{
		"timeout",
		"temporarily unavailable",
		"connection reset",
		"eof",
		"http 429",
		"http 500",
		"http 502",
		"http 503",
		"http 504",
	} {
		if strings.Contains(msg, marker) {
			return true
		}
	}

	var asNet net.Error
	return errors.As(err, &asNet)
}
|
||||
|
||||
// sleepWithContext blocks for d or until ctx is done, whichever comes first.
// It returns true when the full duration elapsed and false when the context
// ended the wait.
func sleepWithContext(ctx context.Context, d time.Duration) bool {
	timer := time.NewTimer(d)
	defer timer.Stop()

	select {
	case <-timer.C:
		return true
	case <-ctx.Done():
		return false
	}
}
|
||||
@@ -0,0 +1,45 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// flakyStubScraper is a test double whose Scrape can be made to fail on its
// first call.
type flakyStubScraper struct {
	failFirst bool // when true, the first Scrape call returns an error
	calls     int  // number of Scrape calls made so far
}
|
||||
|
||||
func (f *flakyStubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
f.calls++
|
||||
if f.failFirst && f.calls == 1 {
|
||||
return nil, fmt.Errorf("HTTP 503")
|
||||
}
|
||||
return []*Document{
|
||||
{
|
||||
Title: "Example ¶ deprecated",
|
||||
Content: "ok",
|
||||
URL: source.URL,
|
||||
Type: "test",
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (f *flakyStubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
return true, "hash", nil
|
||||
}
|
||||
|
||||
func TestWrappedScraper_RetriesAndNormalizes(t *testing.T) {
|
||||
w := wrapScraper(&flakyStubScraper{failFirst: true})
|
||||
docs, err := w.Scrape(context.Background(), &Source{URL: "https://example.com"})
|
||||
if err != nil {
|
||||
t.Fatalf("expected retry to succeed, got error: %v", err)
|
||||
}
|
||||
if len(docs) != 1 {
|
||||
t.Fatalf("expected 1 document, got %d", len(docs))
|
||||
}
|
||||
if docs[0].Title != "Example" {
|
||||
t.Fatalf("expected normalized title, got %q", docs[0].Title)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,528 @@
|
||||
package search
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/internal/config"
|
||||
)
|
||||
|
||||
// Engine holds the directories and settings used to build the lexical search
// index. NewEngine fills it from the application configuration.
type Engine struct {
	DocsDir       string // directory containing the documents to index
	IndexDir      string // directory the index file is written to
	MetadataDir   string // directory created alongside the index for metadata
	SnippetLength int    // snippet size; NewEngine substitutes 220 when the configured value is not positive
}
|
||||
|
||||
// SearchOptions carries per-query settings.
// NOTE(review): presumably Limit caps the number of results and Threshold is a
// minimum score; confirm against the search implementation.
type SearchOptions struct {
	Limit     int
	Threshold float64
}
|
||||
|
||||
// Result is a single search hit as serialized to JSON. URL, Source and Meta
// are omitted from the output when empty.
type Result struct {
	ID      string         `json:"id"`
	DocID   string         `json:"doc_id"`
	Title   string         `json:"title"`
	URL     string         `json:"url,omitempty"`
	Type    string         `json:"type"`
	Source  string         `json:"source,omitempty"`
	Path    string         `json:"path"`
	Score   float64        `json:"score"`
	Snippet string         `json:"snippet"`
	Meta    map[string]any `json:"metadata,omitempty"`
}
|
||||
|
||||
// IndexStats summarizes an index build: how many documents and tokens it
// holds, when it was built, where the index and metadata files are, and the
// hash of the source file set.
type IndexStats struct {
	Documents      int       `json:"documents"`
	Tokens         int       `json:"tokens"`
	LastIndexedAt  time.Time `json:"last_indexed_at"`
	IndexPath      string    `json:"index_path"`
	MetadataPath   string    `json:"metadata_path"`
	SourceFileHash string    `json:"source_file_hash"`
}
|
||||
|
||||
// indexedDoc is one document as stored in the persisted index: identifying
// fields, the document content, its per-term frequencies, and its length
// (the sum of those frequencies).
type indexedDoc struct {
	ID       string         `json:"id"`
	DocID    string         `json:"doc_id"`
	Title    string         `json:"title"`
	URL      string         `json:"url,omitempty"`
	Type     string         `json:"type"`
	Source   string         `json:"source,omitempty"`
	Path     string         `json:"path"`
	Content  string         `json:"content"`
	TermFreq map[string]int `json:"term_freq"`
	Length   int            `json:"length"`
}
|
||||
|
||||
type persistedIndex struct {
|
||||
Version string `json:"version"`
|
||||
BuiltAt time.Time `json:"built_at"`
|
||||
Docs []indexedDoc `json:"docs"`
|
||||
}
|
||||
|
||||
// persistedMeta is the on-disk metadata record written for an index build:
// format version, build time, the docs directory that was indexed, the hash of
// the source file set, and the document count.
type persistedMeta struct {
	Version        string    `json:"version"`
	BuiltAt        time.Time `json:"built_at"`
	DocsDir        string    `json:"docs_dir"`
	SourceFileHash string    `json:"source_file_hash"`
	DocCount       int       `json:"doc_count"`
}
|
||||
|
||||
// rawDoc is the JSON shape of a stored document as read back for indexing.
type rawDoc struct {
	ID       string         `json:"id"`
	Source   string         `json:"source"`
	Type     string         `json:"type"`
	Title    string         `json:"title"`
	Content  string         `json:"content"`
	URL      string         `json:"url,omitempty"`
	Metadata map[string]any `json:"metadata,omitempty"`
}
|
||||
|
||||
// File names and format version used when persisting the lexical index.
const (
	// indexFileName is the index file written inside the index directory.
	indexFileName = "lexical_index.json"
	// metaFileName is the file name used for the index metadata record.
	metaFileName = "lexical_index_meta.json"
	// indexVersion is stamped into both the index and its metadata.
	indexVersion = "1"
)
|
||||
|
||||
func NewEngine(cfg *config.Config) *Engine {
|
||||
snippetLength := cfg.Indexing.SnippetLength
|
||||
if snippetLength <= 0 {
|
||||
snippetLength = 220
|
||||
}
|
||||
return &Engine{
|
||||
DocsDir: cfg.Storage.DocsDir,
|
||||
IndexDir: cfg.Storage.IndexDir,
|
||||
MetadataDir: cfg.Storage.MetadataDir,
|
||||
SnippetLength: snippetLength,
|
||||
}
|
||||
}
|
||||
|
||||
func (e *Engine) Rebuild(ctx context.Context) (*IndexStats, error) {
|
||||
if strings.TrimSpace(e.DocsDir) == "" {
|
||||
return nil, fmt.Errorf("docs directory is required")
|
||||
}
|
||||
if err := os.MkdirAll(e.IndexDir, 0o755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := os.MkdirAll(e.MetadataDir, 0o755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docFiles, sourceHash, err := e.listDocFiles()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docs := make([]indexedDoc, 0, len(docFiles))
|
||||
tokenCount := 0
|
||||
for _, file := range docFiles {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
default:
|
||||
}
|
||||
|
||||
rd, err := parseDocFile(file)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(rd.Content) == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
id := rd.ID
|
||||
if id == "" {
|
||||
id = hashString(file + ":" + rd.Title)
|
||||
}
|
||||
termFreq := frequency(tokenize(strings.Join([]string{rd.Title, rd.Content, rd.URL, rd.Type}, " ")))
|
||||
length := 0
|
||||
for _, v := range termFreq {
|
||||
length += v
|
||||
}
|
||||
tokenCount += length
|
||||
|
||||
docs = append(docs, indexedDoc{
|
||||
ID: hashString(file),
|
||||
DocID: id,
|
||||
Title: bestTitle(rd.Title, file),
|
||||
URL: strings.TrimSpace(rd.URL),
|
||||
Type: defaultString(strings.TrimSpace(rd.Type), "document"),
|
||||
Source: strings.TrimSpace(rd.Source),
|
||||
Path: file,
|
||||
Content: collapseWhitespace(rd.Content),
|
||||
TermFreq: termFreq,
|
||||
Length: length,
|
||||
})
|
||||
}
|
||||
|
||||
index := persistedIndex{Version: indexVersion, BuiltAt: time.Now(), Docs: docs}
|
||||
indexPath := filepath.Join(e.IndexDir, indexFileName)
|
||||
if err := writeJSON(indexPath, index); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
meta := persistedMeta{
|
||||
Version: indexVersion,
|
||||
BuiltAt: index.BuiltAt,
|
||||
DocsDir: e.DocsDir,
|
||||
SourceFileHash: sourceHash,
|
||||
DocCount: len(docs),
|
||||
}
|
||||
metaPath := filepath.Join(e.MetadataDir, metaFileName)
|
||||
if err := writeJSON(metaPath, meta); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &IndexStats{
|
||||
Documents: len(docs),
|
||||
Tokens: tokenCount,
|
||||
LastIndexedAt: index.BuiltAt,
|
||||
IndexPath: indexPath,
|
||||
MetadataPath: metaPath,
|
||||
SourceFileHash: sourceHash,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (e *Engine) EnsureIndexed(ctx context.Context) (*IndexStats, error) {
|
||||
metaPath := filepath.Join(e.MetadataDir, metaFileName)
|
||||
b, err := os.ReadFile(metaPath)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return e.Rebuild(ctx)
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var meta persistedMeta
|
||||
if err := json.Unmarshal(b, &meta); err != nil {
|
||||
return e.Rebuild(ctx)
|
||||
}
|
||||
|
||||
_, sourceHash, err := e.listDocFiles()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if sourceHash != meta.SourceFileHash {
|
||||
return e.Rebuild(ctx)
|
||||
}
|
||||
|
||||
return &IndexStats{
|
||||
Documents: meta.DocCount,
|
||||
LastIndexedAt: meta.BuiltAt,
|
||||
IndexPath: filepath.Join(e.IndexDir, indexFileName),
|
||||
MetadataPath: metaPath,
|
||||
SourceFileHash: meta.SourceFileHash,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (e *Engine) Search(ctx context.Context, query string, opts SearchOptions) ([]Result, *IndexStats, error) {
|
||||
query = strings.TrimSpace(query)
|
||||
if query == "" {
|
||||
return nil, nil, fmt.Errorf("query is required")
|
||||
}
|
||||
|
||||
stats, err := e.EnsureIndexed(ctx)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
indexPath := filepath.Join(e.IndexDir, indexFileName)
|
||||
b, err := os.ReadFile(indexPath)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
var idx persistedIndex
|
||||
if err := json.Unmarshal(b, &idx); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
limit := opts.Limit
|
||||
if limit <= 0 {
|
||||
limit = 5
|
||||
}
|
||||
threshold := opts.Threshold
|
||||
if threshold < 0 {
|
||||
threshold = 0
|
||||
}
|
||||
|
||||
queryTokens := tokenize(query)
|
||||
if len(queryTokens) == 0 {
|
||||
return nil, stats, nil
|
||||
}
|
||||
qFreq := frequency(queryTokens)
|
||||
|
||||
type scored struct {
|
||||
doc indexedDoc
|
||||
score float64
|
||||
}
|
||||
matches := make([]scored, 0)
|
||||
|
||||
for _, doc := range idx.Docs {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, nil, ctx.Err()
|
||||
default:
|
||||
}
|
||||
score := lexicalScore(qFreq, queryTokens, doc)
|
||||
if score <= 0 {
|
||||
continue
|
||||
}
|
||||
if threshold > 0 && score < threshold {
|
||||
continue
|
||||
}
|
||||
matches = append(matches, scored{doc: doc, score: score})
|
||||
}
|
||||
|
||||
sort.Slice(matches, func(i, j int) bool {
|
||||
if matches[i].score == matches[j].score {
|
||||
return matches[i].doc.Title < matches[j].doc.Title
|
||||
}
|
||||
return matches[i].score > matches[j].score
|
||||
})
|
||||
|
||||
if limit > len(matches) {
|
||||
limit = len(matches)
|
||||
}
|
||||
|
||||
results := make([]Result, 0, limit)
|
||||
for i := 0; i < limit; i++ {
|
||||
d := matches[i].doc
|
||||
results = append(results, Result{
|
||||
ID: d.ID,
|
||||
DocID: d.DocID,
|
||||
Title: d.Title,
|
||||
URL: d.URL,
|
||||
Type: d.Type,
|
||||
Source: d.Source,
|
||||
Path: d.Path,
|
||||
Score: matches[i].score,
|
||||
Snippet: bestSnippet(d.Content, queryTokens, e.SnippetLength),
|
||||
Meta: map[string]any{
|
||||
"length": d.Length,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
return results, stats, nil
|
||||
}
|
||||
|
||||
func (e *Engine) listDocFiles() ([]string, string, error) {
|
||||
files := make([]string, 0)
|
||||
h := sha256.New()
|
||||
|
||||
err := filepath.WalkDir(e.DocsDir, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if d.IsDir() {
|
||||
return nil
|
||||
}
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
switch ext {
|
||||
case ".json", ".md", ".txt":
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
|
||||
info, statErr := d.Info()
|
||||
if statErr != nil {
|
||||
return statErr
|
||||
}
|
||||
files = append(files, path)
|
||||
fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return []string{}, hashString("empty"), nil
|
||||
}
|
||||
return nil, "", err
|
||||
}
|
||||
|
||||
sort.Strings(files)
|
||||
return files, hex.EncodeToString(h.Sum(nil)), nil
|
||||
}
|
||||
|
||||
func parseDocFile(path string) (*rawDoc, error) {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
switch ext {
|
||||
case ".json":
|
||||
var d rawDoc
|
||||
if err := json.Unmarshal(b, &d); err == nil && (d.Title != "" || d.Content != "") {
|
||||
return &d, nil
|
||||
}
|
||||
// Not a structured doc JSON, index as raw text fallback.
|
||||
return &rawDoc{Title: filepath.Base(path), Content: string(b), Type: "json", Source: "local"}, nil
|
||||
case ".md":
|
||||
content := string(b)
|
||||
title := markdownTitle(content)
|
||||
return &rawDoc{Title: title, Content: content, Type: "markdown", Source: "local"}, nil
|
||||
default:
|
||||
return &rawDoc{Title: filepath.Base(path), Content: string(b), Type: "text", Source: "local"}, nil
|
||||
}
|
||||
}
|
||||
|
||||
// markdownTitle returns the text of the first line in content that starts
// with '#' and still has text once the leading '#' characters and surrounding
// whitespace are removed. It returns "" when no such line exists.
func markdownTitle(content string) string {
	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, "#") {
			continue
		}
		if heading := strings.TrimSpace(strings.TrimLeft(line, "#")); heading != "" {
			return heading
		}
	}
	return ""
}
|
||||
|
||||
// writeJSON encodes v as indented JSON and writes it to path with mode 0644,
// replacing any existing file.
func writeJSON(path string, v any) error {
	encoded, err := json.MarshalIndent(v, "", " ")
	if err == nil {
		err = os.WriteFile(path, encoded, 0o644)
	}
	return err
}
|
||||
|
||||
// tokenSeparators turns the punctuation and control characters that tokenize
// treats as word boundaries into spaces. It is built once at package scope
// because tokenize runs for every document during a rebuild; a
// *strings.Replacer is safe for concurrent use.
var tokenSeparators = strings.NewReplacer(
	",", " ", ".", " ", "(", " ", ")", " ", "[", " ", "]", " ", "{", " ", "}", " ",
	":", " ", ";", " ", "!", " ", "?", " ", "\n", " ", "\r", " ", "\t", " ", "\"", " ", "'", " ", "`", " ",
)

// tokenize splits input into lower-cased search tokens. Separator characters
// are replaced by spaces, the text is split on whitespace, and tokens shorter
// than two bytes are dropped. The result is never nil.
func tokenize(input string) []string {
	fields := strings.Fields(strings.ToLower(tokenSeparators.Replace(input)))
	tokens := make([]string, 0, len(fields))
	for _, field := range fields {
		if len(field) < 2 {
			continue
		}
		tokens = append(tokens, field)
	}
	return tokens
}
|
||||
|
||||
// frequency counts how many times each token occurs in tokens. The returned
// map is never nil.
func frequency(tokens []string) map[string]int {
	counts := make(map[string]int, len(tokens))
	for i := range tokens {
		counts[tokens[i]] += 1
	}
	return counts
}
|
||||
|
||||
func lexicalScore(qFreq map[string]int, queryTokens []string, doc indexedDoc) float64 {
|
||||
if len(doc.TermFreq) == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
titleLower := strings.ToLower(doc.Title)
|
||||
urlLower := strings.ToLower(doc.URL)
|
||||
contentLower := strings.ToLower(doc.Content)
|
||||
|
||||
score := 0.0
|
||||
for token, qCount := range qFreq {
|
||||
dCount := doc.TermFreq[token]
|
||||
if dCount == 0 {
|
||||
continue
|
||||
}
|
||||
part := float64(dCount*qCount) / float64(max(1, doc.Length))
|
||||
score += part * 8.0
|
||||
if strings.Contains(titleLower, token) {
|
||||
score += 2.5
|
||||
}
|
||||
if strings.Contains(urlLower, token) {
|
||||
score += 1.2
|
||||
}
|
||||
}
|
||||
|
||||
phrase := strings.Join(queryTokens, " ")
|
||||
if len(queryTokens) > 1 && strings.Contains(contentLower, phrase) {
|
||||
score += 1.5
|
||||
}
|
||||
return score
|
||||
}
|
||||
|
||||
// bestSnippet returns an excerpt of content of at most maxLen bytes (220 when
// maxLen is non-positive), followed by "..." when text remains after it.
//
// Whitespace in content is collapsed first. Content that already fits is
// returned whole. Otherwise the excerpt starts a quarter of maxLen before the
// first occurrence of the first query token found in the lower-cased text, or
// at the beginning when no token occurs.
//
// The window is cut on byte offsets, so both edges are moved back to the
// nearest UTF-8 rune boundary; the excerpt is therefore never longer than
// maxLen bytes and never splits a multi-byte character.
func bestSnippet(content string, queryTokens []string, maxLen int) string {
	if maxLen <= 0 {
		maxLen = 220
	}
	flat := strings.Join(strings.Fields(content), " ")
	if flat == "" {
		return ""
	}
	if len(flat) <= maxLen {
		return flat
	}

	lower := strings.ToLower(flat)
	start := 0
	for _, tok := range queryTokens {
		if idx := strings.Index(lower, tok); idx >= 0 {
			start = idx - (maxLen / 4)
			if start < 0 {
				start = 0
			}
			break
		}
	}

	// Lower-casing can change byte lengths for some characters, so an offset
	// found in lower may lie beyond the end of flat.
	if start > len(flat) {
		start = len(flat)
	}
	// 0b10xxxxxx marks a UTF-8 continuation byte: step back to the lead byte.
	for start > 0 && start < len(flat) && flat[start]&0xC0 == 0x80 {
		start--
	}

	end := start + maxLen
	if end > len(flat) {
		end = len(flat)
	}
	for end > start && end < len(flat) && flat[end]&0xC0 == 0x80 {
		end--
	}

	snippet := strings.TrimSpace(flat[start:end])
	if end < len(flat) {
		snippet += "..."
	}
	return snippet
}
|
||||
|
||||
// collapseWhitespace trims s and replaces every run of whitespace inside it
// with a single space.
func collapseWhitespace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}
|
||||
|
||||
// bestTitle picks a display title for a document. A non-blank title wins
// (trimmed). Otherwise the title is derived from the file name of path with
// its extension removed and underscores turned into spaces. If nothing usable
// remains, "Documentation" is returned.
func bestTitle(title, path string) string {
	if trimmed := strings.TrimSpace(title); trimmed != "" {
		return trimmed
	}

	name := filepath.Base(path)
	name = strings.TrimSuffix(name, filepath.Ext(name))
	name = strings.TrimSpace(strings.ReplaceAll(name, "_", " "))
	if name != "" {
		return name
	}
	return "Documentation"
}
|
||||
|
||||
// defaultString returns v unchanged unless it is empty or only whitespace,
// in which case it returns fallback.
func defaultString(v, fallback string) string {
	if strings.TrimSpace(v) != "" {
		return v
	}
	return fallback
}
|
||||
|
||||
// hashString returns the first 12 bytes of the SHA-256 digest of s as a
// 24-character lower-case hex string.
func hashString(s string) string {
	digest := sha256.Sum256([]byte(s))
	prefix := digest[:12]
	return hex.EncodeToString(prefix)
}
|
||||
|
||||
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
|
||||
@@ -0,0 +1,56 @@
|
||||
package search
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/yourorg/devour/internal/config"
|
||||
)
|
||||
|
||||
func TestEngineRebuildAndSearch(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
docsDir := filepath.Join(tmp, "docs")
|
||||
indexDir := filepath.Join(tmp, "index")
|
||||
metaDir := filepath.Join(tmp, "metadata")
|
||||
if err := os.MkdirAll(docsDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
doc := map[string]any{
|
||||
"id": "1",
|
||||
"title": "HTTP Client",
|
||||
"content": "Use net/http client with timeout",
|
||||
"type": "go-doc",
|
||||
"source": "go",
|
||||
"url": "https://pkg.go.dev/net/http",
|
||||
}
|
||||
b, _ := json.Marshal(doc)
|
||||
if err := os.WriteFile(filepath.Join(docsDir, "doc.json"), b, 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
cfg := config.Default()
|
||||
cfg.Storage.DocsDir = docsDir
|
||||
cfg.Storage.IndexDir = indexDir
|
||||
cfg.Storage.MetadataDir = metaDir
|
||||
|
||||
e := NewEngine(cfg)
|
||||
stats, err := e.Rebuild(context.Background())
|
||||
if err != nil {
|
||||
t.Fatalf("rebuild failed: %v", err)
|
||||
}
|
||||
if stats.Documents == 0 {
|
||||
t.Fatal("expected documents in index")
|
||||
}
|
||||
|
||||
results, _, err := e.Search(context.Background(), "http timeout", SearchOptions{Limit: 5})
|
||||
if err != nil {
|
||||
t.Fatalf("search failed: %v", err)
|
||||
}
|
||||
if len(results) == 0 {
|
||||
t.Fatal("expected at least one search result")
|
||||
}
|
||||
}
|
||||
+167
-5
@@ -2,7 +2,16 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Config holds server configuration.
|
||||
@@ -11,8 +20,13 @@ type Config struct {
|
||||
Transport string `yaml:"transport"`
|
||||
Host string `yaml:"host"`
|
||||
Port int `yaml:"port"`
|
||||
|
||||
Handler MethodHandler `yaml:"-"`
|
||||
}
|
||||
|
||||
// MethodHandler executes a single server method. It receives the request
// context, the method name and the still-encoded params, and returns either
// the result payload or an error.
type MethodHandler func(ctx context.Context, method string, params json.RawMessage) (any, error)
|
||||
|
||||
// Server defines the MCP server interface.
|
||||
type Server interface {
|
||||
// Start begins listening for connections.
|
||||
@@ -47,9 +61,28 @@ type Result struct {
|
||||
Metadata map[string]any `json:"metadata,omitempty"`
|
||||
}
|
||||
|
||||
// rpcRequest is the JSON shape of an incoming RPC call.
type rpcRequest struct {
	// JSONRPC is the protocol version string sent by the client.
	JSONRPC string `json:"jsonrpc"`
	// ID is echoed back unchanged in the response.
	ID any `json:"id"`
	// Method names the operation to run; it must not be empty.
	Method string `json:"method"`
	// Params is passed to the handler undecoded.
	Params json.RawMessage `json:"params,omitempty"`
}
|
||||
|
||||
type rpcResponse struct {
|
||||
JSONRPC string `json:"jsonrpc"`
|
||||
ID any `json:"id"`
|
||||
Result any `json:"result,omitempty"`
|
||||
Error *rpcError `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
// rpcError is the error object carried by a failed rpcResponse.
type rpcError struct {
	// Code is the numeric error code.
	Code int `json:"code"`
	// Message is the human-readable description.
	Message string `json:"message"`
}
|
||||
|
||||
// NewServer creates a new MCP server.
|
||||
func NewServer(config *Config) Server {
|
||||
if config.Mode == "remote" {
|
||||
if strings.EqualFold(config.Mode, "remote") {
|
||||
return NewHTTPServer(config)
|
||||
}
|
||||
return NewStdioServer(config)
|
||||
@@ -68,27 +101,156 @@ func NewStdioServer(config *Config) *StdioServer {
|
||||
// HTTPServer implements Server for HTTP transport.
|
||||
type HTTPServer struct {
|
||||
config *Config
|
||||
http *http.Server
|
||||
mu sync.Mutex
|
||||
}
|
||||
|
||||
func (s *HTTPServer) Start(ctx context.Context) error {
|
||||
// TODO: Implement HTTP server with MCP endpoints
|
||||
return nil
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
if s.config == nil {
|
||||
return fmt.Errorf("server config is required")
|
||||
}
|
||||
if s.config.Handler == nil {
|
||||
return fmt.Errorf("server handler is required")
|
||||
}
|
||||
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = io.WriteString(w, `{"ok":true}`)
|
||||
})
|
||||
mux.HandleFunc("/rpc", func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
w.WriteHeader(http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
defer r.Body.Close()
|
||||
var req rpcRequest
|
||||
if err := json.NewDecoder(io.LimitReader(r.Body, 2<<20)).Decode(&req); err != nil {
|
||||
writeRPC(w, rpcResponse{JSONRPC: "2.0", Error: &rpcError{Code: -32700, Message: "parse error"}})
|
||||
return
|
||||
}
|
||||
resp := s.handleRPC(r.Context(), req)
|
||||
writeRPC(w, resp)
|
||||
})
|
||||
|
||||
host := s.config.Host
|
||||
if host == "" {
|
||||
host = "localhost"
|
||||
}
|
||||
port := s.config.Port
|
||||
if port == 0 {
|
||||
port = 8080
|
||||
}
|
||||
s.http = &http.Server{Addr: fmt.Sprintf("%s:%d", host, port), Handler: mux}
|
||||
|
||||
errCh := make(chan error, 1)
|
||||
go func() {
|
||||
errCh <- s.http.ListenAndServe()
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
_ = s.http.Shutdown(shutdownCtx)
|
||||
return ctx.Err()
|
||||
case err := <-errCh:
|
||||
if err != nil && err != http.ErrServerClosed {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func (s *HTTPServer) Stop(ctx context.Context) error {
|
||||
return nil
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.http == nil {
|
||||
return nil
|
||||
}
|
||||
return s.http.Shutdown(ctx)
|
||||
}
|
||||
|
||||
func (s *HTTPServer) handleRPC(ctx context.Context, req rpcRequest) rpcResponse {
|
||||
return handleRPC(ctx, s.config.Handler, req)
|
||||
}
|
||||
|
||||
// StdioServer implements Server for stdio transport.
|
||||
type StdioServer struct {
|
||||
config *Config
|
||||
mu sync.Mutex
|
||||
stop bool
|
||||
}
|
||||
|
||||
func (s *StdioServer) Start(ctx context.Context) error {
|
||||
// TODO: Implement stdio JSON-RPC server
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
if s.config == nil {
|
||||
return fmt.Errorf("server config is required")
|
||||
}
|
||||
if s.config.Handler == nil {
|
||||
return fmt.Errorf("server handler is required")
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(os.Stdin)
|
||||
out := json.NewEncoder(os.Stdout)
|
||||
|
||||
for scanner.Scan() {
|
||||
if ctx.Err() != nil || s.stop {
|
||||
break
|
||||
}
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
var req rpcRequest
|
||||
if err := json.Unmarshal([]byte(line), &req); err != nil {
|
||||
_ = out.Encode(rpcResponse{JSONRPC: "2.0", Error: &rpcError{Code: -32700, Message: "parse error"}})
|
||||
continue
|
||||
}
|
||||
resp := handleRPC(ctx, s.config.Handler, req)
|
||||
if err := out.Encode(resp); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *StdioServer) Stop(ctx context.Context) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.stop = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func handleRPC(ctx context.Context, handler MethodHandler, req rpcRequest) rpcResponse {
|
||||
if req.JSONRPC == "" {
|
||||
req.JSONRPC = "2.0"
|
||||
}
|
||||
if req.Method == "" {
|
||||
return rpcResponse{JSONRPC: "2.0", ID: req.ID, Error: &rpcError{Code: -32600, Message: "invalid request"}}
|
||||
}
|
||||
|
||||
result, err := handler(ctx, req.Method, req.Params)
|
||||
if err != nil {
|
||||
return rpcResponse{JSONRPC: "2.0", ID: req.ID, Error: &rpcError{Code: -32000, Message: err.Error()}}
|
||||
}
|
||||
return rpcResponse{JSONRPC: "2.0", ID: req.ID, Result: result}
|
||||
}
|
||||
|
||||
func writeRPC(w http.ResponseWriter, payload rpcResponse) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
if payload.Error != nil {
|
||||
w.WriteHeader(http.StatusBadRequest)
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(payload)
|
||||
}
|
||||
|
||||
@@ -0,0 +1,149 @@
|
||||
package storage
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/yourorg/devour/internal/markdown"
|
||||
"github.com/yourorg/devour/internal/scraper"
|
||||
)
|
||||
|
||||
// SaveOptions controls how SaveDocuments writes documents to disk.
type SaveOptions struct {
	// Format is "json" or "markdown" (case-insensitive); empty means "json".
	Format string
	// OutputDir is the directory files are written to; it is created if needed.
	OutputDir string
	// AllowEmpty makes saving zero documents a success instead of an error.
	AllowEmpty bool
	// PrintWriter, when set, is called once for every file written.
	PrintWriter func(string, ...any)
}
|
||||
|
||||
// SaveResult reports what SaveDocuments wrote.
type SaveResult struct {
	// Count is the number of files written.
	Count int
	// Files lists the paths of the written files in write order.
	Files []string
}
|
||||
|
||||
// slugUnsafe matches every run of characters that is not allowed in a file
// name slug, i.e. anything other than lower-case letters, digits, '.', '_'
// and '-'.
var slugUnsafe = regexp.MustCompile(`[^a-z0-9._-]+`)
|
||||
|
||||
func SaveDocuments(docs []*scraper.Document, opts SaveOptions) (*SaveResult, error) {
|
||||
if len(docs) == 0 {
|
||||
if opts.AllowEmpty {
|
||||
return &SaveResult{}, nil
|
||||
}
|
||||
return nil, fmt.Errorf("no documents scraped")
|
||||
}
|
||||
|
||||
format := strings.ToLower(strings.TrimSpace(opts.Format))
|
||||
if format == "" {
|
||||
format = "json"
|
||||
}
|
||||
if format != "json" && format != "markdown" {
|
||||
return nil, fmt.Errorf("unsupported format: %s", opts.Format)
|
||||
}
|
||||
|
||||
if strings.TrimSpace(opts.OutputDir) == "" {
|
||||
return nil, fmt.Errorf("output directory is required")
|
||||
}
|
||||
if err := os.MkdirAll(opts.OutputDir, 0o755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
used := map[string]int{}
|
||||
files := make([]string, 0, len(docs))
|
||||
formatter := markdown.NewFormatter()
|
||||
|
||||
for i, doc := range docs {
|
||||
if doc == nil {
|
||||
continue
|
||||
}
|
||||
base := slugify(defaultTitle(doc.Title, i))
|
||||
ext := ".json"
|
||||
if format == "markdown" {
|
||||
ext = ".md"
|
||||
}
|
||||
name := uniqueName(base, ext, used, doc.ID)
|
||||
path := filepath.Join(opts.OutputDir, name)
|
||||
|
||||
var b []byte
|
||||
var err error
|
||||
if format == "markdown" {
|
||||
md := &markdown.Document{
|
||||
ID: doc.ID,
|
||||
Source: doc.Source,
|
||||
Type: doc.Type,
|
||||
Title: doc.Title,
|
||||
Content: doc.Content,
|
||||
URL: doc.URL,
|
||||
Metadata: doc.Metadata,
|
||||
Hash: doc.Hash,
|
||||
Timestamp: doc.Timestamp,
|
||||
}
|
||||
b = []byte(formatter.FormatWithTOC(md))
|
||||
} else {
|
||||
b, err = json.MarshalIndent(doc, "", " ")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if err := os.WriteFile(path, b, 0o644); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
files = append(files, path)
|
||||
if opts.PrintWriter != nil {
|
||||
opts.PrintWriter(" 📄 %s (%s)\n", filepath.Base(path), doc.Type)
|
||||
}
|
||||
}
|
||||
|
||||
if len(files) == 0 && !opts.AllowEmpty {
|
||||
return nil, fmt.Errorf("no documents scraped")
|
||||
}
|
||||
|
||||
return &SaveResult{Count: len(files), Files: files}, nil
|
||||
}
|
||||
|
||||
// defaultTitle returns the trimmed title, or "document_<idx>" when the title
// is empty or only whitespace.
func defaultTitle(title string, idx int) string {
	if trimmed := strings.TrimSpace(title); trimmed != "" {
		return trimmed
	}
	return fmt.Sprintf("document_%d", idx)
}
|
||||
|
||||
func slugify(name string) string {
|
||||
name = strings.ToLower(strings.TrimSpace(name))
|
||||
name = strings.ReplaceAll(name, " ", "-")
|
||||
name = strings.ReplaceAll(name, "/", "-")
|
||||
name = strings.ReplaceAll(name, "\\", "-")
|
||||
name = strings.ReplaceAll(name, ":", "-")
|
||||
name = strings.ReplaceAll(name, "?", "")
|
||||
name = strings.ReplaceAll(name, "&", "and")
|
||||
name = slugUnsafe.ReplaceAllString(name, "-")
|
||||
name = strings.Trim(name, "-.")
|
||||
if name == "" {
|
||||
name = "document"
|
||||
}
|
||||
if len(name) > 80 {
|
||||
name = strings.Trim(name[:80], "-.")
|
||||
}
|
||||
if name == "" {
|
||||
name = "document"
|
||||
}
|
||||
return name
|
||||
}
|
||||
|
||||
// uniqueName returns a file name built from base and ext that has not been
// handed out before according to used, and records it there.
//
// The first request for a base gets "<base><ext>". Later requests prefer
// "<base>-<first 8 bytes of id><ext>" when id (trimmed) is at least 8 bytes
// long and that name is still free, and otherwise fall back to
// "<base>-<n><ext>" with the smallest counter n that yields a free name.
// Every returned name is registered in used, so it can never be returned
// again, including when a later base happens to spell a suffixed name.
//
// NOTE(review): id is inserted verbatim; this assumes document IDs never
// contain path separators — confirm against the scraper.
func uniqueName(base, ext string, used map[string]int, id string) string {
	key := base + ext
	if used[key] == 0 {
		used[key] = 1
		return key
	}
	used[key]++

	if id = strings.TrimSpace(id); len(id) >= 8 {
		candidate := fmt.Sprintf("%s-%s%s", base, id[:8], ext)
		if used[candidate] == 0 {
			used[candidate] = 1
			return candidate
		}
	}

	// used[key] doubles as the running counter for numeric suffixes.
	for {
		candidate := fmt.Sprintf("%s-%d%s", base, used[key], ext)
		if used[candidate] == 0 {
			used[candidate] = 1
			return candidate
		}
		used[key]++
	}
}
|
||||
Reference in New Issue
Block a user