mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
369 lines
9.1 KiB
Go
369 lines
9.1 KiB
Go
package config
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"gopkg.in/yaml.v3"
|
|
)
|
|
|
|
// Config is the typed application configuration loaded from devour.yaml.
|
|
type Config struct {
|
|
Version int `yaml:"version"`
|
|
Storage StorageConfig `yaml:"storage"`
|
|
Embeddings EmbeddingsConfig `yaml:"embeddings"`
|
|
VectorDB VectorDBConfig `yaml:"vector_db"`
|
|
Scraper ScraperConfig `yaml:"scraper"`
|
|
Scheduler SchedulerConfig `yaml:"scheduler"`
|
|
Server ServerConfig `yaml:"server"`
|
|
Indexing IndexingConfig `yaml:"indexing"`
|
|
Verification VerificationConfig `yaml:"verification"`
|
|
Sources []SourceConfig `yaml:"sources"`
|
|
|
|
ConfigPath string `yaml:"-"`
|
|
}
|
|
|
|
// StorageConfig is the storage section of Config. EnsureStorageDirs creates
// each of these directories when the value is not blank.
type StorageConfig struct {
	// DocsDir defaults to "./devour_data/docs".
	DocsDir string `yaml:"docs_dir"`
	// IndexDir defaults to "./devour_data/index".
	IndexDir string `yaml:"index_dir"`
	// MetadataDir defaults to "./devour_data/metadata".
	MetadataDir string `yaml:"metadata_dir"`
	// CacheDir defaults to "./devour_data/cache". ApplyDefaults also copies
	// it into Scraper.CacheDir when that field is empty.
	CacheDir string `yaml:"cache_dir"`
}
|
|
|
|
// EmbeddingsConfig is the embeddings section of Config.
type EmbeddingsConfig struct {
	// Provider defaults to "openai".
	Provider string `yaml:"provider"`
	// Model defaults to "text-embedding-3-small".
	Model string `yaml:"model"`
	// Dimensions defaults to 1536; ApplyDefaults replaces values <= 0.
	Dimensions int `yaml:"dimensions"`
	// APIKey is set by Default to the literal placeholder
	// "${OPENAI_API_KEY}"; ApplyDefaults does not fill it in.
	APIKey string `yaml:"api_key"`
	// BatchSize defaults to 100; ApplyDefaults replaces values <= 0.
	BatchSize int `yaml:"batch_size"`
	// BaseURL has no default value.
	BaseURL string `yaml:"base_url"`
}
|
|
|
|
// VectorDBConfig is the vector_db section of Config.
type VectorDBConfig struct {
	// Type defaults to "memory".
	Type string `yaml:"type"`
	// Persist is true in Default; ApplyDefaults never changes it.
	Persist bool `yaml:"persist"`
	// SimilarityMetric defaults to "cosine".
	SimilarityMetric string `yaml:"similarity_metric"`
	// PersistDir has no default value.
	PersistDir string `yaml:"persist_dir"`
}
|
|
|
|
// ScraperConfig is the scraper section of Config.
type ScraperConfig struct {
	// UserAgent defaults to "Devour/1.0".
	UserAgent string `yaml:"user_agent"`
	// Timeout defaults to 30s; ApplyDefaults replaces values <= 0.
	Timeout time.Duration `yaml:"timeout"`
	// RetryCount defaults to 3; ApplyDefaults replaces values <= 0.
	RetryCount int `yaml:"retry_count"`
	// RetryDelay defaults to 1s; ApplyDefaults replaces values <= 0.
	RetryDelay time.Duration `yaml:"retry_delay"`
	// Concurrency defaults to 10; ApplyDefaults replaces values <= 0.
	Concurrency int `yaml:"concurrency"`
	// RateLimit is 500ms in Default. ApplyDefaults keeps zero as-is and
	// only raises negative values to zero.
	RateLimit time.Duration `yaml:"rate_limit"`
	// MaxDepth defaults to 3; ApplyDefaults replaces values <= 0.
	MaxDepth int `yaml:"max_depth"`
	// CacheDir is "./devour_data/cache" in Default; when empty,
	// ApplyDefaults copies Storage.CacheDir into it.
	CacheDir string `yaml:"cache_dir"`
}
|
|
|
|
// SchedulerConfig is the scheduler section of Config.
type SchedulerConfig struct {
	// Enabled is true in Default; ApplyDefaults never changes it.
	Enabled bool `yaml:"enabled"`
	// Interval defaults to 72h; ApplyDefaults replaces values <= 0.
	Interval time.Duration `yaml:"interval"`
	// CheckMethod defaults to "hash".
	CheckMethod string `yaml:"check_method"`
	// OnStartup is false in Default.
	OnStartup bool `yaml:"on_startup"`
}
|
|
|
|
// ServerConfig is the server section of Config.
type ServerConfig struct {
	// Mode defaults to "local"; RenderInitYAML writes "remote" when asked.
	Mode string `yaml:"mode"`
	// Transport defaults to "stdio".
	Transport string `yaml:"transport"`
	// Host defaults to "localhost".
	Host string `yaml:"host"`
	// Port defaults to 8080; ApplyDefaults replaces values <= 0.
	Port int `yaml:"port"`
}
|
|
|
|
// IndexingConfig is the indexing section of Config.
type IndexingConfig struct {
	// Enabled is true in Default; ApplyDefaults never changes it.
	Enabled bool `yaml:"enabled"`
	// AutoReindex is true in Default; ApplyDefaults never changes it.
	AutoReindex bool `yaml:"auto_reindex"`
	// SnippetLength defaults to 220; ApplyDefaults replaces values <= 0.
	SnippetLength int `yaml:"snippet_length"`
	// MaxDocs defaults to 10000; ApplyDefaults replaces values <= 0.
	MaxDocs int `yaml:"max_docs"`
}
|
|
|
|
// VerificationConfig is the verification section of Config.
type VerificationConfig struct {
	// Enabled is true in Default; ApplyDefaults never changes it.
	Enabled bool `yaml:"enabled"`
	// Timeout defaults to 90s; ApplyDefaults replaces values <= 0.
	Timeout time.Duration `yaml:"timeout"`
}
|
|
|
|
// SourceConfig is one entry of the sources list in Config.
//
// Only Name and Type are always written when marshaling; every other field
// carries omitempty and is left out of the YAML when it is empty.
type SourceConfig struct {
	// Name is read from the name key.
	Name string `yaml:"name"`
	// Type is read from the type key; the init template shows "url" and
	// "localsearch" as example values.
	Type string `yaml:"type"`

	// Optional keys, listed in marshaling order.
	URL         string   `yaml:"url,omitempty"`
	Query       string   `yaml:"query,omitempty"`
	ResultLimit int      `yaml:"result_limit,omitempty"`
	Domains     []string `yaml:"domains,omitempty"`
	Repo        string   `yaml:"repo,omitempty"`
	Branch      string   `yaml:"branch,omitempty"`
	Path        string   `yaml:"path,omitempty"`
	Include     []string `yaml:"include,omitempty"`
	Exclude     []string `yaml:"exclude,omitempty"`
	Schedule    string   `yaml:"schedule,omitempty"`
}
|
|
|
|
// Default returns a default configuration that matches devour init behavior.
|
|
func Default() *Config {
|
|
return &Config{
|
|
Version: 1,
|
|
Storage: StorageConfig{
|
|
DocsDir: "./devour_data/docs",
|
|
IndexDir: "./devour_data/index",
|
|
MetadataDir: "./devour_data/metadata",
|
|
CacheDir: "./devour_data/cache",
|
|
},
|
|
Embeddings: EmbeddingsConfig{
|
|
Provider: "openai",
|
|
Model: "text-embedding-3-small",
|
|
Dimensions: 1536,
|
|
BatchSize: 100,
|
|
APIKey: "${OPENAI_API_KEY}",
|
|
},
|
|
VectorDB: VectorDBConfig{
|
|
Type: "memory",
|
|
Persist: true,
|
|
SimilarityMetric: "cosine",
|
|
},
|
|
Scraper: ScraperConfig{
|
|
UserAgent: "Devour/1.0",
|
|
Timeout: 30 * time.Second,
|
|
RetryCount: 3,
|
|
RetryDelay: 1 * time.Second,
|
|
Concurrency: 10,
|
|
RateLimit: 500 * time.Millisecond,
|
|
MaxDepth: 3,
|
|
CacheDir: "./devour_data/cache",
|
|
},
|
|
Scheduler: SchedulerConfig{
|
|
Enabled: true,
|
|
Interval: 72 * time.Hour,
|
|
CheckMethod: "hash",
|
|
OnStartup: false,
|
|
},
|
|
Server: ServerConfig{
|
|
Mode: "local",
|
|
Transport: "stdio",
|
|
Host: "localhost",
|
|
Port: 8080,
|
|
},
|
|
Indexing: IndexingConfig{
|
|
Enabled: true,
|
|
AutoReindex: true,
|
|
SnippetLength: 220,
|
|
MaxDocs: 10000,
|
|
},
|
|
Verification: VerificationConfig{
|
|
Enabled: true,
|
|
Timeout: 90 * time.Second,
|
|
},
|
|
Sources: []SourceConfig{},
|
|
}
|
|
}
|
|
|
|
// initTemplateSourcesComment is the tail that RenderInitYAML appends to the
// rendered config: an empty sources list followed by commented-out example
// source entries.
const initTemplateSourcesComment = `
|
|
# Sources (add your own)
|
|
sources: []
|
|
# - name: example-docs
|
|
# type: url
|
|
# url: https://docs.example.com
|
|
# include: ["**/*.md", "**/*.html"]
|
|
# - name: local-searxng
|
|
# type: localsearch
|
|
# url: http://127.0.0.1:8080/search
|
|
# query: golang http client
|
|
# result_limit: 8
|
|
# domains: ["pkg.go.dev", "go.dev"]
|
|
`
|
|
|
|
// RenderInitYAML returns the default init config file content from canonical defaults.
|
|
func RenderInitYAML(remote bool) (string, error) {
|
|
cfg := Default()
|
|
if remote {
|
|
cfg.Server.Mode = "remote"
|
|
}
|
|
// Keep the init template comments for discoverability while sourcing
|
|
// the actual values from canonical defaults.
|
|
cfg.Sources = nil
|
|
|
|
body, err := yaml.Marshal(cfg)
|
|
if err != nil {
|
|
return "", fmt.Errorf("marshal default config: %w", err)
|
|
}
|
|
|
|
trimmed := strings.TrimSuffix(string(body), "\n")
|
|
if strings.HasSuffix(trimmed, "sources: []") {
|
|
trimmed = strings.TrimSuffix(trimmed, "sources: []")
|
|
trimmed = strings.TrimSpace(trimmed)
|
|
}
|
|
|
|
return "# Devour Configuration\n" + trimmed + initTemplateSourcesComment, nil
|
|
}
|
|
|
|
// Load loads configuration from an explicit path or the default search paths.
|
|
func Load(explicitPath string) (*Config, error) {
|
|
cfg := Default()
|
|
|
|
path, err := findConfigPath(explicitPath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if path == "" {
|
|
cfg.ApplyDefaults()
|
|
return cfg, nil
|
|
}
|
|
|
|
b, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read config: %w", err)
|
|
}
|
|
|
|
if err := yaml.Unmarshal(b, cfg); err != nil {
|
|
return nil, fmt.Errorf("parse config: %w", err)
|
|
}
|
|
|
|
cfg.ConfigPath = path
|
|
cfg.ApplyDefaults()
|
|
return cfg, nil
|
|
}
|
|
|
|
// ApplyDefaults ensures additive backward-compatible defaults after unmarshaling.
|
|
func (c *Config) ApplyDefaults() {
|
|
if c.Version == 0 {
|
|
c.Version = 1
|
|
}
|
|
|
|
if c.Storage.DocsDir == "" {
|
|
c.Storage.DocsDir = "./devour_data/docs"
|
|
}
|
|
if c.Storage.IndexDir == "" {
|
|
c.Storage.IndexDir = "./devour_data/index"
|
|
}
|
|
if c.Storage.MetadataDir == "" {
|
|
c.Storage.MetadataDir = "./devour_data/metadata"
|
|
}
|
|
if c.Storage.CacheDir == "" {
|
|
c.Storage.CacheDir = "./devour_data/cache"
|
|
}
|
|
|
|
if c.Embeddings.Provider == "" {
|
|
c.Embeddings.Provider = "openai"
|
|
}
|
|
if c.Embeddings.Model == "" {
|
|
c.Embeddings.Model = "text-embedding-3-small"
|
|
}
|
|
if c.Embeddings.Dimensions <= 0 {
|
|
c.Embeddings.Dimensions = 1536
|
|
}
|
|
if c.Embeddings.BatchSize <= 0 {
|
|
c.Embeddings.BatchSize = 100
|
|
}
|
|
|
|
if c.VectorDB.Type == "" {
|
|
c.VectorDB.Type = "memory"
|
|
}
|
|
if c.VectorDB.SimilarityMetric == "" {
|
|
c.VectorDB.SimilarityMetric = "cosine"
|
|
}
|
|
|
|
if c.Scraper.UserAgent == "" {
|
|
c.Scraper.UserAgent = "Devour/1.0"
|
|
}
|
|
if c.Scraper.Timeout <= 0 {
|
|
c.Scraper.Timeout = 30 * time.Second
|
|
}
|
|
if c.Scraper.RetryCount <= 0 {
|
|
c.Scraper.RetryCount = 3
|
|
}
|
|
if c.Scraper.RetryDelay <= 0 {
|
|
c.Scraper.RetryDelay = 1 * time.Second
|
|
}
|
|
if c.Scraper.Concurrency <= 0 {
|
|
c.Scraper.Concurrency = 10
|
|
}
|
|
if c.Scraper.RateLimit < 0 {
|
|
c.Scraper.RateLimit = 0
|
|
}
|
|
if c.Scraper.MaxDepth <= 0 {
|
|
c.Scraper.MaxDepth = 3
|
|
}
|
|
if c.Scraper.CacheDir == "" {
|
|
c.Scraper.CacheDir = c.Storage.CacheDir
|
|
}
|
|
|
|
if c.Scheduler.Interval <= 0 {
|
|
c.Scheduler.Interval = 72 * time.Hour
|
|
}
|
|
if c.Scheduler.CheckMethod == "" {
|
|
c.Scheduler.CheckMethod = "hash"
|
|
}
|
|
|
|
if c.Server.Mode == "" {
|
|
c.Server.Mode = "local"
|
|
}
|
|
if c.Server.Transport == "" {
|
|
c.Server.Transport = "stdio"
|
|
}
|
|
if c.Server.Host == "" {
|
|
c.Server.Host = "localhost"
|
|
}
|
|
if c.Server.Port <= 0 {
|
|
c.Server.Port = 8080
|
|
}
|
|
|
|
if !c.Indexing.Enabled {
|
|
// keep explicit false but initialize defaults for remaining fields
|
|
}
|
|
if c.Indexing.SnippetLength <= 0 {
|
|
c.Indexing.SnippetLength = 220
|
|
}
|
|
if c.Indexing.MaxDocs <= 0 {
|
|
c.Indexing.MaxDocs = 10000
|
|
}
|
|
|
|
if c.Verification.Timeout <= 0 {
|
|
c.Verification.Timeout = 90 * time.Second
|
|
}
|
|
}
|
|
|
|
// findConfigPath resolves which config file should be read.
//
// A non-blank explicitPath must name an existing non-directory file; it is
// returned as an absolute path. A missing file yields an error that starts
// with "config file not found: " and wraps the underlying stat error; any
// other stat failure (such as a permission error) and a directory path are
// reported as distinct errors.
//
// With a blank explicitPath, ./devour.yaml and then
// <home>/.devour/devour.yaml (skipped when the home directory is unknown)
// are probed in that order. The first candidate that exists and is not a
// directory is returned as an absolute path. An empty path with a nil error
// means no config file was found.
func findConfigPath(explicitPath string) (string, error) {
	if strings.TrimSpace(explicitPath) != "" {
		p, err := filepath.Abs(explicitPath)
		if err != nil {
			return "", fmt.Errorf("resolve config path %s: %w", explicitPath, err)
		}
		info, err := os.Stat(p)
		if err != nil {
			if os.IsNotExist(err) {
				return "", fmt.Errorf("config file not found: %s: %w", explicitPath, err)
			}
			// The file may exist but be unreachable; do not call it missing.
			return "", fmt.Errorf("stat config file %s: %w", explicitPath, err)
		}
		if info.IsDir() {
			return "", fmt.Errorf("config path is a directory: %s", explicitPath)
		}
		return p, nil
	}

	candidates := []string{"./devour.yaml"}
	if home, err := os.UserHomeDir(); err == nil {
		candidates = append(candidates, filepath.Join(home, ".devour", "devour.yaml"))
	}

	for _, c := range candidates {
		info, err := os.Stat(c)
		if err != nil || info.IsDir() {
			// Unusable candidates are skipped so the next one can be tried.
			continue
		}
		p, err := filepath.Abs(c)
		if err != nil {
			return "", err
		}
		return p, nil
	}
	return "", nil
}
|
|
|
|
// EnsureStorageDirs creates required local storage directories.
|
|
func (c *Config) EnsureStorageDirs() error {
|
|
dirs := []string{c.Storage.DocsDir, c.Storage.IndexDir, c.Storage.MetadataDir, c.Storage.CacheDir}
|
|
for _, dir := range dirs {
|
|
if strings.TrimSpace(dir) == "" {
|
|
continue
|
|
}
|
|
if err := os.MkdirAll(dir, 0o755); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|