This commit is contained in:
Tomas Dvorak
2026-02-24 10:33:59 +01:00
parent 409acd2e08
commit 898a3c303f
1374 changed files with 290409 additions and 29187 deletions
+302 -98
View File
@@ -2,17 +2,23 @@ package cmd
import (
"context"
"encoding/json"
"crypto/sha256"
"encoding/hex"
"fmt"
"net/url"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/spf13/cobra"
"github.com/yourorg/devour/internal/markdown"
appconfig "github.com/yourorg/devour/internal/config"
"github.com/yourorg/devour/internal/projectstate"
"github.com/yourorg/devour/internal/scraper"
"github.com/yourorg/devour/internal/search"
"github.com/yourorg/devour/internal/storage"
"gopkg.in/yaml.v3"
)
var scrapeCmd = &cobra.Command{
@@ -34,144 +40,283 @@ Supported source types:
- dockerdocs: Docker (docs.docker.com)
- cloudflaredocs: Cloudflare (developers.cloudflare.com)
- astrodocs: Astro (docs.astro.build)
- localsearch: Self-hosted search API returning JSON results
- url: Generic web pages
- github: GitHub repositories
- openapi: OpenAPI/Swagger specs
- local: Local files/directories
Examples:
devour scrape https://pkg.go.dev/net/http --type godocs
devour scrape https://react.dev/reference/react --type reactdocs
devour scrape https://developers.cloudflare.com/ --type cloudflaredocs
devour scrape http://127.0.0.1:8080/search --type localsearch --search-query "golang http client"
devour scrape --sources sources.yaml`,
Args: cobra.MaximumNArgs(1),
RunE: runScrape,
}
// Values bound to the scrape command's flags. Each variable is wired to its
// flag in init and read by the scrape helpers after cobra has parsed the
// command line. Every name is declared exactly once; declaring the same
// package-level variable twice in one block does not compile.
var (
	scrapeFormat        string
	scrapeSources       string
	scrapeOutput        string
	scrapeConcurrency   int
	scrapeType          string
	scrapeSearchQuery   string
	scrapeSearchLimit   int
	scrapeSearchDomains []string
	scrapeInclude       []string
	scrapeExclude       []string
	scrapeAllowEmpty    bool
)
// init registers the flags of the scrape command.
//
// Every flag is registered exactly once: registering the same flag name (or
// shorthand) twice on one flag set panics at program start, so only a single
// --output/-o registration is kept.
func init() {
	flags := scrapeCmd.Flags()

	// General options.
	flags.StringVarP(&scrapeFormat, "format", "f", "json", "output format (json, markdown)")
	flags.StringVarP(&scrapeSources, "sources", "s", "", "YAML file with source definitions")
	flags.StringVarP(&scrapeOutput, "output", "o", "", "output directory (default: configured docs dir)")
	flags.IntVar(&scrapeConcurrency, "concurrency", 10, "parallel scraping workers")
	flags.StringVarP(&scrapeType, "type", "t", "", "source type (auto-detected if not specified)")

	// Options that only apply to --type localsearch.
	flags.StringVar(&scrapeSearchQuery, "search-query", "", "search query for --type localsearch")
	flags.IntVar(&scrapeSearchLimit, "search-limit", 8, "max result URLs to scrape for --type localsearch")
	flags.StringSliceVar(&scrapeSearchDomains, "search-domain", nil, "restrict localsearch results to these domains (repeatable)")

	// Filtering and result handling.
	flags.StringSliceVar(&scrapeInclude, "include", nil, "include URL/file regex patterns (repeatable)")
	flags.StringSliceVar(&scrapeExclude, "exclude", nil, "exclude URL/file regex patterns (repeatable)")
	flags.BoolVar(&scrapeAllowEmpty, "allow-empty", false, "allow success when no documents were extracted")
}
// runScrape is the RunE handler of the scrape command.
//
// With --sources set it delegates to scrapeFromConfig. Otherwise it requires
// one positional source argument, builds a scraper.Source from that argument
// and the command-line flags, scrapes it via scrapeOne, and rebuilds the
// search index when cfg.Indexing.Enabled is true.
//
// It returns the first error from loading the app config, scraping, or
// re-indexing.
func runScrape(cmd *cobra.Command, args []string) error {
	cfg, err := loadAppConfig()
	if err != nil {
		return err
	}

	if scrapeSources != "" {
		return scrapeFromConfig(cmd, cfg, scrapeSources)
	}

	if len(args) == 0 {
		return fmt.Errorf("source argument required when not using --sources flag")
	}

	sourceURL := strings.TrimSpace(args[0])
	if sourceURL == "" {
		// A whitespace-only argument would otherwise reach the scraper as an
		// empty URL and fail later with a less helpful message.
		return fmt.Errorf("source argument must not be empty")
	}

	sourceType := scraper.SourceType(scrapeType)
	if sourceType == "" {
		sourceType = detectSourceType(sourceURL)
	}

	// Slice flags are copied so the source never aliases the flag storage.
	source := &scraper.Source{
		Name:        extractName(sourceURL),
		Type:        sourceType,
		URL:         sourceURL,
		Query:       strings.TrimSpace(scrapeSearchQuery),
		ResultLimit: scrapeSearchLimit,
		Domains:     append([]string(nil), scrapeSearchDomains...),
		Include:     append([]string(nil), scrapeInclude...),
		Exclude:     append([]string(nil), scrapeExclude...),
	}
	if sourceType == scraper.SourceTypeLocal {
		source.Path = sourceURL
	}
	applySourceProfile(source)

	// scrapeOne prints the "Scraping / Type / Concurrency" header itself, so it
	// is not repeated here.
	outputDir := resolveOutputDir(cfg, scrapeOutput)
	count, err := scrapeOne(cmd, cfg, source, outputDir)
	if err != nil {
		return err
	}

	if cfg.Indexing.Enabled {
		engine := search.NewEngine(cfg)
		if _, err := engine.Rebuild(context.Background()); err != nil {
			return fmt.Errorf("reindex after scrape: %w", err)
		}
	}

	fmt.Printf("\n✓ Scraping complete!\n")
	fmt.Printf("  Output: %s\n", outputDir)
	fmt.Printf("  Documents: %d\n", count)
	fmt.Println("  Run 'devour status' to inspect local index health")
	return nil
}
// scrapeFromConfig scrapes every source listed in the YAML file at configPath.
//
// Two layouts are accepted: a bare YAML list of sources, or a mapping with a
// top-level "sources" list. Sources are processed in name order. A failing
// source is reported and counted but does not stop the run. After all sources
// have been attempted the search index is rebuilt when cfg.Indexing.Enabled is
// true, and a summary line is printed.
//
// It returns an error when the file cannot be read or parsed, when it contains
// no sources, when re-indexing fails, or when at least one source failed.
func scrapeFromConfig(cmd *cobra.Command, cfg *appconfig.Config, configPath string) error {
	raw, err := os.ReadFile(configPath)
	if err != nil {
		return fmt.Errorf("read sources file: %w", err)
	}

	var list []appconfig.SourceConfig
	if listErr := yaml.Unmarshal(raw, &list); listErr != nil || len(list) == 0 {
		var wrapped struct {
			Sources []appconfig.SourceConfig `yaml:"sources"`
		}
		wrapErr := yaml.Unmarshal(raw, &wrapped)
		switch {
		case wrapErr == nil:
			list = wrapped.Sources
		case listErr != nil:
			// Neither layout parsed. Report both errors: which one is the
			// relevant one depends on the layout the author intended.
			return fmt.Errorf("parse sources file: as list: %v; as sources map: %w", listErr, wrapErr)
		default:
			// The file is a valid but empty list. Fall through to the
			// "no sources" error instead of wrapping a nil error.
			list = nil
		}
	}
	if len(list) == 0 {
		return fmt.Errorf("sources file contains no sources")
	}

	// Stable sort keeps file order for sources that share a name (including
	// unnamed ones), so runs are deterministic.
	sort.SliceStable(list, func(i, j int) bool {
		return list[i].Name < list[j].Name
	})

	outputDir := resolveOutputDir(cfg, scrapeOutput)
	success := 0
	failures := 0
	totalDocs := 0

	for _, srcCfg := range list {
		source := sourceFromConfig(srcCfg)

		// Fill in a missing type from the URL, or treat a path-only source as local.
		if source.Type == "" {
			if source.URL != "" {
				source.Type = detectSourceType(source.URL)
			} else if source.Path != "" {
				source.Type = scraper.SourceTypeLocal
			}
		}

		// Fill in a missing name from the URL, falling back to the path's base name.
		if source.Name == "" {
			source.Name = extractName(source.URL)
			if source.Name == "unknown" && source.Path != "" {
				source.Name = filepath.Base(source.Path)
			}
		}
		applySourceProfile(source)

		fmt.Printf("\n=== Source: %s (%s) ===\n", source.Name, source.Type)
		count, srcErr := scrapeOne(cmd, cfg, source, outputDir)
		if srcErr != nil {
			failures++
			fmt.Printf("✗ %s failed: %v\n", source.Name, srcErr)
			continue
		}
		totalDocs += count
		success++
	}

	if cfg.Indexing.Enabled {
		engine := search.NewEngine(cfg)
		if _, err := engine.Rebuild(context.Background()); err != nil {
			return fmt.Errorf("reindex after scrape sources: %w", err)
		}
	}

	fmt.Printf("\nSummary: %d succeeded, %d failed, %d docs written\n", success, failures, totalDocs)
	if failures > 0 {
		return fmt.Errorf("one or more sources failed")
	}
	return nil
}
func scrapeOne(cmd *cobra.Command, cfg *appconfig.Config, source *scraper.Source, outputDir string) (int, error) {
if source == nil {
return 0, fmt.Errorf("source is required")
}
if source.Type == "" {
return 0, fmt.Errorf("source type is required")
}
if source.Type == scraper.SourceTypeLocalSearch && strings.TrimSpace(source.Query) == "" {
return 0, fmt.Errorf("search query is required for localsearch sources")
}
scraperConfig := toScraperConfig(cfg, scrapeConcurrency)
s := scraper.NewScraper(source.Type, scraperConfig)
if s == nil {
return 0, fmt.Errorf("unsupported source type: %s", source.Type)
}
fmt.Printf("Scraping: %s\n", chooseSourceLabel(source))
fmt.Printf(" Type: %s\n", source.Type)
fmt.Printf(" Concurrency: %d\n", scraperConfig.Concurrency)
if source.Type == scraper.SourceTypeLocalSearch {
fmt.Printf(" Search query: %s\n", source.Query)
fmt.Printf(" Search limit: %d\n", source.ResultLimit)
if len(source.Domains) > 0 {
fmt.Printf(" Search domains: %s\n", strings.Join(source.Domains, ", "))
}
}
fmt.Println()
s := scraper.NewScraper(sourceType, config)
if s == nil {
return fmt.Errorf("unsupported source type: %s", sourceType)
}
source := &scraper.Source{
Name: extractName(sourceURL),
Type: sourceType,
URL: sourceURL,
}
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
ctx, cancel := context.WithTimeout(context.Background(), scraperConfig.Timeout*2)
defer cancel()
docs, err := s.Scrape(ctx, source)
if err != nil {
return fmt.Errorf("scraping failed: %w", err)
return 0, fmt.Errorf("scraping failed: %w", err)
}
fmt.Printf("✓ Scraped %d documents\n\n", len(docs))
if scrapeOutput == "" {
scrapeOutput = "devour_data/docs"
save, err := storage.SaveDocuments(docs, storage.SaveOptions{
Format: scrapeFormat,
OutputDir: outputDir,
AllowEmpty: scrapeAllowEmpty,
PrintWriter: func(format string, args ...any) {
_, _ = fmt.Printf(format, args...)
},
})
if err != nil {
return 0, err
}
if err := os.MkdirAll(scrapeOutput, 0755); err != nil {
return fmt.Errorf("failed to create output directory: %w", err)
fmt.Printf("✓ Scraped %d documents\n", save.Count)
if err := updateSourceState(cfg, source, docs); err != nil {
return save.Count, fmt.Errorf("update source state: %w", err)
}
for i, doc := range docs {
var filename string
var content []byte
if scrapeFormat == "markdown" {
filename = fmt.Sprintf("%s_%d.md", sanitizeFilename(doc.Title), i)
// Create enhanced markdown document
markdownDoc := &markdown.Document{
ID: doc.ID,
Source: doc.Source,
Type: string(doc.Type),
Title: doc.Title,
Content: doc.Content,
URL: doc.URL,
Metadata: doc.Metadata,
Hash: doc.Hash,
Timestamp: doc.Timestamp,
}
formatter := markdown.NewFormatter()
content = []byte(formatter.FormatWithTOC(markdownDoc))
} else {
filename = fmt.Sprintf("%s_%d.json", sanitizeFilename(doc.Title), i)
content, err = json.MarshalIndent(doc, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal document: %w", err)
}
}
filePath := filepath.Join(scrapeOutput, filename)
if err := os.WriteFile(filePath, content, 0644); err != nil {
return fmt.Errorf("failed to write document: %w", err)
}
fmt.Printf(" 📄 %s (%s)\n", filename, doc.Type)
}
fmt.Printf("\n✓ Scraping complete!\n")
fmt.Printf(" Output: %s\n", scrapeOutput)
fmt.Println(" Run 'devour status' to see indexed documents")
return nil
return save.Count, nil
}
func scrapeFromConfig(configPath string) error {
return fmt.Errorf("scraping from config file not yet implemented")
// updateSourceState records the outcome of scraping source in the persisted
// source state kept under cfg.Storage.MetadataDir.
//
// The entry is keyed by the source name, or by chooseSourceLabel(source) when
// the name is empty. Its hash is the SHA-256 of one "ID|Hash|URL" line per
// non-nil document, taken in slice order. DocCount is len(docs).
//
// Any error from loading or saving the state is returned unchanged.
func updateSourceState(cfg *appconfig.Config, source *scraper.Source, docs []*scraper.Document) error {
	metadataDir := cfg.Storage.MetadataDir

	state, loadErr := projectstate.LoadSourceState(metadataDir)
	if loadErr != nil {
		return loadErr
	}

	// Fold every non-nil document into a single digest.
	digest := sha256.New()
	for _, doc := range docs {
		if doc != nil {
			fmt.Fprintf(digest, "%s|%s|%s\n", doc.ID, doc.Hash, doc.URL)
		}
	}

	entryKey := source.Name
	if entryKey == "" {
		entryKey = chooseSourceLabel(source)
	}

	entry := &projectstate.SourceState{
		Name:     source.Name,
		Type:     string(source.Type),
		URL:      source.URL,
		Hash:     hex.EncodeToString(digest.Sum(nil)),
		LastSync: time.Now(),
		DocCount: len(docs),
	}
	state.Sources[entryKey] = entry

	return projectstate.SaveSourceState(metadataDir, state)
}
// chooseSourceLabel returns a human-readable label for source: the first of
// URL, Path and Repo that is not blank, or the source name when all three are
// blank. The chosen value is returned as stored, without trimming.
func chooseSourceLabel(source *scraper.Source) string {
	for _, candidate := range []string{source.URL, source.Path, source.Repo} {
		if strings.TrimSpace(candidate) != "" {
			return candidate
		}
	}
	return source.Name
}
func detectSourceType(sourceURL string) scraper.SourceType {
u, err := url.Parse(sourceURL)
if err != nil {
if sourceURL != "" && !strings.HasPrefix(sourceURL, "http://") && !strings.HasPrefix(sourceURL, "https://") {
return scraper.SourceTypeLocal
}
return scraper.SourceTypeWeb
}
@@ -208,6 +353,11 @@ func detectSourceType(sourceURL string) scraper.SourceType {
return scraper.SourceTypeAstroDocs
case host == "github.com":
return scraper.SourceTypeGitHub
case strings.HasSuffix(path, ".json") || strings.HasSuffix(path, ".yaml") || strings.HasSuffix(path, ".yml"):
if strings.Contains(strings.ToLower(path), "openapi") || strings.Contains(strings.ToLower(path), "swagger") {
return scraper.SourceTypeOpenAPI
}
return scraper.SourceTypeWeb
default:
return scraper.SourceTypeWeb
}
@@ -216,27 +366,81 @@ func detectSourceType(sourceURL string) scraper.SourceType {
// extractName derives a short display name from a source URL or path.
//
// Preference order:
//  1. the last non-blank path segment of the parsed URL,
//  2. the URL host,
//  3. "unknown".
//
// If sourceURL cannot be parsed as a URL, its base file name is used when the
// input is not blank, otherwise "unknown".
func extractName(sourceURL string) string {
	u, err := url.Parse(sourceURL)
	if err != nil {
		if strings.TrimSpace(sourceURL) != "" {
			return filepath.Base(sourceURL)
		}
		return "unknown"
	}

	// strings.Split never returns an empty slice, so for a bare host or "/"
	// the only segment is "" and the host fallback below applies.
	parts := strings.Split(strings.Trim(u.Path, "/"), "/")
	if len(parts) > 0 && strings.TrimSpace(parts[len(parts)-1]) != "" {
		return parts[len(parts)-1]
	}
	if strings.TrimSpace(u.Host) != "" {
		return u.Host
	}
	return "unknown"
}
func sanitizeFilename(name string) string {
name = strings.ToLower(name)
name = strings.ReplaceAll(name, " ", "_")
name = strings.ReplaceAll(name, "/", "_")
name = strings.ReplaceAll(name, ":", "_")
name = strings.ReplaceAll(name, ".", "_")
if len(name) > 50 {
name = name[:50]
func applySourceProfile(source *scraper.Source) {
if source == nil {
return
}
if source.Type != scraper.SourceTypeWeb && source.Type != scraper.SourceTypeLocalSearch {
return
}
if strings.TrimSpace(source.URL) == "" {
return
}
return name
u, err := url.Parse(source.URL)
if err != nil {
return
}
host := strings.ToLower(u.Host)
if host == "" {
return
}
// Preserve explicit user-provided patterns.
if len(source.Include) > 0 || len(source.Exclude) > 0 {
return
}
switch {
case strings.Contains(host, "learn.microsoft.com"):
source.Include = []string{`/dotnet/`, `/csharp/`, `/base-types/`}
source.Exclude = []string{`/previous-versions/`, `/answers/`, `/support/`, `/training/`, `/events/`, `/products/`}
case strings.Contains(host, "kotlinlang.org"):
source.Include = []string{`/docs/`}
source.Exclude = []string{`/community/`, `/api/`, `/releases/`}
case strings.Contains(host, "php.net"):
source.Include = []string{`/manual/en/`}
source.Exclude = []string{`/manual/(de|fr|es|ja|ru|pt)/`, `/downloads.php`, `/bugs.php`}
case strings.Contains(host, "ruby-doc.org"):
source.Include = []string{`/core/`}
source.Exclude = []string{`/stdlib/`, `/gems/`}
case strings.Contains(host, "hexdocs.pm"):
source.Include = []string{`/elixir/`}
source.Exclude = []string{`/phoenix/`, `/ecto/`}
case strings.Contains(host, "nextjs.org"):
source.Include = []string{`/docs/`}
source.Exclude = []string{`/showcase`, `/blog`, `/learn/`, `/pricing`}
case strings.Contains(host, "svelte.dev"):
source.Include = []string{`/docs/`}
source.Exclude = []string{`/playground`, `/tutorial`, `/blog`}
case strings.Contains(host, "angular.dev"):
source.Include = []string{`/guide/`, `/api/`, `/tutorials/`}
source.Exclude = []string{`/resources/`, `/playground`}
case strings.Contains(host, "remix.run"):
source.Include = []string{`/docs/`}
source.Exclude = []string{`/blog`, `/conf`, `/merch`}
case strings.Contains(host, "solidjs.com"):
source.Include = []string{`/docs/`}
source.Exclude = []string{`/community`, `/showcase`, `/blog`}
case strings.Contains(host, "expressjs.com"):
source.Include = []string{`/en/(guide|api|advanced)/`}
source.Exclude = []string{`/en/starter/`, `/cn/`, `/fr/`, `/es/`, `/de/`}
}
}