mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
450 lines
14 KiB
Go
450 lines
14 KiB
Go
package cmd
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"errors"
|
|
"fmt"
|
|
"net/url"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/spf13/cobra"
|
|
appconfig "github.com/yourorg/devour/internal/config"
|
|
"github.com/yourorg/devour/internal/projectstate"
|
|
"github.com/yourorg/devour/internal/scraper"
|
|
"github.com/yourorg/devour/internal/search"
|
|
"github.com/yourorg/devour/internal/storage"
|
|
"gopkg.in/yaml.v3"
|
|
)
|
|
|
|
var scrapeCmd = &cobra.Command{
|
|
Use: "scrape <source>",
|
|
Short: "Scrape documentation from a source",
|
|
Long: `Scrape and index documentation from various sources.
|
|
|
|
Supported source types:
|
|
- godocs: Go packages (pkg.go.dev)
|
|
- rustdocs: Rust crates (docs.rs)
|
|
- pythondocs: Python modules (docs.python.org)
|
|
- javadocs: Java packages (docs.oracle.com)
|
|
- springdocs: Spring Boot (docs.spring.io)
|
|
- tsdocs: TypeScript (typescriptlang.org)
|
|
- reactdocs: React (react.dev)
|
|
- vuedocs: Vue.js (vuejs.org)
|
|
- nuxtdocs: Nuxt (nuxt.com)
|
|
- mcpdocs: MCP servers (hub.docker.com/mcp)
|
|
- dockerdocs: Docker (docs.docker.com)
|
|
- cloudflaredocs: Cloudflare (developers.cloudflare.com)
|
|
- astrodocs: Astro (docs.astro.build)
|
|
- localsearch: Self-hosted search API returning JSON results
|
|
- url: Generic web pages
|
|
- github: GitHub repositories
|
|
- openapi: OpenAPI/Swagger specs
|
|
- local: Local files/directories
|
|
|
|
Examples:
|
|
devour scrape https://pkg.go.dev/net/http --type godocs
|
|
devour scrape https://react.dev/reference/react --type reactdocs
|
|
devour scrape https://developers.cloudflare.com/ --type cloudflaredocs
|
|
devour scrape http://127.0.0.1:8080/search --type localsearch --search-query "golang http client"
|
|
devour scrape --sources sources.yaml`,
|
|
Args: cobra.MaximumNArgs(1),
|
|
RunE: runScrape,
|
|
}
|
|
|
|
// Flag values for the scrape command. Each variable is bound to its flag in
// init and read by runScrape, scrapeFromConfig and scrapeOne.
var (
	// Output selection.
	scrapeFormat string // --format / -f
	scrapeSources string // --sources / -s: YAML file listing sources
	scrapeOutput  string // --output / -o: overrides the configured docs dir

	// Scraper behaviour.
	scrapeConcurrency int    // --concurrency
	scrapeType        string // --type / -t: empty means auto-detect

	// Options that only apply to --type localsearch.
	scrapeSearchQuery   string   // --search-query
	scrapeSearchLimit   int      // --search-limit
	scrapeSearchDomains []string // --search-domain (repeatable)

	// URL/file filtering and empty-result handling.
	scrapeInclude    []string // --include (repeatable)
	scrapeExclude    []string // --exclude (repeatable)
	scrapeAllowEmpty bool     // --allow-empty
)
|
|
|
|
func init() {
|
|
scrapeCmd.Flags().StringVarP(&scrapeFormat, "format", "f", "json", "output format (json, markdown)")
|
|
scrapeCmd.Flags().StringVarP(&scrapeSources, "sources", "s", "", "YAML file with source definitions")
|
|
scrapeCmd.Flags().StringVarP(&scrapeOutput, "output", "o", "", "output directory (default: configured docs dir)")
|
|
scrapeCmd.Flags().IntVar(&scrapeConcurrency, "concurrency", 10, "parallel scraping workers")
|
|
scrapeCmd.Flags().StringVarP(&scrapeType, "type", "t", "", "source type (auto-detected if not specified)")
|
|
scrapeCmd.Flags().StringVar(&scrapeSearchQuery, "search-query", "", "search query for --type localsearch")
|
|
scrapeCmd.Flags().IntVar(&scrapeSearchLimit, "search-limit", 8, "max result URLs to scrape for --type localsearch")
|
|
scrapeCmd.Flags().StringSliceVar(&scrapeSearchDomains, "search-domain", nil, "restrict localsearch results to these domains (repeatable)")
|
|
scrapeCmd.Flags().StringSliceVar(&scrapeInclude, "include", nil, "include URL/file regex patterns (repeatable)")
|
|
scrapeCmd.Flags().StringSliceVar(&scrapeExclude, "exclude", nil, "exclude URL/file regex patterns (repeatable)")
|
|
scrapeCmd.Flags().BoolVar(&scrapeAllowEmpty, "allow-empty", false, "allow success when no documents were extracted")
|
|
}
|
|
|
|
func runScrape(cmd *cobra.Command, args []string) error {
|
|
cfg, err := loadAppConfig()
|
|
if err != nil {
|
|
return fmt.Errorf("load app config for scrape command: %w", err)
|
|
}
|
|
|
|
if scrapeSources != "" {
|
|
return scrapeFromConfig(cmd, cfg, scrapeSources)
|
|
}
|
|
|
|
if len(args) == 0 {
|
|
return fmt.Errorf("source argument required when not using --sources flag")
|
|
}
|
|
|
|
sourceURL := strings.TrimSpace(args[0])
|
|
sourceType := scraper.SourceType(scrapeType)
|
|
if sourceType == "" {
|
|
sourceType = detectSourceType(sourceURL)
|
|
}
|
|
|
|
source := &scraper.Source{
|
|
Name: extractName(sourceURL),
|
|
Type: sourceType,
|
|
URL: sourceURL,
|
|
Query: strings.TrimSpace(scrapeSearchQuery),
|
|
ResultLimit: scrapeSearchLimit,
|
|
Domains: append([]string(nil), scrapeSearchDomains...),
|
|
Include: append([]string(nil), scrapeInclude...),
|
|
Exclude: append([]string(nil), scrapeExclude...),
|
|
}
|
|
if sourceType == scraper.SourceTypeLocal {
|
|
source.Path = sourceURL
|
|
}
|
|
applySourceProfile(source)
|
|
|
|
outputDir := resolveOutputDir(cfg, scrapeOutput)
|
|
count, err := scrapeOne(cmd, cfg, source, outputDir)
|
|
if err != nil {
|
|
return fmt.Errorf("scrape source %q: %w", sourceURL, err)
|
|
}
|
|
|
|
if cfg.Indexing.Enabled {
|
|
engine := search.NewEngine(cfg)
|
|
if _, err := engine.Rebuild(context.Background()); err != nil {
|
|
return fmt.Errorf("reindex after scrape: %w", err)
|
|
}
|
|
}
|
|
|
|
fmt.Printf("\n✓ Scraping complete!\n")
|
|
fmt.Printf(" Output: %s\n", outputDir)
|
|
fmt.Printf(" Documents: %d\n", count)
|
|
fmt.Println(" Run 'devour status' to inspect local index health")
|
|
return nil
|
|
}
|
|
|
|
func scrapeFromConfig(cmd *cobra.Command, cfg *appconfig.Config, configPath string) error {
|
|
raw, err := os.ReadFile(configPath)
|
|
if err != nil {
|
|
return fmt.Errorf("read sources file: %w", err)
|
|
}
|
|
|
|
var list []appconfig.SourceConfig
|
|
if err := yaml.Unmarshal(raw, &list); err != nil || len(list) == 0 {
|
|
var wrapped struct {
|
|
Sources []appconfig.SourceConfig `yaml:"sources"`
|
|
}
|
|
if wrapErr := yaml.Unmarshal(raw, &wrapped); wrapErr != nil {
|
|
return fmt.Errorf("parse sources file: %w", wrapErr)
|
|
}
|
|
list = wrapped.Sources
|
|
}
|
|
if len(list) == 0 {
|
|
return fmt.Errorf("sources file contains no sources")
|
|
}
|
|
|
|
sort.Slice(list, func(i, j int) bool {
|
|
return list[i].Name < list[j].Name
|
|
})
|
|
|
|
outputDir := resolveOutputDir(cfg, scrapeOutput)
|
|
success := 0
|
|
failures := 0
|
|
totalDocs := 0
|
|
sourceErrors := make([]error, 0)
|
|
for _, srcCfg := range list {
|
|
source := sourceFromConfig(srcCfg)
|
|
if source.Type == "" {
|
|
if source.URL != "" {
|
|
source.Type = detectSourceType(source.URL)
|
|
} else if source.Path != "" {
|
|
source.Type = scraper.SourceTypeLocal
|
|
}
|
|
}
|
|
if source.Name == "" {
|
|
source.Name = extractName(source.URL)
|
|
if source.Name == "unknown" && source.Path != "" {
|
|
source.Name = filepath.Base(source.Path)
|
|
}
|
|
}
|
|
applySourceProfile(source)
|
|
|
|
fmt.Printf("\n=== Source: %s (%s) ===\n", source.Name, source.Type)
|
|
count, srcErr := scrapeOne(cmd, cfg, source, outputDir)
|
|
if srcErr != nil {
|
|
failures++
|
|
fmt.Printf("✗ %s failed: %v\n", source.Name, srcErr)
|
|
sourceErrors = append(sourceErrors, fmt.Errorf("%s: %w", source.Name, srcErr))
|
|
continue
|
|
}
|
|
totalDocs += count
|
|
success++
|
|
}
|
|
|
|
if cfg.Indexing.Enabled {
|
|
engine := search.NewEngine(cfg)
|
|
if _, err := engine.Rebuild(context.Background()); err != nil {
|
|
return fmt.Errorf("reindex after scrape sources: %w", err)
|
|
}
|
|
}
|
|
|
|
fmt.Printf("\nSummary: %d succeeded, %d failed, %d docs written\n", success, failures, totalDocs)
|
|
if failures > 0 {
|
|
return fmt.Errorf("one or more sources failed: %w", errors.Join(sourceErrors...))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func scrapeOne(cmd *cobra.Command, cfg *appconfig.Config, source *scraper.Source, outputDir string) (int, error) {
|
|
if source == nil {
|
|
return 0, fmt.Errorf("source is required")
|
|
}
|
|
if source.Type == "" {
|
|
return 0, fmt.Errorf("source type is required")
|
|
}
|
|
|
|
if source.Type == scraper.SourceTypeLocalSearch && strings.TrimSpace(source.Query) == "" {
|
|
return 0, fmt.Errorf("search query is required for localsearch sources")
|
|
}
|
|
|
|
scraperConfig := toScraperConfig(cfg, scrapeConcurrency)
|
|
s := scraper.NewScraper(source.Type, scraperConfig)
|
|
if s == nil {
|
|
return 0, fmt.Errorf("unsupported source type: %s", source.Type)
|
|
}
|
|
|
|
fmt.Printf("Scraping: %s\n", chooseSourceLabel(source))
|
|
fmt.Printf(" Type: %s\n", source.Type)
|
|
fmt.Printf(" Concurrency: %d\n", scraperConfig.Concurrency)
|
|
if source.Type == scraper.SourceTypeLocalSearch {
|
|
fmt.Printf(" Search query: %s\n", source.Query)
|
|
fmt.Printf(" Search limit: %d\n", source.ResultLimit)
|
|
if len(source.Domains) > 0 {
|
|
fmt.Printf(" Search domains: %s\n", strings.Join(source.Domains, ", "))
|
|
}
|
|
}
|
|
fmt.Println()
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), scraperConfig.Timeout*2)
|
|
defer cancel()
|
|
|
|
docs, err := s.Scrape(ctx, source)
|
|
if err != nil {
|
|
return 0, fmt.Errorf("scraping failed: %w", err)
|
|
}
|
|
|
|
save, err := storage.SaveDocuments(docs, storage.SaveOptions{
|
|
Format: scrapeFormat,
|
|
OutputDir: outputDir,
|
|
AllowEmpty: scrapeAllowEmpty,
|
|
PrintWriter: func(format string, args ...any) {
|
|
_, _ = fmt.Printf(format, args...)
|
|
},
|
|
})
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
fmt.Printf("✓ Scraped %d documents\n", save.Count)
|
|
|
|
if err := updateSourceState(cfg, source, docs); err != nil {
|
|
return save.Count, fmt.Errorf("update source state: %w", err)
|
|
}
|
|
|
|
return save.Count, nil
|
|
}
|
|
|
|
func updateSourceState(cfg *appconfig.Config, source *scraper.Source, docs []*scraper.Document) error {
|
|
state, err := projectstate.LoadSourceState(cfg.Storage.MetadataDir)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
key := source.Name
|
|
if key == "" {
|
|
key = chooseSourceLabel(source)
|
|
}
|
|
|
|
h := sha256.New()
|
|
for _, d := range docs {
|
|
if d == nil {
|
|
continue
|
|
}
|
|
fmt.Fprintf(h, "%s|%s|%s\n", d.ID, d.Hash, d.URL)
|
|
}
|
|
state.Sources[key] = &projectstate.SourceState{
|
|
Name: source.Name,
|
|
Type: string(source.Type),
|
|
URL: source.URL,
|
|
Hash: hex.EncodeToString(h.Sum(nil)),
|
|
LastSync: time.Now(),
|
|
DocCount: len(docs),
|
|
}
|
|
|
|
return projectstate.SaveSourceState(cfg.Storage.MetadataDir, state)
|
|
}
|
|
|
|
func chooseSourceLabel(source *scraper.Source) string {
|
|
if strings.TrimSpace(source.URL) != "" {
|
|
return source.URL
|
|
}
|
|
if strings.TrimSpace(source.Path) != "" {
|
|
return source.Path
|
|
}
|
|
if strings.TrimSpace(source.Repo) != "" {
|
|
return source.Repo
|
|
}
|
|
return source.Name
|
|
}
|
|
|
|
func detectSourceType(sourceURL string) scraper.SourceType {
|
|
u, err := url.Parse(sourceURL)
|
|
if err != nil {
|
|
if sourceURL != "" && !strings.HasPrefix(sourceURL, "http://") && !strings.HasPrefix(sourceURL, "https://") {
|
|
return scraper.SourceTypeLocal
|
|
}
|
|
return scraper.SourceTypeWeb
|
|
}
|
|
|
|
host := u.Host
|
|
path := u.Path
|
|
|
|
switch {
|
|
case host == "pkg.go.dev" || strings.HasSuffix(host, "pkg.go.dev"):
|
|
return scraper.SourceTypeGoDocs
|
|
case host == "docs.rs" || host == "doc.rust-lang.org":
|
|
return scraper.SourceTypeRustDocs
|
|
case host == "docs.python.org":
|
|
return scraper.SourceTypePythonDocs
|
|
case host == "docs.oracle.com":
|
|
return scraper.SourceTypeJavaDocs
|
|
case host == "docs.spring.io":
|
|
return scraper.SourceTypeSpringDocs
|
|
case host == "www.typescriptlang.org" || host == "typescriptlang.org":
|
|
return scraper.SourceTypeTSDocs
|
|
case host == "react.dev":
|
|
return scraper.SourceTypeReactDocs
|
|
case host == "vuejs.org":
|
|
return scraper.SourceTypeVueDocs
|
|
case host == "nuxt.com":
|
|
return scraper.SourceTypeNuxtDocs
|
|
case strings.Contains(host, "docker.com") || host == "docs.docker.com":
|
|
if strings.Contains(path, "/mcp/") {
|
|
return scraper.SourceTypeMCPDocs
|
|
}
|
|
return scraper.SourceTypeDockerDocs
|
|
case host == "developers.cloudflare.com":
|
|
return scraper.SourceTypeCloudflareDocs
|
|
case host == "docs.astro.build":
|
|
return scraper.SourceTypeAstroDocs
|
|
case host == "github.com":
|
|
return scraper.SourceTypeGitHub
|
|
case strings.HasSuffix(path, ".json") || strings.HasSuffix(path, ".yaml") || strings.HasSuffix(path, ".yml"):
|
|
if strings.Contains(strings.ToLower(path), "openapi") || strings.Contains(strings.ToLower(path), "swagger") {
|
|
return scraper.SourceTypeOpenAPI
|
|
}
|
|
return scraper.SourceTypeWeb
|
|
default:
|
|
return scraper.SourceTypeWeb
|
|
}
|
|
}
|
|
|
|
// extractName derives a short source name from sourceURL.
//
// For a parseable URL it returns the last non-blank path segment, or the host
// (including any port) when the path has none. For input that does not parse
// as a URL it returns the filepath base name. "unknown" is returned when
// nothing usable is found.
func extractName(sourceURL string) string {
	const fallback = "unknown"

	parsed, parseErr := url.Parse(sourceURL)
	if parseErr != nil {
		if strings.TrimSpace(sourceURL) == "" {
			return fallback
		}
		return filepath.Base(sourceURL)
	}

	// Take whatever follows the final "/" once surrounding slashes are gone.
	trimmed := strings.Trim(parsed.Path, "/")
	last := trimmed
	if idx := strings.LastIndex(trimmed, "/"); idx >= 0 {
		last = trimmed[idx+1:]
	}
	if strings.TrimSpace(last) != "" {
		return last
	}

	if strings.TrimSpace(parsed.Host) == "" {
		return fallback
	}
	return parsed.Host
}
|
|
|
|
func applySourceProfile(source *scraper.Source) {
|
|
if source == nil {
|
|
return
|
|
}
|
|
if source.Type != scraper.SourceTypeWeb && source.Type != scraper.SourceTypeLocalSearch {
|
|
return
|
|
}
|
|
if strings.TrimSpace(source.URL) == "" {
|
|
return
|
|
}
|
|
|
|
u, err := url.Parse(source.URL)
|
|
if err != nil {
|
|
return
|
|
}
|
|
host := strings.ToLower(u.Host)
|
|
if host == "" {
|
|
return
|
|
}
|
|
|
|
// Preserve explicit user-provided patterns.
|
|
if len(source.Include) > 0 || len(source.Exclude) > 0 {
|
|
return
|
|
}
|
|
|
|
switch {
|
|
case strings.Contains(host, "learn.microsoft.com"):
|
|
source.Include = []string{`/dotnet/`, `/csharp/`, `/base-types/`}
|
|
source.Exclude = []string{`/previous-versions/`, `/answers/`, `/support/`, `/training/`, `/events/`, `/products/`}
|
|
case strings.Contains(host, "kotlinlang.org"):
|
|
source.Include = []string{`/docs/`}
|
|
source.Exclude = []string{`/community/`, `/api/`, `/releases/`}
|
|
case strings.Contains(host, "php.net"):
|
|
source.Include = []string{`/manual/en/`}
|
|
source.Exclude = []string{`/manual/(de|fr|es|ja|ru|pt)/`, `/downloads.php`, `/bugs.php`}
|
|
case strings.Contains(host, "ruby-doc.org"):
|
|
source.Include = []string{`/core/`}
|
|
source.Exclude = []string{`/stdlib/`, `/gems/`}
|
|
case strings.Contains(host, "hexdocs.pm"):
|
|
source.Include = []string{`/elixir/`}
|
|
source.Exclude = []string{`/phoenix/`, `/ecto/`}
|
|
case strings.Contains(host, "nextjs.org"):
|
|
source.Include = []string{`/docs/`}
|
|
source.Exclude = []string{`/showcase`, `/blog`, `/learn/`, `/pricing`}
|
|
case strings.Contains(host, "svelte.dev"):
|
|
source.Include = []string{`/docs/`}
|
|
source.Exclude = []string{`/playground`, `/tutorial`, `/blog`}
|
|
case strings.Contains(host, "angular.dev"):
|
|
source.Include = []string{`/guide/`, `/api/`, `/tutorials/`}
|
|
source.Exclude = []string{`/resources/`, `/playground`}
|
|
case strings.Contains(host, "remix.run"):
|
|
source.Include = []string{`/docs/`}
|
|
source.Exclude = []string{`/blog`, `/conf`, `/merch`}
|
|
case strings.Contains(host, "solidjs.com"):
|
|
source.Include = []string{`/docs/`}
|
|
source.Exclude = []string{`/community`, `/showcase`, `/blog`}
|
|
case strings.Contains(host, "expressjs.com"):
|
|
source.Include = []string{`/en/(guide|api|advanced)/`}
|
|
source.Exclude = []string{`/en/starter/`, `/cn/`, `/fr/`, `/es/`, `/de/`}
|
|
}
|
|
}
|