package cmd

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"errors"
	"fmt"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"github.com/spf13/cobra"
	appconfig "github.com/yourorg/devour/internal/config"
	"github.com/yourorg/devour/internal/projectstate"
	"github.com/yourorg/devour/internal/scraper"
	"github.com/yourorg/devour/internal/search"
	"github.com/yourorg/devour/internal/storage"
	"gopkg.in/yaml.v3"
)

// scrapeCmd is the "scrape" subcommand. It accepts at most one positional
// source argument; alternatively a YAML file of sources can be supplied via
// --sources.
var scrapeCmd = &cobra.Command{
	Use:   "scrape [source]",
	Short: "Scrape documentation from a source",
	Long: `Scrape and index documentation from various sources.

Supported source types:
  - godocs: Go packages (pkg.go.dev)
  - rustdocs: Rust crates (docs.rs)
  - pythondocs: Python modules (docs.python.org)
  - javadocs: Java packages (docs.oracle.com)
  - springdocs: Spring Boot (docs.spring.io)
  - tsdocs: TypeScript (typescriptlang.org)
  - reactdocs: React (react.dev)
  - vuedocs: Vue.js (vuejs.org)
  - nuxtdocs: Nuxt (nuxt.com)
  - mcpdocs: MCP servers (hub.docker.com/mcp)
  - dockerdocs: Docker (docs.docker.com)
  - cloudflaredocs: Cloudflare (developers.cloudflare.com)
  - astrodocs: Astro (docs.astro.build)
  - localsearch: Self-hosted search API returning JSON results
  - url: Generic web pages
  - github: GitHub repositories
  - openapi: OpenAPI/Swagger specs
  - local: Local files/directories

Examples:
  devour scrape https://pkg.go.dev/net/http --type godocs
  devour scrape https://react.dev/reference/react --type reactdocs
  devour scrape https://developers.cloudflare.com/ --type cloudflaredocs
  devour scrape http://127.0.0.1:8080/search --type localsearch --search-query "golang http client"
  devour scrape --sources sources.yaml`,
	Args: cobra.MaximumNArgs(1),
	RunE: runScrape,
}

// Flag values for the scrape command; bound to the command's flags in init.
var (
	scrapeFormat        string
	scrapeSources       string
	scrapeOutput        string
	scrapeConcurrency   int
	scrapeType          string
	scrapeSearchQuery   string
	scrapeSearchLimit   int
	scrapeSearchDomains []string
	scrapeInclude       []string
	scrapeExclude       []string
	scrapeAllowEmpty    bool
)

// init registers the scrape command's flags and their defaults.
func init() {
	scrapeCmd.Flags().StringVarP(&scrapeFormat, "format", "f", "json", "output format (json, markdown)")
	scrapeCmd.Flags().StringVarP(&scrapeSources, "sources", "s", "", "YAML file with source definitions")
	scrapeCmd.Flags().StringVarP(&scrapeOutput, "output", "o", "", "output directory (default: configured docs dir)")
	scrapeCmd.Flags().IntVar(&scrapeConcurrency, "concurrency", 10, "parallel scraping workers")
	scrapeCmd.Flags().StringVarP(&scrapeType, "type", "t", "", "source type (auto-detected if not specified)")
	scrapeCmd.Flags().StringVar(&scrapeSearchQuery, "search-query", "", "search query for --type localsearch")
	scrapeCmd.Flags().IntVar(&scrapeSearchLimit, "search-limit", 8, "max result URLs to scrape for --type localsearch")
	scrapeCmd.Flags().StringSliceVar(&scrapeSearchDomains, "search-domain", nil, "restrict localsearch results to these domains (repeatable)")
	scrapeCmd.Flags().StringSliceVar(&scrapeInclude, "include", nil, "include URL/file regex patterns (repeatable)")
	scrapeCmd.Flags().StringSliceVar(&scrapeExclude, "exclude", nil, "exclude URL/file regex patterns (repeatable)")
	scrapeCmd.Flags().BoolVar(&scrapeAllowEmpty, "allow-empty", false, "allow success when no documents were extracted")
}

// scrapeBaseContext returns the context attached to cmd, falling back to
// context.Background() when cmd is nil or carries no context, so callers
// always get a usable parent context.
func scrapeBaseContext(cmd *cobra.Command) context.Context {
	if cmd == nil {
		return context.Background()
	}
	if ctx := cmd.Context(); ctx != nil {
		return ctx
	}
	return context.Background()
}

// hostMatchesDomain reports whether host is exactly domain or a subdomain of
// it. Both arguments are expected to be lowercase and without a port.
func hostMatchesDomain(host, domain string) bool {
	return host == domain || strings.HasSuffix(host, "."+domain)
}

// runScrape is the RunE handler for scrapeCmd.
//
// When --sources is set it delegates to scrapeFromConfig. Otherwise it builds
// a single source from the positional argument and the command flags, scrapes
// it, rebuilds the search index when indexing is enabled in the config, and
// prints a short summary. Any failure is returned as a wrapped error.
func runScrape(cmd *cobra.Command, args []string) error {
	cfg, err := loadAppConfig()
	if err != nil {
		return fmt.Errorf("load app config for scrape command: %w", err)
	}

	if scrapeSources != "" {
		return scrapeFromConfig(cmd, cfg, scrapeSources)
	}

	if len(args) == 0 {
		return fmt.Errorf("source argument required when not using --sources flag")
	}
	sourceURL := strings.TrimSpace(args[0])
	if sourceURL == "" {
		// A whitespace-only argument would otherwise be scraped as an empty URL.
		return fmt.Errorf("source argument must not be empty")
	}

	sourceType := scraper.SourceType(scrapeType)
	if sourceType == "" {
		sourceType = detectSourceType(sourceURL)
	}

	// Flag slices are copied so later mutation of the source cannot alias the
	// package-level flag variables.
	source := &scraper.Source{
		Name:        extractName(sourceURL),
		Type:        sourceType,
		URL:         sourceURL,
		Query:       strings.TrimSpace(scrapeSearchQuery),
		ResultLimit: scrapeSearchLimit,
		Domains:     append([]string(nil), scrapeSearchDomains...),
		Include:     append([]string(nil), scrapeInclude...),
		Exclude:     append([]string(nil), scrapeExclude...),
	}
	if sourceType == scraper.SourceTypeLocal {
		source.Path = sourceURL
	}
	applySourceProfile(source)

	outputDir := resolveOutputDir(cfg, scrapeOutput)
	count, err := scrapeOne(cmd, cfg, source, outputDir)
	if err != nil {
		return fmt.Errorf("scrape source %q: %w", sourceURL, err)
	}

	if cfg.Indexing.Enabled {
		engine := search.NewEngine(cfg)
		if _, err := engine.Rebuild(scrapeBaseContext(cmd)); err != nil {
			return fmt.Errorf("reindex after scrape: %w", err)
		}
	}

	fmt.Printf("\n✓ Scraping complete!\n")
	fmt.Printf(" Output: %s\n", outputDir)
	fmt.Printf(" Documents: %d\n", count)
	fmt.Println(" Run 'devour status' to inspect local index health")
	return nil
}

// scrapeFromConfig scrapes every source listed in the YAML file at configPath.
//
// The file may be either a top-level YAML list of sources or a mapping with a
// "sources" key holding that list. Sources are processed in name order. A
// failing source does not stop the run: its error is collected, the remaining
// sources are still scraped, the index is rebuilt when enabled, and a joined
// error is returned at the end if any source failed.
func scrapeFromConfig(cmd *cobra.Command, cfg *appconfig.Config, configPath string) error {
	raw, err := os.ReadFile(configPath)
	if err != nil {
		return fmt.Errorf("read sources file: %w", err)
	}

	var list []appconfig.SourceConfig
	if err := yaml.Unmarshal(raw, &list); err != nil || len(list) == 0 {
		// Not a bare list (or an empty one): retry as {sources: [...]}.
		var wrapped struct {
			Sources []appconfig.SourceConfig `yaml:"sources"`
		}
		if wrapErr := yaml.Unmarshal(raw, &wrapped); wrapErr != nil {
			return fmt.Errorf("parse sources file: %w", wrapErr)
		}
		list = wrapped.Sources
	}
	if len(list) == 0 {
		return fmt.Errorf("sources file contains no sources")
	}

	// Stable sort keeps file order for entries with equal (e.g. empty) names,
	// making the processing order deterministic.
	sort.SliceStable(list, func(i, j int) bool { return list[i].Name < list[j].Name })

	outputDir := resolveOutputDir(cfg, scrapeOutput)
	success := 0
	totalDocs := 0
	var sourceErrors []error

	for _, srcCfg := range list {
		source := sourceFromConfig(srcCfg)

		// Fill in a missing type from the URL, or assume local for path-only entries.
		if source.Type == "" {
			if source.URL != "" {
				source.Type = detectSourceType(source.URL)
			} else if source.Path != "" {
				source.Type = scraper.SourceTypeLocal
			}
		}

		// Fill in a missing name from the URL, falling back to the path's base name.
		if source.Name == "" {
			source.Name = extractName(source.URL)
			if source.Name == "unknown" && source.Path != "" {
				source.Name = filepath.Base(source.Path)
			}
		}
		applySourceProfile(source)

		fmt.Printf("\n=== Source: %s (%s) ===\n", source.Name, source.Type)
		count, srcErr := scrapeOne(cmd, cfg, source, outputDir)
		if srcErr != nil {
			fmt.Printf("✗ %s failed: %v\n", source.Name, srcErr)
			sourceErrors = append(sourceErrors, fmt.Errorf("%s: %w", source.Name, srcErr))
			continue
		}
		totalDocs += count
		success++
	}

	if cfg.Indexing.Enabled {
		engine := search.NewEngine(cfg)
		if _, err := engine.Rebuild(scrapeBaseContext(cmd)); err != nil {
			return fmt.Errorf("reindex after scrape sources: %w", err)
		}
	}

	failures := len(sourceErrors)
	fmt.Printf("\nSummary: %d succeeded, %d failed, %d docs written\n", success, failures, totalDocs)
	if failures > 0 {
		return fmt.Errorf("one or more sources failed: %w", errors.Join(sourceErrors...))
	}
	return nil
}

// scrapeOne scrapes a single source, saves the resulting documents under
// outputDir using the --format and --allow-empty flag values, and records the
// source in the project's source state.
//
// It returns the number of documents reported by storage.SaveDocuments. The
// scrape runs under a timeout of twice the scraper config's Timeout, derived
// from the command's context so cancellation of the command propagates.
// If only the final state update fails, the saved count is returned together
// with the error.
func scrapeOne(cmd *cobra.Command, cfg *appconfig.Config, source *scraper.Source, outputDir string) (int, error) {
	if source == nil {
		return 0, fmt.Errorf("source is required")
	}
	if source.Type == "" {
		return 0, fmt.Errorf("source type is required")
	}
	if source.Type == scraper.SourceTypeLocalSearch && strings.TrimSpace(source.Query) == "" {
		return 0, fmt.Errorf("search query is required for localsearch sources")
	}

	scraperConfig := toScraperConfig(cfg, scrapeConcurrency)
	s := scraper.NewScraper(source.Type, scraperConfig)
	if s == nil {
		return 0, fmt.Errorf("unsupported source type: %s", source.Type)
	}

	fmt.Printf("Scraping: %s\n", chooseSourceLabel(source))
	fmt.Printf(" Type: %s\n", source.Type)
	fmt.Printf(" Concurrency: %d\n", scraperConfig.Concurrency)
	if source.Type == scraper.SourceTypeLocalSearch {
		fmt.Printf(" Search query: %s\n", source.Query)
		fmt.Printf(" Search limit: %d\n", source.ResultLimit)
		if len(source.Domains) > 0 {
			fmt.Printf(" Search domains: %s\n", strings.Join(source.Domains, ", "))
		}
	}
	fmt.Println()

	ctx, cancel := context.WithTimeout(scrapeBaseContext(cmd), scraperConfig.Timeout*2)
	defer cancel()

	docs, err := s.Scrape(ctx, source)
	if err != nil {
		return 0, fmt.Errorf("scraping failed: %w", err)
	}

	save, err := storage.SaveDocuments(docs, storage.SaveOptions{
		Format:     scrapeFormat,
		OutputDir:  outputDir,
		AllowEmpty: scrapeAllowEmpty,
		PrintWriter: func(format string, args ...any) {
			// Progress output is best-effort; a failed write to stdout is not actionable here.
			_, _ = fmt.Printf(format, args...)
		},
	})
	if err != nil {
		return 0, err
	}
	fmt.Printf("✓ Scraped %d documents\n", save.Count)

	if err := updateSourceState(cfg, source, docs); err != nil {
		return save.Count, fmt.Errorf("update source state: %w", err)
	}
	return save.Count, nil
}

// updateSourceState records the outcome of a scrape in the source state stored
// under cfg.Storage.MetadataDir.
//
// The entry is keyed by the source name (or, when empty, by the label from
// chooseSourceLabel) and stores a SHA-256 digest over each non-nil document's
// ID, Hash and URL, the current time, and the number of non-nil documents.
func updateSourceState(cfg *appconfig.Config, source *scraper.Source, docs []*scraper.Document) error {
	state, err := projectstate.LoadSourceState(cfg.Storage.MetadataDir)
	if err != nil {
		return err
	}

	key := source.Name
	if key == "" {
		key = chooseSourceLabel(source)
	}

	h := sha256.New()
	docCount := 0
	for _, d := range docs {
		if d == nil {
			continue
		}
		docCount++
		// Writes to a hash.Hash never return an error.
		fmt.Fprintf(h, "%s|%s|%s\n", d.ID, d.Hash, d.URL)
	}

	// Guard against a state whose map has not been initialised; writing to a
	// nil map would panic.
	if state.Sources == nil {
		state.Sources = make(map[string]*projectstate.SourceState)
	}
	state.Sources[key] = &projectstate.SourceState{
		Name:     source.Name,
		Type:     string(source.Type),
		URL:      source.URL,
		Hash:     hex.EncodeToString(h.Sum(nil)),
		LastSync: time.Now(),
		DocCount: docCount,
	}
	return projectstate.SaveSourceState(cfg.Storage.MetadataDir, state)
}

// chooseSourceLabel returns a human-readable label for source: the first
// non-blank value among URL, Path and Repo, falling back to Name. A nil
// source yields the empty string.
func chooseSourceLabel(source *scraper.Source) string {
	if source == nil {
		return ""
	}
	for _, candidate := range []string{source.URL, source.Path, source.Repo} {
		if strings.TrimSpace(candidate) != "" {
			return candidate
		}
	}
	return source.Name
}

// detectSourceType infers the scraper source type from sourceURL.
//
// Known documentation hosts map to their dedicated types (matching is
// case-insensitive and ignores any port). Docker hosts map to MCP docs when
// the path contains an "/mcp" segment, otherwise to Docker docs. Paths ending
// in .json/.yaml/.yml whose name mentions "openapi" or "swagger" map to
// OpenAPI. Anything else is a generic web source, unless the input has no
// http(s) scheme and names an existing file or directory on disk, in which
// case it is a local source. Input that cannot be parsed as a URL is local
// unless it starts with http:// or https://.
func detectSourceType(sourceURL string) scraper.SourceType {
	raw := strings.TrimSpace(sourceURL)

	u, err := url.Parse(raw)
	if err != nil {
		if raw != "" && !strings.HasPrefix(raw, "http://") && !strings.HasPrefix(raw, "https://") {
			return scraper.SourceTypeLocal
		}
		return scraper.SourceTypeWeb
	}

	// Hostname drops the port; lowercasing makes the comparisons case-insensitive.
	host := strings.ToLower(u.Hostname())
	urlPath := u.Path
	lowerPath := strings.ToLower(urlPath)

	// fallback is used when no dedicated type matches. url.Parse accepts most
	// filesystem paths, so local sources are recognised by checking the disk.
	fallback := func() scraper.SourceType {
		if raw == "" || u.Scheme == "http" || u.Scheme == "https" {
			return scraper.SourceTypeWeb
		}
		if _, statErr := os.Stat(raw); statErr == nil {
			return scraper.SourceTypeLocal
		}
		return scraper.SourceTypeWeb
	}

	switch {
	case hostMatchesDomain(host, "pkg.go.dev"):
		return scraper.SourceTypeGoDocs
	case host == "docs.rs" || host == "doc.rust-lang.org":
		return scraper.SourceTypeRustDocs
	case host == "docs.python.org":
		return scraper.SourceTypePythonDocs
	case host == "docs.oracle.com":
		return scraper.SourceTypeJavaDocs
	case host == "docs.spring.io":
		return scraper.SourceTypeSpringDocs
	case host == "www.typescriptlang.org" || host == "typescriptlang.org":
		return scraper.SourceTypeTSDocs
	case host == "react.dev":
		return scraper.SourceTypeReactDocs
	case host == "vuejs.org":
		return scraper.SourceTypeVueDocs
	case host == "nuxt.com":
		return scraper.SourceTypeNuxtDocs
	case hostMatchesDomain(host, "docker.com"):
		// Accept both ".../mcp/..." and a trailing ".../mcp" (e.g. hub.docker.com/mcp).
		if strings.Contains(urlPath, "/mcp/") || strings.HasSuffix(urlPath, "/mcp") {
			return scraper.SourceTypeMCPDocs
		}
		return scraper.SourceTypeDockerDocs
	case host == "developers.cloudflare.com":
		return scraper.SourceTypeCloudflareDocs
	case host == "docs.astro.build":
		return scraper.SourceTypeAstroDocs
	case host == "github.com":
		return scraper.SourceTypeGitHub
	case strings.HasSuffix(lowerPath, ".json") || strings.HasSuffix(lowerPath, ".yaml") || strings.HasSuffix(lowerPath, ".yml"):
		if strings.Contains(lowerPath, "openapi") || strings.Contains(lowerPath, "swagger") {
			return scraper.SourceTypeOpenAPI
		}
		return fallback()
	default:
		return fallback()
	}
}

// extractName derives a short source name from sourceURL: the last non-blank
// path segment, otherwise the host, otherwise "unknown". When the input cannot
// be parsed as a URL, the base name of the raw string is used if it is
// non-blank.
func extractName(sourceURL string) string {
	u, err := url.Parse(sourceURL)
	if err != nil {
		if strings.TrimSpace(sourceURL) != "" {
			return filepath.Base(sourceURL)
		}
		return "unknown"
	}

	last := strings.Trim(u.Path, "/")
	if i := strings.LastIndex(last, "/"); i >= 0 {
		last = last[i+1:]
	}
	if strings.TrimSpace(last) != "" {
		return last
	}
	if strings.TrimSpace(u.Host) != "" {
		return u.Host
	}
	return "unknown"
}

// applySourceProfile fills in default include/exclude URL patterns for
// well-known documentation sites.
//
// It only acts on web and localsearch sources that have a parseable URL with a
// host, and never overrides patterns the user already supplied. The host is
// matched case-insensitively, without its port, against each profile's domain
// or any of its subdomains.
func applySourceProfile(source *scraper.Source) {
	if source == nil {
		return
	}
	if source.Type != scraper.SourceTypeWeb && source.Type != scraper.SourceTypeLocalSearch {
		return
	}
	if strings.TrimSpace(source.URL) == "" {
		return
	}
	u, err := url.Parse(source.URL)
	if err != nil {
		return
	}
	host := strings.ToLower(u.Hostname())
	if host == "" {
		return
	}

	// Preserve explicit user-provided patterns.
	if len(source.Include) > 0 || len(source.Exclude) > 0 {
		return
	}

	// The table is rebuilt on every call, so the slices handed to the source
	// are never shared between sources.
	profiles := []struct {
		domain  string
		include []string
		exclude []string
	}{
		{
			domain:  "learn.microsoft.com",
			include: []string{`/dotnet/`, `/csharp/`, `/base-types/`},
			exclude: []string{`/previous-versions/`, `/answers/`, `/support/`, `/training/`, `/events/`, `/products/`},
		},
		{
			domain:  "kotlinlang.org",
			include: []string{`/docs/`},
			exclude: []string{`/community/`, `/api/`, `/releases/`},
		},
		{
			domain:  "php.net",
			include: []string{`/manual/en/`},
			exclude: []string{`/manual/(de|fr|es|ja|ru|pt)/`, `/downloads.php`, `/bugs.php`},
		},
		{
			domain:  "ruby-doc.org",
			include: []string{`/core/`},
			exclude: []string{`/stdlib/`, `/gems/`},
		},
		{
			domain:  "hexdocs.pm",
			include: []string{`/elixir/`},
			exclude: []string{`/phoenix/`, `/ecto/`},
		},
		{
			domain:  "nextjs.org",
			include: []string{`/docs/`},
			exclude: []string{`/showcase`, `/blog`, `/learn/`, `/pricing`},
		},
		{
			domain:  "svelte.dev",
			include: []string{`/docs/`},
			exclude: []string{`/playground`, `/tutorial`, `/blog`},
		},
		{
			domain:  "angular.dev",
			include: []string{`/guide/`, `/api/`, `/tutorials/`},
			exclude: []string{`/resources/`, `/playground`},
		},
		{
			domain:  "remix.run",
			include: []string{`/docs/`},
			exclude: []string{`/blog`, `/conf`, `/merch`},
		},
		{
			domain:  "solidjs.com",
			include: []string{`/docs/`},
			exclude: []string{`/community`, `/showcase`, `/blog`},
		},
		{
			domain:  "expressjs.com",
			include: []string{`/en/(guide|api|advanced)/`},
			exclude: []string{`/en/starter/`, `/cn/`, `/fr/`, `/es/`, `/de/`},
		},
	}

	for _, p := range profiles {
		if hostMatchesDomain(host, p.domain) {
			source.Include = p.include
			source.Exclude = p.exclude
			return
		}
	}
}