package cmd

import (
	"context"
	"encoding/json"
	"fmt"
	"net/url"
	"os"
	"path/filepath"
	"strings"
	"time"
	"unicode/utf8"

	"github.com/spf13/cobra"
	"github.com/yourorg/devour/internal/markdown"
	"github.com/yourorg/devour/internal/scraper"
)

// scrapeCmd is the "scrape" subcommand. It accepts at most one positional
// source argument; the argument may be omitted when --sources is given.
var scrapeCmd = &cobra.Command{
	Use:   "scrape [source]",
	Short: "Scrape documentation from a source",
	Long: `Scrape and index documentation from various sources.

Supported source types:
  - godocs: Go packages (pkg.go.dev)
  - rustdocs: Rust crates (docs.rs)
  - pythondocs: Python modules (docs.python.org)
  - javadocs: Java packages (docs.oracle.com)
  - springdocs: Spring Boot (docs.spring.io)
  - tsdocs: TypeScript (typescriptlang.org)
  - reactdocs: React (react.dev)
  - vuedocs: Vue.js (vuejs.org)
  - nuxtdocs: Nuxt (nuxt.com)
  - mcpdocs: MCP servers (hub.docker.com/mcp)
  - dockerdocs: Docker (docs.docker.com)
  - cloudflaredocs: Cloudflare (developers.cloudflare.com)
  - astrodocs: Astro (docs.astro.build)
  - url: Generic web pages
  - github: GitHub repositories

Examples:
  devour scrape https://pkg.go.dev/net/http --type godocs
  devour scrape https://react.dev/reference/react --type reactdocs
  devour scrape https://developers.cloudflare.com/ --type cloudflaredocs
  devour scrape --sources sources.yaml`,
	Args: cobra.MaximumNArgs(1),
	RunE: runScrape,
}

// Flag values for the scrape command, bound in init.
var (
	scrapeFormat      string
	scrapeSources     string
	scrapeOutput      string
	scrapeConcurrency int
	scrapeType        string
)

// scrapeFilenameReplacer maps every character that sanitizeFilename treats as
// unsafe to an underscore in a single pass over the input.
var scrapeFilenameReplacer = strings.NewReplacer(
	" ", "_",
	"/", "_",
	"\\", "_",
	":", "_",
	".", "_",
)

// init registers the scrape command's flags.
func init() {
	flags := scrapeCmd.Flags()
	flags.StringVarP(&scrapeFormat, "format", "f", "json", "output format (json, markdown)")
	flags.StringVarP(&scrapeSources, "sources", "s", "", "YAML file with source definitions")
	flags.StringVarP(&scrapeOutput, "output", "o", "", "output directory (default: devour_data/docs)")
	flags.IntVar(&scrapeConcurrency, "concurrency", 10, "parallel scraping workers")
	flags.StringVarP(&scrapeType, "type", "t", "", "source type (auto-detected if not specified)")
}

// runScrape is the RunE handler for scrapeCmd.
//
// When --sources is set it delegates to scrapeFromConfig. Otherwise it scrapes
// the single source URL in args[0], then writes one file per returned document
// into the output directory, as JSON or markdown depending on --format.
//
// It returns an error when no source is given, when --format is not one of the
// supported values, when no scraper exists for the source type, or when
// scraping, encoding or writing fails.
func runScrape(cmd *cobra.Command, args []string) error {
	if scrapeSources != "" {
		return scrapeFromConfig(scrapeSources)
	}
	if len(args) == 0 {
		return fmt.Errorf("source argument required when not using --sources flag")
	}

	// Reject unknown formats before doing any network work; previously any
	// value other than "markdown" silently produced JSON.
	switch scrapeFormat {
	case "json", "markdown":
	default:
		return fmt.Errorf("unsupported output format %q (expected json or markdown)", scrapeFormat)
	}

	sourceURL := args[0]

	config := &scraper.Config{
		UserAgent:   "Devour/1.0 (Documentation Scraper)",
		Timeout:     30 * time.Second,
		RetryCount:  3,
		RetryDelay:  1 * time.Second,
		Concurrency: scrapeConcurrency,
	}

	// An explicit --type wins; otherwise infer the type from the URL.
	sourceType := scraper.SourceType(scrapeType)
	if sourceType == "" {
		sourceType = detectSourceType(sourceURL)
	}

	fmt.Printf("Scraping: %s\n", sourceURL)
	fmt.Printf(" Type: %s\n", sourceType)
	fmt.Printf(" Concurrency: %d\n", scrapeConcurrency)
	fmt.Println()

	s := scraper.NewScraper(sourceType, config)
	if s == nil {
		return fmt.Errorf("unsupported source type: %s", sourceType)
	}

	source := &scraper.Source{
		Name: extractName(sourceURL),
		Type: sourceType,
		URL:  sourceURL,
	}

	// Bound the whole scrape, independently of the per-request timeout above.
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	docs, err := s.Scrape(ctx, source)
	if err != nil {
		return fmt.Errorf("scraping failed: %w", err)
	}
	fmt.Printf("✓ Scraped %d documents\n\n", len(docs))

	// Use a local so the package-level flag variable is not mutated.
	outputDir := scrapeOutput
	if outputDir == "" {
		outputDir = "devour_data/docs"
	}
	if err := os.MkdirAll(outputDir, 0755); err != nil {
		return fmt.Errorf("failed to create output directory: %w", err)
	}

	for i, doc := range docs {
		var (
			ext     string
			content []byte
		)

		if scrapeFormat == "markdown" {
			ext = "md"
			markdownDoc := &markdown.Document{
				ID:        doc.ID,
				Source:    doc.Source,
				Type:      string(doc.Type),
				Title:     doc.Title,
				Content:   doc.Content,
				URL:       doc.URL,
				Metadata:  doc.Metadata,
				Hash:      doc.Hash,
				Timestamp: doc.Timestamp,
			}
			formatter := markdown.NewFormatter()
			content = []byte(formatter.FormatWithTOC(markdownDoc))
		} else {
			ext = "json"
			encoded, err := json.MarshalIndent(doc, "", " ")
			if err != nil {
				return fmt.Errorf("failed to marshal document: %w", err)
			}
			content = encoded
		}

		// The index suffix keeps filenames unique when titles collide.
		filename := fmt.Sprintf("%s_%d.%s", sanitizeFilename(doc.Title), i, ext)
		filePath := filepath.Join(outputDir, filename)
		if err := os.WriteFile(filePath, content, 0644); err != nil {
			return fmt.Errorf("failed to write document: %w", err)
		}
		fmt.Printf(" 📄 %s (%s)\n", filename, doc.Type)
	}

	fmt.Printf("\n✓ Scraping complete!\n")
	fmt.Printf(" Output: %s\n", outputDir)
	fmt.Println(" Run 'devour status' to see indexed documents")
	return nil
}

// scrapeFromConfig is a placeholder for scraping the sources listed in the
// file at configPath; it currently always returns a "not yet implemented"
// error.
func scrapeFromConfig(configPath string) error {
	return fmt.Errorf("scraping from config file not yet implemented")
}

// detectSourceType infers the scraper source type from the host (and, for
// docker.com, the path) of sourceURL. Hosts are compared case-insensitively
// and without any port. URLs that fail to parse, and hosts that are not
// recognised, map to scraper.SourceTypeWeb.
func detectSourceType(sourceURL string) scraper.SourceType {
	u, err := url.Parse(sourceURL)
	if err != nil {
		return scraper.SourceTypeWeb
	}

	host := strings.ToLower(u.Hostname())

	// inDomain reports whether host is domain itself or one of its
	// subdomains. Matching on the dot boundary avoids treating look-alike
	// hosts (e.g. "evilpkg.go.dev") as the real domain.
	inDomain := func(domain string) bool {
		return host == domain || strings.HasSuffix(host, "."+domain)
	}

	switch {
	case inDomain("pkg.go.dev"):
		return scraper.SourceTypeGoDocs
	case host == "docs.rs" || host == "doc.rust-lang.org":
		return scraper.SourceTypeRustDocs
	case host == "docs.python.org":
		return scraper.SourceTypePythonDocs
	case host == "docs.oracle.com":
		return scraper.SourceTypeJavaDocs
	case host == "docs.spring.io":
		return scraper.SourceTypeSpringDocs
	case host == "www.typescriptlang.org" || host == "typescriptlang.org":
		return scraper.SourceTypeTSDocs
	case host == "react.dev":
		return scraper.SourceTypeReactDocs
	case host == "vuejs.org":
		return scraper.SourceTypeVueDocs
	case host == "nuxt.com":
		return scraper.SourceTypeNuxtDocs
	case inDomain("docker.com"):
		// Appending "/" lets a path that ends in "/mcp" match as well as one
		// that continues below it.
		if strings.Contains(u.Path+"/", "/mcp/") {
			return scraper.SourceTypeMCPDocs
		}
		return scraper.SourceTypeDockerDocs
	case host == "developers.cloudflare.com":
		return scraper.SourceTypeCloudflareDocs
	case host == "docs.astro.build":
		return scraper.SourceTypeAstroDocs
	case host == "github.com":
		return scraper.SourceTypeGitHub
	default:
		return scraper.SourceTypeWeb
	}
}

// extractName derives a short source name from sourceURL: the last path
// segment when the path is non-empty, otherwise the URL's host. It returns
// "unknown" when the URL cannot be parsed or has neither a path nor a host.
func extractName(sourceURL string) string {
	u, err := url.Parse(sourceURL)
	if err != nil {
		return "unknown"
	}

	// After trimming, a non-empty path never ends in "/", so the text after
	// the last slash is always a non-empty segment.
	if trimmed := strings.Trim(u.Path, "/"); trimmed != "" {
		return trimmed[strings.LastIndex(trimmed, "/")+1:]
	}
	if u.Host != "" {
		return u.Host
	}
	return "unknown"
}

// sanitizeFilename lower-cases name, replaces spaces, slashes, backslashes,
// colons and dots with underscores, and truncates the result to at most 50
// bytes without cutting a UTF-8 encoded character in half.
func sanitizeFilename(name string) string {
	const maxLen = 50

	name = scrapeFilenameReplacer.Replace(strings.ToLower(name))
	if len(name) <= maxLen {
		return name
	}

	// Back up to the start of the rune that straddles the limit.
	cut := maxLen
	for cut > 0 && !utf8.RuneStart(name[cut]) {
		cut--
	}
	return name[:cut]
}