first commit

2026-07-29 07:33:48 +00:00 · 2026-02-22 10:42:17 +01:00
commit 55885a0e8f
239 changed files with 103690 additions and 0 deletions
@@ -0,0 +1,242 @@
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/url"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/spf13/cobra"
+	"github.com/yourorg/devour/internal/markdown"
+	"github.com/yourorg/devour/internal/scraper"
+)
+
+// scrapeCmd is the cobra command behind "devour scrape". It accepts at most
+// one positional argument (the source to scrape) and delegates execution to
+// runScrape. The Long text lists the source type names that can be passed via
+// --type, together with usage examples.
+var scrapeCmd = &cobra.Command{
+	Use:   "scrape <source>",
+	Short: "Scrape documentation from a source",
+	Long: `Scrape and index documentation from various sources.
+
+Supported source types:
+  - godocs: Go packages (pkg.go.dev)
+  - rustdocs: Rust crates (docs.rs)
+  - pythondocs: Python modules (docs.python.org)
+  - javadocs: Java packages (docs.oracle.com)
+  - springdocs: Spring Boot (docs.spring.io)
+  - tsdocs: TypeScript (typescriptlang.org)
+  - reactdocs: React (react.dev)
+  - vuedocs: Vue.js (vuejs.org)
+  - nuxtdocs: Nuxt (nuxt.com)
+  - mcpdocs: MCP servers (hub.docker.com/mcp)
+  - dockerdocs: Docker (docs.docker.com)
+  - cloudflaredocs: Cloudflare (developers.cloudflare.com)
+  - astrodocs: Astro (docs.astro.build)
+  - url: Generic web pages
+  - github: GitHub repositories
+
+Examples:
+  devour scrape https://pkg.go.dev/net/http --type godocs
+  devour scrape https://react.dev/reference/react --type reactdocs
+  devour scrape https://developers.cloudflare.com/ --type cloudflaredocs
+  devour scrape --sources sources.yaml`,
+	// The positional source is optional because --sources can replace it.
+	Args: cobra.MaximumNArgs(1),
+	RunE: runScrape,
+}
+
+// Flag values for the scrape command; they are bound to their flags in init.
+var (
+	scrapeFormat      string // --format / -f: output format, "json" (default) or "markdown"
+	scrapeSources     string // --sources / -s: path to a YAML file with source definitions
+	scrapeOutput      string // --output / -o: output directory; empty means devour_data/docs
+	scrapeConcurrency int    // --concurrency: number of parallel scraping workers (default 10)
+	scrapeType        string // --type / -t: explicit source type; empty means auto-detect
+)
+
+// init wires the scrape command's flags to the package-level option
+// variables, setting each flag's shorthand, default value and help text.
+func init() {
+	flags := scrapeCmd.Flags()
+
+	flags.StringVarP(&scrapeFormat, "format", "f", "json", "output format (json, markdown)")
+	flags.StringVarP(&scrapeSources, "sources", "s", "", "YAML file with source definitions")
+	flags.StringVarP(&scrapeOutput, "output", "o", "", "output directory (default: devour_data/docs)")
+	flags.IntVar(&scrapeConcurrency, "concurrency", 10, "parallel scraping workers")
+	flags.StringVarP(&scrapeType, "type", "t", "", "source type (auto-detected if not specified)")
+}
+
+// runScrape is the RunE handler of the scrape command.
+//
+// When --sources is set, the work is handed to scrapeFromConfig. Otherwise
+// args[0] is scraped as a single source: its type is taken from --type or
+// detected from the URL, the matching scraper is run under a 60-second
+// deadline, and every returned document is written to the output directory
+// as either JSON or markdown.
+//
+// It returns an error when no source is given, the output format or source
+// type is unsupported, scraping fails, or an output file cannot be written.
+func runScrape(cmd *cobra.Command, args []string) error {
+	if scrapeSources != "" {
+		return scrapeFromConfig(scrapeSources)
+	}
+
+	if len(args) == 0 {
+		return fmt.Errorf("source argument required when not using --sources flag")
+	}
+
+	// Reject unknown formats before doing any network work; previously any
+	// value other than "markdown" silently produced JSON.
+	switch scrapeFormat {
+	case "json", "markdown":
+	default:
+		return fmt.Errorf("unsupported output format %q (expected json or markdown)", scrapeFormat)
+	}
+
+	sourceURL := args[0]
+
+	config := &scraper.Config{
+		UserAgent:   "Devour/1.0 (Documentation Scraper)",
+		Timeout:     30 * time.Second,
+		RetryCount:  3,
+		RetryDelay:  1 * time.Second,
+		Concurrency: scrapeConcurrency,
+	}
+
+	// An explicit --type wins; otherwise infer the type from the URL.
+	sourceType := scraper.SourceType(scrapeType)
+	if sourceType == "" {
+		sourceType = detectSourceType(sourceURL)
+	}
+
+	fmt.Printf("Scraping: %s\n", sourceURL)
+	fmt.Printf("  Type: %s\n", sourceType)
+	fmt.Printf("  Concurrency: %d\n", scrapeConcurrency)
+	fmt.Println()
+
+	s := scraper.NewScraper(sourceType, config)
+	if s == nil {
+		return fmt.Errorf("unsupported source type: %s", sourceType)
+	}
+
+	source := &scraper.Source{
+		Name: extractName(sourceURL),
+		Type: sourceType,
+		URL:  sourceURL,
+	}
+
+	// Overall deadline for the whole scrape, independent of config.Timeout.
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	defer cancel()
+
+	docs, err := s.Scrape(ctx, source)
+	if err != nil {
+		return fmt.Errorf("scraping failed: %w", err)
+	}
+
+	fmt.Printf("✓ Scraped %d documents\n\n", len(docs))
+
+	// Resolve the default into a local so the package-level flag value is
+	// not overwritten as a side effect of running the command.
+	outputDir := scrapeOutput
+	if outputDir == "" {
+		outputDir = "devour_data/docs"
+	}
+
+	if err := os.MkdirAll(outputDir, 0755); err != nil {
+		return fmt.Errorf("failed to create output directory: %w", err)
+	}
+
+	for i, doc := range docs {
+		var filename string
+		var content []byte
+
+		// The index suffix keeps filenames unique even when titles collide
+		// after sanitizing.
+		if scrapeFormat == "markdown" {
+			filename = fmt.Sprintf("%s_%d.md", sanitizeFilename(doc.Title), i)
+
+			// Copy the scraped document into the markdown package's own
+			// document type before formatting it.
+			markdownDoc := &markdown.Document{
+				ID:        doc.ID,
+				Source:    doc.Source,
+				Type:      string(doc.Type),
+				Title:     doc.Title,
+				Content:   doc.Content,
+				URL:       doc.URL,
+				Metadata:  doc.Metadata,
+				Hash:      doc.Hash,
+				Timestamp: doc.Timestamp,
+			}
+
+			formatter := markdown.NewFormatter()
+			content = []byte(formatter.FormatWithTOC(markdownDoc))
+		} else {
+			filename = fmt.Sprintf("%s_%d.json", sanitizeFilename(doc.Title), i)
+			content, err = json.MarshalIndent(doc, "", "  ")
+			if err != nil {
+				return fmt.Errorf("failed to marshal document: %w", err)
+			}
+		}
+
+		filePath := filepath.Join(outputDir, filename)
+		if err := os.WriteFile(filePath, content, 0644); err != nil {
+			return fmt.Errorf("failed to write document: %w", err)
+		}
+
+		fmt.Printf("  📄 %s (%s)\n", filename, doc.Type)
+	}
+
+	fmt.Printf("\n✓ Scraping complete!\n")
+	fmt.Printf("   Output: %s\n", outputDir)
+	fmt.Println("   Run 'devour status' to see indexed documents")
+
+	return nil
+}
+
+// scrapeFromConfig is the entry point for scraping the sources listed in the
+// file at configPath. It is currently a stub: configPath is ignored and a
+// "not yet implemented" error is always returned.
+func scrapeFromConfig(configPath string) error {
+	const notImplemented = "scraping from config file not yet implemented"
+	return fmt.Errorf(notImplemented)
+}
+
+// detectSourceType infers which scraper to use from sourceURL.
+//
+// The decision is based on the URL's hostname (port stripped, compared
+// case-insensitively). pkg.go.dev and docker.com also match their
+// subdomains; all other hosts must match exactly. On a docker.com host, a
+// path containing an "mcp" segment selects the MCP scraper instead of the
+// Docker docs scraper. URLs that fail to parse, and hosts that are not
+// recognised, fall back to scraper.SourceTypeWeb.
+func detectSourceType(sourceURL string) scraper.SourceType {
+	u, err := url.Parse(sourceURL)
+	if err != nil {
+		return scraper.SourceTypeWeb
+	}
+
+	// Hostname() drops any ":port" suffix that u.Host would keep, and host
+	// names are case-insensitive, so normalise before comparing.
+	host := strings.ToLower(u.Hostname())
+
+	// inDomain reports whether host is domain itself or one of its
+	// subdomains. Anchoring on "." avoids matching look-alike hosts such as
+	// "evilpkg.go.dev" or "docker.com.example.org".
+	inDomain := func(domain string) bool {
+		return host == domain || strings.HasSuffix(host, "."+domain)
+	}
+
+	switch {
+	case inDomain("pkg.go.dev"):
+		return scraper.SourceTypeGoDocs
+	case host == "docs.rs" || host == "doc.rust-lang.org":
+		return scraper.SourceTypeRustDocs
+	case host == "docs.python.org":
+		return scraper.SourceTypePythonDocs
+	case host == "docs.oracle.com":
+		return scraper.SourceTypeJavaDocs
+	case host == "docs.spring.io":
+		return scraper.SourceTypeSpringDocs
+	case host == "www.typescriptlang.org" || host == "typescriptlang.org":
+		return scraper.SourceTypeTSDocs
+	case host == "react.dev":
+		return scraper.SourceTypeReactDocs
+	case host == "vuejs.org":
+		return scraper.SourceTypeVueDocs
+	case host == "nuxt.com":
+		return scraper.SourceTypeNuxtDocs
+	case inDomain("docker.com"):
+		// Match "mcp" as a whole path segment so that both
+		// hub.docker.com/mcp and hub.docker.com/mcp/... are recognised.
+		for _, segment := range strings.Split(u.Path, "/") {
+			if segment == "mcp" {
+				return scraper.SourceTypeMCPDocs
+			}
+		}
+		return scraper.SourceTypeDockerDocs
+	case host == "developers.cloudflare.com":
+		return scraper.SourceTypeCloudflareDocs
+	case host == "docs.astro.build":
+		return scraper.SourceTypeAstroDocs
+	case host == "github.com":
+		return scraper.SourceTypeGitHub
+	default:
+		return scraper.SourceTypeWeb
+	}
+}
+
+// extractName derives a short source name from sourceURL.
+//
+// It returns the last non-empty path segment when the URL has one, otherwise
+// the URL's host, and "unknown" when the URL cannot be parsed or has neither
+// a path nor a host.
+func extractName(sourceURL string) string {
+	u, err := url.Parse(sourceURL)
+	if err != nil {
+		return "unknown"
+	}
+
+	// After trimming slashes a non-empty path cannot end in "/", so the text
+	// after the final "/" is always a non-empty segment. LastIndex returns -1
+	// when there is no slash, which makes the slice start at 0.
+	if trimmed := strings.Trim(u.Path, "/"); trimmed != "" {
+		return trimmed[strings.LastIndex(trimmed, "/")+1:]
+	}
+
+	// Root URLs such as https://developers.cloudflare.com/ have no path
+	// segment; use the host so the name is never empty.
+	if u.Host != "" {
+		return u.Host
+	}
+
+	return "unknown"
+}
+
+// sanitizeFilename turns an arbitrary title into a string that is safe to
+// use as a file name component.
+//
+// The name is lower-cased; spaces, path separators, dots, colons, other
+// characters that are reserved on common filesystems, and control characters
+// are replaced with "_"; the result is then limited to 50 bytes without
+// splitting a multi-byte UTF-8 character. The result may be empty if name is
+// empty.
+func sanitizeFilename(name string) string {
+	const maxLen = 50
+
+	name = strings.Map(func(r rune) rune {
+		switch r {
+		case ' ', '/', ':', '.', '\\', '*', '?', '"', '<', '>', '|':
+			return '_'
+		}
+		if r < 0x20 || r == 0x7f {
+			return '_'
+		}
+		return r
+	}, strings.ToLower(name))
+
+	if len(name) > maxLen {
+		// Ranging over a string yields the byte offset of each rune start,
+		// so the last offset <= maxLen is a cut point that never lands in
+		// the middle of a multi-byte character.
+		cut := 0
+		for i := range name {
+			if i > maxLen {
+				break
+			}
+			cut = i
+		}
+		name = name[:cut]
+	}
+
+	return name
+}