mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
first commit
This commit is contained in:
+242
@@ -0,0 +1,242 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
"github.com/yourorg/devour/internal/markdown"
|
||||
"github.com/yourorg/devour/internal/scraper"
|
||||
)
|
||||
|
||||
var scrapeCmd = &cobra.Command{
|
||||
Use: "scrape <source>",
|
||||
Short: "Scrape documentation from a source",
|
||||
Long: `Scrape and index documentation from various sources.
|
||||
|
||||
Supported source types:
|
||||
- godocs: Go packages (pkg.go.dev)
|
||||
- rustdocs: Rust crates (docs.rs)
|
||||
- pythondocs: Python modules (docs.python.org)
|
||||
- javadocs: Java packages (docs.oracle.com)
|
||||
- springdocs: Spring Boot (docs.spring.io)
|
||||
- tsdocs: TypeScript (typescriptlang.org)
|
||||
- reactdocs: React (react.dev)
|
||||
- vuedocs: Vue.js (vuejs.org)
|
||||
- nuxtdocs: Nuxt (nuxt.com)
|
||||
- mcpdocs: MCP servers (hub.docker.com/mcp)
|
||||
- dockerdocs: Docker (docs.docker.com)
|
||||
- cloudflaredocs: Cloudflare (developers.cloudflare.com)
|
||||
- astrodocs: Astro (docs.astro.build)
|
||||
- url: Generic web pages
|
||||
- github: GitHub repositories
|
||||
|
||||
Examples:
|
||||
devour scrape https://pkg.go.dev/net/http --type godocs
|
||||
devour scrape https://react.dev/reference/react --type reactdocs
|
||||
devour scrape https://developers.cloudflare.com/ --type cloudflaredocs
|
||||
devour scrape --sources sources.yaml`,
|
||||
Args: cobra.MaximumNArgs(1),
|
||||
RunE: runScrape,
|
||||
}
|
||||
|
||||
// Values of the scrape command's flags; init binds each variable to its flag.
var (
	// scrapeType is --type/-t; when empty, runScrape auto-detects the type
	// from the source URL.
	scrapeType string

	// scrapeSources is --sources/-s, the path of a YAML file with source
	// definitions.
	scrapeSources string

	// scrapeFormat is --format/-f, the on-disk format of scraped documents.
	scrapeFormat string

	// scrapeOutput is --output/-o; when empty, runScrape falls back to
	// "devour_data/docs".
	scrapeOutput string

	// scrapeConcurrency is --concurrency, the number of parallel scraping
	// workers.
	scrapeConcurrency int
)
|
||||
|
||||
func init() {
|
||||
scrapeCmd.Flags().StringVarP(&scrapeFormat, "format", "f", "json", "output format (json, markdown)")
|
||||
scrapeCmd.Flags().StringVarP(&scrapeSources, "sources", "s", "", "YAML file with source definitions")
|
||||
scrapeCmd.Flags().StringVarP(&scrapeOutput, "output", "o", "", "output directory (default: devour_data/docs)")
|
||||
scrapeCmd.Flags().IntVar(&scrapeConcurrency, "concurrency", 10, "parallel scraping workers")
|
||||
scrapeCmd.Flags().StringVarP(&scrapeType, "type", "t", "", "source type (auto-detected if not specified)")
|
||||
}
|
||||
|
||||
func runScrape(cmd *cobra.Command, args []string) error {
|
||||
if scrapeSources != "" {
|
||||
return scrapeFromConfig(scrapeSources)
|
||||
}
|
||||
|
||||
if len(args) == 0 {
|
||||
return fmt.Errorf("source argument required when not using --sources flag")
|
||||
}
|
||||
|
||||
sourceURL := args[0]
|
||||
|
||||
config := &scraper.Config{
|
||||
UserAgent: "Devour/1.0 (Documentation Scraper)",
|
||||
Timeout: 30 * time.Second,
|
||||
RetryCount: 3,
|
||||
RetryDelay: 1 * time.Second,
|
||||
Concurrency: scrapeConcurrency,
|
||||
}
|
||||
|
||||
sourceType := scraper.SourceType(scrapeType)
|
||||
if sourceType == "" {
|
||||
sourceType = detectSourceType(sourceURL)
|
||||
}
|
||||
|
||||
fmt.Printf("Scraping: %s\n", sourceURL)
|
||||
fmt.Printf(" Type: %s\n", sourceType)
|
||||
fmt.Printf(" Concurrency: %d\n", scrapeConcurrency)
|
||||
fmt.Println()
|
||||
|
||||
s := scraper.NewScraper(sourceType, config)
|
||||
if s == nil {
|
||||
return fmt.Errorf("unsupported source type: %s", sourceType)
|
||||
}
|
||||
|
||||
source := &scraper.Source{
|
||||
Name: extractName(sourceURL),
|
||||
Type: sourceType,
|
||||
URL: sourceURL,
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
docs, err := s.Scrape(ctx, source)
|
||||
if err != nil {
|
||||
return fmt.Errorf("scraping failed: %w", err)
|
||||
}
|
||||
|
||||
fmt.Printf("✓ Scraped %d documents\n\n", len(docs))
|
||||
|
||||
if scrapeOutput == "" {
|
||||
scrapeOutput = "devour_data/docs"
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(scrapeOutput, 0755); err != nil {
|
||||
return fmt.Errorf("failed to create output directory: %w", err)
|
||||
}
|
||||
|
||||
for i, doc := range docs {
|
||||
var filename string
|
||||
var content []byte
|
||||
|
||||
if scrapeFormat == "markdown" {
|
||||
filename = fmt.Sprintf("%s_%d.md", sanitizeFilename(doc.Title), i)
|
||||
|
||||
// Create enhanced markdown document
|
||||
markdownDoc := &markdown.Document{
|
||||
ID: doc.ID,
|
||||
Source: doc.Source,
|
||||
Type: string(doc.Type),
|
||||
Title: doc.Title,
|
||||
Content: doc.Content,
|
||||
URL: doc.URL,
|
||||
Metadata: doc.Metadata,
|
||||
Hash: doc.Hash,
|
||||
Timestamp: doc.Timestamp,
|
||||
}
|
||||
|
||||
formatter := markdown.NewFormatter()
|
||||
content = []byte(formatter.FormatWithTOC(markdownDoc))
|
||||
} else {
|
||||
filename = fmt.Sprintf("%s_%d.json", sanitizeFilename(doc.Title), i)
|
||||
content, err = json.MarshalIndent(doc, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal document: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
filePath := filepath.Join(scrapeOutput, filename)
|
||||
if err := os.WriteFile(filePath, content, 0644); err != nil {
|
||||
return fmt.Errorf("failed to write document: %w", err)
|
||||
}
|
||||
|
||||
fmt.Printf(" 📄 %s (%s)\n", filename, doc.Type)
|
||||
}
|
||||
|
||||
fmt.Printf("\n✓ Scraping complete!\n")
|
||||
fmt.Printf(" Output: %s\n", scrapeOutput)
|
||||
fmt.Println(" Run 'devour status' to see indexed documents")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// scrapeFromConfig is the entry point for scraping the sources described in
// the file at configPath. It is currently a placeholder: configPath is not
// read and a "not yet implemented" error is always returned.
func scrapeFromConfig(configPath string) error {
	err := fmt.Errorf("scraping from config file not yet implemented")
	return err
}
|
||||
|
||||
func detectSourceType(sourceURL string) scraper.SourceType {
|
||||
u, err := url.Parse(sourceURL)
|
||||
if err != nil {
|
||||
return scraper.SourceTypeWeb
|
||||
}
|
||||
|
||||
host := u.Host
|
||||
path := u.Path
|
||||
|
||||
switch {
|
||||
case host == "pkg.go.dev" || strings.HasSuffix(host, "pkg.go.dev"):
|
||||
return scraper.SourceTypeGoDocs
|
||||
case host == "docs.rs" || host == "doc.rust-lang.org":
|
||||
return scraper.SourceTypeRustDocs
|
||||
case host == "docs.python.org":
|
||||
return scraper.SourceTypePythonDocs
|
||||
case host == "docs.oracle.com":
|
||||
return scraper.SourceTypeJavaDocs
|
||||
case host == "docs.spring.io":
|
||||
return scraper.SourceTypeSpringDocs
|
||||
case host == "www.typescriptlang.org" || host == "typescriptlang.org":
|
||||
return scraper.SourceTypeTSDocs
|
||||
case host == "react.dev":
|
||||
return scraper.SourceTypeReactDocs
|
||||
case host == "vuejs.org":
|
||||
return scraper.SourceTypeVueDocs
|
||||
case host == "nuxt.com":
|
||||
return scraper.SourceTypeNuxtDocs
|
||||
case strings.Contains(host, "docker.com") || host == "docs.docker.com":
|
||||
if strings.Contains(path, "/mcp/") {
|
||||
return scraper.SourceTypeMCPDocs
|
||||
}
|
||||
return scraper.SourceTypeDockerDocs
|
||||
case host == "developers.cloudflare.com":
|
||||
return scraper.SourceTypeCloudflareDocs
|
||||
case host == "docs.astro.build":
|
||||
return scraper.SourceTypeAstroDocs
|
||||
case host == "github.com":
|
||||
return scraper.SourceTypeGitHub
|
||||
default:
|
||||
return scraper.SourceTypeWeb
|
||||
}
|
||||
}
|
||||
|
||||
// extractName derives a short source name from sourceURL.
//
// It returns the last non-empty path segment when the URL has one, otherwise
// the URL's host. "unknown" is returned when the URL cannot be parsed or has
// neither a path nor a host.
func extractName(sourceURL string) string {
	u, err := url.Parse(sourceURL)
	if err != nil {
		return "unknown"
	}

	// strings.Split never yields an empty slice, so an empty path has to be
	// detected explicitly; otherwise "https://react.dev/" would be named "".
	trimmed := strings.Trim(u.Path, "/")
	if trimmed == "" {
		if u.Host != "" {
			return u.Host
		}
		return "unknown"
	}

	if i := strings.LastIndex(trimmed, "/"); i >= 0 {
		return trimmed[i+1:]
	}
	return trimmed
}
|
||||
|
||||
// unsafeFilenameChars maps characters that are path separators, reserved in
// file names on common platforms, or whitespace/line breaks to "_".
var unsafeFilenameChars = strings.NewReplacer(
	" ", "_",
	"/", "_",
	"\\", "_",
	":", "_",
	".", "_",
	"*", "_",
	"?", "_",
	"\"", "_",
	"<", "_",
	">", "_",
	"|", "_",
	"\t", "_",
	"\n", "_",
	"\r", "_",
)

// sanitizeFilename turns an arbitrary title into a file-name stem.
//
// The name is lower-cased, characters listed in unsafeFilenameChars are
// replaced by "_", and the result is limited to 50 bytes. Truncation never
// splits a multi-byte UTF-8 character, so the result may be slightly shorter
// than 50 bytes.
func sanitizeFilename(name string) string {
	const maxLen = 50

	name = unsafeFilenameChars.Replace(strings.ToLower(name))
	if len(name) <= maxLen {
		return name
	}

	// Ranging over a string yields the byte offset at which each character
	// starts; keep the last such offset that does not exceed maxLen.
	cut := 0
	for i := range name {
		if i > maxLen {
			break
		}
		cut = i
	}
	return name[:cut]
}
|
||||
Reference in New Issue
Block a user