mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
122 lines
3.0 KiB
Go
122 lines
3.0 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/yourorg/devour/internal/scraper"
|
|
_ "github.com/yourorg/devour/internal/scraper/external"
|
|
)
|
|
|
|
func main() {
|
|
fmt.Println("=== Devour Real HTTP Scraping Test ===")
|
|
fmt.Println()
|
|
|
|
config := &scraper.Config{
|
|
UserAgent: "Devour/1.0 (Documentation Scraper)",
|
|
Timeout: 30 * time.Second,
|
|
RetryCount: 3,
|
|
RetryDelay: 1 * time.Second,
|
|
Concurrency: 10,
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
|
defer cancel()
|
|
|
|
sources := []struct {
|
|
name string
|
|
st scraper.SourceType
|
|
url string
|
|
}{
|
|
{"Go stdlib net/http", scraper.SourceTypeGoDocs, "https://pkg.go.dev/net/http"},
|
|
{"Spring AI MCP", scraper.SourceTypeSpringDocs, "https://docs.spring.io/spring-ai/reference/api/mcp/mcp-overview.html"},
|
|
{"React Hooks", scraper.SourceTypeReactDocs, "https://react.dev/reference/react"},
|
|
{"Vue Composition API", scraper.SourceTypeVueDocs, "https://vuejs.org/api/"},
|
|
{"Cloudflare Docs", scraper.SourceTypeCloudflareDocs, "https://developers.cloudflare.com/"},
|
|
}
|
|
|
|
for _, src := range sources {
|
|
fmt.Printf("=== Testing: %s ===\n", src.name)
|
|
|
|
s := scraper.NewScraper(src.st, config)
|
|
if s == nil {
|
|
fmt.Printf(" ✗ Scraper not available for type: %s\n\n", src.st)
|
|
continue
|
|
}
|
|
|
|
source := &scraper.Source{
|
|
Name: src.name,
|
|
Type: src.st,
|
|
URL: src.url,
|
|
}
|
|
|
|
fmt.Printf(" Fetching: %s\n", src.url)
|
|
|
|
docs, err := s.Scrape(ctx, source)
|
|
if err != nil {
|
|
fmt.Printf(" ✗ Error: %v\n\n", err)
|
|
continue
|
|
}
|
|
|
|
fmt.Printf(" ✓ Scraped %d documents\n", len(docs))
|
|
|
|
if len(docs) > 0 {
|
|
first := docs[0]
|
|
fmt.Printf(" First document:\n")
|
|
fmt.Printf(" Title: %s\n", first.Title)
|
|
fmt.Printf(" Type: %s\n", first.Type)
|
|
if len(first.Content) > 100 {
|
|
fmt.Printf(" Content preview: %s...\n", first.Content[:100])
|
|
} else {
|
|
fmt.Printf(" Content: %s\n", first.Content)
|
|
}
|
|
}
|
|
|
|
changed, hash, err := s.DetectChanges(ctx, source, "")
|
|
if err != nil {
|
|
fmt.Printf(" ✗ Change detection error: %v\n", err)
|
|
} else {
|
|
fmt.Printf(" ✓ Change detection: changed=%v, hash=%s\n", changed, hash[:16]+"...")
|
|
}
|
|
|
|
fmt.Println()
|
|
}
|
|
|
|
fmt.Println("=== All Source Types ===")
|
|
fmt.Println()
|
|
fmt.Println("Available scrapers:")
|
|
allTypes := []scraper.SourceType{
|
|
scraper.SourceTypeWeb,
|
|
scraper.SourceTypeGitHub,
|
|
scraper.SourceTypeOpenAPI,
|
|
scraper.SourceTypeLocal,
|
|
scraper.SourceTypeLocalSearch,
|
|
scraper.SourceTypeGoDocs,
|
|
scraper.SourceTypeRustDocs,
|
|
scraper.SourceTypePythonDocs,
|
|
scraper.SourceTypeJavaDocs,
|
|
scraper.SourceTypeSpringDocs,
|
|
scraper.SourceTypeTSDocs,
|
|
scraper.SourceTypeReactDocs,
|
|
scraper.SourceTypeVueDocs,
|
|
scraper.SourceTypeNuxtDocs,
|
|
scraper.SourceTypeMCPDocs,
|
|
scraper.SourceTypeDockerDocs,
|
|
scraper.SourceTypeCloudflareDocs,
|
|
scraper.SourceTypeAstroDocs,
|
|
}
|
|
|
|
for _, st := range allTypes {
|
|
s := scraper.NewScraper(st, config)
|
|
if s != nil {
|
|
fmt.Printf(" ✓ %s\n", st)
|
|
} else {
|
|
fmt.Printf(" %s (not implemented)\n", st)
|
|
}
|
|
}
|
|
|
|
fmt.Println()
|
|
fmt.Println("=== Test Complete ===")
|
|
}
|