mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
@@ -0,0 +1,10 @@
|
||||
package scraper
|
||||
|
||||
func init() {
|
||||
// Register only core scrapers to reduce coupling
|
||||
// Additional scrapers can be registered in their own packages
|
||||
RegisterScraper(SourceTypeWeb, func(c *Config) Scraper { return NewWebScraper(c) })
|
||||
RegisterScraper(SourceTypeLocal, func(c *Config) Scraper { return NewLocalScraper(c) })
|
||||
RegisterScraper(SourceTypeGitHub, func(c *Config) Scraper { return NewGitHubScraper(c) })
|
||||
RegisterScraper(SourceTypeOpenAPI, func(c *Config) Scraper { return NewOpenAPIScraper(c) })
|
||||
}
|
||||
@@ -0,0 +1,67 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// ScraperConstructor is a factory function that builds a Scraper from a
// Config.
|
||||
type ScraperConstructor func(*Config) Scraper
|
||||
|
||||
// ScraperRegistry maps each SourceType to the ScraperConstructor registered
// for it.
|
||||
type ScraperRegistry struct {
|
||||
// constructors holds the registered constructor for each source type.
constructors map[SourceType]ScraperConstructor
|
||||
}
|
||||
|
||||
// NewScraperRegistry creates a new registry
|
||||
func NewScraperRegistry() *ScraperRegistry {
|
||||
return &ScraperRegistry{
|
||||
constructors: make(map[SourceType]ScraperConstructor),
|
||||
}
|
||||
}
|
||||
|
||||
// Register registers a scraper constructor
|
||||
func (r *ScraperRegistry) Register(sourceType SourceType, constructor ScraperConstructor) {
|
||||
r.constructors[sourceType] = constructor
|
||||
}
|
||||
|
||||
// Create creates a scraper instance
|
||||
func (r *ScraperRegistry) Create(sourceType SourceType, config *Config) Scraper {
|
||||
if constructor, exists := r.constructors[sourceType]; exists {
|
||||
return constructor(config)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// globalRegistry is the package-level registry used by RegisterScraper and
// CreateScraper.
|
||||
var globalRegistry = NewScraperRegistry()
|
||||
|
||||
// RegisterScraper registers constructor for sourceType in the package-level
// globalRegistry.
|
||||
func RegisterScraper(sourceType SourceType, constructor ScraperConstructor) {
|
||||
globalRegistry.Register(sourceType, constructor)
|
||||
}
|
||||
|
||||
// CreateScraper returns the result of globalRegistry.Create for sourceType
// and config; that is nil when no constructor is registered for sourceType.
|
||||
func CreateScraper(sourceType SourceType, config *Config) Scraper {
|
||||
return globalRegistry.Create(sourceType, config)
|
||||
}
|
||||
|
||||
// FallbackScraper is a placeholder scraper: its Scrape and DetectChanges
// methods only return a "not implemented" error naming the source type.
|
||||
type FallbackScraper struct {
|
||||
// config is the Config stored by NewFallbackScraper.
config *Config
|
||||
}
|
||||
|
||||
// NewFallbackScraper creates a fallback scraper
|
||||
func NewFallbackScraper(config *Config) *FallbackScraper {
|
||||
return &FallbackScraper{config: config}
|
||||
}
|
||||
|
||||
// Scrape implements basic scraping functionality
|
||||
func (f *FallbackScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
return nil, fmt.Errorf("fallback scraper not implemented for source type: %s", source.Type)
|
||||
}
|
||||
|
||||
// DetectChanges implements basic change detection
|
||||
func (f *FallbackScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
return false, "", fmt.Errorf("fallback scraper not implemented for source type: %s", source.Type)
|
||||
}
|
||||
+49
-42
@@ -3,6 +3,9 @@ package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -19,7 +22,6 @@ const (
|
||||
SourceTypePythonDocs SourceType = "pythondocs"
|
||||
SourceTypeJavaDocs SourceType = "javadocs"
|
||||
SourceTypeSpringDocs SourceType = "springdocs"
|
||||
SourceTypeSpringAIDocs SourceType = "springaidocs"
|
||||
SourceTypeTSDocs SourceType = "tsdocs"
|
||||
SourceTypeReactDocs SourceType = "reactdocs"
|
||||
SourceTypeVueDocs SourceType = "vuedocs"
|
||||
@@ -77,53 +79,58 @@ type Scraper interface {
|
||||
DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error)
|
||||
}
|
||||
|
||||
// NewScraper creates a new scraper for the given source type.
|
||||
// NewScraper creates a new scraper for the given source type using the registry.
|
||||
func NewScraper(sourceType SourceType, config *Config) Scraper {
|
||||
switch sourceType {
|
||||
case SourceTypeWeb:
|
||||
return NewWebScraper(config)
|
||||
case SourceTypeGitHub:
|
||||
return NewGitHubScraper(config)
|
||||
case SourceTypeOpenAPI:
|
||||
return NewOpenAPIScraper(config)
|
||||
case SourceTypeLocal:
|
||||
return NewLocalScraper(config)
|
||||
case SourceTypeGoDocs:
|
||||
return NewGoDocsScraper(config)
|
||||
case SourceTypeRustDocs:
|
||||
return NewRustDocsScraper(config)
|
||||
case SourceTypePythonDocs:
|
||||
return NewPythonDocsScraper(config)
|
||||
case SourceTypeJavaDocs:
|
||||
return NewJavaDocsScraper(config)
|
||||
case SourceTypeSpringDocs:
|
||||
return NewSpringDocsScraper(config)
|
||||
case SourceTypeTSDocs:
|
||||
return NewTSDocsScraper(config)
|
||||
case SourceTypeReactDocs:
|
||||
return NewReactDocsScraper(config)
|
||||
case SourceTypeVueDocs:
|
||||
return NewVueDocsScraper(config)
|
||||
case SourceTypeNuxtDocs:
|
||||
return NewNuxtDocsScraper(config)
|
||||
case SourceTypeMCPDocs:
|
||||
return NewMCPDocsScraper(config)
|
||||
case SourceTypeDockerDocs:
|
||||
return NewDockerDocsScraper(config)
|
||||
case SourceTypeCloudflareDocs:
|
||||
return NewCloudflareDocsScraper(config)
|
||||
case SourceTypeAstroDocs:
|
||||
return NewAstroDocsScraper(config)
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
return CreateScraper(sourceType, config)
|
||||
}
|
||||
|
||||
// DetectSourceType determines the source type from a URL or path.
|
||||
func DetectSourceType(input string) SourceType {
|
||||
// TODO: Implement detection logic
|
||||
if len(input) > 4 && input[:4] == "http" {
|
||||
// Check for GitHub repositories
|
||||
if strings.Contains(input, "github.com") {
|
||||
return SourceTypeGitHub
|
||||
}
|
||||
|
||||
// Check for known documentation hosts
|
||||
docsHosts := map[string]SourceType{
|
||||
"pkg.go.dev": SourceTypeGoDocs,
|
||||
"docs.rs": SourceTypeRustDocs,
|
||||
"docs.python.org": SourceTypePythonDocs,
|
||||
"docs.oracle.com": SourceTypeJavaDocs,
|
||||
"docs.spring.io": SourceTypeSpringDocs,
|
||||
"typescriptlang.org": SourceTypeTSDocs,
|
||||
"react.dev": SourceTypeReactDocs,
|
||||
"vuejs.org": SourceTypeVueDocs,
|
||||
"nuxt.com": SourceTypeNuxtDocs,
|
||||
"docs.docker.com": SourceTypeDockerDocs,
|
||||
"developers.cloudflare.com": SourceTypeCloudflareDocs,
|
||||
"docs.astro.build": SourceTypeAstroDocs,
|
||||
}
|
||||
|
||||
for host, sourceType := range docsHosts {
|
||||
if strings.Contains(input, host) {
|
||||
return sourceType
|
||||
}
|
||||
}
|
||||
|
||||
// Check for OpenAPI specs
|
||||
if strings.HasSuffix(input, ".json") || strings.HasSuffix(input, ".yaml") || strings.HasSuffix(input, ".yml") {
|
||||
if strings.Contains(strings.ToLower(input), "openapi") || strings.Contains(strings.ToLower(input), "swagger") {
|
||||
return SourceTypeOpenAPI
|
||||
}
|
||||
}
|
||||
|
||||
// Check for web URLs
|
||||
if strings.HasPrefix(input, "http://") || strings.HasPrefix(input, "https://") {
|
||||
return SourceTypeWeb
|
||||
}
|
||||
|
||||
// Default to local
|
||||
return SourceTypeLocal
|
||||
}
|
||||
|
||||
// generateDocID derives a deterministic document ID from urlStr: the first
// 12 bytes of its SHA-256 digest, hex-encoded (24 characters).
func generateDocID(urlStr string) string {
	hasher := sha256.New()
	// hash.Hash.Write is documented to never return an error.
	hasher.Write([]byte(urlStr))
	digest := hasher.Sum(nil)
	return hex.EncodeToString(digest[:12])
}
|
||||
|
||||
@@ -44,11 +44,13 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
|
||||
// Set rate limiting
|
||||
if s.config.RateLimit > 0 {
|
||||
c.Limit(&colly.LimitRule{
|
||||
if err := c.Limit(&colly.LimitRule{
|
||||
DomainGlob: "*",
|
||||
Parallelism: s.config.Concurrency,
|
||||
Delay: s.config.RateLimit,
|
||||
})
|
||||
}); err != nil {
|
||||
return nil, fmt.Errorf("failed to set rate limiting: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Set timeout
|
||||
@@ -136,7 +138,9 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
return
|
||||
}
|
||||
|
||||
c.Visit(absoluteURL)
|
||||
if err := c.Visit(absoluteURL); err != nil {
|
||||
fmt.Printf("Error visiting %s: %v\n", absoluteURL, err)
|
||||
}
|
||||
})
|
||||
|
||||
// Start scraping
|
||||
@@ -288,9 +292,3 @@ func cleanText(text string) string {
|
||||
|
||||
return text
|
||||
}
|
||||
|
||||
// generateDocID derives a deterministic document ID from urlStr by
// hex-encoding the leading bytes of its SHA-256 digest.
func generateDocID(urlStr string) string {
	// Number of digest bytes kept; the encoded ID is twice this many characters.
	const idBytes = 12

	digest := sha256.Sum256([]byte(urlStr))
	prefix := digest[:idBytes]
	return hex.EncodeToString(prefix)
}
|
||||
|
||||
Reference in New Issue
Block a user