This commit is contained in:
Tomas Dvorak
2026-02-22 15:41:27 +01:00
parent 0b88627e54
commit 409acd2e08
84 changed files with 65382 additions and 27475 deletions
+10
View File
@@ -0,0 +1,10 @@
package scraper
func init() {
// Register only core scrapers to reduce coupling
// Additional scrapers can be registered in their own packages
RegisterScraper(SourceTypeWeb, func(c *Config) Scraper { return NewWebScraper(c) })
RegisterScraper(SourceTypeLocal, func(c *Config) Scraper { return NewLocalScraper(c) })
RegisterScraper(SourceTypeGitHub, func(c *Config) Scraper { return NewGitHubScraper(c) })
RegisterScraper(SourceTypeOpenAPI, func(c *Config) Scraper { return NewOpenAPIScraper(c) })
}
+67
View File
@@ -0,0 +1,67 @@
package scraper
import (
"context"
"fmt"
)
// ScraperConstructor is the factory signature stored in a ScraperRegistry:
// it receives a *Config and returns a Scraper.
type ScraperConstructor func(*Config) Scraper
// ScraperRegistry maps each SourceType to the ScraperConstructor registered
// for it, so callers can build scrapers by type without naming the concrete
// constructor. The type itself holds no lock around the map.
type ScraperRegistry struct {
// constructors holds at most one constructor per source type.
constructors map[SourceType]ScraperConstructor
}
// NewScraperRegistry returns an empty registry whose constructor map is
// already allocated and ready to accept registrations.
func NewScraperRegistry() *ScraperRegistry {
	registry := new(ScraperRegistry)
	registry.constructors = map[SourceType]ScraperConstructor{}
	return registry
}
// Register associates constructor with sourceType, replacing any constructor
// previously registered for that type.
//
// The map is allocated on first use so that a zero-value ScraperRegistry
// (one not built with NewScraperRegistry) can be registered into without
// panicking on a nil-map write.
func (r *ScraperRegistry) Register(sourceType SourceType, constructor ScraperConstructor) {
	if r.constructors == nil {
		r.constructors = make(map[SourceType]ScraperConstructor)
	}
	r.constructors[sourceType] = constructor
}
// Create builds a scraper for sourceType by invoking its registered
// constructor with config.
//
// It returns nil when no constructor is registered for sourceType, or when
// the registered constructor is itself nil.
func (r *ScraperRegistry) Create(sourceType SourceType, config *Config) Scraper {
	constructor, ok := r.constructors[sourceType]
	if !ok || constructor == nil {
		// A nil constructor can be stored through Register; treat it like a
		// missing entry instead of panicking on a nil function call.
		return nil
	}
	return constructor(config)
}
// globalRegistry is the package-level registry that RegisterScraper writes to
// and CreateScraper reads from.
var globalRegistry = NewScraperRegistry()
// RegisterScraper registers constructor for sourceType in the package-level
// registry by delegating to globalRegistry.Register.
func RegisterScraper(sourceType SourceType, constructor ScraperConstructor) {
globalRegistry.Register(sourceType, constructor)
}
// CreateScraper builds a scraper for sourceType from the package-level
// registry by delegating to globalRegistry.Create, passing config through to
// the registered constructor. The result is whatever that call returns.
func CreateScraper(sourceType SourceType, config *Config) Scraper {
return globalRegistry.Create(sourceType, config)
}
// FallbackScraper is a placeholder scraper for when specific scrapers aren't
// available. Its Scrape and DetectChanges methods only return a
// "not implemented" error.
type FallbackScraper struct {
// config is the configuration passed to NewFallbackScraper.
config *Config
}
// NewFallbackScraper returns a FallbackScraper that keeps a reference to the
// given config.
func NewFallbackScraper(config *Config) *FallbackScraper {
	fallback := new(FallbackScraper)
	fallback.config = config
	return fallback
}
// Scrape always fails: the fallback scraper has no scraping logic, so it
// returns a nil document slice together with an error naming the source type.
//
// A nil source is reported as an error as well, rather than being
// dereferenced.
func (f *FallbackScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("fallback scraper not implemented: source is nil")
	}
	return nil, fmt.Errorf("fallback scraper not implemented for source type: %s", source.Type)
}
// DetectChanges always fails: the fallback scraper has no change-detection
// logic, so it returns false, an empty hash and an error naming the source
// type. lastHash is not consulted.
//
// A nil source is reported as an error as well, rather than being
// dereferenced.
func (f *FallbackScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("fallback scraper not implemented: source is nil")
	}
	return false, "", fmt.Errorf("fallback scraper not implemented for source type: %s", source.Type)
}
+49 -42
View File
@@ -3,6 +3,9 @@ package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"strings"
"time"
)
@@ -19,7 +22,6 @@ const (
SourceTypePythonDocs SourceType = "pythondocs"
SourceTypeJavaDocs SourceType = "javadocs"
SourceTypeSpringDocs SourceType = "springdocs"
SourceTypeSpringAIDocs SourceType = "springaidocs"
SourceTypeTSDocs SourceType = "tsdocs"
SourceTypeReactDocs SourceType = "reactdocs"
SourceTypeVueDocs SourceType = "vuedocs"
@@ -77,53 +79,58 @@ type Scraper interface {
DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error)
}
// NewScraper creates a new scraper for the given source type using the
// registry.
//
// It delegates to CreateScraper and returns its result unchanged, so which
// source types yield a scraper depends on what has been registered with
// RegisterScraper.
func NewScraper(sourceType SourceType, config *Config) Scraper {
	return CreateScraper(sourceType, config)
}
// DetectSourceType determines the source type from a URL or path.
func DetectSourceType(input string) SourceType {
// TODO: Implement detection logic
if len(input) > 4 && input[:4] == "http" {
// Check for GitHub repositories
if strings.Contains(input, "github.com") {
return SourceTypeGitHub
}
// Check for known documentation hosts
docsHosts := map[string]SourceType{
"pkg.go.dev": SourceTypeGoDocs,
"docs.rs": SourceTypeRustDocs,
"docs.python.org": SourceTypePythonDocs,
"docs.oracle.com": SourceTypeJavaDocs,
"docs.spring.io": SourceTypeSpringDocs,
"typescriptlang.org": SourceTypeTSDocs,
"react.dev": SourceTypeReactDocs,
"vuejs.org": SourceTypeVueDocs,
"nuxt.com": SourceTypeNuxtDocs,
"docs.docker.com": SourceTypeDockerDocs,
"developers.cloudflare.com": SourceTypeCloudflareDocs,
"docs.astro.build": SourceTypeAstroDocs,
}
for host, sourceType := range docsHosts {
if strings.Contains(input, host) {
return sourceType
}
}
// Check for OpenAPI specs
if strings.HasSuffix(input, ".json") || strings.HasSuffix(input, ".yaml") || strings.HasSuffix(input, ".yml") {
if strings.Contains(strings.ToLower(input), "openapi") || strings.Contains(strings.ToLower(input), "swagger") {
return SourceTypeOpenAPI
}
}
// Check for web URLs
if strings.HasPrefix(input, "http://") || strings.HasPrefix(input, "https://") {
return SourceTypeWeb
}
// Default to local
return SourceTypeLocal
}
// generateDocID derives a document identifier from urlStr: the first 12 bytes
// of its SHA-256 digest, rendered as 24 lowercase hexadecimal characters.
// The same input always yields the same identifier.
func generateDocID(urlStr string) string {
	const idBytes = 12

	digest := sha256.Sum256([]byte(urlStr))
	return hex.EncodeToString(digest[:idBytes])
}
+7 -9
View File
@@ -44,11 +44,13 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
// Set rate limiting
if s.config.RateLimit > 0 {
c.Limit(&colly.LimitRule{
if err := c.Limit(&colly.LimitRule{
DomainGlob: "*",
Parallelism: s.config.Concurrency,
Delay: s.config.RateLimit,
})
}); err != nil {
return nil, fmt.Errorf("failed to set rate limiting: %w", err)
}
}
// Set timeout
@@ -136,7 +138,9 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
return
}
c.Visit(absoluteURL)
if err := c.Visit(absoluteURL); err != nil {
fmt.Printf("Error visiting %s: %v\n", absoluteURL, err)
}
})
// Start scraping
@@ -288,9 +292,3 @@ func cleanText(text string) string {
return text
}
// generateDocID returns a document identifier for urlStr: the hexadecimal
// encoding of the leading 12 bytes of the SHA-256 digest of the string
// (24 characters).
func generateDocID(urlStr string) string {
	sum := sha256.Sum256([]byte(urlStr))
	prefix := sum[:12]
	return hex.EncodeToString(prefix)
}