// Package scraper provides document scraping capabilities for various sources.
package scraper

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"strings"
	"time"
)

// SourceType represents the type of documentation source. Its string value
// is what appears in the "type" key of a YAML-configured Source.
type SourceType string

const (
	// General-purpose source kinds. DetectSourceType returns SourceTypeWeb
	// for http:// and https:// inputs, SourceTypeGitHub for inputs containing
	// "github.com", SourceTypeOpenAPI for .json/.yaml/.yml inputs that mention
	// "openapi" or "swagger", and SourceTypeLocal as the fallback.
	// SourceTypeLocalSearch is never returned by DetectSourceType.
	SourceTypeWeb         SourceType = "url"
	SourceTypeGitHub      SourceType = "github"
	SourceTypeOpenAPI     SourceType = "openapi"
	SourceTypeLocal       SourceType = "local"
	SourceTypeLocalSearch SourceType = "localsearch"

	// Site-specific documentation sources. DetectSourceType maps each of
	// these to a known documentation host (for example pkg.go.dev for
	// SourceTypeGoDocs and docs.rs for SourceTypeRustDocs); SourceTypeMCPDocs
	// is matched on the path "hub.docker.com/mcp/" instead of a host.
	SourceTypeGoDocs         SourceType = "godocs"
	SourceTypeRustDocs       SourceType = "rustdocs"
	SourceTypePythonDocs     SourceType = "pythondocs"
	SourceTypeJavaDocs       SourceType = "javadocs"
	SourceTypeSpringDocs     SourceType = "springdocs"
	SourceTypeTSDocs         SourceType = "tsdocs"
	SourceTypeReactDocs      SourceType = "reactdocs"
	SourceTypeVueDocs        SourceType = "vuedocs"
	SourceTypeNuxtDocs       SourceType = "nuxtdocs"
	SourceTypeMCPDocs        SourceType = "mcpdocs"
	SourceTypeDockerDocs     SourceType = "dockerdocs"
	SourceTypeCloudflareDocs SourceType = "cloudflaredocs"
	SourceTypeAstroDocs      SourceType = "astrodocs"
)

// Source represents a documentation source to scrape.
//
// The struct tags define the YAML keys. Name and Type are always emitted
// when marshalling; every other field is omitted when it holds its zero
// value.
//
// NOTE(review): which fields each source type actually consumes is not
// visible in this file. Presumably URL serves web-style sources,
// Repo/Branch/Path serve GitHub sources, Query/ResultLimit/Domains serve
// search-style sources, and Include/Exclude are path or URL filters; confirm
// against the individual scraper implementations.
type Source struct {
	Name        string     `yaml:"name"`
	Type        SourceType `yaml:"type"`
	URL         string     `yaml:"url,omitempty"`
	Query       string     `yaml:"query,omitempty"`
	ResultLimit int        `yaml:"result_limit,omitempty"`
	Domains     []string   `yaml:"domains,omitempty"`
	Repo        string     `yaml:"repo,omitempty"`
	Branch      string     `yaml:"branch,omitempty"`
	Path        string     `yaml:"path,omitempty"`
	Include     []string   `yaml:"include,omitempty"`
	Exclude     []string   `yaml:"exclude,omitempty"`
	Schedule    string     `yaml:"schedule,omitempty"`
}

// Document represents a scraped document.
//
// The struct tags define the JSON keys; URL is the only field omitted from
// the JSON output when empty.
//
// NOTE(review): ID is presumably produced by generateDocID and Hash is
// presumably a digest of Content used for change detection; neither
// assignment is visible in this file, so confirm against the scrapers that
// build Document values.
type Document struct {
	ID        string                 `json:"id"`
	Source    string                 `json:"source"`
	Type      string                 `json:"type"`
	Title     string                 `json:"title"`
	Content   string                 `json:"content"`
	URL       string                 `json:"url,omitempty"`
	Metadata  map[string]interface{} `json:"metadata"`
	Hash      string                 `json:"hash"`
	Timestamp time.Time              `json:"timestamp"`
}

// Config holds scraper configuration. It is loaded from YAML using the keys
// given in the struct tags and is passed to NewScraper.
type Config struct { UserAgent string `yaml:"user_agent"` Timeout time.Duration `yaml:"timeout"` RetryCount int `yaml:"retry_count"` RetryDelay time.Duration `yaml:"retry_delay"` Concurrency int `yaml:"concurrency"` RateLimit time.Duration `yaml:"rate_limit"` MaxDepth int `yaml:"max_depth"` CacheDir string `yaml:"cache_dir"` } // Scraper defines the interface for document scrapers. type Scraper interface { // Scrape fetches and parses documents from the source. Scrape(ctx context.Context, source *Source) ([]*Document, error) // DetectChanges checks if the source has changed since last scrape. DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) } // NewScraper creates a new scraper for the given source type using the registry. func NewScraper(sourceType SourceType, config *Config) Scraper { return CreateScraper(sourceType, config) } // DetectSourceType determines the source type from a URL or path. func DetectSourceType(input string) SourceType { // Check for GitHub repositories if strings.Contains(input, "github.com") { return SourceTypeGitHub } // Check for known documentation hosts docsHosts := map[string]SourceType{ "pkg.go.dev": SourceTypeGoDocs, "docs.rs": SourceTypeRustDocs, "docs.python.org": SourceTypePythonDocs, "docs.oracle.com": SourceTypeJavaDocs, "docs.spring.io": SourceTypeSpringDocs, "typescriptlang.org": SourceTypeTSDocs, "react.dev": SourceTypeReactDocs, "vuejs.org": SourceTypeVueDocs, "nuxt.com": SourceTypeNuxtDocs, "docs.docker.com": SourceTypeDockerDocs, "developers.cloudflare.com": SourceTypeCloudflareDocs, "docs.astro.build": SourceTypeAstroDocs, } for host, sourceType := range docsHosts { if strings.Contains(input, host) { return sourceType } } // MCP servers are hosted under Docker Hub paths. 
if strings.Contains(input, "hub.docker.com/mcp/") { return SourceTypeMCPDocs } // Check for OpenAPI specs if strings.HasSuffix(input, ".json") || strings.HasSuffix(input, ".yaml") || strings.HasSuffix(input, ".yml") { if strings.Contains(strings.ToLower(input), "openapi") || strings.Contains(strings.ToLower(input), "swagger") { return SourceTypeOpenAPI } } // Check for web URLs if strings.HasPrefix(input, "http://") || strings.HasPrefix(input, "https://") { return SourceTypeWeb } // Default to local return SourceTypeLocal } // generateDocID generates a unique ID for a document. func generateDocID(urlStr string) string { hash := sha256.Sum256([]byte(urlStr)) return hex.EncodeToString(hash[:12]) }