// Package scraper provides document scraping capabilities for various sources.
package scraper

import (
	"context"
	"time"
)

// SourceType represents the type of documentation source.
// It is a plain string type; the recognised values are the SourceType*
// constants declared below.
type SourceType string

// Recognised source types. The string on the right is the serialized form,
// i.e. the value carried by Source.Type (YAML key "type").
const (
	// SourceTypeWeb is serialized as "url", not "web".
	SourceTypeWeb            SourceType = "url"
	SourceTypeGitHub         SourceType = "github"
	SourceTypeOpenAPI        SourceType = "openapi"
	SourceTypeLocal          SourceType = "local"
	SourceTypeGoDocs         SourceType = "godocs"
	SourceTypeRustDocs       SourceType = "rustdocs"
	SourceTypePythonDocs     SourceType = "pythondocs"
	SourceTypeJavaDocs       SourceType = "javadocs"
	SourceTypeSpringDocs     SourceType = "springdocs"
	SourceTypeSpringAIDocs   SourceType = "springaidocs"
	SourceTypeTSDocs         SourceType = "tsdocs"
	SourceTypeReactDocs      SourceType = "reactdocs"
	SourceTypeVueDocs        SourceType = "vuedocs"
	SourceTypeNuxtDocs       SourceType = "nuxtdocs"
	SourceTypeMCPDocs        SourceType = "mcpdocs"
	SourceTypeDockerDocs     SourceType = "dockerdocs"
	SourceTypeCloudflareDocs SourceType = "cloudflaredocs"
	SourceTypeAstroDocs      SourceType = "astrodocs"
)

// Source represents a documentation source to scrape.
//
// Name and Type are always written when the struct is marshaled to YAML;
// every other field is tagged omitempty and is left out when empty.
//
// NOTE(review): which of URL, Repo, Branch and Path are meaningful presumably
// depends on Type — confirm against the individual scraper implementations.
type Source struct {
	Name     string     `yaml:"name"`
	Type     SourceType `yaml:"type"`
	URL      string     `yaml:"url,omitempty"`
	Repo     string     `yaml:"repo,omitempty"`
	Branch   string     `yaml:"branch,omitempty"`
	Path     string     `yaml:"path,omitempty"`
	Include  []string   `yaml:"include,omitempty"`
	Exclude  []string   `yaml:"exclude,omitempty"`
	Schedule string     `yaml:"schedule,omitempty"`
}

// Document represents a scraped document.
//
// All fields are serialized to JSON under the lower-case keys given in the
// tags; only URL is omitted when empty.
type Document struct {
	ID      string `json:"id"`
	Source  string `json:"source"`
	Type    string `json:"type"`
	Title   string `json:"title"`
	Content string `json:"content"`
	URL     string `json:"url,omitempty"`
	// Metadata holds arbitrary key/value pairs; its schema is not defined here.
	Metadata map[string]interface{} `json:"metadata"`
	// NOTE(review): Hash is presumably a digest of the document content used
	// for change detection — confirm against the code that populates it.
	Hash      string    `json:"hash"`
	Timestamp time.Time `json:"timestamp"`
}

// Config holds scraper configuration.
type Config struct { UserAgent string `yaml:"user_agent"` Timeout time.Duration `yaml:"timeout"` RetryCount int `yaml:"retry_count"` RetryDelay time.Duration `yaml:"retry_delay"` Concurrency int `yaml:"concurrency"` RateLimit time.Duration `yaml:"rate_limit"` MaxDepth int `yaml:"max_depth"` CacheDir string `yaml:"cache_dir"` } // Scraper defines the interface for document scrapers. type Scraper interface { // Scrape fetches and parses documents from the source. Scrape(ctx context.Context, source *Source) ([]*Document, error) // DetectChanges checks if the source has changed since last scrape. DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) } // NewScraper creates a new scraper for the given source type. func NewScraper(sourceType SourceType, config *Config) Scraper { switch sourceType { case SourceTypeWeb: return NewWebScraper(config) case SourceTypeGitHub: return NewGitHubScraper(config) case SourceTypeOpenAPI: return NewOpenAPIScraper(config) case SourceTypeLocal: return NewLocalScraper(config) case SourceTypeGoDocs: return NewGoDocsScraper(config) case SourceTypeRustDocs: return NewRustDocsScraper(config) case SourceTypePythonDocs: return NewPythonDocsScraper(config) case SourceTypeJavaDocs: return NewJavaDocsScraper(config) case SourceTypeSpringDocs: return NewSpringDocsScraper(config) case SourceTypeTSDocs: return NewTSDocsScraper(config) case SourceTypeReactDocs: return NewReactDocsScraper(config) case SourceTypeVueDocs: return NewVueDocsScraper(config) case SourceTypeNuxtDocs: return NewNuxtDocsScraper(config) case SourceTypeMCPDocs: return NewMCPDocsScraper(config) case SourceTypeDockerDocs: return NewDockerDocsScraper(config) case SourceTypeCloudflareDocs: return NewCloudflareDocsScraper(config) case SourceTypeAstroDocs: return NewAstroDocsScraper(config) default: return nil } } // DetectSourceType determines the source type from a URL or path. 
func DetectSourceType(input string) SourceType { // TODO: Implement detection logic if len(input) > 4 && input[:4] == "http" { return SourceTypeWeb } return SourceTypeLocal }