Files
Devour/internal/scraper/scraper.go
Tomas Dvorak 55885a0e8f first commit
2026-02-22 10:42:17 +01:00

130 lines
4.3 KiB
Go

// Package scraper provides document scraping capabilities for various sources.
package scraper
import (
	"context"
	"strings"
	"time"
)
// SourceType names a kind of documentation source. Its string value is the
// identifier used for the "type" key of a Source.
type SourceType string

// Known source types. The string values are part of the configuration format
// and must not change.
const (
	// General-purpose sources.
	SourceTypeWeb     SourceType = "url"
	SourceTypeGitHub  SourceType = "github"
	SourceTypeOpenAPI SourceType = "openapi"
	SourceTypeLocal   SourceType = "local"

	// Documentation-set specific sources.
	SourceTypeGoDocs         SourceType = "godocs"
	SourceTypeRustDocs       SourceType = "rustdocs"
	SourceTypePythonDocs     SourceType = "pythondocs"
	SourceTypeJavaDocs       SourceType = "javadocs"
	SourceTypeSpringDocs     SourceType = "springdocs"
	SourceTypeSpringAIDocs   SourceType = "springaidocs"
	SourceTypeTSDocs         SourceType = "tsdocs"
	SourceTypeReactDocs      SourceType = "reactdocs"
	SourceTypeVueDocs        SourceType = "vuedocs"
	SourceTypeNuxtDocs       SourceType = "nuxtdocs"
	SourceTypeMCPDocs        SourceType = "mcpdocs"
	SourceTypeDockerDocs     SourceType = "dockerdocs"
	SourceTypeCloudflareDocs SourceType = "cloudflaredocs"
	SourceTypeAstroDocs      SourceType = "astrodocs"
)
// Source describes one documentation source to scrape.
//
// Fields are mapped through their yaml tags. Every field except Name and Type
// is tagged omitempty, so it is left out of marshalled output when empty.
type Source struct {
	// Name and Type are always serialized.
	Name string     `yaml:"name"`
	Type SourceType `yaml:"type"`

	// Location of the source.
	// NOTE(review): which of these fields apply presumably depends on Type —
	// confirm against the individual scrapers.
	URL    string `yaml:"url,omitempty"`
	Repo   string `yaml:"repo,omitempty"`
	Branch string `yaml:"branch,omitempty"`
	Path   string `yaml:"path,omitempty"`

	// Filters and scheduling.
	// NOTE(review): the pattern and schedule syntax are not defined in this
	// file — check the consumers before documenting them further.
	Include  []string `yaml:"include,omitempty"`
	Exclude  []string `yaml:"exclude,omitempty"`
	Schedule string   `yaml:"schedule,omitempty"`
}
// Document is a single scraped document.
//
// It is serialized through its json tags; URL is the only field omitted from
// the output when empty.
type Document struct {
	// Identification of the document.
	ID     string `json:"id"`
	Source string `json:"source"`
	Type   string `json:"type"`

	// Payload.
	Title   string `json:"title"`
	Content string `json:"content"`
	URL     string `json:"url,omitempty"`

	// Metadata holds free-form key/value data attached to the document.
	Metadata map[string]interface{} `json:"metadata"`

	// NOTE(review): Hash is presumably a digest of the document content and
	// Timestamp the time of scraping — neither is set in this file; confirm
	// against the scrapers that populate them.
	Hash      string    `json:"hash"`
	Timestamp time.Time `json:"timestamp"`
}
// Config carries the settings handed to every scraper constructor.
//
// All fields are mapped through their yaml tags and none is omitted when
// empty. NOTE(review): how each scraper interprets these values (for example
// whether zero means "unlimited" or "use a default") is not defined in this
// file — confirm against the implementations.
type Config struct {
	// HTTP client behaviour.
	UserAgent string        `yaml:"user_agent"`
	Timeout   time.Duration `yaml:"timeout"`

	// Retry policy.
	RetryCount int           `yaml:"retry_count"`
	RetryDelay time.Duration `yaml:"retry_delay"`

	// Throughput and crawl limits.
	Concurrency int           `yaml:"concurrency"`
	RateLimit   time.Duration `yaml:"rate_limit"`
	MaxDepth    int           `yaml:"max_depth"`

	// Local cache location.
	CacheDir string `yaml:"cache_dir"`
}
// Scraper is the contract every source-specific scraper fulfils.
type Scraper interface {
	// Scrape fetches the given source and parses it into documents.
	Scrape(ctx context.Context, source *Source) ([]*Document, error)

	// DetectChanges reports whether the source has changed since the last
	// scrape, given the hash recorded at that time.
	// NOTE(review): the string result is presumably the source's current
	// hash — confirm against the implementations.
	DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error)
}
// NewScraper builds the scraper that handles sourceType, handing config to
// the matching constructor unchanged.
//
// It returns nil when sourceType has no case below, so callers must check the
// result before using it.
//
// NOTE(review): SourceTypeSpringAIDocs is a declared SourceType but has no
// case here, so it currently yields nil — confirm whether that is intended.
func NewScraper(sourceType SourceType, config *Config) Scraper {
	// Stays nil for any source type that is not matched.
	var s Scraper

	switch sourceType {
	// General-purpose sources.
	case SourceTypeWeb:
		s = NewWebScraper(config)
	case SourceTypeGitHub:
		s = NewGitHubScraper(config)
	case SourceTypeOpenAPI:
		s = NewOpenAPIScraper(config)
	case SourceTypeLocal:
		s = NewLocalScraper(config)

	// Documentation-set specific sources.
	case SourceTypeGoDocs:
		s = NewGoDocsScraper(config)
	case SourceTypeRustDocs:
		s = NewRustDocsScraper(config)
	case SourceTypePythonDocs:
		s = NewPythonDocsScraper(config)
	case SourceTypeJavaDocs:
		s = NewJavaDocsScraper(config)
	case SourceTypeSpringDocs:
		s = NewSpringDocsScraper(config)
	case SourceTypeTSDocs:
		s = NewTSDocsScraper(config)
	case SourceTypeReactDocs:
		s = NewReactDocsScraper(config)
	case SourceTypeVueDocs:
		s = NewVueDocsScraper(config)
	case SourceTypeNuxtDocs:
		s = NewNuxtDocsScraper(config)
	case SourceTypeMCPDocs:
		s = NewMCPDocsScraper(config)
	case SourceTypeDockerDocs:
		s = NewDockerDocsScraper(config)
	case SourceTypeCloudflareDocs:
		s = NewCloudflareDocsScraper(config)
	case SourceTypeAstroDocs:
		s = NewAstroDocsScraper(config)
	}

	return s
}
// DetectSourceType guesses the source type from a URL or filesystem path.
//
// Input that begins with an "http://" or "https://" scheme is reported as
// SourceTypeWeb; the scheme is matched case-insensitively because URL schemes
// are case-insensitive. Everything else, including the empty string, is
// reported as SourceTypeLocal.
//
// TODO: Implement detection of the more specific source types.
func DetectSourceType(input string) SourceType {
	// Require the full scheme separator rather than a bare "http" prefix, so
	// that local paths such as "httpdocs/readme.md" are not mistaken for URLs.
	for _, scheme := range []string{"http://", "https://"} {
		if len(input) >= len(scheme) && strings.EqualFold(input[:len(scheme)], scheme) {
			return SourceTypeWeb
		}
	}
	return SourceTypeLocal
}