mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
130 lines
4.3 KiB
Go
130 lines
4.3 KiB
Go
// Package scraper provides document scraping capabilities for various sources.
|
|
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
)
|
|
|
|
// SourceType names the kind of documentation source a Source refers to.
// It is the Go type of Source.Type, which is read from the YAML key "type",
// and it is the value NewScraper switches on.
type SourceType string

// Known source types. The string on the right-hand side is the literal
// value of each constant.
const (
	// General-purpose sources. Note that the web type's literal value is
	// "url", not "web".
	SourceTypeWeb     SourceType = "url"
	SourceTypeGitHub  SourceType = "github"
	SourceTypeOpenAPI SourceType = "openapi"
	SourceTypeLocal   SourceType = "local"

	// Language documentation sources.
	SourceTypeGoDocs     SourceType = "godocs"
	SourceTypeRustDocs   SourceType = "rustdocs"
	SourceTypePythonDocs SourceType = "pythondocs"
	SourceTypeJavaDocs   SourceType = "javadocs"

	// Framework, platform and tool documentation sources.
	SourceTypeSpringDocs     SourceType = "springdocs"
	SourceTypeSpringAIDocs   SourceType = "springaidocs"
	SourceTypeTSDocs         SourceType = "tsdocs"
	SourceTypeReactDocs      SourceType = "reactdocs"
	SourceTypeVueDocs        SourceType = "vuedocs"
	SourceTypeNuxtDocs       SourceType = "nuxtdocs"
	SourceTypeMCPDocs        SourceType = "mcpdocs"
	SourceTypeDockerDocs     SourceType = "dockerdocs"
	SourceTypeCloudflareDocs SourceType = "cloudflaredocs"
	SourceTypeAstroDocs      SourceType = "astrodocs"
)
|
|
|
|
// Source represents a documentation source to scrape.
|
|
type Source struct {
|
|
Name string `yaml:"name"`
|
|
Type SourceType `yaml:"type"`
|
|
URL string `yaml:"url,omitempty"`
|
|
Repo string `yaml:"repo,omitempty"`
|
|
Branch string `yaml:"branch,omitempty"`
|
|
Path string `yaml:"path,omitempty"`
|
|
Include []string `yaml:"include,omitempty"`
|
|
Exclude []string `yaml:"exclude,omitempty"`
|
|
Schedule string `yaml:"schedule,omitempty"`
|
|
}
|
|
|
|
// Document is a single scraped document, as returned by Scraper.Scrape.
// Every field carries a JSON tag; only URL is tagged omitempty, so it is
// the only field left out of the encoded object when empty.
//
// NOTE(review): how scrapers populate these fields is not visible in this
// file; the notes below are inferred from names and should be confirmed.
type Document struct {
	// ID is the document identifier (JSON: "id").
	ID string `json:"id"`
	// Source is presumably the name of the Source the document came from.
	Source string `json:"source"`
	// Type is a plain string here, not a SourceType.
	Type string `json:"type"`

	Title   string `json:"title"`
	Content string `json:"content"`
	URL     string `json:"url,omitempty"`

	// Metadata holds arbitrary additional key/value data.
	Metadata map[string]interface{} `json:"metadata"`

	// Hash is presumably a content hash of the kind passed to
	// Scraper.DetectChanges as lastHash.
	Hash string `json:"hash"`
	// Timestamp is presumably the time the document was scraped.
	Timestamp time.Time `json:"timestamp"`
}
|
|
|
|
// Config carries the settings handed to every scraper constructor by
// NewScraper. Each field is mapped to a snake_case YAML key.
//
// NOTE(review): the code that interprets these settings is not in this
// file; the per-field notes are inferred from names and should be confirmed
// against the concrete scrapers.
type Config struct {
	// UserAgent is presumably the User-Agent sent with outgoing requests.
	UserAgent string `yaml:"user_agent"`
	// Timeout is presumably a per-request time limit.
	Timeout time.Duration `yaml:"timeout"`

	// RetryCount and RetryDelay presumably control how often, and after
	// what pause, a failed request is retried.
	RetryCount int           `yaml:"retry_count"`
	RetryDelay time.Duration `yaml:"retry_delay"`

	// Concurrency is presumably the number of parallel workers.
	Concurrency int `yaml:"concurrency"`
	// RateLimit is a duration, presumably the minimum spacing between
	// requests.
	RateLimit time.Duration `yaml:"rate_limit"`
	// MaxDepth is presumably a crawl-depth limit.
	MaxDepth int `yaml:"max_depth"`
	// CacheDir is presumably a directory for cached data.
	CacheDir string `yaml:"cache_dir"`
}
|
|
|
|
// Scraper defines the interface for document scrapers.
|
|
type Scraper interface {
|
|
// Scrape fetches and parses documents from the source.
|
|
Scrape(ctx context.Context, source *Source) ([]*Document, error)
|
|
|
|
// DetectChanges checks if the source has changed since last scrape.
|
|
DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error)
|
|
}
|
|
|
|
// NewScraper creates a new scraper for the given source type.
|
|
func NewScraper(sourceType SourceType, config *Config) Scraper {
|
|
switch sourceType {
|
|
case SourceTypeWeb:
|
|
return NewWebScraper(config)
|
|
case SourceTypeGitHub:
|
|
return NewGitHubScraper(config)
|
|
case SourceTypeOpenAPI:
|
|
return NewOpenAPIScraper(config)
|
|
case SourceTypeLocal:
|
|
return NewLocalScraper(config)
|
|
case SourceTypeGoDocs:
|
|
return NewGoDocsScraper(config)
|
|
case SourceTypeRustDocs:
|
|
return NewRustDocsScraper(config)
|
|
case SourceTypePythonDocs:
|
|
return NewPythonDocsScraper(config)
|
|
case SourceTypeJavaDocs:
|
|
return NewJavaDocsScraper(config)
|
|
case SourceTypeSpringDocs:
|
|
return NewSpringDocsScraper(config)
|
|
case SourceTypeTSDocs:
|
|
return NewTSDocsScraper(config)
|
|
case SourceTypeReactDocs:
|
|
return NewReactDocsScraper(config)
|
|
case SourceTypeVueDocs:
|
|
return NewVueDocsScraper(config)
|
|
case SourceTypeNuxtDocs:
|
|
return NewNuxtDocsScraper(config)
|
|
case SourceTypeMCPDocs:
|
|
return NewMCPDocsScraper(config)
|
|
case SourceTypeDockerDocs:
|
|
return NewDockerDocsScraper(config)
|
|
case SourceTypeCloudflareDocs:
|
|
return NewCloudflareDocsScraper(config)
|
|
case SourceTypeAstroDocs:
|
|
return NewAstroDocsScraper(config)
|
|
default:
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// DetectSourceType determines the source type from a URL or path.
|
|
func DetectSourceType(input string) SourceType {
|
|
// TODO: Implement detection logic
|
|
if len(input) > 4 && input[:4] == "http" {
|
|
return SourceTypeWeb
|
|
}
|
|
return SourceTypeLocal
|
|
}
|