mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
first commit
This commit is contained in:
@@ -0,0 +1,129 @@
|
||||
// Package scraper provides document scraping capabilities for various sources.
|
||||
package scraper
|
||||
|
||||
import (
	"context"
	"strings"
	"time"
)
|
||||
|
||||
// SourceType represents the type of documentation source.
type SourceType string

// Known source types. The string value of each constant is the identifier
// carried in a Source's Type field (YAML key "type"). Note that
// SourceTypeWeb is spelled "url", not "web".
const (
	SourceTypeWeb            SourceType = "url"
	SourceTypeGitHub         SourceType = "github"
	SourceTypeOpenAPI        SourceType = "openapi"
	SourceTypeLocal          SourceType = "local"
	SourceTypeGoDocs         SourceType = "godocs"
	SourceTypeRustDocs       SourceType = "rustdocs"
	SourceTypePythonDocs     SourceType = "pythondocs"
	SourceTypeJavaDocs       SourceType = "javadocs"
	SourceTypeSpringDocs     SourceType = "springdocs"
	SourceTypeSpringAIDocs   SourceType = "springaidocs"
	SourceTypeTSDocs         SourceType = "tsdocs"
	SourceTypeReactDocs      SourceType = "reactdocs"
	SourceTypeVueDocs        SourceType = "vuedocs"
	SourceTypeNuxtDocs       SourceType = "nuxtdocs"
	SourceTypeMCPDocs        SourceType = "mcpdocs"
	SourceTypeDockerDocs     SourceType = "dockerdocs"
	SourceTypeCloudflareDocs SourceType = "cloudflaredocs"
	SourceTypeAstroDocs      SourceType = "astrodocs"
)
|
||||
|
||||
// Source represents a documentation source to scrape.
|
||||
type Source struct {
|
||||
Name string `yaml:"name"`
|
||||
Type SourceType `yaml:"type"`
|
||||
URL string `yaml:"url,omitempty"`
|
||||
Repo string `yaml:"repo,omitempty"`
|
||||
Branch string `yaml:"branch,omitempty"`
|
||||
Path string `yaml:"path,omitempty"`
|
||||
Include []string `yaml:"include,omitempty"`
|
||||
Exclude []string `yaml:"exclude,omitempty"`
|
||||
Schedule string `yaml:"schedule,omitempty"`
|
||||
}
|
||||
|
||||
// Document represents a scraped document.
//
// The struct is tagged for JSON. URL is the only field with omitempty, so it
// is dropped from the encoded form when empty; all other fields are always
// emitted (a nil Metadata map encodes as null).
type Document struct {
	ID        string                 `json:"id"`
	Source    string                 `json:"source"`
	Type      string                 `json:"type"`
	Title     string                 `json:"title"`
	Content   string                 `json:"content"`
	URL       string                 `json:"url,omitempty"`
	Metadata  map[string]interface{} `json:"metadata"`
	Hash      string                 `json:"hash"`
	Timestamp time.Time              `json:"timestamp"`
}
|
||||
|
||||
// Config holds scraper configuration.
//
// The struct is tagged for YAML using snake_case keys. Timeout, RetryDelay
// and RateLimit are time.Duration values; the counters (RetryCount,
// Concurrency, MaxDepth) are plain ints.
type Config struct {
	UserAgent   string        `yaml:"user_agent"`
	Timeout     time.Duration `yaml:"timeout"`
	RetryCount  int           `yaml:"retry_count"`
	RetryDelay  time.Duration `yaml:"retry_delay"`
	Concurrency int           `yaml:"concurrency"`
	RateLimit   time.Duration `yaml:"rate_limit"`
	MaxDepth    int           `yaml:"max_depth"`
	CacheDir    string        `yaml:"cache_dir"`
}
|
||||
|
||||
// Scraper defines the interface for document scrapers.
|
||||
type Scraper interface {
|
||||
// Scrape fetches and parses documents from the source.
|
||||
Scrape(ctx context.Context, source *Source) ([]*Document, error)
|
||||
|
||||
// DetectChanges checks if the source has changed since last scrape.
|
||||
DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error)
|
||||
}
|
||||
|
||||
// NewScraper creates a new scraper for the given source type.
|
||||
func NewScraper(sourceType SourceType, config *Config) Scraper {
|
||||
switch sourceType {
|
||||
case SourceTypeWeb:
|
||||
return NewWebScraper(config)
|
||||
case SourceTypeGitHub:
|
||||
return NewGitHubScraper(config)
|
||||
case SourceTypeOpenAPI:
|
||||
return NewOpenAPIScraper(config)
|
||||
case SourceTypeLocal:
|
||||
return NewLocalScraper(config)
|
||||
case SourceTypeGoDocs:
|
||||
return NewGoDocsScraper(config)
|
||||
case SourceTypeRustDocs:
|
||||
return NewRustDocsScraper(config)
|
||||
case SourceTypePythonDocs:
|
||||
return NewPythonDocsScraper(config)
|
||||
case SourceTypeJavaDocs:
|
||||
return NewJavaDocsScraper(config)
|
||||
case SourceTypeSpringDocs:
|
||||
return NewSpringDocsScraper(config)
|
||||
case SourceTypeTSDocs:
|
||||
return NewTSDocsScraper(config)
|
||||
case SourceTypeReactDocs:
|
||||
return NewReactDocsScraper(config)
|
||||
case SourceTypeVueDocs:
|
||||
return NewVueDocsScraper(config)
|
||||
case SourceTypeNuxtDocs:
|
||||
return NewNuxtDocsScraper(config)
|
||||
case SourceTypeMCPDocs:
|
||||
return NewMCPDocsScraper(config)
|
||||
case SourceTypeDockerDocs:
|
||||
return NewDockerDocsScraper(config)
|
||||
case SourceTypeCloudflareDocs:
|
||||
return NewCloudflareDocsScraper(config)
|
||||
case SourceTypeAstroDocs:
|
||||
return NewAstroDocsScraper(config)
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// DetectSourceType determines the source type from a URL or path.
|
||||
func DetectSourceType(input string) SourceType {
|
||||
// TODO: Implement detection logic
|
||||
if len(input) > 4 && input[:4] == "http" {
|
||||
return SourceTypeWeb
|
||||
}
|
||||
return SourceTypeLocal
|
||||
}
|
||||
Reference in New Issue
Block a user