mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
146 lines
5.0 KiB
Go
146 lines
5.0 KiB
Go
// Package scraper provides document scraping capabilities for various sources.
|
|
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// SourceType names the kind of documentation source a Source refers to.
// It is the type of Source.Type, which is stored under the "type" key in YAML.
type SourceType string

// Known SourceType values. The trailing notes describe when DetectSourceType
// picks each value for an input string.
const (
	// General-purpose sources.
	SourceTypeWeb         SourceType = "url"         // http:// or https:// input matching no more specific rule
	SourceTypeGitHub      SourceType = "github"      // input containing "github.com"
	SourceTypeOpenAPI     SourceType = "openapi"     // .json/.yaml/.yml input mentioning "openapi" or "swagger"
	SourceTypeLocal       SourceType = "local"       // fallback when no other rule matches
	SourceTypeLocalSearch SourceType = "localsearch" // never returned by DetectSourceType

	// Sources tied to a specific documentation site.
	SourceTypeGoDocs         SourceType = "godocs"         // pkg.go.dev
	SourceTypeRustDocs       SourceType = "rustdocs"       // docs.rs
	SourceTypePythonDocs     SourceType = "pythondocs"     // docs.python.org
	SourceTypeJavaDocs       SourceType = "javadocs"       // docs.oracle.com
	SourceTypeSpringDocs     SourceType = "springdocs"     // docs.spring.io
	SourceTypeTSDocs         SourceType = "tsdocs"         // typescriptlang.org
	SourceTypeReactDocs      SourceType = "reactdocs"      // react.dev
	SourceTypeVueDocs        SourceType = "vuedocs"        // vuejs.org
	SourceTypeNuxtDocs       SourceType = "nuxtdocs"       // nuxt.com
	SourceTypeMCPDocs        SourceType = "mcpdocs"        // hub.docker.com/mcp/
	SourceTypeDockerDocs     SourceType = "dockerdocs"     // docs.docker.com
	SourceTypeCloudflareDocs SourceType = "cloudflaredocs" // developers.cloudflare.com
	SourceTypeAstroDocs      SourceType = "astrodocs"      // docs.astro.build
)
|
|
|
|
// Source represents a documentation source to scrape.
|
|
type Source struct {
|
|
Name string `yaml:"name"`
|
|
Type SourceType `yaml:"type"`
|
|
URL string `yaml:"url,omitempty"`
|
|
Query string `yaml:"query,omitempty"`
|
|
ResultLimit int `yaml:"result_limit,omitempty"`
|
|
Domains []string `yaml:"domains,omitempty"`
|
|
Repo string `yaml:"repo,omitempty"`
|
|
Branch string `yaml:"branch,omitempty"`
|
|
Path string `yaml:"path,omitempty"`
|
|
Include []string `yaml:"include,omitempty"`
|
|
Exclude []string `yaml:"exclude,omitempty"`
|
|
Schedule string `yaml:"schedule,omitempty"`
|
|
}
|
|
|
|
// Document is a single scraped document in its JSON-serializable form.
//
// All fields are always emitted in JSON except URL, which is omitted when
// empty.
type Document struct {
	// Identification.
	// NOTE(review): ID is presumably produced by generateDocID — confirm.
	ID     string `json:"id"`
	Source string `json:"source"`
	Type   string `json:"type"`

	// Human-readable payload and, optionally, where it was fetched from.
	Title   string `json:"title"`
	Content string `json:"content"`
	URL     string `json:"url,omitempty"`

	// Bookkeeping.
	// NOTE(review): Hash is presumably a content hash used for change
	// detection — confirm against the scraper implementations.
	Metadata  map[string]interface{} `json:"metadata"`
	Hash      string                 `json:"hash"`
	Timestamp time.Time              `json:"timestamp"`
}
|
|
|
|
// Config carries the scraper settings loaded from YAML. It is handed to
// NewScraper when a scraper is created.
//
// NOTE(review): the grouping comments below are inferred from the field names;
// how each value is applied is decided by the scraper implementations, which
// are not visible in this file.
type Config struct {
	// Request identity and per-request time budget.
	UserAgent string        `yaml:"user_agent"`
	Timeout   time.Duration `yaml:"timeout"`

	// Retry policy.
	RetryCount int           `yaml:"retry_count"`
	RetryDelay time.Duration `yaml:"retry_delay"`

	// Crawl limits.
	Concurrency int           `yaml:"concurrency"`
	RateLimit   time.Duration `yaml:"rate_limit"`
	MaxDepth    int           `yaml:"max_depth"`

	// Cache location.
	CacheDir string `yaml:"cache_dir"`
}
|
|
|
|
// Scraper defines the interface for document scrapers.
|
|
type Scraper interface {
|
|
// Scrape fetches and parses documents from the source.
|
|
Scrape(ctx context.Context, source *Source) ([]*Document, error)
|
|
|
|
// DetectChanges checks if the source has changed since last scrape.
|
|
DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error)
|
|
}
|
|
|
|
// NewScraper creates a new scraper for the given source type using the registry.
|
|
func NewScraper(sourceType SourceType, config *Config) Scraper {
|
|
return CreateScraper(sourceType, config)
|
|
}
|
|
|
|
// DetectSourceType determines the source type from a URL or path.
|
|
func DetectSourceType(input string) SourceType {
|
|
// Check for GitHub repositories
|
|
if strings.Contains(input, "github.com") {
|
|
return SourceTypeGitHub
|
|
}
|
|
|
|
// Check for known documentation hosts
|
|
docsHosts := map[string]SourceType{
|
|
"pkg.go.dev": SourceTypeGoDocs,
|
|
"docs.rs": SourceTypeRustDocs,
|
|
"docs.python.org": SourceTypePythonDocs,
|
|
"docs.oracle.com": SourceTypeJavaDocs,
|
|
"docs.spring.io": SourceTypeSpringDocs,
|
|
"typescriptlang.org": SourceTypeTSDocs,
|
|
"react.dev": SourceTypeReactDocs,
|
|
"vuejs.org": SourceTypeVueDocs,
|
|
"nuxt.com": SourceTypeNuxtDocs,
|
|
"docs.docker.com": SourceTypeDockerDocs,
|
|
"developers.cloudflare.com": SourceTypeCloudflareDocs,
|
|
"docs.astro.build": SourceTypeAstroDocs,
|
|
}
|
|
|
|
for host, sourceType := range docsHosts {
|
|
if strings.Contains(input, host) {
|
|
return sourceType
|
|
}
|
|
}
|
|
|
|
// MCP servers are hosted under Docker Hub paths.
|
|
if strings.Contains(input, "hub.docker.com/mcp/") {
|
|
return SourceTypeMCPDocs
|
|
}
|
|
|
|
// Check for OpenAPI specs
|
|
if strings.HasSuffix(input, ".json") || strings.HasSuffix(input, ".yaml") || strings.HasSuffix(input, ".yml") {
|
|
if strings.Contains(strings.ToLower(input), "openapi") || strings.Contains(strings.ToLower(input), "swagger") {
|
|
return SourceTypeOpenAPI
|
|
}
|
|
}
|
|
|
|
// Check for web URLs
|
|
if strings.HasPrefix(input, "http://") || strings.HasPrefix(input, "https://") {
|
|
return SourceTypeWeb
|
|
}
|
|
|
|
// Default to local
|
|
return SourceTypeLocal
|
|
}
|
|
|
|
// generateDocID derives a document ID from urlStr: the first 12 bytes of the
// SHA-256 digest of the string, hex-encoded to 24 lowercase characters. The
// same input always yields the same ID.
func generateDocID(urlStr string) string {
	const idBytes = 12 // digest bytes kept; two hex characters each

	digest := sha256.Sum256([]byte(urlStr))
	prefix := digest[:idBytes]
	return hex.EncodeToString(prefix)
}
|