Files
Devour/internal/scraper/scraper.go
T
Tomas Dvorak 898a3c303f update
2026-02-24 10:33:59 +01:00

146 lines
5.0 KiB
Go

// Package scraper provides document scraping capabilities for various sources.
package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"strings"
"time"
)
// SourceType represents the type of documentation source.
//
// It is a string-based type; the recognized values are the SourceType*
// constants declared in this package. Source.Type carries it under the YAML
// key "type", and NewScraper takes it to pick a scraper.
type SourceType string
// Known SourceType values. The quoted string is the raw value of each
// constant. Where a note mentions DetectSourceType, it describes the input
// for which that function returns the constant.
const (
// Generic web page. Note the raw value is "url", not "web". DetectSourceType
// returns it for inputs starting with http:// or https:// that matched no
// more specific rule.
SourceTypeWeb SourceType = "url"
// DetectSourceType returns this for any input containing "github.com".
SourceTypeGitHub SourceType = "github"
// DetectSourceType returns this for inputs ending in .json, .yaml or .yml
// that also mention "openapi" or "swagger".
SourceTypeOpenAPI SourceType = "openapi"
// Fallback of DetectSourceType when no other rule matches.
SourceTypeLocal SourceType = "local"
// NOTE(review): never returned by DetectSourceType; presumably selected
// explicitly by the caller or configuration — confirm.
SourceTypeLocalSearch SourceType = "localsearch"
// DetectSourceType: input contains "pkg.go.dev".
SourceTypeGoDocs SourceType = "godocs"
// DetectSourceType: input contains "docs.rs".
SourceTypeRustDocs SourceType = "rustdocs"
// DetectSourceType: input contains "docs.python.org".
SourceTypePythonDocs SourceType = "pythondocs"
// DetectSourceType: input contains "docs.oracle.com".
SourceTypeJavaDocs SourceType = "javadocs"
// DetectSourceType: input contains "docs.spring.io".
SourceTypeSpringDocs SourceType = "springdocs"
// DetectSourceType: input contains "typescriptlang.org".
SourceTypeTSDocs SourceType = "tsdocs"
// DetectSourceType: input contains "react.dev".
SourceTypeReactDocs SourceType = "reactdocs"
// DetectSourceType: input contains "vuejs.org".
SourceTypeVueDocs SourceType = "vuedocs"
// DetectSourceType: input contains "nuxt.com".
SourceTypeNuxtDocs SourceType = "nuxtdocs"
// DetectSourceType: input contains "hub.docker.com/mcp/".
SourceTypeMCPDocs SourceType = "mcpdocs"
// DetectSourceType: input contains "docs.docker.com".
SourceTypeDockerDocs SourceType = "dockerdocs"
// DetectSourceType: input contains "developers.cloudflare.com".
SourceTypeCloudflareDocs SourceType = "cloudflaredocs"
// DetectSourceType: input contains "docs.astro.build".
SourceTypeAstroDocs SourceType = "astrodocs"
)
// Source represents a documentation source to scrape.
//
// Every field carries a YAML tag. All fields except Name and Type are tagged
// omitempty, so their zero values are left out when the struct is marshalled.
// A *Source is what Scraper.Scrape and Scraper.DetectChanges receive.
//
// NOTE(review): which fields each source type actually reads is decided by
// the individual scrapers, which are not visible here. The per-field notes
// below are inferred from field names and should be confirmed.
type Source struct {
// Name of the source (YAML key "name").
Name string `yaml:"name"`
// Type of the source; one of the SourceType* constants.
Type SourceType `yaml:"type"`
// Presumably the entry URL for URL-based sources — confirm.
URL string `yaml:"url,omitempty"`
// Presumably a search query for search-style sources — confirm.
Query string `yaml:"query,omitempty"`
// Presumably caps the number of results for Query — confirm.
ResultLimit int `yaml:"result_limit,omitempty"`
// Presumably restricts scraping/search to these domains — confirm.
Domains []string `yaml:"domains,omitempty"`
// Presumably the repository identifier for GitHub sources — confirm format.
Repo string `yaml:"repo,omitempty"`
// Presumably the branch to read for repository sources — confirm.
Branch string `yaml:"branch,omitempty"`
// Presumably a local path or a sub-path within a repository — confirm.
Path string `yaml:"path,omitempty"`
// Presumably inclusion patterns; pattern syntax is not shown here — confirm.
Include []string `yaml:"include,omitempty"`
// Presumably exclusion patterns; pattern syntax is not shown here — confirm.
Exclude []string `yaml:"exclude,omitempty"`
// Presumably a re-scrape schedule; the expected format is not shown — confirm.
Schedule string `yaml:"schedule,omitempty"`
}
// Document represents a scraped document.
//
// It is the element type returned by Scraper.Scrape. All fields carry JSON
// tags; only URL is tagged omitempty, so every other field is always present
// in the marshalled JSON (a nil Metadata map marshals as null).
//
// NOTE(review): the code that fills these fields is not visible here; the
// per-field notes below are inferred from names and should be confirmed.
type Document struct {
// Identifier of the document. Presumably produced by generateDocID from the
// document URL — confirm against the scrapers.
ID string `json:"id"`
// Presumably the name of the Source this document came from — confirm.
Source string `json:"source"`
// Plain string, not SourceType. Presumably the source type's raw value — confirm.
Type string `json:"type"`
// Title of the document.
Title string `json:"title"`
// Content of the document; the format (raw HTML, markdown, text) is not
// established here.
Content string `json:"content"`
// Optional URL; omitted from JSON when empty.
URL string `json:"url,omitempty"`
// Free-form key/value data attached to the document.
Metadata map[string]interface{} `json:"metadata"`
// Presumably a content hash used for change detection — confirm algorithm
// and encoding.
Hash string `json:"hash"`
// Presumably the time the document was scraped — confirm.
Timestamp time.Time `json:"timestamp"`
}
// Config holds scraper configuration.
//
// All fields carry YAML tags and none is omitempty. A *Config is passed to
// NewScraper, which forwards it to CreateScraper.
//
// NOTE(review): defaults and the exact way each scraper applies these values
// are not visible here; the per-field notes are inferred from names and types
// and should be confirmed.
type Config struct {
// Presumably the User-Agent header sent with HTTP requests — confirm.
UserAgent string `yaml:"user_agent"`
// Presumably the per-request timeout — confirm.
Timeout time.Duration `yaml:"timeout"`
// Presumably the number of retries after a failed request — confirm whether
// the first attempt is counted.
RetryCount int `yaml:"retry_count"`
// Presumably the wait between retries — confirm.
RetryDelay time.Duration `yaml:"retry_delay"`
// Presumably the maximum number of concurrent workers/requests — confirm.
Concurrency int `yaml:"concurrency"`
// A duration, not a rate. Presumably the minimum spacing between requests — confirm.
RateLimit time.Duration `yaml:"rate_limit"`
// Presumably the maximum link-follow depth when crawling — confirm.
MaxDepth int `yaml:"max_depth"`
// Presumably the directory used for cached data — confirm.
CacheDir string `yaml:"cache_dir"`
}
// Scraper defines the interface for document scrapers.
//
// NewScraper returns a value of this type. Both methods take a
// context.Context as their first argument and the Source to operate on.
type Scraper interface {
// Scrape fetches and parses documents from the source.
// It returns the resulting documents, or an error.
Scrape(ctx context.Context, source *Source) ([]*Document, error)
// DetectChanges checks if the source has changed since last scrape.
// lastHash is the hash value to compare against.
// NOTE(review): the results are unnamed; presumably the bool reports whether
// the source changed and the string is the newly computed hash — confirm
// against an implementation.
DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error)
}
// NewScraper creates a new scraper for the given source type using the registry.
//
// It is a thin wrapper: sourceType and config are forwarded unchanged to
// CreateScraper and its result is returned as-is.
// NOTE(review): CreateScraper is defined outside this view; whether it can
// return nil (for example for an unregistered sourceType) should be confirmed
// before callers rely on a non-nil result.
func NewScraper(sourceType SourceType, config *Config) Scraper {
return CreateScraper(sourceType, config)
}
// DetectSourceType determines the source type from a URL or path.
//
// The rules are evaluated in a fixed order and the first match wins:
//
//  1. input contains "github.com"                 -> SourceTypeGitHub
//  2. input contains a known documentation host   -> that host's type
//  3. input contains "hub.docker.com/mcp/"        -> SourceTypeMCPDocs
//  4. input ends in .json, .yaml or .yml (any letter case) and mentions
//     "openapi" or "swagger" (any letter case)    -> SourceTypeOpenAPI
//  5. input starts with "http://" or "https://"   -> SourceTypeWeb
//  6. anything else, including the empty string   -> SourceTypeLocal
//
// The result is deterministic: when an input mentions more than one known
// documentation host, the host listed first in the table below is used.
func DetectSourceType(input string) SourceType {
	// Check for GitHub repositories.
	if strings.Contains(input, "github.com") {
		return SourceTypeGitHub
	}

	// Check for known documentation hosts. This is an ordered slice rather
	// than a map because Go randomizes map iteration order, which would make
	// the result vary between calls for inputs that mention two hosts.
	docsHosts := []struct {
		host       string
		sourceType SourceType
	}{
		{"pkg.go.dev", SourceTypeGoDocs},
		{"docs.rs", SourceTypeRustDocs},
		{"docs.python.org", SourceTypePythonDocs},
		{"docs.oracle.com", SourceTypeJavaDocs},
		{"docs.spring.io", SourceTypeSpringDocs},
		{"typescriptlang.org", SourceTypeTSDocs},
		{"react.dev", SourceTypeReactDocs},
		{"vuejs.org", SourceTypeVueDocs},
		{"nuxt.com", SourceTypeNuxtDocs},
		{"docs.docker.com", SourceTypeDockerDocs},
		{"developers.cloudflare.com", SourceTypeCloudflareDocs},
		{"docs.astro.build", SourceTypeAstroDocs},
	}
	for _, entry := range docsHosts {
		if strings.Contains(input, entry.host) {
			return entry.sourceType
		}
	}

	// MCP servers are hosted under Docker Hub paths.
	if strings.Contains(input, "hub.docker.com/mcp/") {
		return SourceTypeMCPDocs
	}

	// Check for OpenAPI specs. Lowercase once so the extension test and the
	// keyword test agree on case handling (e.g. "Swagger.JSON").
	lower := strings.ToLower(input)
	isSpecFile := strings.HasSuffix(lower, ".json") ||
		strings.HasSuffix(lower, ".yaml") ||
		strings.HasSuffix(lower, ".yml")
	if isSpecFile && (strings.Contains(lower, "openapi") || strings.Contains(lower, "swagger")) {
		return SourceTypeOpenAPI
	}

	// Check for web URLs.
	if strings.HasPrefix(input, "http://") || strings.HasPrefix(input, "https://") {
		return SourceTypeWeb
	}

	// Default to local.
	return SourceTypeLocal
}
// generateDocID derives a document ID from urlStr.
//
// The ID is the lowercase hex encoding of the first 12 bytes of the SHA-256
// digest of urlStr, i.e. always 24 characters. The same input always yields
// the same ID; the empty string is a valid input.
func generateDocID(urlStr string) string {
	const idBytes = 12 // 12 digest bytes -> 24 hex characters

	hasher := sha256.New()
	hasher.Write([]byte(urlStr)) // hash.Hash.Write never returns an error
	digest := hasher.Sum(nil)

	return hex.EncodeToString(digest[:idBytes])
}