This commit is contained in:
Tomas Dvorak
2026-02-22 15:41:27 +01:00
parent 0b88627e54
commit 409acd2e08
84 changed files with 65382 additions and 27475 deletions
+49 -42
View File
@@ -3,6 +3,9 @@ package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"strings"
"time"
)
@@ -19,7 +22,6 @@ const (
SourceTypePythonDocs SourceType = "pythondocs"
SourceTypeJavaDocs SourceType = "javadocs"
SourceTypeSpringDocs SourceType = "springdocs"
SourceTypeSpringAIDocs SourceType = "springaidocs"
SourceTypeTSDocs SourceType = "tsdocs"
SourceTypeReactDocs SourceType = "reactdocs"
SourceTypeVueDocs SourceType = "vuedocs"
@@ -77,53 +79,58 @@ type Scraper interface {
DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error)
}
// NewScraper creates a new scraper for the given source type using the registry.
//
// Construction is delegated entirely to CreateScraper, which is defined
// elsewhere in this package. The result for a source type that has no
// registered scraper is whatever CreateScraper returns for it, so callers
// should keep checking the returned Scraper for nil before use.
func NewScraper(sourceType SourceType, config *Config) Scraper {
	// A hard-coded switch over every source type used to live here; because
	// each of its branches returned, the registry lookup below could never
	// run. The registry is now the single place a scraper is resolved.
	return CreateScraper(sourceType, config)
}
// DetectSourceType determines the source type from a URL or path.
func DetectSourceType(input string) SourceType {
// TODO: Implement detection logic
if len(input) > 4 && input[:4] == "http" {
// Check for GitHub repositories
if strings.Contains(input, "github.com") {
return SourceTypeGitHub
}
// Check for known documentation hosts
docsHosts := map[string]SourceType{
"pkg.go.dev": SourceTypeGoDocs,
"docs.rs": SourceTypeRustDocs,
"docs.python.org": SourceTypePythonDocs,
"docs.oracle.com": SourceTypeJavaDocs,
"docs.spring.io": SourceTypeSpringDocs,
"typescriptlang.org": SourceTypeTSDocs,
"react.dev": SourceTypeReactDocs,
"vuejs.org": SourceTypeVueDocs,
"nuxt.com": SourceTypeNuxtDocs,
"docs.docker.com": SourceTypeDockerDocs,
"developers.cloudflare.com": SourceTypeCloudflareDocs,
"docs.astro.build": SourceTypeAstroDocs,
}
for host, sourceType := range docsHosts {
if strings.Contains(input, host) {
return sourceType
}
}
// Check for OpenAPI specs
if strings.HasSuffix(input, ".json") || strings.HasSuffix(input, ".yaml") || strings.HasSuffix(input, ".yml") {
if strings.Contains(strings.ToLower(input), "openapi") || strings.Contains(strings.ToLower(input), "swagger") {
return SourceTypeOpenAPI
}
}
// Check for web URLs
if strings.HasPrefix(input, "http://") || strings.HasPrefix(input, "https://") {
return SourceTypeWeb
}
// Default to local
return SourceTypeLocal
}
// generateDocID derives a deterministic ID for a document from urlStr: the
// first 12 bytes of its SHA-256 digest, rendered as 24 lowercase hex characters.
func generateDocID(urlStr string) string {
	const idBytes = 12

	digest := sha256.Sum256([]byte(urlStr))

	// Two hex characters are produced per digest byte.
	var encoded [idBytes * 2]byte
	hex.Encode(encoded[:], digest[:idBytes])
	return string(encoded[:])
}