mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
+49
-42
@@ -3,6 +3,9 @@ package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -19,7 +22,6 @@ const (
|
||||
SourceTypePythonDocs SourceType = "pythondocs"
|
||||
SourceTypeJavaDocs SourceType = "javadocs"
|
||||
SourceTypeSpringDocs SourceType = "springdocs"
|
||||
SourceTypeSpringAIDocs SourceType = "springaidocs"
|
||||
SourceTypeTSDocs SourceType = "tsdocs"
|
||||
SourceTypeReactDocs SourceType = "reactdocs"
|
||||
SourceTypeVueDocs SourceType = "vuedocs"
|
||||
@@ -77,53 +79,58 @@ type Scraper interface {
|
||||
DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error)
|
||||
}
|
||||
|
||||
// NewScraper creates a new scraper for the given source type.
|
||||
// NewScraper creates a new scraper for the given source type using the registry.
|
||||
func NewScraper(sourceType SourceType, config *Config) Scraper {
|
||||
switch sourceType {
|
||||
case SourceTypeWeb:
|
||||
return NewWebScraper(config)
|
||||
case SourceTypeGitHub:
|
||||
return NewGitHubScraper(config)
|
||||
case SourceTypeOpenAPI:
|
||||
return NewOpenAPIScraper(config)
|
||||
case SourceTypeLocal:
|
||||
return NewLocalScraper(config)
|
||||
case SourceTypeGoDocs:
|
||||
return NewGoDocsScraper(config)
|
||||
case SourceTypeRustDocs:
|
||||
return NewRustDocsScraper(config)
|
||||
case SourceTypePythonDocs:
|
||||
return NewPythonDocsScraper(config)
|
||||
case SourceTypeJavaDocs:
|
||||
return NewJavaDocsScraper(config)
|
||||
case SourceTypeSpringDocs:
|
||||
return NewSpringDocsScraper(config)
|
||||
case SourceTypeTSDocs:
|
||||
return NewTSDocsScraper(config)
|
||||
case SourceTypeReactDocs:
|
||||
return NewReactDocsScraper(config)
|
||||
case SourceTypeVueDocs:
|
||||
return NewVueDocsScraper(config)
|
||||
case SourceTypeNuxtDocs:
|
||||
return NewNuxtDocsScraper(config)
|
||||
case SourceTypeMCPDocs:
|
||||
return NewMCPDocsScraper(config)
|
||||
case SourceTypeDockerDocs:
|
||||
return NewDockerDocsScraper(config)
|
||||
case SourceTypeCloudflareDocs:
|
||||
return NewCloudflareDocsScraper(config)
|
||||
case SourceTypeAstroDocs:
|
||||
return NewAstroDocsScraper(config)
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
return CreateScraper(sourceType, config)
|
||||
}
|
||||
|
||||
// DetectSourceType determines the source type from a URL or path.
|
||||
func DetectSourceType(input string) SourceType {
|
||||
// TODO: Implement detection logic
|
||||
if len(input) > 4 && input[:4] == "http" {
|
||||
// Check for GitHub repositories
|
||||
if strings.Contains(input, "github.com") {
|
||||
return SourceTypeGitHub
|
||||
}
|
||||
|
||||
// Check for known documentation hosts
|
||||
docsHosts := map[string]SourceType{
|
||||
"pkg.go.dev": SourceTypeGoDocs,
|
||||
"docs.rs": SourceTypeRustDocs,
|
||||
"docs.python.org": SourceTypePythonDocs,
|
||||
"docs.oracle.com": SourceTypeJavaDocs,
|
||||
"docs.spring.io": SourceTypeSpringDocs,
|
||||
"typescriptlang.org": SourceTypeTSDocs,
|
||||
"react.dev": SourceTypeReactDocs,
|
||||
"vuejs.org": SourceTypeVueDocs,
|
||||
"nuxt.com": SourceTypeNuxtDocs,
|
||||
"docs.docker.com": SourceTypeDockerDocs,
|
||||
"developers.cloudflare.com": SourceTypeCloudflareDocs,
|
||||
"docs.astro.build": SourceTypeAstroDocs,
|
||||
}
|
||||
|
||||
for host, sourceType := range docsHosts {
|
||||
if strings.Contains(input, host) {
|
||||
return sourceType
|
||||
}
|
||||
}
|
||||
|
||||
// Check for OpenAPI specs
|
||||
if strings.HasSuffix(input, ".json") || strings.HasSuffix(input, ".yaml") || strings.HasSuffix(input, ".yml") {
|
||||
if strings.Contains(strings.ToLower(input), "openapi") || strings.Contains(strings.ToLower(input), "swagger") {
|
||||
return SourceTypeOpenAPI
|
||||
}
|
||||
}
|
||||
|
||||
// Check for web URLs
|
||||
if strings.HasPrefix(input, "http://") || strings.HasPrefix(input, "https://") {
|
||||
return SourceTypeWeb
|
||||
}
|
||||
|
||||
// Default to local
|
||||
return SourceTypeLocal
|
||||
}
|
||||
|
||||
// generateDocID derives a deterministic ID for a document from its URL.
//
// The ID is the lowercase hex encoding of the first 12 bytes of the SHA-256
// digest of urlStr, so it is always 24 characters long and the same URL
// always yields the same ID.
func generateDocID(urlStr string) string {
	// Number of leading digest bytes kept in the ID.
	const idPrefixBytes = 12

	digest := sha256.Sum256([]byte(urlStr))
	encoded := make([]byte, hex.EncodedLen(idPrefixBytes))
	hex.Encode(encoded, digest[:idPrefixBytes])
	return string(encoded)
}
|
||||
|
||||
Reference in New Issue
Block a user