mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
update
This commit is contained in:
+221
@@ -0,0 +1,221 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/springdocs"
|
||||
)
|
||||
|
||||
type SpringDocsScraper struct {
|
||||
config *Config
|
||||
parser *springdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewSpringDocsScraper(config *Config) *SpringDocsScraper {
|
||||
return &SpringDocsScraper{
|
||||
config: config,
|
||||
parser: springdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Spring docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
module, err := s.parser.ParseModulePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse module: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.moduleToDocument(module, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, class := range module.Classes {
|
||||
doc := s.classToDocument(class, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, prop := range module.Properties {
|
||||
doc := s.propertyToDocument(prop, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, guide := range module.Guides {
|
||||
doc := s.guideToDocument(guide, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) moduleToDocument(module *springdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", module.Name)
|
||||
fmt.Fprintf(&content, "%s\n", module.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"version": module.Version,
|
||||
"doc_url": module.DocURL,
|
||||
"doc_type": "spring-module",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(module.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "spring-module",
|
||||
Title: module.Name,
|
||||
Content: content.String(),
|
||||
URL: module.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) classToDocument(class *springdocs.Class, module *springdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", class.QualifiedName)
|
||||
fmt.Fprintf(&content, "%s\n", class.Doc)
|
||||
|
||||
if len(class.Methods) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Methods\n")
|
||||
for _, m := range class.Methods {
|
||||
fmt.Fprintf(&content, "- `%s`\n", m.Signature)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"qualified_name": class.QualifiedName,
|
||||
"kind": class.Kind,
|
||||
"doc_url": class.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(class.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "spring-class",
|
||||
Title: class.QualifiedName,
|
||||
Content: content.String(),
|
||||
URL: class.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) propertyToDocument(prop *springdocs.Property, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", prop.Name)
|
||||
fmt.Fprintf(&content, "Type: %s\n", prop.Type)
|
||||
if prop.Default != "" {
|
||||
fmt.Fprintf(&content, "Default: `%s`\n", prop.Default)
|
||||
}
|
||||
fmt.Fprintf(&content, "\n%s\n", prop.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"property": prop.Name,
|
||||
"type": prop.Type,
|
||||
"default": prop.Default,
|
||||
"doc_url": prop.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(prop.Name),
|
||||
Source: sourceName,
|
||||
Type: "spring-property",
|
||||
Title: prop.Name,
|
||||
Content: content.String(),
|
||||
URL: prop.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) guideToDocument(guide *springdocs.Guide, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", guide.Title)
|
||||
fmt.Fprintf(&content, "%s\n", guide.Description)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"title": guide.Title,
|
||||
"doc_url": guide.DocURL,
|
||||
"level": guide.Level,
|
||||
"doc_type": "spring-guide",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(guide.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "spring-guide",
|
||||
Title: guide.Title,
|
||||
Content: content.String(),
|
||||
URL: guide.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user