package scraper

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"

	"github.com/yourorg/devour/pkg/springdocs"
)

// SpringDocsScraper fetches a Spring documentation page over HTTP, parses it
// with a springdocs.Parser, and converts the parsed module together with its
// classes, properties and guides into Documents.
type SpringDocsScraper struct {
	config *Config
	parser *springdocs.Parser
	client *http.Client
}

// NewSpringDocsScraper returns a scraper whose HTTP client uses
// config.Timeout as its overall request timeout.
//
// config must be non-nil: its Timeout field is read here and its UserAgent
// field is read on every fetch.
func NewSpringDocsScraper(config *Config) *SpringDocsScraper {
	return &SpringDocsScraper{
		config: config,
		parser: springdocs.NewParser(),
		client: &http.Client{
			Timeout: config.Timeout,
		},
	}
}

// Scrape downloads source.URL, parses it as a Spring module page, and returns
// one Document for the module followed by one Document per class, property
// and guide, in that order.
//
// An error is returned when source is nil, when source.URL is empty, when the
// page cannot be fetched, or when the parser rejects the page. The returned
// slice is nil whenever the error is non-nil.
func (s *SpringDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	// Guard against a nil source so the caller gets an error, not a panic.
	if source == nil {
		return nil, fmt.Errorf("source is required for Spring docs scraper")
	}
	if source.URL == "" {
		return nil, fmt.Errorf("URL is required for Spring docs scraper")
	}

	html, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", err)
	}

	module, err := s.parser.ParseModulePage(html, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to parse module: %w", err)
	}

	// The final size is known up front: the module itself plus every class,
	// property and guide it lists.
	total := 1 + len(module.Classes) + len(module.Properties) + len(module.Guides)
	documents := make([]*Document, 0, total)

	documents = append(documents, s.moduleToDocument(module, source.Name))
	for _, class := range module.Classes {
		documents = append(documents, s.classToDocument(class, module, source.Name))
	}
	for _, prop := range module.Properties {
		documents = append(documents, s.propertyToDocument(prop, source.Name))
	}
	for _, guide := range module.Guides {
		documents = append(documents, s.guideToDocument(guide, source.Name))
	}

	return documents, nil
}

// DetectChanges downloads source.URL and compares the SHA-256 hex digest of
// the response body with lastHash.
//
// It returns whether the digests differ, the freshly computed digest, and any
// error. Because the digest is never empty, an empty lastHash always reports
// a change. The same source validation as Scrape is applied so that a missing
// URL is reported directly instead of as a transport error.
func (s *SpringDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("source is required for Spring docs scraper")
	}
	if source.URL == "" {
		return false, "", fmt.Errorf("URL is required for Spring docs scraper")
	}

	html, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		// Wrapped the same way Scrape does; %w keeps the cause inspectable.
		return false, "", fmt.Errorf("failed to fetch page: %w", err)
	}

	hash := s.generateHash(html)
	return hash != lastHash, hash, nil
}

// fetchPage issues a GET request for url, bound to ctx, with the configured
// User-Agent header, and returns the response body as a string.
//
// Any status other than 200 OK is reported as an error of the form "HTTP <code>".
func (s *SpringDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}
	req.Header.Set("User-Agent", s.config.UserAgent)

	resp, err := s.client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

// generateHash returns the lowercase hex encoding of the SHA-256 digest of
// content.
func (s *SpringDocsScraper) generateHash(content string) string {
	sum := sha256.Sum256([]byte(content))
	return hex.EncodeToString(sum[:])
}

// moduleToDocument renders a module as a "spring-module" Document whose
// content is a heading with the module name followed by the module's doc
// text. The document ID is derived from module.DocURL.
func (s *SpringDocsScraper) moduleToDocument(module *springdocs.Module, sourceName string) *Document {
	var content strings.Builder
	fmt.Fprintf(&content, "# %s\n\n", module.Name)
	fmt.Fprintf(&content, "%s\n", module.Doc)
	body := content.String()

	metadata := map[string]interface{}{
		"module":   module.Name,
		"version":  module.Version,
		"doc_url":  module.DocURL,
		"doc_type": "spring-module",
	}

	return &Document{
		ID:        generateDocID(module.DocURL),
		Source:    sourceName,
		Type:      "spring-module",
		Title:     module.Name,
		Content:   body,
		URL:       module.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}

// classToDocument renders a class as a "spring-class" Document. The content
// is a heading with the qualified name, the class doc text and, when the
// class has methods, a "Methods" section listing each method signature.
// module supplies only the module name recorded in the metadata.
func (s *SpringDocsScraper) classToDocument(class *springdocs.Class, module *springdocs.Module, sourceName string) *Document {
	var content strings.Builder
	fmt.Fprintf(&content, "# %s\n\n", class.QualifiedName)
	fmt.Fprintf(&content, "%s\n", class.Doc)
	if len(class.Methods) > 0 {
		content.WriteString("\n## Methods\n")
		for _, m := range class.Methods {
			fmt.Fprintf(&content, "- `%s`\n", m.Signature)
		}
	}
	body := content.String()

	metadata := map[string]interface{}{
		"module":         module.Name,
		"qualified_name": class.QualifiedName,
		"kind":           class.Kind,
		"doc_url":        class.DocURL,
	}

	return &Document{
		ID:        generateDocID(class.DocURL),
		Source:    sourceName,
		Type:      "spring-class",
		Title:     class.QualifiedName,
		Content:   body,
		URL:       class.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}

// propertyToDocument renders a configuration property as a "spring-property"
// Document containing its name, type, optional default value and doc text.
//
// Unlike the other converters, the document ID is derived from prop.Name
// rather than from the doc URL.
// NOTE(review): presumably because several properties share one DocURL;
// confirm before changing.
func (s *SpringDocsScraper) propertyToDocument(prop *springdocs.Property, sourceName string) *Document {
	var content strings.Builder
	fmt.Fprintf(&content, "# %s\n\n", prop.Name)
	fmt.Fprintf(&content, "Type: %s\n", prop.Type)
	if prop.Default != "" {
		fmt.Fprintf(&content, "Default: `%s`\n", prop.Default)
	}
	fmt.Fprintf(&content, "\n%s\n", prop.Doc)
	body := content.String()

	metadata := map[string]interface{}{
		"property": prop.Name,
		"type":     prop.Type,
		"default":  prop.Default,
		"doc_url":  prop.DocURL,
	}

	return &Document{
		ID:        generateDocID(prop.Name),
		Source:    sourceName,
		Type:      "spring-property",
		Title:     prop.Name,
		Content:   body,
		URL:       prop.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}

// guideToDocument renders a guide as a "spring-guide" Document whose content
// is a heading with the guide title followed by its description. The
// document ID is derived from guide.DocURL.
func (s *SpringDocsScraper) guideToDocument(guide *springdocs.Guide, sourceName string) *Document {
	var content strings.Builder
	fmt.Fprintf(&content, "# %s\n\n", guide.Title)
	fmt.Fprintf(&content, "%s\n", guide.Description)
	body := content.String()

	metadata := map[string]interface{}{
		"title":    guide.Title,
		"doc_url":  guide.DocURL,
		"level":    guide.Level,
		"doc_type": "spring-guide",
	}

	return &Document{
		ID:        generateDocID(guide.DocURL),
		Source:    sourceName,
		Type:      "spring-guide",
		Title:     guide.Title,
		Content:   body,
		URL:       guide.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}