package scraper

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"net/http"
	"strings"
	"time"

	"github.com/yourorg/devour/pkg/javadocs"
)

// JavaDocsScraper fetches a Javadoc package page and converts the package and
// the classes, interfaces, enums, and exceptions listed on it into Documents.
type JavaDocsScraper struct {
	config *Config
	parser *javadocs.Parser
	client *http.Client
}

// NewJavaDocsScraper returns a scraper whose HTTP client uses config.Timeout.
// config must be non-nil because it is dereferenced here and on every fetch.
func NewJavaDocsScraper(config *Config) *JavaDocsScraper {
	return &JavaDocsScraper{
		config: config,
		parser: javadocs.NewParser(),
		client: &http.Client{
			Timeout: config.Timeout,
		},
	}
}

// Scrape downloads the package page at source.URL, parses it, and returns one
// Document for the package followed by one Document per class, interface,
// enum, and exception (in that order). It returns an error when the URL is
// empty, the page cannot be fetched, or the page cannot be parsed.
func (s *JavaDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source.URL == "" {
		return nil, fmt.Errorf("URL is required for Java docs scraper")
	}

	html, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", err)
	}

	pkg, err := s.parser.ParsePackagePage(html, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to parse package: %w", err)
	}

	// The final length is known up front: the package itself plus every member.
	total := 1 + len(pkg.Classes) + len(pkg.Interfaces) + len(pkg.Enums) + len(pkg.Exceptions)
	documents := make([]*Document, 0, total)

	documents = append(documents, s.packageToDocument(pkg, source.Name))
	for _, class := range pkg.Classes {
		documents = append(documents, s.classToDocument(class, pkg, source.Name))
	}
	for _, iface := range pkg.Interfaces {
		documents = append(documents, s.interfaceToDocument(iface, pkg, source.Name))
	}
	for _, enum := range pkg.Enums {
		documents = append(documents, s.enumToDocument(enum, pkg, source.Name))
	}
	for _, exc := range pkg.Exceptions {
		documents = append(documents, s.exceptionToDocument(exc, pkg, source.Name))
	}

	return documents, nil
}

// DetectChanges fetches the page at source.URL and compares the SHA-256 of
// its raw HTML with lastHash. It returns whether the hash differs, the new
// hash, and any error. The hash is computed over the fetched page text, not
// over the parsed documents.
func (s *JavaDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	// Validate and wrap errors the same way Scrape does so both entry points
	// report a missing URL or a failed fetch consistently.
	if source.URL == "" {
		return false, "", fmt.Errorf("URL is required for Java docs scraper")
	}

	html, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return false, "", fmt.Errorf("failed to fetch page: %w", err)
	}

	hash := s.generateHash(html)
	return hash != lastHash, hash, nil
}

// fetchPage retrieves url through the shared fetchExternalPage helper using
// this scraper's HTTP client and configured user agent.
func (s *JavaDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}

// generateHash returns the lowercase hex-encoded SHA-256 digest of content.
func (s *JavaDocsScraper) generateHash(content string) string {
	hash := sha256.Sum256([]byte(content))
	return hex.EncodeToString(hash[:])
}

// packageToDocument renders pkg as a "java-package" Document. The content is
// a Markdown heading with the package name followed by the package doc text;
// Hash is the digest of that content only (metadata is not hashed).
func (s *JavaDocsScraper) packageToDocument(pkg *javadocs.Package, sourceName string) *Document {
	var content strings.Builder
	fmt.Fprintf(&content, "# Package %s\n\n", pkg.Name)
	fmt.Fprintf(&content, "%s\n", pkg.Doc)

	metadata := map[string]interface{}{
		"package":  pkg.Name,
		"doc_url":  pkg.DocURL,
		"doc_type": "java-package",
	}

	return &Document{
		ID:        generateDocID(pkg.DocURL),
		Source:    sourceName,
		Type:      "java-package",
		Title:     pkg.Name,
		Content:   content.String(),
		URL:       pkg.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(content.String()),
		Timestamp: time.Now(),
	}
}

// classToDocument renders class as a "java-class" Document. The content holds
// the qualified name, the kind, the doc text, and, when the class has any
// methods, a bullet list of their signatures.
func (s *JavaDocsScraper) classToDocument(class *javadocs.Class, pkg *javadocs.Package, sourceName string) *Document {
	var content strings.Builder
	fmt.Fprintf(&content, "# %s\n\n", class.QualifiedName)
	fmt.Fprintf(&content, "Kind: %s\n\n", class.Kind)
	fmt.Fprintf(&content, "%s\n", class.Doc)

	if len(class.Methods) > 0 {
		content.WriteString("\n## Methods\n")
		for _, m := range class.Methods {
			fmt.Fprintf(&content, "- `%s`\n", m.Signature)
		}
	}

	metadata := map[string]interface{}{
		"package":        pkg.Name,
		"qualified_name": class.QualifiedName,
		"kind":           string(class.Kind),
		"doc_url":        class.DocURL,
	}

	return &Document{
		ID:        generateDocID(class.DocURL),
		Source:    sourceName,
		Type:      "java-class",
		Title:     class.QualifiedName,
		Content:   content.String(),
		URL:       class.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(content.String()),
		Timestamp: time.Now(),
	}
}

// interfaceToDocument renders iface as a "java-interface" Document containing
// its qualified name and doc text. The metadata kind is fixed to "interface".
func (s *JavaDocsScraper) interfaceToDocument(iface *javadocs.Class, pkg *javadocs.Package, sourceName string) *Document {
	var content strings.Builder
	fmt.Fprintf(&content, "# %s (interface)\n\n", iface.QualifiedName)
	fmt.Fprintf(&content, "%s\n", iface.Doc)

	metadata := map[string]interface{}{
		"package":        pkg.Name,
		"qualified_name": iface.QualifiedName,
		"kind":           "interface",
		"doc_url":        iface.DocURL,
	}

	return &Document{
		ID:        generateDocID(iface.DocURL),
		Source:    sourceName,
		Type:      "java-interface",
		Title:     iface.QualifiedName,
		Content:   content.String(),
		URL:       iface.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(content.String()),
		Timestamp: time.Now(),
	}
}

// enumToDocument renders enum as a "java-enum" Document containing its
// qualified name, doc text, and, when it declares any constants, a bullet
// list of the constant names.
func (s *JavaDocsScraper) enumToDocument(enum *javadocs.Enum, pkg *javadocs.Package, sourceName string) *Document {
	var content strings.Builder
	fmt.Fprintf(&content, "# %s (enum)\n\n", enum.QualifiedName)
	fmt.Fprintf(&content, "%s\n", enum.Doc)

	if len(enum.Constants) > 0 {
		content.WriteString("\n## Constants\n")
		for _, c := range enum.Constants {
			fmt.Fprintf(&content, "- `%s`\n", c.Name)
		}
	}

	metadata := map[string]interface{}{
		"package":        pkg.Name,
		"qualified_name": enum.QualifiedName,
		"kind":           "enum",
		"doc_url":        enum.DocURL,
	}

	return &Document{
		ID:        generateDocID(enum.DocURL),
		Source:    sourceName,
		Type:      "java-enum",
		Title:     enum.QualifiedName,
		Content:   content.String(),
		URL:       enum.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(content.String()),
		Timestamp: time.Now(),
	}
}

// exceptionToDocument renders exc as a "java-exception" Document containing
// its qualified name and doc text. The metadata kind is fixed to "exception".
func (s *JavaDocsScraper) exceptionToDocument(exc *javadocs.Class, pkg *javadocs.Package, sourceName string) *Document {
	var content strings.Builder
	fmt.Fprintf(&content, "# %s (exception)\n\n", exc.QualifiedName)
	fmt.Fprintf(&content, "%s\n", exc.Doc)

	metadata := map[string]interface{}{
		"package":        pkg.Name,
		"qualified_name": exc.QualifiedName,
		"kind":           "exception",
		"doc_url":        exc.DocURL,
	}

	return &Document{
		ID:        generateDocID(exc.DocURL),
		Source:    sourceName,
		Type:      "java-exception",
		Title:     exc.QualifiedName,
		Content:   content.String(),
		URL:       exc.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(content.String()),
		Timestamp: time.Now(),
	}
}