// Mirror of https://github.com/Dvorinka/Devour.git
// Synced 2026-06-04 12:33:04 +00:00.
// 255 lines, 6.3 KiB, Go.
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/yourorg/devour/pkg/javadocs"
|
|
)
|
|
|
|
type JavaDocsScraper struct {
|
|
config *Config
|
|
parser *javadocs.Parser
|
|
client *http.Client
|
|
}
|
|
|
|
func NewJavaDocsScraper(config *Config) *JavaDocsScraper {
|
|
return &JavaDocsScraper{
|
|
config: config,
|
|
parser: javadocs.NewParser(),
|
|
client: &http.Client{
|
|
Timeout: config.Timeout,
|
|
},
|
|
}
|
|
}
|
|
|
|
func (s *JavaDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
|
var documents []*Document
|
|
|
|
if source.URL == "" {
|
|
return nil, fmt.Errorf("URL is required for Java docs scraper")
|
|
}
|
|
|
|
html, err := s.fetchPage(ctx, source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
|
}
|
|
|
|
pkg, err := s.parser.ParsePackagePage(html, source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse package: %w", err)
|
|
}
|
|
|
|
mainDoc := s.packageToDocument(pkg, source.Name)
|
|
documents = append(documents, mainDoc)
|
|
|
|
for _, class := range pkg.Classes {
|
|
doc := s.classToDocument(class, pkg, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, iface := range pkg.Interfaces {
|
|
doc := s.interfaceToDocument(iface, pkg, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, enum := range pkg.Enums {
|
|
doc := s.enumToDocument(enum, pkg, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, exc := range pkg.Exceptions {
|
|
doc := s.exceptionToDocument(exc, pkg, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
return documents, nil
|
|
}
|
|
|
|
func (s *JavaDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
|
html, err := s.fetchPage(ctx, source.URL)
|
|
if err != nil {
|
|
return false, "", err
|
|
}
|
|
|
|
hash := s.generateHash(html)
|
|
changed := hash != lastHash
|
|
|
|
return changed, hash, nil
|
|
}
|
|
|
|
func (s *JavaDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
|
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
req.Header.Set("User-Agent", s.config.UserAgent)
|
|
|
|
resp, err := s.client.Do(req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return string(body), nil
|
|
}
|
|
|
|
func (s *JavaDocsScraper) generateHash(content string) string {
|
|
hash := sha256.Sum256([]byte(content))
|
|
return hex.EncodeToString(hash[:])
|
|
}
|
|
|
|
func (s *JavaDocsScraper) packageToDocument(pkg *javadocs.Package, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# Package %s\n\n", pkg.Name)
|
|
fmt.Fprintf(&content, "%s\n", pkg.Doc)
|
|
|
|
metadata := map[string]interface{}{
|
|
"package": pkg.Name,
|
|
"doc_url": pkg.DocURL,
|
|
"doc_type": "java-package",
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(pkg.DocURL),
|
|
Source: sourceName,
|
|
Type: "java-package",
|
|
Title: pkg.Name,
|
|
Content: content.String(),
|
|
URL: pkg.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *JavaDocsScraper) classToDocument(class *javadocs.Class, pkg *javadocs.Package, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s\n\n", class.QualifiedName)
|
|
fmt.Fprintf(&content, "Kind: %s\n\n", class.Kind)
|
|
fmt.Fprintf(&content, "%s\n", class.Doc)
|
|
|
|
if len(class.Methods) > 0 {
|
|
fmt.Fprintf(&content, "\n## Methods\n")
|
|
for _, m := range class.Methods {
|
|
fmt.Fprintf(&content, "- `%s`\n", m.Signature)
|
|
}
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"package": pkg.Name,
|
|
"qualified_name": class.QualifiedName,
|
|
"kind": string(class.Kind),
|
|
"doc_url": class.DocURL,
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(class.DocURL),
|
|
Source: sourceName,
|
|
Type: "java-class",
|
|
Title: class.QualifiedName,
|
|
Content: content.String(),
|
|
URL: class.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *JavaDocsScraper) interfaceToDocument(iface *javadocs.Class, pkg *javadocs.Package, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s (interface)\n\n", iface.QualifiedName)
|
|
fmt.Fprintf(&content, "%s\n", iface.Doc)
|
|
|
|
metadata := map[string]interface{}{
|
|
"package": pkg.Name,
|
|
"qualified_name": iface.QualifiedName,
|
|
"kind": "interface",
|
|
"doc_url": iface.DocURL,
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(iface.DocURL),
|
|
Source: sourceName,
|
|
Type: "java-interface",
|
|
Title: iface.QualifiedName,
|
|
Content: content.String(),
|
|
URL: iface.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *JavaDocsScraper) enumToDocument(enum *javadocs.Enum, pkg *javadocs.Package, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s (enum)\n\n", enum.QualifiedName)
|
|
fmt.Fprintf(&content, "%s\n", enum.Doc)
|
|
|
|
if len(enum.Constants) > 0 {
|
|
fmt.Fprintf(&content, "\n## Constants\n")
|
|
for _, c := range enum.Constants {
|
|
fmt.Fprintf(&content, "- `%s`\n", c.Name)
|
|
}
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"package": pkg.Name,
|
|
"qualified_name": enum.QualifiedName,
|
|
"kind": "enum",
|
|
"doc_url": enum.DocURL,
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(enum.DocURL),
|
|
Source: sourceName,
|
|
Type: "java-enum",
|
|
Title: enum.QualifiedName,
|
|
Content: content.String(),
|
|
URL: enum.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *JavaDocsScraper) exceptionToDocument(exc *javadocs.Class, pkg *javadocs.Package, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s (exception)\n\n", exc.QualifiedName)
|
|
fmt.Fprintf(&content, "%s\n", exc.Doc)
|
|
|
|
metadata := map[string]interface{}{
|
|
"package": pkg.Name,
|
|
"qualified_name": exc.QualifiedName,
|
|
"kind": "exception",
|
|
"doc_url": exc.DocURL,
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(exc.DocURL),
|
|
Source: sourceName,
|
|
Type: "java-exception",
|
|
Title: exc.QualifiedName,
|
|
Content: content.String(),
|
|
URL: exc.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|