package scraper import ( "context" "crypto/sha256" "encoding/hex" "fmt" "net/http" "strings" "time" "github.com/yourorg/devour/pkg/cloudflaredocs" ) type CloudflareDocsScraper struct { config *Config parser *cloudflaredocs.Parser client *http.Client } func NewCloudflareDocsScraper(config *Config) *CloudflareDocsScraper { return &CloudflareDocsScraper{ config: config, parser: cloudflaredocs.NewParser(), client: &http.Client{ Timeout: config.Timeout, }, } } func (s *CloudflareDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) { var documents []*Document if source.URL == "" { return nil, fmt.Errorf("URL is required for Cloudflare docs scraper") } html, err := s.fetchPage(ctx, source.URL) if err != nil { return nil, fmt.Errorf("failed to fetch page: %w", err) } page, err := s.parser.ParsePage(html, source.URL) if err != nil { return nil, fmt.Errorf("failed to parse Cloudflare docs page: %w", err) } mainDoc := s.pageToDocument(page, source.Name) documents = append(documents, mainDoc) for _, section := range page.Sections { doc := s.sectionToDocument(section, page, source.Name) documents = append(documents, doc) } for _, api := range page.APIs { doc := s.apiToDocument(api, page, source.Name) documents = append(documents, doc) } return documents, nil } func (s *CloudflareDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) { html, err := s.fetchPage(ctx, source.URL) if err != nil { return false, "", err } hash := s.generateHash(html) changed := hash != lastHash return changed, hash, nil } func (s *CloudflareDocsScraper) fetchPage(ctx context.Context, url string) (string, error) { return fetchExternalPage(ctx, s.client, s.config.UserAgent, url) } func (s *CloudflareDocsScraper) generateHash(content string) string { hash := sha256.Sum256([]byte(content)) return hex.EncodeToString(hash[:]) } func (s *CloudflareDocsScraper) pageToDocument(page *cloudflaredocs.Page, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# %s\n\n", page.Title) if page.Product != "" { fmt.Fprintf(&content, "Product: %s\n\n", page.Product) } fmt.Fprintf(&content, "%s\n", page.Description) if len(page.CodeBlocks) > 0 { fmt.Fprintf(&content, "\n## Code Examples\n") for _, cb := range page.CodeBlocks { fmt.Fprintf(&content, "\n```%s\n%s\n```\n", cb.Language, cb.Code) } } metadata := map[string]interface{}{ "title": page.Title, "product": page.Product, "doc_url": page.URL, "doc_type": "cloudflare-docs", } return &Document{ ID: generateDocID(page.URL), Source: sourceName, Type: "cloudflare-docs", Title: page.Title, Content: content.String(), URL: page.URL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } } func (s *CloudflareDocsScraper) sectionToDocument(section *cloudflaredocs.Section, page *cloudflaredocs.Page, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# %s\n\n", section.Title) fmt.Fprintf(&content, "%s\n", section.Content) metadata := map[string]interface{}{ "page_title": page.Title, "product": page.Product, "section_id": section.ID, "doc_url": section.DocURL, "doc_type": "cloudflare-section", } return &Document{ ID: generateDocID(section.DocURL), Source: sourceName, Type: "cloudflare-section", Title: fmt.Sprintf("%s - %s", page.Title, section.Title), Content: content.String(), URL: section.DocURL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } } func (s *CloudflareDocsScraper) apiToDocument(api *cloudflaredocs.API, page *cloudflaredocs.Page, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# %s %s\n\n", api.Method, api.Endpoint) fmt.Fprintf(&content, "%s\n", api.Description) metadata := map[string]interface{}{ "page_title": page.Title, "product": page.Product, "method": api.Method, "endpoint": api.Endpoint, "doc_url": api.DocURL, "doc_type": "cloudflare-api", } return &Document{ ID: generateDocID(api.DocURL + "#" + api.Endpoint), Source: sourceName, Type: "cloudflare-api", Title: fmt.Sprintf("%s %s", api.Method, api.Endpoint), Content: content.String(), URL: api.DocURL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } }