mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
194 lines
4.8 KiB
Go
194 lines
4.8 KiB
Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/yourorg/devour/pkg/cloudflaredocs"
|
|
)
|
|
|
|
type CloudflareDocsScraper struct {
|
|
config *Config
|
|
parser *cloudflaredocs.Parser
|
|
client *http.Client
|
|
}
|
|
|
|
func NewCloudflareDocsScraper(config *Config) *CloudflareDocsScraper {
|
|
return &CloudflareDocsScraper{
|
|
config: config,
|
|
parser: cloudflaredocs.NewParser(),
|
|
client: &http.Client{
|
|
Timeout: config.Timeout,
|
|
},
|
|
}
|
|
}
|
|
|
|
func (s *CloudflareDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
|
var documents []*Document
|
|
|
|
if source.URL == "" {
|
|
return nil, fmt.Errorf("URL is required for Cloudflare docs scraper")
|
|
}
|
|
|
|
html, err := s.fetchPage(ctx, source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
|
}
|
|
|
|
page, err := s.parser.ParsePage(html, source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse Cloudflare docs page: %w", err)
|
|
}
|
|
|
|
mainDoc := s.pageToDocument(page, source.Name)
|
|
documents = append(documents, mainDoc)
|
|
|
|
for _, section := range page.Sections {
|
|
doc := s.sectionToDocument(section, page, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, api := range page.APIs {
|
|
doc := s.apiToDocument(api, page, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
return documents, nil
|
|
}
|
|
|
|
func (s *CloudflareDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
|
html, err := s.fetchPage(ctx, source.URL)
|
|
if err != nil {
|
|
return false, "", err
|
|
}
|
|
|
|
hash := s.generateHash(html)
|
|
changed := hash != lastHash
|
|
|
|
return changed, hash, nil
|
|
}
|
|
|
|
func (s *CloudflareDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
|
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
req.Header.Set("User-Agent", s.config.UserAgent)
|
|
|
|
resp, err := s.client.Do(req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return string(body), nil
|
|
}
|
|
|
|
func (s *CloudflareDocsScraper) generateHash(content string) string {
|
|
hash := sha256.Sum256([]byte(content))
|
|
return hex.EncodeToString(hash[:])
|
|
}
|
|
|
|
func (s *CloudflareDocsScraper) pageToDocument(page *cloudflaredocs.Page, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s\n\n", page.Title)
|
|
if page.Product != "" {
|
|
fmt.Fprintf(&content, "Product: %s\n\n", page.Product)
|
|
}
|
|
fmt.Fprintf(&content, "%s\n", page.Description)
|
|
|
|
if len(page.CodeBlocks) > 0 {
|
|
fmt.Fprintf(&content, "\n## Code Examples\n")
|
|
for _, cb := range page.CodeBlocks {
|
|
fmt.Fprintf(&content, "\n```%s\n%s\n```\n", cb.Language, cb.Code)
|
|
}
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"title": page.Title,
|
|
"product": page.Product,
|
|
"doc_url": page.URL,
|
|
"doc_type": "cloudflare-docs",
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(page.URL),
|
|
Source: sourceName,
|
|
Type: "cloudflare-docs",
|
|
Title: page.Title,
|
|
Content: content.String(),
|
|
URL: page.URL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *CloudflareDocsScraper) sectionToDocument(section *cloudflaredocs.Section, page *cloudflaredocs.Page, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s\n\n", section.Title)
|
|
fmt.Fprintf(&content, "%s\n", section.Content)
|
|
|
|
metadata := map[string]interface{}{
|
|
"page_title": page.Title,
|
|
"product": page.Product,
|
|
"section_id": section.ID,
|
|
"doc_url": section.DocURL,
|
|
"doc_type": "cloudflare-section",
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(section.DocURL),
|
|
Source: sourceName,
|
|
Type: "cloudflare-section",
|
|
Title: fmt.Sprintf("%s - %s", page.Title, section.Title),
|
|
Content: content.String(),
|
|
URL: section.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *CloudflareDocsScraper) apiToDocument(api *cloudflaredocs.API, page *cloudflaredocs.Page, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s %s\n\n", api.Method, api.Endpoint)
|
|
fmt.Fprintf(&content, "%s\n", api.Description)
|
|
|
|
metadata := map[string]interface{}{
|
|
"page_title": page.Title,
|
|
"product": page.Product,
|
|
"method": api.Method,
|
|
"endpoint": api.Endpoint,
|
|
"doc_url": api.DocURL,
|
|
"doc_type": "cloudflare-api",
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(api.DocURL + "#" + api.Endpoint),
|
|
Source: sourceName,
|
|
Type: "cloudflare-api",
|
|
Title: fmt.Sprintf("%s %s", api.Method, api.Endpoint),
|
|
Content: content.String(),
|
|
URL: api.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|