This commit is contained in:
Tomas Dvorak
2026-02-22 15:41:27 +01:00
parent 0b88627e54
commit 409acd2e08
84 changed files with 65382 additions and 27475 deletions
+156
View File
@@ -0,0 +1,156 @@
package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"strings"
"time"
"github.com/yourorg/devour/pkg/astrodocs"
)
// AstroDocsScraper fetches a documentation page over HTTP, parses it with an
// astrodocs parser, and converts the result into Document values.
type AstroDocsScraper struct {
// config supplies the User-Agent header sent with each request and the
// timeout applied to the HTTP client.
config *Config
// parser turns fetched HTML into a parsed page (see Scrape).
parser *astrodocs.Parser
// client performs the HTTP requests issued by fetchPage.
client *http.Client
}
// NewAstroDocsScraper builds an AstroDocsScraper around config.
//
// The scraper receives a fresh astrodocs parser and its own HTTP client whose
// Timeout is copied from config.Timeout. config must be non-nil because its
// Timeout field is read here.
func NewAstroDocsScraper(config *Config) *AstroDocsScraper {
	docsParser := astrodocs.NewParser()
	httpClient := &http.Client{Timeout: config.Timeout}

	return &AstroDocsScraper{config: config, parser: docsParser, client: httpClient}
}
// Scrape downloads source.URL, parses it as an Astro docs page, and returns
// one Document for the page itself followed by one Document per entry of the
// parsed page's Sections, in iteration order.
//
// An error is returned when source.URL is empty, when the page cannot be
// fetched, or when parsing fails; in each of those cases the slice is nil.
// Fetch and parse errors are wrapped with %w.
func (s *AstroDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source.URL == "" {
		return nil, fmt.Errorf("URL is required for Astro docs scraper")
	}

	rawHTML, fetchErr := s.fetchPage(ctx, source.URL)
	if fetchErr != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", fetchErr)
	}

	parsed, parseErr := s.parser.ParsePage(rawHTML, source.URL)
	if parseErr != nil {
		return nil, fmt.Errorf("failed to parse Astro docs page: %w", parseErr)
	}

	// The page-level document always comes first; section documents follow.
	docs := []*Document{s.pageToDocument(parsed, source.Name)}
	for _, sec := range parsed.Sections {
		docs = append(docs, s.sectionToDocument(sec, parsed, source.Name))
	}
	return docs, nil
}
// DetectChanges reports whether the page at source.URL differs from the
// version identified by lastHash.
//
// It fetches the page, hashes the raw HTML with generateHash, and returns
// (changed, hash, nil), where changed is true when the new hash is not equal
// to lastHash.
//
// An empty source.URL is rejected up front with the same message Scrape uses,
// rather than being handed to the HTTP client. Fetch failures are returned
// wrapped with %w, so the underlying cause remains available to errors.Is and
// errors.As. On any error the returned hash is empty and changed is false.
func (s *AstroDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source.URL == "" {
		return false, "", fmt.Errorf("URL is required for Astro docs scraper")
	}

	html, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return false, "", fmt.Errorf("failed to fetch page: %w", err)
	}

	hash := s.generateHash(html)
	return hash != lastHash, hash, nil
}
// fetchPage issues a GET request for url and returns the response body as a
// string.
//
// The request is bound to ctx and carries the User-Agent taken from the
// scraper's config. Any status other than 200 OK is reported as an error of
// the form "HTTP <code>"; request construction, transport, and body-read
// errors are returned unchanged.
func (s *AstroDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	// Upper bound on how much of an error response is drained before giving up
	// on connection reuse.
	const maxDrainBytes = 64 << 10

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}
	req.Header.Set("User-Agent", s.config.UserAgent)

	resp, err := s.client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// The transport can only reuse a keep-alive connection when the body
		// has been read to EOF before Close. Drain a bounded amount so small
		// error pages do not cost a new connection on the next request. This
		// is best effort: a drain failure must not mask the status error.
		_, _ = io.CopyN(io.Discard, resp.Body, maxDrainBytes)
		return "", fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}
// generateHash returns the SHA-256 digest of content as a lowercase
// hexadecimal string.
func (s *AstroDocsScraper) generateHash(content string) string {
	hasher := sha256.New()
	hasher.Write([]byte(content)) // hash.Hash writes never return an error
	return hex.EncodeToString(hasher.Sum(nil))
}
// pageToDocument renders page as a single Markdown document of type
// "astro-docs".
//
// The content is a "# <title>" heading followed by the page description and,
// when the page has code blocks, a "## Code Examples" section holding one
// fenced block per example tagged with its language. Hash is generateHash of
// that rendered content, ID is derived from the page URL via generateDocID,
// and Timestamp is the time of the call.
func (s *AstroDocsScraper) pageToDocument(page *astrodocs.Page, sourceName string) *Document {
	var md strings.Builder
	fmt.Fprintf(&md, "# %s\n\n", page.Title)
	fmt.Fprintf(&md, "%s\n", page.Description)

	if len(page.CodeBlocks) != 0 {
		md.WriteString("\n## Code Examples\n")
		for _, block := range page.CodeBlocks {
			fmt.Fprintf(&md, "\n```%s\n%s\n```\n", block.Language, block.Code)
		}
	}

	// Render once; the same text is stored and hashed.
	rendered := md.String()

	return &Document{
		ID:      generateDocID(page.URL),
		Source:  sourceName,
		Type:    "astro-docs",
		Title:   page.Title,
		Content: rendered,
		URL:     page.URL,
		Metadata: map[string]interface{}{
			"title":    page.Title,
			"doc_url":  page.URL,
			"doc_type": "astro-docs",
		},
		Hash:      s.generateHash(rendered),
		Timestamp: time.Now(),
	}
}
// sectionToDocument renders one section of page as its own Markdown document
// of type "astro-section".
//
// The content is a "# <section title>" heading followed by the section
// content. The document title is "<page title> - <section title>", its URL
// and ID come from section.DocURL, Hash is generateHash of the rendered
// content, and Timestamp is the time of the call.
func (s *AstroDocsScraper) sectionToDocument(section *astrodocs.Section, page *astrodocs.Page, sourceName string) *Document {
	var md strings.Builder
	fmt.Fprintf(&md, "# %s\n\n", section.Title)
	fmt.Fprintf(&md, "%s\n", section.Content)

	// Render once; the same text is stored and hashed.
	rendered := md.String()
	fullTitle := fmt.Sprintf("%s - %s", page.Title, section.Title)

	return &Document{
		ID:      generateDocID(section.DocURL),
		Source:  sourceName,
		Type:    "astro-section",
		Title:   fullTitle,
		Content: rendered,
		URL:     section.DocURL,
		Metadata: map[string]interface{}{
			"page_title": page.Title,
			"section_id": section.ID,
			"doc_url":    section.DocURL,
			"doc_type":   "astro-section",
		},
		Hash:      s.generateHash(rendered),
		Timestamp: time.Now(),
	}
}