package scraper import ( "context" "crypto/sha256" "encoding/hex" "fmt" "net/http" "strings" "time" "github.com/yourorg/devour/pkg/astrodocs" ) type AstroDocsScraper struct { config *Config parser *astrodocs.Parser client *http.Client } func NewAstroDocsScraper(config *Config) *AstroDocsScraper { return &AstroDocsScraper{ config: config, parser: astrodocs.NewParser(), client: &http.Client{ Timeout: config.Timeout, }, } } func (s *AstroDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) { var documents []*Document if source.URL == "" { return nil, fmt.Errorf("URL is required for Astro docs scraper") } html, err := s.fetchPage(ctx, source.URL) if err != nil { return nil, fmt.Errorf("failed to fetch page: %w", err) } page, err := s.parser.ParsePage(html, source.URL) if err != nil { return nil, fmt.Errorf("failed to parse Astro docs page: %w", err) } mainDoc := s.pageToDocument(page, source.Name) documents = append(documents, mainDoc) for _, section := range page.Sections { doc := s.sectionToDocument(section, page, source.Name) documents = append(documents, doc) } return documents, nil } func (s *AstroDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) { html, err := s.fetchPage(ctx, source.URL) if err != nil { return false, "", err } hash := s.generateHash(html) changed := hash != lastHash return changed, hash, nil } func (s *AstroDocsScraper) fetchPage(ctx context.Context, url string) (string, error) { return fetchExternalPage(ctx, s.client, s.config.UserAgent, url) } func (s *AstroDocsScraper) generateHash(content string) string { hash := sha256.Sum256([]byte(content)) return hex.EncodeToString(hash[:]) } func (s *AstroDocsScraper) pageToDocument(page *astrodocs.Page, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# %s\n\n", page.Title) fmt.Fprintf(&content, "%s\n", page.Description) if len(page.CodeBlocks) > 0 { fmt.Fprintf(&content, "\n## Code Examples\n") for _, cb := range page.CodeBlocks { fmt.Fprintf(&content, "\n```%s\n%s\n```\n", cb.Language, cb.Code) } } metadata := map[string]interface{}{ "title": page.Title, "doc_url": page.URL, "doc_type": "astro-docs", } return &Document{ ID: generateDocID(page.URL), Source: sourceName, Type: "astro-docs", Title: page.Title, Content: content.String(), URL: page.URL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } } func (s *AstroDocsScraper) sectionToDocument(section *astrodocs.Section, page *astrodocs.Page, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# %s\n\n", section.Title) fmt.Fprintf(&content, "%s\n", section.Content) metadata := map[string]interface{}{ "page_title": page.Title, "section_id": section.ID, "doc_url": section.DocURL, "doc_type": "astro-section", } return &Document{ ID: generateDocID(section.DocURL), Source: sourceName, Type: "astro-section", Title: fmt.Sprintf("%s - %s", page.Title, section.Title), Content: content.String(), URL: section.DocURL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } }