// Package astrodocs extracts structured data (title, description, text
// content, heading sections and code blocks) from HTML documentation pages.
package astrodocs

import (
	"fmt"
	"net/url"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// languageClassPrefix is the CSS class prefix that carries a code block's
// language, e.g. "language-js".
const languageClassPrefix = "language-"

// Parser turns raw HTML into Page, Section and CodeBlock values (those types
// are declared elsewhere in this package).
type Parser struct {
	// baseURL is the absolute URL that relative sidebar links are resolved
	// against.
	baseURL string
}

// NewParser returns a Parser whose base URL is https://docs.astro.build.
func NewParser() *Parser {
	return &Parser{baseURL: "https://docs.astro.build"}
}

// ParsePage parses html and returns a Page describing it. docURL is stored as
// the page URL and is used as the prefix of each section's anchor URL;
// FetchedAt is set to the time of the call.
//
// The returned error wraps any failure reported by goquery while building the
// document.
func (p *Parser) ParsePage(html string, docURL string) (*Page, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, fmt.Errorf("parsing page %q: %w", docURL, err)
	}

	return &Page{
		URL:         docURL,
		FetchedAt:   time.Now(),
		Title:       p.extractTitle(doc),
		Description: p.extractDescription(doc),
		Content:     p.extractContent(doc),
		Sections:    p.extractSections(doc, docURL),
		CodeBlocks:  p.extractCodeBlocks(doc),
	}, nil
}

// ParseSidebar parses html and returns one Section per navigation link found
// under ".sidebar", "nav" or "[data-sidebar]". Each link's href is resolved
// against the parser's base URL. Links whose trimmed text or resolved URL is
// empty are skipped.
//
// The returned error wraps any failure reported by goquery while building the
// document.
func (p *Parser) ParseSidebar(html string) ([]*Section, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, fmt.Errorf("parsing sidebar: %w", err)
	}

	var sections []*Section
	doc.Find(".sidebar a, nav a, [data-sidebar] a").Each(func(_ int, s *goquery.Selection) {
		section := &Section{Title: strings.TrimSpace(s.Text())}
		if href, ok := s.Attr("href"); ok {
			section.DocURL = resolveURL(p.baseURL, href)
		}
		if section.Title != "" && section.DocURL != "" {
			sections = append(sections, section)
		}
	})
	return sections, nil
}

// extractTitle returns the trimmed text of the first <h1>. When that is empty
// it falls back to the <title> element, keeping only the part before the first
// " | " separator.
func (p *Parser) extractTitle(doc *goquery.Document) string {
	if title := strings.TrimSpace(doc.Find("h1").First().Text()); title != "" {
		return title
	}

	title := strings.TrimSpace(doc.Find("title").First().Text())
	// idx > 0 (not >= 0): a title that starts with the separator is left
	// untouched rather than being reduced to the empty string.
	if idx := strings.Index(title, " | "); idx > 0 {
		title = title[:idx]
	}
	return title
}

// extractDescription returns the content of the "description" meta tag, or of
// the "og:description" meta tag when the former is missing or empty.
func (p *Parser) extractDescription(doc *goquery.Document) string {
	if desc := doc.Find("meta[name='description']").AttrOr("content", ""); desc != "" {
		return desc
	}
	return doc.Find("meta[property='og:description']").AttrOr("content", "")
}

// extractContent returns the trimmed text of the first element matching
// "article, main, .content, .sl-markdown-content", or of <body> when nothing
// matches.
func (p *Parser) extractContent(doc *goquery.Document) string {
	content := doc.Find("article, main, .content, .sl-markdown-content").First()
	if content.Length() == 0 {
		content = doc.Find("body")
	}
	return strings.TrimSpace(content.Text())
}

// extractSections returns one Section for every h1, h2 and h3 heading whose
// trimmed text is non-empty. A heading with a non-empty id attribute gets that
// ID and a DocURL of the form "<docURL without fragment>#<id>"; any other
// heading gets docURL unchanged.
func (p *Parser) extractSections(doc *goquery.Document, docURL string) []*Section {
	// Strip a fragment already present on docURL so anchors never become
	// "page#old#new".
	anchorBase := docURL
	if i := strings.IndexByte(docURL, '#'); i >= 0 {
		anchorBase = docURL[:i]
	}

	var sections []*Section
	doc.Find("h1, h2, h3").Each(func(_ int, s *goquery.Selection) {
		title := strings.TrimSpace(s.Text())
		if title == "" {
			return
		}

		section := &Section{Title: title, DocURL: docURL}
		// An empty id cannot be linked to, so it is treated like a missing one.
		if id, ok := s.Attr("id"); ok && id != "" {
			section.ID = id
			section.DocURL = anchorBase + "#" + id
		}
		sections = append(sections, section)
	})
	return sections
}

// extractCodeBlocks returns one CodeBlock per <pre> element whose trimmed text
// is non-empty. The language is taken from the first "language-*" class found
// on, in order: the first <code> inside the <pre>, the <pre> itself, and the
// <pre>'s parent element. Language is left empty when none of them has one.
func (p *Parser) extractCodeBlocks(doc *goquery.Document) []*CodeBlock {
	var blocks []*CodeBlock
	// Select only <pre>: selecting "pre code, pre" would visit both the <pre>
	// and its nested <code> and report the same snippet twice.
	doc.Find("pre").Each(func(_ int, pre *goquery.Selection) {
		code := strings.TrimSpace(pre.Text())
		if code == "" {
			return
		}

		lang := languageFromClass(pre.Find("code").First())
		if lang == "" {
			lang = languageFromClass(pre)
		}
		if lang == "" {
			lang = languageFromClass(pre.Parent())
		}
		blocks = append(blocks, &CodeBlock{Language: lang, Code: code})
	})
	return blocks
}

// languageFromClass returns the suffix of the first "language-*" class on s,
// or "" when s has no class attribute or no such class.
func languageFromClass(s *goquery.Selection) string {
	classes, ok := s.Attr("class")
	if !ok {
		return ""
	}
	// Fields (rather than Split on " ") tolerates tabs, newlines and repeated
	// spaces between class names.
	for _, class := range strings.Fields(classes) {
		if strings.HasPrefix(class, languageClassPrefix) {
			return strings.TrimPrefix(class, languageClassPrefix)
		}
	}
	return ""
}

// resolveURL resolves href against base and returns the resulting URL string.
// An href that already has a scheme is returned unchanged, as is any href for
// which href or base fails to parse.
func resolveURL(base string, href string) string {
	ref, err := url.Parse(href)
	if err != nil {
		return href
	}
	// Check the parsed scheme instead of a textual "http" prefix, which would
	// misclassify relative links such as "httpie-guide".
	if ref.IsAbs() {
		return href
	}

	baseURL, err := url.Parse(base)
	if err != nil {
		return href
	}
	return baseURL.ResolveReference(ref).String()
}