package cloudflaredocs import ( "strings" "time" "github.com/PuerkitoBio/goquery" "github.com/yourorg/devour/pkg/parserutil" ) type Parser struct { baseURL string } func NewParser() *Parser { return &Parser{ baseURL: "https://developers.cloudflare.com", } } func (p *Parser) ParsePage(html string, docURL string) (*Page, error) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { return nil, err } page := &Page{ URL: docURL, FetchedAt: time.Now(), } page.Title = p.extractTitle(doc) page.Description = p.extractDescription(doc) page.Product = p.extractProduct(doc) page.Content = p.extractContent(doc) page.Sections = p.extractSections(doc, docURL) page.CodeBlocks = p.extractCodeBlocks(doc) page.APIs = p.extractAPIs(doc, docURL) return page, nil } func (p *Parser) ParseSidebar(html string) ([]*Section, error) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { return nil, err } var sections []*Section doc.Find(".sidebar a, nav a, [data-sidebar] a, .sl-sidebar a").Each(func(_ int, s *goquery.Selection) { section := &Section{} section.Title = strings.TrimSpace(s.Text()) if href, exists := s.Attr("href"); exists { section.DocURL = resolveURL(p.baseURL, href) } if section.Title != "" && section.DocURL != "" { sections = append(sections, section) } }) return sections, nil } func (p *Parser) extractTitle(doc *goquery.Document) string { title := doc.Find("h1").First().Text() title = strings.TrimSpace(title) if title == "" { title = doc.Find("title").First().Text() title = strings.TrimSpace(title) title = strings.TrimSuffix(title, " | Cloudflare Docs") title = strings.TrimSpace(title) } return title } func (p *Parser) extractDescription(doc *goquery.Document) string { desc := doc.Find("meta[name='description']").AttrOr("content", "") if desc != "" { return desc } desc = doc.Find("meta[property='og:description']").AttrOr("content", "") if desc != "" { return desc } return "" } func (p *Parser) extractProduct(doc *goquery.Document) string { product := doc.Find(".product-name, [data-product], .breadcrumb a:first-child").First().Text() return strings.TrimSpace(product) } func (p *Parser) extractContent(doc *goquery.Document) string { content := doc.Find("article, main, .content, .sl-markdown-content").First() if content.Length() == 0 { content = doc.Find("body") } return strings.TrimSpace(content.Text()) } func (p *Parser) extractSections(doc *goquery.Document, docURL string) []*Section { var sections []*Section doc.Find("h1, h2, h3, h4").Each(func(_ int, s *goquery.Selection) { section := &Section{} section.Title = strings.TrimSpace(s.Text()) section.ID = strings.TrimSpace(s.AttrOr("id", "")) if section.ID == "" { section.ID = strings.TrimSpace(s.AttrOr("data-anchor", "")) } if section.ID == "" { if href, exists := s.Find("a[href^='#']").First().Attr("href"); exists { section.ID = strings.TrimPrefix(strings.TrimSpace(href), "#") } } section.DocURL = docURL if section.ID != "" { section.DocURL = docURL + "#" + section.ID } if section.Title != "" { sections = append(sections, section) } }) return sections } func (p *Parser) extractCodeBlocks(doc *goquery.Document) []*CodeBlock { var blocks []*CodeBlock doc.Find("pre code, pre").Each(func(_ int, s *goquery.Selection) { block := &CodeBlock{} if classes, exists := s.Attr("class"); exists { parts := strings.Split(classes, " ") for _, part := range parts { if strings.HasPrefix(part, "language-") { block.Language = strings.TrimPrefix(part, "language-") break } } } block.Code = strings.TrimSpace(s.Text()) if block.Code != "" { blocks = append(blocks, block) } }) return blocks } func (p *Parser) extractAPIs(doc *goquery.Document, docURL string) []*API { var apis []*API apiMethods := []string{"GET", "POST", "PUT", "DELETE", "PATCH"} doc.Find("pre code, code, .api-endpoint").Each(func(_ int, s *goquery.Selection) { text := strings.TrimSpace(s.Text()) for _, method := range apiMethods { if strings.HasPrefix(text, method+" ") || strings.HasPrefix(text, method+"\t") { api := &API{ Method: method, DocURL: docURL, } parts := strings.Fields(text) if len(parts) >= 2 { api.Endpoint = parts[1] api.Name = parts[1] } if api.Endpoint != "" { apis = append(apis, api) } break } } }) return apis } func resolveURL(base string, href string) string { return parserutil.ResolveURL(base, href) }