mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
first commit
This commit is contained in:
@@ -0,0 +1,207 @@
|
||||
package cloudflaredocs
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// Parser turns Cloudflare documentation HTML into Page and Section values.
type Parser struct {
	// baseURL is the origin that sidebar hrefs are resolved against.
	baseURL string
}
|
||||
|
||||
func NewParser() *Parser {
|
||||
return &Parser{
|
||||
baseURL: "https://developers.cloudflare.com",
|
||||
}
|
||||
}
|
||||
|
||||
func (p *Parser) ParsePage(html string, docURL string) (*Page, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
page := &Page{
|
||||
URL: docURL,
|
||||
FetchedAt: time.Now(),
|
||||
}
|
||||
|
||||
page.Title = p.extractTitle(doc)
|
||||
page.Description = p.extractDescription(doc)
|
||||
page.Product = p.extractProduct(doc)
|
||||
page.Content = p.extractContent(doc)
|
||||
page.Sections = p.extractSections(doc, docURL)
|
||||
page.CodeBlocks = p.extractCodeBlocks(doc)
|
||||
page.APIs = p.extractAPIs(doc, docURL)
|
||||
|
||||
return page, nil
|
||||
}
|
||||
|
||||
func (p *Parser) ParseSidebar(html string) ([]*Section, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var sections []*Section
|
||||
|
||||
doc.Find(".sidebar a, nav a, [data-sidebar] a, .sl-sidebar a").Each(func(_ int, s *goquery.Selection) {
|
||||
section := &Section{}
|
||||
|
||||
section.Title = strings.TrimSpace(s.Text())
|
||||
|
||||
if href, exists := s.Attr("href"); exists {
|
||||
section.DocURL = resolveURL(p.baseURL, href)
|
||||
}
|
||||
|
||||
if section.Title != "" && section.DocURL != "" {
|
||||
sections = append(sections, section)
|
||||
}
|
||||
})
|
||||
|
||||
return sections, nil
|
||||
}
|
||||
|
||||
func (p *Parser) extractTitle(doc *goquery.Document) string {
|
||||
title := doc.Find("h1").First().Text()
|
||||
title = strings.TrimSpace(title)
|
||||
|
||||
if title == "" {
|
||||
title = doc.Find("title").First().Text()
|
||||
title = strings.TrimSpace(title)
|
||||
title = strings.TrimSuffix(title, " | Cloudflare Docs")
|
||||
title = strings.TrimSpace(title)
|
||||
}
|
||||
|
||||
return title
|
||||
}
|
||||
|
||||
func (p *Parser) extractDescription(doc *goquery.Document) string {
|
||||
desc := doc.Find("meta[name='description']").AttrOr("content", "")
|
||||
if desc != "" {
|
||||
return desc
|
||||
}
|
||||
|
||||
desc = doc.Find("meta[property='og:description']").AttrOr("content", "")
|
||||
if desc != "" {
|
||||
return desc
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
func (p *Parser) extractProduct(doc *goquery.Document) string {
|
||||
product := doc.Find(".product-name, [data-product], .breadcrumb a:first-child").First().Text()
|
||||
return strings.TrimSpace(product)
|
||||
}
|
||||
|
||||
func (p *Parser) extractContent(doc *goquery.Document) string {
|
||||
content := doc.Find("article, main, .content, .sl-markdown-content").First()
|
||||
if content.Length() == 0 {
|
||||
content = doc.Find("body")
|
||||
}
|
||||
|
||||
return strings.TrimSpace(content.Text())
|
||||
}
|
||||
|
||||
func (p *Parser) extractSections(doc *goquery.Document, docURL string) []*Section {
|
||||
var sections []*Section
|
||||
|
||||
doc.Find("h1, h2, h3").Each(func(_ int, s *goquery.Selection) {
|
||||
section := &Section{}
|
||||
|
||||
section.Title = strings.TrimSpace(s.Text())
|
||||
|
||||
if id, exists := s.Attr("id"); exists {
|
||||
section.ID = id
|
||||
section.DocURL = docURL + "#" + id
|
||||
} else {
|
||||
section.DocURL = docURL
|
||||
}
|
||||
|
||||
if section.Title != "" {
|
||||
sections = append(sections, section)
|
||||
}
|
||||
})
|
||||
|
||||
return sections
|
||||
}
|
||||
|
||||
func (p *Parser) extractCodeBlocks(doc *goquery.Document) []*CodeBlock {
|
||||
var blocks []*CodeBlock
|
||||
|
||||
doc.Find("pre code, pre").Each(func(_ int, s *goquery.Selection) {
|
||||
block := &CodeBlock{}
|
||||
|
||||
if classes, exists := s.Attr("class"); exists {
|
||||
parts := strings.Split(classes, " ")
|
||||
for _, part := range parts {
|
||||
if strings.HasPrefix(part, "language-") {
|
||||
block.Language = strings.TrimPrefix(part, "language-")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
block.Code = strings.TrimSpace(s.Text())
|
||||
|
||||
if block.Code != "" {
|
||||
blocks = append(blocks, block)
|
||||
}
|
||||
})
|
||||
|
||||
return blocks
|
||||
}
|
||||
|
||||
func (p *Parser) extractAPIs(doc *goquery.Document, docURL string) []*API {
|
||||
var apis []*API
|
||||
|
||||
apiMethods := []string{"GET", "POST", "PUT", "DELETE", "PATCH"}
|
||||
|
||||
doc.Find("pre code, code, .api-endpoint").Each(func(_ int, s *goquery.Selection) {
|
||||
text := strings.TrimSpace(s.Text())
|
||||
|
||||
for _, method := range apiMethods {
|
||||
if strings.HasPrefix(text, method+" ") || strings.HasPrefix(text, method+"\t") {
|
||||
api := &API{
|
||||
Method: method,
|
||||
DocURL: docURL,
|
||||
}
|
||||
|
||||
parts := strings.Fields(text)
|
||||
if len(parts) >= 2 {
|
||||
api.Endpoint = parts[1]
|
||||
api.Name = parts[1]
|
||||
}
|
||||
|
||||
if api.Endpoint != "" {
|
||||
apis = append(apis, api)
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
return apis
|
||||
}
|
||||
|
||||
// resolveURL resolves href against base and returns the resulting URL.
//
// Surrounding whitespace is trimmed from href first. An href that already
// carries a scheme (http:, https:, mailto:, ...) is returned as is. Anything
// else is resolved relative to base. This includes relative paths that only
// happen to begin with the letters "http", such as "http-api/overview"; a
// plain string-prefix test would wrongly treat those as absolute. If either
// href or base cannot be parsed, the trimmed href is returned unchanged.
func resolveURL(base string, href string) string {
	href = strings.TrimSpace(href)

	hrefURL, err := url.Parse(href)
	if err != nil {
		return href
	}
	// Decide "absolute" from the parsed scheme, not from a textual prefix.
	if hrefURL.IsAbs() {
		return href
	}

	baseURL, err := url.Parse(base)
	if err != nil {
		return href
	}

	return baseURL.ResolveReference(hrefURL).String()
}
|
||||
Reference in New Issue
Block a user