mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
201 lines
4.3 KiB
Go
201 lines
4.3 KiB
Go
package dockerdocs
|
|
|
|
import (
|
|
"net/url"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// Parser turns raw HTML from the Docker documentation site into Page and
// Section values.
type Parser struct {
	// baseURL is the site root that ParseToc resolves navigation hrefs against.
	baseURL string
}
|
|
|
|
func NewParser() *Parser {
|
|
return &Parser{
|
|
baseURL: "https://docs.docker.com",
|
|
}
|
|
}
|
|
|
|
func (p *Parser) ParsePage(html string, docURL string) (*Page, error) {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
page := &Page{
|
|
URL: docURL,
|
|
FetchedAt: time.Now(),
|
|
}
|
|
|
|
page.Title = p.extractTitle(doc)
|
|
page.Description = p.extractDescription(doc)
|
|
page.Content = p.extractContent(doc)
|
|
page.Sections = p.extractSections(doc, docURL)
|
|
page.CodeBlocks = p.extractCodeBlocks(doc)
|
|
page.Links = p.extractLinks(doc, docURL)
|
|
|
|
return page, nil
|
|
}
|
|
|
|
func (p *Parser) ParseToc(html string) ([]*Section, error) {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var sections []*Section
|
|
|
|
doc.Find("nav a, .toc a, .sidebar a, [data-toc] a").Each(func(_ int, s *goquery.Selection) {
|
|
section := &Section{}
|
|
|
|
section.Title = strings.TrimSpace(s.Text())
|
|
|
|
if href, exists := s.Attr("href"); exists {
|
|
section.DocURL = resolveURL(p.baseURL, href)
|
|
}
|
|
|
|
if section.Title != "" {
|
|
sections = append(sections, section)
|
|
}
|
|
})
|
|
|
|
return sections, nil
|
|
}
|
|
|
|
func (p *Parser) extractTitle(doc *goquery.Document) string {
|
|
title := doc.Find("h1").First().Text()
|
|
title = strings.TrimSpace(title)
|
|
|
|
if title == "" {
|
|
title = doc.Find("title").First().Text()
|
|
title = strings.TrimSpace(title)
|
|
if idx := strings.Index(title, " | "); idx > 0 {
|
|
title = title[:idx]
|
|
}
|
|
if idx := strings.Index(title, " - "); idx > 0 {
|
|
title = title[:idx]
|
|
}
|
|
}
|
|
|
|
return title
|
|
}
|
|
|
|
func (p *Parser) extractDescription(doc *goquery.Document) string {
|
|
desc := doc.Find("meta[name='description']").AttrOr("content", "")
|
|
if desc != "" {
|
|
return desc
|
|
}
|
|
|
|
desc = doc.Find(".lead, .intro, .summary, p:first-of-type").First().Text()
|
|
return strings.TrimSpace(desc)
|
|
}
|
|
|
|
func (p *Parser) extractContent(doc *goquery.Document) string {
|
|
content := doc.Find("article, main, .content, .documentation, .doc-content").First()
|
|
if content.Length() == 0 {
|
|
content = doc.Find("body")
|
|
}
|
|
|
|
return strings.TrimSpace(content.Text())
|
|
}
|
|
|
|
func (p *Parser) extractSections(doc *goquery.Document, docURL string) []*Section {
|
|
var sections []*Section
|
|
|
|
doc.Find("h1, h2, h3").Each(func(_ int, s *goquery.Selection) {
|
|
section := &Section{}
|
|
|
|
section.Title = strings.TrimSpace(s.Text())
|
|
|
|
if id, exists := s.Attr("id"); exists {
|
|
section.ID = id
|
|
section.DocURL = docURL + "#" + id
|
|
} else {
|
|
section.DocURL = docURL
|
|
}
|
|
|
|
next := s.Next()
|
|
var content strings.Builder
|
|
for next.Length() > 0 && !next.Is("h1, h2, h3") {
|
|
content.WriteString(next.Text())
|
|
content.WriteString("\n")
|
|
next = next.Next()
|
|
}
|
|
section.Content = strings.TrimSpace(content.String())
|
|
|
|
if section.Title != "" {
|
|
sections = append(sections, section)
|
|
}
|
|
})
|
|
|
|
return sections
|
|
}
|
|
|
|
func (p *Parser) extractCodeBlocks(doc *goquery.Document) []*CodeBlock {
|
|
var blocks []*CodeBlock
|
|
|
|
doc.Find("pre code, pre").Each(func(_ int, s *goquery.Selection) {
|
|
block := &CodeBlock{}
|
|
|
|
if classes, exists := s.Attr("class"); exists {
|
|
if strings.Contains(classes, "language-") {
|
|
parts := strings.Split(classes, " ")
|
|
for _, part := range parts {
|
|
if strings.HasPrefix(part, "language-") {
|
|
block.Language = strings.TrimPrefix(part, "language-")
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
block.Code = strings.TrimSpace(s.Text())
|
|
|
|
if block.Code != "" {
|
|
blocks = append(blocks, block)
|
|
}
|
|
})
|
|
|
|
return blocks
|
|
}
|
|
|
|
func (p *Parser) extractLinks(doc *goquery.Document, baseURL string) []string {
|
|
var links []string
|
|
seen := make(map[string]bool)
|
|
|
|
doc.Find("a[href]").Each(func(_ int, s *goquery.Selection) {
|
|
href, _ := s.Attr("href")
|
|
if href == "" || strings.HasPrefix(href, "#") || strings.HasPrefix(href, "javascript:") {
|
|
return
|
|
}
|
|
|
|
resolved := resolveURL(baseURL, href)
|
|
if !seen[resolved] {
|
|
seen[resolved] = true
|
|
links = append(links, resolved)
|
|
}
|
|
})
|
|
|
|
return links
|
|
}
|
|
|
|
// resolveURL turns href into an absolute URL using base as the reference
// point.
//
// An href that already starts with "http://" or "https://" is returned
// unchanged. Anything else is resolved against base following the usual
// relative-reference rules. If either base or href cannot be parsed, href is
// returned as given.
func resolveURL(base string, href string) string {
	// Require the full scheme separator: a bare "http" prefix would also match
	// relative paths such as "http-api/" or "https-setup.html" and leave them
	// unresolved.
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}

	baseURL, err := url.Parse(base)
	if err != nil {
		return href
	}

	hrefURL, err := url.Parse(href)
	if err != nil {
		return href
	}

	return baseURL.ResolveReference(hrefURL).String()
}
|