first commit

This commit is contained in:
Tomas Dvorak
2026-02-22 10:42:17 +01:00
commit 55885a0e8f
239 changed files with 103690 additions and 0 deletions
+177
View File
@@ -0,0 +1,177 @@
package astrodocs
import (
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
// Parser turns raw HTML of documentation pages into Page and Section values.
type Parser struct {
// baseURL is the origin against which sidebar hrefs are resolved.
baseURL string
}
// NewParser returns a Parser whose base URL is the Astro documentation site.
func NewParser() *Parser {
	parser := new(Parser)
	parser.baseURL = "https://docs.astro.build"
	return parser
}
// ParsePage parses html as an HTML document and builds a Page from it.
//
// The returned Page records docURL as its URL, the current time as FetchedAt,
// and the title, description, text content, heading sections and code blocks
// extracted from the document. If the HTML cannot be parsed, the parse error
// is returned and the Page is nil.
func (p *Parser) ParsePage(html string, docURL string) (*Page, error) {
	root, parseErr := goquery.NewDocumentFromReader(strings.NewReader(html))
	if parseErr != nil {
		return nil, parseErr
	}

	// Field expressions are evaluated top to bottom, so the timestamp is
	// taken before any extraction work starts.
	return &Page{
		URL:         docURL,
		FetchedAt:   time.Now(),
		Title:       p.extractTitle(root),
		Description: p.extractDescription(root),
		Content:     p.extractContent(root),
		Sections:    p.extractSections(root, docURL),
		CodeBlocks:  p.extractCodeBlocks(root),
	}, nil
}
// ParseSidebar parses html and collects one Section per navigation link.
//
// Links are anchors found under ".sidebar", "nav" or "[data-sidebar]"
// elements. Each Section carries the trimmed link text as Title and the href
// resolved against the parser's base URL as DocURL. Anchors without an href,
// with empty text, or with an empty resolved URL are skipped.
func (p *Parser) ParseSidebar(html string) ([]*Section, error) {
	root, parseErr := goquery.NewDocumentFromReader(strings.NewReader(html))
	if parseErr != nil {
		return nil, parseErr
	}

	var links []*Section
	root.Find(".sidebar a, nav a, [data-sidebar] a").Each(func(_ int, anchor *goquery.Selection) {
		label := strings.TrimSpace(anchor.Text())

		href, hasHref := anchor.Attr("href")
		if !hasHref {
			return
		}

		target := resolveURL(p.baseURL, href)
		if label == "" || target == "" {
			return
		}

		links = append(links, &Section{Title: label, DocURL: target})
	})

	return links, nil
}
// extractTitle returns the page title.
//
// The trimmed text of the first <h1> wins. When it is empty, the trimmed text
// of the first <title> element is used instead, cut off before the first
// " | " separator if one occurs past the start of the string.
func (p *Parser) extractTitle(doc *goquery.Document) string {
	if heading := strings.TrimSpace(doc.Find("h1").First().Text()); heading != "" {
		return heading
	}

	pageTitle := strings.TrimSpace(doc.Find("title").First().Text())
	if cut := strings.Index(pageTitle, " | "); cut > 0 {
		return pageTitle[:cut]
	}
	return pageTitle
}
// extractDescription returns the content attribute of the description meta
// tag, falling back to the og:description meta tag. It returns "" when
// neither provides a non-empty value.
func (p *Parser) extractDescription(doc *goquery.Document) string {
	// Ordered by preference; the first non-empty content attribute wins.
	selectors := [...]string{
		"meta[name='description']",
		"meta[property='og:description']",
	}
	for _, sel := range selectors {
		if value := doc.Find(sel).AttrOr("content", ""); value != "" {
			return value
		}
	}
	return ""
}
// extractContent returns the trimmed text of the page's main content area.
//
// The content area is the first element matching "article", "main",
// ".content" or ".sl-markdown-content"; if none matches, the text of <body>
// is used.
func (p *Parser) extractContent(doc *goquery.Document) string {
	region := doc.Find("article, main, .content, .sl-markdown-content").First()
	if region.Length() > 0 {
		return strings.TrimSpace(region.Text())
	}
	return strings.TrimSpace(doc.Find("body").Text())
}
// extractSections builds one Section per non-empty h1, h2 or h3 heading.
//
// A heading with an id attribute gets that id as Section.ID and a DocURL of
// docURL + "#" + id; a heading without one links to docURL itself. Headings
// whose trimmed text is empty are skipped.
func (p *Parser) extractSections(doc *goquery.Document, docURL string) []*Section {
	var found []*Section

	doc.Find("h1, h2, h3").Each(func(_ int, heading *goquery.Selection) {
		title := strings.TrimSpace(heading.Text())
		if title == "" {
			return
		}

		entry := &Section{Title: title, DocURL: docURL}
		if anchor, ok := heading.Attr("id"); ok {
			entry.ID = anchor
			entry.DocURL = docURL + "#" + anchor
		}
		found = append(found, entry)
	})

	return found
}
// extractCodeBlocks returns one CodeBlock per non-empty <pre> element.
//
// Code is the trimmed text of the <pre>. Language is taken from the first
// "language-xxx" class found on, in order: the <pre>'s first nested <code>,
// the <pre> itself, and the <pre>'s parent. It stays empty when none of them
// carries such a class.
func (p *Parser) extractCodeBlocks(doc *goquery.Document) []*CodeBlock {
	var blocks []*CodeBlock

	// Select <pre> only. A "pre code, pre" selector matches both the <pre>
	// and its nested <code>, which emits every <pre><code> block twice.
	doc.Find("pre").Each(func(_ int, pre *goquery.Selection) {
		code := strings.TrimSpace(pre.Text())
		if code == "" {
			return
		}

		block := &CodeBlock{Code: code}
		candidates := []*goquery.Selection{pre.Find("code").First(), pre, pre.Parent()}
		for _, el := range candidates {
			// AttrOr yields "" for an empty selection, so a missing
			// <code> child or parent is simply skipped.
			if lang := languageFromClass(el.AttrOr("class", "")); lang != "" {
				block.Language = lang
				break
			}
		}
		blocks = append(blocks, block)
	})

	return blocks
}

// languageFromClass returns the suffix of the first "language-" prefixed
// token in a whitespace-separated class attribute, or "" if there is none.
func languageFromClass(classes string) string {
	for _, name := range strings.Fields(classes) {
		if strings.HasPrefix(name, "language-") {
			return strings.TrimPrefix(name, "language-")
		}
	}
	return ""
}
// resolveURL resolves href against base and returns the resulting URL string.
//
// An href that starts with "http" and parses as an absolute URL (it has a
// scheme) is returned verbatim. Any other href is resolved as a reference
// relative to base. If either base or href cannot be parsed, href is returned
// unchanged.
func resolveURL(base string, href string) string {
	hrefURL, err := url.Parse(href)
	if err != nil {
		return href
	}

	// A bare "http" prefix is not enough to call a link absolute: relative
	// paths such as "http-caching/" or "httpd" share it and must still be
	// resolved against base.
	if strings.HasPrefix(href, "http") && hrefURL.IsAbs() {
		return href
	}

	baseURL, err := url.Parse(base)
	if err != nil {
		return href
	}

	return baseURL.ResolveReference(hrefURL).String()
}
+32
View File
@@ -0,0 +1,32 @@
package astrodocs
import "time"
// Page is the parsed representation of a single documentation page.
type Page struct {
// Title is the page title (first <h1>, else the <title> element).
Title string `json:"title"`
// Description is the meta description; omitted from JSON when empty.
Description string `json:"description,omitempty"`
// Content is the trimmed text of the main content area.
Content string `json:"content,omitempty"`
// URL is the address the page was parsed for.
URL string `json:"url"`
// Sections lists the headings found on the page.
Sections []*Section `json:"sections,omitempty"`
// CodeBlocks lists the code samples found on the page.
CodeBlocks []*CodeBlock `json:"code_blocks,omitempty"`
// Guides is not populated by the parser code visible in this file.
Guides []*Guide `json:"guides,omitempty"`
// FetchedAt is the time the Page value was created by the parser.
FetchedAt time.Time `json:"fetched_at"`
}
// Section describes a heading within a page or a link in the sidebar.
type Section struct {
// ID is the heading's id attribute; empty when the heading has none
// and for sidebar links.
ID string `json:"id"`
// Title is the trimmed heading or link text.
Title string `json:"title"`
// Content is not populated by the parser code visible in this file.
Content string `json:"content,omitempty"`
// DocURL is the URL of the section: the page URL, optionally with an
// "#id" fragment, or the resolved sidebar link target.
DocURL string `json:"doc_url"`
}
// CodeBlock is a code sample extracted from a page.
type CodeBlock struct {
// Language is the suffix of a "language-xxx" class; omitted from JSON
// when no such class was found.
Language string `json:"language,omitempty"`
// Code is the trimmed text of the code sample.
Code string `json:"code"`
}
// Guide is a titled link to a documentation page, with an optional
// description. NOTE(review): nothing in this file constructs a Guide;
// confirm its producer elsewhere in the package.
type Guide struct {
// Title is the guide's display title.
Title string `json:"title"`
// Description is an optional summary; omitted from JSON when empty.
Description string `json:"description,omitempty"`
// DocURL is the URL of the guide.
DocURL string `json:"doc_url"`
}