Files
Devour/pkg/rustdocs/parser.go
T
Tomas Dvorak 55885a0e8f first commit
2026-02-22 10:42:17 +01:00

634 lines
16 KiB
Go

package rustdocs
import (
"net/url"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
// Parser extracts crate, item and search-result data from rustdoc HTML pages.
type Parser struct {
// baseURL is the base against which hrefs found in parsed pages are resolved.
baseURL string
}
// NewParser returns a Parser whose base URL is https://docs.rs.
func NewParser() *Parser {
	parser := new(Parser)
	parser.baseURL = "https://docs.rs"
	return parser
}
// ParseCratePage parses the HTML of a crate page and returns a Crate
// populated with its name, version, description, repository and the item
// lists found on the page. docURL is stored verbatim on the result and
// FetchedAt is set to the current time.
//
// An error is returned only when the HTML cannot be read into a document.
func (p *Parser) ParseCratePage(html string, docURL string) (*Crate, error) {
	doc, parseErr := goquery.NewDocumentFromReader(strings.NewReader(html))
	if parseErr != nil {
		return nil, parseErr
	}

	return &Crate{
		DocURL:      docURL,
		FetchedAt:   time.Now(),
		Name:        p.extractCrateName(doc),
		Version:     p.extractVersion(doc),
		Description: p.extractDescription(doc),
		Repository:  p.extractRepository(doc),
		Modules:     p.extractModules(doc),
		Structs:     p.extractStructs(doc),
		Enums:       p.extractEnums(doc),
		Traits:      p.extractTraits(doc),
		Functions:   p.extractFunctions(doc),
		Macros:      p.extractMacros(doc),
		Constants:   p.extractConstants(doc),
		Statics:     p.extractStatics(doc),
	}, nil
}
// ParseItemPage parses the HTML of a single item page and returns a Symbol
// carrying the item's name, path, kind, signature and documentation text.
// docURL is stored verbatim and is also used as a fallback source for the
// item path.
//
// An error is returned only when the HTML cannot be read into a document.
func (p *Parser) ParseItemPage(html string, docURL string) (*Symbol, error) {
	doc, parseErr := goquery.NewDocumentFromReader(strings.NewReader(html))
	if parseErr != nil {
		return nil, parseErr
	}

	return &Symbol{
		DocURL:    docURL,
		Name:      p.extractItemName(doc),
		Path:      p.extractItemPath(doc, docURL),
		Kind:      p.extractItemKind(doc),
		Signature: p.extractItemSignature(doc),
		Doc:       p.extractItemDoc(doc),
	}, nil
}
// ParseSearchResults parses a search results page and returns one
// SearchResult per anchor found under "#results .search-results".
//
// For every anchor it records the kind (derived from the anchor's
// "result-*" class), the name, the "::"-joined path, the description, the
// resolved documentation URL and any stability marker.
//
// An error is returned only when the HTML cannot be read into a document.
func (p *Parser) ParseSearchResults(html string) ([]*SearchResult, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	// Fall back to the public docs.rs host for a zero-value Parser so that
	// relative hrefs are still turned into absolute URLs.
	base := p.baseURL
	if base == "" {
		base = "https://docs.rs"
	}

	var results []*SearchResult
	doc.Find("#results .search-results a").Each(func(_ int, s *goquery.Selection) {
		result := &SearchResult{}

		// A missing class attribute yields "", which maps to an empty kind.
		classes, _ := s.Attr("class")
		result.Kind = extractKindFromClasses(classes)

		nameEl := s.Find(".result-name")
		result.Name = strings.TrimSpace(nameEl.Find(".method, .struct, .fn, .trait, .enum, .mod, .macro, .const, .static, .attr").Text())
		if result.Name == "" {
			// Trim before splitting: the element text may start with a
			// newline, in which case the first raw line would be empty.
			firstLine := strings.Split(strings.TrimSpace(nameEl.Text()), "\n")[0]
			result.Name = strings.TrimSpace(firstLine)
		}

		var pathParts []string
		nameEl.Find(".path span").Each(func(_ int, span *goquery.Selection) {
			if part := strings.TrimSpace(span.Text()); part != "" {
				pathParts = append(pathParts, part)
			}
		})
		result.Path = strings.Join(pathParts, "::")

		result.Description = strings.TrimSpace(s.Find(".desc").Text())

		if href, exists := s.Attr("href"); exists {
			// Resolve as a URL reference rather than assigning href to
			// URL.Path: the latter percent-escapes "?" and "#" and leaves
			// "../" segments unresolved.
			result.DocURL = resolveURL(base, href)
		}

		if stab := s.Find(".stab"); stab.Length() > 0 {
			if stab.HasClass("unstable") || stab.HasClass("experimental") {
				result.IsExperimental = true
			}
			result.Stability = strings.TrimSpace(stab.Text())
		}

		results = append(results, result)
	})
	return results, nil
}
// extractCrateName returns the crate name taken from the page heading.
//
// The ".main-heading h1" text is used with a leading "Crate " or "Module "
// removed. If neither prefix is present, the first <h1> of the document is
// tried for a "Crate " prefix. When nothing matches, the trimmed main
// heading text is returned unchanged.
func (p *Parser) extractCrateName(doc *goquery.Document) string {
	const cratePrefix = "Crate "

	heading := strings.TrimSpace(doc.Find(".main-heading h1").Text())
	for _, prefix := range []string{cratePrefix, "Module "} {
		if strings.HasPrefix(heading, prefix) {
			return heading[len(prefix):]
		}
	}

	firstH1 := strings.TrimSpace(doc.Find("h1").First().Text())
	if strings.HasPrefix(firstH1, cratePrefix) {
		return firstH1[len(cratePrefix):]
	}
	return heading
}
// crateVersionPattern matches a three-component dotted version such as
// "1.2.3". It is compiled once at package initialisation instead of on every
// call.
var crateVersionPattern = regexp.MustCompile(`\d+\.\d+\.\d+`)

// extractVersion returns the first three-component version number found in
// the text of the ".since" elements, or failing that in the ".sub-heading"
// elements. It returns "" when neither contains one.
func (p *Parser) extractVersion(doc *goquery.Document) string {
	for _, selector := range []string{".since", ".sub-heading"} {
		if version := crateVersionPattern.FindString(doc.Find(selector).Text()); version != "" {
			return version
		}
	}
	return ""
}
// extractDescription returns the trimmed text of the first docblock on the
// page, preferring one inside ".top-doc" over any other ".docblock".
// It returns "" when the page has no docblock.
func (p *Parser) extractDescription(doc *goquery.Document) string {
	for _, selector := range []string{".top-doc .docblock", ".docblock"} {
		if block := doc.Find(selector).First(); block.Length() > 0 {
			return strings.TrimSpace(block.Text())
		}
	}
	return ""
}
// githubRepoPattern matches the "https://github.com/<owner>/<repo>" prefix of
// a URL. Query and fragment delimiters are excluded so they cannot leak into
// the repository name. Compiled once at package initialisation.
var githubRepoPattern = regexp.MustCompile(`https://github\.com/[^/?#]+/[^/?#]+`)

// extractRepository returns the GitHub repository URL derived from the href
// of the first "a.src" link, or "" when there is no such link or its href
// does not point at a GitHub repository.
func (p *Parser) extractRepository(doc *goquery.Document) string {
	// Attr reports false for an empty selection, so no separate length
	// check is needed.
	href, ok := doc.Find("a.src").Attr("href")
	if !ok {
		return ""
	}
	// FindString yields "" when the href is not a GitHub URL.
	return githubRepoPattern.FindString(href)
}
// extractItemName returns the item name from the ".main-heading h1" text
// with a leading item-kind word removed. If no known kind prefix is present,
// the trimmed heading text is returned unchanged.
func (p *Parser) extractItemName(doc *goquery.Document) string {
	heading := strings.TrimSpace(doc.Find(".main-heading h1").Text())

	// Longer prefixes must come before prefixes they start with
	// ("Type Alias " before "Type "), otherwise part of the kind label would
	// be left in the name. "Function " mirrors the heading form that
	// extractItemKind already accepts alongside "Fn ".
	// NOTE(review): "Constant " and "Type Alias " are believed to be the
	// labels current rustdoc emits - confirm against real pages.
	prefixes := []string{
		"Struct ", "Enum ", "Trait ",
		"Function ", "Fn ",
		"Macro ",
		"Constant ", "Const ",
		"Static ", "Module ",
		"Type Alias ", "Type ",
	}
	for _, prefix := range prefixes {
		if strings.HasPrefix(heading, prefix) {
			return strings.TrimPrefix(heading, prefix)
		}
	}
	return heading
}
// extractItemPath returns the path of the item described by doc.
//
// The text of ".rustdoc-breadcrumbs" is preferred. When it is empty and
// docURL is non-empty and parseable, the path is derived from the URL path:
// the leading "/" and a trailing "/index.html" or ".html" are removed and
// every remaining "/" becomes "::". It returns "" when neither source yields
// a path.
func (p *Parser) extractItemPath(doc *goquery.Document, docURL string) string {
breadcrumbs := doc.Find(".rustdoc-breadcrumbs").Text()
breadcrumbs = strings.TrimSpace(breadcrumbs)
// Newlines are removed outright rather than replaced by a space.
breadcrumbs = strings.ReplaceAll(breadcrumbs, "\n", "")
// NOTE(review): as rendered here the search and replacement strings look
// identical, which would make this call a no-op. Confirm the intended
// characters (for example a double space or a non-breaking space).
breadcrumbs = strings.ReplaceAll(breadcrumbs, " ", " ")
breadcrumbs = strings.TrimSpace(breadcrumbs)
if breadcrumbs != "" {
return breadcrumbs
}
if docURL != "" {
u, err := url.Parse(docURL)
// An unparseable docURL is not reported; the function falls through to "".
if err == nil {
path := strings.TrimPrefix(u.Path, "/")
path = strings.TrimSuffix(path, "/index.html")
path = strings.TrimSuffix(path, ".html")
// Every URL path segment becomes one "::"-separated component.
path = strings.ReplaceAll(path, "/", "::")
return path
}
}
return ""
}
// extractItemKind determines the kind of the item described by doc.
//
// It first inspects the class attribute of the first <span> inside
// ".main-heading h1". If that does not identify a kind, it falls back to the
// leading word of the heading text. It returns "" when the kind cannot be
// determined.
func (p *Parser) extractItemKind(doc *goquery.Document) ItemKind {
	if span := doc.Find(".main-heading h1 span").First(); span.Length() > 0 {
		class, _ := span.Attr("class") // a missing attribute yields "", which matches nothing
		// The checks are substring matches evaluated in order, so the
		// order of the cases is significant and is kept as-is.
		switch {
		case strings.Contains(class, "struct"):
			return ItemKindStruct
		case strings.Contains(class, "enum"):
			return ItemKindEnum
		case strings.Contains(class, "trait"):
			return ItemKindTrait
		case strings.Contains(class, "fn"):
			return ItemKindFn
		case strings.Contains(class, "macro"):
			return ItemKindMacro
		case strings.Contains(class, "const"):
			return ItemKindConst
		case strings.Contains(class, "static"):
			return ItemKindStatic
		case strings.Contains(class, "mod"):
			return ItemKindMod
		case strings.Contains(class, "type"):
			return ItemKindType
		}
	}

	// Trim the heading the same way extractItemName does; without it,
	// leading whitespace in the markup defeats every prefix check below.
	title := strings.TrimSpace(doc.Find(".main-heading h1").Text())
	switch {
	case strings.HasPrefix(title, "Struct "):
		return ItemKindStruct
	case strings.HasPrefix(title, "Enum "):
		return ItemKindEnum
	case strings.HasPrefix(title, "Trait "):
		return ItemKindTrait
	case strings.HasPrefix(title, "Fn ") || strings.HasPrefix(title, "Function "):
		return ItemKindFn
	case strings.HasPrefix(title, "Macro "):
		return ItemKindMacro
	case strings.HasPrefix(title, "Const ") || strings.HasPrefix(title, "Constant "):
		return ItemKindConst
	case strings.HasPrefix(title, "Static "):
		return ItemKindStatic
	case strings.HasPrefix(title, "Module "):
		return ItemKindMod
	case strings.HasPrefix(title, "Type "):
		// Mirrors the "type" case of the class switch and the "Type "
		// prefix handled by extractItemName.
		return ItemKindType
	}
	return ""
}
// extractItemSignature returns the trimmed item declaration text.
// It uses "pre.rust.item-decl" when that yields non-empty text and otherwise
// falls back to the first "pre.rust" element, which may be empty.
func (p *Parser) extractItemSignature(doc *goquery.Document) string {
	if decl := strings.TrimSpace(doc.Find("pre.rust.item-decl").Text()); decl != "" {
		return decl
	}
	return strings.TrimSpace(doc.Find("pre.rust").First().Text())
}
// extractItemDoc returns the trimmed text of the item's documentation block.
// The first ".docblock" inside ".top-doc" is preferred; otherwise the first
// ".docblock" anywhere in the document is used. It returns "" when none exists.
func (p *Parser) extractItemDoc(doc *goquery.Document) string {
	block := doc.Find(".top-doc .docblock").First()
	if block.Length() == 0 {
		block = doc.Find(".docblock").First()
	}
	if block.Length() == 0 {
		return ""
	}
	return strings.TrimSpace(block.Text())
}
// extractModules collects the modules listed on the page.
//
// Each element matching ".item-table .mod" or ".module-item .mod" yields one
// Module. The name comes from a nested "a.mod" link, or from the first nested
// link when that is empty. Elements without a name are skipped.
func (p *Parser) extractModules(doc *goquery.Document) []*Module {
	var modules []*Module
	doc.Find(".item-table .mod, .module-item .mod").Each(func(_ int, row *goquery.Selection) {
		link := row.Find("a").First()

		name := strings.TrimSpace(row.Find("a.mod").Text())
		if name == "" {
			name = strings.TrimSpace(link.Text())
		}
		if name == "" {
			return
		}

		item := &Module{
			Name:           name,
			Doc:            strings.TrimSpace(row.Find(".desc, .item-desc").Text()),
			IsExperimental: row.Find(".stab.unstable, .stab.experimental").Length() > 0,
		}
		if href, ok := link.Attr("href"); ok {
			item.DocURL = resolveURL(p.baseURL, href)
		}
		modules = append(modules, item)
	})
	return modules
}
// extractStructs collects the structs listed on the page.
//
// Each element matching ".item-table .struct" or ".struct" yields one Struct.
// The name comes from a nested "a.struct" link, or from the first nested link
// when that is empty. Elements without a name are skipped.
func (p *Parser) extractStructs(doc *goquery.Document) []*Struct {
	var structs []*Struct
	doc.Find(".item-table .struct, .struct").Each(func(_ int, row *goquery.Selection) {
		link := row.Find("a").First()

		name := strings.TrimSpace(row.Find("a.struct").Text())
		if name == "" {
			name = strings.TrimSpace(link.Text())
		}
		if name == "" {
			return
		}

		item := &Struct{
			Name:           name,
			Doc:            strings.TrimSpace(row.Find(".desc, .item-desc").Text()),
			IsExperimental: row.Find(".stab.unstable, .stab.experimental").Length() > 0,
		}
		if href, ok := link.Attr("href"); ok {
			item.DocURL = resolveURL(p.baseURL, href)
		}
		structs = append(structs, item)
	})
	return structs
}
// extractEnums collects the enums listed on the page.
//
// Each element matching ".item-table .enum" or ".enum" yields one Enum.
// The name comes from a nested "a.enum" link, or from the first nested link
// when that is empty. Elements without a name are skipped.
func (p *Parser) extractEnums(doc *goquery.Document) []*Enum {
	var enums []*Enum
	doc.Find(".item-table .enum, .enum").Each(func(_ int, row *goquery.Selection) {
		link := row.Find("a").First()

		name := strings.TrimSpace(row.Find("a.enum").Text())
		if name == "" {
			name = strings.TrimSpace(link.Text())
		}
		if name == "" {
			return
		}

		item := &Enum{
			Name:           name,
			Doc:            strings.TrimSpace(row.Find(".desc, .item-desc").Text()),
			IsExperimental: row.Find(".stab.unstable, .stab.experimental").Length() > 0,
		}
		if href, ok := link.Attr("href"); ok {
			item.DocURL = resolveURL(p.baseURL, href)
		}
		enums = append(enums, item)
	})
	return enums
}
// extractTraits collects the traits listed on the page.
//
// Each element matching ".item-table .trait" or ".trait" yields one Trait.
// The name comes from a nested "a.trait" link, or from the first nested link
// when that is empty. Elements without a name are skipped.
func (p *Parser) extractTraits(doc *goquery.Document) []*Trait {
	var traits []*Trait
	doc.Find(".item-table .trait, .trait").Each(func(_ int, row *goquery.Selection) {
		link := row.Find("a").First()

		name := strings.TrimSpace(row.Find("a.trait").Text())
		if name == "" {
			name = strings.TrimSpace(link.Text())
		}
		if name == "" {
			return
		}

		item := &Trait{
			Name:           name,
			Doc:            strings.TrimSpace(row.Find(".desc, .item-desc").Text()),
			IsExperimental: row.Find(".stab.unstable, .stab.experimental").Length() > 0,
		}
		if href, ok := link.Attr("href"); ok {
			item.DocURL = resolveURL(p.baseURL, href)
		}
		traits = append(traits, item)
	})
	return traits
}
// extractFunctions collects the free functions listed on the page.
//
// Each element matching ".item-table .fn", ".fn" or ".function" yields one
// Func. The name comes from a nested "a.fn" link, or from the first nested
// link when that is empty. Elements without a name are skipped. IsUnsafe is
// set when the element's text contains the substring "unsafe".
func (p *Parser) extractFunctions(doc *goquery.Document) []*Func {
	var funcs []*Func
	doc.Find(".item-table .fn, .fn, .function").Each(func(_ int, row *goquery.Selection) {
		link := row.Find("a").First()

		name := strings.TrimSpace(row.Find("a.fn").Text())
		if name == "" {
			name = strings.TrimSpace(link.Text())
		}
		if name == "" {
			return
		}

		item := &Func{
			Name:           name,
			Doc:            strings.TrimSpace(row.Find(".desc, .item-desc").Text()),
			IsExperimental: row.Find(".stab.unstable, .stab.experimental").Length() > 0,
			IsUnsafe:       strings.Contains(row.Text(), "unsafe"),
		}
		if href, ok := link.Attr("href"); ok {
			item.DocURL = resolveURL(p.baseURL, href)
		}
		funcs = append(funcs, item)
	})
	return funcs
}
// extractMacros collects the macros listed on the page.
//
// Each element matching ".item-table .macro" or ".macro" yields one Macro.
// The name comes from a nested "a.macro" link, or from the first nested link
// when that is empty. Elements without a name are skipped.
func (p *Parser) extractMacros(doc *goquery.Document) []*Macro {
	var macros []*Macro
	doc.Find(".item-table .macro, .macro").Each(func(_ int, row *goquery.Selection) {
		link := row.Find("a").First()

		name := strings.TrimSpace(row.Find("a.macro").Text())
		if name == "" {
			name = strings.TrimSpace(link.Text())
		}
		if name == "" {
			return
		}

		item := &Macro{
			Name:           name,
			Doc:            strings.TrimSpace(row.Find(".desc, .item-desc").Text()),
			IsExperimental: row.Find(".stab.unstable, .stab.experimental").Length() > 0,
		}
		if href, ok := link.Attr("href"); ok {
			item.DocURL = resolveURL(p.baseURL, href)
		}
		macros = append(macros, item)
	})
	return macros
}
// extractConstants collects the constants listed on the page.
//
// Each element matching ".item-table .constant" or ".constant" yields one
// Const. The name comes from a nested "a.constant" link, or from the first
// nested link when that is empty. Elements without a name are skipped.
func (p *Parser) extractConstants(doc *goquery.Document) []*Const {
	var constants []*Const
	doc.Find(".item-table .constant, .constant").Each(func(_ int, row *goquery.Selection) {
		link := row.Find("a").First()

		name := strings.TrimSpace(row.Find("a.constant").Text())
		if name == "" {
			name = strings.TrimSpace(link.Text())
		}
		if name == "" {
			return
		}

		item := &Const{
			Name:           name,
			Doc:            strings.TrimSpace(row.Find(".desc, .item-desc").Text()),
			IsExperimental: row.Find(".stab.unstable, .stab.experimental").Length() > 0,
		}
		if href, ok := link.Attr("href"); ok {
			item.DocURL = resolveURL(p.baseURL, href)
		}
		constants = append(constants, item)
	})
	return constants
}
// extractStatics collects the statics listed on the page.
//
// Each element matching ".item-table .static" or ".static" yields one Static.
// The name comes from a nested "a.static" link, or from the first nested link
// when that is empty. Elements without a name are skipped.
func (p *Parser) extractStatics(doc *goquery.Document) []*Static {
	var statics []*Static
	doc.Find(".item-table .static, .static").Each(func(_ int, row *goquery.Selection) {
		link := row.Find("a").First()

		name := strings.TrimSpace(row.Find("a.static").Text())
		if name == "" {
			name = strings.TrimSpace(link.Text())
		}
		if name == "" {
			return
		}

		item := &Static{
			Name:           name,
			Doc:            strings.TrimSpace(row.Find(".desc, .item-desc").Text()),
			IsExperimental: row.Find(".stab.unstable, .stab.experimental").Length() > 0,
		}
		if href, ok := link.Attr("href"); ok {
			item.DocURL = resolveURL(p.baseURL, href)
		}
		statics = append(statics, item)
	})
	return statics
}
// ExtractMethods collects the methods found on an item page.
//
// Each element matching ".impl-items .method-toggle" or
// "details.method-toggle" yields one Method carrying its name, signature,
// documentation text and qualifier flags. Elements for which no name can be
// determined are skipped.
func (p *Parser) ExtractMethods(doc *goquery.Document) []*Method {
	var methods []*Method
	doc.Find(".impl-items .method-toggle, details.method-toggle").Each(func(_ int, s *goquery.Selection) {
		// Selection.Text concatenates the text of every matched element,
		// descendants included. Querying nested elements together (for
		// example a header and the link inside it) therefore repeats text,
		// so each lookup below is narrowed to a single element.
		name := strings.TrimSpace(s.Find(".fn").First().Text())
		if name == "" {
			name = strings.TrimSpace(s.Find(".method, h4.code-header").First().Text())
		}
		if name == "" {
			return
		}

		// Prefer the code header. A bare <pre> is only a fallback because
		// <pre> elements may also appear inside the documentation block.
		header := s.Find(".code-header").First()
		if header.Length() == 0 {
			header = s.Find("pre").First()
		}
		signature := strings.TrimSpace(header.Text())

		methods = append(methods, &Method{
			Name:           name,
			Signature:      signature,
			Doc:            strings.TrimSpace(s.Find(".docblock").Text()),
			IsUnsafe:       methodSignatureHasQualifier(signature, "unsafe"),
			IsAsync:        methodSignatureHasQualifier(signature, "async"),
			IsConst:        methodSignatureHasQualifier(signature, "const"),
			IsExperimental: s.Find(".stab.unstable, .stab.experimental").Length() > 0,
		})
	})
	return methods
}

// methodSignatureHasQualifier reports whether keyword appears as a whole
// word before the "fn" keyword of signature. Restricting the search to that
// prefix avoids false positives from parameter types such as "*const T" or
// from identifiers that merely contain the keyword.
func methodSignatureHasQualifier(signature, keyword string) bool {
	for _, word := range strings.Fields(signature) {
		if word == "fn" {
			return false
		}
		if word == keyword {
			return true
		}
	}
	return false
}
// ExtractStructFields collects the fields found on a struct page.
//
// Each element matching ".struct .fields tr" or ".struct-member" yields one
// Field. Elements whose name text is empty are skipped. IsPub is set when the
// element's text contains the substring "pub".
func (p *Parser) ExtractStructFields(doc *goquery.Document) []*Field {
	var fields []*Field
	doc.Find(".struct .fields tr, .struct-member").Each(func(_ int, row *goquery.Selection) {
		name := strings.TrimSpace(row.Find(".structfield, td:first-child").Text())
		if name == "" {
			return
		}
		fields = append(fields, &Field{
			Name:  name,
			Type:  strings.TrimSpace(row.Find(".type, td:nth-child(2)").Text()),
			Doc:   strings.TrimSpace(row.Find(".docblock, td:last-child").Text()),
			IsPub: strings.Contains(row.Text(), "pub"),
		})
	})
	return fields
}
// ExtractEnumVariants collects the variants found on an enum page.
//
// Each element matching ".enum .variants li" or ".variant" yields one
// Variant. The name comes from nested links or ".variant-name" elements,
// falling back to the element's whole text. Elements with no text are
// skipped. The shape flags are derived from the element's text: "{" marks a
// struct variant, "(" without "{" a tuple variant, and neither a unit variant.
func (p *Parser) ExtractEnumVariants(doc *goquery.Document) []*Variant {
	var variants []*Variant
	doc.Find(".enum .variants li, .variant").Each(func(_ int, entry *goquery.Selection) {
		text := entry.Text()

		name := strings.TrimSpace(entry.Find("a, .variant-name").Text())
		if name == "" {
			name = strings.TrimSpace(text)
		}
		if name == "" {
			return
		}

		hasParen := strings.Contains(text, "(")
		hasBrace := strings.Contains(text, "{")
		variants = append(variants, &Variant{
			Name:     name,
			Doc:      strings.TrimSpace(entry.Find(".docblock").Text()),
			IsTuple:  hasParen && !hasBrace,
			IsStruct: hasBrace,
			IsUnit:   !hasParen && !hasBrace,
		})
	})
	return variants
}
// extractKindFromClasses maps a whitespace-separated class attribute to an
// item kind. The first class of the form "result-<kind>" with a recognised
// kind decides the result: "method" maps to "fn", "externcrate" maps to
// "mod", and the other recognised kinds are returned as-is. Classes with an
// unrecognised suffix are skipped. It returns "" when nothing matches.
func extractKindFromClasses(classes string) string {
	const marker = "result-"
	for _, class := range strings.Fields(classes) {
		if !strings.HasPrefix(class, marker) {
			continue
		}
		switch kind := class[len(marker):]; kind {
		case "method":
			return "fn"
		case "externcrate":
			return "mod"
		case "struct", "enum", "trait", "fn", "macro", "const", "static", "mod", "type", "primitive", "keyword", "attr":
			return kind
		}
	}
	return ""
}
// resolveURL resolves href against base and returns the resulting URL string.
//
// An href that already carries a scheme is returned unchanged. If either
// base or href cannot be parsed, href is returned unchanged as a best-effort
// result.
func resolveURL(base string, href string) string {
	hrefURL, err := url.Parse(href)
	if err != nil {
		return href
	}
	// Decide on the parsed scheme rather than on a textual "http" prefix: a
	// relative href such as "http/index.html" (a module named "http") must
	// still be resolved against base.
	if hrefURL.IsAbs() {
		return href
	}
	baseURL, err := url.Parse(base)
	if err != nil {
		return href
	}
	return baseURL.ResolveReference(hrefURL).String()
}
// whitespaceRunPattern matches one or more consecutive whitespace characters
// as defined by the regexp class \s. It is compiled once at package
// initialisation instead of on every call.
var whitespaceRunPattern = regexp.MustCompile(`\s+`)

// cleanText collapses every run of whitespace in text to a single space and
// removes leading and trailing whitespace.
func cleanText(text string) string {
	return strings.TrimSpace(whitespaceRunPattern.ReplaceAllString(text, " "))
}