Files
Devour/pkg/pythondocs/parser.go
T
Tomas Dvorak 55885a0e8f first commit
2026-02-22 10:42:17 +01:00

556 lines
13 KiB
Go

package pythondocs
import (
"net/url"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
// Parser turns HTML documentation pages into structured values
// (see ParseModulePage and ParseSearchResults).
type Parser struct {
// baseURL is the URL against which relative links found in search
// results are resolved.
baseURL string
}
// NewParser returns a Parser whose base URL is https://docs.python.org.
func NewParser() *Parser {
	parser := new(Parser)
	parser.baseURL = "https://docs.python.org"
	return parser
}
// ParseModulePage parses the HTML of a single module documentation page.
//
// html is the raw page markup and docURL is the address it was fetched from;
// docURL is stored on the module and used as the prefix for the anchor links
// of every extracted class, function, exception and constant.
//
// It returns an error only when the HTML cannot be read into a document.
func (p *Parser) ParseModulePage(html string, docURL string) (*Module, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	fetchedAt := time.Now()
	// The module name doubles as its path and is passed to every extractor.
	name := p.extractModuleName(doc)

	return &Module{
		DocURL:     docURL,
		FetchedAt:  fetchedAt,
		Name:       name,
		Path:       name,
		Doc:        p.extractModuleDoc(doc),
		Synopsis:   p.extractSynopsis(doc),
		Version:    p.extractVersion(doc),
		Classes:    p.extractClasses(doc, name, docURL),
		Functions:  p.extractFunctions(doc, name, docURL),
		Exceptions: p.extractExceptions(doc, name, docURL),
		Constants:  p.extractData(doc, name, docURL),
	}, nil
}
// ParseSearchResults parses a search results page and returns one
// SearchResult per "ul.search li" element, in document order.
//
// For each item the kind is derived from its CSS classes, the name and link
// from its first anchor, the score from the anchor's data-score attribute and
// the description from its last span. The returned slice is nil when the page
// has no matching items. An error is returned only when the HTML cannot be
// read into a document.
func (p *Parser) ParseSearchResults(html string) ([]*SearchResult, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	var found []*SearchResult
	doc.Find("ul.search li").Each(func(_ int, item *goquery.Selection) {
		entry := &SearchResult{}

		// A missing class attribute yields "", which matches no kind.
		cssClasses, _ := item.Attr("class")
		switch {
		case strings.Contains(cssClasses, "kind-object"):
			entry.Kind = "object"
		case strings.Contains(cssClasses, "kind-text"):
			entry.Kind = "text"
		case strings.Contains(cssClasses, "kind-title"):
			entry.Kind = "title"
		}

		anchor := item.Find("a").First()
		entry.Name = strings.TrimSpace(anchor.Text())
		if href, ok := anchor.Attr("href"); ok {
			entry.DocURL = resolveURL(p.baseURL, href)
			entry.Path = extractPathFromURL(href)
		}

		if raw, ok := anchor.Attr("data-score"); ok {
			// Fold every ASCII digit into the score; any other character
			// (sign, decimal point, whitespace) is skipped.
			total := 0
			for _, r := range raw {
				if r < '0' || r > '9' {
					continue
				}
				total = total*10 + int(r-'0')
			}
			entry.Score = total
		}

		entry.Description = strings.TrimSpace(item.Find("span").Last().Text())
		found = append(found, entry)
	})
	return found, nil
}
// extractModuleName determines the module name of a documentation page.
//
// It tries, in order:
//  1. the id of the first <section id="module-…">, with the "module-" prefix removed;
//  2. the text of the first <code> inside an <h1>;
//  3. the first ".body h1" heading, keeping only the part before an em dash
//     ("name — description" becomes "name").
//
// It returns "" when none of these elements is present.
func (p *Parser) extractModuleName(doc *goquery.Document) string {
	if section := doc.Find("section[id^='module-']").First(); section.Length() > 0 {
		// The selector only matches elements that carry an id attribute.
		id, _ := section.Attr("id")
		return strings.TrimPrefix(id, "module-")
	}

	if code := doc.Find("h1 code").First(); code.Length() > 0 {
		return strings.TrimSpace(code.Text())
	}

	if heading := doc.Find(".body h1").First(); heading.Length() > 0 {
		text := heading.Text()
		// Cut at the first em dash wherever it occurs; checking only for a
		// leading dash would never strip the description from a heading.
		if dash := strings.Index(text, "—"); dash >= 0 {
			text = text[:dash]
		}
		return strings.TrimSpace(text)
	}

	return ""
}
// extractModuleDoc returns the trimmed text of the first paragraph inside the
// module section (<section id="module-…">), falling back to the first ".body"
// element when no such section exists. It returns "" if no paragraph is found.
func (p *Parser) extractModuleDoc(doc *goquery.Document) string {
	scope := doc.Find("section[id^='module-']").First()
	if scope.Length() == 0 {
		scope = doc.Find(".body").First()
	}

	firstPara := scope.Find("p").First()
	if firstPara.Length() == 0 {
		return ""
	}
	return strings.TrimSpace(firstPara.Text())
}
// extractSynopsis returns the trimmed text of the first ".body p" paragraph,
// shortened to at most 200 bytes. Longer text is cut to at most 197 bytes and
// suffixed with "...". The cut never falls inside a multi-byte UTF-8
// character, so the result stays valid UTF-8.
func (p *Parser) extractSynopsis(doc *goquery.Document) string {
	const (
		maxLen   = 200
		ellipsis = "..."
	)

	text := strings.TrimSpace(doc.Find(".body p").First().Text())
	if len(text) <= maxLen {
		return text
	}

	cut := maxLen - len(ellipsis)
	// Step back over UTF-8 continuation bytes (10xxxxxx) so that text[:cut]
	// ends on a character boundary instead of splitting a rune.
	for cut > 0 && text[cut]&0xC0 == 0x80 {
		cut--
	}
	return text[:cut] + ellipsis
}
// versionNumberPattern matches a "major.minor" version number. It is compiled
// once at package scope instead of on every extractVersion call.
var versionNumberPattern = regexp.MustCompile(`\d+\.\d+`)

// extractVersion returns the first "major.minor" number found in the combined
// text of the page's ".versionadded" elements, or failing that in its
// ".versionchanged" elements. It returns "" when neither contains one.
func (p *Parser) extractVersion(doc *goquery.Document) string {
	for _, selector := range []string{".versionadded", ".versionchanged"} {
		if match := versionNumberPattern.FindString(doc.Find(selector).Text()); match != "" {
			return match
		}
	}
	return ""
}
// extractClasses collects every "dl.py.class" entry of the document.
//
// moduleName is recorded on each class; docURL is the page address used to
// build anchor links. Entries for which no name can be determined are
// skipped. The result is nil when the page documents no classes.
func (p *Parser) extractClasses(doc *goquery.Document, moduleName string, docURL string) []*Class {
	var out []*Class
	doc.Find("dl.py.class").Each(func(_ int, entry *goquery.Selection) {
		// Prefer the signature <dt>; fall back to the first <dt> of any kind.
		header := entry.Find("dt.sig-object").First()
		if header.Length() == 0 {
			header = entry.Find("dt").First()
		}
		signature := strings.TrimSpace(header.Text())

		// Name lookup order: last ".pre" inside "code.sig-prename", then
		// ".sig-name", then the first whitespace-separated word of the
		// signature text.
		// NOTE(review): "sig-prename" looks like it marks the dotted prefix
		// rather than the object name — confirm the first lookup is intended.
		name := strings.TrimSpace(header.Find("code.sig-prename").Find(".pre").Last().Text())
		if name == "" {
			name = strings.TrimSpace(header.Find(".sig-name").Text())
		}
		if name == "" {
			if words := strings.Fields(signature); len(words) > 0 {
				name = words[0]
			}
		}
		if name == "" {
			return
		}

		cls := &Class{
			Module:    moduleName,
			Name:      name,
			QualName:  name,
			DocURL:    docURL,
			Signature: signature,
			Doc:       strings.TrimSpace(entry.Find("dd").First().Find("p").First().Text()),
		}
		// The <dt> id, when present, serves as both the qualified name and
		// the anchor.
		if id, ok := header.Attr("id"); ok {
			cls.QualName = id
			cls.DocURL = docURL + "#" + id
		}

		// Internal reference links in the signature are taken as base
		// classes, excluding empty links and links to the class itself.
		header.Find("a.reference.internal").Each(func(_ int, link *goquery.Selection) {
			if base := strings.TrimSpace(link.Text()); base != "" && base != name {
				cls.Bases = append(cls.Bases, base)
			}
		})

		cls.Methods = p.extractMethods(entry, name, docURL)
		cls.ClassMethods = p.extractClassMethods(entry, name, docURL)
		cls.StaticMethods = p.extractStaticMethods(entry, name, docURL)
		cls.Attributes = p.extractAttributes(entry, name, docURL)

		out = append(out, cls)
	})
	return out
}
// extractFunctions collects every "dl.py.function" entry of the document.
//
// moduleName is recorded on each function; docURL is the page address used to
// build anchor links. Entries that contain nested method definitions, and
// entries for which no name can be determined, are skipped. The result is nil
// when nothing is found.
func (p *Parser) extractFunctions(doc *goquery.Document, moduleName string, docURL string) []*Function {
	var out []*Function
	doc.Find("dl.py.function").Each(func(_ int, entry *goquery.Selection) {
		// An entry that wraps method/classmethod/staticmethod definitions is
		// not reported as a function.
		if entry.Find("dl.py.method, dl.py.classmethod, dl.py.staticmethod").Length() > 0 {
			return
		}

		// Prefer the signature <dt>; fall back to the first <dt> of any kind.
		header := entry.Find("dt.sig-object").First()
		if header.Length() == 0 {
			header = entry.Find("dt").First()
		}
		signature := strings.TrimSpace(header.Text())

		// Name lookup order: last ".pre" inside "code.sig-prename", then
		// ".sig-name", then whatever precedes the first "(" of the signature.
		name := strings.TrimSpace(header.Find("code.sig-prename").Find(".pre").Last().Text())
		if name == "" {
			name = strings.TrimSpace(header.Find(".sig-name").Text())
		}
		if name == "" {
			if open := strings.Index(signature, "("); open > 0 {
				name = strings.TrimSpace(signature[:open])
			}
		}
		if name == "" {
			return
		}

		fn := &Function{
			Module:     moduleName,
			Name:       name,
			QualName:   name,
			DocURL:     docURL,
			Signature:  signature,
			Doc:        strings.TrimSpace(entry.Find("dd").First().Find("p").First().Text()),
			Parameters: p.extractParameters(header),
		}
		// The <dt> id, when present, serves as both the qualified name and
		// the anchor.
		if id, ok := header.Attr("id"); ok {
			fn.QualName = id
			fn.DocURL = docURL + "#" + id
		}

		out = append(out, fn)
	})
	return out
}
// extractExceptions collects every "dl.py.exception" entry of the document.
//
// moduleName is recorded on each exception; docURL is the page address used
// to build anchor links. Entries for which no name can be determined are
// skipped. The result is nil when nothing is found.
func (p *Parser) extractExceptions(doc *goquery.Document, moduleName string, docURL string) []*Exception {
	var out []*Exception
	doc.Find("dl.py.exception").Each(func(_ int, entry *goquery.Selection) {
		// Prefer the signature <dt>; fall back to the first <dt> of any kind.
		header := entry.Find("dt.sig-object").First()
		if header.Length() == 0 {
			header = entry.Find("dt").First()
		}
		signature := strings.TrimSpace(header.Text())

		// Name lookup order: last ".pre" inside "code.sig-prename", then
		// ".sig-name", then whatever precedes the first "(" of the signature.
		name := strings.TrimSpace(header.Find("code.sig-prename").Find(".pre").Last().Text())
		if name == "" {
			name = strings.TrimSpace(header.Find(".sig-name").Text())
		}
		if name == "" {
			if open := strings.Index(signature, "("); open > 0 {
				name = strings.TrimSpace(signature[:open])
			}
		}
		if name == "" {
			return
		}

		exc := &Exception{
			Module:    moduleName,
			Name:      name,
			QualName:  name,
			DocURL:    docURL,
			Signature: signature,
			Doc:       strings.TrimSpace(entry.Find("dd").First().Find("p").First().Text()),
		}
		// The <dt> id, when present, serves as both the qualified name and
		// the anchor.
		if id, ok := header.Attr("id"); ok {
			exc.QualName = id
			exc.DocURL = docURL + "#" + id
		}

		out = append(out, exc)
	})
	return out
}
// extractData collects every "dl.py.data" entry of the document.
//
// moduleName is recorded on each item; docURL is the page address used to
// build anchor links. Entries for which no name can be determined — including
// entries whose <dt> is missing or empty — are skipped. The result is nil
// when nothing is found.
func (p *Parser) extractData(doc *goquery.Document, moduleName string, docURL string) []*Data {
	var dataList []*Data
	doc.Find("dl.py.data").Each(func(_ int, s *goquery.Selection) {
		// Prefer the signature <dt>; fall back to the first <dt> of any kind.
		dt := s.Find("dt.sig-object").First()
		if dt.Length() == 0 {
			dt = s.Find("dt").First()
		}

		// Name lookup order: last ".pre" inside "code.sig-prename", then the
		// first whitespace-separated word of the <dt> text.
		name := strings.TrimSpace(dt.Find("code.sig-prename").Find(".pre").Last().Text())
		if name == "" {
			// A missing or blank <dt> yields no fields; guard the index so
			// malformed markup is skipped instead of causing a panic.
			if fields := strings.Fields(dt.Text()); len(fields) > 0 {
				name = fields[0]
			}
		}
		if name == "" {
			return
		}

		data := &Data{
			Module: moduleName,
			Name:   name,
			DocURL: docURL,
			Doc:    strings.TrimSpace(s.Find("dd").First().Find("p").First().Text()),
		}
		if id, exists := dt.Attr("id"); exists {
			data.DocURL = docURL + "#" + id
		}

		dataList = append(dataList, data)
	})
	return dataList
}
// extractMethods parses every "dl.py.method" entry found beneath parent as a
// method of className that is neither a class method nor a static method.
// Entries that parseMethod rejects (nil) are omitted.
func (p *Parser) extractMethods(parent *goquery.Selection, className string, docURL string) []*Method {
	var found []*Method
	parent.Find("dl.py.method").Each(func(_ int, entry *goquery.Selection) {
		if m := p.parseMethod(entry, className, docURL, false, false); m != nil {
			found = append(found, m)
		}
	})
	return found
}
// extractClassMethods parses every "dl.py.classmethod" entry found beneath
// parent as a class method of className. Entries that parseMethod rejects
// (nil) are omitted.
func (p *Parser) extractClassMethods(parent *goquery.Selection, className string, docURL string) []*Method {
	var found []*Method
	parent.Find("dl.py.classmethod").Each(func(_ int, entry *goquery.Selection) {
		if m := p.parseMethod(entry, className, docURL, true, false); m != nil {
			found = append(found, m)
		}
	})
	return found
}
// extractStaticMethods parses every "dl.py.staticmethod" entry found beneath
// parent as a static method of className. Entries that parseMethod rejects
// (nil) are omitted.
func (p *Parser) extractStaticMethods(parent *goquery.Selection, className string, docURL string) []*Method {
	var found []*Method
	parent.Find("dl.py.staticmethod").Each(func(_ int, entry *goquery.Selection) {
		if m := p.parseMethod(entry, className, docURL, false, true); m != nil {
			found = append(found, m)
		}
	})
	return found
}
// parseMethod builds a Method from a single definition-list entry s.
//
// className is recorded as the owning class, docURL is the page address used
// to build the anchor link, and isClassMethod/isStatic are copied onto the
// result. It returns nil when no method name can be determined.
func (p *Parser) parseMethod(s *goquery.Selection, className string, docURL string, isClassMethod bool, isStatic bool) *Method {
	// Prefer the signature <dt>; fall back to the first <dt> of any kind.
	header := s.Find("dt.sig-object").First()
	if header.Length() == 0 {
		header = s.Find("dt").First()
	}
	signature := strings.TrimSpace(header.Text())

	// Name lookup order: last ".pre" inside "code.sig-prename", then
	// ".sig-name", then the last dotted component of whatever precedes the
	// first "(" of the signature.
	name := strings.TrimSpace(header.Find("code.sig-prename").Find(".pre").Last().Text())
	if name == "" {
		name = strings.TrimSpace(header.Find(".sig-name").Text())
	}
	if name == "" {
		if open := strings.Index(signature, "("); open > 0 {
			dotted := strings.TrimSpace(signature[:open])
			// LastIndex returns -1 when there is no dot, keeping the whole string.
			name = dotted[strings.LastIndex(dotted, ".")+1:]
		}
	}
	if name == "" {
		return nil
	}

	m := &Method{
		Class:         className,
		IsClassMethod: isClassMethod,
		IsStatic:      isStatic,
		Name:          name,
		QualName:      className + "." + name,
		DocURL:        docURL,
		Signature:     signature,
		Doc:           strings.TrimSpace(s.Find("dd").First().Find("p").First().Text()),
		Parameters:    p.extractParameters(header),
	}
	// The <dt> id, when present, serves as both the qualified name and the
	// anchor.
	if id, ok := header.Attr("id"); ok {
		m.QualName = id
		m.DocURL = docURL + "#" + id
	}
	return m
}
// extractAttributes collects every "dl.py.attribute" entry found beneath
// parent as an attribute of className.
//
// docURL is the page address used to build anchor links. Entries for which no
// name can be determined — including entries whose <dt> is missing or empty —
// are skipped. The result is nil when nothing is found.
func (p *Parser) extractAttributes(parent *goquery.Selection, className string, docURL string) []*Attribute {
	var attributes []*Attribute
	parent.Find("dl.py.attribute").Each(func(_ int, s *goquery.Selection) {
		// Prefer the signature <dt>; fall back to the first <dt> of any kind.
		dt := s.Find("dt.sig-object").First()
		if dt.Length() == 0 {
			dt = s.Find("dt").First()
		}

		// Name lookup order: last ".pre" inside "code.sig-prename", then the
		// first whitespace-separated word of the <dt> text.
		name := strings.TrimSpace(dt.Find("code.sig-prename").Find(".pre").Last().Text())
		if name == "" {
			// A missing or blank <dt> yields no fields; guard the index so
			// malformed markup is skipped instead of causing a panic.
			if fields := strings.Fields(dt.Text()); len(fields) > 0 {
				name = fields[0]
			}
		}
		if name == "" {
			return
		}

		attr := &Attribute{
			Class:  className,
			Name:   name,
			DocURL: docURL,
			Doc:    strings.TrimSpace(s.Find("dd").First().Find("p").First().Text()),
		}
		if id, exists := dt.Attr("id"); exists {
			attr.DocURL = docURL + "#" + id
		}

		attributes = append(attributes, attr)
	})
	return attributes
}
// extractParameters reads the parameters of a signature from the
// "em.sig-param" elements inside dt.
//
// A leading "*" marks a var-args parameter and a leading "**" a keyword-args
// parameter; text after the first "=" is stored as the default value. The
// bare separators "*" (keyword-only marker) and "/" (positional-only marker)
// are not parameters and are skipped. The result is nil when the signature
// has no parameters.
func (p *Parser) extractParameters(dt *goquery.Selection) []*Param {
	var params []*Param
	dt.Find("em.sig-param").Each(func(_ int, em *goquery.Selection) {
		text := strings.TrimSpace(em.Text())
		// The positional-only separator is signature syntax, not a parameter;
		// skip it just as the bare "*" separator is skipped below (it leaves
		// an empty name).
		if text == "/" {
			return
		}

		param := &Param{}
		switch {
		case strings.HasPrefix(text, "**"):
			param.IsKWArgs = true
			text = strings.TrimPrefix(text, "**")
		case strings.HasPrefix(text, "*"):
			param.IsVarArgs = true
			text = strings.TrimPrefix(text, "*")
		}

		if eq := strings.Index(text, "="); eq >= 0 {
			param.Name = strings.TrimSpace(text[:eq])
			param.Default = strings.TrimSpace(text[eq+1:])
		} else {
			param.Name = text
		}

		if param.Name != "" {
			params = append(params, param)
		}
	})
	return params
}
// extractPathFromURL reduces a link to its bare document path: the URL's path
// component with a trailing ".html", then a trailing "/", then a leading "/"
// removed, and anything from the first "#" onwards dropped. If href cannot be
// parsed as a URL it is returned unchanged.
func extractPathFromURL(href string) string {
	parsed, err := url.Parse(href)
	if err != nil {
		return href
	}

	trimmed := strings.TrimSuffix(parsed.Path, ".html")
	trimmed = strings.TrimSuffix(trimmed, "/")
	trimmed = strings.TrimPrefix(trimmed, "/")

	// The fragment is already split off by url.Parse; a "#" can still appear
	// here when the path contained a percent-encoded one.
	if hash := strings.Index(trimmed, "#"); hash >= 0 {
		return trimmed[:hash]
	}
	return trimmed
}
// resolveURL resolves href against base and returns the absolute URL.
//
// An href that already carries a scheme (for example "https://…") is returned
// unchanged. If either href or base cannot be parsed, href is returned
// unchanged.
func resolveURL(base string, href string) string {
	hrefURL, err := url.Parse(href)
	if err != nil {
		return href
	}
	// Decide absoluteness from the parsed scheme, not from a textual "http"
	// prefix: relative links such as "http.client.html" start with "http"
	// too and must still be resolved against base.
	if hrefURL.IsAbs() {
		return href
	}

	baseURL, err := url.Parse(base)
	if err != nil {
		return href
	}
	return baseURL.ResolveReference(hrefURL).String()
}