mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 12:33:04 +00:00
556 lines
13 KiB
Go
556 lines
13 KiB
Go
package pythondocs
|
|
|
|
import (
|
|
"net/url"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// Parser turns Python documentation HTML into structured records.
type Parser struct {
	// baseURL is the root against which relative links found in search
	// results are resolved.
	baseURL string
}
|
|
|
|
func NewParser() *Parser {
|
|
return &Parser{
|
|
baseURL: "https://docs.python.org",
|
|
}
|
|
}
|
|
|
|
func (p *Parser) ParseModulePage(html string, docURL string) (*Module, error) {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
module := &Module{
|
|
DocURL: docURL,
|
|
FetchedAt: time.Now(),
|
|
}
|
|
|
|
module.Name = p.extractModuleName(doc)
|
|
module.Path = module.Name
|
|
module.Doc = p.extractModuleDoc(doc)
|
|
module.Synopsis = p.extractSynopsis(doc)
|
|
module.Version = p.extractVersion(doc)
|
|
|
|
module.Classes = p.extractClasses(doc, module.Name, docURL)
|
|
module.Functions = p.extractFunctions(doc, module.Name, docURL)
|
|
module.Exceptions = p.extractExceptions(doc, module.Name, docURL)
|
|
module.Constants = p.extractData(doc, module.Name, docURL)
|
|
|
|
return module, nil
|
|
}
|
|
|
|
func (p *Parser) ParseSearchResults(html string) ([]*SearchResult, error) {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var results []*SearchResult
|
|
|
|
doc.Find("ul.search li").Each(func(i int, s *goquery.Selection) {
|
|
result := &SearchResult{}
|
|
|
|
classes, _ := s.Attr("class")
|
|
if strings.Contains(classes, "kind-object") {
|
|
result.Kind = "object"
|
|
} else if strings.Contains(classes, "kind-text") {
|
|
result.Kind = "text"
|
|
} else if strings.Contains(classes, "kind-title") {
|
|
result.Kind = "title"
|
|
}
|
|
|
|
link := s.Find("a").First()
|
|
result.Name = strings.TrimSpace(link.Text())
|
|
|
|
if href, exists := link.Attr("href"); exists {
|
|
result.DocURL = resolveURL(p.baseURL, href)
|
|
result.Path = extractPathFromURL(href)
|
|
}
|
|
|
|
if score, exists := link.Attr("data-score"); exists {
|
|
var scoreInt int
|
|
for _, c := range score {
|
|
if c >= '0' && c <= '9' {
|
|
scoreInt = scoreInt*10 + int(c-'0')
|
|
}
|
|
}
|
|
result.Score = scoreInt
|
|
}
|
|
|
|
span := s.Find("span").Last()
|
|
result.Description = strings.TrimSpace(span.Text())
|
|
|
|
results = append(results, result)
|
|
})
|
|
|
|
return results, nil
|
|
}
|
|
|
|
func (p *Parser) extractModuleName(doc *goquery.Document) string {
|
|
section := doc.Find("section[id^='module-']").First()
|
|
if section.Length() > 0 {
|
|
id, _ := section.Attr("id")
|
|
return strings.TrimPrefix(id, "module-")
|
|
}
|
|
|
|
h1 := doc.Find("h1 code").First()
|
|
if h1.Length() > 0 {
|
|
return strings.TrimSpace(h1.Text())
|
|
}
|
|
|
|
h1 = doc.Find(".body h1").First()
|
|
if h1.Length() > 0 {
|
|
text := h1.Text()
|
|
if strings.HasPrefix(text, "—") {
|
|
parts := strings.SplitN(text, "—", 2)
|
|
if len(parts) > 0 {
|
|
return strings.TrimSpace(parts[0])
|
|
}
|
|
}
|
|
return strings.TrimSpace(text)
|
|
}
|
|
|
|
return ""
|
|
}
|
|
|
|
func (p *Parser) extractModuleDoc(doc *goquery.Document) string {
|
|
section := doc.Find("section[id^='module-']").First()
|
|
if section.Length() == 0 {
|
|
section = doc.Find(".body").First()
|
|
}
|
|
|
|
docblock := section.Find("p").First()
|
|
if docblock.Length() > 0 {
|
|
return strings.TrimSpace(docblock.Text())
|
|
}
|
|
|
|
return ""
|
|
}
|
|
|
|
func (p *Parser) extractSynopsis(doc *goquery.Document) string {
|
|
text := doc.Find(".body p").First().Text()
|
|
text = strings.TrimSpace(text)
|
|
if len(text) > 200 {
|
|
return text[:197] + "..."
|
|
}
|
|
return text
|
|
}
|
|
|
|
func (p *Parser) extractVersion(doc *goquery.Document) string {
|
|
versionAdded := doc.Find(".versionadded").Text()
|
|
if versionAdded != "" {
|
|
re := regexp.MustCompile(`\d+\.\d+`)
|
|
if match := re.FindString(versionAdded); match != "" {
|
|
return match
|
|
}
|
|
}
|
|
|
|
versionChanged := doc.Find(".versionchanged").Text()
|
|
if versionChanged != "" {
|
|
re := regexp.MustCompile(`\d+\.\d+`)
|
|
if match := re.FindString(versionChanged); match != "" {
|
|
return match
|
|
}
|
|
}
|
|
|
|
return ""
|
|
}
|
|
|
|
func (p *Parser) extractClasses(doc *goquery.Document, moduleName string, docURL string) []*Class {
|
|
var classes []*Class
|
|
|
|
doc.Find("dl.py.class").Each(func(_ int, s *goquery.Selection) {
|
|
class := &Class{
|
|
Module: moduleName,
|
|
}
|
|
|
|
dt := s.Find("dt.sig-object").First()
|
|
if dt.Length() == 0 {
|
|
dt = s.Find("dt").First()
|
|
}
|
|
|
|
sig := dt.Find("code.sig-prename")
|
|
class.Name = strings.TrimSpace(sig.Find(".pre").Last().Text())
|
|
if class.Name == "" {
|
|
class.Name = strings.TrimSpace(dt.Find(".sig-name").Text())
|
|
}
|
|
if class.Name == "" {
|
|
sigText := dt.Text()
|
|
sigText = strings.TrimSpace(sigText)
|
|
parts := strings.Fields(sigText)
|
|
if len(parts) > 0 {
|
|
class.Name = parts[0]
|
|
}
|
|
}
|
|
|
|
if id, exists := dt.Attr("id"); exists {
|
|
class.QualName = id
|
|
class.DocURL = docURL + "#" + id
|
|
} else {
|
|
class.QualName = class.Name
|
|
class.DocURL = docURL
|
|
}
|
|
|
|
class.Signature = strings.TrimSpace(dt.Text())
|
|
|
|
dd := s.Find("dd").First()
|
|
class.Doc = strings.TrimSpace(dd.Find("p").First().Text())
|
|
|
|
bases := dt.Find("a.reference.internal")
|
|
bases.Each(func(_ int, b *goquery.Selection) {
|
|
base := strings.TrimSpace(b.Text())
|
|
if base != "" && base != class.Name {
|
|
class.Bases = append(class.Bases, base)
|
|
}
|
|
})
|
|
|
|
class.Methods = p.extractMethods(s, class.Name, docURL)
|
|
class.ClassMethods = p.extractClassMethods(s, class.Name, docURL)
|
|
class.StaticMethods = p.extractStaticMethods(s, class.Name, docURL)
|
|
class.Attributes = p.extractAttributes(s, class.Name, docURL)
|
|
|
|
if class.Name != "" {
|
|
classes = append(classes, class)
|
|
}
|
|
})
|
|
|
|
return classes
|
|
}
|
|
|
|
func (p *Parser) extractFunctions(doc *goquery.Document, moduleName string, docURL string) []*Function {
|
|
var functions []*Function
|
|
|
|
doc.Find("dl.py.function").Each(func(_ int, s *goquery.Selection) {
|
|
fn := &Function{
|
|
Module: moduleName,
|
|
}
|
|
|
|
dt := s.Find("dt.sig-object").First()
|
|
if dt.Length() == 0 {
|
|
dt = s.Find("dt").First()
|
|
}
|
|
|
|
sig := dt.Find("code.sig-prename")
|
|
fn.Name = strings.TrimSpace(sig.Find(".pre").Last().Text())
|
|
if fn.Name == "" {
|
|
fn.Name = strings.TrimSpace(dt.Find(".sig-name").Text())
|
|
}
|
|
if fn.Name == "" {
|
|
sigText := dt.Text()
|
|
sigText = strings.TrimSpace(sigText)
|
|
if idx := strings.Index(sigText, "("); idx > 0 {
|
|
fn.Name = strings.TrimSpace(sigText[:idx])
|
|
}
|
|
}
|
|
|
|
if id, exists := dt.Attr("id"); exists {
|
|
fn.QualName = id
|
|
fn.DocURL = docURL + "#" + id
|
|
} else {
|
|
fn.QualName = fn.Name
|
|
fn.DocURL = docURL
|
|
}
|
|
|
|
fn.Signature = strings.TrimSpace(dt.Text())
|
|
|
|
dd := s.Find("dd").First()
|
|
fn.Doc = strings.TrimSpace(dd.Find("p").First().Text())
|
|
|
|
fn.Parameters = p.extractParameters(dt)
|
|
|
|
if class := s.Find("dl.py.method, dl.py.classmethod, dl.py.staticmethod"); class.Length() > 0 {
|
|
return
|
|
}
|
|
|
|
if fn.Name != "" {
|
|
functions = append(functions, fn)
|
|
}
|
|
})
|
|
|
|
return functions
|
|
}
|
|
|
|
func (p *Parser) extractExceptions(doc *goquery.Document, moduleName string, docURL string) []*Exception {
|
|
var exceptions []*Exception
|
|
|
|
doc.Find("dl.py.exception").Each(func(_ int, s *goquery.Selection) {
|
|
exc := &Exception{
|
|
Module: moduleName,
|
|
}
|
|
|
|
dt := s.Find("dt.sig-object").First()
|
|
if dt.Length() == 0 {
|
|
dt = s.Find("dt").First()
|
|
}
|
|
|
|
sig := dt.Find("code.sig-prename")
|
|
exc.Name = strings.TrimSpace(sig.Find(".pre").Last().Text())
|
|
if exc.Name == "" {
|
|
exc.Name = strings.TrimSpace(dt.Find(".sig-name").Text())
|
|
}
|
|
if exc.Name == "" {
|
|
sigText := dt.Text()
|
|
sigText = strings.TrimSpace(sigText)
|
|
if idx := strings.Index(sigText, "("); idx > 0 {
|
|
exc.Name = strings.TrimSpace(sigText[:idx])
|
|
}
|
|
}
|
|
|
|
if id, exists := dt.Attr("id"); exists {
|
|
exc.QualName = id
|
|
exc.DocURL = docURL + "#" + id
|
|
} else {
|
|
exc.QualName = exc.Name
|
|
exc.DocURL = docURL
|
|
}
|
|
|
|
exc.Signature = strings.TrimSpace(dt.Text())
|
|
|
|
dd := s.Find("dd").First()
|
|
exc.Doc = strings.TrimSpace(dd.Find("p").First().Text())
|
|
|
|
if exc.Name != "" {
|
|
exceptions = append(exceptions, exc)
|
|
}
|
|
})
|
|
|
|
return exceptions
|
|
}
|
|
|
|
func (p *Parser) extractData(doc *goquery.Document, moduleName string, docURL string) []*Data {
|
|
var dataList []*Data
|
|
|
|
doc.Find("dl.py.data").Each(func(_ int, s *goquery.Selection) {
|
|
data := &Data{
|
|
Module: moduleName,
|
|
}
|
|
|
|
dt := s.Find("dt.sig-object").First()
|
|
if dt.Length() == 0 {
|
|
dt = s.Find("dt").First()
|
|
}
|
|
|
|
sig := dt.Find("code.sig-prename")
|
|
data.Name = strings.TrimSpace(sig.Find(".pre").Last().Text())
|
|
if data.Name == "" {
|
|
sigText := dt.Text()
|
|
sigText = strings.TrimSpace(sigText)
|
|
data.Name = strings.Fields(sigText)[0]
|
|
}
|
|
|
|
if id, exists := dt.Attr("id"); exists {
|
|
data.DocURL = docURL + "#" + id
|
|
} else {
|
|
data.DocURL = docURL
|
|
}
|
|
|
|
dd := s.Find("dd").First()
|
|
data.Doc = strings.TrimSpace(dd.Find("p").First().Text())
|
|
|
|
if data.Name != "" {
|
|
dataList = append(dataList, data)
|
|
}
|
|
})
|
|
|
|
return dataList
|
|
}
|
|
|
|
func (p *Parser) extractMethods(parent *goquery.Selection, className string, docURL string) []*Method {
|
|
var methods []*Method
|
|
|
|
parent.Find("dl.py.method").Each(func(_ int, s *goquery.Selection) {
|
|
method := p.parseMethod(s, className, docURL, false, false)
|
|
if method != nil {
|
|
methods = append(methods, method)
|
|
}
|
|
})
|
|
|
|
return methods
|
|
}
|
|
|
|
func (p *Parser) extractClassMethods(parent *goquery.Selection, className string, docURL string) []*Method {
|
|
var methods []*Method
|
|
|
|
parent.Find("dl.py.classmethod").Each(func(_ int, s *goquery.Selection) {
|
|
method := p.parseMethod(s, className, docURL, true, false)
|
|
if method != nil {
|
|
methods = append(methods, method)
|
|
}
|
|
})
|
|
|
|
return methods
|
|
}
|
|
|
|
func (p *Parser) extractStaticMethods(parent *goquery.Selection, className string, docURL string) []*Method {
|
|
var methods []*Method
|
|
|
|
parent.Find("dl.py.staticmethod").Each(func(_ int, s *goquery.Selection) {
|
|
method := p.parseMethod(s, className, docURL, false, true)
|
|
if method != nil {
|
|
methods = append(methods, method)
|
|
}
|
|
})
|
|
|
|
return methods
|
|
}
|
|
|
|
func (p *Parser) parseMethod(s *goquery.Selection, className string, docURL string, isClassMethod bool, isStatic bool) *Method {
|
|
method := &Method{
|
|
Class: className,
|
|
IsClassMethod: isClassMethod,
|
|
IsStatic: isStatic,
|
|
}
|
|
|
|
dt := s.Find("dt.sig-object").First()
|
|
if dt.Length() == 0 {
|
|
dt = s.Find("dt").First()
|
|
}
|
|
|
|
sig := dt.Find("code.sig-prename")
|
|
method.Name = strings.TrimSpace(sig.Find(".pre").Last().Text())
|
|
if method.Name == "" {
|
|
method.Name = strings.TrimSpace(dt.Find(".sig-name").Text())
|
|
}
|
|
if method.Name == "" {
|
|
sigText := dt.Text()
|
|
sigText = strings.TrimSpace(sigText)
|
|
if idx := strings.Index(sigText, "("); idx > 0 {
|
|
name := strings.TrimSpace(sigText[:idx])
|
|
parts := strings.Split(name, ".")
|
|
method.Name = parts[len(parts)-1]
|
|
}
|
|
}
|
|
|
|
if id, exists := dt.Attr("id"); exists {
|
|
method.QualName = id
|
|
method.DocURL = docURL + "#" + id
|
|
} else {
|
|
method.QualName = className + "." + method.Name
|
|
method.DocURL = docURL
|
|
}
|
|
|
|
method.Signature = strings.TrimSpace(dt.Text())
|
|
|
|
dd := s.Find("dd").First()
|
|
method.Doc = strings.TrimSpace(dd.Find("p").First().Text())
|
|
|
|
method.Parameters = p.extractParameters(dt)
|
|
|
|
if method.Name != "" {
|
|
return method
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (p *Parser) extractAttributes(parent *goquery.Selection, className string, docURL string) []*Attribute {
|
|
var attributes []*Attribute
|
|
|
|
parent.Find("dl.py.attribute").Each(func(_ int, s *goquery.Selection) {
|
|
attr := &Attribute{
|
|
Class: className,
|
|
}
|
|
|
|
dt := s.Find("dt.sig-object").First()
|
|
if dt.Length() == 0 {
|
|
dt = s.Find("dt").First()
|
|
}
|
|
|
|
sig := dt.Find("code.sig-prename")
|
|
attr.Name = strings.TrimSpace(sig.Find(".pre").Last().Text())
|
|
if attr.Name == "" {
|
|
sigText := dt.Text()
|
|
sigText = strings.TrimSpace(sigText)
|
|
attr.Name = strings.Fields(sigText)[0]
|
|
}
|
|
|
|
if id, exists := dt.Attr("id"); exists {
|
|
attr.DocURL = docURL + "#" + id
|
|
} else {
|
|
attr.DocURL = docURL
|
|
}
|
|
|
|
dd := s.Find("dd").First()
|
|
attr.Doc = strings.TrimSpace(dd.Find("p").First().Text())
|
|
|
|
if attr.Name != "" {
|
|
attributes = append(attributes, attr)
|
|
}
|
|
})
|
|
|
|
return attributes
|
|
}
|
|
|
|
func (p *Parser) extractParameters(dt *goquery.Selection) []*Param {
|
|
var params []*Param
|
|
|
|
dt.Find("em.sig-param").Each(func(_ int, em *goquery.Selection) {
|
|
param := &Param{}
|
|
|
|
text := strings.TrimSpace(em.Text())
|
|
|
|
if strings.HasPrefix(text, "*") && !strings.HasPrefix(text, "**") {
|
|
param.IsVarArgs = true
|
|
text = strings.TrimPrefix(text, "*")
|
|
} else if strings.HasPrefix(text, "**") {
|
|
param.IsKWArgs = true
|
|
text = strings.TrimPrefix(text, "**")
|
|
}
|
|
|
|
if strings.Contains(text, "=") {
|
|
parts := strings.SplitN(text, "=", 2)
|
|
param.Name = strings.TrimSpace(parts[0])
|
|
param.Default = strings.TrimSpace(parts[1])
|
|
} else {
|
|
param.Name = text
|
|
}
|
|
|
|
if param.Name != "" {
|
|
params = append(params, param)
|
|
}
|
|
})
|
|
|
|
return params
|
|
}
|
|
|
|
// extractPathFromURL reduces a link target to a bare documentation path.
//
// It takes the path component of href, removes a trailing ".html", then a
// trailing "/", then a leading "/", and finally drops everything from the
// first "#" still present in the path. When href cannot be parsed as a URL
// it is returned unchanged.
func extractPathFromURL(href string) string {
	parsed, err := url.Parse(href)
	if err != nil {
		return href
	}

	trimmed := parsed.Path
	for _, suffix := range []string{".html", "/"} {
		trimmed = strings.TrimSuffix(trimmed, suffix)
	}
	trimmed = strings.TrimPrefix(trimmed, "/")

	if hash := strings.Index(trimmed, "#"); hash >= 0 {
		trimmed = trimmed[:hash]
	}

	return trimmed
}
|
|
|
|
// resolveURL turns href into an absolute URL by resolving it against base.
//
// An href that already carries a scheme is returned unchanged. Whether a
// link is absolute is decided from the parsed URL, not from a textual "http"
// prefix, so relative targets such as "http.client.html" are resolved like
// any other relative link. When either href or base cannot be parsed, href
// is returned unchanged.
func resolveURL(base string, href string) string {
	ref, err := url.Parse(href)
	if err != nil {
		return href
	}
	if ref.IsAbs() {
		return href
	}

	baseURL, err := url.Parse(base)
	if err != nil {
		return href
	}

	return baseURL.ResolveReference(ref).String()
}
|