mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
first commit
This commit is contained in:
@@ -0,0 +1,555 @@
|
||||
package pythondocs
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// Parser extracts structured documentation data from HTML pages of the
// Python documentation site.
type Parser struct {
	// baseURL is the site root against which relative links are resolved.
	baseURL string
}

// NewParser returns a Parser whose base URL is https://docs.python.org.
func NewParser() *Parser {
	p := new(Parser)
	p.baseURL = "https://docs.python.org"
	return p
}
|
||||
|
||||
func (p *Parser) ParseModulePage(html string, docURL string) (*Module, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
module := &Module{
|
||||
DocURL: docURL,
|
||||
FetchedAt: time.Now(),
|
||||
}
|
||||
|
||||
module.Name = p.extractModuleName(doc)
|
||||
module.Path = module.Name
|
||||
module.Doc = p.extractModuleDoc(doc)
|
||||
module.Synopsis = p.extractSynopsis(doc)
|
||||
module.Version = p.extractVersion(doc)
|
||||
|
||||
module.Classes = p.extractClasses(doc, module.Name, docURL)
|
||||
module.Functions = p.extractFunctions(doc, module.Name, docURL)
|
||||
module.Exceptions = p.extractExceptions(doc, module.Name, docURL)
|
||||
module.Constants = p.extractData(doc, module.Name, docURL)
|
||||
|
||||
return module, nil
|
||||
}
|
||||
|
||||
func (p *Parser) ParseSearchResults(html string) ([]*SearchResult, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var results []*SearchResult
|
||||
|
||||
doc.Find("ul.search li").Each(func(i int, s *goquery.Selection) {
|
||||
result := &SearchResult{}
|
||||
|
||||
classes, _ := s.Attr("class")
|
||||
if strings.Contains(classes, "kind-object") {
|
||||
result.Kind = "object"
|
||||
} else if strings.Contains(classes, "kind-text") {
|
||||
result.Kind = "text"
|
||||
} else if strings.Contains(classes, "kind-title") {
|
||||
result.Kind = "title"
|
||||
}
|
||||
|
||||
link := s.Find("a").First()
|
||||
result.Name = strings.TrimSpace(link.Text())
|
||||
|
||||
if href, exists := link.Attr("href"); exists {
|
||||
result.DocURL = resolveURL(p.baseURL, href)
|
||||
result.Path = extractPathFromURL(href)
|
||||
}
|
||||
|
||||
if score, exists := link.Attr("data-score"); exists {
|
||||
var scoreInt int
|
||||
for _, c := range score {
|
||||
if c >= '0' && c <= '9' {
|
||||
scoreInt = scoreInt*10 + int(c-'0')
|
||||
}
|
||||
}
|
||||
result.Score = scoreInt
|
||||
}
|
||||
|
||||
span := s.Find("span").Last()
|
||||
result.Description = strings.TrimSpace(span.Text())
|
||||
|
||||
results = append(results, result)
|
||||
})
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
func (p *Parser) extractModuleName(doc *goquery.Document) string {
|
||||
section := doc.Find("section[id^='module-']").First()
|
||||
if section.Length() > 0 {
|
||||
id, _ := section.Attr("id")
|
||||
return strings.TrimPrefix(id, "module-")
|
||||
}
|
||||
|
||||
h1 := doc.Find("h1 code").First()
|
||||
if h1.Length() > 0 {
|
||||
return strings.TrimSpace(h1.Text())
|
||||
}
|
||||
|
||||
h1 = doc.Find(".body h1").First()
|
||||
if h1.Length() > 0 {
|
||||
text := h1.Text()
|
||||
if strings.HasPrefix(text, "—") {
|
||||
parts := strings.SplitN(text, "—", 2)
|
||||
if len(parts) > 0 {
|
||||
return strings.TrimSpace(parts[0])
|
||||
}
|
||||
}
|
||||
return strings.TrimSpace(text)
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
func (p *Parser) extractModuleDoc(doc *goquery.Document) string {
|
||||
section := doc.Find("section[id^='module-']").First()
|
||||
if section.Length() == 0 {
|
||||
section = doc.Find(".body").First()
|
||||
}
|
||||
|
||||
docblock := section.Find("p").First()
|
||||
if docblock.Length() > 0 {
|
||||
return strings.TrimSpace(docblock.Text())
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
func (p *Parser) extractSynopsis(doc *goquery.Document) string {
|
||||
text := doc.Find(".body p").First().Text()
|
||||
text = strings.TrimSpace(text)
|
||||
if len(text) > 200 {
|
||||
return text[:197] + "..."
|
||||
}
|
||||
return text
|
||||
}
|
||||
|
||||
func (p *Parser) extractVersion(doc *goquery.Document) string {
|
||||
versionAdded := doc.Find(".versionadded").Text()
|
||||
if versionAdded != "" {
|
||||
re := regexp.MustCompile(`\d+\.\d+`)
|
||||
if match := re.FindString(versionAdded); match != "" {
|
||||
return match
|
||||
}
|
||||
}
|
||||
|
||||
versionChanged := doc.Find(".versionchanged").Text()
|
||||
if versionChanged != "" {
|
||||
re := regexp.MustCompile(`\d+\.\d+`)
|
||||
if match := re.FindString(versionChanged); match != "" {
|
||||
return match
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
func (p *Parser) extractClasses(doc *goquery.Document, moduleName string, docURL string) []*Class {
|
||||
var classes []*Class
|
||||
|
||||
doc.Find("dl.py.class").Each(func(_ int, s *goquery.Selection) {
|
||||
class := &Class{
|
||||
Module: moduleName,
|
||||
}
|
||||
|
||||
dt := s.Find("dt.sig-object").First()
|
||||
if dt.Length() == 0 {
|
||||
dt = s.Find("dt").First()
|
||||
}
|
||||
|
||||
sig := dt.Find("code.sig-prename")
|
||||
class.Name = strings.TrimSpace(sig.Find(".pre").Last().Text())
|
||||
if class.Name == "" {
|
||||
class.Name = strings.TrimSpace(dt.Find(".sig-name").Text())
|
||||
}
|
||||
if class.Name == "" {
|
||||
sigText := dt.Text()
|
||||
sigText = strings.TrimSpace(sigText)
|
||||
parts := strings.Fields(sigText)
|
||||
if len(parts) > 0 {
|
||||
class.Name = parts[0]
|
||||
}
|
||||
}
|
||||
|
||||
if id, exists := dt.Attr("id"); exists {
|
||||
class.QualName = id
|
||||
class.DocURL = docURL + "#" + id
|
||||
} else {
|
||||
class.QualName = class.Name
|
||||
class.DocURL = docURL
|
||||
}
|
||||
|
||||
class.Signature = strings.TrimSpace(dt.Text())
|
||||
|
||||
dd := s.Find("dd").First()
|
||||
class.Doc = strings.TrimSpace(dd.Find("p").First().Text())
|
||||
|
||||
bases := dt.Find("a.reference.internal")
|
||||
bases.Each(func(_ int, b *goquery.Selection) {
|
||||
base := strings.TrimSpace(b.Text())
|
||||
if base != "" && base != class.Name {
|
||||
class.Bases = append(class.Bases, base)
|
||||
}
|
||||
})
|
||||
|
||||
class.Methods = p.extractMethods(s, class.Name, docURL)
|
||||
class.ClassMethods = p.extractClassMethods(s, class.Name, docURL)
|
||||
class.StaticMethods = p.extractStaticMethods(s, class.Name, docURL)
|
||||
class.Attributes = p.extractAttributes(s, class.Name, docURL)
|
||||
|
||||
if class.Name != "" {
|
||||
classes = append(classes, class)
|
||||
}
|
||||
})
|
||||
|
||||
return classes
|
||||
}
|
||||
|
||||
func (p *Parser) extractFunctions(doc *goquery.Document, moduleName string, docURL string) []*Function {
|
||||
var functions []*Function
|
||||
|
||||
doc.Find("dl.py.function").Each(func(_ int, s *goquery.Selection) {
|
||||
fn := &Function{
|
||||
Module: moduleName,
|
||||
}
|
||||
|
||||
dt := s.Find("dt.sig-object").First()
|
||||
if dt.Length() == 0 {
|
||||
dt = s.Find("dt").First()
|
||||
}
|
||||
|
||||
sig := dt.Find("code.sig-prename")
|
||||
fn.Name = strings.TrimSpace(sig.Find(".pre").Last().Text())
|
||||
if fn.Name == "" {
|
||||
fn.Name = strings.TrimSpace(dt.Find(".sig-name").Text())
|
||||
}
|
||||
if fn.Name == "" {
|
||||
sigText := dt.Text()
|
||||
sigText = strings.TrimSpace(sigText)
|
||||
if idx := strings.Index(sigText, "("); idx > 0 {
|
||||
fn.Name = strings.TrimSpace(sigText[:idx])
|
||||
}
|
||||
}
|
||||
|
||||
if id, exists := dt.Attr("id"); exists {
|
||||
fn.QualName = id
|
||||
fn.DocURL = docURL + "#" + id
|
||||
} else {
|
||||
fn.QualName = fn.Name
|
||||
fn.DocURL = docURL
|
||||
}
|
||||
|
||||
fn.Signature = strings.TrimSpace(dt.Text())
|
||||
|
||||
dd := s.Find("dd").First()
|
||||
fn.Doc = strings.TrimSpace(dd.Find("p").First().Text())
|
||||
|
||||
fn.Parameters = p.extractParameters(dt)
|
||||
|
||||
if class := s.Find("dl.py.method, dl.py.classmethod, dl.py.staticmethod"); class.Length() > 0 {
|
||||
return
|
||||
}
|
||||
|
||||
if fn.Name != "" {
|
||||
functions = append(functions, fn)
|
||||
}
|
||||
})
|
||||
|
||||
return functions
|
||||
}
|
||||
|
||||
func (p *Parser) extractExceptions(doc *goquery.Document, moduleName string, docURL string) []*Exception {
|
||||
var exceptions []*Exception
|
||||
|
||||
doc.Find("dl.py.exception").Each(func(_ int, s *goquery.Selection) {
|
||||
exc := &Exception{
|
||||
Module: moduleName,
|
||||
}
|
||||
|
||||
dt := s.Find("dt.sig-object").First()
|
||||
if dt.Length() == 0 {
|
||||
dt = s.Find("dt").First()
|
||||
}
|
||||
|
||||
sig := dt.Find("code.sig-prename")
|
||||
exc.Name = strings.TrimSpace(sig.Find(".pre").Last().Text())
|
||||
if exc.Name == "" {
|
||||
exc.Name = strings.TrimSpace(dt.Find(".sig-name").Text())
|
||||
}
|
||||
if exc.Name == "" {
|
||||
sigText := dt.Text()
|
||||
sigText = strings.TrimSpace(sigText)
|
||||
if idx := strings.Index(sigText, "("); idx > 0 {
|
||||
exc.Name = strings.TrimSpace(sigText[:idx])
|
||||
}
|
||||
}
|
||||
|
||||
if id, exists := dt.Attr("id"); exists {
|
||||
exc.QualName = id
|
||||
exc.DocURL = docURL + "#" + id
|
||||
} else {
|
||||
exc.QualName = exc.Name
|
||||
exc.DocURL = docURL
|
||||
}
|
||||
|
||||
exc.Signature = strings.TrimSpace(dt.Text())
|
||||
|
||||
dd := s.Find("dd").First()
|
||||
exc.Doc = strings.TrimSpace(dd.Find("p").First().Text())
|
||||
|
||||
if exc.Name != "" {
|
||||
exceptions = append(exceptions, exc)
|
||||
}
|
||||
})
|
||||
|
||||
return exceptions
|
||||
}
|
||||
|
||||
func (p *Parser) extractData(doc *goquery.Document, moduleName string, docURL string) []*Data {
|
||||
var dataList []*Data
|
||||
|
||||
doc.Find("dl.py.data").Each(func(_ int, s *goquery.Selection) {
|
||||
data := &Data{
|
||||
Module: moduleName,
|
||||
}
|
||||
|
||||
dt := s.Find("dt.sig-object").First()
|
||||
if dt.Length() == 0 {
|
||||
dt = s.Find("dt").First()
|
||||
}
|
||||
|
||||
sig := dt.Find("code.sig-prename")
|
||||
data.Name = strings.TrimSpace(sig.Find(".pre").Last().Text())
|
||||
if data.Name == "" {
|
||||
sigText := dt.Text()
|
||||
sigText = strings.TrimSpace(sigText)
|
||||
data.Name = strings.Fields(sigText)[0]
|
||||
}
|
||||
|
||||
if id, exists := dt.Attr("id"); exists {
|
||||
data.DocURL = docURL + "#" + id
|
||||
} else {
|
||||
data.DocURL = docURL
|
||||
}
|
||||
|
||||
dd := s.Find("dd").First()
|
||||
data.Doc = strings.TrimSpace(dd.Find("p").First().Text())
|
||||
|
||||
if data.Name != "" {
|
||||
dataList = append(dataList, data)
|
||||
}
|
||||
})
|
||||
|
||||
return dataList
|
||||
}
|
||||
|
||||
func (p *Parser) extractMethods(parent *goquery.Selection, className string, docURL string) []*Method {
|
||||
var methods []*Method
|
||||
|
||||
parent.Find("dl.py.method").Each(func(_ int, s *goquery.Selection) {
|
||||
method := p.parseMethod(s, className, docURL, false, false)
|
||||
if method != nil {
|
||||
methods = append(methods, method)
|
||||
}
|
||||
})
|
||||
|
||||
return methods
|
||||
}
|
||||
|
||||
func (p *Parser) extractClassMethods(parent *goquery.Selection, className string, docURL string) []*Method {
|
||||
var methods []*Method
|
||||
|
||||
parent.Find("dl.py.classmethod").Each(func(_ int, s *goquery.Selection) {
|
||||
method := p.parseMethod(s, className, docURL, true, false)
|
||||
if method != nil {
|
||||
methods = append(methods, method)
|
||||
}
|
||||
})
|
||||
|
||||
return methods
|
||||
}
|
||||
|
||||
func (p *Parser) extractStaticMethods(parent *goquery.Selection, className string, docURL string) []*Method {
|
||||
var methods []*Method
|
||||
|
||||
parent.Find("dl.py.staticmethod").Each(func(_ int, s *goquery.Selection) {
|
||||
method := p.parseMethod(s, className, docURL, false, true)
|
||||
if method != nil {
|
||||
methods = append(methods, method)
|
||||
}
|
||||
})
|
||||
|
||||
return methods
|
||||
}
|
||||
|
||||
func (p *Parser) parseMethod(s *goquery.Selection, className string, docURL string, isClassMethod bool, isStatic bool) *Method {
|
||||
method := &Method{
|
||||
Class: className,
|
||||
IsClassMethod: isClassMethod,
|
||||
IsStatic: isStatic,
|
||||
}
|
||||
|
||||
dt := s.Find("dt.sig-object").First()
|
||||
if dt.Length() == 0 {
|
||||
dt = s.Find("dt").First()
|
||||
}
|
||||
|
||||
sig := dt.Find("code.sig-prename")
|
||||
method.Name = strings.TrimSpace(sig.Find(".pre").Last().Text())
|
||||
if method.Name == "" {
|
||||
method.Name = strings.TrimSpace(dt.Find(".sig-name").Text())
|
||||
}
|
||||
if method.Name == "" {
|
||||
sigText := dt.Text()
|
||||
sigText = strings.TrimSpace(sigText)
|
||||
if idx := strings.Index(sigText, "("); idx > 0 {
|
||||
name := strings.TrimSpace(sigText[:idx])
|
||||
parts := strings.Split(name, ".")
|
||||
method.Name = parts[len(parts)-1]
|
||||
}
|
||||
}
|
||||
|
||||
if id, exists := dt.Attr("id"); exists {
|
||||
method.QualName = id
|
||||
method.DocURL = docURL + "#" + id
|
||||
} else {
|
||||
method.QualName = className + "." + method.Name
|
||||
method.DocURL = docURL
|
||||
}
|
||||
|
||||
method.Signature = strings.TrimSpace(dt.Text())
|
||||
|
||||
dd := s.Find("dd").First()
|
||||
method.Doc = strings.TrimSpace(dd.Find("p").First().Text())
|
||||
|
||||
method.Parameters = p.extractParameters(dt)
|
||||
|
||||
if method.Name != "" {
|
||||
return method
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *Parser) extractAttributes(parent *goquery.Selection, className string, docURL string) []*Attribute {
|
||||
var attributes []*Attribute
|
||||
|
||||
parent.Find("dl.py.attribute").Each(func(_ int, s *goquery.Selection) {
|
||||
attr := &Attribute{
|
||||
Class: className,
|
||||
}
|
||||
|
||||
dt := s.Find("dt.sig-object").First()
|
||||
if dt.Length() == 0 {
|
||||
dt = s.Find("dt").First()
|
||||
}
|
||||
|
||||
sig := dt.Find("code.sig-prename")
|
||||
attr.Name = strings.TrimSpace(sig.Find(".pre").Last().Text())
|
||||
if attr.Name == "" {
|
||||
sigText := dt.Text()
|
||||
sigText = strings.TrimSpace(sigText)
|
||||
attr.Name = strings.Fields(sigText)[0]
|
||||
}
|
||||
|
||||
if id, exists := dt.Attr("id"); exists {
|
||||
attr.DocURL = docURL + "#" + id
|
||||
} else {
|
||||
attr.DocURL = docURL
|
||||
}
|
||||
|
||||
dd := s.Find("dd").First()
|
||||
attr.Doc = strings.TrimSpace(dd.Find("p").First().Text())
|
||||
|
||||
if attr.Name != "" {
|
||||
attributes = append(attributes, attr)
|
||||
}
|
||||
})
|
||||
|
||||
return attributes
|
||||
}
|
||||
|
||||
func (p *Parser) extractParameters(dt *goquery.Selection) []*Param {
|
||||
var params []*Param
|
||||
|
||||
dt.Find("em.sig-param").Each(func(_ int, em *goquery.Selection) {
|
||||
param := &Param{}
|
||||
|
||||
text := strings.TrimSpace(em.Text())
|
||||
|
||||
if strings.HasPrefix(text, "*") && !strings.HasPrefix(text, "**") {
|
||||
param.IsVarArgs = true
|
||||
text = strings.TrimPrefix(text, "*")
|
||||
} else if strings.HasPrefix(text, "**") {
|
||||
param.IsKWArgs = true
|
||||
text = strings.TrimPrefix(text, "**")
|
||||
}
|
||||
|
||||
if strings.Contains(text, "=") {
|
||||
parts := strings.SplitN(text, "=", 2)
|
||||
param.Name = strings.TrimSpace(parts[0])
|
||||
param.Default = strings.TrimSpace(parts[1])
|
||||
} else {
|
||||
param.Name = text
|
||||
}
|
||||
|
||||
if param.Name != "" {
|
||||
params = append(params, param)
|
||||
}
|
||||
})
|
||||
|
||||
return params
|
||||
}
|
||||
|
||||
// extractPathFromURL reduces a link to a bare documentation path: the URL
// path of href without a trailing ".html", without a trailing "/" and
// without a leading "/". Anything from a "#" that is still present in the
// decoded path onwards is removed as well. When href cannot be parsed as a
// URL it is returned unchanged.
func extractPathFromURL(href string) string {
	parsed, err := url.Parse(href)
	if err != nil {
		return href
	}

	trimmed := strings.TrimSuffix(parsed.Path, ".html")
	trimmed = strings.TrimPrefix(strings.TrimSuffix(trimmed, "/"), "/")

	if hash := strings.Index(trimmed, "#"); hash >= 0 {
		return trimmed[:hash]
	}
	return trimmed
}
|
||||
|
||||
// resolveURL turns href into an absolute URL using base as the reference
// point.
//
// An href that already starts with "http://" or "https://" is returned
// unchanged. Any other href is resolved against base. When either base or
// href cannot be parsed as a URL, href is returned unchanged.
func resolveURL(base string, href string) string {
	// Match the full scheme prefix: a bare "http" test would also catch
	// relative links such as "http.client.html" and leave them unresolved.
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}

	baseURL, err := url.Parse(base)
	if err != nil {
		return href
	}

	hrefURL, err := url.Parse(href)
	if err != nil {
		return href
	}

	return baseURL.ResolveReference(hrefURL).String()
}
|
||||
Reference in New Issue
Block a user