package pythondocs import ( "net/url" "regexp" "strings" "time" "github.com/PuerkitoBio/goquery" "github.com/yourorg/devour/pkg/parserutil" ) type Parser struct { baseURL string } func NewParser() *Parser { return &Parser{ baseURL: "https://docs.python.org", } } func (p *Parser) ParseModulePage(html string, docURL string) (*Module, error) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { return nil, err } module := &Module{ DocURL: docURL, FetchedAt: time.Now(), } module.Name = p.extractModuleName(doc) module.Path = module.Name module.Doc = p.extractModuleDoc(doc) module.Synopsis = p.extractSynopsis(doc) module.Version = p.extractVersion(doc) module.Classes = p.extractClasses(doc, module.Name, docURL) module.Functions = p.extractFunctions(doc, module.Name, docURL) module.Exceptions = p.extractExceptions(doc, module.Name, docURL) module.Constants = p.extractData(doc, module.Name, docURL) return module, nil } func (p *Parser) ParseSearchResults(html string) ([]*SearchResult, error) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { return nil, err } var results []*SearchResult doc.Find("ul.search li").Each(func(i int, s *goquery.Selection) { result := &SearchResult{} classes, _ := s.Attr("class") if strings.Contains(classes, "kind-object") { result.Kind = "object" } else if strings.Contains(classes, "kind-text") { result.Kind = "text" } else if strings.Contains(classes, "kind-title") { result.Kind = "title" } link := s.Find("a").First() result.Name = strings.TrimSpace(link.Text()) if href, exists := link.Attr("href"); exists { result.DocURL = resolveURL(p.baseURL, href) result.Path = extractPathFromURL(href) } if score, exists := link.Attr("data-score"); exists { var scoreInt int for _, c := range score { if c >= '0' && c <= '9' { scoreInt = scoreInt*10 + int(c-'0') } } result.Score = scoreInt } span := s.Find("span").Last() result.Description = strings.TrimSpace(span.Text()) results = append(results, result) }) return results, nil } func (p *Parser) extractModuleName(doc *goquery.Document) string { section := doc.Find("section[id^='module-']").First() if section.Length() > 0 { id, _ := section.Attr("id") return strings.TrimPrefix(id, "module-") } h1 := doc.Find("h1 code").First() if h1.Length() > 0 { return strings.TrimSpace(h1.Text()) } h1 = doc.Find(".body h1").First() if h1.Length() > 0 { text := h1.Text() if strings.HasPrefix(text, "—") { parts := strings.SplitN(text, "—", 2) if len(parts) > 0 { return strings.TrimSpace(parts[0]) } } return strings.TrimSpace(text) } return "" } func (p *Parser) extractModuleDoc(doc *goquery.Document) string { section := doc.Find("section[id^='module-']").First() if section.Length() == 0 { section = doc.Find(".body").First() } docblock := section.Find("p").First() if docblock.Length() > 0 { return strings.TrimSpace(docblock.Text()) } return "" } func (p *Parser) extractSynopsis(doc *goquery.Document) string { text := doc.Find(".body p").First().Text() text = strings.TrimSpace(text) if len(text) > 200 { return text[:197] + "..." } return text } func (p *Parser) extractVersion(doc *goquery.Document) string { versionAdded := doc.Find(".versionadded").Text() if versionAdded != "" { re := regexp.MustCompile(`\d+\.\d+`) if match := re.FindString(versionAdded); match != "" { return match } } versionChanged := doc.Find(".versionchanged").Text() if versionChanged != "" { re := regexp.MustCompile(`\d+\.\d+`) if match := re.FindString(versionChanged); match != "" { return match } } return "" } func (p *Parser) extractClasses(doc *goquery.Document, moduleName string, docURL string) []*Class { var classes []*Class doc.Find("dl.py.class").Each(func(_ int, s *goquery.Selection) { class := &Class{ Module: moduleName, } dt := s.Find("dt.sig-object").First() if dt.Length() == 0 { dt = s.Find("dt").First() } sig := dt.Find("code.sig-prename") class.Name = strings.TrimSpace(sig.Find(".pre").Last().Text()) if class.Name == "" { class.Name = strings.TrimSpace(dt.Find(".sig-name").Text()) } if class.Name == "" { sigText := dt.Text() sigText = strings.TrimSpace(sigText) parts := strings.Fields(sigText) if len(parts) > 0 { class.Name = parts[0] } } if id, exists := dt.Attr("id"); exists { class.QualName = id class.DocURL = docURL + "#" + id } else { class.QualName = class.Name class.DocURL = docURL } class.Signature = strings.TrimSpace(dt.Text()) dd := s.Find("dd").First() class.Doc = strings.TrimSpace(dd.Find("p").First().Text()) bases := dt.Find("a.reference.internal") bases.Each(func(_ int, b *goquery.Selection) { base := strings.TrimSpace(b.Text()) if base != "" && base != class.Name { class.Bases = append(class.Bases, base) } }) class.Methods = p.extractMethods(s, class.Name, docURL) class.ClassMethods = p.extractClassMethods(s, class.Name, docURL) class.StaticMethods = p.extractStaticMethods(s, class.Name, docURL) class.Attributes = p.extractAttributes(s, class.Name, docURL) if class.Name != "" { classes = append(classes, class) } }) return classes } func (p *Parser) extractFunctions(doc *goquery.Document, moduleName string, docURL string) []*Function { var functions []*Function doc.Find("dl.py.function").Each(func(_ int, s *goquery.Selection) { fn := &Function{ Module: moduleName, } dt := s.Find("dt.sig-object").First() if dt.Length() == 0 { dt = s.Find("dt").First() } sig := dt.Find("code.sig-prename") fn.Name = strings.TrimSpace(sig.Find(".pre").Last().Text()) if fn.Name == "" { fn.Name = strings.TrimSpace(dt.Find(".sig-name").Text()) } if fn.Name == "" { sigText := dt.Text() sigText = strings.TrimSpace(sigText) if idx := strings.Index(sigText, "("); idx > 0 { fn.Name = strings.TrimSpace(sigText[:idx]) } } if id, exists := dt.Attr("id"); exists { fn.QualName = id fn.DocURL = docURL + "#" + id } else { fn.QualName = fn.Name fn.DocURL = docURL } fn.Signature = strings.TrimSpace(dt.Text()) dd := s.Find("dd").First() fn.Doc = strings.TrimSpace(dd.Find("p").First().Text()) fn.Parameters = p.extractParameters(dt) if class := s.Find("dl.py.method, dl.py.classmethod, dl.py.staticmethod"); class.Length() > 0 { return } if fn.Name != "" { functions = append(functions, fn) } }) return functions } func (p *Parser) extractExceptions(doc *goquery.Document, moduleName string, docURL string) []*Exception { var exceptions []*Exception doc.Find("dl.py.exception").Each(func(_ int, s *goquery.Selection) { exc := &Exception{ Module: moduleName, } dt := s.Find("dt.sig-object").First() if dt.Length() == 0 { dt = s.Find("dt").First() } sig := dt.Find("code.sig-prename") exc.Name = strings.TrimSpace(sig.Find(".pre").Last().Text()) if exc.Name == "" { exc.Name = strings.TrimSpace(dt.Find(".sig-name").Text()) } if exc.Name == "" { sigText := dt.Text() sigText = strings.TrimSpace(sigText) if idx := strings.Index(sigText, "("); idx > 0 { exc.Name = strings.TrimSpace(sigText[:idx]) } } if id, exists := dt.Attr("id"); exists { exc.QualName = id exc.DocURL = docURL + "#" + id } else { exc.QualName = exc.Name exc.DocURL = docURL } exc.Signature = strings.TrimSpace(dt.Text()) dd := s.Find("dd").First() exc.Doc = strings.TrimSpace(dd.Find("p").First().Text()) if exc.Name != "" { exceptions = append(exceptions, exc) } }) return exceptions } func (p *Parser) extractData(doc *goquery.Document, moduleName string, docURL string) []*Data { var dataList []*Data doc.Find("dl.py.data").Each(func(_ int, s *goquery.Selection) { data := &Data{ Module: moduleName, } dt := s.Find("dt.sig-object").First() if dt.Length() == 0 { dt = s.Find("dt").First() } sig := dt.Find("code.sig-prename") data.Name = strings.TrimSpace(sig.Find(".pre").Last().Text()) if data.Name == "" { sigText := dt.Text() sigText = strings.TrimSpace(sigText) data.Name = strings.Fields(sigText)[0] } if id, exists := dt.Attr("id"); exists { data.DocURL = docURL + "#" + id } else { data.DocURL = docURL } dd := s.Find("dd").First() data.Doc = strings.TrimSpace(dd.Find("p").First().Text()) if data.Name != "" { dataList = append(dataList, data) } }) return dataList } func (p *Parser) extractMethods(parent *goquery.Selection, className string, docURL string) []*Method { var methods []*Method parent.Find("dl.py.method").Each(func(_ int, s *goquery.Selection) { method := p.parseMethod(s, className, docURL, false, false) if method != nil { methods = append(methods, method) } }) return methods } func (p *Parser) extractClassMethods(parent *goquery.Selection, className string, docURL string) []*Method { var methods []*Method parent.Find("dl.py.classmethod").Each(func(_ int, s *goquery.Selection) { method := p.parseMethod(s, className, docURL, true, false) if method != nil { methods = append(methods, method) } }) return methods } func (p *Parser) extractStaticMethods(parent *goquery.Selection, className string, docURL string) []*Method { var methods []*Method parent.Find("dl.py.staticmethod").Each(func(_ int, s *goquery.Selection) { method := p.parseMethod(s, className, docURL, false, true) if method != nil { methods = append(methods, method) } }) return methods } func (p *Parser) parseMethod(s *goquery.Selection, className string, docURL string, isClassMethod bool, isStatic bool) *Method { method := &Method{ Class: className, IsClassMethod: isClassMethod, IsStatic: isStatic, } dt := s.Find("dt.sig-object").First() if dt.Length() == 0 { dt = s.Find("dt").First() } sig := dt.Find("code.sig-prename") method.Name = strings.TrimSpace(sig.Find(".pre").Last().Text()) if method.Name == "" { method.Name = strings.TrimSpace(dt.Find(".sig-name").Text()) } if method.Name == "" { sigText := dt.Text() sigText = strings.TrimSpace(sigText) if idx := strings.Index(sigText, "("); idx > 0 { name := strings.TrimSpace(sigText[:idx]) parts := strings.Split(name, ".") method.Name = parts[len(parts)-1] } } if id, exists := dt.Attr("id"); exists { method.QualName = id method.DocURL = docURL + "#" + id } else { method.QualName = className + "." + method.Name method.DocURL = docURL } method.Signature = strings.TrimSpace(dt.Text()) dd := s.Find("dd").First() method.Doc = strings.TrimSpace(dd.Find("p").First().Text()) method.Parameters = p.extractParameters(dt) if method.Name != "" { return method } return nil } func (p *Parser) extractAttributes(parent *goquery.Selection, className string, docURL string) []*Attribute { var attributes []*Attribute parent.Find("dl.py.attribute").Each(func(_ int, s *goquery.Selection) { attr := &Attribute{ Class: className, } dt := s.Find("dt.sig-object").First() if dt.Length() == 0 { dt = s.Find("dt").First() } sig := dt.Find("code.sig-prename") attr.Name = strings.TrimSpace(sig.Find(".pre").Last().Text()) if attr.Name == "" { sigText := dt.Text() sigText = strings.TrimSpace(sigText) attr.Name = strings.Fields(sigText)[0] } if id, exists := dt.Attr("id"); exists { attr.DocURL = docURL + "#" + id } else { attr.DocURL = docURL } dd := s.Find("dd").First() attr.Doc = strings.TrimSpace(dd.Find("p").First().Text()) if attr.Name != "" { attributes = append(attributes, attr) } }) return attributes } func (p *Parser) extractParameters(dt *goquery.Selection) []*Param { var params []*Param dt.Find("em.sig-param").Each(func(_ int, em *goquery.Selection) { param := &Param{} text := strings.TrimSpace(em.Text()) if strings.HasPrefix(text, "*") && !strings.HasPrefix(text, "**") { param.IsVarArgs = true text = strings.TrimPrefix(text, "*") } else if strings.HasPrefix(text, "**") { param.IsKWArgs = true text = strings.TrimPrefix(text, "**") } if strings.Contains(text, "=") { parts := strings.SplitN(text, "=", 2) param.Name = strings.TrimSpace(parts[0]) param.Default = strings.TrimSpace(parts[1]) } else { param.Name = text } if param.Name != "" { params = append(params, param) } }) return params } func extractPathFromURL(href string) string { u, err := url.Parse(href) if err != nil { return href } path := u.Path path = strings.TrimSuffix(path, ".html") path = strings.TrimSuffix(path, "/") path = strings.TrimPrefix(path, "/") if strings.Contains(path, "#") { parts := strings.Split(path, "#") path = parts[0] } return path } func resolveURL(base string, href string) string { return parserutil.ResolveURL(base, href) }