Files
Devour/pkg/springdocs/parser.go
Tomas Dvorak 55885a0e8f first commit
2026-02-22 10:42:17 +01:00

311 lines
7.8 KiB
Go

package springdocs
import (
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
// Parser turns HTML pages from the Spring documentation site into structured
// values such as Module, Class and SearchResult.
type Parser struct {
// baseURL is the base against which ParseSearchResults resolves result links.
baseURL string
}
// NewParser returns a Parser whose base URL is https://docs.spring.io.
func NewParser() *Parser {
	parser := new(Parser)
	parser.baseURL = "https://docs.spring.io"
	return parser
}
// ParseModulePage parses the HTML of a module documentation page into a Module.
//
// docURL is stored on the result and passed to the extract helpers as the base
// for resolving links found on the page. FetchedAt is set to the time of the
// call. An error is returned only when the HTML cannot be parsed.
func (p *Parser) ParseModulePage(html string, docURL string) (*Module, error) {
	page, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	fetchedAt := time.Now()
	// The name is extracted up front because the class extractor takes it too.
	name := p.extractModuleName(page)

	parsed := &Module{
		DocURL:     docURL,
		FetchedAt:  fetchedAt,
		Name:       name,
		Doc:        p.extractModuleDoc(page),
		Version:    p.extractVersion(page),
		Classes:    p.extractClasses(page, name, docURL),
		Properties: p.extractProperties(page, docURL),
		Guides:     p.extractGuides(page, docURL),
	}
	return parsed, nil
}
// ParseSearchResults parses a search results page and returns one SearchResult
// per matched result element.
//
// Each result takes its name and URL from the first anchor in the element and
// its description from the first summary/description/paragraph element. Links
// are resolved against the parser's base URL. Kind is derived from the
// resolved URL: "class" for /api/, "guide" for /guides/ or /tutorial/, and
// "doc" otherwise.
func (p *Parser) ParseSearchResults(html string) ([]*SearchResult, error) {
	page, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	var hits []*SearchResult
	page.Find(".search-result, .ais-Hits-item, article").Each(func(_ int, item *goquery.Selection) {
		anchor := item.Find("a").First()
		hit := &SearchResult{}
		hit.Name = strings.TrimSpace(anchor.Text())
		if target, ok := anchor.Attr("href"); ok {
			hit.DocURL = resolveURL(p.baseURL, target)
		}
		hit.Doc = strings.TrimSpace(item.Find(".summary, .description, p").First().Text())

		switch {
		case strings.Contains(hit.DocURL, "/api/"):
			hit.Kind = "class"
		case strings.Contains(hit.DocURL, "/guides/"), strings.Contains(hit.DocURL, "/tutorial/"):
			hit.Kind = "guide"
		default:
			hit.Kind = "doc"
		}

		hits = append(hits, hit)
	})
	return hits, nil
}
// extractModuleName returns the first space-delimited word of the page's first
// heading/title element, or the empty string when no such text exists.
func (p *Parser) extractModuleName(doc *goquery.Document) string {
	heading := strings.TrimSpace(doc.Find("h1, .title, .page-title").First().Text())

	// No space (or an empty heading) means the whole heading is the name.
	cut := strings.Index(heading, " ")
	if cut <= 0 {
		return heading
	}
	return heading[:cut]
}
// extractModuleDoc returns the trimmed text of the first lead/intro paragraph
// found on the page, or the empty string when none matches.
func (p *Parser) extractModuleDoc(doc *goquery.Document) string {
	const leadSelector = ".paragraph:first-child p, .lead, #content p:first-of-type"

	lead := doc.Find(leadSelector).First().Text()
	return strings.TrimSpace(lead)
}
// extractVersion returns the documentation version advertised by the page, or
// the empty string when no version element is present.
//
// Only the first matching element is consulted: Selection.Text concatenates
// the text of every matched element, so a page with several version markers
// would otherwise yield a run-together string. When that element carries no
// text, its data-version attribute is used instead.
func (p *Parser) extractVersion(doc *goquery.Document) string {
	versionEl := doc.Find(".version, .doc-version, [data-version]").First()
	if text := strings.TrimSpace(versionEl.Text()); text != "" {
		return text
	}
	return strings.TrimSpace(versionEl.AttrOr("data-version", ""))
}
// extractClasses collects the classes listed on a module page.
//
// Each matched table row or link contributes one Class; entries without a name
// are skipped. Relative links are resolved against docURL, and QualifiedName
// is filled in only when the raw href contains "/api/". moduleName is
// currently unused.
func (p *Parser) extractClasses(doc *goquery.Document, moduleName string, docURL string) []*Class {
	var classes []*Class
	doc.Find("table.table tbody tr, .api-list a, .class-link").Each(func(_ int, s *goquery.Selection) {
		// Use only the first anchor so the name and the href come from the same
		// element; Text on a multi-element selection concatenates all of them.
		link := s.Find("a").First()
		if link.Length() == 0 {
			// The matched element is itself the link.
			link = s
		}

		name := strings.TrimSpace(link.Text())
		if name == "" {
			return
		}

		class := &Class{}
		class.Name = name
		if href, exists := link.Attr("href"); exists {
			class.DocURL = resolveURL(docURL, href)
			if strings.Contains(href, "/api/") {
				class.QualifiedName = extractQualifiedName(href)
			}
		}
		// First avoids duplicated text when a .description element is nested
		// inside the last table cell.
		class.Doc = strings.TrimSpace(s.Find(".description, td:last-child").First().Text())

		classes = append(classes, class)
	})
	return classes
}
// extractProperties collects configuration properties from the page.
//
// Properties are read from dedicated property elements, rows of
// table.properties, and <dt> entries of .config-props lists; entries without a
// name are skipped. docURL is currently unused.
func (p *Parser) extractProperties(doc *goquery.Document, docURL string) []*Property {
	var properties []*Property
	doc.Find(".configuration-property, table.properties tbody tr, .config-props dt").Each(func(_ int, s *goquery.Selection) {
		isTerm := s.Is("dt")

		name := strings.TrimSpace(s.Find(".property-name, code, strong, td:first-child").First().Text())
		if name == "" && isTerm {
			// A plain-text <dt> has no marked-up child to match; use its own text.
			name = strings.TrimSpace(s.Text())
		}
		if name == "" {
			return
		}

		prop := &Property{}
		prop.Name = name
		// First keeps nested matches (e.g. .description inside the last cell)
		// from being concatenated by Text.
		prop.Type = strings.TrimSpace(s.Find(".property-type, .type").First().Text())
		prop.Default = strings.TrimSpace(s.Find(".default-value, .default").First().Text())
		prop.Doc = strings.TrimSpace(s.Find(".description, dd, td:last-child").First().Text())
		if prop.Doc == "" && isTerm {
			// In a definition list the <dd> is the sibling following the <dt>,
			// not a descendant, so Find cannot reach it.
			prop.Doc = strings.TrimSpace(s.NextFiltered("dd").Text())
		}

		properties = append(properties, prop)
	})
	return properties
}
// extractGuides collects the guides and tutorials linked from the page.
//
// Each matched element contributes one Guide; entries without a title are
// skipped. Relative links are resolved against docURL.
func (p *Parser) extractGuides(doc *goquery.Document, docURL string) []*Guide {
	var guides []*Guide
	doc.Find(".guide-link, .tutorial-link, .guide-card").Each(func(_ int, s *goquery.Selection) {
		// A card may hold several anchors; use only the first so the title is
		// not a concatenation of all link texts and matches the href read below.
		link := s.Find("a").First()
		if link.Length() == 0 {
			// The matched element is itself the link.
			link = s
		}

		title := strings.TrimSpace(link.Text())
		if title == "" {
			return
		}

		guide := &Guide{}
		guide.Title = title
		if href, exists := link.Attr("href"); exists {
			guide.DocURL = resolveURL(docURL, href)
		}
		guide.Description = strings.TrimSpace(s.Find(".summary, .description").First().Text())

		guides = append(guides, guide)
	})
	return guides
}
// ParseClassPage parses the HTML of a class documentation page into a Class.
//
// The class name is the trimmed text of the first heading/title element and is
// also used as the qualified name. Methods, fields and constructors are
// extracted from the same document. An error is returned only when the HTML
// cannot be parsed.
func (p *Parser) ParseClassPage(html string, docURL string) (*Class, error) {
	page, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	name := strings.TrimSpace(page.Find("h1, .title, .class-name").First().Text())
	description := page.Find(".class-description, .javadoc, .class-comment").First().Text()

	parsed := &Class{
		DocURL:        docURL,
		Name:          name,
		QualifiedName: name,
		Doc:           strings.TrimSpace(description),
		Methods:       p.extractClassMethods(page, name, docURL),
		Fields:        p.extractClassFields(page, name, docURL),
		Constructors:  p.extractClassConstructors(page, name, docURL),
	}
	return parsed, nil
}
// extractClassMethods collects the methods listed on a class page.
//
// The method name comes from the first anchor in the entry, falling back to
// the identifier preceding "(" in the signature. Entries without a name are
// skipped. Every returned method gets QualifiedName set to
// className + "." + name, whether or not the entry carries a link.
func (p *Parser) extractClassMethods(doc *goquery.Document, className string, docURL string) []*Method {
	var methods []*Method
	doc.Find("table.method-summary tbody tr, .method, .member").Each(func(_ int, s *goquery.Selection) {
		link := s.Find("a").First()

		// First prevents a <code> nested inside .method-signature from being
		// counted twice, since Text concatenates every matched element.
		signature := strings.TrimSpace(s.Find(".method-signature, code").First().Text())

		name := strings.TrimSpace(link.Text())
		if name == "" {
			name = extractSpringMethodName(signature)
		}
		if name == "" {
			return
		}

		method := &Method{
			IsConstructor: false,
		}
		method.Name = name
		method.QualifiedName = className + "." + name
		method.Signature = signature
		method.Doc = strings.TrimSpace(s.Find(".method-description, td:last-child, dd").First().Text())

		if href, exists := link.Attr("href"); exists {
			if strings.HasPrefix(href, "#") {
				// Same-page anchor: append the fragment to the page URL.
				method.DocURL = docURL + href
			} else {
				method.DocURL = resolveURL(docURL, href)
			}
		}

		methods = append(methods, method)
	})
	return methods
}
// extractClassFields collects the fields listed on a class page.
//
// Entries without a name are skipped. className and docURL are currently
// unused.
func (p *Parser) extractClassFields(doc *goquery.Document, className string, docURL string) []*Field {
	var fields []*Field
	doc.Find("table.field-summary tbody tr, .field").Each(func(_ int, s *goquery.Selection) {
		// Each lookup takes only the first match. The name selector, for
		// example, matches both a first cell and the anchor inside it, and Text
		// on the combined selection would repeat the name.
		name := strings.TrimSpace(s.Find(".field-name, a, td:first-child").First().Text())
		if name == "" {
			return
		}

		field := &Field{}
		field.Name = name
		field.Type = strings.TrimSpace(s.Find(".field-type, td:nth-child(2)").First().Text())
		field.Doc = strings.TrimSpace(s.Find(".field-description, td:last-child").First().Text())

		fields = append(fields, field)
	})
	return fields
}
// extractClassConstructors collects the constructors listed on a class page.
//
// Every matched entry yields a Method with IsConstructor set and Name equal to
// className, even when no signature text is found. docURL is currently unused.
func (p *Parser) extractClassConstructors(doc *goquery.Document, className string, docURL string) []*Method {
	var constructors []*Method
	doc.Find("table.constructor-summary tbody tr, .constructor").Each(func(_ int, s *goquery.Selection) {
		ctor := &Method{
			IsConstructor: true,
			Name:          className,
		}
		// First prevents nested matches (a <code> inside .constructor-signature,
		// a description inside the last cell) from being concatenated by Text.
		ctor.Signature = strings.TrimSpace(s.Find(".constructor-signature, code").First().Text())
		ctor.Doc = strings.TrimSpace(s.Find(".constructor-description, td:last-child").First().Text())
		constructors = append(constructors, ctor)
	})
	return constructors
}
// extractSpringMethodName returns the identifier that immediately precedes the
// first "(" in a method signature, i.e. the last whitespace-separated token
// before the parameter list. It returns the empty string when the signature
// has no "(" or nothing precedes it.
func extractSpringMethodName(sig string) string {
	trimmed := strings.TrimSpace(sig)

	paren := strings.Index(trimmed, "(")
	if paren <= 0 {
		return ""
	}

	tokens := strings.Fields(trimmed[:paren])
	if len(tokens) == 0 {
		return ""
	}
	return tokens[len(tokens)-1]
}
// extractQualifiedName returns the last "/"-separated segment of href after
// removing one trailing slash. It returns the empty string when href contains
// no "/" once that trailing slash is removed.
func extractQualifiedName(href string) string {
	trimmed := strings.TrimSuffix(href, "/")

	slash := strings.LastIndex(trimmed, "/")
	if slash < 0 {
		return ""
	}
	return trimmed[slash+1:]
}
// resolveURL resolves href against base and returns the absolute URL.
//
// An href that already starts with "http://" or "https://" is returned
// unchanged. Matching the scheme separator, rather than the bare prefix
// "http", keeps relative links such as "httpclient.html" from being mistaken
// for absolute URLs. If either base or href cannot be parsed, href is returned
// as-is.
func resolveURL(base string, href string) string {
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}

	baseURL, err := url.Parse(base)
	if err != nil {
		return href
	}
	hrefURL, err := url.Parse(href)
	if err != nil {
		return href
	}
	return baseURL.ResolveReference(hrefURL).String()
}