// Package javadocs extracts package, class and member metadata from
// Javadoc-style HTML pages using goquery CSS selectors.
package javadocs

import (
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/yourorg/devour/pkg/parserutil"
)

// Parser turns documentation HTML into the package's model types.
type Parser struct {
	// baseURL is the base against which search-result links are resolved.
	baseURL string
}

// NewParser returns a Parser whose base URL is https://docs.oracle.com.
func NewParser() *Parser {
	return &Parser{
		baseURL: "https://docs.oracle.com",
	}
}

// ParsePackagePage parses the HTML of a package overview page.
//
// docURL is stored on the result and is used as the base for resolving the
// relative links found in the summary tables. The returned Package carries
// the package name, its description, and the classes, interfaces, enums and
// exceptions listed on the page. An error is returned only when the HTML
// cannot be read into a document.
func (p *Parser) ParsePackagePage(html string, docURL string) (*Package, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	pkg := &Package{
		DocURL:    docURL,
		FetchedAt: time.Now(),
	}
	pkg.Name = p.extractPackageName(doc)
	pkg.Doc = p.extractPackageDoc(doc)
	pkg.Classes = p.extractClasses(doc, pkg.Name, docURL)
	pkg.Interfaces = p.extractInterfaces(doc, pkg.Name, docURL)
	pkg.Enums = p.extractEnums(doc, pkg.Name, docURL)
	pkg.Exceptions = p.extractExceptions(doc, pkg.Name, docURL)
	return pkg, nil
}

// ParseSearchResults parses a search result page and returns one
// SearchResult per ".result" element, in document order. Links are resolved
// against the parser's base URL. All text fields are whitespace-trimmed.
func (p *Parser) ParseSearchResults(html string) ([]*SearchResult, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	var results []*SearchResult
	doc.Find(".result").Each(func(_ int, s *goquery.Selection) {
		link := s.Find("a").First()
		result := &SearchResult{
			Name: strings.TrimSpace(link.Text()),
			// Kind, QualName and Package are trimmed like Name and Doc so
			// that markup indentation does not leak into the values.
			Kind:     strings.TrimSpace(s.Find(".result-kind").Text()),
			QualName: strings.TrimSpace(s.Find(".qualified-name").Text()),
			Package:  strings.TrimSpace(s.Find(".package").Text()),
			Doc:      strings.TrimSpace(s.Find(".description").Text()),
		}
		if href, ok := link.Attr("href"); ok {
			result.DocURL = resolveURL(p.baseURL, href)
		}
		results = append(results, result)
	})
	return results, nil
}

// extractPackageName reads the page title (first "h1" or ".title" element)
// and returns the word that follows "Package". When the title has no such
// word pair, the whole trimmed title is returned, which may be empty.
func (p *Parser) extractPackageName(doc *goquery.Document) string {
	title := strings.TrimSpace(doc.Find("h1, .title").First().Text())
	parts := strings.Fields(title)
	for i, part := range parts {
		if part == "Package" && i+1 < len(parts) {
			return parts[i+1]
		}
	}
	return title
}

// extractPackageDoc returns the trimmed text of the first description-like
// element on the page.
func (p *Parser) extractPackageDoc(doc *goquery.Document) string {
	return firstText(doc.Selection, ".block, .description, #package-description")
}

// summaryRow holds the values shared by every kind of summary-table entry.
type summaryRow struct {
	name   string // entry name; empty when the row has no usable name
	docURL string // resolved link target; empty when the row has no link
	doc    string // summary text of the entry
}

// readSummaryRow extracts the name, link and summary text from one row of a
// summary listing. The name comes from the first link in the row; when that
// is empty, the first element matching nameFallback is used instead. The
// link is resolved relative to docURL.
func readSummaryRow(s *goquery.Selection, nameFallback string, docURL string) summaryRow {
	link := s.Find("a").First()
	row := summaryRow{name: strings.TrimSpace(link.Text())}
	if row.name == "" {
		row.name = firstText(s, nameFallback)
	}
	if href, ok := link.Attr("href"); ok {
		row.docURL = resolveURL(docURL, href)
	}
	row.doc = firstText(s, ".member-summary, td:last-child")
	return row
}

// firstText returns the trimmed text of the first element under s that
// matches selector, or "" when nothing matches.
//
// Only the first match is read: Text() on a multi-element selection
// concatenates the text of every match, so a union selector whose matches
// are nested inside each other would otherwise yield duplicated text.
func firstText(s *goquery.Selection, selector string) string {
	return strings.TrimSpace(s.Find(selector).First().Text())
}

// qualifiedName joins prefix and name with a dot. An empty prefix yields the
// bare name rather than a name with a leading dot.
func qualifiedName(prefix string, name string) string {
	if prefix == "" {
		return name
	}
	return prefix + "." + name
}

// extractClasses collects the entries of the class summary listings. Rows
// without a name are skipped, as are rows whose name contains "interface".
func (p *Parser) extractClasses(doc *goquery.Document, pkgName string, docURL string) []*Class {
	var classes []*Class
	doc.Find("table.type-summary tr, .class-summary .member, section.class tbody tr").Each(func(_ int, s *goquery.Selection) {
		row := readSummaryRow(s, ".member-name, td:first-child", docURL)
		if row.name == "" || strings.Contains(row.name, "interface") {
			return
		}
		classes = append(classes, &Class{
			Package:       pkgName,
			Kind:          ClassKindClass,
			Name:          row.name,
			QualifiedName: qualifiedName(pkgName, row.name),
			DocURL:        row.docURL,
			Doc:           row.doc,
		})
	})
	return classes
}

// extractInterfaces collects the entries of the interface summary listings.
// Rows without a name are skipped.
func (p *Parser) extractInterfaces(doc *goquery.Document, pkgName string, docURL string) []*Class {
	var interfaces []*Class
	doc.Find("table.interface-summary tr, .interface-summary .member").Each(func(_ int, s *goquery.Selection) {
		row := readSummaryRow(s, ".member-name", docURL)
		if row.name == "" {
			return
		}
		interfaces = append(interfaces, &Class{
			Package:       pkgName,
			Kind:          ClassKindInterface,
			Name:          row.name,
			QualifiedName: qualifiedName(pkgName, row.name),
			DocURL:        row.docURL,
			Doc:           row.doc,
		})
	})
	return interfaces
}

// extractEnums collects the entries of the enum summary listings. Rows
// without a name are skipped.
func (p *Parser) extractEnums(doc *goquery.Document, pkgName string, docURL string) []*Enum {
	var enums []*Enum
	doc.Find("table.enum-summary tr, .enum-summary .member").Each(func(_ int, s *goquery.Selection) {
		row := readSummaryRow(s, ".member-name", docURL)
		if row.name == "" {
			return
		}
		enums = append(enums, &Enum{
			Package:       pkgName,
			Name:          row.name,
			QualifiedName: qualifiedName(pkgName, row.name),
			DocURL:        row.docURL,
			Doc:           row.doc,
		})
	})
	return enums
}

// extractExceptions collects the entries of the exception summary listings.
// Exceptions are reported as Class values of kind ClassKindClass. Rows
// without a name are skipped.
func (p *Parser) extractExceptions(doc *goquery.Document, pkgName string, docURL string) []*Class {
	var exceptions []*Class
	doc.Find("table.exception-summary tr, .exception-summary .member").Each(func(_ int, s *goquery.Selection) {
		row := readSummaryRow(s, ".member-name", docURL)
		if row.name == "" {
			return
		}
		exceptions = append(exceptions, &Class{
			Package:       pkgName,
			Kind:          ClassKindClass,
			Name:          row.name,
			QualifiedName: qualifiedName(pkgName, row.name),
			DocURL:        row.docURL,
			Doc:           row.doc,
		})
	})
	return exceptions
}

// ParseClassPage parses the HTML of a single class page.
//
// The header text becomes the qualified name; when it contains a dot, the
// part before the last dot becomes the package and the rest the simple
// name. Methods, fields and constructors are extracted from the page's
// summary listings. An error is returned only when the HTML cannot be read
// into a document.
func (p *Parser) ParseClassPage(html string, docURL string) (*Class, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	class := &Class{
		DocURL: docURL,
	}

	// NOTE(review): the header text is used verbatim, so a header carrying
	// extra words (for example a "Class " prefix) ends up in the name —
	// confirm against the pages this parser is fed.
	header := doc.Find(".header, h1, .class-name").First()
	class.Name = strings.TrimSpace(header.Text())
	class.QualifiedName = class.Name
	if idx := strings.LastIndex(class.Name, "."); idx > 0 {
		class.Package = class.Name[:idx]
		class.Name = class.Name[idx+1:]
	}

	class.Doc = firstText(doc.Selection, ".block, .description, .class-description")
	class.Methods = p.extractMethods(doc, class.Name, docURL)
	class.Fields = p.extractFields(doc, class.Name, docURL)
	class.Constructors = p.extractConstructors(doc, class.Name, docURL)
	return class, nil
}

// extractMethods collects methods from the method summary and method detail
// listings. The name comes from the first link in the entry or, failing
// that, from the entry's signature text. Entries without a name are skipped.
// The doc URL prefers the entry's own id (as a fragment of docURL) over the
// link target.
func (p *Parser) extractMethods(doc *goquery.Document, className string, docURL string) []*Method {
	var methods []*Method
	doc.Find("table.method-summary tr, .method-summary .member, section.method-detail > ul > li").Each(func(_ int, s *goquery.Selection) {
		link := s.Find("a").First()
		name := strings.TrimSpace(link.Text())
		if name == "" {
			name = extractMethodName(s.Find(".member-signature, code").Text())
		}
		if name == "" {
			return
		}

		method := &Method{
			IsConstructor: false,
			Name:          name,
			QualifiedName: qualifiedName(className, name),
			// NOTE(review): Text() concatenates every match of the union
			// selector; kept as-is because the pieces of a signature may
			// live in separate elements.
			Signature: strings.TrimSpace(s.Find(".member-signature, code, .sig").Text()),
			Doc:       firstText(s, ".block, .member-summary, dd"),
		}
		if id, ok := s.Attr("id"); ok {
			method.DocURL = docURL + "#" + id
		} else if href, ok := link.Attr("href"); ok {
			method.DocURL = resolveURL(docURL, href)
		}
		methods = append(methods, method)
	})
	return methods
}

// extractFields collects fields from the field summary listings. Entries
// without a name are skipped. className is currently not used.
func (p *Parser) extractFields(doc *goquery.Document, className string, docURL string) []*Field {
	var fields []*Field
	doc.Find("table.field-summary tr, .field-summary .member").Each(func(_ int, s *goquery.Selection) {
		name := strings.TrimSpace(s.Find("a").First().Text())
		if name == "" {
			name = firstText(s, ".member-name, td:first-child")
		}
		if name == "" {
			return
		}

		field := &Field{
			Name: name,
			Type: firstText(s, ".member-type, td:nth-child(2)"),
			Doc:  firstText(s, ".member-summary, td:last-child"),
		}
		if id, ok := s.Attr("id"); ok {
			field.DocURL = docURL + "#" + id
		}
		fields = append(fields, field)
	})
	return fields
}

// extractConstructors collects constructors from the constructor summary
// listings. A constructor is named after its link text, falling back to
// className. Rows from which no link text, signature or description can be
// read (such as rows holding only column headings) are skipped so they do
// not show up as empty constructors.
func (p *Parser) extractConstructors(doc *goquery.Document, className string, docURL string) []*Method {
	var constructors []*Method
	doc.Find("table.constructor-summary tr, .constructor-summary .member").Each(func(_ int, s *goquery.Selection) {
		linkName := strings.TrimSpace(s.Find("a").First().Text())
		signature := strings.TrimSpace(s.Find(".member-signature, code").Text())
		summary := firstText(s, ".block, .member-summary, td:last-child")
		if linkName == "" && signature == "" && summary == "" {
			return
		}

		ctor := &Method{
			IsConstructor: true,
			Name:          className,
			Signature:     signature,
			Doc:           summary,
		}
		if linkName != "" {
			ctor.Name = linkName
		}
		if id, ok := s.Attr("id"); ok {
			ctor.DocURL = docURL + "#" + id
		}
		constructors = append(constructors, ctor)
	})
	return constructors
}

// extractMethodName returns the last whitespace-separated word that precedes
// the first "(" in sig, or "" when sig has no "(" or nothing before it.
func extractMethodName(sig string) string {
	sig = strings.TrimSpace(sig)
	idx := strings.Index(sig, "(")
	if idx <= 0 {
		return ""
	}
	parts := strings.Fields(sig[:idx])
	if len(parts) == 0 {
		return ""
	}
	return parts[len(parts)-1]
}

// resolveURL resolves href against base by delegating to
// parserutil.ResolveURL.
func resolveURL(base string, href string) string {
	return parserutil.ResolveURL(base, href)
}