mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
first commit
This commit is contained in:
@@ -0,0 +1,369 @@
|
||||
package javadocs
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// Parser extracts structured Java documentation from javadoc-style HTML.
type Parser struct {
	// baseURL is the URL that relative search-result links are resolved
	// against.
	baseURL string
}
|
||||
|
||||
func NewParser() *Parser {
|
||||
return &Parser{
|
||||
baseURL: "https://docs.oracle.com",
|
||||
}
|
||||
}
|
||||
|
||||
func (p *Parser) ParsePackagePage(html string, docURL string) (*Package, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pkg := &Package{
|
||||
DocURL: docURL,
|
||||
FetchedAt: time.Now(),
|
||||
}
|
||||
|
||||
pkg.Name = p.extractPackageName(doc)
|
||||
pkg.Doc = p.extractPackageDoc(doc)
|
||||
pkg.Classes = p.extractClasses(doc, pkg.Name, docURL)
|
||||
pkg.Interfaces = p.extractInterfaces(doc, pkg.Name, docURL)
|
||||
pkg.Enums = p.extractEnums(doc, pkg.Name, docURL)
|
||||
pkg.Exceptions = p.extractExceptions(doc, pkg.Name, docURL)
|
||||
|
||||
return pkg, nil
|
||||
}
|
||||
|
||||
func (p *Parser) ParseSearchResults(html string) ([]*SearchResult, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var results []*SearchResult
|
||||
|
||||
doc.Find(".result").Each(func(i int, s *goquery.Selection) {
|
||||
result := &SearchResult{}
|
||||
|
||||
link := s.Find("a").First()
|
||||
result.Name = strings.TrimSpace(link.Text())
|
||||
|
||||
if href, exists := link.Attr("href"); exists {
|
||||
result.DocURL = resolveURL(p.baseURL, href)
|
||||
}
|
||||
|
||||
result.Kind = s.Find(".result-kind").Text()
|
||||
result.QualName = s.Find(".qualified-name").Text()
|
||||
result.Package = s.Find(".package").Text()
|
||||
result.Doc = strings.TrimSpace(s.Find(".description").Text())
|
||||
|
||||
results = append(results, result)
|
||||
})
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
func (p *Parser) extractPackageName(doc *goquery.Document) string {
|
||||
title := doc.Find("h1, .title").First().Text()
|
||||
title = strings.TrimSpace(title)
|
||||
|
||||
if strings.Contains(title, "Package") {
|
||||
parts := strings.Fields(title)
|
||||
for i, part := range parts {
|
||||
if part == "Package" && i+1 < len(parts) {
|
||||
return parts[i+1]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if title != "" {
|
||||
return title
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
func (p *Parser) extractPackageDoc(doc *goquery.Document) string {
|
||||
docblock := doc.Find(".block, .description, #package-description").First()
|
||||
return strings.TrimSpace(docblock.Text())
|
||||
}
|
||||
|
||||
func (p *Parser) extractClasses(doc *goquery.Document, pkgName string, docURL string) []*Class {
|
||||
var classes []*Class
|
||||
|
||||
doc.Find("table.type-summary tr, .class-summary .member, section.class tbody tr").Each(func(_ int, s *goquery.Selection) {
|
||||
class := &Class{
|
||||
Package: pkgName,
|
||||
Kind: ClassKindClass,
|
||||
}
|
||||
|
||||
link := s.Find("a").First()
|
||||
class.Name = strings.TrimSpace(link.Text())
|
||||
|
||||
if class.Name == "" {
|
||||
class.Name = strings.TrimSpace(s.Find(".member-name, td:first-child").Text())
|
||||
}
|
||||
|
||||
if href, exists := link.Attr("href"); exists {
|
||||
class.DocURL = resolveURL(docURL, href)
|
||||
class.QualifiedName = pkgName + "." + class.Name
|
||||
}
|
||||
|
||||
class.Doc = strings.TrimSpace(s.Find(".member-summary, td:last-child").Text())
|
||||
|
||||
if class.Name != "" && !strings.Contains(class.Name, "interface") {
|
||||
classes = append(classes, class)
|
||||
}
|
||||
})
|
||||
|
||||
return classes
|
||||
}
|
||||
|
||||
func (p *Parser) extractInterfaces(doc *goquery.Document, pkgName string, docURL string) []*Class {
|
||||
var interfaces []*Class
|
||||
|
||||
doc.Find("table.interface-summary tr, .interface-summary .member").Each(func(_ int, s *goquery.Selection) {
|
||||
iface := &Class{
|
||||
Package: pkgName,
|
||||
Kind: ClassKindInterface,
|
||||
}
|
||||
|
||||
link := s.Find("a").First()
|
||||
iface.Name = strings.TrimSpace(link.Text())
|
||||
|
||||
if iface.Name == "" {
|
||||
iface.Name = strings.TrimSpace(s.Find(".member-name").Text())
|
||||
}
|
||||
|
||||
if href, exists := link.Attr("href"); exists {
|
||||
iface.DocURL = resolveURL(docURL, href)
|
||||
iface.QualifiedName = pkgName + "." + iface.Name
|
||||
}
|
||||
|
||||
iface.Doc = strings.TrimSpace(s.Find(".member-summary, td:last-child").Text())
|
||||
|
||||
if iface.Name != "" {
|
||||
interfaces = append(interfaces, iface)
|
||||
}
|
||||
})
|
||||
|
||||
return interfaces
|
||||
}
|
||||
|
||||
func (p *Parser) extractEnums(doc *goquery.Document, pkgName string, docURL string) []*Enum {
|
||||
var enums []*Enum
|
||||
|
||||
doc.Find("table.enum-summary tr, .enum-summary .member").Each(func(_ int, s *goquery.Selection) {
|
||||
enum := &Enum{
|
||||
Package: pkgName,
|
||||
}
|
||||
|
||||
link := s.Find("a").First()
|
||||
enum.Name = strings.TrimSpace(link.Text())
|
||||
|
||||
if enum.Name == "" {
|
||||
enum.Name = strings.TrimSpace(s.Find(".member-name").Text())
|
||||
}
|
||||
|
||||
if href, exists := link.Attr("href"); exists {
|
||||
enum.DocURL = resolveURL(docURL, href)
|
||||
enum.QualifiedName = pkgName + "." + enum.Name
|
||||
}
|
||||
|
||||
enum.Doc = strings.TrimSpace(s.Find(".member-summary, td:last-child").Text())
|
||||
|
||||
if enum.Name != "" {
|
||||
enums = append(enums, enum)
|
||||
}
|
||||
})
|
||||
|
||||
return enums
|
||||
}
|
||||
|
||||
func (p *Parser) extractExceptions(doc *goquery.Document, pkgName string, docURL string) []*Class {
|
||||
var exceptions []*Class
|
||||
|
||||
doc.Find("table.exception-summary tr, .exception-summary .member").Each(func(_ int, s *goquery.Selection) {
|
||||
exc := &Class{
|
||||
Package: pkgName,
|
||||
Kind: ClassKindClass,
|
||||
}
|
||||
|
||||
link := s.Find("a").First()
|
||||
exc.Name = strings.TrimSpace(link.Text())
|
||||
|
||||
if exc.Name == "" {
|
||||
exc.Name = strings.TrimSpace(s.Find(".member-name").Text())
|
||||
}
|
||||
|
||||
if href, exists := link.Attr("href"); exists {
|
||||
exc.DocURL = resolveURL(docURL, href)
|
||||
exc.QualifiedName = pkgName + "." + exc.Name
|
||||
}
|
||||
|
||||
exc.Doc = strings.TrimSpace(s.Find(".member-summary, td:last-child").Text())
|
||||
|
||||
if exc.Name != "" {
|
||||
exceptions = append(exceptions, exc)
|
||||
}
|
||||
})
|
||||
|
||||
return exceptions
|
||||
}
|
||||
|
||||
func (p *Parser) ParseClassPage(html string, docURL string) (*Class, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
class := &Class{
|
||||
DocURL: docURL,
|
||||
}
|
||||
|
||||
header := doc.Find(".header, h1, .class-name").First()
|
||||
class.Name = strings.TrimSpace(header.Text())
|
||||
|
||||
class.QualifiedName = class.Name
|
||||
if idx := strings.LastIndex(class.Name, "."); idx > 0 {
|
||||
class.Package = class.Name[:idx]
|
||||
class.Name = class.Name[idx+1:]
|
||||
}
|
||||
|
||||
class.Doc = strings.TrimSpace(doc.Find(".block, .description, .class-description").First().Text())
|
||||
|
||||
class.Methods = p.extractMethods(doc, class.Name, docURL)
|
||||
class.Fields = p.extractFields(doc, class.Name, docURL)
|
||||
class.Constructors = p.extractConstructors(doc, class.Name, docURL)
|
||||
|
||||
return class, nil
|
||||
}
|
||||
|
||||
func (p *Parser) extractMethods(doc *goquery.Document, className string, docURL string) []*Method {
|
||||
var methods []*Method
|
||||
|
||||
doc.Find("table.method-summary tr, .method-summary .member, section.method-detail > ul > li").Each(func(_ int, s *goquery.Selection) {
|
||||
method := &Method{
|
||||
IsConstructor: false,
|
||||
}
|
||||
|
||||
link := s.Find("a").First()
|
||||
method.Name = strings.TrimSpace(link.Text())
|
||||
|
||||
if method.Name == "" {
|
||||
sig := s.Find(".member-signature, code").Text()
|
||||
method.Name = extractMethodName(sig)
|
||||
}
|
||||
|
||||
sigEl := s.Find(".member-signature, code, .sig")
|
||||
method.Signature = strings.TrimSpace(sigEl.Text())
|
||||
|
||||
if id, exists := s.Attr("id"); exists {
|
||||
method.DocURL = docURL + "#" + id
|
||||
method.QualifiedName = className + "." + method.Name
|
||||
} else if href, exists := link.Attr("href"); exists {
|
||||
method.DocURL = resolveURL(docURL, href)
|
||||
method.QualifiedName = className + "." + method.Name
|
||||
}
|
||||
|
||||
method.Doc = strings.TrimSpace(s.Find(".block, .member-summary, dd").First().Text())
|
||||
|
||||
if method.Name != "" {
|
||||
methods = append(methods, method)
|
||||
}
|
||||
})
|
||||
|
||||
return methods
|
||||
}
|
||||
|
||||
func (p *Parser) extractFields(doc *goquery.Document, className string, docURL string) []*Field {
|
||||
var fields []*Field
|
||||
|
||||
doc.Find("table.field-summary tr, .field-summary .member").Each(func(_ int, s *goquery.Selection) {
|
||||
field := &Field{}
|
||||
|
||||
link := s.Find("a").First()
|
||||
field.Name = strings.TrimSpace(link.Text())
|
||||
|
||||
if field.Name == "" {
|
||||
field.Name = strings.TrimSpace(s.Find(".member-name, td:first-child").Text())
|
||||
}
|
||||
|
||||
field.Type = strings.TrimSpace(s.Find(".member-type, td:nth-child(2)").Text())
|
||||
field.Doc = strings.TrimSpace(s.Find(".member-summary, td:last-child").Text())
|
||||
|
||||
if id, exists := s.Attr("id"); exists {
|
||||
field.DocURL = docURL + "#" + id
|
||||
}
|
||||
|
||||
if field.Name != "" {
|
||||
fields = append(fields, field)
|
||||
}
|
||||
})
|
||||
|
||||
return fields
|
||||
}
|
||||
|
||||
func (p *Parser) extractConstructors(doc *goquery.Document, className string, docURL string) []*Method {
|
||||
var constructors []*Method
|
||||
|
||||
doc.Find("table.constructor-summary tr, .constructor-summary .member").Each(func(_ int, s *goquery.Selection) {
|
||||
ctor := &Method{
|
||||
IsConstructor: true,
|
||||
Name: className,
|
||||
}
|
||||
|
||||
link := s.Find("a").First()
|
||||
if name := strings.TrimSpace(link.Text()); name != "" {
|
||||
ctor.Name = name
|
||||
}
|
||||
|
||||
sigEl := s.Find(".member-signature, code")
|
||||
ctor.Signature = strings.TrimSpace(sigEl.Text())
|
||||
|
||||
ctor.Doc = strings.TrimSpace(s.Find(".block, .member-summary, td:last-child").Text())
|
||||
|
||||
if id, exists := s.Attr("id"); exists {
|
||||
ctor.DocURL = docURL + "#" + id
|
||||
}
|
||||
|
||||
constructors = append(constructors, ctor)
|
||||
})
|
||||
|
||||
return constructors
|
||||
}
|
||||
|
||||
// extractMethodName returns the method name from a Java method signature:
// the last whitespace-separated token before the parameter list's opening
// parenthesis.
//
// A parenthesis that belongs to an annotation (the token before it starts
// with "@", as in `@Deprecated(since="9") void finalize()`) is skipped so the
// annotation is not mistaken for the method. The empty string is returned
// when sig has no usable "(" or nothing precedes it.
func extractMethodName(sig string) string {
	rest := strings.TrimSpace(sig)

	for {
		open := strings.Index(rest, "(")
		if open <= 0 {
			return ""
		}

		tokens := strings.Fields(rest[:open])
		if len(tokens) == 0 {
			return ""
		}

		name := tokens[len(tokens)-1]
		if !strings.HasPrefix(name, "@") {
			return name
		}

		// This "(" opens an annotation's arguments: continue after the
		// next ")" to look for the real parameter list.
		end := strings.Index(rest[open:], ")")
		if end < 0 {
			return ""
		}
		rest = rest[open+end+1:]
	}
}
|
||||
|
||||
// resolveURL turns href into an absolute URL using base.
//
// An href that already starts with "http://" or "https://" is returned
// unchanged. Anything else is resolved against base with
// url.URL.ResolveReference. When base or href cannot be parsed, href is
// returned as is.
func resolveURL(base string, href string) string {
	// Match the full scheme prefix: a bare "http" prefix would also catch
	// relative links such as "http/package-summary.html".
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}

	baseURL, err := url.Parse(base)
	if err != nil {
		return href
	}

	ref, err := url.Parse(href)
	if err != nil {
		return href
	}

	return baseURL.ResolveReference(ref).String()
}
|
||||
@@ -0,0 +1,115 @@
|
||||
package javadocs
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// testPackagePageHTML is a minimal package summary page with a class table
// (two rows), an interface table (one row) and an exception table (one row).
const testPackagePageHTML = `
<!DOCTYPE html>
<html>
<body>
<h1>Package java.util</h1>
<div class="block">Contains the collections framework, legacy collection classes, event model, date and time facilities.</div>

<table class="type-summary">
<tr>
<td><a href="ArrayList.html">ArrayList</a></td>
<td>Resizable-array implementation of the List interface.</td>
</tr>
<tr>
<td><a href="HashMap.html">HashMap</a></td>
<td>Hash table based implementation of the Map interface.</td>
</tr>
</table>

<table class="interface-summary">
<tr>
<td><a href="List.html">List</a></td>
<td>An ordered collection (also known as a sequence).</td>
</tr>
</table>

<table class="exception-summary">
<tr>
<td><a href="ConcurrentModificationException.html">ConcurrentModificationException</a></td>
<td>This exception may be thrown by methods that detect concurrent modification.</td>
</tr>
</table>
</body>
</html>
`
|
||||
|
||||
func TestParsePackagePage(t *testing.T) {
|
||||
parser := NewParser()
|
||||
pkg, err := parser.ParsePackagePage(testPackagePageHTML, "https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/util/package-summary.html")
|
||||
if err != nil {
|
||||
t.Fatalf("ParsePackagePage failed: %v", err)
|
||||
}
|
||||
|
||||
if pkg.Name == "" {
|
||||
t.Error("Expected non-empty package name")
|
||||
}
|
||||
|
||||
if pkg.Doc == "" {
|
||||
t.Error("Expected non-empty doc")
|
||||
}
|
||||
|
||||
if len(pkg.Classes) == 0 {
|
||||
t.Error("Expected at least one class")
|
||||
}
|
||||
|
||||
if len(pkg.Interfaces) == 0 {
|
||||
t.Error("Expected at least one interface")
|
||||
}
|
||||
|
||||
if len(pkg.Exceptions) == 0 {
|
||||
t.Error("Expected at least one exception")
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractClasses(t *testing.T) {
|
||||
parser := NewParser()
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(testPackagePageHTML))
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to parse HTML: %v", err)
|
||||
}
|
||||
|
||||
classes := parser.extractClasses(doc, "java.util", "https://docs.oracle.com/test")
|
||||
|
||||
if len(classes) == 0 {
|
||||
t.Fatal("Expected at least one class")
|
||||
}
|
||||
|
||||
first := classes[0]
|
||||
if first.Name == "" {
|
||||
t.Error("Expected non-empty class name")
|
||||
}
|
||||
|
||||
if first.Package != "java.util" {
|
||||
t.Errorf("Expected package 'java.util', got %q", first.Package)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveURL(t *testing.T) {
|
||||
tests := []struct {
|
||||
base string
|
||||
href string
|
||||
expected string
|
||||
}{
|
||||
{"https://docs.oracle.com", "/api/ArrayList.html", "https://docs.oracle.com/api/ArrayList.html"},
|
||||
{"https://docs.oracle.com", "https://example.com/page", "https://example.com/page"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.href, func(t *testing.T) {
|
||||
got := resolveURL(tt.base, tt.href)
|
||||
if got != tt.expected {
|
||||
t.Errorf("resolveURL(%q, %q) = %q, want %q", tt.base, tt.href, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,116 @@
|
||||
// Package javadocs provides parsing and extraction for Java documentation
|
||||
// from docs.oracle.com and javadoc-generated sites.
|
||||
package javadocs
|
||||
|
||||
import "time"
|
||||
|
||||
// Package represents a Java package's documentation.
|
||||
type Package struct {
|
||||
Name string `json:"name"`
|
||||
Doc string `json:"doc,omitempty"`
|
||||
Version string `json:"version,omitempty"`
|
||||
DocURL string `json:"doc_url"`
|
||||
Classes []*Class `json:"classes,omitempty"`
|
||||
Interfaces []*Class `json:"interfaces,omitempty"`
|
||||
Enums []*Enum `json:"enums,omitempty"`
|
||||
Exceptions []*Class `json:"exceptions,omitempty"`
|
||||
Annotations []*Class `json:"annotations,omitempty"`
|
||||
FetchedAt time.Time `json:"fetched_at"`
|
||||
}
|
||||
|
||||
// Class represents a Java class or interface.
|
||||
type Class struct {
|
||||
QualifiedName string `json:"qualified_name"`
|
||||
Name string `json:"name"`
|
||||
Package string `json:"package"`
|
||||
Kind ClassKind `json:"kind"`
|
||||
Doc string `json:"doc,omitempty"`
|
||||
Modifiers []string `json:"modifiers,omitempty"`
|
||||
SuperClass string `json:"super_class,omitempty"`
|
||||
Interfaces []string `json:"interfaces,omitempty"`
|
||||
Fields []*Field `json:"fields,omitempty"`
|
||||
Methods []*Method `json:"methods,omitempty"`
|
||||
Constructors []*Method `json:"constructors,omitempty"`
|
||||
NestedClasses []*Class `json:"nested_classes,omitempty"`
|
||||
DocURL string `json:"doc_url"`
|
||||
Since string `json:"since,omitempty"`
|
||||
Deprecated string `json:"deprecated,omitempty"`
|
||||
}
|
||||
|
||||
// ClassKind represents the kind of a Class. Its string value is what appears
// in the "kind" JSON field.
type ClassKind string

// The kinds a Class can have.
const (
	ClassKindClass      ClassKind = "class"
	ClassKindInterface  ClassKind = "interface"
	ClassKindAnnotation ClassKind = "annotation"
	ClassKindRecord     ClassKind = "record"
	ClassKindSealed     ClassKind = "sealed"
)
|
||||
|
||||
// Enum represents a Java enum.
|
||||
type Enum struct {
|
||||
QualifiedName string `json:"qualified_name"`
|
||||
Name string `json:"name"`
|
||||
Package string `json:"package"`
|
||||
Doc string `json:"doc,omitempty"`
|
||||
Modifiers []string `json:"modifiers,omitempty"`
|
||||
Constants []*EnumConst `json:"constants,omitempty"`
|
||||
Methods []*Method `json:"methods,omitempty"`
|
||||
DocURL string `json:"doc_url"`
|
||||
Since string `json:"since,omitempty"`
|
||||
}
|
||||
|
||||
// EnumConst represents a single enum constant: its name, its documentation
// and the URL of its documentation entry.
type EnumConst struct {
	Name   string `json:"name"`
	Doc    string `json:"doc,omitempty"`
	DocURL string `json:"doc_url"`
}
|
||||
|
||||
// Field represents a field of a class: its name, type, documentation,
// modifiers and value, and the URL of its documentation entry.
type Field struct {
	Name       string   `json:"name"`
	Type       string   `json:"type"`
	Doc        string   `json:"doc,omitempty"`
	Modifiers  []string `json:"modifiers,omitempty"`
	Value      string   `json:"value,omitempty"`
	DocURL     string   `json:"doc_url"`
	Deprecated string   `json:"deprecated,omitempty"`
}
|
||||
|
||||
// Method represents a method or constructor.
|
||||
type Method struct {
|
||||
Name string `json:"name"`
|
||||
Doc string `json:"doc,omitempty"`
|
||||
Modifiers []string `json:"modifiers,omitempty"`
|
||||
ReturnType string `json:"return_type,omitempty"`
|
||||
Parameters []*Parameter `json:"parameters,omitempty"`
|
||||
Exceptions []string `json:"exceptions,omitempty"`
|
||||
Signature string `json:"signature"`
|
||||
QualifiedName string `json:"qualified_name"`
|
||||
DocURL string `json:"doc_url"`
|
||||
Since string `json:"since,omitempty"`
|
||||
Deprecated string `json:"deprecated,omitempty"`
|
||||
IsConstructor bool `json:"is_constructor"`
|
||||
IsStatic bool `json:"is_static"`
|
||||
IsDefault bool `json:"is_default"`
|
||||
}
|
||||
|
||||
// Parameter represents a single method parameter: its name, its type and
// its documentation.
type Parameter struct {
	Name string `json:"name"`
	Type string `json:"type"`
	Doc  string `json:"doc,omitempty"`
}
|
||||
|
||||
// SearchResult represents one entry of a documentation search: the matched
// item's names, kind, package and description, the URL of its documentation
// and a score.
type SearchResult struct {
	Name     string `json:"name"`
	QualName string `json:"qualified_name"`
	Kind     string `json:"kind"` // class, interface, enum, method, field
	Package  string `json:"package"`
	Doc      string `json:"doc,omitempty"`
	DocURL   string `json:"doc_url"`
	Score    int    `json:"score"`
}
|
||||
Reference in New Issue
Block a user