mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
688 lines
16 KiB
Go
688 lines
16 KiB
Go
package godocs
|
|
|
|
import (
	"net/url"
	"regexp"
	"strings"
	"time"
	"unicode"

	"github.com/PuerkitoBio/goquery"
)
|
|
|
|
// Parser turns HTML fetched from pkg.go.dev into structured documentation
// values.
type Parser struct {
	// baseURL is prefixed to site-relative hrefs when absolute URLs are built.
	baseURL string
}
|
|
|
|
// NewParser creates a new parser for pkg.go.dev content.
|
|
func NewParser() *Parser {
|
|
return &Parser{
|
|
baseURL: "https://pkg.go.dev",
|
|
}
|
|
}
|
|
|
|
// ParsePackagePage parses a pkg.go.dev package documentation page.
|
|
func (p *Parser) ParsePackagePage(html string, docURL string) (*Package, error) {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
pkg := &Package{
|
|
DocURL: docURL,
|
|
FetchedAt: time.Now(),
|
|
}
|
|
|
|
// Extract import path from URL or breadcrumb
|
|
pkg.ImportPath = p.extractImportPath(doc, docURL)
|
|
pkg.Name = p.extractPackageName(doc)
|
|
|
|
// Extract synopsis
|
|
pkg.Synopsis = p.extractSynopsis(doc)
|
|
|
|
// Extract package documentation
|
|
pkg.Doc = p.extractPackageDoc(doc)
|
|
|
|
// Extract version info
|
|
pkg.Version = p.extractVersion(doc)
|
|
|
|
// Extract module info
|
|
pkg.Module = p.extractModule(doc)
|
|
|
|
// Extract licenses
|
|
pkg.Licenses = p.extractLicenses(doc)
|
|
|
|
// Extract imported by count
|
|
pkg.ImportedBy = p.extractImportedBy(doc)
|
|
|
|
// Extract repository URL
|
|
pkg.Repository = p.extractRepository(doc)
|
|
|
|
// Extract functions
|
|
pkg.Functions = p.extractFunctions(doc)
|
|
|
|
// Extract types
|
|
pkg.Types = p.extractTypes(doc)
|
|
|
|
// Extract constants
|
|
pkg.Constants = p.extractConstants(doc)
|
|
|
|
// Extract variables
|
|
pkg.Variables = p.extractVariables(doc)
|
|
|
|
// Extract examples
|
|
pkg.Examples = p.extractPackageExamples(doc)
|
|
|
|
return pkg, nil
|
|
}
|
|
|
|
// ParseSearchResults parses pkg.go.dev search results page.
|
|
func (p *Parser) ParseSearchResults(html string) ([]*SearchResult, error) {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var results []*SearchResult
|
|
|
|
doc.Find(".SearchSnippet").Each(func(i int, s *goquery.Selection) {
|
|
result := &SearchResult{}
|
|
|
|
// Extract name and path
|
|
s.Find("h2 a").Each(func(_ int, a *goquery.Selection) {
|
|
result.Name = strings.TrimSpace(a.Text())
|
|
if href, exists := a.Attr("href"); exists {
|
|
result.URL = p.baseURL + href
|
|
result.Path = strings.TrimPrefix(href, "/")
|
|
}
|
|
})
|
|
|
|
// Extract path from span
|
|
pathSpan := s.Find(".SearchSnippet-header-path")
|
|
if pathSpan.Length() > 0 {
|
|
result.Path = strings.Trim(pathSpan.Text(), "()")
|
|
}
|
|
|
|
// Extract synopsis
|
|
synopsis := s.Find(".SearchSnippet-synopsis")
|
|
if synopsis.Length() > 0 {
|
|
result.Synopsis = strings.TrimSpace(synopsis.Text())
|
|
}
|
|
|
|
// Extract imported by count
|
|
infoLabel := s.Find(".SearchSnippet-infoLabel").Text()
|
|
if strings.Contains(infoLabel, "Imported by") {
|
|
re := regexp.MustCompile(`Imported by\s+(\d[\d,]*)`)
|
|
if matches := re.FindStringSubmatch(infoLabel); len(matches) > 1 {
|
|
countStr := strings.ReplaceAll(matches[1], ",", "")
|
|
result.ImportedBy = parseCount(countStr)
|
|
}
|
|
}
|
|
|
|
// Extract version
|
|
versionMatch := regexp.MustCompile(`v?\d+\.\d+(?:\.\d+)?`).FindString(infoLabel)
|
|
result.Version = versionMatch
|
|
|
|
// Extract license
|
|
license := s.Find("[data-test-id='snippet-license'] a")
|
|
if license.Length() > 0 {
|
|
result.License = strings.TrimSpace(license.Text())
|
|
}
|
|
|
|
results = append(results, result)
|
|
})
|
|
|
|
return results, nil
|
|
}
|
|
|
|
// extractImportPath extracts the import path from the page.
|
|
func (p *Parser) extractImportPath(doc *goquery.Document, docURL string) string {
|
|
// Try to extract from breadcrumb
|
|
var importPath string
|
|
doc.Find(".go-Breadcrumb li a").Each(func(i int, s *goquery.Selection) {
|
|
if i > 0 { // Skip "Discover Packages"
|
|
part := strings.TrimSpace(s.Text())
|
|
if part != "" {
|
|
if importPath != "" {
|
|
importPath += "/"
|
|
}
|
|
importPath += part
|
|
}
|
|
}
|
|
})
|
|
|
|
if importPath != "" {
|
|
return importPath
|
|
}
|
|
|
|
// Fallback: extract from URL
|
|
if docURL != "" {
|
|
u, err := url.Parse(docURL)
|
|
if err == nil {
|
|
path := strings.TrimPrefix(u.Path, "/")
|
|
// Remove version suffix like @v1.0.0
|
|
if idx := strings.Index(path, "@"); idx > 0 {
|
|
path = path[:idx]
|
|
}
|
|
return path
|
|
}
|
|
}
|
|
|
|
return ""
|
|
}
|
|
|
|
// extractPackageName extracts the package name.
|
|
func (p *Parser) extractPackageName(doc *goquery.Document) string {
|
|
// Try UnitHeader-title
|
|
title := doc.Find(".UnitHeader-titleHeading").Text()
|
|
title = strings.TrimSpace(title)
|
|
if title != "" {
|
|
return title
|
|
}
|
|
|
|
// Fallback to h1
|
|
title = doc.Find("h1").First().Text()
|
|
return strings.TrimSpace(title)
|
|
}
|
|
|
|
// extractSynopsis extracts the package synopsis.
|
|
func (p *Parser) extractSynopsis(doc *goquery.Document) string {
|
|
// Synopsis is typically in the first paragraph after the package declaration
|
|
docSection := doc.Find(".Documentation").First()
|
|
if docSection.Length() > 0 {
|
|
// Get the first paragraph
|
|
firstP := docSection.Find("p").First()
|
|
if firstP.Length() > 0 {
|
|
synopsis := strings.TrimSpace(firstP.Text())
|
|
// Limit to reasonable length
|
|
if len(synopsis) > 200 {
|
|
synopsis = synopsis[:197] + "..."
|
|
}
|
|
return synopsis
|
|
}
|
|
}
|
|
|
|
return ""
|
|
}
|
|
|
|
// extractPackageDoc extracts the full package documentation.
|
|
func (p *Parser) extractPackageDoc(doc *goquery.Document) string {
|
|
var parts []string
|
|
|
|
doc.Find(".Documentation").Each(func(_ int, s *goquery.Selection) {
|
|
text := s.Text()
|
|
text = cleanWhitespace(text)
|
|
if text != "" {
|
|
parts = append(parts, text)
|
|
}
|
|
})
|
|
|
|
return strings.Join(parts, "\n\n")
|
|
}
|
|
|
|
// extractVersion extracts the version info.
|
|
func (p *Parser) extractVersion(doc *goquery.Document) string {
|
|
versionEl := doc.Find("[data-test-id='UnitHeader-version'] a")
|
|
if versionEl.Length() > 0 {
|
|
return strings.TrimSpace(versionEl.Text())
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// extractModule extracts module information.
|
|
func (p *Parser) extractModule(doc *goquery.Document) *Module {
|
|
modulePath := ""
|
|
moduleVersion := ""
|
|
|
|
// Try to extract from version link
|
|
versionEl := doc.Find("[data-test-id='UnitHeader-version'] a")
|
|
if versionEl.Length() > 0 {
|
|
moduleVersion = strings.TrimSpace(versionEl.Text())
|
|
}
|
|
|
|
// Extract module path from breadcrumb
|
|
doc.Find(".go-Breadcrumb li a").Each(func(i int, s *goquery.Selection) {
|
|
text := strings.TrimSpace(s.Text())
|
|
if strings.Contains(text, "/") && i > 0 {
|
|
modulePath = text
|
|
}
|
|
})
|
|
|
|
if modulePath != "" {
|
|
return &Module{
|
|
Path: modulePath,
|
|
Version: moduleVersion,
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// extractLicenses extracts license information.
|
|
func (p *Parser) extractLicenses(doc *goquery.Document) []License {
|
|
var licenses []License
|
|
|
|
doc.Find("[data-test-id='UnitHeader-license']").Each(func(_ int, s *goquery.Selection) {
|
|
name := strings.TrimSpace(s.Text())
|
|
if name != "" {
|
|
license := License{Name: name}
|
|
if href, exists := s.Attr("href"); exists {
|
|
license.Path = href
|
|
}
|
|
licenses = append(licenses, license)
|
|
}
|
|
})
|
|
|
|
return licenses
|
|
}
|
|
|
|
// extractImportedBy extracts the import count.
|
|
func (p *Parser) extractImportedBy(doc *goquery.Document) int {
|
|
importEl := doc.Find("[data-test-id='UnitHeader-importedby'] a")
|
|
if importEl.Length() > 0 {
|
|
text := importEl.Text()
|
|
// Extract number from "Imported by: 144,729"
|
|
re := regexp.MustCompile(`[\d,]+`)
|
|
if match := re.FindString(text); match != "" {
|
|
match = strings.ReplaceAll(match, ",", "")
|
|
var count int
|
|
for _, c := range match {
|
|
if c >= '0' && c <= '9' {
|
|
count = count*10 + int(c-'0')
|
|
}
|
|
}
|
|
return count
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// extractRepository extracts the repository URL.
|
|
func (p *Parser) extractRepository(doc *goquery.Document) string {
|
|
repoEl := doc.Find(".UnitMeta-repo a")
|
|
if repoEl.Length() > 0 {
|
|
if href, exists := repoEl.Attr("href"); exists {
|
|
return href
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// extractFunctions extracts all function declarations.
|
|
func (p *Parser) extractFunctions(doc *goquery.Document) []*Function {
|
|
var functions []*Function
|
|
|
|
doc.Find(".Documentation-function").Each(func(_ int, s *goquery.Selection) {
|
|
fn := &Function{}
|
|
|
|
// Extract name from the function header
|
|
nameEl := s.Find(".Documentation-functionHeader").First()
|
|
if nameEl.Length() > 0 {
|
|
fn.Name = strings.TrimSpace(nameEl.Text())
|
|
}
|
|
|
|
// Extract signature from code block
|
|
sigEl := s.Find("pre").First()
|
|
if sigEl.Length() > 0 {
|
|
fn.Signature = strings.TrimSpace(sigEl.Text())
|
|
}
|
|
|
|
// Extract documentation
|
|
docEl := s.Find(".Documentation-functionBody p").First()
|
|
if docEl.Length() == 0 {
|
|
docEl = s.Find("p").First()
|
|
}
|
|
if docEl.Length() > 0 {
|
|
fn.Doc = strings.TrimSpace(docEl.Text())
|
|
}
|
|
|
|
// Extract examples
|
|
fn.Examples = p.extractExamples(s)
|
|
|
|
if fn.Name != "" {
|
|
functions = append(functions, fn)
|
|
}
|
|
})
|
|
|
|
return functions
|
|
}
|
|
|
|
// extractTypes extracts all type declarations.
|
|
func (p *Parser) extractTypes(doc *goquery.Document) []*Type {
|
|
var types []*Type
|
|
|
|
doc.Find(".Documentation-type").Each(func(_ int, s *goquery.Selection) {
|
|
t := &Type{}
|
|
|
|
// Extract name from the type header
|
|
nameEl := s.Find(".Documentation-typeHeader").First()
|
|
if nameEl.Length() > 0 {
|
|
t.Name = strings.TrimSpace(nameEl.Text())
|
|
}
|
|
|
|
// Determine kind from signature
|
|
sigEl := s.Find("pre").First()
|
|
if sigEl.Length() > 0 {
|
|
sig := sigEl.Text()
|
|
t.Underlying = strings.TrimSpace(sig)
|
|
|
|
if strings.Contains(sig, "struct{") {
|
|
t.Kind = TypeKindStruct
|
|
t.Fields = p.extractStructFields(sigEl)
|
|
} else if strings.Contains(sig, "interface{") {
|
|
t.Kind = TypeKindInterface
|
|
} else {
|
|
t.Kind = TypeKindAlias
|
|
}
|
|
}
|
|
|
|
// Extract documentation
|
|
docEl := s.Find("p").First()
|
|
if docEl.Length() > 0 {
|
|
t.Doc = strings.TrimSpace(docEl.Text())
|
|
}
|
|
|
|
// Extract methods
|
|
t.Methods = p.extractMethods(s)
|
|
|
|
// Extract examples
|
|
t.Examples = p.extractExamples(s)
|
|
|
|
if t.Name != "" {
|
|
types = append(types, t)
|
|
}
|
|
})
|
|
|
|
return types
|
|
}
|
|
|
|
// extractStructFields extracts struct fields from a type definition.
|
|
func (p *Parser) extractStructFields(sigEl *goquery.Selection) []*Field {
|
|
var fields []*Field
|
|
|
|
sigEl.Find("tr, .Documentation-structField").Each(func(_ int, s *goquery.Selection) {
|
|
text := s.Text()
|
|
text = strings.TrimSpace(text)
|
|
|
|
if text == "" || strings.HasPrefix(text, "//") {
|
|
return
|
|
}
|
|
|
|
// Parse field: Name Type `tag`
|
|
parts := strings.Fields(text)
|
|
if len(parts) >= 1 {
|
|
field := &Field{
|
|
Name: parts[0],
|
|
Exported: isExported(parts[0]),
|
|
}
|
|
|
|
if len(parts) >= 2 {
|
|
field.Type = strings.Join(parts[1:], " ")
|
|
// Remove tag
|
|
if idx := strings.Index(field.Type, "`"); idx > 0 {
|
|
field.Tag = field.Type[idx:]
|
|
field.Type = field.Type[:idx]
|
|
}
|
|
}
|
|
|
|
fields = append(fields, field)
|
|
}
|
|
})
|
|
|
|
return fields
|
|
}
|
|
|
|
// extractMethods extracts methods from a type section.
|
|
func (p *Parser) extractMethods(typeSection *goquery.Selection) []*Method {
|
|
var methods []*Method
|
|
|
|
typeSection.Find(".Documentation-method, .Documentation-function").Each(func(_ int, s *goquery.Selection) {
|
|
m := &Method{}
|
|
|
|
// Extract method name
|
|
nameEl := s.Find(".Documentation-functionHeader, .Documentation-methodHeader").First()
|
|
if nameEl.Length() > 0 {
|
|
name := strings.TrimSpace(nameEl.Text())
|
|
// Extract receiver if present: (t *Type) Method(...)
|
|
if strings.HasPrefix(name, "(") {
|
|
if end := strings.Index(name, ")"); end > 0 {
|
|
m.Receiver = name[1:end]
|
|
name = strings.TrimSpace(name[end+1:])
|
|
}
|
|
}
|
|
m.Name = name
|
|
}
|
|
|
|
// Extract signature
|
|
sigEl := s.Find("pre").First()
|
|
if sigEl.Length() > 0 {
|
|
m.Signature = strings.TrimSpace(sigEl.Text())
|
|
}
|
|
|
|
// Extract documentation
|
|
docEl := s.Find("p").First()
|
|
if docEl.Length() > 0 {
|
|
m.Doc = strings.TrimSpace(docEl.Text())
|
|
}
|
|
|
|
if m.Name != "" {
|
|
methods = append(methods, m)
|
|
}
|
|
})
|
|
|
|
return methods
|
|
}
|
|
|
|
// extractConstants extracts constant declarations.
|
|
func (p *Parser) extractConstants(doc *goquery.Document) []*Value {
|
|
var constants []*Value
|
|
|
|
doc.Find(".Documentation-constants").Each(func(_ int, s *goquery.Selection) {
|
|
// Extract constant group
|
|
codeEl := s.Find("pre").First()
|
|
if codeEl.Length() > 0 {
|
|
v := &Value{
|
|
IsConst: true,
|
|
}
|
|
|
|
// Parse const declarations
|
|
text := codeEl.Text()
|
|
lines := strings.Split(text, "\n")
|
|
|
|
for _, line := range lines {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" || strings.HasPrefix(line, "//") {
|
|
continue
|
|
}
|
|
|
|
// Simple const: Name = value
|
|
if strings.Contains(line, "=") {
|
|
parts := strings.SplitN(line, "=", 2)
|
|
if len(parts) == 2 {
|
|
name := strings.TrimSpace(parts[0])
|
|
if v.Names == nil {
|
|
v.Names = []string{}
|
|
}
|
|
v.Names = append(v.Names, name)
|
|
if v.Name == "" {
|
|
v.Name = name
|
|
}
|
|
v.Value = strings.TrimSpace(parts[1])
|
|
}
|
|
}
|
|
}
|
|
|
|
// Extract documentation
|
|
docEl := s.Find("p").First()
|
|
if docEl.Length() > 0 {
|
|
v.Doc = strings.TrimSpace(docEl.Text())
|
|
}
|
|
|
|
if len(v.Names) > 0 {
|
|
constants = append(constants, v)
|
|
}
|
|
}
|
|
})
|
|
|
|
return constants
|
|
}
|
|
|
|
// extractVariables extracts variable declarations.
|
|
func (p *Parser) extractVariables(doc *goquery.Document) []*Value {
|
|
var variables []*Value
|
|
|
|
doc.Find(".Documentation-variables").Each(func(_ int, s *goquery.Selection) {
|
|
codeEl := s.Find("pre").First()
|
|
if codeEl.Length() > 0 {
|
|
v := &Value{
|
|
IsConst: false,
|
|
}
|
|
|
|
text := codeEl.Text()
|
|
// Parse var declarations
|
|
if strings.HasPrefix(text, "var ") {
|
|
text = strings.TrimPrefix(text, "var ")
|
|
}
|
|
|
|
lines := strings.Split(text, "\n")
|
|
for _, line := range lines {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" {
|
|
continue
|
|
}
|
|
|
|
// Parse: Name Type = value
|
|
if strings.Contains(line, "=") {
|
|
parts := strings.SplitN(line, "=", 2)
|
|
if len(parts) == 2 {
|
|
nameType := strings.TrimSpace(parts[0])
|
|
v.Name = strings.Fields(nameType)[0]
|
|
v.Value = strings.TrimSpace(parts[1])
|
|
break
|
|
}
|
|
} else {
|
|
// Just name and type
|
|
fields := strings.Fields(line)
|
|
if len(fields) >= 1 {
|
|
v.Name = fields[0]
|
|
if len(fields) >= 2 {
|
|
v.Type = strings.Join(fields[1:], " ")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Extract documentation
|
|
docEl := s.Find("p").First()
|
|
if docEl.Length() > 0 {
|
|
v.Doc = strings.TrimSpace(docEl.Text())
|
|
}
|
|
|
|
if v.Name != "" {
|
|
variables = append(variables, v)
|
|
}
|
|
}
|
|
})
|
|
|
|
return variables
|
|
}
|
|
|
|
// extractExamples extracts examples from a section.
|
|
func (p *Parser) extractExamples(section *goquery.Selection) []*Example {
|
|
var examples []*Example
|
|
|
|
section.Find(".Documentation-example").Each(func(_ int, s *goquery.Selection) {
|
|
ex := &Example{}
|
|
|
|
// Extract example name
|
|
nameEl := s.Find(".Documentation-exampleHeader").First()
|
|
if nameEl.Length() > 0 {
|
|
ex.Name = strings.TrimSpace(nameEl.Text())
|
|
}
|
|
|
|
// Extract code
|
|
codeEl := s.Find("pre, code").First()
|
|
if codeEl.Length() > 0 {
|
|
ex.Code = strings.TrimSpace(codeEl.Text())
|
|
}
|
|
|
|
// Extract output
|
|
outputEl := s.Find(".Documentation-exampleOutput").First()
|
|
if outputEl.Length() > 0 {
|
|
ex.Output = strings.TrimSpace(outputEl.Text())
|
|
}
|
|
|
|
// Extract documentation
|
|
docEl := s.Find("p").First()
|
|
if docEl.Length() > 0 {
|
|
ex.Doc = strings.TrimSpace(docEl.Text())
|
|
}
|
|
|
|
if ex.Code != "" {
|
|
examples = append(examples, ex)
|
|
}
|
|
})
|
|
|
|
return examples
|
|
}
|
|
|
|
// extractPackageExamples extracts package-level examples.
|
|
func (p *Parser) extractPackageExamples(doc *goquery.Document) []*Example {
|
|
var examples []*Example
|
|
|
|
doc.Find(".Documentation-example").Each(func(_ int, s *goquery.Selection) {
|
|
ex := &Example{}
|
|
|
|
// Extract example name
|
|
nameEl := s.Find(".Documentation-exampleHeader").First()
|
|
if nameEl.Length() > 0 {
|
|
ex.Name = strings.TrimSpace(nameEl.Text())
|
|
}
|
|
|
|
// Extract code
|
|
codeEl := s.Find("pre, code").First()
|
|
if codeEl.Length() > 0 {
|
|
ex.Code = strings.TrimSpace(codeEl.Text())
|
|
}
|
|
|
|
// Extract output
|
|
outputEl := s.Find(".Documentation-exampleOutput").First()
|
|
if outputEl.Length() > 0 {
|
|
ex.Output = strings.TrimSpace(outputEl.Text())
|
|
}
|
|
|
|
if ex.Code != "" {
|
|
examples = append(examples, ex)
|
|
}
|
|
})
|
|
|
|
return examples
|
|
}
|
|
|
|
// parseCount folds the ASCII digits of s into a base-10 integer. Every other
// character is ignored, so "1,234" yields 1234 and a string without digits
// yields 0.
func parseCount(s string) int {
	total := 0
	// Bytes of multi-byte UTF-8 sequences are all >= 0x80 and can never fall
	// in the '0'..'9' range, so scanning bytes is equivalent to scanning runes.
	for i := 0; i < len(s); i++ {
		if d := s[i]; '0' <= d && d <= '9' {
			total = total*10 + int(d-'0')
		}
	}
	return total
}
|
|
|
|
// isExported reports whether name is an exported Go identifier, that is,
// whether its first character is a Unicode uppercase letter. The empty string
// is not exported.
func isExported(name string) bool {
	// The first uppercase rune sits at byte offset 0 exactly when the name
	// starts with one; IndexFunc returns -1 for the empty string.
	return strings.IndexFunc(name, unicode.IsUpper) == 0
}
|
|
|
|
// cleanWhitespace collapses every run of space, tab, newline, form feed and
// carriage return characters into a single space and trims the result.
func cleanWhitespace(text string) string {
	var out strings.Builder
	out.Grow(len(text))

	inRun := false
	// The collapsed characters are all ASCII, so a byte scan cannot split a
	// multi-byte UTF-8 sequence.
	for i := 0; i < len(text); i++ {
		switch c := text[i]; c {
		case ' ', '\t', '\n', '\f', '\r':
			if !inRun {
				out.WriteByte(' ')
				inRun = true
			}
		default:
			out.WriteByte(c)
			inRun = false
		}
	}

	return strings.TrimSpace(out.String())
}
|