Files
Devour/pkg/godocs/parser.go
T
Tomas Dvorak 409acd2e08 updage
2026-02-22 15:41:27 +01:00

686 lines
16 KiB
Go

package godocs
import (
	"go/token"
	"net/url"
	"regexp"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)
// Parser turns pkg.go.dev HTML pages into structured documentation values.
type Parser struct {
	// baseURL is the site root that relative search-result links are
	// appended to when building absolute URLs.
	baseURL string
}

// NewParser returns a Parser configured for https://pkg.go.dev.
func NewParser() *Parser {
	parser := new(Parser)
	parser.baseURL = "https://pkg.go.dev"
	return parser
}
// ParsePackagePage parses the HTML of a pkg.go.dev package documentation page.
//
// html is the raw page markup. docURL is stored on the result and is also
// handed to the import-path extraction, which uses it as a fallback when the
// page has no usable breadcrumb. The returned Package is stamped with the
// current time. An error is returned only when goquery cannot build a
// document from html.
func (p *Parser) ParsePackagePage(html string, docURL string) (*Package, error) {
	page, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	// Every extractor only reads from the parsed document, so the result can
	// be assembled in a single literal.
	return &Package{
		DocURL:    docURL,
		FetchedAt: time.Now(),

		// Identity and header metadata.
		ImportPath: p.extractImportPath(page, docURL),
		Name:       p.extractPackageName(page),
		Synopsis:   p.extractSynopsis(page),
		Doc:        p.extractPackageDoc(page),
		Version:    p.extractVersion(page),
		Module:     p.extractModule(page),
		Licenses:   p.extractLicenses(page),
		ImportedBy: p.extractImportedBy(page),
		Repository: p.extractRepository(page),

		// Declared API surface.
		Functions: p.extractFunctions(page),
		Types:     p.extractTypes(page),
		Constants: p.extractConstants(page),
		Variables: p.extractVariables(page),
		Examples:  p.extractPackageExamples(page),
	}, nil
}
// Patterns applied to the info label of a search snippet. They are compiled
// once at package initialization instead of once per snippet.
var (
	searchImportedByRe = regexp.MustCompile(`Imported by\s+(\d[\d,]*)`)
	searchVersionRe    = regexp.MustCompile(`v?\d+\.\d+(?:\.\d+)?`)
)

// ParseSearchResults parses a pkg.go.dev search results page.
//
// Every element matching ".SearchSnippet" produces one SearchResult, in
// document order. Fields whose markup is missing keep their zero value. An
// error is returned only when goquery cannot build a document from html.
func (p *Parser) ParseSearchResults(html string) ([]*SearchResult, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	var results []*SearchResult
	doc.Find(".SearchSnippet").Each(func(_ int, s *goquery.Selection) {
		result := &SearchResult{}

		// Name, URL and a provisional path come from the heading link.
		s.Find("h2 a").Each(func(_ int, a *goquery.Selection) {
			result.Name = strings.TrimSpace(a.Text())
			if href, ok := a.Attr("href"); ok {
				result.URL = p.baseURL + href
				result.Path = strings.TrimPrefix(href, "/")
			}
		})

		// A dedicated path element, when present, overrides the path derived
		// from the link. Whitespace is removed before the surrounding
		// parentheses so that " (path) " is reduced to "path" as well.
		if pathSpan := s.Find(".SearchSnippet-header-path"); pathSpan.Length() > 0 {
			result.Path = strings.Trim(strings.TrimSpace(pathSpan.Text()), "()")
		}

		if synopsis := s.Find(".SearchSnippet-synopsis"); synopsis.Length() > 0 {
			result.Synopsis = strings.TrimSpace(synopsis.Text())
		}

		// The info label carries both the "Imported by" count and the version.
		infoLabel := s.Find(".SearchSnippet-infoLabel").Text()
		if m := searchImportedByRe.FindStringSubmatch(infoLabel); len(m) > 1 {
			// parseCount ignores the thousands separators.
			result.ImportedBy = parseCount(m[1])
		}
		result.Version = searchVersionRe.FindString(infoLabel)

		if license := s.Find("[data-test-id='snippet-license'] a"); license.Length() > 0 {
			result.License = strings.TrimSpace(license.Text())
		}

		results = append(results, result)
	})
	return results, nil
}
// extractImportPath determines the package import path.
//
// The breadcrumb links are preferred: every link after the first one
// contributes its trimmed, non-empty text, and the pieces are joined with "/".
// When that yields nothing, the path component of docURL is used instead, with
// the leading "/" removed and everything from the first "@" onwards dropped.
// An empty string is returned when neither source is usable.
func (p *Parser) extractImportPath(doc *goquery.Document, docURL string) string {
	var segments []string
	doc.Find(".go-Breadcrumb li a").Each(func(idx int, link *goquery.Selection) {
		if idx == 0 {
			return // the first crumb is the "Discover Packages" link
		}
		if seg := strings.TrimSpace(link.Text()); seg != "" {
			segments = append(segments, seg)
		}
	})
	if joined := strings.Join(segments, "/"); joined != "" {
		return joined
	}

	// Fallback: derive the path from the page URL.
	if docURL == "" {
		return ""
	}
	parsed, err := url.Parse(docURL)
	if err != nil {
		return ""
	}
	trimmed := strings.TrimPrefix(parsed.Path, "/")
	// Drop a version suffix such as "@v1.0.0".
	if at := strings.Index(trimmed, "@"); at > 0 {
		trimmed = trimmed[:at]
	}
	return trimmed
}
// extractPackageName returns the package name shown in the page header.
//
// The text of ".UnitHeader-titleHeading" is used when it is non-empty after
// trimming; otherwise the trimmed text of the first <h1> is returned.
func (p *Parser) extractPackageName(doc *goquery.Document) string {
	if heading := strings.TrimSpace(doc.Find(".UnitHeader-titleHeading").Text()); heading != "" {
		return heading
	}
	return strings.TrimSpace(doc.Find("h1").First().Text())
}
const (
	// synopsisMaxBytes is the longest synopsis, in bytes, returned unmodified.
	synopsisMaxBytes = 200
	// synopsisEllipsis is appended to a synopsis that had to be shortened.
	synopsisEllipsis = "..."
)

// extractSynopsis returns the package synopsis: the trimmed text of the first
// paragraph inside the first ".Documentation" element, shortened to a
// reasonable length. It returns "" when no such paragraph exists.
func (p *Parser) extractSynopsis(doc *goquery.Document) string {
	firstP := doc.Find(".Documentation").First().Find("p").First()
	if firstP.Length() == 0 {
		return ""
	}
	return truncateSynopsis(strings.TrimSpace(firstP.Text()))
}

// truncateSynopsis shortens s to at most synopsisMaxBytes bytes, ending it
// with synopsisEllipsis. The cut is placed on a rune boundary so a multi-byte
// UTF-8 character is never split in half; for ASCII text the result is the
// first 197 bytes followed by "...".
func truncateSynopsis(s string) string {
	if len(s) <= synopsisMaxBytes {
		return s
	}
	limit := synopsisMaxBytes - len(synopsisEllipsis)
	cut := 0
	// Ranging over a string visits the byte offset of every rune start, so
	// cut ends up as the last rune boundary that is not past limit.
	for i := range s {
		if i > limit {
			break
		}
		cut = i
	}
	return s[:cut] + synopsisEllipsis
}
// extractPackageDoc returns the full package documentation text.
//
// The text of every ".Documentation" element is whitespace-normalized with
// cleanWhitespace; non-empty results are joined with a blank line between
// them.
func (p *Parser) extractPackageDoc(doc *goquery.Document) string {
	sections := doc.Find(".Documentation")
	chunks := make([]string, 0, sections.Length())
	sections.Each(func(_ int, sec *goquery.Selection) {
		cleaned := cleanWhitespace(sec.Text())
		if cleaned == "" {
			return
		}
		chunks = append(chunks, cleaned)
	})
	return strings.Join(chunks, "\n\n")
}
// extractVersion returns the trimmed text of the version link in the unit
// header, or "" when the page has no such link.
func (p *Parser) extractVersion(doc *goquery.Document) string {
	link := doc.Find("[data-test-id='UnitHeader-version'] a")
	if link.Length() == 0 {
		return ""
	}
	return strings.TrimSpace(link.Text())
}
// extractModule builds the Module description for the page.
//
// The module path is the text of the last breadcrumb link (ignoring the first
// one) that contains a "/". The version is the text of the unit header's
// version link. nil is returned when no breadcrumb link qualifies as a module
// path.
func (p *Parser) extractModule(doc *goquery.Document) *Module {
	var path string
	doc.Find(".go-Breadcrumb li a").Each(func(idx int, crumb *goquery.Selection) {
		if idx == 0 {
			return
		}
		// Later matches overwrite earlier ones, so the last one wins.
		if label := strings.TrimSpace(crumb.Text()); strings.Contains(label, "/") {
			path = label
		}
	})
	if path == "" {
		return nil
	}
	return &Module{
		Path:    path,
		Version: p.extractVersion(doc),
	}
}
// extractLicenses collects the licenses listed in the unit header.
//
// Each element matching "[data-test-id='UnitHeader-license']" with non-empty
// trimmed text becomes one License; its Path is the element's own href
// attribute when it has one.
func (p *Parser) extractLicenses(doc *goquery.Document) []License {
	var found []License
	doc.Find("[data-test-id='UnitHeader-license']").Each(func(_ int, el *goquery.Selection) {
		label := strings.TrimSpace(el.Text())
		if label == "" {
			return
		}
		// A missing href yields "", which is exactly the Path of a License
		// without a link, so the "exists" flag is not needed.
		href, _ := el.Attr("href")
		found = append(found, License{Name: label, Path: href})
	})
	return found
}
// importedByCountRe matches a number that may contain thousands separators,
// e.g. "144,729". It must start with a digit so stray punctuation in the link
// text is never mistaken for a count.
var importedByCountRe = regexp.MustCompile(`\d[\d,]*`)

// extractImportedBy returns the "imported by" count shown in the unit header.
//
// The first number in the link text (for example "Imported by: 144,729") is
// parsed with parseCount, which skips the separators. 0 is returned when the
// link is missing or contains no number.
func (p *Parser) extractImportedBy(doc *goquery.Document) int {
	link := doc.Find("[data-test-id='UnitHeader-importedby'] a")
	if link.Length() == 0 {
		return 0
	}
	// FindString yields "" when nothing matches, which parseCount maps to 0.
	return parseCount(importedByCountRe.FindString(link.Text()))
}
// extractRepository returns the href of the repository link found under
// ".UnitMeta-repo", or "" when the link or its href attribute is absent.
func (p *Parser) extractRepository(doc *goquery.Document) string {
	link := doc.Find(".UnitMeta-repo a")
	if link.Length() == 0 {
		return ""
	}
	target, ok := link.Attr("href")
	if !ok {
		return ""
	}
	return target
}
// extractFunctions collects every function documented on the page.
//
// One Function is produced per ".Documentation-function" element whose header
// text is non-empty. The signature is the first <pre>; the doc text is the
// first paragraph of the function body, falling back to the first paragraph
// anywhere inside the element.
func (p *Parser) extractFunctions(doc *goquery.Document) []*Function {
	var out []*Function
	doc.Find(".Documentation-function").Each(func(_ int, block *goquery.Selection) {
		// Entries without a name are never reported, so skip them up front.
		name := strings.TrimSpace(block.Find(".Documentation-functionHeader").First().Text())
		if name == "" {
			return
		}

		para := block.Find(".Documentation-functionBody p").First()
		if para.Length() == 0 {
			para = block.Find("p").First()
		}

		out = append(out, &Function{
			Name:      name,
			Signature: strings.TrimSpace(block.Find("pre").First().Text()),
			Doc:       strings.TrimSpace(para.Text()),
			Examples:  p.extractExamples(block),
		})
	})
	return out
}
// extractTypes extracts all type declarations.
//
// One Type is produced per ".Documentation-type" element whose header text is
// non-empty. The first <pre> supplies the declaration text (stored trimmed in
// Underlying) and determines the kind; the first paragraph supplies the doc
// text. Methods and examples are collected from the same element.
func (p *Parser) extractTypes(doc *goquery.Document) []*Type {
	var types []*Type
	doc.Find(".Documentation-type").Each(func(_ int, s *goquery.Selection) {
		t := &Type{}

		if nameEl := s.Find(".Documentation-typeHeader").First(); nameEl.Length() > 0 {
			t.Name = strings.TrimSpace(nameEl.Text())
		}

		// The kind is decided by the keyword that opens the declared type
		// expression. Searching the whole text for "struct{" would miss the
		// gofmt spelling "struct {" and would be fooled by nested literals
		// such as an "interface{}" field inside a struct.
		if sigEl := s.Find("pre").First(); sigEl.Length() > 0 {
			sig := sigEl.Text()
			t.Underlying = strings.TrimSpace(sig)
			switch expr := typeDeclExpr(sig); {
			case startsWithKeyword(expr, "struct"):
				t.Kind = TypeKindStruct
				t.Fields = p.extractStructFields(sigEl)
			case startsWithKeyword(expr, "interface"):
				t.Kind = TypeKindInterface
			default:
				t.Kind = TypeKindAlias
			}
		}

		if docEl := s.Find("p").First(); docEl.Length() > 0 {
			t.Doc = strings.TrimSpace(docEl.Text())
		}

		t.Methods = p.extractMethods(s)
		t.Examples = p.extractExamples(s)

		if t.Name != "" {
			types = append(types, t)
		}
	})
	return types
}

// typeDeclExpr returns the type expression of a "type Name ..." declaration:
// the text that follows the name, an optional type-parameter list and an
// optional alias "=". It returns "" when sig has nothing after the name.
func typeDeclExpr(sig string) string {
	decl := strings.TrimSpace(sig)
	decl = strings.TrimSpace(strings.TrimPrefix(decl, "type "))

	// Skip the declared name.
	nameEnd := strings.IndexAny(decl, " \t\n[")
	if nameEnd < 0 {
		return ""
	}
	decl = decl[nameEnd:]

	// A '[' directly after the name opens a type-parameter list (an array
	// type would be separated from the name by a space); skip the balanced
	// bracket group.
	if strings.HasPrefix(decl, "[") {
		depth, closing := 0, -1
		for i, r := range decl {
			if r == '[' {
				depth++
			} else if r == ']' {
				depth--
				if depth == 0 {
					closing = i
					break
				}
			}
		}
		if closing < 0 {
			return ""
		}
		decl = decl[closing+1:]
	}

	decl = strings.TrimSpace(decl)
	// "type A = B" declares an alias; the expression follows the '='.
	return strings.TrimSpace(strings.TrimPrefix(decl, "="))
}

// startsWithKeyword reports whether expr begins with kw as a whole word, i.e.
// kw is followed by '{', whitespace, or the end of the string.
func startsWithKeyword(expr, kw string) bool {
	if !strings.HasPrefix(expr, kw) {
		return false
	}
	rest := expr[len(kw):]
	if rest == "" {
		return true
	}
	switch rest[0] {
	case '{', ' ', '\t', '\n', '\r':
		return true
	}
	return false
}
// extractStructFields extracts struct fields from a type definition.
//
// Every "tr" or ".Documentation-structField" element inside sigEl is read as
// one "Name Type `tag`" line. Empty lines and lines starting with "//" are
// skipped. The first word becomes the field name and the remainder its type;
// a struct tag, when present, is moved from the type into Tag.
func (p *Parser) extractStructFields(sigEl *goquery.Selection) []*Field {
	var fields []*Field
	sigEl.Find("tr, .Documentation-structField").Each(func(_ int, s *goquery.Selection) {
		text := strings.TrimSpace(s.Text())
		if text == "" || strings.HasPrefix(text, "//") {
			return
		}

		parts := strings.Fields(text)
		if len(parts) == 0 {
			return
		}
		field := &Field{
			Name:     parts[0],
			Exported: isExported(parts[0]),
		}
		if len(parts) > 1 {
			field.Type = strings.Join(parts[1:], " ")
			// Split off the tag. The type is trimmed afterwards because the
			// space that separated it from the tag would otherwise be kept.
			if idx := strings.Index(field.Type, "`"); idx > 0 {
				field.Tag = field.Type[idx:]
				field.Type = strings.TrimSpace(field.Type[:idx])
			}
		}
		fields = append(fields, field)
	})
	return fields
}
// extractMethods collects the methods documented inside a type section.
//
// Both ".Documentation-method" and ".Documentation-function" elements are
// considered. When the header text starts with a parenthesized receiver, such
// as "(t *Type) Method", the text between the parentheses is stored as the
// receiver and the remainder as the name. Entries without a name are dropped.
func (p *Parser) extractMethods(typeSection *goquery.Selection) []*Method {
	var collected []*Method
	typeSection.Find(".Documentation-method, .Documentation-function").Each(func(_ int, block *goquery.Selection) {
		header := strings.TrimSpace(
			block.Find(".Documentation-functionHeader, .Documentation-methodHeader").First().Text(),
		)

		var receiver string
		if strings.HasPrefix(header, "(") {
			if closing := strings.Index(header, ")"); closing > 0 {
				receiver = header[1:closing]
				header = strings.TrimSpace(header[closing+1:])
			}
		}
		if header == "" {
			return
		}

		collected = append(collected, &Method{
			Name:      header,
			Receiver:  receiver,
			Signature: strings.TrimSpace(block.Find("pre").First().Text()),
			Doc:       strings.TrimSpace(block.Find("p").First().Text()),
		})
	})
	return collected
}
// extractConstants extracts constant declarations.
//
// Each ".Documentation-constants" element with a <pre> block yields at most
// one Value describing the whole group. Names lists every identifier that is
// assigned with "="; Name and Value describe the first of them. Lines without
// "=" (for example bare iota continuation lines) are not collected. The first
// paragraph of the element becomes the doc text. Groups without any named
// constant are dropped.
func (p *Parser) extractConstants(doc *goquery.Document) []*Value {
	var constants []*Value
	doc.Find(".Documentation-constants").Each(func(_ int, s *goquery.Selection) {
		codeEl := s.Find("pre").First()
		if codeEl.Length() == 0 {
			return
		}

		v := &Value{IsConst: true}
		for _, line := range strings.Split(codeEl.Text(), "\n") {
			line = strings.TrimSpace(line)
			if line == "" || strings.HasPrefix(line, "//") {
				continue
			}
			// Ungrouped declarations carry the keyword on the same line
			// ("const A = 1"); it is not part of the name.
			line = strings.TrimSpace(strings.TrimPrefix(line, "const "))

			parts := strings.SplitN(line, "=", 2)
			if len(parts) != 2 {
				continue
			}
			names := constDeclNames(parts[0])
			if len(names) == 0 {
				continue
			}
			// Keep Name and Value paired: both describe the first constant.
			if v.Name == "" {
				v.Name = names[0]
				v.Value = strings.TrimSpace(parts[1])
			}
			v.Names = append(v.Names, names...)
		}

		if docEl := s.Find("p").First(); docEl.Length() > 0 {
			v.Doc = strings.TrimSpace(docEl.Text())
		}

		if len(v.Names) > 0 {
			constants = append(constants, v)
		}
	})
	return constants
}

// constDeclNames returns the identifiers on the left-hand side of a constant
// declaration. A trailing type is ignored and comma-separated lists are
// split, so "A, B int" yields ["A", "B"].
func constDeclNames(lhs string) []string {
	var names []string
	for _, item := range strings.Split(lhs, ",") {
		if fields := strings.Fields(item); len(fields) > 0 {
			names = append(names, fields[0])
		}
	}
	return names
}
// extractVariables extracts variable declarations.
//
// Each ".Documentation-variables" element with a <pre> block yields at most
// one Value, describing the first declaration found in that block. For
// "Name = value" the name and value are recorded; for "Name Type" the name
// and type are recorded. Grouping parentheses, the "var" keyword, blank lines
// and comment lines are ignored. The first paragraph of the element becomes
// the doc text. Blocks without a declaration are dropped.
func (p *Parser) extractVariables(doc *goquery.Document) []*Value {
	var variables []*Value
	doc.Find(".Documentation-variables").Each(func(_ int, s *goquery.Selection) {
		codeEl := s.Find("pre").First()
		if codeEl.Length() == 0 {
			return
		}

		v := &Value{IsConst: false}
		for _, line := range strings.Split(codeEl.Text(), "\n") {
			line = strings.TrimSpace(line)
			// "var x int" and "var (" both start with the keyword.
			line = strings.TrimSpace(strings.TrimPrefix(line, "var "))
			if line == "" || line == "(" || line == ")" || strings.HasPrefix(line, "//") {
				continue
			}

			lhs, value, hasValue := line, "", false
			if parts := strings.SplitN(line, "=", 2); len(parts) == 2 {
				lhs, value, hasValue = parts[0], strings.TrimSpace(parts[1]), true
			}
			fields := strings.Fields(lhs)
			if len(fields) == 0 {
				// Nothing precedes the "=" (e.g. "= value"), so there is no
				// name to record; indexing fields here would panic.
				continue
			}

			v.Name = fields[0]
			if hasValue {
				v.Value = value
			} else if len(fields) > 1 {
				v.Type = strings.Join(fields[1:], " ")
			}
			// The first real declaration describes the block.
			break
		}

		if docEl := s.Find("p").First(); docEl.Length() > 0 {
			v.Doc = strings.TrimSpace(docEl.Text())
		}

		if v.Name != "" {
			variables = append(variables, v)
		}
	})
	return variables
}
// extractExamples collects the examples found inside section.
//
// One Example is produced per ".Documentation-example" element that contains
// code (the first <pre> or <code>). Name, output and doc text come from the
// example header, the ".Documentation-exampleOutput" element and the first
// paragraph respectively; each is left empty when its element is missing.
func (p *Parser) extractExamples(section *goquery.Selection) []*Example {
	var collected []*Example
	section.Find(".Documentation-example").Each(func(_ int, node *goquery.Selection) {
		// firstText returns the trimmed text of the first match, or "".
		firstText := func(selector string) string {
			return strings.TrimSpace(node.Find(selector).First().Text())
		}

		code := firstText("pre, code")
		if code == "" {
			return
		}
		collected = append(collected, &Example{
			Name:   firstText(".Documentation-exampleHeader"),
			Code:   code,
			Output: firstText(".Documentation-exampleOutput"),
			Doc:    firstText("p"),
		})
	})
	return collected
}
// extractPackageExamples collects every example on the page.
//
// It searches the whole document for ".Documentation-example" elements and
// records name, code and output for each one that contains code. Unlike
// extractExamples it does not fill in the doc text.
func (p *Parser) extractPackageExamples(doc *goquery.Document) []*Example {
	var collected []*Example
	doc.Find(".Documentation-example").Each(func(_ int, node *goquery.Selection) {
		code := strings.TrimSpace(node.Find("pre, code").First().Text())
		if code == "" {
			return
		}
		header := node.Find(".Documentation-exampleHeader").First()
		output := node.Find(".Documentation-exampleOutput").First()
		collected = append(collected, &Example{
			Name:   strings.TrimSpace(header.Text()),
			Code:   code,
			Output: strings.TrimSpace(output.Text()),
		})
	})
	return collected
}
// parseCount converts the ASCII digits in s to an int, ignoring every other
// character (such as thousands separators). It returns 0 when s contains no
// digits.
func parseCount(s string) int {
	total := 0
	for i := 0; i < len(s); i++ {
		// For bytes below '0' the unsigned subtraction wraps around to a
		// value above 9, so a single comparison rejects all non-digits.
		digit := s[i] - '0'
		if digit > 9 {
			continue
		}
		total = total*10 + int(digit)
	}
	return total
}
// isExported reports whether name is an exported Go identifier, i.e. whether
// its first character is a Unicode upper-case letter. This follows the
// language rule, so identifiers such as "Ünicode" count as exported, not only
// those starting with 'A'-'Z'. The empty string is not exported.
func isExported(name string) bool {
	return token.IsExported(name)
}
// whitespaceRunRe matches one or more consecutive whitespace characters. It is
// compiled once at package initialization rather than on every call.
var whitespaceRunRe = regexp.MustCompile(`\s+`)

// cleanWhitespace normalizes whitespace in text: every run of whitespace is
// replaced by a single space, then leading and trailing whitespace is removed.
func cleanWhitespace(text string) string {
	return strings.TrimSpace(whitespaceRunRe.ReplaceAllString(text, " "))
}