mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
first commit
This commit is contained in:
@@ -0,0 +1,687 @@
|
||||
package godocs
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// Parser converts pkg.go.dev HTML into the structured documentation types
// declared in this package.
type Parser struct {
	// baseURL is the site origin that relative search-result links are
	// prefixed with.
	baseURL string
}
|
||||
|
||||
// NewParser creates a new parser for pkg.go.dev content.
|
||||
func NewParser() *Parser {
|
||||
return &Parser{
|
||||
baseURL: "https://pkg.go.dev",
|
||||
}
|
||||
}
|
||||
|
||||
// ParsePackagePage parses a pkg.go.dev package documentation page.
|
||||
func (p *Parser) ParsePackagePage(html string, docURL string) (*Package, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pkg := &Package{
|
||||
DocURL: docURL,
|
||||
FetchedAt: time.Now(),
|
||||
}
|
||||
|
||||
// Extract import path from URL or breadcrumb
|
||||
pkg.ImportPath = p.extractImportPath(doc, docURL)
|
||||
pkg.Name = p.extractPackageName(doc)
|
||||
|
||||
// Extract synopsis
|
||||
pkg.Synopsis = p.extractSynopsis(doc)
|
||||
|
||||
// Extract package documentation
|
||||
pkg.Doc = p.extractPackageDoc(doc)
|
||||
|
||||
// Extract version info
|
||||
pkg.Version = p.extractVersion(doc)
|
||||
|
||||
// Extract module info
|
||||
pkg.Module = p.extractModule(doc)
|
||||
|
||||
// Extract licenses
|
||||
pkg.Licenses = p.extractLicenses(doc)
|
||||
|
||||
// Extract imported by count
|
||||
pkg.ImportedBy = p.extractImportedBy(doc)
|
||||
|
||||
// Extract repository URL
|
||||
pkg.Repository = p.extractRepository(doc)
|
||||
|
||||
// Extract functions
|
||||
pkg.Functions = p.extractFunctions(doc)
|
||||
|
||||
// Extract types
|
||||
pkg.Types = p.extractTypes(doc)
|
||||
|
||||
// Extract constants
|
||||
pkg.Constants = p.extractConstants(doc)
|
||||
|
||||
// Extract variables
|
||||
pkg.Variables = p.extractVariables(doc)
|
||||
|
||||
// Extract examples
|
||||
pkg.Examples = p.extractPackageExamples(doc)
|
||||
|
||||
return pkg, nil
|
||||
}
|
||||
|
||||
// ParseSearchResults parses pkg.go.dev search results page.
|
||||
func (p *Parser) ParseSearchResults(html string) ([]*SearchResult, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var results []*SearchResult
|
||||
|
||||
doc.Find(".SearchSnippet").Each(func(i int, s *goquery.Selection) {
|
||||
result := &SearchResult{}
|
||||
|
||||
// Extract name and path
|
||||
s.Find("h2 a").Each(func(_ int, a *goquery.Selection) {
|
||||
result.Name = strings.TrimSpace(a.Text())
|
||||
if href, exists := a.Attr("href"); exists {
|
||||
result.URL = p.baseURL + href
|
||||
result.Path = strings.TrimPrefix(href, "/")
|
||||
}
|
||||
})
|
||||
|
||||
// Extract path from span
|
||||
pathSpan := s.Find(".SearchSnippet-header-path")
|
||||
if pathSpan.Length() > 0 {
|
||||
result.Path = strings.Trim(pathSpan.Text(), "()")
|
||||
}
|
||||
|
||||
// Extract synopsis
|
||||
synopsis := s.Find(".SearchSnippet-synopsis")
|
||||
if synopsis.Length() > 0 {
|
||||
result.Synopsis = strings.TrimSpace(synopsis.Text())
|
||||
}
|
||||
|
||||
// Extract imported by count
|
||||
infoLabel := s.Find(".SearchSnippet-infoLabel").Text()
|
||||
if strings.Contains(infoLabel, "Imported by") {
|
||||
re := regexp.MustCompile(`Imported by\s+(\d[\d,]*)`)
|
||||
if matches := re.FindStringSubmatch(infoLabel); len(matches) > 1 {
|
||||
countStr := strings.ReplaceAll(matches[1], ",", "")
|
||||
result.ImportedBy = parseCount(countStr)
|
||||
}
|
||||
}
|
||||
|
||||
// Extract version
|
||||
versionMatch := regexp.MustCompile(`v?\d+\.\d+(?:\.\d+)?`).FindString(infoLabel)
|
||||
result.Version = versionMatch
|
||||
|
||||
// Extract license
|
||||
license := s.Find("[data-test-id='snippet-license'] a")
|
||||
if license.Length() > 0 {
|
||||
result.License = strings.TrimSpace(license.Text())
|
||||
}
|
||||
|
||||
results = append(results, result)
|
||||
})
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// extractImportPath extracts the import path from the page.
|
||||
func (p *Parser) extractImportPath(doc *goquery.Document, docURL string) string {
|
||||
// Try to extract from breadcrumb
|
||||
var importPath string
|
||||
doc.Find(".go-Breadcrumb li a").Each(func(i int, s *goquery.Selection) {
|
||||
if i > 0 { // Skip "Discover Packages"
|
||||
part := strings.TrimSpace(s.Text())
|
||||
if part != "" {
|
||||
if importPath != "" {
|
||||
importPath += "/"
|
||||
}
|
||||
importPath += part
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
if importPath != "" {
|
||||
return importPath
|
||||
}
|
||||
|
||||
// Fallback: extract from URL
|
||||
if docURL != "" {
|
||||
u, err := url.Parse(docURL)
|
||||
if err == nil {
|
||||
path := strings.TrimPrefix(u.Path, "/")
|
||||
// Remove version suffix like @v1.0.0
|
||||
if idx := strings.Index(path, "@"); idx > 0 {
|
||||
path = path[:idx]
|
||||
}
|
||||
return path
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// extractPackageName extracts the package name.
|
||||
func (p *Parser) extractPackageName(doc *goquery.Document) string {
|
||||
// Try UnitHeader-title
|
||||
title := doc.Find(".UnitHeader-titleHeading").Text()
|
||||
title = strings.TrimSpace(title)
|
||||
if title != "" {
|
||||
return title
|
||||
}
|
||||
|
||||
// Fallback to h1
|
||||
title = doc.Find("h1").First().Text()
|
||||
return strings.TrimSpace(title)
|
||||
}
|
||||
|
||||
// extractSynopsis extracts the package synopsis.
|
||||
func (p *Parser) extractSynopsis(doc *goquery.Document) string {
|
||||
// Synopsis is typically in the first paragraph after the package declaration
|
||||
docSection := doc.Find(".Documentation").First()
|
||||
if docSection.Length() > 0 {
|
||||
// Get the first paragraph
|
||||
firstP := docSection.Find("p").First()
|
||||
if firstP.Length() > 0 {
|
||||
synopsis := strings.TrimSpace(firstP.Text())
|
||||
// Limit to reasonable length
|
||||
if len(synopsis) > 200 {
|
||||
synopsis = synopsis[:197] + "..."
|
||||
}
|
||||
return synopsis
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// extractPackageDoc extracts the full package documentation.
|
||||
func (p *Parser) extractPackageDoc(doc *goquery.Document) string {
|
||||
var parts []string
|
||||
|
||||
doc.Find(".Documentation").Each(func(_ int, s *goquery.Selection) {
|
||||
text := s.Text()
|
||||
text = cleanWhitespace(text)
|
||||
if text != "" {
|
||||
parts = append(parts, text)
|
||||
}
|
||||
})
|
||||
|
||||
return strings.Join(parts, "\n\n")
|
||||
}
|
||||
|
||||
// extractVersion extracts the version info.
|
||||
func (p *Parser) extractVersion(doc *goquery.Document) string {
|
||||
versionEl := doc.Find("[data-test-id='UnitHeader-version'] a")
|
||||
if versionEl.Length() > 0 {
|
||||
return strings.TrimSpace(versionEl.Text())
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// extractModule extracts module information.
|
||||
func (p *Parser) extractModule(doc *goquery.Document) *Module {
|
||||
modulePath := ""
|
||||
moduleVersion := ""
|
||||
|
||||
// Try to extract from version link
|
||||
versionEl := doc.Find("[data-test-id='UnitHeader-version'] a")
|
||||
if versionEl.Length() > 0 {
|
||||
moduleVersion = strings.TrimSpace(versionEl.Text())
|
||||
}
|
||||
|
||||
// Extract module path from breadcrumb
|
||||
doc.Find(".go-Breadcrumb li a").Each(func(i int, s *goquery.Selection) {
|
||||
text := strings.TrimSpace(s.Text())
|
||||
if strings.Contains(text, "/") && i > 0 {
|
||||
modulePath = text
|
||||
}
|
||||
})
|
||||
|
||||
if modulePath != "" {
|
||||
return &Module{
|
||||
Path: modulePath,
|
||||
Version: moduleVersion,
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// extractLicenses extracts license information.
|
||||
func (p *Parser) extractLicenses(doc *goquery.Document) []License {
|
||||
var licenses []License
|
||||
|
||||
doc.Find("[data-test-id='UnitHeader-license']").Each(func(_ int, s *goquery.Selection) {
|
||||
name := strings.TrimSpace(s.Text())
|
||||
if name != "" {
|
||||
license := License{Name: name}
|
||||
if href, exists := s.Attr("href"); exists {
|
||||
license.Path = href
|
||||
}
|
||||
licenses = append(licenses, license)
|
||||
}
|
||||
})
|
||||
|
||||
return licenses
|
||||
}
|
||||
|
||||
// extractImportedBy extracts the import count.
|
||||
func (p *Parser) extractImportedBy(doc *goquery.Document) int {
|
||||
importEl := doc.Find("[data-test-id='UnitHeader-importedby'] a")
|
||||
if importEl.Length() > 0 {
|
||||
text := importEl.Text()
|
||||
// Extract number from "Imported by: 144,729"
|
||||
re := regexp.MustCompile(`[\d,]+`)
|
||||
if match := re.FindString(text); match != "" {
|
||||
match = strings.ReplaceAll(match, ",", "")
|
||||
var count int
|
||||
for _, c := range match {
|
||||
if c >= '0' && c <= '9' {
|
||||
count = count*10 + int(c-'0')
|
||||
}
|
||||
}
|
||||
return count
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// extractRepository extracts the repository URL.
|
||||
func (p *Parser) extractRepository(doc *goquery.Document) string {
|
||||
repoEl := doc.Find(".UnitMeta-repo a")
|
||||
if repoEl.Length() > 0 {
|
||||
if href, exists := repoEl.Attr("href"); exists {
|
||||
return href
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// extractFunctions extracts all function declarations.
|
||||
func (p *Parser) extractFunctions(doc *goquery.Document) []*Function {
|
||||
var functions []*Function
|
||||
|
||||
doc.Find(".Documentation-function").Each(func(_ int, s *goquery.Selection) {
|
||||
fn := &Function{}
|
||||
|
||||
// Extract name from the function header
|
||||
nameEl := s.Find(".Documentation-functionHeader").First()
|
||||
if nameEl.Length() > 0 {
|
||||
fn.Name = strings.TrimSpace(nameEl.Text())
|
||||
}
|
||||
|
||||
// Extract signature from code block
|
||||
sigEl := s.Find("pre").First()
|
||||
if sigEl.Length() > 0 {
|
||||
fn.Signature = strings.TrimSpace(sigEl.Text())
|
||||
}
|
||||
|
||||
// Extract documentation
|
||||
docEl := s.Find(".Documentation-functionBody p").First()
|
||||
if docEl.Length() == 0 {
|
||||
docEl = s.Find("p").First()
|
||||
}
|
||||
if docEl.Length() > 0 {
|
||||
fn.Doc = strings.TrimSpace(docEl.Text())
|
||||
}
|
||||
|
||||
// Extract examples
|
||||
fn.Examples = p.extractExamples(s)
|
||||
|
||||
if fn.Name != "" {
|
||||
functions = append(functions, fn)
|
||||
}
|
||||
})
|
||||
|
||||
return functions
|
||||
}
|
||||
|
||||
// extractTypes extracts all type declarations.
|
||||
func (p *Parser) extractTypes(doc *goquery.Document) []*Type {
|
||||
var types []*Type
|
||||
|
||||
doc.Find(".Documentation-type").Each(func(_ int, s *goquery.Selection) {
|
||||
t := &Type{}
|
||||
|
||||
// Extract name from the type header
|
||||
nameEl := s.Find(".Documentation-typeHeader").First()
|
||||
if nameEl.Length() > 0 {
|
||||
t.Name = strings.TrimSpace(nameEl.Text())
|
||||
}
|
||||
|
||||
// Determine kind from signature
|
||||
sigEl := s.Find("pre").First()
|
||||
if sigEl.Length() > 0 {
|
||||
sig := sigEl.Text()
|
||||
t.Underlying = strings.TrimSpace(sig)
|
||||
|
||||
if strings.Contains(sig, "struct{") {
|
||||
t.Kind = TypeKindStruct
|
||||
t.Fields = p.extractStructFields(sigEl)
|
||||
} else if strings.Contains(sig, "interface{") {
|
||||
t.Kind = TypeKindInterface
|
||||
} else {
|
||||
t.Kind = TypeKindAlias
|
||||
}
|
||||
}
|
||||
|
||||
// Extract documentation
|
||||
docEl := s.Find("p").First()
|
||||
if docEl.Length() > 0 {
|
||||
t.Doc = strings.TrimSpace(docEl.Text())
|
||||
}
|
||||
|
||||
// Extract methods
|
||||
t.Methods = p.extractMethods(s)
|
||||
|
||||
// Extract examples
|
||||
t.Examples = p.extractExamples(s)
|
||||
|
||||
if t.Name != "" {
|
||||
types = append(types, t)
|
||||
}
|
||||
})
|
||||
|
||||
return types
|
||||
}
|
||||
|
||||
// extractStructFields extracts struct fields from a type definition.
|
||||
func (p *Parser) extractStructFields(sigEl *goquery.Selection) []*Field {
|
||||
var fields []*Field
|
||||
|
||||
sigEl.Find("tr, .Documentation-structField").Each(func(_ int, s *goquery.Selection) {
|
||||
text := s.Text()
|
||||
text = strings.TrimSpace(text)
|
||||
|
||||
if text == "" || strings.HasPrefix(text, "//") {
|
||||
return
|
||||
}
|
||||
|
||||
// Parse field: Name Type `tag`
|
||||
parts := strings.Fields(text)
|
||||
if len(parts) >= 1 {
|
||||
field := &Field{
|
||||
Name: parts[0],
|
||||
Exported: isExported(parts[0]),
|
||||
}
|
||||
|
||||
if len(parts) >= 2 {
|
||||
field.Type = strings.Join(parts[1:], " ")
|
||||
// Remove tag
|
||||
if idx := strings.Index(field.Type, "`"); idx > 0 {
|
||||
field.Tag = field.Type[idx:]
|
||||
field.Type = field.Type[:idx]
|
||||
}
|
||||
}
|
||||
|
||||
fields = append(fields, field)
|
||||
}
|
||||
})
|
||||
|
||||
return fields
|
||||
}
|
||||
|
||||
// extractMethods extracts methods from a type section.
|
||||
func (p *Parser) extractMethods(typeSection *goquery.Selection) []*Method {
|
||||
var methods []*Method
|
||||
|
||||
typeSection.Find(".Documentation-method, .Documentation-function").Each(func(_ int, s *goquery.Selection) {
|
||||
m := &Method{}
|
||||
|
||||
// Extract method name
|
||||
nameEl := s.Find(".Documentation-functionHeader, .Documentation-methodHeader").First()
|
||||
if nameEl.Length() > 0 {
|
||||
name := strings.TrimSpace(nameEl.Text())
|
||||
// Extract receiver if present: (t *Type) Method(...)
|
||||
if strings.HasPrefix(name, "(") {
|
||||
if end := strings.Index(name, ")"); end > 0 {
|
||||
m.Receiver = name[1:end]
|
||||
name = strings.TrimSpace(name[end+1:])
|
||||
}
|
||||
}
|
||||
m.Name = name
|
||||
}
|
||||
|
||||
// Extract signature
|
||||
sigEl := s.Find("pre").First()
|
||||
if sigEl.Length() > 0 {
|
||||
m.Signature = strings.TrimSpace(sigEl.Text())
|
||||
}
|
||||
|
||||
// Extract documentation
|
||||
docEl := s.Find("p").First()
|
||||
if docEl.Length() > 0 {
|
||||
m.Doc = strings.TrimSpace(docEl.Text())
|
||||
}
|
||||
|
||||
if m.Name != "" {
|
||||
methods = append(methods, m)
|
||||
}
|
||||
})
|
||||
|
||||
return methods
|
||||
}
|
||||
|
||||
// extractConstants extracts constant declarations.
|
||||
func (p *Parser) extractConstants(doc *goquery.Document) []*Value {
|
||||
var constants []*Value
|
||||
|
||||
doc.Find(".Documentation-constants").Each(func(_ int, s *goquery.Selection) {
|
||||
// Extract constant group
|
||||
codeEl := s.Find("pre").First()
|
||||
if codeEl.Length() > 0 {
|
||||
v := &Value{
|
||||
IsConst: true,
|
||||
}
|
||||
|
||||
// Parse const declarations
|
||||
text := codeEl.Text()
|
||||
lines := strings.Split(text, "\n")
|
||||
|
||||
for _, line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" || strings.HasPrefix(line, "//") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Simple const: Name = value
|
||||
if strings.Contains(line, "=") {
|
||||
parts := strings.SplitN(line, "=", 2)
|
||||
if len(parts) == 2 {
|
||||
name := strings.TrimSpace(parts[0])
|
||||
if v.Names == nil {
|
||||
v.Names = []string{}
|
||||
}
|
||||
v.Names = append(v.Names, name)
|
||||
if v.Name == "" {
|
||||
v.Name = name
|
||||
}
|
||||
v.Value = strings.TrimSpace(parts[1])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Extract documentation
|
||||
docEl := s.Find("p").First()
|
||||
if docEl.Length() > 0 {
|
||||
v.Doc = strings.TrimSpace(docEl.Text())
|
||||
}
|
||||
|
||||
if len(v.Names) > 0 {
|
||||
constants = append(constants, v)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
return constants
|
||||
}
|
||||
|
||||
// extractVariables extracts variable declarations.
|
||||
func (p *Parser) extractVariables(doc *goquery.Document) []*Value {
|
||||
var variables []*Value
|
||||
|
||||
doc.Find(".Documentation-variables").Each(func(_ int, s *goquery.Selection) {
|
||||
codeEl := s.Find("pre").First()
|
||||
if codeEl.Length() > 0 {
|
||||
v := &Value{
|
||||
IsConst: false,
|
||||
}
|
||||
|
||||
text := codeEl.Text()
|
||||
// Parse var declarations
|
||||
if strings.HasPrefix(text, "var ") {
|
||||
text = strings.TrimPrefix(text, "var ")
|
||||
}
|
||||
|
||||
lines := strings.Split(text, "\n")
|
||||
for _, line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse: Name Type = value
|
||||
if strings.Contains(line, "=") {
|
||||
parts := strings.SplitN(line, "=", 2)
|
||||
if len(parts) == 2 {
|
||||
nameType := strings.TrimSpace(parts[0])
|
||||
v.Name = strings.Fields(nameType)[0]
|
||||
v.Value = strings.TrimSpace(parts[1])
|
||||
break
|
||||
}
|
||||
} else {
|
||||
// Just name and type
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 1 {
|
||||
v.Name = fields[0]
|
||||
if len(fields) >= 2 {
|
||||
v.Type = strings.Join(fields[1:], " ")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Extract documentation
|
||||
docEl := s.Find("p").First()
|
||||
if docEl.Length() > 0 {
|
||||
v.Doc = strings.TrimSpace(docEl.Text())
|
||||
}
|
||||
|
||||
if v.Name != "" {
|
||||
variables = append(variables, v)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
return variables
|
||||
}
|
||||
|
||||
// extractExamples extracts examples from a section.
|
||||
func (p *Parser) extractExamples(section *goquery.Selection) []*Example {
|
||||
var examples []*Example
|
||||
|
||||
section.Find(".Documentation-example").Each(func(_ int, s *goquery.Selection) {
|
||||
ex := &Example{}
|
||||
|
||||
// Extract example name
|
||||
nameEl := s.Find(".Documentation-exampleHeader").First()
|
||||
if nameEl.Length() > 0 {
|
||||
ex.Name = strings.TrimSpace(nameEl.Text())
|
||||
}
|
||||
|
||||
// Extract code
|
||||
codeEl := s.Find("pre, code").First()
|
||||
if codeEl.Length() > 0 {
|
||||
ex.Code = strings.TrimSpace(codeEl.Text())
|
||||
}
|
||||
|
||||
// Extract output
|
||||
outputEl := s.Find(".Documentation-exampleOutput").First()
|
||||
if outputEl.Length() > 0 {
|
||||
ex.Output = strings.TrimSpace(outputEl.Text())
|
||||
}
|
||||
|
||||
// Extract documentation
|
||||
docEl := s.Find("p").First()
|
||||
if docEl.Length() > 0 {
|
||||
ex.Doc = strings.TrimSpace(docEl.Text())
|
||||
}
|
||||
|
||||
if ex.Code != "" {
|
||||
examples = append(examples, ex)
|
||||
}
|
||||
})
|
||||
|
||||
return examples
|
||||
}
|
||||
|
||||
// extractPackageExamples extracts package-level examples.
|
||||
func (p *Parser) extractPackageExamples(doc *goquery.Document) []*Example {
|
||||
var examples []*Example
|
||||
|
||||
doc.Find(".Documentation-example").Each(func(_ int, s *goquery.Selection) {
|
||||
ex := &Example{}
|
||||
|
||||
// Extract example name
|
||||
nameEl := s.Find(".Documentation-exampleHeader").First()
|
||||
if nameEl.Length() > 0 {
|
||||
ex.Name = strings.TrimSpace(nameEl.Text())
|
||||
}
|
||||
|
||||
// Extract code
|
||||
codeEl := s.Find("pre, code").First()
|
||||
if codeEl.Length() > 0 {
|
||||
ex.Code = strings.TrimSpace(codeEl.Text())
|
||||
}
|
||||
|
||||
// Extract output
|
||||
outputEl := s.Find(".Documentation-exampleOutput").First()
|
||||
if outputEl.Length() > 0 {
|
||||
ex.Output = strings.TrimSpace(outputEl.Text())
|
||||
}
|
||||
|
||||
if ex.Code != "" {
|
||||
examples = append(examples, ex)
|
||||
}
|
||||
})
|
||||
|
||||
return examples
|
||||
}
|
||||
|
||||
// parseCount reads the ASCII digits of s as a base-10 number, ignoring every
// other character (such as thousands separators). A string without digits
// yields 0.
func parseCount(s string) int {
	total := 0
	// ASCII digits never occur inside a multi-byte UTF-8 sequence, so a
	// byte-wise scan sees exactly the digits a rune-wise scan would.
	for i := 0; i < len(s); i++ {
		if d := s[i]; '0' <= d && d <= '9' {
			total = total*10 + int(d-'0')
		}
	}
	return total
}
|
||||
|
||||
// isExported reports whether name begins with an ASCII upper-case letter
// (A-Z). The empty string is not exported. Only the first byte is examined,
// so names starting with a non-ASCII letter are reported as unexported.
func isExported(name string) bool {
	if name == "" {
		return false
	}
	first := name[0]
	return 'A' <= first && first <= 'Z'
}
|
||||
|
||||
// whitespaceRunRe matches one or more consecutive whitespace characters. It
// is compiled once at package initialisation rather than on every call.
var whitespaceRunRe = regexp.MustCompile(`\s+`)

// cleanWhitespace collapses every run of whitespace in text into a single
// space and trims leading and trailing whitespace from the result.
func cleanWhitespace(text string) string {
	return strings.TrimSpace(whitespaceRunRe.ReplaceAllString(text, " "))
}
|
||||
@@ -0,0 +1,268 @@
|
||||
package godocs
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
const testPackageHTML = `
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>runtime - pkg.go.dev</title></head>
|
||||
<body>
|
||||
<nav class="go-Breadcrumb">
|
||||
<ol>
|
||||
<li><a href="/">Discover Packages</a></li>
|
||||
<li><a href="/k8s.io/apimachinery">k8s.io/apimachinery</a></li>
|
||||
<li><a href="/k8s.io/apimachinery/pkg">pkg</a></li>
|
||||
<li><a href="/k8s.io/apimachinery/pkg/runtime">runtime</a></li>
|
||||
</ol>
|
||||
</nav>
|
||||
|
||||
<h1 class="UnitHeader-titleHeading">runtime</h1>
|
||||
|
||||
<div class="go-Main-headerDetails">
|
||||
<span data-test-id="UnitHeader-version"><a href="?tab=versions">v0.35.1</a></span>
|
||||
<span data-test-id="UnitHeader-importedby"><a href="?tab=importedby">Imported by: 144,729</a></span>
|
||||
<span data-test-id="UnitHeader-licenses"><a href="?tab=licenses">Apache-2.0</a></span>
|
||||
</div>
|
||||
|
||||
<div class="Documentation">
|
||||
<p>Package runtime defines conversions between generic types and structs to map query strings to struct objects.</p>
|
||||
<p>This is additional documentation text for the package.</p>
|
||||
</div>
|
||||
|
||||
<div class="Documentation-function">
|
||||
<div class="Documentation-functionHeader">func DecodeInto</div>
|
||||
<pre>func DecodeInto(d Decoder, data []byte, into Object) error</pre>
|
||||
<p>DecodeInto is a helper function that decodes the given data into the provided object.</p>
|
||||
</div>
|
||||
|
||||
<div class="Documentation-type">
|
||||
<div class="Documentation-typeHeader">type Codec</div>
|
||||
<pre>type Codec struct {
|
||||
Encoder Encoder
|
||||
Decoder Decoder
|
||||
}</pre>
|
||||
<p>Codec is a struct that holds an encoder and decoder.</p>
|
||||
<div class="Documentation-method">
|
||||
<div class="Documentation-methodHeader">func (*Codec) Encode</div>
|
||||
<pre>func (c *Codec) Encode(obj Object) ([]byte, error)</pre>
|
||||
<p>Encode encodes the given object.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="Documentation-constants">
|
||||
<pre>const (
|
||||
ContentTypeJSON = "application/json"
|
||||
ContentTypeYAML = "application/yaml"
|
||||
)</pre>
|
||||
<p>Content types for different formats.</p>
|
||||
</div>
|
||||
|
||||
<div class="Documentation-variables">
|
||||
<pre>var DefaultScheme = NewScheme()</pre>
|
||||
<p>DefaultScheme is the default scheme used for encoding/decoding.</p>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
`
|
||||
|
||||
const testSearchHTML = `
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
<div class="SearchSnippet">
|
||||
<h2><a href="/k8s.io/apimachinery/pkg/runtime">runtime <span class="SearchSnippet-header-path">(k8s.io/apimachinery/pkg/runtime)</span></a></h2>
|
||||
<p class="SearchSnippet-synopsis">Package runtime defines conversions between generic types and structs.</p>
|
||||
<div class="SearchSnippet-infoLabel">
|
||||
<a href="?tab=importedby">Imported by: <strong>144,729</strong></a>
|
||||
<span>v0.35.1 published on <strong>Dec 4, 2025</strong></span>
|
||||
<a href="?tab=licenses">Apache-2.0</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="SearchSnippet">
|
||||
<h2><a href="/github.com/google/go-querystring/query">query <span class="SearchSnippet-header-path">(github.com/google/go-querystring/query)</span></a></h2>
|
||||
<p class="SearchSnippet-synopsis">Package query implements encoding of structs into URL query parameters.</p>
|
||||
<div class="SearchSnippet-infoLabel">
|
||||
<a href="?tab=importedby">Imported by: <strong>5,111</strong></a>
|
||||
<span>v1.2.0 published on <strong>Nov 10, 2025</strong></span>
|
||||
<a href="?tab=licenses">BSD-3-Clause</a>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
`
|
||||
|
||||
func TestParsePackagePage(t *testing.T) {
|
||||
parser := NewParser()
|
||||
pkg, err := parser.ParsePackagePage(testPackageHTML, "https://pkg.go.dev/k8s.io/apimachinery/pkg/runtime")
|
||||
if err != nil {
|
||||
t.Fatalf("ParsePackagePage failed: %v", err)
|
||||
}
|
||||
|
||||
if pkg.Name != "runtime" {
|
||||
t.Errorf("Expected name 'runtime', got '%s'", pkg.Name)
|
||||
}
|
||||
|
||||
if pkg.ImportPath != "k8s.io/apimachinery/pkg/runtime" {
|
||||
t.Errorf("Expected import path 'k8s.io/apimachinery/pkg/runtime', got '%s'", pkg.ImportPath)
|
||||
}
|
||||
|
||||
if pkg.Version != "v0.35.1" {
|
||||
t.Errorf("Expected version 'v0.35.1', got '%s'", pkg.Version)
|
||||
}
|
||||
|
||||
if pkg.ImportedBy != 144729 {
|
||||
t.Errorf("Expected imported by 144729, got %d", pkg.ImportedBy)
|
||||
}
|
||||
|
||||
if pkg.Synopsis == "" {
|
||||
t.Error("Expected non-empty synopsis")
|
||||
}
|
||||
|
||||
if len(pkg.Functions) == 0 {
|
||||
t.Error("Expected at least one function")
|
||||
}
|
||||
|
||||
if len(pkg.Types) == 0 {
|
||||
t.Error("Expected at least one type")
|
||||
}
|
||||
|
||||
if len(pkg.Constants) == 0 {
|
||||
t.Error("Expected at least one constant")
|
||||
}
|
||||
|
||||
if len(pkg.Variables) == 0 {
|
||||
t.Error("Expected at least one variable")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSearchResults(t *testing.T) {
|
||||
parser := NewParser()
|
||||
results, err := parser.ParseSearchResults(testSearchHTML)
|
||||
if err != nil {
|
||||
t.Fatalf("ParseSearchResults failed: %v", err)
|
||||
}
|
||||
|
||||
if len(results) < 2 {
|
||||
t.Fatalf("Expected at least 2 results, got %d", len(results))
|
||||
}
|
||||
|
||||
first := results[0]
|
||||
if first.Synopsis == "" {
|
||||
t.Error("Expected non-empty synopsis")
|
||||
}
|
||||
|
||||
if first.Path == "" {
|
||||
t.Error("Expected non-empty path")
|
||||
}
|
||||
|
||||
if first.URL == "" {
|
||||
t.Error("Expected non-empty URL")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsExported(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
expected bool
|
||||
}{
|
||||
{"Exported", true},
|
||||
{"unexported", false},
|
||||
{"", false},
|
||||
{"CamelCase", true},
|
||||
{"camelCase", false},
|
||||
{"X", true},
|
||||
{"x", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := isExported(tt.name); got != tt.expected {
|
||||
t.Errorf("isExported(%q) = %v, want %v", tt.name, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCleanWhitespace(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{" hello world ", "hello world"},
|
||||
{"single", "single"},
|
||||
{"multiple spaces here", "multiple spaces here"},
|
||||
{"\n\ttabs\t\n", "tabs"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.input, func(t *testing.T) {
|
||||
if got := cleanWhitespace(tt.input); got != tt.expected {
|
||||
t.Errorf("cleanWhitespace(%q) = %q, want %q", tt.input, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseCount(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
expected int
|
||||
}{
|
||||
{"144729", 144729},
|
||||
{"5,111", 5111},
|
||||
{"0", 0},
|
||||
{"1,234,567", 1234567},
|
||||
{"abc", 0},
|
||||
{"", 0},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.input, func(t *testing.T) {
|
||||
if got := parseCount(tt.input); got != tt.expected {
|
||||
t.Errorf("parseCount(%q) = %d, want %d", tt.input, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractImportPath(t *testing.T) {
|
||||
parser := NewParser()
|
||||
|
||||
tests := []struct {
|
||||
html string
|
||||
url string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
html: `<nav class="go-Breadcrumb"><li><a href="/">Discover</a></li><li><a href="/k8s.io/apimachinery">k8s.io/apimachinery</a></li><li><a href="/k8s.io/apimachinery/pkg">pkg</a></li><li><a href="/k8s.io/apimachinery/pkg/runtime">runtime</a></li></nav>`,
|
||||
url: "https://pkg.go.dev/k8s.io/apimachinery/pkg/runtime",
|
||||
expected: "k8s.io/apimachinery/pkg/runtime",
|
||||
},
|
||||
{
|
||||
html: `<nav class="go-Breadcrumb"><li><a href="/github.com/user/repo">github.com/user/repo</a></li></nav>`,
|
||||
url: "https://pkg.go.dev/github.com/user/repo@v1.0.0",
|
||||
expected: "github.com/user/repo",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.expected, func(t *testing.T) {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(tt.html))
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to parse HTML: %v", err)
|
||||
}
|
||||
|
||||
got := parser.extractImportPath(doc, tt.url)
|
||||
if got != tt.expected {
|
||||
t.Errorf("extractImportPath() = %q, want %q", got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,156 @@
|
||||
// Package godocs provides parsing and extraction for Go package documentation
|
||||
// from pkg.go.dev and similar documentation sites.
|
||||
package godocs
|
||||
|
||||
import "time"
|
||||
|
||||
// Package represents a Go package's documentation.
|
||||
type Package struct {
|
||||
// Import path (e.g., "github.com/user/repo/pkg")
|
||||
ImportPath string `json:"import_path"`
|
||||
|
||||
// Package name (last element of import path)
|
||||
Name string `json:"name"`
|
||||
|
||||
// Synopsis is a short one-line description
|
||||
Synopsis string `json:"synopsis"`
|
||||
|
||||
// Full documentation text
|
||||
Doc string `json:"doc"`
|
||||
|
||||
// Version information
|
||||
Version string `json:"version"`
|
||||
|
||||
// Module information
|
||||
Module *Module `json:"module,omitempty"`
|
||||
|
||||
// License information
|
||||
Licenses []License `json:"licenses,omitempty"`
|
||||
|
||||
// Functions exported by the package
|
||||
Functions []*Function `json:"functions,omitempty"`
|
||||
|
||||
// Types defined in the package
|
||||
Types []*Type `json:"types,omitempty"`
|
||||
|
||||
// Constants defined in the package
|
||||
Constants []*Value `json:"constants,omitempty"`
|
||||
|
||||
// Variables defined in the package
|
||||
Variables []*Value `json:"variables,omitempty"`
|
||||
|
||||
// Examples for the package
|
||||
Examples []*Example `json:"examples,omitempty"`
|
||||
|
||||
// Import count
|
||||
ImportedBy int `json:"imported_by"`
|
||||
|
||||
// Repository URL
|
||||
Repository string `json:"repository,omitempty"`
|
||||
|
||||
// Documentation URL
|
||||
DocURL string `json:"doc_url"`
|
||||
|
||||
// When the documentation was fetched
|
||||
FetchedAt time.Time `json:"fetched_at"`
|
||||
}
|
||||
|
||||
// Module identifies a Go module by path and version.
type Module struct {
	// Path is the module path.
	Path string `json:"path"`
	// Version is the module version.
	Version string `json:"version"`
}
|
||||
|
||||
// License describes one license attached to a package.
type License struct {
	// Name is the license name as displayed.
	Name string `json:"name"`
	// Path is the link associated with the license; omitted from JSON when empty.
	Path string `json:"path,omitempty"`
}
|
||||
|
||||
// Function represents a function declaration.
|
||||
type Function struct {
|
||||
Name string `json:"name"`
|
||||
Doc string `json:"doc,omitempty"`
|
||||
Signature string `json:"signature"`
|
||||
Examples []*Example `json:"examples,omitempty"`
|
||||
}
|
||||
|
||||
// Type represents a type declaration.
|
||||
type Type struct {
|
||||
Name string `json:"name"`
|
||||
Doc string `json:"doc,omitempty"`
|
||||
Kind TypeKind `json:"kind"`
|
||||
Underlying string `json:"underlying,omitempty"` // For type aliases
|
||||
Fields []*Field `json:"fields,omitempty"` // For structs
|
||||
Methods []*Method `json:"methods,omitempty"`
|
||||
Examples []*Example `json:"examples,omitempty"`
|
||||
}
|
||||
|
||||
// TypeKind names the category a type declaration falls into.
type TypeKind string

// Recognised type kinds. The string values are what appears in JSON output.
const (
	TypeKindBasic     TypeKind = "basic"
	TypeKindStruct    TypeKind = "struct"
	TypeKindInterface TypeKind = "interface"
	TypeKindAlias     TypeKind = "alias"
	TypeKindFunc      TypeKind = "func"
)
|
||||
|
||||
// Field describes one field of a struct.
type Field struct {
	// Name is the field name.
	Name string `json:"name"`
	// Type is the field's type as source text.
	Type string `json:"type"`
	// Doc is the field's documentation text.
	Doc string `json:"doc,omitempty"`
	// Tag is the field's struct tag.
	Tag string `json:"tag,omitempty"`
	// Embedded marks an embedded field.
	Embedded bool `json:"embedded,omitempty"`
	// Exported marks a field whose name is exported.
	Exported bool `json:"exported"`
}
|
||||
|
||||
// Method describes a method declared on a type.
type Method struct {
	// Name is the method name.
	Name string `json:"name"`
	// Doc is the method's documentation text.
	Doc string `json:"doc,omitempty"`
	// Signature is the declaration as source text.
	Signature string `json:"signature"`
	// Receiver is the receiver part of the declaration.
	Receiver string `json:"receiver,omitempty"`
}
|
||||
|
||||
// Value describes a constant or variable declaration.
type Value struct {
	// Name is the declared name (the first one, for a group).
	Name string `json:"name"`
	// Doc is the declaration's documentation text.
	Doc string `json:"doc,omitempty"`
	// Type is the declared type as source text.
	Type string `json:"type,omitempty"`
	// Value is the initialiser expression as source text.
	Value string `json:"value,omitempty"`
	// Names lists every declared name, for const groups.
	Names []string `json:"names,omitempty"`
	// IsConst is true for constants and false for variables.
	IsConst bool `json:"is_const"`
}
|
||||
|
||||
// Example describes a code example.
type Example struct {
	// Name is the example's title.
	Name string `json:"name"`
	// Doc is the text accompanying the example.
	Doc string `json:"doc,omitempty"`
	// Code is the example's source code.
	Code string `json:"code"`
	// Output is the example's output.
	Output string `json:"output,omitempty"`
	// PlayURL is a playground link for the example.
	PlayURL string `json:"play_url,omitempty"`
}
|
||||
|
||||
// SearchResult describes one entry of a pkg.go.dev search results page.
type SearchResult struct {
	// Name is the displayed package name.
	Name string `json:"name"`
	// Path is the package's import path.
	Path string `json:"path"`
	// Synopsis is the short description shown with the result.
	Synopsis string `json:"synopsis"`
	// ImportedBy is the number of importers shown with the result.
	ImportedBy int `json:"imported_by"`
	// Version is the version shown with the result.
	Version string `json:"version"`
	// Published is the publication date text.
	Published string `json:"published"`
	// License is the license name shown with the result.
	License string `json:"license"`
	// URL is the absolute link to the package page.
	URL string `json:"url"`
}
|
||||
|
||||
// Symbol describes a single named item (function, type, etc.) of a package.
type Symbol struct {
	// Name is the symbol's name.
	Name string `json:"name"`
	// Kind is one of: function, type, constant, variable.
	Kind string `json:"kind"`
	// Signature is the declaration as source text.
	Signature string `json:"signature,omitempty"`
	// Doc is the symbol's documentation text.
	Doc string `json:"doc,omitempty"`
	// Recv is the receiver, for methods.
	Recv string `json:"recv,omitempty"`
}
|
||||
Reference in New Issue
Block a user