package godocs import ( "net/url" "regexp" "strings" "time" "github.com/PuerkitoBio/goquery" ) // Parser parses pkg.go.dev HTML pages into structured documentation. type Parser struct { baseURL string } // NewParser creates a new parser for pkg.go.dev content. func NewParser() *Parser { return &Parser{ baseURL: "https://pkg.go.dev", } } // ParsePackagePage parses a pkg.go.dev package documentation page. func (p *Parser) ParsePackagePage(html string, docURL string) (*Package, error) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { return nil, err } pkg := &Package{ DocURL: docURL, FetchedAt: time.Now(), } // Extract import path from URL or breadcrumb pkg.ImportPath = p.extractImportPath(doc, docURL) pkg.Name = p.extractPackageName(doc) // Extract synopsis pkg.Synopsis = p.extractSynopsis(doc) // Extract package documentation pkg.Doc = p.extractPackageDoc(doc) // Extract version info pkg.Version = p.extractVersion(doc) // Extract module info pkg.Module = p.extractModule(doc) // Extract licenses pkg.Licenses = p.extractLicenses(doc) // Extract imported by count pkg.ImportedBy = p.extractImportedBy(doc) // Extract repository URL pkg.Repository = p.extractRepository(doc) // Extract functions pkg.Functions = p.extractFunctions(doc) // Extract types pkg.Types = p.extractTypes(doc) // Extract constants pkg.Constants = p.extractConstants(doc) // Extract variables pkg.Variables = p.extractVariables(doc) // Extract examples pkg.Examples = p.extractPackageExamples(doc) return pkg, nil } // ParseSearchResults parses pkg.go.dev search results page. 
func (p *Parser) ParseSearchResults(html string) ([]*SearchResult, error) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { return nil, err } var results []*SearchResult doc.Find(".SearchSnippet").Each(func(i int, s *goquery.Selection) { result := &SearchResult{} // Extract name and path s.Find("h2 a").Each(func(_ int, a *goquery.Selection) { result.Name = strings.TrimSpace(a.Text()) if href, exists := a.Attr("href"); exists { result.URL = p.baseURL + href result.Path = strings.TrimPrefix(href, "/") } }) // Extract path from span pathSpan := s.Find(".SearchSnippet-header-path") if pathSpan.Length() > 0 { result.Path = strings.Trim(pathSpan.Text(), "()") } // Extract synopsis synopsis := s.Find(".SearchSnippet-synopsis") if synopsis.Length() > 0 { result.Synopsis = strings.TrimSpace(synopsis.Text()) } // Extract imported by count infoLabel := s.Find(".SearchSnippet-infoLabel").Text() if strings.Contains(infoLabel, "Imported by") { re := regexp.MustCompile(`Imported by\s+(\d[\d,]*)`) if matches := re.FindStringSubmatch(infoLabel); len(matches) > 1 { countStr := strings.ReplaceAll(matches[1], ",", "") result.ImportedBy = parseCount(countStr) } } // Extract version versionMatch := regexp.MustCompile(`v?\d+\.\d+(?:\.\d+)?`).FindString(infoLabel) result.Version = versionMatch // Extract license license := s.Find("[data-test-id='snippet-license'] a") if license.Length() > 0 { result.License = strings.TrimSpace(license.Text()) } results = append(results, result) }) return results, nil } // extractImportPath extracts the import path from the page. 
func (p *Parser) extractImportPath(doc *goquery.Document, docURL string) string { // Try to extract from breadcrumb var importPath string doc.Find(".go-Breadcrumb li a").Each(func(i int, s *goquery.Selection) { if i > 0 { // Skip "Discover Packages" part := strings.TrimSpace(s.Text()) if part != "" { if importPath != "" { importPath += "/" } importPath += part } } }) if importPath != "" { return importPath } // Fallback: extract from URL if docURL != "" { u, err := url.Parse(docURL) if err == nil { path := strings.TrimPrefix(u.Path, "/") // Remove version suffix like @v1.0.0 if idx := strings.Index(path, "@"); idx > 0 { path = path[:idx] } return path } } return "" } // extractPackageName extracts the package name. func (p *Parser) extractPackageName(doc *goquery.Document) string { // Try UnitHeader-title title := doc.Find(".UnitHeader-titleHeading").Text() title = strings.TrimSpace(title) if title != "" { return title } // Fallback to h1 title = doc.Find("h1").First().Text() return strings.TrimSpace(title) } // extractSynopsis extracts the package synopsis. func (p *Parser) extractSynopsis(doc *goquery.Document) string { // Synopsis is typically in the first paragraph after the package declaration docSection := doc.Find(".Documentation").First() if docSection.Length() > 0 { // Get the first paragraph firstP := docSection.Find("p").First() if firstP.Length() > 0 { synopsis := strings.TrimSpace(firstP.Text()) // Limit to reasonable length if len(synopsis) > 200 { synopsis = synopsis[:197] + "..." } return synopsis } } return "" } // extractPackageDoc extracts the full package documentation. func (p *Parser) extractPackageDoc(doc *goquery.Document) string { var parts []string doc.Find(".Documentation").Each(func(_ int, s *goquery.Selection) { text := s.Text() text = cleanWhitespace(text) if text != "" { parts = append(parts, text) } }) return strings.Join(parts, "\n\n") } // extractVersion extracts the version info. 
func (p *Parser) extractVersion(doc *goquery.Document) string { versionEl := doc.Find("[data-test-id='UnitHeader-version'] a") if versionEl.Length() > 0 { return strings.TrimSpace(versionEl.Text()) } return "" } // extractModule extracts module information. func (p *Parser) extractModule(doc *goquery.Document) *Module { modulePath := "" moduleVersion := "" // Try to extract from version link versionEl := doc.Find("[data-test-id='UnitHeader-version'] a") if versionEl.Length() > 0 { moduleVersion = strings.TrimSpace(versionEl.Text()) } // Extract module path from breadcrumb doc.Find(".go-Breadcrumb li a").Each(func(i int, s *goquery.Selection) { text := strings.TrimSpace(s.Text()) if strings.Contains(text, "/") && i > 0 { modulePath = text } }) if modulePath != "" { return &Module{ Path: modulePath, Version: moduleVersion, } } return nil } // extractLicenses extracts license information. func (p *Parser) extractLicenses(doc *goquery.Document) []License { var licenses []License doc.Find("[data-test-id='UnitHeader-license']").Each(func(_ int, s *goquery.Selection) { name := strings.TrimSpace(s.Text()) if name != "" { license := License{Name: name} if href, exists := s.Attr("href"); exists { license.Path = href } licenses = append(licenses, license) } }) return licenses } // extractImportedBy extracts the import count. func (p *Parser) extractImportedBy(doc *goquery.Document) int { importEl := doc.Find("[data-test-id='UnitHeader-importedby'] a") if importEl.Length() > 0 { text := importEl.Text() // Extract number from "Imported by: 144,729" re := regexp.MustCompile(`[\d,]+`) if match := re.FindString(text); match != "" { match = strings.ReplaceAll(match, ",", "") var count int for _, c := range match { if c >= '0' && c <= '9' { count = count*10 + int(c-'0') } } return count } } return 0 } // extractRepository extracts the repository URL. 
func (p *Parser) extractRepository(doc *goquery.Document) string { repoEl := doc.Find(".UnitMeta-repo a") if repoEl.Length() > 0 { if href, exists := repoEl.Attr("href"); exists { return href } } return "" } // extractFunctions extracts all function declarations. func (p *Parser) extractFunctions(doc *goquery.Document) []*Function { var functions []*Function doc.Find(".Documentation-function").Each(func(_ int, s *goquery.Selection) { fn := &Function{} // Extract name from the function header nameEl := s.Find(".Documentation-functionHeader").First() if nameEl.Length() > 0 { fn.Name = strings.TrimSpace(nameEl.Text()) } // Extract signature from code block sigEl := s.Find("pre").First() if sigEl.Length() > 0 { fn.Signature = strings.TrimSpace(sigEl.Text()) } // Extract documentation docEl := s.Find(".Documentation-functionBody p").First() if docEl.Length() == 0 { docEl = s.Find("p").First() } if docEl.Length() > 0 { fn.Doc = strings.TrimSpace(docEl.Text()) } // Extract examples fn.Examples = p.extractExamples(s) if fn.Name != "" { functions = append(functions, fn) } }) return functions } // extractTypes extracts all type declarations. 
func (p *Parser) extractTypes(doc *goquery.Document) []*Type { var types []*Type doc.Find(".Documentation-type").Each(func(_ int, s *goquery.Selection) { t := &Type{} // Extract name from the type header nameEl := s.Find(".Documentation-typeHeader").First() if nameEl.Length() > 0 { t.Name = strings.TrimSpace(nameEl.Text()) } // Determine kind from signature sigEl := s.Find("pre").First() if sigEl.Length() > 0 { sig := sigEl.Text() t.Underlying = strings.TrimSpace(sig) if strings.Contains(sig, "struct{") { t.Kind = TypeKindStruct t.Fields = p.extractStructFields(sigEl) } else if strings.Contains(sig, "interface{") { t.Kind = TypeKindInterface } else { t.Kind = TypeKindAlias } } // Extract documentation docEl := s.Find("p").First() if docEl.Length() > 0 { t.Doc = strings.TrimSpace(docEl.Text()) } // Extract methods t.Methods = p.extractMethods(s) // Extract examples t.Examples = p.extractExamples(s) if t.Name != "" { types = append(types, t) } }) return types } // extractStructFields extracts struct fields from a type definition. func (p *Parser) extractStructFields(sigEl *goquery.Selection) []*Field { var fields []*Field sigEl.Find("tr, .Documentation-structField").Each(func(_ int, s *goquery.Selection) { text := s.Text() text = strings.TrimSpace(text) if text == "" || strings.HasPrefix(text, "//") { return } // Parse field: Name Type `tag` parts := strings.Fields(text) if len(parts) >= 1 { field := &Field{ Name: parts[0], Exported: isExported(parts[0]), } if len(parts) >= 2 { field.Type = strings.Join(parts[1:], " ") // Remove tag if idx := strings.Index(field.Type, "`"); idx > 0 { field.Tag = field.Type[idx:] field.Type = field.Type[:idx] } } fields = append(fields, field) } }) return fields } // extractMethods extracts methods from a type section. 
func (p *Parser) extractMethods(typeSection *goquery.Selection) []*Method { var methods []*Method typeSection.Find(".Documentation-method, .Documentation-function").Each(func(_ int, s *goquery.Selection) { m := &Method{} // Extract method name nameEl := s.Find(".Documentation-functionHeader, .Documentation-methodHeader").First() if nameEl.Length() > 0 { name := strings.TrimSpace(nameEl.Text()) // Extract receiver if present: (t *Type) Method(...) if strings.HasPrefix(name, "(") { if end := strings.Index(name, ")"); end > 0 { m.Receiver = name[1:end] name = strings.TrimSpace(name[end+1:]) } } m.Name = name } // Extract signature sigEl := s.Find("pre").First() if sigEl.Length() > 0 { m.Signature = strings.TrimSpace(sigEl.Text()) } // Extract documentation docEl := s.Find("p").First() if docEl.Length() > 0 { m.Doc = strings.TrimSpace(docEl.Text()) } if m.Name != "" { methods = append(methods, m) } }) return methods } // extractConstants extracts constant declarations. func (p *Parser) extractConstants(doc *goquery.Document) []*Value { var constants []*Value doc.Find(".Documentation-constants").Each(func(_ int, s *goquery.Selection) { // Extract constant group codeEl := s.Find("pre").First() if codeEl.Length() > 0 { v := &Value{ IsConst: true, } // Parse const declarations text := codeEl.Text() lines := strings.Split(text, "\n") for _, line := range lines { line = strings.TrimSpace(line) if line == "" || strings.HasPrefix(line, "//") { continue } // Simple const: Name = value if strings.Contains(line, "=") { parts := strings.SplitN(line, "=", 2) if len(parts) == 2 { name := strings.TrimSpace(parts[0]) if v.Names == nil { v.Names = []string{} } v.Names = append(v.Names, name) if v.Name == "" { v.Name = name } v.Value = strings.TrimSpace(parts[1]) } } } // Extract documentation docEl := s.Find("p").First() if docEl.Length() > 0 { v.Doc = strings.TrimSpace(docEl.Text()) } if len(v.Names) > 0 { constants = append(constants, v) } } }) return constants } // extractVariables 
extracts variable declarations. func (p *Parser) extractVariables(doc *goquery.Document) []*Value { var variables []*Value doc.Find(".Documentation-variables").Each(func(_ int, s *goquery.Selection) { codeEl := s.Find("pre").First() if codeEl.Length() > 0 { v := &Value{ IsConst: false, } text := codeEl.Text() // Parse var declarations text = strings.TrimPrefix(text, "var ") lines := strings.Split(text, "\n") for _, line := range lines { line = strings.TrimSpace(line) if line == "" { continue } // Parse: Name Type = value if strings.Contains(line, "=") { parts := strings.SplitN(line, "=", 2) if len(parts) == 2 { nameType := strings.TrimSpace(parts[0]) v.Name = strings.Fields(nameType)[0] v.Value = strings.TrimSpace(parts[1]) break } } else { // Just name and type fields := strings.Fields(line) if len(fields) >= 1 { v.Name = fields[0] if len(fields) >= 2 { v.Type = strings.Join(fields[1:], " ") } } } } // Extract documentation docEl := s.Find("p").First() if docEl.Length() > 0 { v.Doc = strings.TrimSpace(docEl.Text()) } if v.Name != "" { variables = append(variables, v) } } }) return variables } // extractExamples extracts examples from a section. 
func (p *Parser) extractExamples(section *goquery.Selection) []*Example { var examples []*Example section.Find(".Documentation-example").Each(func(_ int, s *goquery.Selection) { ex := &Example{} // Extract example name nameEl := s.Find(".Documentation-exampleHeader").First() if nameEl.Length() > 0 { ex.Name = strings.TrimSpace(nameEl.Text()) } // Extract code codeEl := s.Find("pre, code").First() if codeEl.Length() > 0 { ex.Code = strings.TrimSpace(codeEl.Text()) } // Extract output outputEl := s.Find(".Documentation-exampleOutput").First() if outputEl.Length() > 0 { ex.Output = strings.TrimSpace(outputEl.Text()) } // Extract documentation docEl := s.Find("p").First() if docEl.Length() > 0 { ex.Doc = strings.TrimSpace(docEl.Text()) } if ex.Code != "" { examples = append(examples, ex) } }) return examples } // extractPackageExamples extracts package-level examples. func (p *Parser) extractPackageExamples(doc *goquery.Document) []*Example { var examples []*Example doc.Find(".Documentation-example").Each(func(_ int, s *goquery.Selection) { ex := &Example{} // Extract example name nameEl := s.Find(".Documentation-exampleHeader").First() if nameEl.Length() > 0 { ex.Name = strings.TrimSpace(nameEl.Text()) } // Extract code codeEl := s.Find("pre, code").First() if codeEl.Length() > 0 { ex.Code = strings.TrimSpace(codeEl.Text()) } // Extract output outputEl := s.Find(".Documentation-exampleOutput").First() if outputEl.Length() > 0 { ex.Output = strings.TrimSpace(outputEl.Text()) } if ex.Code != "" { examples = append(examples, ex) } }) return examples } // parseCount parses a count string to int. func parseCount(s string) int { var count int for _, c := range s { if c >= '0' && c <= '9' { count = count*10 + int(c-'0') } } return count } // isExported checks if a name is exported (starts with uppercase). func isExported(name string) bool { if len(name) == 0 { return false } return name[0] >= 'A' && name[0] <= 'Z' } // cleanWhitespace normalizes whitespace in text. 
//
// Every run of characters matched by the regexp class \s is replaced by a
// single space, after which leading and trailing whitespace is trimmed.
func cleanWhitespace(text string) string {
	return strings.TrimSpace(whitespaceRunRe.ReplaceAllString(text, " "))
}

// whitespaceRunRe matches one or more consecutive whitespace characters. It is
// compiled once at package scope instead of on every cleanWhitespace call.
var whitespaceRunRe = regexp.MustCompile(`\s+`)