Files
Devour/internal/scraper/godocs.go
Tomas Dvorak 55885a0e8f first commit
2026-02-22 10:42:17 +01:00

424 lines
11 KiB
Go

package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"
"github.com/yourorg/devour/pkg/godocs"
)
// GoDocsScraper fetches a Go package documentation page over HTTP, parses it
// with a godocs.Parser and converts the parsed package and its symbols into
// Document values.
type GoDocsScraper struct {
// config supplies the HTTP client timeout and the User-Agent header value.
config *Config
// parser turns the fetched page HTML into a parsed package description.
parser *godocs.Parser
// client is the HTTP client used for every page fetch.
client *http.Client
}
// NewGoDocsScraper returns a GoDocsScraper that keeps config, owns a fresh
// godocs parser and uses an HTTP client whose timeout is config.Timeout.
//
// config must be non-nil; its Timeout field is read here.
func NewGoDocsScraper(config *Config) *GoDocsScraper {
	scraper := new(GoDocsScraper)
	scraper.config = config
	scraper.parser = godocs.NewParser()
	scraper.client = &http.Client{Timeout: config.Timeout}
	return scraper
}
// Scrape fetches the documentation page at source.URL, parses it into a
// package description and returns one Document for the package itself,
// followed by one per function, per type (each type immediately followed by
// its methods), per constant and per variable.
//
// It returns an error when source is nil, when source.URL is empty, or when
// fetching or parsing the page fails; no partial result is returned.
func (s *GoDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	// A nil source would otherwise surface as a nil-pointer panic below.
	if source == nil {
		return nil, fmt.Errorf("source is required for Go docs scraper")
	}
	if source.URL == "" {
		return nil, fmt.Errorf("URL is required for Go docs scraper")
	}

	html, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", err)
	}

	pkg, err := s.parser.ParsePackagePage(html, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to parse package: %w", err)
	}

	// The final document count is known once the page is parsed, so the
	// result slice is allocated exactly once.
	total := 1 + len(pkg.Functions) + len(pkg.Types) + len(pkg.Constants) + len(pkg.Variables)
	for _, t := range pkg.Types {
		total += len(t.Methods)
	}

	documents := make([]*Document, 0, total)
	documents = append(documents, s.packageToDocument(pkg, source.Name))

	for _, fn := range pkg.Functions {
		documents = append(documents, s.functionToDocument(fn, pkg, source.Name))
	}

	for _, t := range pkg.Types {
		documents = append(documents, s.typeToDocument(t, pkg, source.Name))
		for _, m := range t.Methods {
			documents = append(documents, s.methodToDocument(m, t, pkg, source.Name))
		}
	}

	for _, c := range pkg.Constants {
		documents = append(documents, s.constantToDocument(c, pkg, source.Name))
	}

	for _, v := range pkg.Variables {
		documents = append(documents, s.variableToDocument(v, pkg, source.Name))
	}

	return documents, nil
}
// DetectChanges re-fetches the page at source.URL, hashes it with
// generateHash and reports whether that hash differs from lastHash. The
// freshly computed hash is returned so the caller can store it for the next
// comparison.
//
// Like Scrape, it rejects a nil source or an empty source.URL with an error
// before any request is made. Fetch errors are returned unchanged.
func (s *GoDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	// Validate up front so a misconfigured source yields a clear error
	// rather than a nil-pointer panic or an opaque transport failure.
	if source == nil {
		return false, "", fmt.Errorf("source is required for Go docs scraper")
	}
	if source.URL == "" {
		return false, "", fmt.Errorf("URL is required for Go docs scraper")
	}

	html, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return false, "", err
	}

	hash := s.generateHash(html)
	return hash != lastHash, hash, nil
}
// fetchPage issues a GET request for url, bound to ctx, and returns the
// response body as a string.
//
// The configured User-Agent is sent when it is non-empty. When it is empty
// the header is left unset, so net/http applies its default value instead of
// suppressing the header. Any status other than 200 OK is returned as an
// error naming the status code and the URL. Request construction, transport
// and body-read errors are returned unchanged.
func (s *GoDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}

	// Setting the header to "" tells net/http to omit User-Agent entirely,
	// so only set it when a value is actually configured.
	if ua := s.config.UserAgent; ua != "" {
		req.Header.Set("User-Agent", ua)
	}

	resp, err := s.client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Keep the "HTTP <code>" prefix and add the URL for context.
		return "", fmt.Errorf("HTTP %d fetching %s", resp.StatusCode, url)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}
// generateHash returns the lowercase hexadecimal SHA-256 digest of content.
func (s *GoDocsScraper) generateHash(content string) string {
	hasher := sha256.New()
	hasher.Write([]byte(content)) // hash.Hash writes never return an error
	return hex.EncodeToString(hasher.Sum(nil))
}
// packageToDocument builds the "go-package" Document for pkg.
//
// The content is the overview produced by buildPackageContent. The metadata
// always carries the import path, version, imported-by value, repository and
// doc URL; module path/version are added when pkg.Module is set, and the
// license names when pkg.Licenses is non-empty. The document ID is derived
// from pkg.DocURL and the hash from the content.
func (s *GoDocsScraper) packageToDocument(pkg *godocs.Package, sourceName string) *Document {
	meta := map[string]interface{}{
		"import_path": pkg.ImportPath,
		"version":     pkg.Version,
		"imported_by": pkg.ImportedBy,
		"repository":  pkg.Repository,
		"doc_url":     pkg.DocURL,
	}

	if mod := pkg.Module; mod != nil {
		meta["module_path"] = mod.Path
		meta["module_version"] = mod.Version
	}

	if n := len(pkg.Licenses); n > 0 {
		names := make([]string, 0, n)
		for _, lic := range pkg.Licenses {
			names = append(names, lic.Name)
		}
		meta["licenses"] = names
	}

	body := s.buildPackageContent(pkg)
	return &Document{
		ID:        generateDocID(pkg.DocURL),
		Source:    sourceName,
		Type:      "go-package",
		Title:     fmt.Sprintf("%s - %s", pkg.Name, pkg.ImportPath),
		Content:   body,
		URL:       pkg.DocURL,
		Metadata:  meta,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}
// buildPackageContent renders the Markdown-style overview of pkg: a heading
// with the import path, the synopsis and documentation when present, a
// bullet list of function signatures, a bullet list of types with their
// kind, and count-only headings for constants and variables. Sections are
// separated by a single newline.
func (s *GoDocsScraper) buildPackageContent(pkg *godocs.Package) string {
	var (
		out     strings.Builder
		emitted int
	)
	// emit appends one section, preceded by a newline separator for every
	// section after the first.
	emit := func(section string) {
		if emitted > 0 {
			out.WriteByte('\n')
		}
		out.WriteString(section)
		emitted++
	}

	emit(fmt.Sprintf("# Package %s\n", pkg.ImportPath))

	if pkg.Synopsis != "" {
		emit(pkg.Synopsis)
	}

	if pkg.Doc != "" {
		emit("\n## Documentation\n")
		emit(pkg.Doc)
	}

	if n := len(pkg.Functions); n > 0 {
		emit(fmt.Sprintf("\n## Functions (%d)\n", n))
		for _, fn := range pkg.Functions {
			emit(fmt.Sprintf("- `%s`", fn.Signature))
		}
	}

	if n := len(pkg.Types); n > 0 {
		emit(fmt.Sprintf("\n## Types (%d)\n", n))
		for _, t := range pkg.Types {
			emit(fmt.Sprintf("- `%s` (%s)", t.Name, t.Kind))
		}
	}

	// Constants and variables are summarised by count only.
	if n := len(pkg.Constants); n > 0 {
		emit(fmt.Sprintf("\n## Constants (%d)\n", n))
	}
	if n := len(pkg.Variables); n > 0 {
		emit(fmt.Sprintf("\n## Variables (%d)\n", n))
	}

	return out.String()
}
// functionToDocument builds the "go-function" Document for fn, a function of
// pkg.
//
// The document URL is the package doc URL with "#<function name>" appended;
// the ID is derived from that same string. The function's examples are
// stored in the metadata as a JSON-encoded string under "examples".
func (s *GoDocsScraper) functionToDocument(fn *godocs.Function, pkg *godocs.Package, sourceName string) *Document {
	anchor := fmt.Sprintf("%s#%s", pkg.DocURL, fn.Name)
	body := s.buildFunctionContent(fn, pkg)

	// Best effort: if the examples cannot be encoded, the "examples" entry
	// is stored as an empty string rather than failing the whole document.
	encoded, err := json.Marshal(fn.Examples)
	if err != nil {
		encoded = nil
	}

	return &Document{
		ID:      generateDocID(anchor),
		Source:  sourceName,
		Type:    "go-function",
		Title:   fmt.Sprintf("%s.%s", pkg.Name, fn.Name),
		Content: body,
		URL:     anchor,
		Metadata: map[string]interface{}{
			"import_path": pkg.ImportPath,
			"package":     pkg.Name,
			"symbol":      fn.Name,
			"signature":   fn.Signature,
			"kind":        "function",
			"examples":    string(encoded),
		},
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}
// buildFunctionContent renders fn as Markdown-style text: a
// "<package>.<function>" heading, the signature in a fenced block, the doc
// text when present, and one section per example (name, optional doc, code,
// and output when present). Sections are joined with a single newline.
func (s *GoDocsScraper) buildFunctionContent(fn *godocs.Function, pkg *godocs.Package) string {
	sections := []string{
		fmt.Sprintf("# %s.%s\n", pkg.Name, fn.Name),
		fmt.Sprintf("```\n%s\n```", fn.Signature),
	}

	if doc := fn.Doc; doc != "" {
		sections = append(sections, "\n"+doc)
	}

	for _, example := range fn.Examples {
		sections = append(sections, fmt.Sprintf("\n### Example: %s\n", example.Name))
		if example.Doc != "" {
			sections = append(sections, example.Doc)
		}
		sections = append(sections, fmt.Sprintf("```go\n%s\n```", example.Code))
		if example.Output == "" {
			continue
		}
		sections = append(sections, fmt.Sprintf("Output:\n```\n%s\n```", example.Output))
	}

	return strings.Join(sections, "\n")
}
// typeToDocument builds the "go-type" Document for t, a type declared in
// pkg.
//
// The document URL is the package doc URL with "#<type name>" appended; the
// ID is derived from that same string. The metadata records the type's kind,
// underlying definition and method count, and stores its fields as a
// JSON-encoded string under "fields".
func (s *GoDocsScraper) typeToDocument(t *godocs.Type, pkg *godocs.Package, sourceName string) *Document {
	// Best effort: if the fields cannot be encoded, the "fields" entry is
	// stored as an empty string rather than failing the whole document.
	encodedFields, err := json.Marshal(t.Fields)
	if err != nil {
		encodedFields = nil
	}

	meta := make(map[string]interface{}, 8)
	meta["import_path"] = pkg.ImportPath
	meta["package"] = pkg.Name
	meta["symbol"] = t.Name
	meta["kind"] = "type"
	meta["type_kind"] = t.Kind
	meta["underlying"] = t.Underlying
	meta["method_count"] = len(t.Methods)
	meta["fields"] = string(encodedFields)

	anchor := fmt.Sprintf("%s#%s", pkg.DocURL, t.Name)
	body := s.buildTypeContent(t, pkg)

	return &Document{
		ID:        generateDocID(anchor),
		Source:    sourceName,
		Type:      "go-type",
		Title:     fmt.Sprintf("%s.%s", pkg.Name, t.Name),
		Content:   body,
		URL:       anchor,
		Metadata:  meta,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}
// buildTypeContent renders t as Markdown-style text: a
// "type <package>.<name>" heading, the underlying definition in a fenced
// block, the doc text when present, a bullet list of fields (each with its
// doc when it has one) and a bullet list of method signatures. Sections are
// joined with a single newline.
func (s *GoDocsScraper) buildTypeContent(t *godocs.Type, pkg *godocs.Package) string {
	var sections []string
	// addf formats one section and appends it to the output.
	addf := func(format string, args ...interface{}) {
		sections = append(sections, fmt.Sprintf(format, args...))
	}

	addf("# type %s.%s\n", pkg.Name, t.Name)
	addf("```\n%s\n```", t.Underlying)

	if t.Doc != "" {
		sections = append(sections, "\n"+t.Doc)
	}

	if len(t.Fields) > 0 {
		sections = append(sections, "\n### Fields\n")
		for _, field := range t.Fields {
			if field.Doc == "" {
				addf("- `%s %s`", field.Name, field.Type)
				continue
			}
			addf("- `%s %s` - %s", field.Name, field.Type, field.Doc)
		}
	}

	if count := len(t.Methods); count > 0 {
		addf("\n### Methods (%d)\n", count)
		for _, method := range t.Methods {
			addf("- `%s`", method.Signature)
		}
	}

	return strings.Join(sections, "\n")
}
// methodToDocument builds the "go-method" Document for m, a method of type t
// in pkg.
//
// The document URL is the package doc URL with "#<type>.<method>" appended;
// the ID is derived from that same string. The metadata records the owning
// type, receiver and signature.
func (s *GoDocsScraper) methodToDocument(m *godocs.Method, t *godocs.Type, pkg *godocs.Package, sourceName string) *Document {
	anchor := fmt.Sprintf("%s#%s.%s", pkg.DocURL, t.Name, m.Name)
	body := s.buildMethodContent(m, t, pkg)

	doc := &Document{
		ID:      generateDocID(anchor),
		Source:  sourceName,
		Type:    "go-method",
		Title:   fmt.Sprintf("%s.%s.%s", pkg.Name, t.Name, m.Name),
		Content: body,
		URL:     anchor,
		Metadata: map[string]interface{}{
			"import_path": pkg.ImportPath,
			"package":     pkg.Name,
			"type":        t.Name,
			"symbol":      m.Name,
			"receiver":    m.Receiver,
			"signature":   m.Signature,
			"kind":        "method",
		},
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
	return doc
}
// buildMethodContent renders m as Markdown-style text: a
// "func (<receiver>) <name>" heading, the signature in a fenced block and
// the doc text when present, joined with a single newline. The t and pkg
// arguments are accepted but not used.
func (s *GoDocsScraper) buildMethodContent(m *godocs.Method, t *godocs.Type, pkg *godocs.Package) string {
	heading := fmt.Sprintf("# func (%s) %s\n", m.Receiver, m.Name)
	signature := fmt.Sprintf("```\n%s\n```", m.Signature)

	sections := []string{heading, signature}
	if doc := m.Doc; doc != "" {
		sections = append(sections, "\n"+doc)
	}
	return strings.Join(sections, "\n")
}
// constantToDocument builds the "go-constant" Document for c, a constant
// declaration (single or grouped) of pkg.
//
// The content is a "# Constants" heading, the doc text when present and a
// fenced Go block. When c has more than one name the block is a
// parenthesised const group listing every name; otherwise it is a single
// "const <name>" line, with " = <value>" appended only when a value is
// known. That mirrors variableToDocument and avoids a dangling "=" for
// constants whose value is empty.
//
// The document URL is the package doc URL; the ID is derived from
// "<doc URL>#const-<name>".
func (s *GoDocsScraper) constantToDocument(c *godocs.Value, pkg *godocs.Package, sourceName string) *Document {
	var content strings.Builder
	content.WriteString("# Constants\n\n")

	if c.Doc != "" {
		fmt.Fprintf(&content, "%s\n\n", c.Doc)
	}

	if len(c.Names) > 1 {
		content.WriteString("```go\nconst (\n")
		for _, name := range c.Names {
			fmt.Fprintf(&content, "\t%s\n", name)
		}
		content.WriteString(")\n```")
	} else {
		fmt.Fprintf(&content, "```go\nconst %s", c.Name)
		// Only render the assignment when there is a value to show.
		if c.Value != "" {
			fmt.Fprintf(&content, " = %s", c.Value)
		}
		content.WriteString("\n```")
	}

	body := content.String()
	metadata := map[string]interface{}{
		"import_path": pkg.ImportPath,
		"package":     pkg.Name,
		"names":       c.Names,
		"kind":        "constant",
	}

	return &Document{
		ID:        generateDocID(fmt.Sprintf("%s#const-%s", pkg.DocURL, c.Name)),
		Source:    sourceName,
		Type:      "go-constant",
		Title:     fmt.Sprintf("%s.%s (const)", pkg.Name, c.Name),
		Content:   body,
		URL:       pkg.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}
// variableToDocument builds the "go-variable" Document for v, a package-level
// variable of pkg.
//
// The content is a "# Variables" heading, the doc text when present and a
// fenced Go block holding "var <name>", followed by the type and
// " = <value>" when each is non-empty. The document URL is the package doc
// URL; the ID is derived from "<doc URL>#var-<name>".
func (s *GoDocsScraper) variableToDocument(v *godocs.Value, pkg *godocs.Package, sourceName string) *Document {
	var decl strings.Builder

	decl.WriteString("# Variables\n\n")
	if v.Doc != "" {
		fmt.Fprintf(&decl, "%s\n\n", v.Doc)
	}

	// Assemble the declaration line piece by piece inside the code fence.
	fmt.Fprintf(&decl, "```go\nvar %s", v.Name)
	if v.Type != "" {
		fmt.Fprintf(&decl, " %s", v.Type)
	}
	if v.Value != "" {
		fmt.Fprintf(&decl, " = %s", v.Value)
	}
	decl.WriteString("\n```")

	body := decl.String()
	return &Document{
		ID:      generateDocID(fmt.Sprintf("%s#var-%s", pkg.DocURL, v.Name)),
		Source:  sourceName,
		Type:    "go-variable",
		Title:   fmt.Sprintf("%s.%s (var)", pkg.Name, v.Name),
		Content: body,
		URL:     pkg.DocURL,
		Metadata: map[string]interface{}{
			"import_path": pkg.ImportPath,
			"package":     pkg.Name,
			"name":        v.Name,
			"type":        v.Type,
			"kind":        "variable",
		},
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}