package scraper

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
	"time"

	"github.com/yourorg/devour/pkg/godocs"
)

// GoDocsScraper turns a Go package documentation page into a set of
// Documents: one for the package itself and one for each function, type,
// method, constant and variable reported by the godocs parser.
type GoDocsScraper struct {
	config *Config
	parser *godocs.Parser
	client *http.Client
}

// NewGoDocsScraper builds a scraper whose HTTP client uses config.Timeout.
// config must be non-nil; it is dereferenced here and again on every fetch
// (for the user agent).
func NewGoDocsScraper(config *Config) *GoDocsScraper {
	return &GoDocsScraper{
		config: config,
		parser: godocs.NewParser(),
		client: &http.Client{
			Timeout: config.Timeout,
		},
	}
}

// Scrape fetches source.URL, parses it as a package page and returns the
// resulting documents. The package document comes first, followed by
// functions, then each type immediately followed by its methods, then
// constants, then variables.
//
// It returns an error when source is nil, when source.URL is empty, or when
// fetching or parsing fails (the underlying error is wrapped with %w).
func (s *GoDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	// Guard against a nil source as well as an empty URL so that a bad
	// caller gets an error instead of a nil-pointer panic.
	if source == nil || source.URL == "" {
		return nil, fmt.Errorf("URL is required for Go docs scraper")
	}

	html, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", err)
	}

	pkg, err := s.parser.ParsePackagePage(html, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to parse package: %w", err)
	}

	// Capacity covers every top-level symbol plus the package document.
	// Methods are not counted, so append may still grow the slice.
	documents := make([]*Document, 0,
		1+len(pkg.Functions)+len(pkg.Types)+len(pkg.Constants)+len(pkg.Variables))

	documents = append(documents, s.packageToDocument(pkg, source.Name))

	for _, fn := range pkg.Functions {
		documents = append(documents, s.functionToDocument(fn, pkg, source.Name))
	}

	for _, t := range pkg.Types {
		documents = append(documents, s.typeToDocument(t, pkg, source.Name))
		for _, m := range t.Methods {
			documents = append(documents, s.methodToDocument(m, t, pkg, source.Name))
		}
	}

	for _, c := range pkg.Constants {
		documents = append(documents, s.constantToDocument(c, pkg, source.Name))
	}

	for _, v := range pkg.Variables {
		documents = append(documents, s.variableToDocument(v, pkg, source.Name))
	}

	return documents, nil
}

// DetectChanges fetches source.URL, hashes the raw page and compares the
// hash with lastHash. It returns whether the hash differs, the new hash,
// and any fetch error (returned unwrapped).
//
// Like Scrape, it rejects a nil source or an empty URL up front.
func (s *GoDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil || source.URL == "" {
		return false, "", fmt.Errorf("URL is required for Go docs scraper")
	}

	html, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return false, "", err
	}

	hash := s.generateHash(html)
	return hash != lastHash, hash, nil
}

// fetchPage delegates to the package-level fetchExternalPage helper using
// this scraper's HTTP client and configured user agent.
func (s *GoDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}

// generateHash returns the lowercase hex encoding of the SHA-256 digest of
// content.
func (s *GoDocsScraper) generateHash(content string) string {
	hash := sha256.Sum256([]byte(content))
	return hex.EncodeToString(hash[:])
}

// packageToDocument builds the "go-package" document for pkg. Module and
// license metadata are only added when the parser supplied them.
func (s *GoDocsScraper) packageToDocument(pkg *godocs.Package, sourceName string) *Document {
	content := s.buildPackageContent(pkg)

	metadata := map[string]interface{}{
		"import_path": pkg.ImportPath,
		"version":     pkg.Version,
		"imported_by": pkg.ImportedBy,
		"repository":  pkg.Repository,
		"doc_url":     pkg.DocURL,
	}

	if pkg.Module != nil {
		metadata["module_path"] = pkg.Module.Path
		metadata["module_version"] = pkg.Module.Version
	}

	if len(pkg.Licenses) > 0 {
		licenses := make([]string, 0, len(pkg.Licenses))
		for _, l := range pkg.Licenses {
			licenses = append(licenses, l.Name)
		}
		metadata["licenses"] = licenses
	}

	return &Document{
		ID:        generateDocID(pkg.DocURL),
		Source:    sourceName,
		Type:      "go-package",
		Title:     fmt.Sprintf("%s - %s", pkg.Name, pkg.ImportPath),
		Content:   content,
		URL:       pkg.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(content),
		Timestamp: time.Now(),
	}
}

// buildPackageContent renders a Markdown overview of pkg: heading, synopsis,
// documentation, a bullet list of function signatures and types, and count
// headings for constants and variables (their members are not listed here).
func (s *GoDocsScraper) buildPackageContent(pkg *godocs.Package) string {
	var parts []string

	parts = append(parts, fmt.Sprintf("# Package %s\n", pkg.ImportPath))

	if pkg.Synopsis != "" {
		parts = append(parts, pkg.Synopsis)
	}

	if pkg.Doc != "" {
		parts = append(parts, "\n## Documentation\n")
		parts = append(parts, pkg.Doc)
	}

	if len(pkg.Functions) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(pkg.Functions)))
		for _, fn := range pkg.Functions {
			parts = append(parts, fmt.Sprintf("- `%s`", fn.Signature))
		}
	}

	if len(pkg.Types) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Types (%d)\n", len(pkg.Types)))
		for _, t := range pkg.Types {
			parts = append(parts, fmt.Sprintf("- `%s` (%s)", t.Name, t.Kind))
		}
	}

	if len(pkg.Constants) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Constants (%d)\n", len(pkg.Constants)))
	}

	if len(pkg.Variables) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Variables (%d)\n", len(pkg.Variables)))
	}

	return strings.Join(parts, "\n")
}

// functionToDocument builds the "go-function" document for fn. The document
// URL and ID are derived from pkg.DocURL with "#<name>" appended.
func (s *GoDocsScraper) functionToDocument(fn *godocs.Function, pkg *godocs.Package, sourceName string) *Document {
	content := s.buildFunctionContent(fn, pkg)

	metadata := map[string]interface{}{
		"import_path": pkg.ImportPath,
		"package":     pkg.Name,
		"symbol":      fn.Name,
		"signature":   fn.Signature,
		"kind":        "function",
	}

	// Examples are stored as a JSON string. If encoding fails the key is
	// left out rather than storing an empty string that is not valid JSON;
	// the document itself is still produced (best effort).
	if examplesJSON, err := json.Marshal(fn.Examples); err == nil {
		metadata["examples"] = string(examplesJSON)
	}

	return &Document{
		ID:        generateDocID(fmt.Sprintf("%s#%s", pkg.DocURL, fn.Name)),
		Source:    sourceName,
		Type:      "go-function",
		Title:     fmt.Sprintf("%s.%s", pkg.Name, fn.Name),
		Content:   content,
		URL:       fmt.Sprintf("%s#%s", pkg.DocURL, fn.Name),
		Metadata:  metadata,
		Hash:      s.generateHash(content),
		Timestamp: time.Now(),
	}
}

// buildFunctionContent renders fn as Markdown: a heading, the signature in a
// code fence, the doc text, and one section per example (with its optional
// doc and output).
func (s *GoDocsScraper) buildFunctionContent(fn *godocs.Function, pkg *godocs.Package) string {
	var parts []string

	parts = append(parts, fmt.Sprintf("# %s.%s\n", pkg.Name, fn.Name))
	parts = append(parts, fmt.Sprintf("```\n%s\n```", fn.Signature))

	if fn.Doc != "" {
		parts = append(parts, "\n"+fn.Doc)
	}

	for _, ex := range fn.Examples {
		parts = append(parts, fmt.Sprintf("\n### Example: %s\n", ex.Name))
		if ex.Doc != "" {
			parts = append(parts, ex.Doc)
		}
		parts = append(parts, fmt.Sprintf("```go\n%s\n```", ex.Code))
		if ex.Output != "" {
			parts = append(parts, fmt.Sprintf("Output:\n```\n%s\n```", ex.Output))
		}
	}

	return strings.Join(parts, "\n")
}

// typeToDocument builds the "go-type" document for t. The document URL and
// ID are derived from pkg.DocURL with "#<name>" appended.
func (s *GoDocsScraper) typeToDocument(t *godocs.Type, pkg *godocs.Package, sourceName string) *Document {
	content := s.buildTypeContent(t, pkg)

	metadata := map[string]interface{}{
		"import_path":  pkg.ImportPath,
		"package":      pkg.Name,
		"symbol":       t.Name,
		"kind":         "type",
		"type_kind":    t.Kind,
		"underlying":   t.Underlying,
		"method_count": len(t.Methods),
	}

	// Fields are stored as a JSON string. If encoding fails the key is left
	// out rather than storing an empty string that is not valid JSON.
	if fieldsJSON, err := json.Marshal(t.Fields); err == nil {
		metadata["fields"] = string(fieldsJSON)
	}

	return &Document{
		ID:        generateDocID(fmt.Sprintf("%s#%s", pkg.DocURL, t.Name)),
		Source:    sourceName,
		Type:      "go-type",
		Title:     fmt.Sprintf("%s.%s", pkg.Name, t.Name),
		Content:   content,
		URL:       fmt.Sprintf("%s#%s", pkg.DocURL, t.Name),
		Metadata:  metadata,
		Hash:      s.generateHash(content),
		Timestamp: time.Now(),
	}
}

// buildTypeContent renders t as Markdown: a heading, the underlying
// definition in a code fence, the doc text, a field list and a list of
// method signatures.
func (s *GoDocsScraper) buildTypeContent(t *godocs.Type, pkg *godocs.Package) string {
	var parts []string

	parts = append(parts, fmt.Sprintf("# type %s.%s\n", pkg.Name, t.Name))
	parts = append(parts, fmt.Sprintf("```\n%s\n```", t.Underlying))

	if t.Doc != "" {
		parts = append(parts, "\n"+t.Doc)
	}

	if len(t.Fields) > 0 {
		parts = append(parts, "\n### Fields\n")
		for _, f := range t.Fields {
			if f.Doc != "" {
				parts = append(parts, fmt.Sprintf("- `%s %s` - %s", f.Name, f.Type, f.Doc))
			} else {
				parts = append(parts, fmt.Sprintf("- `%s %s`", f.Name, f.Type))
			}
		}
	}

	if len(t.Methods) > 0 {
		parts = append(parts, fmt.Sprintf("\n### Methods (%d)\n", len(t.Methods)))
		for _, m := range t.Methods {
			parts = append(parts, fmt.Sprintf("- `%s`", m.Signature))
		}
	}

	return strings.Join(parts, "\n")
}

// methodToDocument builds the "go-method" document for method m of type t.
// The document URL and ID are derived from pkg.DocURL with
// "#<type>.<method>" appended.
func (s *GoDocsScraper) methodToDocument(m *godocs.Method, t *godocs.Type, pkg *godocs.Package, sourceName string) *Document {
	content := s.buildMethodContent(m, t, pkg)

	metadata := map[string]interface{}{
		"import_path": pkg.ImportPath,
		"package":     pkg.Name,
		"type":        t.Name,
		"symbol":      m.Name,
		"receiver":    m.Receiver,
		"signature":   m.Signature,
		"kind":        "method",
	}

	return &Document{
		ID:        generateDocID(fmt.Sprintf("%s#%s.%s", pkg.DocURL, t.Name, m.Name)),
		Source:    sourceName,
		Type:      "go-method",
		Title:     fmt.Sprintf("%s.%s.%s", pkg.Name, t.Name, m.Name),
		Content:   content,
		URL:       fmt.Sprintf("%s#%s.%s", pkg.DocURL, t.Name, m.Name),
		Metadata:  metadata,
		Hash:      s.generateHash(content),
		Timestamp: time.Now(),
	}
}

// buildMethodContent renders m as Markdown: a heading with receiver and
// name, the signature in a code fence, and the doc text. The t and pkg
// parameters are currently unused.
func (s *GoDocsScraper) buildMethodContent(m *godocs.Method, t *godocs.Type, pkg *godocs.Package) string {
	var parts []string

	parts = append(parts, fmt.Sprintf("# func (%s) %s\n", m.Receiver, m.Name))
	parts = append(parts, fmt.Sprintf("```\n%s\n```", m.Signature))

	if m.Doc != "" {
		parts = append(parts, "\n"+m.Doc)
	}

	return strings.Join(parts, "\n")
}

// constantToDocument builds the "go-constant" document for c. When c carries
// more than one name the names are rendered as a const ( ... ) group;
// otherwise a single const declaration is rendered. The document URL is the
// package URL (no anchor).
func (s *GoDocsScraper) constantToDocument(c *godocs.Value, pkg *godocs.Package, sourceName string) *Document {
	var content strings.Builder

	fmt.Fprintf(&content, "# Constants\n\n")

	if c.Doc != "" {
		fmt.Fprintf(&content, "%s\n\n", c.Doc)
	}

	if len(c.Names) > 1 {
		fmt.Fprintf(&content, "```go\nconst (\n")
		for _, name := range c.Names {
			fmt.Fprintf(&content, "\t%s\n", name)
		}
		fmt.Fprintf(&content, ")\n```")
	} else {
		fmt.Fprintf(&content, "```go\nconst %s", c.Name)
		// Skip the assignment when no value was parsed so the snippet does
		// not end in a dangling "= ", matching variableToDocument.
		if c.Value != "" {
			fmt.Fprintf(&content, " = %s", c.Value)
		}
		fmt.Fprintf(&content, "\n```")
	}

	metadata := map[string]interface{}{
		"import_path": pkg.ImportPath,
		"package":     pkg.Name,
		"names":       c.Names,
		"kind":        "constant",
	}

	body := content.String()

	return &Document{
		ID:        generateDocID(fmt.Sprintf("%s#const-%s", pkg.DocURL, c.Name)),
		Source:    sourceName,
		Type:      "go-constant",
		Title:     fmt.Sprintf("%s.%s (const)", pkg.Name, c.Name),
		Content:   body,
		URL:       pkg.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}

// variableToDocument builds the "go-variable" document for v, rendering a
// var declaration whose type and value are included only when non-empty.
// The document URL is the package URL (no anchor).
func (s *GoDocsScraper) variableToDocument(v *godocs.Value, pkg *godocs.Package, sourceName string) *Document {
	var content strings.Builder

	fmt.Fprintf(&content, "# Variables\n\n")

	if v.Doc != "" {
		fmt.Fprintf(&content, "%s\n\n", v.Doc)
	}

	fmt.Fprintf(&content, "```go\nvar %s", v.Name)
	if v.Type != "" {
		fmt.Fprintf(&content, " %s", v.Type)
	}
	if v.Value != "" {
		fmt.Fprintf(&content, " = %s", v.Value)
	}
	fmt.Fprintf(&content, "\n```")

	metadata := map[string]interface{}{
		"import_path": pkg.ImportPath,
		"package":     pkg.Name,
		"name":        v.Name,
		"type":        v.Type,
		"kind":        "variable",
	}

	body := content.String()

	return &Document{
		ID:        generateDocID(fmt.Sprintf("%s#var-%s", pkg.DocURL, v.Name)),
		Source:    sourceName,
		Type:      "go-variable",
		Title:     fmt.Sprintf("%s.%s (var)", pkg.Name, v.Name),
		Content:   body,
		URL:       pkg.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}