mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
424 lines
11 KiB
Go
424 lines
11 KiB
Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/yourorg/devour/pkg/godocs"
|
|
)
|
|
|
|
type GoDocsScraper struct {
|
|
config *Config
|
|
parser *godocs.Parser
|
|
client *http.Client
|
|
}
|
|
|
|
func NewGoDocsScraper(config *Config) *GoDocsScraper {
|
|
return &GoDocsScraper{
|
|
config: config,
|
|
parser: godocs.NewParser(),
|
|
client: &http.Client{
|
|
Timeout: config.Timeout,
|
|
},
|
|
}
|
|
}
|
|
|
|
func (s *GoDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
|
var documents []*Document
|
|
|
|
if source.URL == "" {
|
|
return nil, fmt.Errorf("URL is required for Go docs scraper")
|
|
}
|
|
|
|
html, err := s.fetchPage(ctx, source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
|
}
|
|
|
|
pkg, err := s.parser.ParsePackagePage(html, source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse package: %w", err)
|
|
}
|
|
|
|
mainDoc := s.packageToDocument(pkg, source.Name)
|
|
documents = append(documents, mainDoc)
|
|
|
|
for _, fn := range pkg.Functions {
|
|
doc := s.functionToDocument(fn, pkg, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, t := range pkg.Types {
|
|
doc := s.typeToDocument(t, pkg, source.Name)
|
|
documents = append(documents, doc)
|
|
|
|
for _, m := range t.Methods {
|
|
methodDoc := s.methodToDocument(m, t, pkg, source.Name)
|
|
documents = append(documents, methodDoc)
|
|
}
|
|
}
|
|
|
|
for _, c := range pkg.Constants {
|
|
doc := s.constantToDocument(c, pkg, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, v := range pkg.Variables {
|
|
doc := s.variableToDocument(v, pkg, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
return documents, nil
|
|
}
|
|
|
|
func (s *GoDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
|
html, err := s.fetchPage(ctx, source.URL)
|
|
if err != nil {
|
|
return false, "", err
|
|
}
|
|
|
|
hash := s.generateHash(html)
|
|
changed := hash != lastHash
|
|
|
|
return changed, hash, nil
|
|
}
|
|
|
|
func (s *GoDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
|
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
req.Header.Set("User-Agent", s.config.UserAgent)
|
|
|
|
resp, err := s.client.Do(req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return string(body), nil
|
|
}
|
|
|
|
func (s *GoDocsScraper) generateHash(content string) string {
|
|
hash := sha256.Sum256([]byte(content))
|
|
return hex.EncodeToString(hash[:])
|
|
}
|
|
|
|
func (s *GoDocsScraper) packageToDocument(pkg *godocs.Package, sourceName string) *Document {
|
|
content := s.buildPackageContent(pkg)
|
|
|
|
metadata := map[string]interface{}{
|
|
"import_path": pkg.ImportPath,
|
|
"version": pkg.Version,
|
|
"imported_by": pkg.ImportedBy,
|
|
"repository": pkg.Repository,
|
|
"doc_url": pkg.DocURL,
|
|
}
|
|
|
|
if pkg.Module != nil {
|
|
metadata["module_path"] = pkg.Module.Path
|
|
metadata["module_version"] = pkg.Module.Version
|
|
}
|
|
|
|
if len(pkg.Licenses) > 0 {
|
|
var licenses []string
|
|
for _, l := range pkg.Licenses {
|
|
licenses = append(licenses, l.Name)
|
|
}
|
|
metadata["licenses"] = licenses
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(pkg.DocURL),
|
|
Source: sourceName,
|
|
Type: "go-package",
|
|
Title: fmt.Sprintf("%s - %s", pkg.Name, pkg.ImportPath),
|
|
Content: content,
|
|
URL: pkg.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *GoDocsScraper) buildPackageContent(pkg *godocs.Package) string {
|
|
var parts []string
|
|
|
|
parts = append(parts, fmt.Sprintf("# Package %s\n", pkg.ImportPath))
|
|
|
|
if pkg.Synopsis != "" {
|
|
parts = append(parts, pkg.Synopsis)
|
|
}
|
|
|
|
if pkg.Doc != "" {
|
|
parts = append(parts, "\n## Documentation\n")
|
|
parts = append(parts, pkg.Doc)
|
|
}
|
|
|
|
if len(pkg.Functions) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(pkg.Functions)))
|
|
for _, fn := range pkg.Functions {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", fn.Signature))
|
|
}
|
|
}
|
|
|
|
if len(pkg.Types) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n## Types (%d)\n", len(pkg.Types)))
|
|
for _, t := range pkg.Types {
|
|
parts = append(parts, fmt.Sprintf("- `%s` (%s)", t.Name, t.Kind))
|
|
}
|
|
}
|
|
|
|
if len(pkg.Constants) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n## Constants (%d)\n", len(pkg.Constants)))
|
|
}
|
|
|
|
if len(pkg.Variables) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n## Variables (%d)\n", len(pkg.Variables)))
|
|
}
|
|
|
|
return strings.Join(parts, "\n")
|
|
}
|
|
|
|
func (s *GoDocsScraper) functionToDocument(fn *godocs.Function, pkg *godocs.Package, sourceName string) *Document {
|
|
content := s.buildFunctionContent(fn, pkg)
|
|
|
|
metadata := map[string]interface{}{
|
|
"import_path": pkg.ImportPath,
|
|
"package": pkg.Name,
|
|
"symbol": fn.Name,
|
|
"signature": fn.Signature,
|
|
"kind": "function",
|
|
}
|
|
|
|
examplesJSON, _ := json.Marshal(fn.Examples)
|
|
metadata["examples"] = string(examplesJSON)
|
|
|
|
return &Document{
|
|
ID: generateDocID(fmt.Sprintf("%s#%s", pkg.DocURL, fn.Name)),
|
|
Source: sourceName,
|
|
Type: "go-function",
|
|
Title: fmt.Sprintf("%s.%s", pkg.Name, fn.Name),
|
|
Content: content,
|
|
URL: fmt.Sprintf("%s#%s", pkg.DocURL, fn.Name),
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *GoDocsScraper) buildFunctionContent(fn *godocs.Function, pkg *godocs.Package) string {
|
|
var parts []string
|
|
|
|
parts = append(parts, fmt.Sprintf("# %s.%s\n", pkg.Name, fn.Name))
|
|
parts = append(parts, fmt.Sprintf("```\n%s\n```", fn.Signature))
|
|
|
|
if fn.Doc != "" {
|
|
parts = append(parts, "\n"+fn.Doc)
|
|
}
|
|
|
|
for _, ex := range fn.Examples {
|
|
parts = append(parts, fmt.Sprintf("\n### Example: %s\n", ex.Name))
|
|
if ex.Doc != "" {
|
|
parts = append(parts, ex.Doc)
|
|
}
|
|
parts = append(parts, fmt.Sprintf("```go\n%s\n```", ex.Code))
|
|
if ex.Output != "" {
|
|
parts = append(parts, fmt.Sprintf("Output:\n```\n%s\n```", ex.Output))
|
|
}
|
|
}
|
|
|
|
return strings.Join(parts, "\n")
|
|
}
|
|
|
|
func (s *GoDocsScraper) typeToDocument(t *godocs.Type, pkg *godocs.Package, sourceName string) *Document {
|
|
content := s.buildTypeContent(t, pkg)
|
|
|
|
metadata := map[string]interface{}{
|
|
"import_path": pkg.ImportPath,
|
|
"package": pkg.Name,
|
|
"symbol": t.Name,
|
|
"kind": "type",
|
|
"type_kind": t.Kind,
|
|
"underlying": t.Underlying,
|
|
"method_count": len(t.Methods),
|
|
}
|
|
|
|
fieldsJSON, _ := json.Marshal(t.Fields)
|
|
metadata["fields"] = string(fieldsJSON)
|
|
|
|
return &Document{
|
|
ID: generateDocID(fmt.Sprintf("%s#%s", pkg.DocURL, t.Name)),
|
|
Source: sourceName,
|
|
Type: "go-type",
|
|
Title: fmt.Sprintf("%s.%s", pkg.Name, t.Name),
|
|
Content: content,
|
|
URL: fmt.Sprintf("%s#%s", pkg.DocURL, t.Name),
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *GoDocsScraper) buildTypeContent(t *godocs.Type, pkg *godocs.Package) string {
|
|
var parts []string
|
|
|
|
parts = append(parts, fmt.Sprintf("# type %s.%s\n", pkg.Name, t.Name))
|
|
parts = append(parts, fmt.Sprintf("```\n%s\n```", t.Underlying))
|
|
|
|
if t.Doc != "" {
|
|
parts = append(parts, "\n"+t.Doc)
|
|
}
|
|
|
|
if len(t.Fields) > 0 {
|
|
parts = append(parts, "\n### Fields\n")
|
|
for _, f := range t.Fields {
|
|
if f.Doc != "" {
|
|
parts = append(parts, fmt.Sprintf("- `%s %s` - %s", f.Name, f.Type, f.Doc))
|
|
} else {
|
|
parts = append(parts, fmt.Sprintf("- `%s %s`", f.Name, f.Type))
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(t.Methods) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n### Methods (%d)\n", len(t.Methods)))
|
|
for _, m := range t.Methods {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", m.Signature))
|
|
}
|
|
}
|
|
|
|
return strings.Join(parts, "\n")
|
|
}
|
|
|
|
func (s *GoDocsScraper) methodToDocument(m *godocs.Method, t *godocs.Type, pkg *godocs.Package, sourceName string) *Document {
|
|
content := s.buildMethodContent(m, t, pkg)
|
|
|
|
metadata := map[string]interface{}{
|
|
"import_path": pkg.ImportPath,
|
|
"package": pkg.Name,
|
|
"type": t.Name,
|
|
"symbol": m.Name,
|
|
"receiver": m.Receiver,
|
|
"signature": m.Signature,
|
|
"kind": "method",
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(fmt.Sprintf("%s#%s.%s", pkg.DocURL, t.Name, m.Name)),
|
|
Source: sourceName,
|
|
Type: "go-method",
|
|
Title: fmt.Sprintf("%s.%s.%s", pkg.Name, t.Name, m.Name),
|
|
Content: content,
|
|
URL: fmt.Sprintf("%s#%s.%s", pkg.DocURL, t.Name, m.Name),
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *GoDocsScraper) buildMethodContent(m *godocs.Method, t *godocs.Type, pkg *godocs.Package) string {
|
|
var parts []string
|
|
|
|
parts = append(parts, fmt.Sprintf("# func (%s) %s\n", m.Receiver, m.Name))
|
|
parts = append(parts, fmt.Sprintf("```\n%s\n```", m.Signature))
|
|
|
|
if m.Doc != "" {
|
|
parts = append(parts, "\n"+m.Doc)
|
|
}
|
|
|
|
return strings.Join(parts, "\n")
|
|
}
|
|
|
|
func (s *GoDocsScraper) constantToDocument(c *godocs.Value, pkg *godocs.Package, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# Constants\n\n")
|
|
|
|
if c.Doc != "" {
|
|
fmt.Fprintf(&content, "%s\n\n", c.Doc)
|
|
}
|
|
|
|
if len(c.Names) > 1 {
|
|
fmt.Fprintf(&content, "```go\nconst (\n")
|
|
for _, name := range c.Names {
|
|
fmt.Fprintf(&content, "\t%s\n", name)
|
|
}
|
|
fmt.Fprintf(&content, ")\n```")
|
|
} else {
|
|
fmt.Fprintf(&content, "```go\nconst %s = %s\n```", c.Name, c.Value)
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"import_path": pkg.ImportPath,
|
|
"package": pkg.Name,
|
|
"names": c.Names,
|
|
"kind": "constant",
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(fmt.Sprintf("%s#const-%s", pkg.DocURL, c.Name)),
|
|
Source: sourceName,
|
|
Type: "go-constant",
|
|
Title: fmt.Sprintf("%s.%s (const)", pkg.Name, c.Name),
|
|
Content: content.String(),
|
|
URL: pkg.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *GoDocsScraper) variableToDocument(v *godocs.Value, pkg *godocs.Package, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# Variables\n\n")
|
|
|
|
if v.Doc != "" {
|
|
fmt.Fprintf(&content, "%s\n\n", v.Doc)
|
|
}
|
|
|
|
fmt.Fprintf(&content, "```go\nvar %s", v.Name)
|
|
if v.Type != "" {
|
|
fmt.Fprintf(&content, " %s", v.Type)
|
|
}
|
|
if v.Value != "" {
|
|
fmt.Fprintf(&content, " = %s", v.Value)
|
|
}
|
|
fmt.Fprintf(&content, "\n```")
|
|
|
|
metadata := map[string]interface{}{
|
|
"import_path": pkg.ImportPath,
|
|
"package": pkg.Name,
|
|
"name": v.Name,
|
|
"type": v.Type,
|
|
"kind": "variable",
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(fmt.Sprintf("%s#var-%s", pkg.DocURL, v.Name)),
|
|
Source: sourceName,
|
|
Type: "go-variable",
|
|
Title: fmt.Sprintf("%s.%s (var)", pkg.Name, v.Name),
|
|
Content: content.String(),
|
|
URL: pkg.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|