mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
464 lines
12 KiB
Go
464 lines
12 KiB
Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/yourorg/devour/pkg/pythondocs"
|
|
)
|
|
|
|
type PythonDocsScraper struct {
|
|
config *Config
|
|
parser *pythondocs.Parser
|
|
client *http.Client
|
|
}
|
|
|
|
func NewPythonDocsScraper(config *Config) *PythonDocsScraper {
|
|
return &PythonDocsScraper{
|
|
config: config,
|
|
parser: pythondocs.NewParser(),
|
|
client: &http.Client{
|
|
Timeout: config.Timeout,
|
|
},
|
|
}
|
|
}
|
|
|
|
func (s *PythonDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
|
var documents []*Document
|
|
|
|
if source.URL == "" {
|
|
return nil, fmt.Errorf("URL is required for Python docs scraper")
|
|
}
|
|
|
|
html, err := s.fetchPage(ctx, source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
|
}
|
|
|
|
module, err := s.parser.ParseModulePage(html, source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse module: %w", err)
|
|
}
|
|
|
|
mainDoc := s.moduleToDocument(module, source.Name)
|
|
documents = append(documents, mainDoc)
|
|
|
|
for _, class := range module.Classes {
|
|
doc := s.classToDocument(class, module, source.Name)
|
|
documents = append(documents, doc)
|
|
|
|
for _, method := range class.Methods {
|
|
methodDoc := s.methodToDocument(method, class, module, source.Name)
|
|
documents = append(documents, methodDoc)
|
|
}
|
|
|
|
for _, method := range class.ClassMethods {
|
|
methodDoc := s.classMethodToDocument(method, class, module, source.Name)
|
|
documents = append(documents, methodDoc)
|
|
}
|
|
|
|
for _, attr := range class.Attributes {
|
|
attrDoc := s.attributeToDocument(attr, class, module, source.Name)
|
|
documents = append(documents, attrDoc)
|
|
}
|
|
}
|
|
|
|
for _, fn := range module.Functions {
|
|
doc := s.functionToDocument(fn, module, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, exc := range module.Exceptions {
|
|
doc := s.exceptionToDocument(exc, module, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, data := range module.Constants {
|
|
doc := s.dataToDocument(data, module, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
return documents, nil
|
|
}
|
|
|
|
func (s *PythonDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
|
html, err := s.fetchPage(ctx, source.URL)
|
|
if err != nil {
|
|
return false, "", err
|
|
}
|
|
|
|
hash := s.generateHash(html)
|
|
changed := hash != lastHash
|
|
|
|
return changed, hash, nil
|
|
}
|
|
|
|
func (s *PythonDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
|
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
req.Header.Set("User-Agent", s.config.UserAgent)
|
|
|
|
resp, err := s.client.Do(req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return string(body), nil
|
|
}
|
|
|
|
func (s *PythonDocsScraper) generateHash(content string) string {
|
|
hash := sha256.Sum256([]byte(content))
|
|
return hex.EncodeToString(hash[:])
|
|
}
|
|
|
|
func (s *PythonDocsScraper) moduleToDocument(module *pythondocs.Module, sourceName string) *Document {
|
|
content := s.buildModuleContent(module)
|
|
|
|
metadata := map[string]interface{}{
|
|
"name": module.Name,
|
|
"path": module.Path,
|
|
"version": module.Version,
|
|
"doc_url": module.DocURL,
|
|
"class_count": len(module.Classes),
|
|
"function_count": len(module.Functions),
|
|
"exception_count": len(module.Exceptions),
|
|
"data_count": len(module.Constants),
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(module.DocURL),
|
|
Source: sourceName,
|
|
Type: "python-module",
|
|
Title: fmt.Sprintf("%s - Python", module.Name),
|
|
Content: content,
|
|
URL: module.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *PythonDocsScraper) buildModuleContent(module *pythondocs.Module) string {
|
|
var parts []string
|
|
|
|
parts = append(parts, fmt.Sprintf("# Module %s\n", module.Name))
|
|
|
|
if module.Synopsis != "" {
|
|
parts = append(parts, module.Synopsis)
|
|
}
|
|
|
|
if module.Doc != "" {
|
|
parts = append(parts, "\n"+module.Doc)
|
|
}
|
|
|
|
if len(module.Classes) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n## Classes (%d)\n", len(module.Classes)))
|
|
for _, class := range module.Classes {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", class.Name))
|
|
}
|
|
}
|
|
|
|
if len(module.Functions) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(module.Functions)))
|
|
for _, fn := range module.Functions {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", fn.Name))
|
|
}
|
|
}
|
|
|
|
if len(module.Exceptions) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n## Exceptions (%d)\n", len(module.Exceptions)))
|
|
for _, exc := range module.Exceptions {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", exc.Name))
|
|
}
|
|
}
|
|
|
|
return strings.Join(parts, "\n")
|
|
}
|
|
|
|
func (s *PythonDocsScraper) classToDocument(class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
|
|
content := s.buildClassContent(class, module)
|
|
|
|
metadata := map[string]interface{}{
|
|
"module": module.Name,
|
|
"class": class.Name,
|
|
"qual_name": class.QualName,
|
|
"bases": class.Bases,
|
|
"method_count": len(class.Methods),
|
|
"attribute_count": len(class.Attributes),
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(class.DocURL),
|
|
Source: sourceName,
|
|
Type: "python-class",
|
|
Title: fmt.Sprintf("%s.%s - Python", module.Name, class.Name),
|
|
Content: content,
|
|
URL: class.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *PythonDocsScraper) buildClassContent(class *pythondocs.Class, module *pythondocs.Module) string {
|
|
var parts []string
|
|
|
|
parts = append(parts, fmt.Sprintf("# class %s.%s\n", module.Name, class.Name))
|
|
|
|
if class.Signature != "" {
|
|
parts = append(parts, fmt.Sprintf("```python\n%s\n```", class.Signature))
|
|
}
|
|
|
|
if class.Doc != "" {
|
|
parts = append(parts, "\n"+class.Doc)
|
|
}
|
|
|
|
if len(class.Bases) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n**Bases:** %s\n", strings.Join(class.Bases, ", ")))
|
|
}
|
|
|
|
if len(class.Methods) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n### Methods (%d)\n", len(class.Methods)))
|
|
for _, m := range class.Methods {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
|
}
|
|
}
|
|
|
|
if len(class.ClassMethods) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n### Class Methods (%d)\n", len(class.ClassMethods)))
|
|
for _, m := range class.ClassMethods {
|
|
parts = append(parts, fmt.Sprintf("- `%s` (classmethod)", m.Name))
|
|
}
|
|
}
|
|
|
|
if len(class.Attributes) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n### Attributes (%d)\n", len(class.Attributes)))
|
|
for _, a := range class.Attributes {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", a.Name))
|
|
}
|
|
}
|
|
|
|
return strings.Join(parts, "\n")
|
|
}
|
|
|
|
func (s *PythonDocsScraper) methodToDocument(method *pythondocs.Method, class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s.%s.%s\n\n", module.Name, class.Name, method.Name)
|
|
|
|
if method.Signature != "" {
|
|
fmt.Fprintf(&content, "```python\n%s\n```\n", method.Signature)
|
|
}
|
|
|
|
if method.Doc != "" {
|
|
fmt.Fprintf(&content, "%s\n", method.Doc)
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"module": module.Name,
|
|
"class": class.Name,
|
|
"method": method.Name,
|
|
"qual_name": method.QualName,
|
|
"is_static": method.IsStatic,
|
|
"is_async": method.IsAsync,
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(method.DocURL),
|
|
Source: sourceName,
|
|
Type: "python-method",
|
|
Title: fmt.Sprintf("%s.%s.%s - Python", module.Name, class.Name, method.Name),
|
|
Content: content.String(),
|
|
URL: method.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *PythonDocsScraper) classMethodToDocument(method *pythondocs.Method, class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
|
|
content := s.buildMethodContent(method, class, module)
|
|
|
|
metadata := map[string]interface{}{
|
|
"module": module.Name,
|
|
"class": class.Name,
|
|
"method": method.Name,
|
|
"qual_name": method.QualName,
|
|
"is_classmethod": true,
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(method.DocURL),
|
|
Source: sourceName,
|
|
Type: "python-classmethod",
|
|
Title: fmt.Sprintf("%s.%s.%s (classmethod) - Python", module.Name, class.Name, method.Name),
|
|
Content: content,
|
|
URL: method.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *PythonDocsScraper) buildMethodContent(method *pythondocs.Method, class *pythondocs.Class, module *pythondocs.Module) string {
|
|
var parts []string
|
|
|
|
parts = append(parts, fmt.Sprintf("# %s.%s.%s\n", module.Name, class.Name, method.Name))
|
|
|
|
if method.Signature != "" {
|
|
parts = append(parts, fmt.Sprintf("```python\n%s\n```", method.Signature))
|
|
}
|
|
|
|
if method.Doc != "" {
|
|
parts = append(parts, "\n"+method.Doc)
|
|
}
|
|
|
|
return strings.Join(parts, "\n")
|
|
}
|
|
|
|
func (s *PythonDocsScraper) attributeToDocument(attr *pythondocs.Attribute, class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s.%s.%s\n\n", module.Name, class.Name, attr.Name)
|
|
|
|
if attr.Doc != "" {
|
|
fmt.Fprintf(&content, "%s\n", attr.Doc)
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"module": module.Name,
|
|
"class": class.Name,
|
|
"attr": attr.Name,
|
|
"type": attr.Type,
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(attr.DocURL),
|
|
Source: sourceName,
|
|
Type: "python-attribute",
|
|
Title: fmt.Sprintf("%s.%s.%s - Python", module.Name, class.Name, attr.Name),
|
|
Content: content.String(),
|
|
URL: attr.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *PythonDocsScraper) functionToDocument(fn *pythondocs.Function, module *pythondocs.Module, sourceName string) *Document {
|
|
content := s.buildFunctionContent(fn, module)
|
|
|
|
metadata := map[string]interface{}{
|
|
"module": module.Name,
|
|
"function": fn.Name,
|
|
"qual_name": fn.QualName,
|
|
"signature": fn.Signature,
|
|
"is_async": fn.IsAsync,
|
|
"is_generator": fn.IsGenerator,
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(fn.DocURL),
|
|
Source: sourceName,
|
|
Type: "python-function",
|
|
Title: fmt.Sprintf("%s.%s - Python", module.Name, fn.Name),
|
|
Content: content,
|
|
URL: fn.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *PythonDocsScraper) buildFunctionContent(fn *pythondocs.Function, module *pythondocs.Module) string {
|
|
var parts []string
|
|
|
|
parts = append(parts, fmt.Sprintf("# %s.%s\n", module.Name, fn.Name))
|
|
|
|
if fn.Signature != "" {
|
|
parts = append(parts, fmt.Sprintf("```python\n%s\n```", fn.Signature))
|
|
}
|
|
|
|
if fn.Doc != "" {
|
|
parts = append(parts, "\n"+fn.Doc)
|
|
}
|
|
|
|
return strings.Join(parts, "\n")
|
|
}
|
|
|
|
func (s *PythonDocsScraper) exceptionToDocument(exc *pythondocs.Exception, module *pythondocs.Module, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s.%s\n\n", module.Name, exc.Name)
|
|
|
|
if exc.Signature != "" {
|
|
fmt.Fprintf(&content, "```python\n%s\n```\n", exc.Signature)
|
|
}
|
|
|
|
if exc.Doc != "" {
|
|
fmt.Fprintf(&content, "%s\n", exc.Doc)
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"module": module.Name,
|
|
"exception": exc.Name,
|
|
"qual_name": exc.QualName,
|
|
"bases": exc.Bases,
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(exc.DocURL),
|
|
Source: sourceName,
|
|
Type: "python-exception",
|
|
Title: fmt.Sprintf("%s.%s - Python", module.Name, exc.Name),
|
|
Content: content.String(),
|
|
URL: exc.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *PythonDocsScraper) dataToDocument(data *pythondocs.Data, module *pythondocs.Module, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s.%s\n\n", module.Name, data.Name)
|
|
|
|
if data.Doc != "" {
|
|
fmt.Fprintf(&content, "%s\n", data.Doc)
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"module": module.Name,
|
|
"data": data.Name,
|
|
"type": data.Type,
|
|
"value": data.Value,
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(data.DocURL),
|
|
Source: sourceName,
|
|
Type: "python-data",
|
|
Title: fmt.Sprintf("%s.%s - Python", module.Name, data.Name),
|
|
Content: content.String(),
|
|
URL: data.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|