package scraper

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"

	"github.com/yourorg/devour/pkg/pythondocs"
)

// PythonDocsScraper fetches a Python documentation page over HTTP, parses it
// with pythondocs.Parser, and converts the parsed module and its members into
// Document values.
type PythonDocsScraper struct {
	config *Config
	parser *pythondocs.Parser
	client *http.Client
}

// NewPythonDocsScraper builds a scraper whose HTTP client uses config.Timeout.
// config is dereferenced here and again on every fetch, so it must be non-nil.
func NewPythonDocsScraper(config *Config) *PythonDocsScraper {
	return &PythonDocsScraper{
		config: config,
		parser: pythondocs.NewParser(),
		client: &http.Client{
			Timeout: config.Timeout,
		},
	}
}

// Scrape downloads source.URL, parses it as a module page, and returns one
// Document for the module followed by one Document for every class (each class
// is followed by its methods, class methods and attributes), then every
// function, exception and constant of the module.
//
// It returns an error when source is nil, when source.URL is empty, or when
// fetching or parsing the page fails.
func (s *PythonDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if err := s.checkSource(source); err != nil {
		return nil, err
	}

	html, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", err)
	}

	module, err := s.parser.ParseModulePage(html, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to parse module: %w", err)
	}

	// Lower bound on the result size: class members are appended on top of this.
	documents := make([]*Document, 0,
		1+len(module.Classes)+len(module.Functions)+len(module.Exceptions)+len(module.Constants))

	documents = append(documents, s.moduleToDocument(module, source.Name))

	for _, class := range module.Classes {
		documents = append(documents, s.classToDocument(class, module, source.Name))

		for _, method := range class.Methods {
			documents = append(documents, s.methodToDocument(method, class, module, source.Name))
		}
		for _, method := range class.ClassMethods {
			documents = append(documents, s.classMethodToDocument(method, class, module, source.Name))
		}
		for _, attr := range class.Attributes {
			documents = append(documents, s.attributeToDocument(attr, class, module, source.Name))
		}
	}

	for _, fn := range module.Functions {
		documents = append(documents, s.functionToDocument(fn, module, source.Name))
	}
	for _, exc := range module.Exceptions {
		documents = append(documents, s.exceptionToDocument(exc, module, source.Name))
	}
	for _, data := range module.Constants {
		documents = append(documents, s.dataToDocument(data, module, source.Name))
	}

	return documents, nil
}

// DetectChanges downloads source.URL and compares the SHA-256 hash of the raw
// page with lastHash. It returns whether the two differ together with the
// freshly computed hash.
//
// The source is validated and fetch errors are wrapped the same way as in
// Scrape, so both entry points report failures consistently.
func (s *PythonDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if err := s.checkSource(source); err != nil {
		return false, "", err
	}

	html, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return false, "", fmt.Errorf("failed to fetch page: %w", err)
	}

	hash := s.generateHash(html)
	return hash != lastHash, hash, nil
}

// checkSource rejects a nil source or a source without a URL.
func (s *PythonDocsScraper) checkSource(source *Source) error {
	if source == nil {
		return fmt.Errorf("source is required for Python docs scraper")
	}
	if source.URL == "" {
		return fmt.Errorf("URL is required for Python docs scraper")
	}
	return nil
}

// fetchPage issues a GET request for url bound to ctx and returns the response
// body as a string. Any status other than 200 OK is reported as an error.
func (s *PythonDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}

	// Setting the header to "" makes net/http omit User-Agent entirely, so only
	// override Go's default when a value is actually configured.
	if s.config.UserAgent != "" {
		req.Header.Set("User-Agent", s.config.UserAgent)
	}

	resp, err := s.client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("HTTP %d fetching %s", resp.StatusCode, url)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

// generateHash returns the hex-encoded SHA-256 digest of content.
func (s *PythonDocsScraper) generateHash(content string) string {
	sum := sha256.Sum256([]byte(content))
	return hex.EncodeToString(sum[:])
}

// moduleToDocument converts a parsed module into a "python-module" Document.
// The document ID is derived from module.DocURL and the hash from the
// generated content; Timestamp is the time of conversion.
func (s *PythonDocsScraper) moduleToDocument(module *pythondocs.Module, sourceName string) *Document {
	content := s.buildModuleContent(module)

	metadata := map[string]interface{}{
		"name":            module.Name,
		"path":            module.Path,
		"version":         module.Version,
		"doc_url":         module.DocURL,
		"class_count":     len(module.Classes),
		"function_count":  len(module.Functions),
		"exception_count": len(module.Exceptions),
		"data_count":      len(module.Constants),
	}

	return &Document{
		ID:        generateDocID(module.DocURL),
		Source:    sourceName,
		Type:      "python-module",
		Title:     fmt.Sprintf("%s - Python", module.Name),
		Content:   content,
		URL:       module.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(content),
		Timestamp: time.Now(),
	}
}

// buildModuleContent renders a Markdown overview of the module: a heading, the
// synopsis and doc text when present, and bullet lists naming its classes,
// functions and exceptions.
//
// NOTE(review): module.Constants are counted in the module metadata and get
// their own documents, but are not listed here — confirm whether that is
// intentional before adding a section, since it would change the content hash.
func (s *PythonDocsScraper) buildModuleContent(module *pythondocs.Module) string {
	var parts []string

	parts = append(parts, fmt.Sprintf("# Module %s\n", module.Name))

	if module.Synopsis != "" {
		parts = append(parts, module.Synopsis)
	}
	if module.Doc != "" {
		parts = append(parts, "\n"+module.Doc)
	}

	if len(module.Classes) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Classes (%d)\n", len(module.Classes)))
		for _, class := range module.Classes {
			parts = append(parts, fmt.Sprintf("- `%s`", class.Name))
		}
	}

	if len(module.Functions) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(module.Functions)))
		for _, fn := range module.Functions {
			parts = append(parts, fmt.Sprintf("- `%s`", fn.Name))
		}
	}

	if len(module.Exceptions) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Exceptions (%d)\n", len(module.Exceptions)))
		for _, exc := range module.Exceptions {
			parts = append(parts, fmt.Sprintf("- `%s`", exc.Name))
		}
	}

	return strings.Join(parts, "\n")
}

// classToDocument converts a parsed class into a "python-class" Document whose
// ID is derived from class.DocURL.
func (s *PythonDocsScraper) classToDocument(class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
	content := s.buildClassContent(class, module)

	metadata := map[string]interface{}{
		"module":          module.Name,
		"class":           class.Name,
		"qual_name":       class.QualName,
		"bases":           class.Bases,
		"method_count":    len(class.Methods),
		"attribute_count": len(class.Attributes),
	}

	return &Document{
		ID:        generateDocID(class.DocURL),
		Source:    sourceName,
		Type:      "python-class",
		Title:     fmt.Sprintf("%s.%s - Python", module.Name, class.Name),
		Content:   content,
		URL:       class.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(content),
		Timestamp: time.Now(),
	}
}

// buildClassContent renders a Markdown page for a class: heading, signature in
// a python code fence, doc text, base classes, and bullet lists naming its
// methods, class methods and attributes. Empty sections are omitted.
func (s *PythonDocsScraper) buildClassContent(class *pythondocs.Class, module *pythondocs.Module) string {
	var parts []string

	parts = append(parts, fmt.Sprintf("# class %s.%s\n", module.Name, class.Name))

	if class.Signature != "" {
		parts = append(parts, fmt.Sprintf("```python\n%s\n```", class.Signature))
	}
	if class.Doc != "" {
		parts = append(parts, "\n"+class.Doc)
	}
	if len(class.Bases) > 0 {
		parts = append(parts, fmt.Sprintf("\n**Bases:** %s\n", strings.Join(class.Bases, ", ")))
	}

	if len(class.Methods) > 0 {
		parts = append(parts, fmt.Sprintf("\n### Methods (%d)\n", len(class.Methods)))
		for _, m := range class.Methods {
			parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
		}
	}

	if len(class.ClassMethods) > 0 {
		parts = append(parts, fmt.Sprintf("\n### Class Methods (%d)\n", len(class.ClassMethods)))
		for _, m := range class.ClassMethods {
			parts = append(parts, fmt.Sprintf("- `%s` (classmethod)", m.Name))
		}
	}

	if len(class.Attributes) > 0 {
		parts = append(parts, fmt.Sprintf("\n### Attributes (%d)\n", len(class.Attributes)))
		for _, a := range class.Attributes {
			parts = append(parts, fmt.Sprintf("- `%s`", a.Name))
		}
	}

	return strings.Join(parts, "\n")
}

// methodToDocument converts an instance method into a "python-method"
// Document. The content is built inline and its layout differs slightly from
// buildMethodContent (which is used for class methods), so the two are kept
// separate to leave existing content hashes unchanged.
func (s *PythonDocsScraper) methodToDocument(method *pythondocs.Method, class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
	var content strings.Builder

	fmt.Fprintf(&content, "# %s.%s.%s\n\n", module.Name, class.Name, method.Name)
	if method.Signature != "" {
		fmt.Fprintf(&content, "```python\n%s\n```\n", method.Signature)
	}
	if method.Doc != "" {
		fmt.Fprintf(&content, "%s\n", method.Doc)
	}
	body := content.String()

	metadata := map[string]interface{}{
		"module":    module.Name,
		"class":     class.Name,
		"method":    method.Name,
		"qual_name": method.QualName,
		"is_static": method.IsStatic,
		"is_async":  method.IsAsync,
	}

	return &Document{
		ID:        generateDocID(method.DocURL),
		Source:    sourceName,
		Type:      "python-method",
		Title:     fmt.Sprintf("%s.%s.%s - Python", module.Name, class.Name, method.Name),
		Content:   body,
		URL:       method.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}

// classMethodToDocument converts a class method into a "python-classmethod"
// Document; its metadata always carries "is_classmethod": true.
func (s *PythonDocsScraper) classMethodToDocument(method *pythondocs.Method, class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
	content := s.buildMethodContent(method, class, module)

	metadata := map[string]interface{}{
		"module":         module.Name,
		"class":          class.Name,
		"method":         method.Name,
		"qual_name":      method.QualName,
		"is_classmethod": true,
	}

	return &Document{
		ID:        generateDocID(method.DocURL),
		Source:    sourceName,
		Type:      "python-classmethod",
		Title:     fmt.Sprintf("%s.%s.%s (classmethod) - Python", module.Name, class.Name, method.Name),
		Content:   content,
		URL:       method.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(content),
		Timestamp: time.Now(),
	}
}

// buildMethodContent renders a Markdown page for a method: a heading with the
// dotted module.class.method name, then the signature in a python code fence
// and the doc text when present.
func (s *PythonDocsScraper) buildMethodContent(method *pythondocs.Method, class *pythondocs.Class, module *pythondocs.Module) string {
	var parts []string

	parts = append(parts, fmt.Sprintf("# %s.%s.%s\n", module.Name, class.Name, method.Name))

	if method.Signature != "" {
		parts = append(parts, fmt.Sprintf("```python\n%s\n```", method.Signature))
	}
	if method.Doc != "" {
		parts = append(parts, "\n"+method.Doc)
	}

	return strings.Join(parts, "\n")
}

// attributeToDocument converts a class attribute into a "python-attribute"
// Document containing a heading and, when present, the attribute's doc text.
func (s *PythonDocsScraper) attributeToDocument(attr *pythondocs.Attribute, class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
	var content strings.Builder

	fmt.Fprintf(&content, "# %s.%s.%s\n\n", module.Name, class.Name, attr.Name)
	if attr.Doc != "" {
		fmt.Fprintf(&content, "%s\n", attr.Doc)
	}
	body := content.String()

	metadata := map[string]interface{}{
		"module": module.Name,
		"class":  class.Name,
		"attr":   attr.Name,
		"type":   attr.Type,
	}

	return &Document{
		ID:        generateDocID(attr.DocURL),
		Source:    sourceName,
		Type:      "python-attribute",
		Title:     fmt.Sprintf("%s.%s.%s - Python", module.Name, class.Name, attr.Name),
		Content:   body,
		URL:       attr.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}

// functionToDocument converts a module-level function into a
// "python-function" Document.
func (s *PythonDocsScraper) functionToDocument(fn *pythondocs.Function, module *pythondocs.Module, sourceName string) *Document {
	content := s.buildFunctionContent(fn, module)

	metadata := map[string]interface{}{
		"module":       module.Name,
		"function":     fn.Name,
		"qual_name":    fn.QualName,
		"signature":    fn.Signature,
		"is_async":     fn.IsAsync,
		"is_generator": fn.IsGenerator,
	}

	return &Document{
		ID:        generateDocID(fn.DocURL),
		Source:    sourceName,
		Type:      "python-function",
		Title:     fmt.Sprintf("%s.%s - Python", module.Name, fn.Name),
		Content:   content,
		URL:       fn.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(content),
		Timestamp: time.Now(),
	}
}

// buildFunctionContent renders a Markdown page for a function: a heading with
// the dotted module.function name, then the signature in a python code fence
// and the doc text when present.
func (s *PythonDocsScraper) buildFunctionContent(fn *pythondocs.Function, module *pythondocs.Module) string {
	var parts []string

	parts = append(parts, fmt.Sprintf("# %s.%s\n", module.Name, fn.Name))

	if fn.Signature != "" {
		parts = append(parts, fmt.Sprintf("```python\n%s\n```", fn.Signature))
	}
	if fn.Doc != "" {
		parts = append(parts, "\n"+fn.Doc)
	}

	return strings.Join(parts, "\n")
}

// exceptionToDocument converts a module exception into a "python-exception"
// Document containing a heading, the signature in a python code fence and the
// doc text when present.
func (s *PythonDocsScraper) exceptionToDocument(exc *pythondocs.Exception, module *pythondocs.Module, sourceName string) *Document {
	var content strings.Builder

	fmt.Fprintf(&content, "# %s.%s\n\n", module.Name, exc.Name)
	if exc.Signature != "" {
		fmt.Fprintf(&content, "```python\n%s\n```\n", exc.Signature)
	}
	if exc.Doc != "" {
		fmt.Fprintf(&content, "%s\n", exc.Doc)
	}
	body := content.String()

	metadata := map[string]interface{}{
		"module":    module.Name,
		"exception": exc.Name,
		"qual_name": exc.QualName,
		"bases":     exc.Bases,
	}

	return &Document{
		ID:        generateDocID(exc.DocURL),
		Source:    sourceName,
		Type:      "python-exception",
		Title:     fmt.Sprintf("%s.%s - Python", module.Name, exc.Name),
		Content:   body,
		URL:       exc.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}

// dataToDocument converts a module-level constant into a "python-data"
// Document containing a heading and, when present, the constant's doc text.
func (s *PythonDocsScraper) dataToDocument(data *pythondocs.Data, module *pythondocs.Module, sourceName string) *Document {
	var content strings.Builder

	fmt.Fprintf(&content, "# %s.%s\n\n", module.Name, data.Name)
	if data.Doc != "" {
		fmt.Fprintf(&content, "%s\n", data.Doc)
	}
	body := content.String()

	metadata := map[string]interface{}{
		"module": module.Name,
		"data":   data.Name,
		"type":   data.Type,
		"value":  data.Value,
	}

	return &Document{
		ID:        generateDocID(data.DocURL),
		Source:    sourceName,
		Type:      "python-data",
		Title:     fmt.Sprintf("%s.%s - Python", module.Name, data.Name),
		Content:   body,
		URL:       data.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}