package scraper import ( "context" "crypto/sha256" "encoding/hex" "encoding/json" "fmt" "io" "net/http" "strings" "time" "github.com/yourorg/devour/pkg/rustdocs" ) type RustDocsScraper struct { config *Config parser *rustdocs.Parser client *http.Client } func NewRustDocsScraper(config *Config) *RustDocsScraper { return &RustDocsScraper{ config: config, parser: rustdocs.NewParser(), client: &http.Client{ Timeout: config.Timeout, }, } } func (s *RustDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) { var documents []*Document if source.URL == "" { return nil, fmt.Errorf("URL is required for Rust docs scraper") } html, err := s.fetchPage(ctx, source.URL) if err != nil { return nil, fmt.Errorf("failed to fetch page: %w", err) } crate, err := s.parser.ParseCratePage(html, source.URL) if err != nil { return nil, fmt.Errorf("failed to parse crate: %w", err) } mainDoc := s.crateToDocument(crate, source.Name) documents = append(documents, mainDoc) for _, m := range crate.Modules { doc := s.moduleToDocument(m, crate, source.Name) documents = append(documents, doc) } for _, st := range crate.Structs { doc := s.structToDocument(st, crate, source.Name) documents = append(documents, doc) } for _, e := range crate.Enums { doc := s.enumToDocument(e, crate, source.Name) documents = append(documents, doc) } for _, t := range crate.Traits { doc := s.traitToDocument(t, crate, source.Name) documents = append(documents, doc) } for _, f := range crate.Functions { doc := s.funcToDocument(f, crate, source.Name) documents = append(documents, doc) } for _, m := range crate.Macros { doc := s.macroToDocument(m, crate, source.Name) documents = append(documents, doc) } for _, c := range crate.Constants { doc := s.constToDocument(c, crate, source.Name) documents = append(documents, doc) } for _, st := range crate.Statics { doc := s.staticToDocument(st, crate, source.Name) documents = append(documents, doc) } return documents, nil } func (s *RustDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) { html, err := s.fetchPage(ctx, source.URL) if err != nil { return false, "", err } hash := s.generateHash(html) changed := hash != lastHash return changed, hash, nil } func (s *RustDocsScraper) fetchPage(ctx context.Context, url string) (string, error) { req, err := http.NewRequestWithContext(ctx, "GET", url, nil) if err != nil { return "", err } req.Header.Set("User-Agent", s.config.UserAgent) resp, err := s.client.Do(req) if err != nil { return "", err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return "", fmt.Errorf("HTTP %d", resp.StatusCode) } body, err := io.ReadAll(resp.Body) if err != nil { return "", err } return string(body), nil } func (s *RustDocsScraper) generateHash(content string) string { hash := sha256.Sum256([]byte(content)) return hex.EncodeToString(hash[:]) } func (s *RustDocsScraper) crateToDocument(crate *rustdocs.Crate, sourceName string) *Document { content := s.buildCrateContent(crate) metadata := map[string]interface{}{ "name": crate.Name, "version": crate.Version, "repository": crate.Repository, "license": crate.License, "doc_url": crate.DocURL, "module_count": len(crate.Modules), "struct_count": len(crate.Structs), "enum_count": len(crate.Enums), "trait_count": len(crate.Traits), "function_count": len(crate.Functions), "macro_count": len(crate.Macros), "constant_count": len(crate.Constants), "static_count": len(crate.Statics), } return &Document{ ID: generateDocID(crate.DocURL), Source: sourceName, Type: "rust-crate", Title: fmt.Sprintf("%s - Rust", crate.Name), Content: content, URL: crate.DocURL, Metadata: metadata, Hash: s.generateHash(content), Timestamp: time.Now(), } } func (s *RustDocsScraper) buildCrateContent(crate *rustdocs.Crate) string { var parts []string parts = append(parts, fmt.Sprintf("# Crate %s\n", crate.Name)) if crate.Version != "" { parts = append(parts, fmt.Sprintf("Version: %s\n", crate.Version)) } if crate.Description != "" { parts = append(parts, crate.Description) } if len(crate.Modules) > 0 { parts = append(parts, fmt.Sprintf("\n## Modules (%d)\n", len(crate.Modules))) for _, m := range crate.Modules { parts = append(parts, fmt.Sprintf("- `%s`", m.Name)) } } if len(crate.Structs) > 0 { parts = append(parts, fmt.Sprintf("\n## Structs (%d)\n", len(crate.Structs))) for _, st := range crate.Structs { parts = append(parts, fmt.Sprintf("- `%s`", st.Name)) } } if len(crate.Enums) > 0 { parts = append(parts, fmt.Sprintf("\n## Enums (%d)\n", len(crate.Enums))) for _, e := range crate.Enums { parts = append(parts, fmt.Sprintf("- `%s`", e.Name)) } } if len(crate.Traits) > 0 { parts = append(parts, fmt.Sprintf("\n## Traits (%d)\n", len(crate.Traits))) for _, t := range crate.Traits { parts = append(parts, fmt.Sprintf("- `%s`", t.Name)) } } if len(crate.Functions) > 0 { parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(crate.Functions))) for _, f := range crate.Functions { parts = append(parts, fmt.Sprintf("- `%s`", f.Name)) } } if len(crate.Macros) > 0 { parts = append(parts, fmt.Sprintf("\n## Macros (%d)\n", len(crate.Macros))) for _, m := range crate.Macros { parts = append(parts, fmt.Sprintf("- `%s`", m.Name)) } } return strings.Join(parts, "\n") } func (s *RustDocsScraper) moduleToDocument(m *rustdocs.Module, crate *rustdocs.Crate, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# Module %s::%s\n\n", crate.Name, m.Name) if m.Doc != "" { fmt.Fprintf(&content, "%s\n", m.Doc) } metadata := map[string]interface{}{ "crate": crate.Name, "module": m.Name, "path": m.Path, "is_experimental": m.IsExperimental, "kind": "module", } return &Document{ ID: generateDocID(m.DocURL), Source: sourceName, Type: "rust-module", Title: fmt.Sprintf("%s::%s - Rust", crate.Name, m.Name), Content: content.String(), URL: m.DocURL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } } func (s *RustDocsScraper) structToDocument(st *rustdocs.Struct, crate *rustdocs.Crate, sourceName string) *Document { content := s.buildStructContent(st, crate) metadata := map[string]interface{}{ "crate": crate.Name, "struct": st.Name, "path": st.Path, "is_experimental": st.IsExperimental, "kind": "struct", "declaration": st.Declaration, } fieldsJSON, _ := json.Marshal(st.Fields) metadata["fields"] = string(fieldsJSON) return &Document{ ID: generateDocID(st.DocURL), Source: sourceName, Type: "rust-struct", Title: fmt.Sprintf("%s::%s - Rust", crate.Name, st.Name), Content: content, URL: st.DocURL, Metadata: metadata, Hash: s.generateHash(content), Timestamp: time.Now(), } } func (s *RustDocsScraper) buildStructContent(st *rustdocs.Struct, crate *rustdocs.Crate) string { var parts []string parts = append(parts, fmt.Sprintf("# struct %s::%s\n", crate.Name, st.Name)) if st.Declaration != "" { parts = append(parts, fmt.Sprintf("```rust\n%s\n```", st.Declaration)) } if st.Doc != "" { parts = append(parts, "\n"+st.Doc) } if len(st.Fields) > 0 { parts = append(parts, "\n### Fields\n") for _, f := range st.Fields { if f.Doc != "" { parts = append(parts, fmt.Sprintf("- `%s: %s` - %s", f.Name, f.Type, f.Doc)) } else { parts = append(parts, fmt.Sprintf("- `%s: %s`", f.Name, f.Type)) } } } if len(st.Methods) > 0 { parts = append(parts, fmt.Sprintf("\n### Methods (%d)\n", len(st.Methods))) for _, m := range st.Methods { parts = append(parts, fmt.Sprintf("- `%s`", m.Name)) } } return strings.Join(parts, "\n") } func (s *RustDocsScraper) enumToDocument(e *rustdocs.Enum, crate *rustdocs.Crate, sourceName string) *Document { content := s.buildEnumContent(e, crate) metadata := map[string]interface{}{ "crate": crate.Name, "enum": e.Name, "path": e.Path, "is_experimental": e.IsExperimental, "kind": "enum", "declaration": e.Declaration, } return &Document{ ID: generateDocID(e.DocURL), Source: sourceName, Type: "rust-enum", Title: fmt.Sprintf("%s::%s - Rust", crate.Name, e.Name), Content: content, URL: e.DocURL, Metadata: metadata, Hash: s.generateHash(content), Timestamp: time.Now(), } } func (s *RustDocsScraper) buildEnumContent(e *rustdocs.Enum, crate *rustdocs.Crate) string { var parts []string parts = append(parts, fmt.Sprintf("# enum %s::%s\n", crate.Name, e.Name)) if e.Declaration != "" { parts = append(parts, fmt.Sprintf("```rust\n%s\n```", e.Declaration)) } if e.Doc != "" { parts = append(parts, "\n"+e.Doc) } if len(e.Variants) > 0 { parts = append(parts, "\n### Variants\n") for _, v := range e.Variants { if v.Doc != "" { parts = append(parts, fmt.Sprintf("- `%s` - %s", v.Name, v.Doc)) } else { parts = append(parts, fmt.Sprintf("- `%s`", v.Name)) } } } return strings.Join(parts, "\n") } func (s *RustDocsScraper) traitToDocument(t *rustdocs.Trait, crate *rustdocs.Crate, sourceName string) *Document { content := s.buildTraitContent(t, crate) metadata := map[string]interface{}{ "crate": crate.Name, "trait": t.Name, "path": t.Path, "is_experimental": t.IsExperimental, "kind": "trait", "declaration": t.Declaration, } return &Document{ ID: generateDocID(t.DocURL), Source: sourceName, Type: "rust-trait", Title: fmt.Sprintf("%s::%s - Rust", crate.Name, t.Name), Content: content, URL: t.DocURL, Metadata: metadata, Hash: s.generateHash(content), Timestamp: time.Now(), } } func (s *RustDocsScraper) buildTraitContent(t *rustdocs.Trait, crate *rustdocs.Crate) string { var parts []string parts = append(parts, fmt.Sprintf("# trait %s::%s\n", crate.Name, t.Name)) if t.Declaration != "" { parts = append(parts, fmt.Sprintf("```rust\n%s\n```", t.Declaration)) } if t.Doc != "" { parts = append(parts, "\n"+t.Doc) } if len(t.Methods) > 0 { parts = append(parts, fmt.Sprintf("\n### Required Methods (%d)\n", len(t.Methods))) for _, m := range t.Methods { parts = append(parts, fmt.Sprintf("- `%s`", m.Signature)) } } return strings.Join(parts, "\n") } func (s *RustDocsScraper) funcToDocument(f *rustdocs.Func, crate *rustdocs.Crate, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# fn %s::%s\n\n", crate.Name, f.Name) if f.Signature != "" { fmt.Fprintf(&content, "```rust\n%s\n```\n", f.Signature) } if f.Doc != "" { fmt.Fprintf(&content, "%s\n", f.Doc) } metadata := map[string]interface{}{ "crate": crate.Name, "function": f.Name, "path": f.Path, "is_experimental": f.IsExperimental, "is_unsafe": f.IsUnsafe, "is_const": f.IsConst, "is_async": f.IsAsync, "kind": "fn", "signature": f.Signature, } return &Document{ ID: generateDocID(f.DocURL), Source: sourceName, Type: "rust-fn", Title: fmt.Sprintf("%s::%s - Rust", crate.Name, f.Name), Content: content.String(), URL: f.DocURL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } } func (s *RustDocsScraper) macroToDocument(m *rustdocs.Macro, crate *rustdocs.Crate, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# macro %s::%s!\n\n", crate.Name, m.Name) if m.Signature != "" { fmt.Fprintf(&content, "```rust\n%s\n```\n", m.Signature) } if m.Doc != "" { fmt.Fprintf(&content, "%s\n", m.Doc) } metadata := map[string]interface{}{ "crate": crate.Name, "macro": m.Name, "path": m.Path, "is_experimental": m.IsExperimental, "kind": "macro", } return &Document{ ID: generateDocID(m.DocURL), Source: sourceName, Type: "rust-macro", Title: fmt.Sprintf("%s::%s! - Rust", crate.Name, m.Name), Content: content.String(), URL: m.DocURL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } } func (s *RustDocsScraper) constToDocument(c *rustdocs.Const, crate *rustdocs.Crate, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# const %s::%s\n\n", crate.Name, c.Name) if c.Type != "" { fmt.Fprintf(&content, "Type: `%s`\n", c.Type) } if c.Value != "" { fmt.Fprintf(&content, "Value: `%s`\n", c.Value) } if c.Doc != "" { fmt.Fprintf(&content, "\n%s\n", c.Doc) } metadata := map[string]interface{}{ "crate": crate.Name, "const": c.Name, "path": c.Path, "is_experimental": c.IsExperimental, "type": c.Type, "value": c.Value, "kind": "const", } return &Document{ ID: generateDocID(c.DocURL), Source: sourceName, Type: "rust-const", Title: fmt.Sprintf("%s::%s - Rust", crate.Name, c.Name), Content: content.String(), URL: c.DocURL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } } func (s *RustDocsScraper) staticToDocument(st *rustdocs.Static, crate *rustdocs.Crate, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# static %s::%s\n\n", crate.Name, st.Name) if st.Type != "" { fmt.Fprintf(&content, "Type: `%s`\n", st.Type) } if st.IsMutable { fmt.Fprintf(&content, "Mutability: mutable\n") } if st.Doc != "" { fmt.Fprintf(&content, "\n%s\n", st.Doc) } metadata := map[string]interface{}{ "crate": crate.Name, "static": st.Name, "path": st.Path, "is_experimental": st.IsExperimental, "is_mutable": st.IsMutable, "type": st.Type, "kind": "static", } return &Document{ ID: generateDocID(st.DocURL), Source: sourceName, Type: "rust-static", Title: fmt.Sprintf("%s::%s - Rust", crate.Name, st.Name), Content: content.String(), URL: st.DocURL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } }