This commit is contained in:
Tomas Dvorak
2026-02-22 15:41:27 +01:00
parent 0b88627e54
commit 409acd2e08
84 changed files with 65382 additions and 27475 deletions
+563
View File
@@ -0,0 +1,563 @@
package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"
"github.com/yourorg/devour/pkg/rustdocs"
)
// RustDocsScraper fetches a rendered Rust documentation page over HTTP, parses
// it with the rustdocs parser, and converts the crate and its items into
// Documents.
type RustDocsScraper struct {
// config supplies the scraper settings; this file reads its Timeout and
// UserAgent fields.
config *Config
// parser turns fetched crate HTML into a rustdocs.Crate.
parser *rustdocs.Parser
// client performs every page fetch made by the scraper.
client *http.Client
}
// NewRustDocsScraper returns a scraper bound to config. The scraper receives a
// fresh rustdocs parser and its own HTTP client whose overall request timeout
// is config.Timeout.
func NewRustDocsScraper(config *Config) *RustDocsScraper {
	scraper := &RustDocsScraper{config: config}
	scraper.parser = rustdocs.NewParser()
	scraper.client = &http.Client{Timeout: config.Timeout}
	return scraper
}
// Scrape downloads the crate page at source.URL, parses it, and returns the
// resulting documents: the crate overview first, then one document for every
// module, struct, enum, trait, function, macro, constant and static, in that
// order. Every document is tagged with source.Name.
//
// An error is returned when source.URL is empty, when the page cannot be
// fetched, or when the parser cannot interpret the page.
func (s *RustDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source.URL == "" {
		return nil, fmt.Errorf("URL is required for Rust docs scraper")
	}

	page, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", err)
	}

	crate, err := s.parser.ParseCratePage(page, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to parse crate: %w", err)
	}

	name := source.Name

	// The crate overview always comes first; item documents follow by kind.
	docs := []*Document{s.crateToDocument(crate, name)}

	for _, module := range crate.Modules {
		docs = append(docs, s.moduleToDocument(module, crate, name))
	}
	for _, strct := range crate.Structs {
		docs = append(docs, s.structToDocument(strct, crate, name))
	}
	for _, enum := range crate.Enums {
		docs = append(docs, s.enumToDocument(enum, crate, name))
	}
	for _, trait := range crate.Traits {
		docs = append(docs, s.traitToDocument(trait, crate, name))
	}
	for _, fn := range crate.Functions {
		docs = append(docs, s.funcToDocument(fn, crate, name))
	}
	for _, macro := range crate.Macros {
		docs = append(docs, s.macroToDocument(macro, crate, name))
	}
	for _, constant := range crate.Constants {
		docs = append(docs, s.constToDocument(constant, crate, name))
	}
	for _, static := range crate.Statics {
		docs = append(docs, s.staticToDocument(static, crate, name))
	}

	return docs, nil
}
// DetectChanges re-fetches source.URL, hashes the page body with generateHash,
// and compares the result with lastHash. It returns whether the two hashes
// differ together with the freshly computed hash. When the fetch fails it
// returns false, an empty hash, and the fetch error unchanged.
func (s *RustDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	page, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return false, "", err
	}

	current := s.generateHash(page)
	return current != lastHash, current, nil
}
// fetchPage issues a GET for url with the scraper's HTTP client and returns
// the response body as a string.
//
// The request is bound to ctx, so cancelling ctx aborts the fetch, and it
// carries the configured User-Agent header. Any status other than 200 OK is
// reported as an error.
//
// Errors produced here name the URL being fetched. DetectChanges hands
// fetchPage errors to its caller unchanged, so a bare "HTTP 404" would give no
// hint about which page failed.
func (s *RustDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}
	req.Header.Set("User-Agent", s.config.UserAgent)

	resp, err := s.client.Do(req)
	if err != nil {
		// Client.Do returns a *url.Error, which already names the method and
		// URL, so it is passed through without further wrapping.
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("fetching %q: HTTP %d", url, resp.StatusCode)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading body of %q: %w", url, err)
	}
	return string(body), nil
}
// generateHash returns the SHA-256 digest of content as a lowercase
// hexadecimal string.
func (s *RustDocsScraper) generateHash(content string) string {
	digest := sha256.New()
	digest.Write([]byte(content)) // hash.Hash documents that Write never returns an error
	return hex.EncodeToString(digest.Sum(nil))
}
// crateToDocument wraps the crate overview in a Document of type "rust-crate".
// The content comes from buildCrateContent, the ID from
// generateDocID(crate.DocURL), and the hash from generateHash over the content.
// The metadata records the crate's name, version, repository, license and doc
// URL plus the number of items of each kind.
func (s *RustDocsScraper) crateToDocument(crate *rustdocs.Crate, sourceName string) *Document {
	body := s.buildCrateContent(crate)

	meta := make(map[string]interface{}, 13)
	meta["name"] = crate.Name
	meta["version"] = crate.Version
	meta["repository"] = crate.Repository
	meta["license"] = crate.License
	meta["doc_url"] = crate.DocURL
	meta["module_count"] = len(crate.Modules)
	meta["struct_count"] = len(crate.Structs)
	meta["enum_count"] = len(crate.Enums)
	meta["trait_count"] = len(crate.Traits)
	meta["function_count"] = len(crate.Functions)
	meta["macro_count"] = len(crate.Macros)
	meta["constant_count"] = len(crate.Constants)
	meta["static_count"] = len(crate.Statics)

	return &Document{
		ID:        generateDocID(crate.DocURL),
		Source:    sourceName,
		Type:      "rust-crate",
		Title:     fmt.Sprintf("%s - Rust", crate.Name),
		Content:   body,
		URL:       crate.DocURL,
		Metadata:  meta,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}
// buildCrateContent renders the Markdown overview used as the crate document's
// content: a "# Crate <name>" heading, the version and description when they
// are set, and then one "## <Kind> (<count>)" section per non-empty item kind
// with a bullet for every item name. The pieces are joined with newlines.
//
// Constants and statics are listed alongside the other kinds. Scrape emits
// documents for them and crateToDocument counts them in the metadata, so
// leaving them out would make the overview disagree with both.
func (s *RustDocsScraper) buildCrateContent(crate *rustdocs.Crate) string {
	var parts []string

	parts = append(parts, fmt.Sprintf("# Crate %s\n", crate.Name))
	if crate.Version != "" {
		parts = append(parts, fmt.Sprintf("Version: %s\n", crate.Version))
	}
	if crate.Description != "" {
		parts = append(parts, crate.Description)
	}

	if len(crate.Modules) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Modules (%d)\n", len(crate.Modules)))
		for _, m := range crate.Modules {
			parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
		}
	}
	if len(crate.Structs) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Structs (%d)\n", len(crate.Structs)))
		for _, st := range crate.Structs {
			parts = append(parts, fmt.Sprintf("- `%s`", st.Name))
		}
	}
	if len(crate.Enums) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Enums (%d)\n", len(crate.Enums)))
		for _, e := range crate.Enums {
			parts = append(parts, fmt.Sprintf("- `%s`", e.Name))
		}
	}
	if len(crate.Traits) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Traits (%d)\n", len(crate.Traits)))
		for _, t := range crate.Traits {
			parts = append(parts, fmt.Sprintf("- `%s`", t.Name))
		}
	}
	if len(crate.Functions) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(crate.Functions)))
		for _, f := range crate.Functions {
			parts = append(parts, fmt.Sprintf("- `%s`", f.Name))
		}
	}
	if len(crate.Macros) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Macros (%d)\n", len(crate.Macros)))
		for _, m := range crate.Macros {
			parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
		}
	}
	if len(crate.Constants) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Constants (%d)\n", len(crate.Constants)))
		for _, c := range crate.Constants {
			parts = append(parts, fmt.Sprintf("- `%s`", c.Name))
		}
	}
	if len(crate.Statics) > 0 {
		parts = append(parts, fmt.Sprintf("\n## Statics (%d)\n", len(crate.Statics)))
		for _, st := range crate.Statics {
			parts = append(parts, fmt.Sprintf("- `%s`", st.Name))
		}
	}

	return strings.Join(parts, "\n")
}
// moduleToDocument converts a parsed module into a Document of type
// "rust-module". The content is a "# Module <crate>::<module>" heading followed
// by the module's doc text when it is non-empty; the hash is generateHash over
// that content and the ID is generateDocID(m.DocURL).
func (s *RustDocsScraper) moduleToDocument(m *rustdocs.Module, crate *rustdocs.Crate, sourceName string) *Document {
	var b strings.Builder
	fmt.Fprintf(&b, "# Module %s::%s\n\n", crate.Name, m.Name)
	if m.Doc != "" {
		fmt.Fprintf(&b, "%s\n", m.Doc)
	}
	text := b.String()

	meta := make(map[string]interface{}, 5)
	meta["crate"] = crate.Name
	meta["module"] = m.Name
	meta["path"] = m.Path
	meta["is_experimental"] = m.IsExperimental
	meta["kind"] = "module"

	return &Document{
		ID:        generateDocID(m.DocURL),
		Source:    sourceName,
		Type:      "rust-module",
		Title:     fmt.Sprintf("%s::%s - Rust", crate.Name, m.Name),
		Content:   text,
		URL:       m.DocURL,
		Metadata:  meta,
		Hash:      s.generateHash(text),
		Timestamp: time.Now(),
	}
}
// structToDocument converts a parsed struct into a Document of type
// "rust-struct". The content comes from buildStructContent, the hash from
// generateHash over that content, and the ID from generateDocID(st.DocURL).
//
// The metadata carries the struct's crate, name, path, experimental flag and
// declaration, plus a "fields" entry holding st.Fields encoded as a JSON
// string. If that encoding fails, "fields" is set to "[]" so the value remains
// valid JSON, and the reason is stored under "fields_error".
func (s *RustDocsScraper) structToDocument(st *rustdocs.Struct, crate *rustdocs.Crate, sourceName string) *Document {
	content := s.buildStructContent(st, crate)

	metadata := make(map[string]interface{}, 8)
	metadata["crate"] = crate.Name
	metadata["struct"] = st.Name
	metadata["path"] = st.Path
	metadata["is_experimental"] = st.IsExperimental
	metadata["kind"] = "struct"
	metadata["declaration"] = st.Declaration

	fieldsJSON, err := json.Marshal(st.Fields)
	if err != nil {
		// This method has no error result, so keep the document usable rather
		// than dropping it: fall back to an empty JSON array and record why.
		fieldsJSON = []byte("[]")
		metadata["fields_error"] = err.Error()
	}
	metadata["fields"] = string(fieldsJSON)

	return &Document{
		ID:        generateDocID(st.DocURL),
		Source:    sourceName,
		Type:      "rust-struct",
		Title:     fmt.Sprintf("%s::%s - Rust", crate.Name, st.Name),
		Content:   content,
		URL:       st.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(content),
		Timestamp: time.Now(),
	}
}
// buildStructContent renders the Markdown body of a struct document: a
// "# struct <crate>::<name>" heading, the declaration in a rust code fence when
// present, the doc text when present, a "### Fields" list (each bullet shows
// name and type, plus the field's doc when it has one), and a "### Methods"
// list of method names. The pieces are joined with newlines.
func (s *RustDocsScraper) buildStructContent(st *rustdocs.Struct, crate *rustdocs.Crate) string {
	lines := []string{fmt.Sprintf("# struct %s::%s\n", crate.Name, st.Name)}

	if st.Declaration != "" {
		lines = append(lines, fmt.Sprintf("```rust\n%s\n```", st.Declaration))
	}
	if st.Doc != "" {
		lines = append(lines, "\n"+st.Doc)
	}

	if len(st.Fields) > 0 {
		lines = append(lines, "\n### Fields\n")
		for _, field := range st.Fields {
			if field.Doc == "" {
				lines = append(lines, fmt.Sprintf("- `%s: %s`", field.Name, field.Type))
				continue
			}
			lines = append(lines, fmt.Sprintf("- `%s: %s` - %s", field.Name, field.Type, field.Doc))
		}
	}

	if methodCount := len(st.Methods); methodCount > 0 {
		lines = append(lines, fmt.Sprintf("\n### Methods (%d)\n", methodCount))
		for _, method := range st.Methods {
			lines = append(lines, fmt.Sprintf("- `%s`", method.Name))
		}
	}

	return strings.Join(lines, "\n")
}
// enumToDocument converts a parsed enum into a Document of type "rust-enum".
// The content comes from buildEnumContent, the hash from generateHash over that
// content, and the ID from generateDocID(e.DocURL). The metadata carries the
// enum's crate, name, path, experimental flag and declaration.
func (s *RustDocsScraper) enumToDocument(e *rustdocs.Enum, crate *rustdocs.Crate, sourceName string) *Document {
	body := s.buildEnumContent(e, crate)

	meta := make(map[string]interface{}, 6)
	meta["crate"] = crate.Name
	meta["enum"] = e.Name
	meta["path"] = e.Path
	meta["is_experimental"] = e.IsExperimental
	meta["kind"] = "enum"
	meta["declaration"] = e.Declaration

	return &Document{
		ID:        generateDocID(e.DocURL),
		Source:    sourceName,
		Type:      "rust-enum",
		Title:     fmt.Sprintf("%s::%s - Rust", crate.Name, e.Name),
		Content:   body,
		URL:       e.DocURL,
		Metadata:  meta,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}
// buildEnumContent renders the Markdown body of an enum document: a
// "# enum <crate>::<name>" heading, the declaration in a rust code fence when
// present, the doc text when present, and a "### Variants" list in which each
// bullet shows the variant name followed by its doc when it has one. The
// pieces are joined with newlines.
func (s *RustDocsScraper) buildEnumContent(e *rustdocs.Enum, crate *rustdocs.Crate) string {
	lines := []string{fmt.Sprintf("# enum %s::%s\n", crate.Name, e.Name)}

	if e.Declaration != "" {
		lines = append(lines, fmt.Sprintf("```rust\n%s\n```", e.Declaration))
	}
	if e.Doc != "" {
		lines = append(lines, "\n"+e.Doc)
	}

	if len(e.Variants) > 0 {
		lines = append(lines, "\n### Variants\n")
		for _, variant := range e.Variants {
			if variant.Doc == "" {
				lines = append(lines, fmt.Sprintf("- `%s`", variant.Name))
				continue
			}
			lines = append(lines, fmt.Sprintf("- `%s` - %s", variant.Name, variant.Doc))
		}
	}

	return strings.Join(lines, "\n")
}
// traitToDocument converts a parsed trait into a Document of type "rust-trait".
// The content comes from buildTraitContent, the hash from generateHash over
// that content, and the ID from generateDocID(t.DocURL). The metadata carries
// the trait's crate, name, path, experimental flag and declaration.
func (s *RustDocsScraper) traitToDocument(t *rustdocs.Trait, crate *rustdocs.Crate, sourceName string) *Document {
	body := s.buildTraitContent(t, crate)

	meta := make(map[string]interface{}, 6)
	meta["crate"] = crate.Name
	meta["trait"] = t.Name
	meta["path"] = t.Path
	meta["is_experimental"] = t.IsExperimental
	meta["kind"] = "trait"
	meta["declaration"] = t.Declaration

	return &Document{
		ID:        generateDocID(t.DocURL),
		Source:    sourceName,
		Type:      "rust-trait",
		Title:     fmt.Sprintf("%s::%s - Rust", crate.Name, t.Name),
		Content:   body,
		URL:       t.DocURL,
		Metadata:  meta,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}
// buildTraitContent renders the Markdown body of a trait document: a
// "# trait <crate>::<name>" heading, the declaration in a rust code fence when
// present, the doc text when present, and a "### Required Methods" list with
// one bullet per method signature. The pieces are joined with newlines.
func (s *RustDocsScraper) buildTraitContent(t *rustdocs.Trait, crate *rustdocs.Crate) string {
	lines := []string{fmt.Sprintf("# trait %s::%s\n", crate.Name, t.Name)}

	if t.Declaration != "" {
		lines = append(lines, fmt.Sprintf("```rust\n%s\n```", t.Declaration))
	}
	if t.Doc != "" {
		lines = append(lines, "\n"+t.Doc)
	}

	if methodCount := len(t.Methods); methodCount > 0 {
		lines = append(lines, fmt.Sprintf("\n### Required Methods (%d)\n", methodCount))
		for _, method := range t.Methods {
			lines = append(lines, fmt.Sprintf("- `%s`", method.Signature))
		}
	}

	return strings.Join(lines, "\n")
}
// funcToDocument converts a parsed free function into a Document of type
// "rust-fn". The content is a "# fn <crate>::<name>" heading, the signature in
// a rust code fence when present, and the doc text when present. The hash is
// generateHash over that content and the ID is generateDocID(f.DocURL). The
// metadata carries the crate, name, path, signature and the experimental,
// unsafe, const and async flags.
func (s *RustDocsScraper) funcToDocument(f *rustdocs.Func, crate *rustdocs.Crate, sourceName string) *Document {
	var b strings.Builder
	fmt.Fprintf(&b, "# fn %s::%s\n\n", crate.Name, f.Name)
	if f.Signature != "" {
		fmt.Fprintf(&b, "```rust\n%s\n```\n", f.Signature)
	}
	if f.Doc != "" {
		fmt.Fprintf(&b, "%s\n", f.Doc)
	}
	text := b.String()

	meta := make(map[string]interface{}, 9)
	meta["crate"] = crate.Name
	meta["function"] = f.Name
	meta["path"] = f.Path
	meta["is_experimental"] = f.IsExperimental
	meta["is_unsafe"] = f.IsUnsafe
	meta["is_const"] = f.IsConst
	meta["is_async"] = f.IsAsync
	meta["kind"] = "fn"
	meta["signature"] = f.Signature

	return &Document{
		ID:        generateDocID(f.DocURL),
		Source:    sourceName,
		Type:      "rust-fn",
		Title:     fmt.Sprintf("%s::%s - Rust", crate.Name, f.Name),
		Content:   text,
		URL:       f.DocURL,
		Metadata:  meta,
		Hash:      s.generateHash(text),
		Timestamp: time.Now(),
	}
}
// macroToDocument converts a parsed macro into a Document of type "rust-macro".
// The content is a "# macro <crate>::<name>!" heading, the signature in a rust
// code fence when present, and the doc text when present. The hash is
// generateHash over that content and the ID is generateDocID(m.DocURL). The
// metadata carries the crate, macro name, path and experimental flag.
func (s *RustDocsScraper) macroToDocument(m *rustdocs.Macro, crate *rustdocs.Crate, sourceName string) *Document {
	var b strings.Builder
	fmt.Fprintf(&b, "# macro %s::%s!\n\n", crate.Name, m.Name)
	if m.Signature != "" {
		fmt.Fprintf(&b, "```rust\n%s\n```\n", m.Signature)
	}
	if m.Doc != "" {
		fmt.Fprintf(&b, "%s\n", m.Doc)
	}
	text := b.String()

	meta := make(map[string]interface{}, 5)
	meta["crate"] = crate.Name
	meta["macro"] = m.Name
	meta["path"] = m.Path
	meta["is_experimental"] = m.IsExperimental
	meta["kind"] = "macro"

	return &Document{
		ID:        generateDocID(m.DocURL),
		Source:    sourceName,
		Type:      "rust-macro",
		Title:     fmt.Sprintf("%s::%s! - Rust", crate.Name, m.Name),
		Content:   text,
		URL:       m.DocURL,
		Metadata:  meta,
		Hash:      s.generateHash(text),
		Timestamp: time.Now(),
	}
}
// constToDocument converts a parsed constant into a Document of type
// "rust-const". The content is a "# const <crate>::<name>" heading followed by
// "Type:" and "Value:" lines for whichever of the two is set, then the doc text
// when present. The hash is generateHash over that content and the ID is
// generateDocID(c.DocURL). The metadata carries the crate, name, path,
// experimental flag, type and value.
func (s *RustDocsScraper) constToDocument(c *rustdocs.Const, crate *rustdocs.Crate, sourceName string) *Document {
	var b strings.Builder
	fmt.Fprintf(&b, "# const %s::%s\n\n", crate.Name, c.Name)
	if c.Type != "" {
		fmt.Fprintf(&b, "Type: `%s`\n", c.Type)
	}
	if c.Value != "" {
		fmt.Fprintf(&b, "Value: `%s`\n", c.Value)
	}
	if c.Doc != "" {
		fmt.Fprintf(&b, "\n%s\n", c.Doc)
	}
	text := b.String()

	meta := make(map[string]interface{}, 7)
	meta["crate"] = crate.Name
	meta["const"] = c.Name
	meta["path"] = c.Path
	meta["is_experimental"] = c.IsExperimental
	meta["type"] = c.Type
	meta["value"] = c.Value
	meta["kind"] = "const"

	return &Document{
		ID:        generateDocID(c.DocURL),
		Source:    sourceName,
		Type:      "rust-const",
		Title:     fmt.Sprintf("%s::%s - Rust", crate.Name, c.Name),
		Content:   text,
		URL:       c.DocURL,
		Metadata:  meta,
		Hash:      s.generateHash(text),
		Timestamp: time.Now(),
	}
}
// staticToDocument converts a parsed static item into a Document of type
// "rust-static". The content is a "# static <crate>::<name>" heading, a "Type:"
// line when the type is set, a "Mutability: mutable" line when the static is
// mutable, and the doc text when present. The hash is generateHash over that
// content and the ID is generateDocID(st.DocURL). The metadata carries the
// crate, name, path, type and the experimental and mutable flags.
func (s *RustDocsScraper) staticToDocument(st *rustdocs.Static, crate *rustdocs.Crate, sourceName string) *Document {
	var b strings.Builder
	fmt.Fprintf(&b, "# static %s::%s\n\n", crate.Name, st.Name)
	if st.Type != "" {
		fmt.Fprintf(&b, "Type: `%s`\n", st.Type)
	}
	if st.IsMutable {
		b.WriteString("Mutability: mutable\n")
	}
	if st.Doc != "" {
		fmt.Fprintf(&b, "\n%s\n", st.Doc)
	}
	text := b.String()

	meta := make(map[string]interface{}, 7)
	meta["crate"] = crate.Name
	meta["static"] = st.Name
	meta["path"] = st.Path
	meta["is_experimental"] = st.IsExperimental
	meta["is_mutable"] = st.IsMutable
	meta["type"] = st.Type
	meta["kind"] = "static"

	return &Document{
		ID:        generateDocID(st.DocURL),
		Source:    sourceName,
		Type:      "rust-static",
		Title:     fmt.Sprintf("%s::%s - Rust", crate.Name, st.Name),
		Content:   text,
		URL:       st.DocURL,
		Metadata:  meta,
		Hash:      s.generateHash(text),
		Timestamp: time.Now(),
	}
}