mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 12:33:04 +00:00
542 lines
14 KiB
Go
542 lines
14 KiB
Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"fmt"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/yourorg/devour/pkg/rustdocs"
|
|
)
|
|
|
|
type RustDocsScraper struct {
|
|
config *Config
|
|
parser *rustdocs.Parser
|
|
client *http.Client
|
|
}
|
|
|
|
func NewRustDocsScraper(config *Config) *RustDocsScraper {
|
|
return &RustDocsScraper{
|
|
config: config,
|
|
parser: rustdocs.NewParser(),
|
|
client: &http.Client{
|
|
Timeout: config.Timeout,
|
|
},
|
|
}
|
|
}
|
|
|
|
func (s *RustDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
|
var documents []*Document
|
|
|
|
if source.URL == "" {
|
|
return nil, fmt.Errorf("URL is required for Rust docs scraper")
|
|
}
|
|
|
|
html, err := s.fetchPage(ctx, source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
|
}
|
|
|
|
crate, err := s.parser.ParseCratePage(html, source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse crate: %w", err)
|
|
}
|
|
|
|
mainDoc := s.crateToDocument(crate, source.Name)
|
|
documents = append(documents, mainDoc)
|
|
|
|
for _, m := range crate.Modules {
|
|
doc := s.moduleToDocument(m, crate, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, st := range crate.Structs {
|
|
doc := s.structToDocument(st, crate, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, e := range crate.Enums {
|
|
doc := s.enumToDocument(e, crate, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, t := range crate.Traits {
|
|
doc := s.traitToDocument(t, crate, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, f := range crate.Functions {
|
|
doc := s.funcToDocument(f, crate, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, m := range crate.Macros {
|
|
doc := s.macroToDocument(m, crate, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, c := range crate.Constants {
|
|
doc := s.constToDocument(c, crate, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, st := range crate.Statics {
|
|
doc := s.staticToDocument(st, crate, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
return documents, nil
|
|
}
|
|
|
|
func (s *RustDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
|
html, err := s.fetchPage(ctx, source.URL)
|
|
if err != nil {
|
|
return false, "", err
|
|
}
|
|
|
|
hash := s.generateHash(html)
|
|
changed := hash != lastHash
|
|
|
|
return changed, hash, nil
|
|
}
|
|
|
|
func (s *RustDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
|
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
|
}
|
|
|
|
func (s *RustDocsScraper) generateHash(content string) string {
|
|
hash := sha256.Sum256([]byte(content))
|
|
return hex.EncodeToString(hash[:])
|
|
}
|
|
|
|
func (s *RustDocsScraper) crateToDocument(crate *rustdocs.Crate, sourceName string) *Document {
|
|
content := s.buildCrateContent(crate)
|
|
|
|
metadata := map[string]interface{}{
|
|
"name": crate.Name,
|
|
"version": crate.Version,
|
|
"repository": crate.Repository,
|
|
"license": crate.License,
|
|
"doc_url": crate.DocURL,
|
|
"module_count": len(crate.Modules),
|
|
"struct_count": len(crate.Structs),
|
|
"enum_count": len(crate.Enums),
|
|
"trait_count": len(crate.Traits),
|
|
"function_count": len(crate.Functions),
|
|
"macro_count": len(crate.Macros),
|
|
"constant_count": len(crate.Constants),
|
|
"static_count": len(crate.Statics),
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(crate.DocURL),
|
|
Source: sourceName,
|
|
Type: "rust-crate",
|
|
Title: fmt.Sprintf("%s - Rust", crate.Name),
|
|
Content: content,
|
|
URL: crate.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *RustDocsScraper) buildCrateContent(crate *rustdocs.Crate) string {
|
|
var parts []string
|
|
|
|
parts = append(parts, fmt.Sprintf("# Crate %s\n", crate.Name))
|
|
|
|
if crate.Version != "" {
|
|
parts = append(parts, fmt.Sprintf("Version: %s\n", crate.Version))
|
|
}
|
|
|
|
if crate.Description != "" {
|
|
parts = append(parts, crate.Description)
|
|
}
|
|
|
|
if len(crate.Modules) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n## Modules (%d)\n", len(crate.Modules)))
|
|
for _, m := range crate.Modules {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
|
}
|
|
}
|
|
|
|
if len(crate.Structs) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n## Structs (%d)\n", len(crate.Structs)))
|
|
for _, st := range crate.Structs {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", st.Name))
|
|
}
|
|
}
|
|
|
|
if len(crate.Enums) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n## Enums (%d)\n", len(crate.Enums)))
|
|
for _, e := range crate.Enums {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", e.Name))
|
|
}
|
|
}
|
|
|
|
if len(crate.Traits) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n## Traits (%d)\n", len(crate.Traits)))
|
|
for _, t := range crate.Traits {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", t.Name))
|
|
}
|
|
}
|
|
|
|
if len(crate.Functions) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(crate.Functions)))
|
|
for _, f := range crate.Functions {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", f.Name))
|
|
}
|
|
}
|
|
|
|
if len(crate.Macros) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n## Macros (%d)\n", len(crate.Macros)))
|
|
for _, m := range crate.Macros {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
|
}
|
|
}
|
|
|
|
return strings.Join(parts, "\n")
|
|
}
|
|
|
|
func (s *RustDocsScraper) moduleToDocument(m *rustdocs.Module, crate *rustdocs.Crate, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# Module %s::%s\n\n", crate.Name, m.Name)
|
|
|
|
if m.Doc != "" {
|
|
fmt.Fprintf(&content, "%s\n", m.Doc)
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"crate": crate.Name,
|
|
"module": m.Name,
|
|
"path": m.Path,
|
|
"is_experimental": m.IsExperimental,
|
|
"kind": "module",
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(m.DocURL),
|
|
Source: sourceName,
|
|
Type: "rust-module",
|
|
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, m.Name),
|
|
Content: content.String(),
|
|
URL: m.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *RustDocsScraper) structToDocument(st *rustdocs.Struct, crate *rustdocs.Crate, sourceName string) *Document {
|
|
content := s.buildStructContent(st, crate)
|
|
|
|
metadata := map[string]interface{}{
|
|
"crate": crate.Name,
|
|
"struct": st.Name,
|
|
"path": st.Path,
|
|
"is_experimental": st.IsExperimental,
|
|
"kind": "struct",
|
|
"declaration": st.Declaration,
|
|
}
|
|
|
|
fieldsJSON, _ := json.Marshal(st.Fields)
|
|
metadata["fields"] = string(fieldsJSON)
|
|
|
|
return &Document{
|
|
ID: generateDocID(st.DocURL),
|
|
Source: sourceName,
|
|
Type: "rust-struct",
|
|
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, st.Name),
|
|
Content: content,
|
|
URL: st.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *RustDocsScraper) buildStructContent(st *rustdocs.Struct, crate *rustdocs.Crate) string {
|
|
var parts []string
|
|
|
|
parts = append(parts, fmt.Sprintf("# struct %s::%s\n", crate.Name, st.Name))
|
|
|
|
if st.Declaration != "" {
|
|
parts = append(parts, fmt.Sprintf("```rust\n%s\n```", st.Declaration))
|
|
}
|
|
|
|
if st.Doc != "" {
|
|
parts = append(parts, "\n"+st.Doc)
|
|
}
|
|
|
|
if len(st.Fields) > 0 {
|
|
parts = append(parts, "\n### Fields\n")
|
|
for _, f := range st.Fields {
|
|
if f.Doc != "" {
|
|
parts = append(parts, fmt.Sprintf("- `%s: %s` - %s", f.Name, f.Type, f.Doc))
|
|
} else {
|
|
parts = append(parts, fmt.Sprintf("- `%s: %s`", f.Name, f.Type))
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(st.Methods) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n### Methods (%d)\n", len(st.Methods)))
|
|
for _, m := range st.Methods {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
|
}
|
|
}
|
|
|
|
return strings.Join(parts, "\n")
|
|
}
|
|
|
|
func (s *RustDocsScraper) enumToDocument(e *rustdocs.Enum, crate *rustdocs.Crate, sourceName string) *Document {
|
|
content := s.buildEnumContent(e, crate)
|
|
|
|
metadata := map[string]interface{}{
|
|
"crate": crate.Name,
|
|
"enum": e.Name,
|
|
"path": e.Path,
|
|
"is_experimental": e.IsExperimental,
|
|
"kind": "enum",
|
|
"declaration": e.Declaration,
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(e.DocURL),
|
|
Source: sourceName,
|
|
Type: "rust-enum",
|
|
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, e.Name),
|
|
Content: content,
|
|
URL: e.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *RustDocsScraper) buildEnumContent(e *rustdocs.Enum, crate *rustdocs.Crate) string {
|
|
var parts []string
|
|
|
|
parts = append(parts, fmt.Sprintf("# enum %s::%s\n", crate.Name, e.Name))
|
|
|
|
if e.Declaration != "" {
|
|
parts = append(parts, fmt.Sprintf("```rust\n%s\n```", e.Declaration))
|
|
}
|
|
|
|
if e.Doc != "" {
|
|
parts = append(parts, "\n"+e.Doc)
|
|
}
|
|
|
|
if len(e.Variants) > 0 {
|
|
parts = append(parts, "\n### Variants\n")
|
|
for _, v := range e.Variants {
|
|
if v.Doc != "" {
|
|
parts = append(parts, fmt.Sprintf("- `%s` - %s", v.Name, v.Doc))
|
|
} else {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", v.Name))
|
|
}
|
|
}
|
|
}
|
|
|
|
return strings.Join(parts, "\n")
|
|
}
|
|
|
|
func (s *RustDocsScraper) traitToDocument(t *rustdocs.Trait, crate *rustdocs.Crate, sourceName string) *Document {
|
|
content := s.buildTraitContent(t, crate)
|
|
|
|
metadata := map[string]interface{}{
|
|
"crate": crate.Name,
|
|
"trait": t.Name,
|
|
"path": t.Path,
|
|
"is_experimental": t.IsExperimental,
|
|
"kind": "trait",
|
|
"declaration": t.Declaration,
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(t.DocURL),
|
|
Source: sourceName,
|
|
Type: "rust-trait",
|
|
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, t.Name),
|
|
Content: content,
|
|
URL: t.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *RustDocsScraper) buildTraitContent(t *rustdocs.Trait, crate *rustdocs.Crate) string {
|
|
var parts []string
|
|
|
|
parts = append(parts, fmt.Sprintf("# trait %s::%s\n", crate.Name, t.Name))
|
|
|
|
if t.Declaration != "" {
|
|
parts = append(parts, fmt.Sprintf("```rust\n%s\n```", t.Declaration))
|
|
}
|
|
|
|
if t.Doc != "" {
|
|
parts = append(parts, "\n"+t.Doc)
|
|
}
|
|
|
|
if len(t.Methods) > 0 {
|
|
parts = append(parts, fmt.Sprintf("\n### Required Methods (%d)\n", len(t.Methods)))
|
|
for _, m := range t.Methods {
|
|
parts = append(parts, fmt.Sprintf("- `%s`", m.Signature))
|
|
}
|
|
}
|
|
|
|
return strings.Join(parts, "\n")
|
|
}
|
|
|
|
func (s *RustDocsScraper) funcToDocument(f *rustdocs.Func, crate *rustdocs.Crate, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# fn %s::%s\n\n", crate.Name, f.Name)
|
|
|
|
if f.Signature != "" {
|
|
fmt.Fprintf(&content, "```rust\n%s\n```\n", f.Signature)
|
|
}
|
|
|
|
if f.Doc != "" {
|
|
fmt.Fprintf(&content, "%s\n", f.Doc)
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"crate": crate.Name,
|
|
"function": f.Name,
|
|
"path": f.Path,
|
|
"is_experimental": f.IsExperimental,
|
|
"is_unsafe": f.IsUnsafe,
|
|
"is_const": f.IsConst,
|
|
"is_async": f.IsAsync,
|
|
"kind": "fn",
|
|
"signature": f.Signature,
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(f.DocURL),
|
|
Source: sourceName,
|
|
Type: "rust-fn",
|
|
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, f.Name),
|
|
Content: content.String(),
|
|
URL: f.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *RustDocsScraper) macroToDocument(m *rustdocs.Macro, crate *rustdocs.Crate, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# macro %s::%s!\n\n", crate.Name, m.Name)
|
|
|
|
if m.Signature != "" {
|
|
fmt.Fprintf(&content, "```rust\n%s\n```\n", m.Signature)
|
|
}
|
|
|
|
if m.Doc != "" {
|
|
fmt.Fprintf(&content, "%s\n", m.Doc)
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"crate": crate.Name,
|
|
"macro": m.Name,
|
|
"path": m.Path,
|
|
"is_experimental": m.IsExperimental,
|
|
"kind": "macro",
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(m.DocURL),
|
|
Source: sourceName,
|
|
Type: "rust-macro",
|
|
Title: fmt.Sprintf("%s::%s! - Rust", crate.Name, m.Name),
|
|
Content: content.String(),
|
|
URL: m.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *RustDocsScraper) constToDocument(c *rustdocs.Const, crate *rustdocs.Crate, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# const %s::%s\n\n", crate.Name, c.Name)
|
|
|
|
if c.Type != "" {
|
|
fmt.Fprintf(&content, "Type: `%s`\n", c.Type)
|
|
}
|
|
if c.Value != "" {
|
|
fmt.Fprintf(&content, "Value: `%s`\n", c.Value)
|
|
}
|
|
|
|
if c.Doc != "" {
|
|
fmt.Fprintf(&content, "\n%s\n", c.Doc)
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"crate": crate.Name,
|
|
"const": c.Name,
|
|
"path": c.Path,
|
|
"is_experimental": c.IsExperimental,
|
|
"type": c.Type,
|
|
"value": c.Value,
|
|
"kind": "const",
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(c.DocURL),
|
|
Source: sourceName,
|
|
Type: "rust-const",
|
|
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, c.Name),
|
|
Content: content.String(),
|
|
URL: c.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *RustDocsScraper) staticToDocument(st *rustdocs.Static, crate *rustdocs.Crate, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# static %s::%s\n\n", crate.Name, st.Name)
|
|
|
|
if st.Type != "" {
|
|
fmt.Fprintf(&content, "Type: `%s`\n", st.Type)
|
|
}
|
|
if st.IsMutable {
|
|
fmt.Fprintf(&content, "Mutability: mutable\n")
|
|
}
|
|
|
|
if st.Doc != "" {
|
|
fmt.Fprintf(&content, "\n%s\n", st.Doc)
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"crate": crate.Name,
|
|
"static": st.Name,
|
|
"path": st.Path,
|
|
"is_experimental": st.IsExperimental,
|
|
"is_mutable": st.IsMutable,
|
|
"type": st.Type,
|
|
"kind": "static",
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(st.DocURL),
|
|
Source: sourceName,
|
|
Type: "rust-static",
|
|
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, st.Name),
|
|
Content: content.String(),
|
|
URL: st.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|