mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 12:33:04 +00:00
update
This commit is contained in:
+563
@@ -0,0 +1,563 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/rustdocs"
|
||||
)
|
||||
|
||||
type RustDocsScraper struct {
|
||||
config *Config
|
||||
parser *rustdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewRustDocsScraper(config *Config) *RustDocsScraper {
|
||||
return &RustDocsScraper{
|
||||
config: config,
|
||||
parser: rustdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Rust docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
crate, err := s.parser.ParseCratePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse crate: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.crateToDocument(crate, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, m := range crate.Modules {
|
||||
doc := s.moduleToDocument(m, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, st := range crate.Structs {
|
||||
doc := s.structToDocument(st, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, e := range crate.Enums {
|
||||
doc := s.enumToDocument(e, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, t := range crate.Traits {
|
||||
doc := s.traitToDocument(t, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, f := range crate.Functions {
|
||||
doc := s.funcToDocument(f, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, m := range crate.Macros {
|
||||
doc := s.macroToDocument(m, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, c := range crate.Constants {
|
||||
doc := s.constToDocument(c, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, st := range crate.Statics {
|
||||
doc := s.staticToDocument(st, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) crateToDocument(crate *rustdocs.Crate, sourceName string) *Document {
|
||||
content := s.buildCrateContent(crate)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": crate.Name,
|
||||
"version": crate.Version,
|
||||
"repository": crate.Repository,
|
||||
"license": crate.License,
|
||||
"doc_url": crate.DocURL,
|
||||
"module_count": len(crate.Modules),
|
||||
"struct_count": len(crate.Structs),
|
||||
"enum_count": len(crate.Enums),
|
||||
"trait_count": len(crate.Traits),
|
||||
"function_count": len(crate.Functions),
|
||||
"macro_count": len(crate.Macros),
|
||||
"constant_count": len(crate.Constants),
|
||||
"static_count": len(crate.Statics),
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(crate.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-crate",
|
||||
Title: fmt.Sprintf("%s - Rust", crate.Name),
|
||||
Content: content,
|
||||
URL: crate.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) buildCrateContent(crate *rustdocs.Crate) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# Crate %s\n", crate.Name))
|
||||
|
||||
if crate.Version != "" {
|
||||
parts = append(parts, fmt.Sprintf("Version: %s\n", crate.Version))
|
||||
}
|
||||
|
||||
if crate.Description != "" {
|
||||
parts = append(parts, crate.Description)
|
||||
}
|
||||
|
||||
if len(crate.Modules) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Modules (%d)\n", len(crate.Modules)))
|
||||
for _, m := range crate.Modules {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Structs) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Structs (%d)\n", len(crate.Structs)))
|
||||
for _, st := range crate.Structs {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", st.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Enums) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Enums (%d)\n", len(crate.Enums)))
|
||||
for _, e := range crate.Enums {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", e.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Traits) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Traits (%d)\n", len(crate.Traits)))
|
||||
for _, t := range crate.Traits {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", t.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Functions) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(crate.Functions)))
|
||||
for _, f := range crate.Functions {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", f.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Macros) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Macros (%d)\n", len(crate.Macros)))
|
||||
for _, m := range crate.Macros {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) moduleToDocument(m *rustdocs.Module, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Module %s::%s\n\n", crate.Name, m.Name)
|
||||
|
||||
if m.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", m.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"module": m.Name,
|
||||
"path": m.Path,
|
||||
"is_experimental": m.IsExperimental,
|
||||
"kind": "module",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(m.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-module",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, m.Name),
|
||||
Content: content.String(),
|
||||
URL: m.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) structToDocument(st *rustdocs.Struct, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
content := s.buildStructContent(st, crate)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"struct": st.Name,
|
||||
"path": st.Path,
|
||||
"is_experimental": st.IsExperimental,
|
||||
"kind": "struct",
|
||||
"declaration": st.Declaration,
|
||||
}
|
||||
|
||||
fieldsJSON, _ := json.Marshal(st.Fields)
|
||||
metadata["fields"] = string(fieldsJSON)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(st.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-struct",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, st.Name),
|
||||
Content: content,
|
||||
URL: st.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) buildStructContent(st *rustdocs.Struct, crate *rustdocs.Crate) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# struct %s::%s\n", crate.Name, st.Name))
|
||||
|
||||
if st.Declaration != "" {
|
||||
parts = append(parts, fmt.Sprintf("```rust\n%s\n```", st.Declaration))
|
||||
}
|
||||
|
||||
if st.Doc != "" {
|
||||
parts = append(parts, "\n"+st.Doc)
|
||||
}
|
||||
|
||||
if len(st.Fields) > 0 {
|
||||
parts = append(parts, "\n### Fields\n")
|
||||
for _, f := range st.Fields {
|
||||
if f.Doc != "" {
|
||||
parts = append(parts, fmt.Sprintf("- `%s: %s` - %s", f.Name, f.Type, f.Doc))
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("- `%s: %s`", f.Name, f.Type))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(st.Methods) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Methods (%d)\n", len(st.Methods)))
|
||||
for _, m := range st.Methods {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) enumToDocument(e *rustdocs.Enum, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
content := s.buildEnumContent(e, crate)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"enum": e.Name,
|
||||
"path": e.Path,
|
||||
"is_experimental": e.IsExperimental,
|
||||
"kind": "enum",
|
||||
"declaration": e.Declaration,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(e.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-enum",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, e.Name),
|
||||
Content: content,
|
||||
URL: e.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) buildEnumContent(e *rustdocs.Enum, crate *rustdocs.Crate) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# enum %s::%s\n", crate.Name, e.Name))
|
||||
|
||||
if e.Declaration != "" {
|
||||
parts = append(parts, fmt.Sprintf("```rust\n%s\n```", e.Declaration))
|
||||
}
|
||||
|
||||
if e.Doc != "" {
|
||||
parts = append(parts, "\n"+e.Doc)
|
||||
}
|
||||
|
||||
if len(e.Variants) > 0 {
|
||||
parts = append(parts, "\n### Variants\n")
|
||||
for _, v := range e.Variants {
|
||||
if v.Doc != "" {
|
||||
parts = append(parts, fmt.Sprintf("- `%s` - %s", v.Name, v.Doc))
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", v.Name))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) traitToDocument(t *rustdocs.Trait, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
content := s.buildTraitContent(t, crate)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"trait": t.Name,
|
||||
"path": t.Path,
|
||||
"is_experimental": t.IsExperimental,
|
||||
"kind": "trait",
|
||||
"declaration": t.Declaration,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(t.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-trait",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, t.Name),
|
||||
Content: content,
|
||||
URL: t.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) buildTraitContent(t *rustdocs.Trait, crate *rustdocs.Crate) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# trait %s::%s\n", crate.Name, t.Name))
|
||||
|
||||
if t.Declaration != "" {
|
||||
parts = append(parts, fmt.Sprintf("```rust\n%s\n```", t.Declaration))
|
||||
}
|
||||
|
||||
if t.Doc != "" {
|
||||
parts = append(parts, "\n"+t.Doc)
|
||||
}
|
||||
|
||||
if len(t.Methods) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Required Methods (%d)\n", len(t.Methods)))
|
||||
for _, m := range t.Methods {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Signature))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) funcToDocument(f *rustdocs.Func, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# fn %s::%s\n\n", crate.Name, f.Name)
|
||||
|
||||
if f.Signature != "" {
|
||||
fmt.Fprintf(&content, "```rust\n%s\n```\n", f.Signature)
|
||||
}
|
||||
|
||||
if f.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", f.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"function": f.Name,
|
||||
"path": f.Path,
|
||||
"is_experimental": f.IsExperimental,
|
||||
"is_unsafe": f.IsUnsafe,
|
||||
"is_const": f.IsConst,
|
||||
"is_async": f.IsAsync,
|
||||
"kind": "fn",
|
||||
"signature": f.Signature,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(f.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-fn",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, f.Name),
|
||||
Content: content.String(),
|
||||
URL: f.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) macroToDocument(m *rustdocs.Macro, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# macro %s::%s!\n\n", crate.Name, m.Name)
|
||||
|
||||
if m.Signature != "" {
|
||||
fmt.Fprintf(&content, "```rust\n%s\n```\n", m.Signature)
|
||||
}
|
||||
|
||||
if m.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", m.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"macro": m.Name,
|
||||
"path": m.Path,
|
||||
"is_experimental": m.IsExperimental,
|
||||
"kind": "macro",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(m.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-macro",
|
||||
Title: fmt.Sprintf("%s::%s! - Rust", crate.Name, m.Name),
|
||||
Content: content.String(),
|
||||
URL: m.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) constToDocument(c *rustdocs.Const, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# const %s::%s\n\n", crate.Name, c.Name)
|
||||
|
||||
if c.Type != "" {
|
||||
fmt.Fprintf(&content, "Type: `%s`\n", c.Type)
|
||||
}
|
||||
if c.Value != "" {
|
||||
fmt.Fprintf(&content, "Value: `%s`\n", c.Value)
|
||||
}
|
||||
|
||||
if c.Doc != "" {
|
||||
fmt.Fprintf(&content, "\n%s\n", c.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"const": c.Name,
|
||||
"path": c.Path,
|
||||
"is_experimental": c.IsExperimental,
|
||||
"type": c.Type,
|
||||
"value": c.Value,
|
||||
"kind": "const",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(c.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-const",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, c.Name),
|
||||
Content: content.String(),
|
||||
URL: c.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) staticToDocument(st *rustdocs.Static, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# static %s::%s\n\n", crate.Name, st.Name)
|
||||
|
||||
if st.Type != "" {
|
||||
fmt.Fprintf(&content, "Type: `%s`\n", st.Type)
|
||||
}
|
||||
if st.IsMutable {
|
||||
fmt.Fprintf(&content, "Mutability: mutable\n")
|
||||
}
|
||||
|
||||
if st.Doc != "" {
|
||||
fmt.Fprintf(&content, "\n%s\n", st.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"static": st.Name,
|
||||
"path": st.Path,
|
||||
"is_experimental": st.IsExperimental,
|
||||
"is_mutable": st.IsMutable,
|
||||
"type": st.Type,
|
||||
"kind": "static",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(st.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-static",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, st.Name),
|
||||
Content: content.String(),
|
||||
URL: st.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user