mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 12:33:04 +00:00
first commit
This commit is contained in:
@@ -0,0 +1,298 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/nuxtdocs"
|
||||
)
|
||||
|
||||
type NuxtDocsScraper struct {
|
||||
config *Config
|
||||
parser *nuxtdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewNuxtDocsScraper(config *Config) *NuxtDocsScraper {
|
||||
return &NuxtDocsScraper{
|
||||
config: config,
|
||||
parser: nuxtdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Nuxt docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
ref, err := s.parser.ParseReferencePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse reference: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.referenceToDocument(ref, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, comp := range ref.Components {
|
||||
doc := s.componentToDocument(comp, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, comp := range ref.Composables {
|
||||
doc := s.composableToDocument(comp, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, util := range ref.Utilities {
|
||||
doc := s.utilityToDocument(util, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, cfg := range ref.Configs {
|
||||
doc := s.configToDocument(cfg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, cmd := range ref.Commands {
|
||||
doc := s.commandToDocument(cmd, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) referenceToDocument(ref *nuxtdocs.Reference, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Nuxt API Reference\n\n")
|
||||
fmt.Fprintf(&content, "Components: %d, Composables: %d, Utilities: %d, Configs: %d, Commands: %d\n",
|
||||
len(ref.Components), len(ref.Composables), len(ref.Utilities), len(ref.Configs), len(ref.Commands))
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(ref.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-reference",
|
||||
Title: "Nuxt API Reference",
|
||||
Content: content.String(),
|
||||
URL: ref.DocURL,
|
||||
Metadata: map[string]interface{}{"doc_type": "nuxt-reference"},
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) componentToDocument(comp *nuxtdocs.Component, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# <%s />\n\n", comp.Name)
|
||||
fmt.Fprintf(&content, "%s\n", comp.Doc)
|
||||
|
||||
if len(comp.Props) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Props\n")
|
||||
for _, p := range comp.Props {
|
||||
fmt.Fprintf(&content, "- `%s: %s`\n", p.Name, p.Type)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": comp.Name,
|
||||
"category": comp.Category,
|
||||
"doc_url": comp.DocURL,
|
||||
"doc_type": "nuxt-component",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(comp.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-component",
|
||||
Title: comp.Name,
|
||||
Content: content.String(),
|
||||
URL: comp.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) composableToDocument(comp *nuxtdocs.Composable, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", comp.Name)
|
||||
if comp.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", comp.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", comp.Doc)
|
||||
|
||||
if comp.Returns != "" {
|
||||
fmt.Fprintf(&content, "\n**Returns:** `%s`\n", comp.Returns)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": comp.Name,
|
||||
"category": comp.Category,
|
||||
"doc_url": comp.DocURL,
|
||||
"doc_type": "nuxt-composable",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(comp.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-composable",
|
||||
Title: comp.Name,
|
||||
Content: content.String(),
|
||||
URL: comp.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) utilityToDocument(util *nuxtdocs.Utility, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", util.Name)
|
||||
if util.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", util.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", util.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": util.Name,
|
||||
"doc_url": util.DocURL,
|
||||
"doc_type": "nuxt-utility",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(util.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-utility",
|
||||
Title: util.Name,
|
||||
Content: content.String(),
|
||||
URL: util.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) configToDocument(cfg *nuxtdocs.Config, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", cfg.Name)
|
||||
if cfg.Type != "" {
|
||||
fmt.Fprintf(&content, "Type: `%s`\n\n", cfg.Type)
|
||||
}
|
||||
if cfg.Default != "" {
|
||||
fmt.Fprintf(&content, "Default: `%s`\n\n", cfg.Default)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", cfg.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": cfg.Name,
|
||||
"type": cfg.Type,
|
||||
"default": cfg.Default,
|
||||
"category": cfg.Category,
|
||||
"doc_url": cfg.DocURL,
|
||||
"doc_type": "nuxt-config",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(cfg.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-config",
|
||||
Title: cfg.Name,
|
||||
Content: content.String(),
|
||||
URL: cfg.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) commandToDocument(cmd *nuxtdocs.Command, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", cmd.Name)
|
||||
if cmd.Usage != "" {
|
||||
fmt.Fprintf(&content, "```\n%s\n```\n\n", cmd.Usage)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", cmd.Doc)
|
||||
|
||||
if len(cmd.Flags) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Flags\n")
|
||||
for _, f := range cmd.Flags {
|
||||
fmt.Fprintf(&content, "- `--%s`: %s\n", f.Name, f.Doc)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": cmd.Name,
|
||||
"usage": cmd.Usage,
|
||||
"doc_url": cmd.DocURL,
|
||||
"doc_type": "nuxt-command",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(cmd.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-command",
|
||||
Title: cmd.Name,
|
||||
Content: content.String(),
|
||||
URL: cmd.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user