Files
Devour/internal/scraper/mcpdocs.go
Tomas Dvorak 55885a0e8f first commit
2026-02-22 10:42:17 +01:00

223 lines
5.6 KiB
Go

package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"strings"
"time"
"github.com/yourorg/devour/pkg/mcpdocs"
)
// MCPDocsScraper fetches an MCP server documentation page over HTTP and
// converts the parsed server, its tools, resources and prompts into Documents.
type MCPDocsScraper struct {
// config holds the scraper settings; this file reads Timeout and UserAgent from it.
config *Config
// parser turns the fetched page HTML into an mcpdocs.Server.
parser *mcpdocs.Parser
// client is the HTTP client used for every page fetch.
client *http.Client
}
// NewMCPDocsScraper builds a scraper around the given configuration.
//
// A fresh mcpdocs parser is created, along with a dedicated HTTP client whose
// Timeout is taken from config.Timeout. config must be non-nil.
func NewMCPDocsScraper(config *Config) *MCPDocsScraper {
	scraper := new(MCPDocsScraper)
	scraper.config = config
	scraper.parser = mcpdocs.NewParser()
	scraper.client = &http.Client{Timeout: config.Timeout}
	return scraper
}
// Scrape downloads the page at source.URL, parses it as an MCP server page and
// returns one Document for the server itself followed by one Document for each
// tool, resource and prompt the parser reported, in that order.
//
// An error is returned when source is nil, when source.URL is empty, when the
// page cannot be fetched, or when the parser rejects the page. Fetch and parse
// errors are wrapped with %w, so the underlying cause stays inspectable with
// errors.Is / errors.As.
func (s *MCPDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	// Guard against a nil source so callers get an error instead of a panic.
	if source == nil {
		return nil, fmt.Errorf("source is required for MCP docs scraper")
	}
	if source.URL == "" {
		return nil, fmt.Errorf("URL is required for MCP docs scraper")
	}

	html, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", err)
	}

	server, err := s.parser.ParseServerPage(html, source.URL)
	if err != nil {
		return nil, fmt.Errorf("failed to parse MCP server page: %w", err)
	}

	// The final length is known up front: the server document plus one per
	// tool, resource and prompt. Pre-sizing avoids repeated slice growth.
	total := 1 + len(server.Tools) + len(server.Resources) + len(server.Prompts)
	documents := make([]*Document, 0, total)

	documents = append(documents, s.serverToDocument(server, source.Name))
	for _, tool := range server.Tools {
		documents = append(documents, s.toolToDocument(tool, server, source.Name))
	}
	for _, res := range server.Resources {
		documents = append(documents, s.resourceToDocument(res, server, source.Name))
	}
	for _, prompt := range server.Prompts {
		documents = append(documents, s.promptToDocument(prompt, server, source.Name))
	}
	return documents, nil
}
// DetectChanges re-fetches the page at source.URL, hashes the raw response
// body with generateHash and compares the result against lastHash.
//
// It returns whether the hash differs from lastHash, the freshly computed
// hash, and any error. On error the other results are false and "".
// As in Scrape, a nil source or empty URL is rejected before any request is
// made, and the fetch error is wrapped with %w so the cause stays inspectable.
func (s *MCPDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil || source.URL == "" {
		return false, "", fmt.Errorf("URL is required for MCP docs scraper")
	}

	html, err := s.fetchPage(ctx, source.URL)
	if err != nil {
		return false, "", fmt.Errorf("failed to fetch page: %w", err)
	}

	hash := s.generateHash(html)
	return hash != lastHash, hash, nil
}
// fetchPage issues a GET request for url, bound to ctx, and returns the whole
// response body as a string.
//
// The User-Agent header is set from s.config.UserAgent. Any status other than
// 200 OK is reported as an error of the form "HTTP <code>". Request
// construction, transport and body-read errors are returned unwrapped.
func (s *MCPDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}
	request.Header.Set("User-Agent", s.config.UserAgent)

	response, err := s.client.Do(request)
	if err != nil {
		return "", err
	}
	defer response.Body.Close()

	if status := response.StatusCode; status != http.StatusOK {
		return "", fmt.Errorf("HTTP %d", status)
	}

	payload, err := io.ReadAll(response.Body)
	if err != nil {
		return "", err
	}
	return string(payload), nil
}
// generateHash returns the SHA-256 digest of content as a lowercase
// hexadecimal string (64 characters).
func (s *MCPDocsScraper) generateHash(content string) string {
	hasher := sha256.New()
	hasher.Write([]byte(content)) // hash.Hash.Write never returns an error
	return hex.EncodeToString(hasher.Sum(nil))
}
// serverToDocument renders the overview Document for an MCP server.
//
// The content is a Markdown heading with the server name, followed by its
// description and, when the server has any tools, a "Tools (N)" section with
// one bullet per tool. The Document ID is derived from server.DocURL, Hash is
// generateHash of the rendered content, and Timestamp is the current time.
func (s *MCPDocsScraper) serverToDocument(server *mcpdocs.Server, sourceName string) *Document {
	var b strings.Builder
	fmt.Fprintf(&b, "# %s\n\n%s\n", server.Name, server.Description)
	if n := len(server.Tools); n > 0 {
		fmt.Fprintf(&b, "\n## Tools (%d)\n", n)
		for _, tool := range server.Tools {
			fmt.Fprintf(&b, "- `%s`: %s\n", tool.Name, tool.Description)
		}
	}
	text := b.String()

	return &Document{
		ID:      generateDocID(server.DocURL),
		Source:  sourceName,
		Type:    "mcp-server",
		Title:   server.Name,
		Content: text,
		URL:     server.DocURL,
		Metadata: map[string]interface{}{
			"server":   server.Name,
			"category": server.Category,
			"doc_url":  server.DocURL,
			"doc_type": "mcp-server",
		},
		Hash:      s.generateHash(text),
		Timestamp: time.Now(),
	}
}
// toolToDocument renders the Document for a single tool of an MCP server.
//
// The content is a Markdown heading with the tool name, a "Server:" line and
// the tool description. The title has the form "<server name>.<tool name>".
// The ID is derived from tool.DocURL + "#" + tool.Name.
//
// NOTE(review): resourceToDocument and promptToDocument feed generateDocID
// the same DocURL + "#" + Name shape. If generateDocID depends only on its
// input, a tool and a resource/prompt sharing both values would get the same
// ID - confirm whether that can occur.
func (s *MCPDocsScraper) toolToDocument(tool *mcpdocs.Tool, server *mcpdocs.Server, sourceName string) *Document {
	body := fmt.Sprintf("# %s\n\nServer: %s\n\n%s\n", tool.Name, server.Name, tool.Description)

	return &Document{
		ID:      generateDocID(tool.DocURL + "#" + tool.Name),
		Source:  sourceName,
		Type:    "mcp-tool",
		Title:   fmt.Sprintf("%s.%s", server.Name, tool.Name),
		Content: body,
		URL:     tool.DocURL,
		Metadata: map[string]interface{}{
			"server":   server.Name,
			"tool":     tool.Name,
			"doc_url":  tool.DocURL,
			"doc_type": "mcp-tool",
		},
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}
// resourceToDocument renders the Document for a single resource of an MCP
// server.
//
// The content is a Markdown heading with the resource name, "Server:" and
// "URI:" lines, and the resource description. The title is the bare resource
// name, and the ID is derived from res.DocURL + "#" + res.Name.
func (s *MCPDocsScraper) resourceToDocument(res *mcpdocs.Resource, server *mcpdocs.Server, sourceName string) *Document {
	body := fmt.Sprintf("# %s\n\nServer: %s\nURI: %s\n\n%s\n", res.Name, server.Name, res.URI, res.Description)

	return &Document{
		ID:      generateDocID(res.DocURL + "#" + res.Name),
		Source:  sourceName,
		Type:    "mcp-resource",
		Title:   res.Name,
		Content: body,
		URL:     res.DocURL,
		Metadata: map[string]interface{}{
			"server":   server.Name,
			"resource": res.Name,
			"uri":      res.URI,
			"doc_url":  res.DocURL,
			"doc_type": "mcp-resource",
		},
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}
// promptToDocument renders the Document for a single prompt of an MCP server.
//
// The content is a Markdown heading with the prompt name, a "Server:" line and
// the prompt description. The title is the bare prompt name, and the ID is
// derived from prompt.DocURL + "#" + prompt.Name.
func (s *MCPDocsScraper) promptToDocument(prompt *mcpdocs.Prompt, server *mcpdocs.Server, sourceName string) *Document {
	body := fmt.Sprintf("# %s\n\nServer: %s\n\n%s\n", prompt.Name, server.Name, prompt.Description)

	return &Document{
		ID:      generateDocID(prompt.DocURL + "#" + prompt.Name),
		Source:  sourceName,
		Type:    "mcp-prompt",
		Title:   prompt.Name,
		Content: body,
		URL:     prompt.DocURL,
		Metadata: map[string]interface{}{
			"server":   server.Name,
			"prompt":   prompt.Name,
			"doc_url":  prompt.DocURL,
			"doc_type": "mcp-prompt",
		},
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}