package scraper import ( "context" "crypto/sha256" "encoding/hex" "fmt" "io" "net/http" "strings" "time" "github.com/yourorg/devour/pkg/mcpdocs" ) type MCPDocsScraper struct { config *Config parser *mcpdocs.Parser client *http.Client } func NewMCPDocsScraper(config *Config) *MCPDocsScraper { return &MCPDocsScraper{ config: config, parser: mcpdocs.NewParser(), client: &http.Client{ Timeout: config.Timeout, }, } } func (s *MCPDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) { var documents []*Document if source.URL == "" { return nil, fmt.Errorf("URL is required for MCP docs scraper") } html, err := s.fetchPage(ctx, source.URL) if err != nil { return nil, fmt.Errorf("failed to fetch page: %w", err) } server, err := s.parser.ParseServerPage(html, source.URL) if err != nil { return nil, fmt.Errorf("failed to parse MCP server page: %w", err) } mainDoc := s.serverToDocument(server, source.Name) documents = append(documents, mainDoc) for _, tool := range server.Tools { doc := s.toolToDocument(tool, server, source.Name) documents = append(documents, doc) } for _, res := range server.Resources { doc := s.resourceToDocument(res, server, source.Name) documents = append(documents, doc) } for _, prompt := range server.Prompts { doc := s.promptToDocument(prompt, server, source.Name) documents = append(documents, doc) } return documents, nil } func (s *MCPDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) { html, err := s.fetchPage(ctx, source.URL) if err != nil { return false, "", err } hash := s.generateHash(html) changed := hash != lastHash return changed, hash, nil } func (s *MCPDocsScraper) fetchPage(ctx context.Context, url string) (string, error) { req, err := http.NewRequestWithContext(ctx, "GET", url, nil) if err != nil { return "", err } req.Header.Set("User-Agent", s.config.UserAgent) resp, err := s.client.Do(req) if err != nil { return "", err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return "", fmt.Errorf("HTTP %d", resp.StatusCode) } body, err := io.ReadAll(resp.Body) if err != nil { return "", err } return string(body), nil } func (s *MCPDocsScraper) generateHash(content string) string { hash := sha256.Sum256([]byte(content)) return hex.EncodeToString(hash[:]) } func (s *MCPDocsScraper) serverToDocument(server *mcpdocs.Server, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# %s\n\n", server.Name) fmt.Fprintf(&content, "%s\n", server.Description) if len(server.Tools) > 0 { fmt.Fprintf(&content, "\n## Tools (%d)\n", len(server.Tools)) for _, t := range server.Tools { fmt.Fprintf(&content, "- `%s`: %s\n", t.Name, t.Description) } } metadata := map[string]interface{}{ "server": server.Name, "category": server.Category, "doc_url": server.DocURL, "doc_type": "mcp-server", } return &Document{ ID: generateDocID(server.DocURL), Source: sourceName, Type: "mcp-server", Title: server.Name, Content: content.String(), URL: server.DocURL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } } func (s *MCPDocsScraper) toolToDocument(tool *mcpdocs.Tool, server *mcpdocs.Server, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# %s\n\n", tool.Name) fmt.Fprintf(&content, "Server: %s\n\n", server.Name) fmt.Fprintf(&content, "%s\n", tool.Description) metadata := map[string]interface{}{ "server": server.Name, "tool": tool.Name, "doc_url": tool.DocURL, "doc_type": "mcp-tool", } return &Document{ ID: generateDocID(tool.DocURL + "#" + tool.Name), Source: sourceName, Type: "mcp-tool", Title: fmt.Sprintf("%s.%s", server.Name, tool.Name), Content: content.String(), URL: tool.DocURL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } } func (s *MCPDocsScraper) resourceToDocument(res *mcpdocs.Resource, server *mcpdocs.Server, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# %s\n\n", res.Name) fmt.Fprintf(&content, "Server: %s\n", server.Name) fmt.Fprintf(&content, "URI: %s\n\n", res.URI) fmt.Fprintf(&content, "%s\n", res.Description) metadata := map[string]interface{}{ "server": server.Name, "resource": res.Name, "uri": res.URI, "doc_url": res.DocURL, "doc_type": "mcp-resource", } return &Document{ ID: generateDocID(res.DocURL + "#" + res.Name), Source: sourceName, Type: "mcp-resource", Title: res.Name, Content: content.String(), URL: res.DocURL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } } func (s *MCPDocsScraper) promptToDocument(prompt *mcpdocs.Prompt, server *mcpdocs.Server, sourceName string) *Document { var content strings.Builder fmt.Fprintf(&content, "# %s\n\n", prompt.Name) fmt.Fprintf(&content, "Server: %s\n\n", server.Name) fmt.Fprintf(&content, "%s\n", prompt.Description) metadata := map[string]interface{}{ "server": server.Name, "prompt": prompt.Name, "doc_url": prompt.DocURL, "doc_type": "mcp-prompt", } return &Document{ ID: generateDocID(prompt.DocURL + "#" + prompt.Name), Source: sourceName, Type: "mcp-prompt", Title: prompt.Name, Content: content.String(), URL: prompt.DocURL, Metadata: metadata, Hash: s.generateHash(content.String()), Timestamp: time.Now(), } }