Files
Devour/pkg/mcpdocs/parser.go
T
Tomas Dvorak 898a3c303f update
2026-02-24 10:33:59 +01:00

199 lines
5.2 KiB
Go

package mcpdocs
import (
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/yourorg/devour/pkg/parserutil"
)
// Parser turns HTML pages into Server records; see ParseServerPage for a
// single server page and ParseHubPage for a listing page.
type Parser struct {
// baseURL is the base handed to resolveURL when ParseHubPage converts an
// entry's href into a Server.DocURL.
baseURL string
}
// NewParser returns a Parser whose base URL is https://hub.docker.com.
func NewParser() *Parser {
	parser := new(Parser)
	parser.baseURL = "https://hub.docker.com"
	return parser
}
// ParseServerPage parses the HTML of a single server page into a Server.
//
// html is the raw page markup. docURL is stored on the returned Server and is
// also passed to the tool, resource and prompt extractors. FetchedAt is set to
// the current time. The only error returned is the one goquery reports when it
// cannot build a document from html.
func (p *Parser) ParseServerPage(html string, docURL string) (*Server, error) {
	page, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	// Field expressions are evaluated in source order, so the timestamp is
	// taken before any of the extractors run.
	result := &Server{
		DocURL:      docURL,
		FetchedAt:   time.Now(),
		Name:        p.extractServerName(page),
		Description: p.extractDescription(page),
		Image:       p.extractImage(page),
		Category:    p.extractCategory(page),
		Tools:       p.extractTools(page, docURL),
		Resources:   p.extractResources(page, docURL),
		Prompts:     p.extractPrompts(page, docURL),
	}
	return result, nil
}
// ParseHubPage parses a listing page and returns one Server for every listed
// entry that has a non-empty name.
//
// An entry is any element that is a link whose href contains "/mcp/server/",
// or that carries the class server-card or mcp-server-item. For each entry the
// name comes from the first h1/h2/h3/.name/.title descendant and the
// description from the first .description or p descendant.
//
// The entry's link is taken from the entry itself when it has an href,
// otherwise from the nearest enclosing anchor, otherwise from the first anchor
// inside the entry. This lets card-style entries (which are usually not
// anchors themselves) still receive a DocURL. Because a card and the anchor
// that wraps it both match the selector, entries that resolve to a DocURL
// already emitted are skipped so each server appears once.
//
// The only error returned is the one goquery reports when it cannot build a
// document from html.
func (p *Parser) ParseHubPage(html string) ([]*Server, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	var servers []*Server
	seenURLs := make(map[string]struct{})

	doc.Find("a[href*='/mcp/server/'], .server-card, .mcp-server-item").Each(func(_ int, s *goquery.Selection) {
		name := strings.TrimSpace(s.Find("h1, h2, h3, .name, .title").First().Text())
		if name == "" {
			return
		}

		server := &Server{
			Name:        name,
			Description: strings.TrimSpace(s.Find(".description, p").First().Text()),
		}

		// Closest tests the element itself first, so an entry that is an
		// anchor keeps using its own href.
		link := s.Closest("a[href]")
		if link.Length() == 0 {
			link = s.Find("a[href]").First()
		}
		if href, exists := link.Attr("href"); exists {
			server.DocURL = resolveURL(p.baseURL, href)
		}

		// Only de-duplicate entries that actually resolved to a URL; entries
		// without one cannot be told apart reliably.
		if server.DocURL != "" {
			if _, dup := seenURLs[server.DocURL]; dup {
				return
			}
			seenURLs[server.DocURL] = struct{}{}
		}

		servers = append(servers, server)
	})

	return servers, nil
}
// extractServerName returns the server's display name.
//
// It prefers the trimmed text of the first h1. When that is empty it falls
// back to the document's title element, keeping only the part before the first
// " | " separator. The result is trimmed again after the cut so that a title
// such as "Foo  | Site" yields "Foo" rather than "Foo ".
func (p *Parser) extractServerName(doc *goquery.Document) string {
	if heading := strings.TrimSpace(doc.Find("h1").First().Text()); heading != "" {
		return heading
	}

	title := strings.TrimSpace(doc.Find("title").First().Text())
	if idx := strings.Index(title, " | "); idx > 0 {
		title = strings.TrimSpace(title[:idx])
	}
	return title
}
// extractDescription returns a short description of the page.
//
// It prefers the content attribute of the description meta tag. When that is
// missing or contains only whitespace, it falls back to the text of the first
// element matching .description, .overview, .introduction or p:first-of-type.
// The returned value is always whitespace-trimmed.
func (p *Parser) extractDescription(doc *goquery.Document) string {
	meta := strings.TrimSpace(doc.Find("meta[name='description']").AttrOr("content", ""))
	if meta != "" {
		return meta
	}

	body := doc.Find(".description, .overview, .introduction, p:first-of-type").First().Text()
	return strings.TrimSpace(body)
}
// extractImage returns the content attribute of the og:image meta tag, or the
// empty string when the tag or the attribute is absent.
func (p *Parser) extractImage(doc *goquery.Document) string {
	ogImage := doc.Find("meta[property='og:image']")
	content, found := ogImage.Attr("content")
	if !found {
		return ""
	}
	return content
}
// extractCategory returns the whitespace-trimmed text of the first element
// with class category or tag, or the empty string when there is none.
// Trimming keeps the value consistent with the other text extractors, which
// all strip the whitespace that surrounds text in the markup.
func (p *Parser) extractCategory(doc *goquery.Document) string {
	return strings.TrimSpace(doc.Find(".category, .tag").First().Text())
}
// extractTools collects the tools described on a server page.
//
// Two passes are made over doc:
//
//  1. For every h2/h3 whose text contains "Tools", the following siblings are
//     walked until the next h2/h3. Inside each sibling, every li, .tool or
//     .item element with a non-empty name (first code/.name/strong) becomes a
//     Tool; its description is the first .description or p inside the item.
//  2. Every "pre code" or .code-block element whose text mentions both
//     "tools" and "name" is scanned line by line for name declarations
//     (see toolNameFromLine). These Tools carry a name only.
//
// Names found in the second pass are added only when no Tool with that name
// has been collected yet, so a tool listed under the heading and repeated in
// an example snippet is not reported twice. Every Tool gets docURL as its
// DocURL.
func (p *Parser) extractTools(doc *goquery.Document, docURL string) []*Tool {
	var tools []*Tool
	seen := make(map[string]struct{})

	doc.Find("h2:contains('Tools'), h3:contains('Tools')").Each(func(_ int, heading *goquery.Selection) {
		for sib := heading.Next(); sib.Length() > 0 && !sib.Is("h2, h3"); sib = sib.Next() {
			sib.Find("li, .tool, .item").Each(func(_ int, item *goquery.Selection) {
				name := strings.TrimSpace(item.Find("code, .name, strong").First().Text())
				if name == "" {
					return
				}
				seen[name] = struct{}{}
				tools = append(tools, &Tool{
					Name:        name,
					Description: strings.TrimSpace(item.Find(".description, p").First().Text()),
					DocURL:      docURL,
				})
			})
		}
	})

	doc.Find("pre code, .code-block").Each(func(_ int, code *goquery.Selection) {
		text := code.Text()
		// Cheap pre-filter: only snippets that look like a tool listing.
		if !strings.Contains(text, "tools") || !strings.Contains(text, "name") {
			return
		}
		for _, line := range strings.Split(text, "\n") {
			name := toolNameFromLine(line)
			if name == "" {
				continue
			}
			if _, dup := seen[name]; dup {
				continue
			}
			seen[name] = struct{}{}
			tools = append(tools, &Tool{Name: name, DocURL: docURL})
		}
	})

	return tools
}

// toolNameFromLine returns the name declared on one line of a YAML- or
// JSON-like snippet, or "" when the line declares none.
//
// Recognised forms are a line starting with name: (optionally as a YAML list
// item, "- name: x") and a line containing a "name" key followed by a colon.
// The value is taken from after that key's colon, not the first colon on the
// line. A double-quoted value ends at its closing quote; otherwise trailing
// commas and surrounding quotes are stripped.
func toolNameFromLine(line string) string {
	line = strings.TrimSpace(line)
	line = strings.TrimSpace(strings.TrimPrefix(line, "- "))

	var value string
	switch {
	case strings.HasPrefix(line, "name:"):
		value = strings.TrimPrefix(line, "name:")
	case strings.Contains(line, `"name"`):
		rest := line[strings.Index(line, `"name"`)+len(`"name"`):]
		rest = strings.TrimSpace(rest)
		// `"name"` appearing as a value rather than a key is not a declaration.
		if !strings.HasPrefix(rest, ":") {
			return ""
		}
		value = rest[1:]
	default:
		return ""
	}

	value = strings.TrimSpace(value)
	if strings.HasPrefix(value, `"`) {
		if end := strings.Index(value[1:], `"`); end >= 0 {
			return strings.TrimSpace(value[1 : 1+end])
		}
	}
	value = strings.TrimRight(value, ", \t")
	return strings.TrimSpace(strings.Trim(value, `"'`))
}
// extractResources collects the resources listed under "Resources" headings.
//
// For every h2/h3 whose text contains "Resources", the siblings after the
// heading are visited until another h2/h3 is reached. Within each sibling,
// every li, .resource or .item element that yields a non-empty name (first
// code/.name/strong, trimmed) becomes a Resource. Its description is the
// trimmed text of the first .description or p inside the item, and its DocURL
// is docURL.
func (p *Parser) extractResources(doc *goquery.Document, docURL string) []*Resource {
	var found []*Resource

	collect := func(_ int, entry *goquery.Selection) {
		name := strings.TrimSpace(entry.Find("code, .name, strong").First().Text())
		if name == "" {
			return
		}
		found = append(found, &Resource{
			Name:        name,
			Description: strings.TrimSpace(entry.Find(".description, p").First().Text()),
			DocURL:      docURL,
		})
	}

	doc.Find("h2:contains('Resources'), h3:contains('Resources')").Each(func(_ int, heading *goquery.Selection) {
		for sib := heading.Next(); sib.Length() > 0; sib = sib.Next() {
			if sib.Is("h2, h3") {
				break
			}
			sib.Find("li, .resource, .item").Each(collect)
		}
	})

	return found
}
// extractPrompts collects the prompts listed under "Prompts" headings.
//
// For every h2/h3 whose text contains "Prompts", the siblings after the
// heading are visited until another h2/h3 is reached. Within each sibling,
// every li, .prompt or .item element that yields a non-empty name (first
// code/.name/strong, trimmed) becomes a Prompt. Its description is the trimmed
// text of the first .description or p inside the item, and its DocURL is
// docURL.
func (p *Parser) extractPrompts(doc *goquery.Document, docURL string) []*Prompt {
	var found []*Prompt

	collect := func(_ int, entry *goquery.Selection) {
		name := strings.TrimSpace(entry.Find("code, .name, strong").First().Text())
		if name == "" {
			return
		}
		found = append(found, &Prompt{
			Name:        name,
			Description: strings.TrimSpace(entry.Find(".description, p").First().Text()),
			DocURL:      docURL,
		})
	}

	doc.Find("h2:contains('Prompts'), h3:contains('Prompts')").Each(func(_ int, heading *goquery.Selection) {
		for sib := heading.Next(); sib.Length() > 0; sib = sib.Next() {
			if sib.Is("h2, h3") {
				break
			}
			sib.Find("li, .prompt, .item").Each(collect)
		}
	})

	return found
}
// resolveURL forwards base and href to parserutil.ResolveURL and returns its
// result unchanged.
// NOTE(review): presumably this resolves a possibly-relative href against
// base; confirm against parserutil.ResolveURL.
func resolveURL(base string, href string) string {
return parserutil.ResolveURL(base, href)
}