package mcpdocs

import (
	"net/url"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// Parser extracts Server records (and their tools, resources and prompts)
// from HTML documentation pages.
type Parser struct {
	// baseURL is the origin that relative links found on hub pages are
	// resolved against.
	baseURL string
}

// NewParser returns a Parser whose relative links resolve against
// https://hub.docker.com.
func NewParser() *Parser {
	return &Parser{baseURL: "https://hub.docker.com"}
}

// ParseServerPage parses the HTML of a single server documentation page.
//
// docURL is recorded on the returned Server and on every Tool, Resource and
// Prompt extracted from the page. FetchedAt is set to the time of the call.
// An error is returned only when goquery cannot build a document from html.
func (p *Parser) ParseServerPage(html string, docURL string) (*Server, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	server := &Server{
		DocURL:    docURL,
		FetchedAt: time.Now(),
	}
	server.Name = p.extractServerName(doc)
	server.Description = p.extractDescription(doc)
	server.Image = p.extractImage(doc)
	server.Category = p.extractCategory(doc)
	server.Tools = p.extractTools(doc, docURL)
	server.Resources = p.extractResources(doc, docURL)
	server.Prompts = p.extractPrompts(doc, docURL)
	return server, nil
}

// ParseHubPage parses a listing page and returns one Server per matched
// element that has a non-empty name.
//
// Elements are matched by an href containing "/mcp/server/" or by the
// .server-card / .mcp-server-item classes. DocURL is only filled in when the
// matched element itself carries an href attribute; it is resolved against
// the parser's base URL.
func (p *Parser) ParseHubPage(html string) ([]*Server, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	var servers []*Server
	doc.Find("a[href*='/mcp/server/'], .server-card, .mcp-server-item").Each(func(_ int, s *goquery.Selection) {
		name := strings.TrimSpace(s.Find("h1, h2, h3, .name, .title").First().Text())
		if name == "" {
			// Entries without a recognisable name are dropped.
			return
		}

		server := &Server{
			Name:        name,
			Description: strings.TrimSpace(s.Find(".description, p").First().Text()),
		}
		if href, exists := s.Attr("href"); exists {
			server.DocURL = resolveURL(p.baseURL, href)
		}
		servers = append(servers, server)
	})
	return servers, nil
}

// extractServerName returns the trimmed text of the first <h1>. When that is
// empty it falls back to the <title> text, cut at the first " | " separator
// if one occurs after the first character.
func (p *Parser) extractServerName(doc *goquery.Document) string {
	if heading := strings.TrimSpace(doc.Find("h1").First().Text()); heading != "" {
		return heading
	}

	title := strings.TrimSpace(doc.Find("title").First().Text())
	if idx := strings.Index(title, " | "); idx > 0 {
		title = title[:idx]
	}
	return title
}

// extractDescription returns the trimmed content of the description meta tag,
// or, when that is missing or blank, the trimmed text of the first element
// matching .description, .overview, .introduction or p:first-of-type.
func (p *Parser) extractDescription(doc *goquery.Document) string {
	meta := doc.Find("meta[name='description']").AttrOr("content", "")
	if desc := strings.TrimSpace(meta); desc != "" {
		return desc
	}

	fallback := doc.Find(".description, .overview, .introduction, p:first-of-type").First().Text()
	return strings.TrimSpace(fallback)
}

// extractImage returns the content of the og:image meta tag, or "" when the
// tag is absent.
func (p *Parser) extractImage(doc *goquery.Document) string {
	return doc.Find("meta[property='og:image']").AttrOr("content", "")
}

// extractCategory returns the trimmed text of the first .category or .tag
// element.
func (p *Parser) extractCategory(doc *goquery.Document) string {
	return strings.TrimSpace(doc.Find(".category, .tag").First().Text())
}

// extractTools collects tools from two places: list-like items under any
// h2/h3 whose text contains "Tools", and name entries found in code blocks
// whose text mentions both "tools" and "name". Every Tool is stamped with
// docURL.
func (p *Parser) extractTools(doc *goquery.Document, docURL string) []*Tool {
	var tools []*Tool

	forEachSectionItem(doc, "Tools", "li, .tool, .item", func(name, description string) {
		tools = append(tools, &Tool{Name: name, Description: description, DocURL: docURL})
	})

	doc.Find("pre code, .code-block").Each(func(_ int, code *goquery.Selection) {
		text := code.Text()
		if !strings.Contains(text, "tools") || !strings.Contains(text, "name") {
			return
		}
		for _, line := range strings.Split(text, "\n") {
			if name := toolNameFromLine(line); name != "" {
				tools = append(tools, &Tool{Name: name, DocURL: docURL})
			}
		}
	})

	return tools
}

// toolNameFromLine extracts a tool name from one line of a code block.
//
// A line qualifies when, after trimming, it starts with "name:" or contains
// the quoted key "name". The value is everything after the first colon, with
// surrounding whitespace, one trailing comma and surrounding double quotes
// removed. It returns "" when the line does not qualify or has no value.
func toolNameFromLine(line string) string {
	line = strings.TrimSpace(line)
	if !strings.HasPrefix(line, "name:") && !strings.Contains(line, `"name"`) {
		return ""
	}

	parts := strings.SplitN(line, ":", 2)
	if len(parts) < 2 {
		return ""
	}

	value := strings.TrimSpace(parts[1])
	// JSON object members are usually followed by a comma; drop it first so
	// the closing quote is at the end and can be trimmed as well.
	value = strings.TrimSpace(strings.TrimSuffix(value, ","))
	return strings.Trim(value, `"`)
}

// extractResources collects the list-like items found under any h2/h3 whose
// text contains "Resources". Every Resource is stamped with docURL.
func (p *Parser) extractResources(doc *goquery.Document, docURL string) []*Resource {
	var resources []*Resource
	forEachSectionItem(doc, "Resources", "li, .resource, .item", func(name, description string) {
		resources = append(resources, &Resource{Name: name, Description: description, DocURL: docURL})
	})
	return resources
}

// extractPrompts collects the list-like items found under any h2/h3 whose
// text contains "Prompts". Every Prompt is stamped with docURL.
func (p *Parser) extractPrompts(doc *goquery.Document, docURL string) []*Prompt {
	var prompts []*Prompt
	forEachSectionItem(doc, "Prompts", "li, .prompt, .item", func(name, description string) {
		prompts = append(prompts, &Prompt{Name: name, Description: description, DocURL: docURL})
	})
	return prompts
}

// forEachSectionItem walks every h2/h3 matched by :contains(title) and visits
// the siblings that follow it, stopping at the next h2/h3 or when siblings
// run out. Inside each sibling, every descendant matching itemSelector is
// reported through visit with its trimmed name (first code, .name or strong
// descendant) and trimmed description (first .description or p descendant).
// Items whose name is empty are skipped.
func forEachSectionItem(doc *goquery.Document, title, itemSelector string, visit func(name, description string)) {
	headingSelector := "h2:contains('" + title + "'), h3:contains('" + title + "')"
	doc.Find(headingSelector).Each(func(_ int, heading *goquery.Selection) {
		for sibling := heading.Next(); sibling.Length() > 0 && !sibling.Is("h2, h3"); sibling = sibling.Next() {
			sibling.Find(itemSelector).Each(func(_ int, item *goquery.Selection) {
				name := strings.TrimSpace(item.Find("code, .name, strong").First().Text())
				if name == "" {
					return
				}
				description := strings.TrimSpace(item.Find(".description, p").First().Text())
				visit(name, description)
			})
		}
	})
}

// resolveURL returns href as an absolute URL resolved against base.
//
// href is returned unchanged when it already has a scheme, or when either
// href or base cannot be parsed. The scheme test is done on the parsed URL
// rather than on a string prefix so that relative paths which merely start
// with "http" (for example "httpd/docs") are still resolved.
func resolveURL(base string, href string) string {
	ref, err := url.Parse(href)
	if err != nil || ref.IsAbs() {
		return href
	}

	baseURL, err := url.Parse(base)
	if err != nil {
		return href
	}
	return baseURL.ResolveReference(ref).String()
}