This commit is contained in:
Tomas Dvorak
2026-02-24 10:33:59 +01:00
parent 409acd2e08
commit 898a3c303f
1374 changed files with 290409 additions and 29187 deletions
+2 -16
View File
@@ -1,11 +1,11 @@
package astrodocs
import (
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/yourorg/devour/pkg/parserutil"
)
type Parser struct {
@@ -159,19 +159,5 @@ func (p *Parser) extractCodeBlocks(doc *goquery.Document) []*CodeBlock {
}
// resolveURL returns href resolved against base as an absolute URL string.
// An href starting with "http", or one that cannot be parsed (like an
// unparsable base), is returned unchanged.
// NOTE(review): this span appears to carry both the previous inline
// implementation and its one-line replacement; as shown, the final
// parserutil.ResolveURL return is unreachable — confirm which body the
// committed file keeps.
func resolveURL(base string, href string) string {
if strings.HasPrefix(href, "http") {
return href
}
baseURL, err := url.Parse(base)
if err != nil {
return href
}
hrefURL, err := url.Parse(href)
if err != nil {
return href
}
return baseURL.ResolveReference(hrefURL).String()
return parserutil.ResolveURL(base, href)
}
+84
View File
@@ -0,0 +1,84 @@
package astrodocs
import "testing"
// testAstroPageHTML is a minimal page fixture: a title and meta description,
// an h1 and an h2 with ids, and one pre block classed "language-js".
const testAstroPageHTML = `
<!DOCTYPE html>
<html>
<head>
<title>Components | Astro Docs</title>
<meta name="description" content="Build pages using Astro components.">
</head>
<body>
<main>
<h1 id="components">Components</h1>
<p>Astro components are basic HTML templates.</p>
<h2 id="props">Props</h2>
<p>Pass props to customize output.</p>
<pre class="language-js"><code>const { title } = Astro.props;</code></pre>
</main>
</body>
</html>
`
// TestParsePage feeds the Astro fixture page through ParsePage and checks the
// title, description, section count, and the presence of a js code block.
func TestParsePage(t *testing.T) {
	p := NewParser()
	got, err := p.ParsePage(testAstroPageHTML, "https://docs.astro.build/en/guides/components/")
	if err != nil {
		t.Fatalf("ParsePage failed: %v", err)
	}

	// Cases are evaluated top to bottom, so the first failing check reports.
	switch {
	case got.Title != "Components":
		t.Fatalf("unexpected title: %q", got.Title)
	case got.Description == "":
		t.Fatal("expected non-empty description")
	case len(got.Sections) < 2:
		t.Fatalf("expected at least 2 sections, got %d", len(got.Sections))
	case len(got.CodeBlocks) == 0:
		t.Fatal("expected at least one code block")
	}

	var jsSeen bool
	for _, cb := range got.CodeBlocks {
		if cb.Language != "js" {
			continue
		}
		jsSeen = true
		break
	}
	if !jsSeen {
		t.Fatalf("expected at least one js code block, got %+v", got.CodeBlocks)
	}
}
// TestParseSidebar parses a two-link nav fragment and checks that two
// sections come back, the first with an absolute docs.astro.build URL.
func TestParseSidebar(t *testing.T) {
	const navHTML = `
<nav>
<a href="/en/guides/components/">Components</a>
<a href="/en/guides/routing/">Routing</a>
</nav>`

	p := NewParser()
	got, err := p.ParseSidebar(navHTML)
	if err != nil {
		t.Fatalf("ParseSidebar failed: %v", err)
	}

	// The length case comes first so got[0] is only read when it exists.
	switch {
	case len(got) != 2:
		t.Fatalf("expected 2 sections, got %d", len(got))
	case got[0].DocURL != "https://docs.astro.build/en/guides/components/":
		t.Fatalf("unexpected resolved URL: %q", got[0].DocURL)
	}
}
// TestResolveURL checks that a root-relative path is joined onto the
// docs.astro.build host.
func TestResolveURL(t *testing.T) {
	const want = "https://docs.astro.build/en/guides/components/"

	if got := resolveURL("https://docs.astro.build", "/en/guides/components/"); got != want {
		t.Fatalf("resolveURL returned %q", got)
	}
}
+16 -22
View File
@@ -1,11 +1,11 @@
package cloudflaredocs
import (
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/yourorg/devour/pkg/parserutil"
)
type Parser struct {
@@ -110,16 +110,24 @@ func (p *Parser) extractContent(doc *goquery.Document) string {
func (p *Parser) extractSections(doc *goquery.Document, docURL string) []*Section {
var sections []*Section
doc.Find("h1, h2, h3").Each(func(_ int, s *goquery.Selection) {
doc.Find("h1, h2, h3, h4").Each(func(_ int, s *goquery.Selection) {
section := &Section{}
section.Title = strings.TrimSpace(s.Text())
section.ID = strings.TrimSpace(s.AttrOr("id", ""))
if id, exists := s.Attr("id"); exists {
section.ID = id
section.DocURL = docURL + "#" + id
} else {
section.DocURL = docURL
if section.ID == "" {
section.ID = strings.TrimSpace(s.AttrOr("data-anchor", ""))
}
if section.ID == "" {
if href, exists := s.Find("a[href^='#']").First().Attr("href"); exists {
section.ID = strings.TrimPrefix(strings.TrimSpace(href), "#")
}
}
section.DocURL = docURL
if section.ID != "" {
section.DocURL = docURL + "#" + section.ID
}
if section.Title != "" {
@@ -189,19 +197,5 @@ func (p *Parser) extractAPIs(doc *goquery.Document, docURL string) []*API {
}
// resolveURL returns href resolved against base as an absolute URL string.
// An href starting with "http", or one that cannot be parsed (like an
// unparsable base), is returned unchanged.
// NOTE(review): this span appears to carry both the previous inline
// implementation and its one-line replacement; as shown, the final
// parserutil.ResolveURL return is unreachable — confirm which body the
// committed file keeps.
func resolveURL(base string, href string) string {
if strings.HasPrefix(href, "http") {
return href
}
baseURL, err := url.Parse(base)
if err != nil {
return href
}
hrefURL, err := url.Parse(href)
if err != nil {
return href
}
return baseURL.ResolveReference(hrefURL).String()
return parserutil.ResolveURL(base, href)
}
+88
View File
@@ -0,0 +1,88 @@
package cloudflaredocs
import (
"strings"
"testing"
)
// testCloudflarePageHTML is a minimal page fixture: a title and meta
// description, a "product-name" div, an h1 and an h2 with ids, and one code
// element classed "language-http" holding a GET request line.
const testCloudflarePageHTML = `
<!DOCTYPE html>
<html>
<head>
<title>Workers API | Cloudflare Docs</title>
<meta name="description" content="Build and deploy serverless apps.">
</head>
<body>
<div class="product-name">Workers</div>
<main>
<h1 id="workers-api">Workers API</h1>
<p>Cloudflare Workers lets you run JavaScript at the edge.</p>
<h2 id="endpoints">Endpoints</h2>
<pre><code class="language-http">GET /client/v4/accounts/{id}/workers/scripts</code></pre>
</main>
</body>
</html>
`
// TestParsePage feeds the Cloudflare fixture page through ParsePage and
// checks the title, product, description, section and code block counts, and
// the method and endpoint of the first parsed API.
func TestParsePage(t *testing.T) {
	p := NewParser()
	got, err := p.ParsePage(testCloudflarePageHTML, "https://developers.cloudflare.com/workers/api/")
	if err != nil {
		t.Fatalf("ParsePage failed: %v", err)
	}

	// Cases are evaluated top to bottom, so the first failing check reports
	// and got.APIs[0] is only read once the slice is known to be non-empty.
	switch {
	case got.Title != "Workers API":
		t.Fatalf("unexpected title: %q", got.Title)
	case got.Product != "Workers":
		t.Fatalf("unexpected product: %q", got.Product)
	case got.Description == "":
		t.Fatal("expected non-empty description")
	case len(got.Sections) < 2:
		t.Fatalf("expected at least 2 sections, got %d", len(got.Sections))
	case len(got.CodeBlocks) == 0:
		t.Fatal("expected at least one code block")
	case len(got.APIs) == 0:
		t.Fatal("expected at least one parsed API endpoint")
	case got.APIs[0].Method != "GET":
		t.Fatalf("expected API method GET, got %q", got.APIs[0].Method)
	case !strings.HasPrefix(got.APIs[0].Endpoint, "/client/v4/"):
		t.Fatalf("unexpected endpoint: %q", got.APIs[0].Endpoint)
	}
}
// TestParseSidebar parses a two-link sidebar fragment and checks that two
// sections come back, the first with an absolute developers.cloudflare.com URL.
func TestParseSidebar(t *testing.T) {
	const sidebarHTML = `
<div class="sidebar">
<a href="/workers/">Workers</a>
<a href="/dns/">DNS</a>
</div>`

	p := NewParser()
	got, err := p.ParseSidebar(sidebarHTML)
	if err != nil {
		t.Fatalf("ParseSidebar failed: %v", err)
	}

	// The length case comes first so got[0] is only read when it exists.
	switch {
	case len(got) != 2:
		t.Fatalf("expected 2 sections, got %d", len(got))
	case got[0].DocURL != "https://developers.cloudflare.com/workers/":
		t.Fatalf("unexpected resolved URL: %q", got[0].DocURL)
	}
}
// TestResolveURL checks that a root-relative path is joined onto the
// developers.cloudflare.com host.
func TestResolveURL(t *testing.T) {
	const want = "https://developers.cloudflare.com/workers/"

	if got := resolveURL("https://developers.cloudflare.com", "/workers/"); got != want {
		t.Fatalf("resolveURL returned %q", got)
	}
}
+2 -16
View File
@@ -1,11 +1,11 @@
package dockerdocs
import (
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/yourorg/devour/pkg/parserutil"
)
type Parser struct {
@@ -182,19 +182,5 @@ func (p *Parser) extractLinks(doc *goquery.Document, baseURL string) []string {
}
// resolveURL returns href resolved against base as an absolute URL string.
// An href starting with "http", or one that cannot be parsed (like an
// unparsable base), is returned unchanged.
// NOTE(review): this span appears to carry both the previous inline
// implementation and its one-line replacement; as shown, the final
// parserutil.ResolveURL return is unreachable — confirm which body the
// committed file keeps.
func resolveURL(base string, href string) string {
if strings.HasPrefix(href, "http") {
return href
}
baseURL, err := url.Parse(base)
if err != nil {
return href
}
hrefURL, err := url.Parse(href)
if err != nil {
return href
}
return baseURL.ResolveReference(hrefURL).String()
return parserutil.ResolveURL(base, href)
}
+89
View File
@@ -0,0 +1,89 @@
package dockerdocs
import "testing"
// testDockerPageHTML is a minimal page fixture: a title and meta description,
// an h1 and an h2 with ids, one code element classed "language-bash", and two
// links (one root-relative, one absolute).
const testDockerPageHTML = `
<!DOCTYPE html>
<html>
<head>
<title>Docker Compose | Docker Docs</title>
<meta name="description" content="Define and run multi-container applications.">
</head>
<body>
<article>
<h1 id="compose">Docker Compose</h1>
<p>Compose is for defining and running multi-container Docker apps.</p>
<h2 id="quickstart">Quickstart</h2>
<p>Use docker compose up.</p>
<pre><code class="language-bash">docker compose up</code></pre>
<a href="/compose/install/">Install</a>
<a href="https://docs.docker.com/engine/">Engine</a>
</article>
</body>
</html>
`
// TestParsePage feeds the Docker fixture page through ParsePage and checks
// the title, description, section count, the presence of a bash code block,
// and that at least two links were collected.
func TestParsePage(t *testing.T) {
	p := NewParser()
	got, err := p.ParsePage(testDockerPageHTML, "https://docs.docker.com/compose/")
	if err != nil {
		t.Fatalf("ParsePage failed: %v", err)
	}

	// Cases are evaluated top to bottom, so the first failing check reports.
	switch {
	case got.Title != "Docker Compose":
		t.Fatalf("unexpected title: %q", got.Title)
	case got.Description == "":
		t.Fatal("expected non-empty description")
	case len(got.Sections) < 2:
		t.Fatalf("expected at least 2 sections, got %d", len(got.Sections))
	case len(got.CodeBlocks) == 0:
		t.Fatal("expected at least one code block")
	}

	var bashSeen bool
	for _, cb := range got.CodeBlocks {
		if cb.Language != "bash" {
			continue
		}
		bashSeen = true
		break
	}
	if !bashSeen {
		t.Fatalf("expected at least one bash code block, got %+v", got.CodeBlocks)
	}

	if n := len(got.Links); n < 2 {
		t.Fatalf("expected at least 2 links, got %d", n)
	}
}
// TestParseToc parses a two-link nav fragment and checks that two sections
// come back, the first with an absolute docs.docker.com URL.
func TestParseToc(t *testing.T) {
	const tocHTML = `
<nav>
<a href="/compose/install/">Install</a>
<a href="/compose/how-tos/">How-tos</a>
</nav>`

	p := NewParser()
	got, err := p.ParseToc(tocHTML)
	if err != nil {
		t.Fatalf("ParseToc failed: %v", err)
	}

	// The length case comes first so got[0] is only read when it exists.
	switch {
	case len(got) != 2:
		t.Fatalf("expected 2 sections, got %d", len(got))
	case got[0].DocURL != "https://docs.docker.com/compose/install/":
		t.Fatalf("unexpected resolved URL: %q", got[0].DocURL)
	}
}
// TestResolveURL checks that a root-relative path is joined onto the
// docs.docker.com host.
func TestResolveURL(t *testing.T) {
	const want = "https://docs.docker.com/compose/"

	if got := resolveURL("https://docs.docker.com", "/compose/"); got != want {
		t.Fatalf("resolveURL returned %q", got)
	}
}
+2 -16
View File
@@ -1,11 +1,11 @@
package javadocs
import (
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/yourorg/devour/pkg/parserutil"
)
type Parser struct {
@@ -351,19 +351,5 @@ func extractMethodName(sig string) string {
}
// resolveURL returns href resolved against base as an absolute URL string.
// An href starting with "http", or one that cannot be parsed (like an
// unparsable base), is returned unchanged.
// NOTE(review): this span appears to carry both the previous inline
// implementation and its one-line replacement; as shown, the final
// parserutil.ResolveURL return is unreachable — confirm which body the
// committed file keeps.
func resolveURL(base string, href string) string {
if strings.HasPrefix(href, "http") {
return href
}
baseURL, err := url.Parse(base)
if err != nil {
return href
}
hrefURL, err := url.Parse(href)
if err != nil {
return href
}
return baseURL.ResolveReference(hrefURL).String()
return parserutil.ResolveURL(base, href)
}
+2 -16
View File
@@ -1,11 +1,11 @@
package mcpdocs
import (
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/yourorg/devour/pkg/parserutil"
)
type Parser struct {
@@ -194,19 +194,5 @@ func (p *Parser) extractPrompts(doc *goquery.Document, docURL string) []*Prompt
}
// resolveURL returns href resolved against base as an absolute URL string.
// An href starting with "http", or one that cannot be parsed (like an
// unparsable base), is returned unchanged.
// NOTE(review): this span appears to carry both the previous inline
// implementation and its one-line replacement; as shown, the final
// parserutil.ResolveURL return is unreachable — confirm which body the
// committed file keeps.
func resolveURL(base string, href string) string {
if strings.HasPrefix(href, "http") {
return href
}
baseURL, err := url.Parse(base)
if err != nil {
return href
}
hrefURL, err := url.Parse(href)
if err != nil {
return href
}
return baseURL.ResolveReference(hrefURL).String()
return parserutil.ResolveURL(base, href)
}
+102
View File
@@ -0,0 +1,102 @@
package mcpdocs
import (
"strings"
"testing"
)
// testMCPServerPageHTML is a minimal server page fixture: a title, meta
// description and og:image, a "category" span, and three h2-headed lists
// (Tools with two items, Resources with one, Prompts with one).
const testMCPServerPageHTML = `
<!DOCTYPE html>
<html>
<head>
<title>GitHub MCP Server | Docker Hub</title>
<meta name="description" content="MCP server for GitHub automation.">
<meta property="og:image" content="https://hub.docker.com/image.png">
</head>
<body>
<span class="category">developer-tools</span>
<main>
<h1>GitHub MCP Server</h1>
<h2>Tools</h2>
<ul>
<li><code>search_repos</code><p>Search repositories.</p></li>
<li><code>open_pr</code><p>Create pull requests.</p></li>
</ul>
<h2>Resources</h2>
<ul>
<li><code>repo_readme</code><p>Repository readme content.</p></li>
</ul>
<h2>Prompts</h2>
<ul>
<li><code>summarize_pr</code><p>Summarize pull requests.</p></li>
</ul>
</main>
</body>
</html>
`
// TestParseServerPage feeds the MCP server fixture through ParseServerPage
// and checks the name, description, category, and the exact number of tools,
// resources, and prompts.
func TestParseServerPage(t *testing.T) {
	p := NewParser()
	got, err := p.ParseServerPage(testMCPServerPageHTML, "https://hub.docker.com/mcp/server/github")
	if err != nil {
		t.Fatalf("ParseServerPage failed: %v", err)
	}

	// Cases are evaluated top to bottom, so the first failing check reports.
	switch {
	case got.Name != "GitHub MCP Server":
		t.Fatalf("unexpected server name: %q", got.Name)
	case got.Description == "":
		t.Fatal("expected non-empty description")
	case strings.TrimSpace(got.Category) != "developer-tools":
		t.Fatalf("unexpected category: %q", got.Category)
	case len(got.Tools) != 2:
		t.Fatalf("expected 2 tools, got %d", len(got.Tools))
	case len(got.Resources) != 1:
		t.Fatalf("expected 1 resource, got %d", len(got.Resources))
	case len(got.Prompts) != 1:
		t.Fatalf("expected 1 prompt, got %d", len(got.Prompts))
	}
}
// TestParseHubPage parses a hub fragment with two server cards and checks
// that two servers come back, the first with an absolute hub.docker.com URL.
func TestParseHubPage(t *testing.T) {
	const hubHTML = `
<div>
<a href="/mcp/server/github">
<h3>GitHub MCP Server</h3>
<p>MCP server for GitHub automation.</p>
</a>
<a href="/mcp/server/slack">
<h3>Slack MCP Server</h3>
<p>MCP server for Slack workflows.</p>
</a>
</div>`

	p := NewParser()
	got, err := p.ParseHubPage(hubHTML)
	if err != nil {
		t.Fatalf("ParseHubPage failed: %v", err)
	}

	// The length case comes first so got[0] is only read when it exists.
	switch {
	case len(got) != 2:
		t.Fatalf("expected 2 servers, got %d", len(got))
	case got[0].DocURL != "https://hub.docker.com/mcp/server/github":
		t.Fatalf("unexpected resolved URL: %q", got[0].DocURL)
	}
}
// TestResolveURL checks that a root-relative path is joined onto the
// hub.docker.com host.
func TestResolveURL(t *testing.T) {
	const want = "https://hub.docker.com/mcp/server/github"

	if got := resolveURL("https://hub.docker.com", "/mcp/server/github"); got != want {
		t.Fatalf("resolveURL returned %q", got)
	}
}
+2 -16
View File
@@ -1,11 +1,11 @@
package nuxtdocs
import (
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/yourorg/devour/pkg/parserutil"
)
type Parser struct {
@@ -248,19 +248,5 @@ func (p *Parser) extractCommands(doc *goquery.Document, docURL string) []*Comman
}
// resolveURL returns href resolved against base as an absolute URL string.
// An href starting with "http", or one that cannot be parsed (like an
// unparsable base), is returned unchanged.
// NOTE(review): this span appears to carry both the previous inline
// implementation and its one-line replacement; as shown, the final
// parserutil.ResolveURL return is unreachable — confirm which body the
// committed file keeps.
func resolveURL(base string, href string) string {
if strings.HasPrefix(href, "http") {
return href
}
baseURL, err := url.Parse(base)
if err != nil {
return href
}
hrefURL, err := url.Parse(href)
if err != nil {
return href
}
return baseURL.ResolveReference(hrefURL).String()
return parserutil.ResolveURL(base, href)
}
+25
View File
@@ -0,0 +1,25 @@
package parserutil
import (
"net/url"
"strings"
)
// ResolveURL converts relative href values into absolute URLs.
//
// An href that already begins with "http://" or "https://" is returned
// unchanged. Any other href is parsed and resolved against base with
// (*url.URL).ResolveReference. If either base or href cannot be parsed, href
// is returned as-is.
func ResolveURL(base string, href string) string {
	// Match the full scheme prefix rather than a bare "http": relative paths
	// such as "httpd/config.html" or "http2.html" must still be resolved
	// against base instead of being mistaken for absolute URLs.
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}

	baseURL, err := url.Parse(base)
	if err != nil {
		return href
	}

	hrefURL, err := url.Parse(href)
	if err != nil {
		return href
	}

	return baseURL.ResolveReference(hrefURL).String()
}
+2 -15
View File
@@ -7,6 +7,7 @@ import (
"time"
"github.com/PuerkitoBio/goquery"
"github.com/yourorg/devour/pkg/parserutil"
)
type Parser struct {
@@ -537,19 +538,5 @@ func extractPathFromURL(href string) string {
}
// resolveURL returns href resolved against base as an absolute URL string.
// An href starting with "http", or one that cannot be parsed (like an
// unparsable base), is returned unchanged.
// NOTE(review): this span appears to carry both the previous inline
// implementation and its one-line replacement; as shown, the final
// parserutil.ResolveURL return is unreachable — confirm which body the
// committed file keeps.
func resolveURL(base string, href string) string {
if strings.HasPrefix(href, "http") {
return href
}
baseURL, err := url.Parse(base)
if err != nil {
return href
}
hrefURL, err := url.Parse(href)
if err != nil {
return href
}
return baseURL.ResolveReference(hrefURL).String()
return parserutil.ResolveURL(base, href)
}
+2 -16
View File
@@ -1,11 +1,11 @@
package reactdocs
import (
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/yourorg/devour/pkg/parserutil"
)
type Parser struct {
@@ -268,19 +268,5 @@ func (p *Parser) ParseHookPage(html string, docURL string) (*Hook, error) {
}
// resolveURL returns href resolved against base as an absolute URL string.
// An href starting with "http", or one that cannot be parsed (like an
// unparsable base), is returned unchanged.
// NOTE(review): this span appears to carry both the previous inline
// implementation and its one-line replacement; as shown, the final
// parserutil.ResolveURL return is unreachable — confirm which body the
// committed file keeps.
func resolveURL(base string, href string) string {
if strings.HasPrefix(href, "http") {
return href
}
baseURL, err := url.Parse(base)
if err != nil {
return href
}
hrefURL, err := url.Parse(href)
if err != nil {
return href
}
return baseURL.ResolveReference(hrefURL).String()
return parserutil.ResolveURL(base, href)
}
+2 -15
View File
@@ -7,6 +7,7 @@ import (
"time"
"github.com/PuerkitoBio/goquery"
"github.com/yourorg/devour/pkg/parserutil"
)
type Parser struct {
@@ -609,21 +610,7 @@ func extractKindFromClasses(classes string) string {
}
// resolveURL returns href resolved against base as an absolute URL string.
// An href starting with "http", or one that cannot be parsed (like an
// unparsable base), is returned unchanged.
// NOTE(review): this span appears to carry both the previous inline
// implementation and its one-line replacement; as shown, the final
// parserutil.ResolveURL return is unreachable — confirm which body the
// committed file keeps.
func resolveURL(base string, href string) string {
if strings.HasPrefix(href, "http") {
return href
}
baseURL, err := url.Parse(base)
if err != nil {
return href
}
hrefURL, err := url.Parse(href)
if err != nil {
return href
}
return baseURL.ResolveReference(hrefURL).String()
return parserutil.ResolveURL(base, href)
}
func cleanText(text string) string {
+2 -16
View File
@@ -1,11 +1,11 @@
package springdocs
import (
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/yourorg/devour/pkg/parserutil"
)
type Parser struct {
@@ -292,19 +292,5 @@ func extractQualifiedName(href string) string {
}
// resolveURL returns href resolved against base as an absolute URL string.
// An href starting with "http", or one that cannot be parsed (like an
// unparsable base), is returned unchanged.
// NOTE(review): this span appears to carry both the previous inline
// implementation and its one-line replacement; as shown, the final
// parserutil.ResolveURL return is unreachable — confirm which body the
// committed file keeps.
func resolveURL(base string, href string) string {
if strings.HasPrefix(href, "http") {
return href
}
baseURL, err := url.Parse(base)
if err != nil {
return href
}
hrefURL, err := url.Parse(href)
if err != nil {
return href
}
return baseURL.ResolveReference(hrefURL).String()
return parserutil.ResolveURL(base, href)
}
+2 -16
View File
@@ -1,11 +1,11 @@
package tsdocs
import (
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/yourorg/devour/pkg/parserutil"
)
type Parser struct {
@@ -289,19 +289,5 @@ func (p *Parser) extractVariables(doc *goquery.Document, moduleName string, docU
}
// resolveURL returns href resolved against base as an absolute URL string.
// An href starting with "http", or one that cannot be parsed (like an
// unparsable base), is returned unchanged.
// NOTE(review): this span appears to carry both the previous inline
// implementation and its one-line replacement; as shown, the final
// parserutil.ResolveURL return is unreachable — confirm which body the
// committed file keeps.
func resolveURL(base string, href string) string {
if strings.HasPrefix(href, "http") {
return href
}
baseURL, err := url.Parse(base)
if err != nil {
return href
}
hrefURL, err := url.Parse(href)
if err != nil {
return href
}
return baseURL.ResolveReference(hrefURL).String()
return parserutil.ResolveURL(base, href)
}
+12 -9
View File
@@ -38,15 +38,18 @@ type SearchResult struct {
// Source represents a documentation source.
//
// Each field maps to the YAML key named in its struct tag. Every key except
// name and type carries omitempty, so it is left out when the field is empty.
//
// NOTE(review): Query, ResultLimit and Domains look like settings for
// search-backed sources — confirm against the code that consumes them.
type Source struct {
	Name        string   `yaml:"name"`
	Type        string   `yaml:"type"`
	URL         string   `yaml:"url,omitempty"`
	Query       string   `yaml:"query,omitempty"`
	ResultLimit int      `yaml:"result_limit,omitempty"`
	Domains     []string `yaml:"domains,omitempty"`
	Repo        string   `yaml:"repo,omitempty"`
	Branch      string   `yaml:"branch,omitempty"`
	Path        string   `yaml:"path,omitempty"`
	Include     []string `yaml:"include,omitempty"`
	Exclude     []string `yaml:"exclude,omitempty"`
	Schedule    string   `yaml:"schedule,omitempty"`
}
// Status represents index status.
+2 -16
View File
@@ -1,11 +1,11 @@
package vuedocs
import (
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/yourorg/devour/pkg/parserutil"
)
type Parser struct {
@@ -315,19 +315,5 @@ func (p *Parser) extractSpecialAttrs(doc *goquery.Document, docURL string) []*Sp
}
// resolveURL returns href resolved against base as an absolute URL string.
// An href starting with "http", or one that cannot be parsed (like an
// unparsable base), is returned unchanged.
// NOTE(review): this span appears to carry both the previous inline
// implementation and its one-line replacement; as shown, the final
// parserutil.ResolveURL return is unreachable — confirm which body the
// committed file keeps.
func resolveURL(base string, href string) string {
if strings.HasPrefix(href, "http") {
return href
}
baseURL, err := url.Parse(base)
if err != nil {
return href
}
hrefURL, err := url.Parse(href)
if err != nil {
return href
}
return baseURL.ResolveReference(hrefURL).String()
return parserutil.ResolveURL(base, href)
}