This commit is contained in:
Tomas Dvorak
2026-02-24 10:33:59 +01:00
parent 409acd2e08
commit 898a3c303f
1374 changed files with 290409 additions and 29187 deletions
+45
View File
@@ -0,0 +1,45 @@
package scraper
import basescraper "github.com/yourorg/devour/internal/scraper"
// init hooks every documentation scraper implemented in this package into the
// base scraper package by calling basescraper.RegisterScraper once per source
// type. Each registered factory receives the scraper configuration and returns
// the scraper built by the matching constructor.
func init() {
	register := basescraper.RegisterScraper

	register(basescraper.SourceTypeGoDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewGoDocsScraper(cfg)
	})
	register(basescraper.SourceTypeRustDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewRustDocsScraper(cfg)
	})
	register(basescraper.SourceTypePythonDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewPythonDocsScraper(cfg)
	})
	register(basescraper.SourceTypeJavaDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewJavaDocsScraper(cfg)
	})
	register(basescraper.SourceTypeSpringDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewSpringDocsScraper(cfg)
	})
	register(basescraper.SourceTypeTSDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewTSDocsScraper(cfg)
	})
	register(basescraper.SourceTypeReactDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewReactDocsScraper(cfg)
	})
	register(basescraper.SourceTypeVueDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewVueDocsScraper(cfg)
	})
	register(basescraper.SourceTypeNuxtDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewNuxtDocsScraper(cfg)
	})
	register(basescraper.SourceTypeMCPDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewMCPDocsScraper(cfg)
	})
	register(basescraper.SourceTypeDockerDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewDockerDocsScraper(cfg)
	})
	register(basescraper.SourceTypeCloudflareDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewCloudflareDocsScraper(cfg)
	})
	register(basescraper.SourceTypeAstroDocs, func(cfg *basescraper.Config) basescraper.Scraper {
		return NewAstroDocsScraper(cfg)
	})
}
+27 -12
View File
@@ -155,16 +155,18 @@ func (s *TSDocsScraper) interfaceToDocument(iface *tsdocs.Interface, module *tsd
metadata := map[string]interface{}{
"module": module.Name,
"name": iface.Name,
"doc_url": iface.DocURL,
"doc_url": coalesceDocURL(iface.DocURL, module.DocURL),
}
docURL := coalesceDocURL(iface.DocURL, module.DocURL)
return &Document{
ID: generateDocID(iface.DocURL),
ID: generateDocID(docURL),
Source: sourceName,
Type: "ts-interface",
Title: iface.Name,
Content: content.String(),
URL: iface.DocURL,
URL: docURL,
Metadata: metadata,
Hash: s.generateHash(content.String()),
Timestamp: time.Now(),
@@ -185,16 +187,18 @@ func (s *TSDocsScraper) functionToDocument(fn *tsdocs.Function, module *tsdocs.M
"module": module.Name,
"name": fn.Name,
"return_type": fn.ReturnType,
"doc_url": fn.DocURL,
"doc_url": coalesceDocURL(fn.DocURL, module.DocURL),
}
docURL := coalesceDocURL(fn.DocURL, module.DocURL)
return &Document{
ID: generateDocID(fn.DocURL),
ID: generateDocID(docURL),
Source: sourceName,
Type: "ts-function",
Title: fn.Name,
Content: content.String(),
URL: fn.DocURL,
URL: docURL,
Metadata: metadata,
Hash: s.generateHash(content.String()),
Timestamp: time.Now(),
@@ -217,16 +221,18 @@ func (s *TSDocsScraper) classToDocument(class *tsdocs.Class, module *tsdocs.Modu
metadata := map[string]interface{}{
"module": module.Name,
"name": class.Name,
"doc_url": class.DocURL,
"doc_url": coalesceDocURL(class.DocURL, module.DocURL),
}
docURL := coalesceDocURL(class.DocURL, module.DocURL)
return &Document{
ID: generateDocID(class.DocURL),
ID: generateDocID(docURL),
Source: sourceName,
Type: "ts-class",
Title: class.Name,
Content: content.String(),
URL: class.DocURL,
URL: docURL,
Metadata: metadata,
Hash: s.generateHash(content.String()),
Timestamp: time.Now(),
@@ -244,18 +250,27 @@ func (s *TSDocsScraper) typeAliasToDocument(ta *tsdocs.TypeAlias, module *tsdocs
metadata := map[string]interface{}{
"module": module.Name,
"name": ta.Name,
"doc_url": ta.DocURL,
"doc_url": coalesceDocURL(ta.DocURL, module.DocURL),
}
docURL := coalesceDocURL(ta.DocURL, module.DocURL)
return &Document{
ID: generateDocID(ta.DocURL),
ID: generateDocID(docURL),
Source: sourceName,
Type: "ts-type",
Title: ta.Name,
Content: content.String(),
URL: ta.DocURL,
URL: docURL,
Metadata: metadata,
Hash: s.generateHash(content.String()),
Timestamp: time.Now(),
}
}
// coalesceDocURL chooses the URL to attach to a document. It returns fallback
// when primary is empty or consists solely of whitespace; otherwise it returns
// primary exactly as given (surrounding whitespace is not stripped).
func coalesceDocURL(primary, fallback string) string {
	if len(strings.TrimSpace(primary)) == 0 {
		return fallback
	}
	return primary
}
+65
View File
@@ -0,0 +1,65 @@
package scraper
import (
"testing"
"github.com/yourorg/devour/pkg/tsdocs"
)
// TestTSDocsSubDocsFallbackToModuleURL verifies that interface, function,
// class and type-alias documents built from entries with an empty DocURL fall
// back to the enclosing module's DocURL for both the document URL and the
// "doc_url" metadata entry, and that the document ID is not derived from the
// empty URL.
func TestTSDocsSubDocsFallbackToModuleURL(t *testing.T) {
	s := &TSDocsScraper{}
	module := &tsdocs.Module{
		Name:   "Module",
		DocURL: "https://www.typescriptlang.org/docs/handbook/2/basic-types.html",
	}

	// ID a document would receive if the entry's empty DocURL were hashed
	// instead of the fallback URL.
	emptyURLID := generateDocID("")

	cases := []struct {
		name    string
		build   func() *Document
		docType string
	}{
		{
			name: "interface",
			build: func() *Document {
				return s.interfaceToDocument(&tsdocs.Interface{Name: "User", DocURL: ""}, module, "ts")
			},
			docType: "ts-interface",
		},
		{
			name: "function",
			build: func() *Document {
				return s.functionToDocument(&tsdocs.Function{Name: "parse", DocURL: ""}, module, "ts")
			},
			docType: "ts-function",
		},
		{
			name: "class",
			build: func() *Document {
				return s.classToDocument(&tsdocs.Class{Name: "Service", DocURL: ""}, module, "ts")
			},
			docType: "ts-class",
		},
		{
			name: "type alias",
			build: func() *Document {
				return s.typeAliasToDocument(&tsdocs.TypeAlias{Name: "ID", Type: "string", DocURL: ""}, module, "ts")
			},
			docType: "ts-type",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			doc := tc.build()
			// Fail with a clear message rather than a nil-pointer panic.
			if doc == nil {
				t.Fatal("expected a document, got nil")
			}
			if doc.URL != module.DocURL {
				t.Fatalf("expected fallback URL %q, got %q", module.DocURL, doc.URL)
			}
			if got := doc.Metadata["doc_url"]; got != module.DocURL {
				t.Fatalf("expected metadata doc_url %q, got %#v", module.DocURL, got)
			}
			// The ID must come from the fallback URL, not the empty entry URL.
			if doc.ID == "" || doc.ID == emptyURLID {
				t.Fatalf("expected ID derived from fallback URL, got %q", doc.ID)
			}
			if doc.Type != tc.docType {
				t.Fatalf("expected doc type %q, got %q", tc.docType, doc.Type)
			}
		})
	}
}
+21
View File
@@ -0,0 +1,21 @@
package scraper
import (
"crypto/sha256"
"encoding/hex"
basescraper "github.com/yourorg/devour/internal/scraper"
)
// Aliases that make the base scraper package's core types available under this
// package's own names.
type (
	// SourceType is an alias for basescraper.SourceType.
	SourceType = basescraper.SourceType
	// Source is an alias for basescraper.Source.
	Source = basescraper.Source
	// Document is an alias for basescraper.Document.
	Document = basescraper.Document
	// Config is an alias for basescraper.Config.
	Config = basescraper.Config
)
// generateDocID derives a document identifier from urlStr: the SHA-256 digest
// of the string, truncated to its first 12 bytes and rendered as 24 lowercase
// hexadecimal characters. The same input always yields the same identifier.
func generateDocID(urlStr string) string {
	const idHexLen = 24 // 12 digest bytes, two hex characters each

	digest := sha256.Sum256([]byte(urlStr))
	return hex.EncodeToString(digest[:])[:idHexLen]
}