mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
+156
@@ -0,0 +1,156 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/astrodocs"
|
||||
)
|
||||
|
||||
type AstroDocsScraper struct {
|
||||
config *Config
|
||||
parser *astrodocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewAstroDocsScraper(config *Config) *AstroDocsScraper {
|
||||
return &AstroDocsScraper{
|
||||
config: config,
|
||||
parser: astrodocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Astro docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
page, err := s.parser.ParsePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse Astro docs page: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.pageToDocument(page, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, section := range page.Sections {
|
||||
doc := s.sectionToDocument(section, page, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) pageToDocument(page *astrodocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", page.Title)
|
||||
fmt.Fprintf(&content, "%s\n", page.Description)
|
||||
|
||||
if len(page.CodeBlocks) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Code Examples\n")
|
||||
for _, cb := range page.CodeBlocks {
|
||||
fmt.Fprintf(&content, "\n```%s\n%s\n```\n", cb.Language, cb.Code)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"title": page.Title,
|
||||
"doc_url": page.URL,
|
||||
"doc_type": "astro-docs",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(page.URL),
|
||||
Source: sourceName,
|
||||
Type: "astro-docs",
|
||||
Title: page.Title,
|
||||
Content: content.String(),
|
||||
URL: page.URL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) sectionToDocument(section *astrodocs.Section, page *astrodocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", section.Title)
|
||||
fmt.Fprintf(&content, "%s\n", section.Content)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"page_title": page.Title,
|
||||
"section_id": section.ID,
|
||||
"doc_url": section.DocURL,
|
||||
"doc_type": "astro-section",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(section.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "astro-section",
|
||||
Title: fmt.Sprintf("%s - %s", page.Title, section.Title),
|
||||
Content: content.String(),
|
||||
URL: section.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
+193
@@ -0,0 +1,193 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/cloudflaredocs"
|
||||
)
|
||||
|
||||
type CloudflareDocsScraper struct {
|
||||
config *Config
|
||||
parser *cloudflaredocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewCloudflareDocsScraper(config *Config) *CloudflareDocsScraper {
|
||||
return &CloudflareDocsScraper{
|
||||
config: config,
|
||||
parser: cloudflaredocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Cloudflare docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
page, err := s.parser.ParsePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse Cloudflare docs page: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.pageToDocument(page, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, section := range page.Sections {
|
||||
doc := s.sectionToDocument(section, page, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, api := range page.APIs {
|
||||
doc := s.apiToDocument(api, page, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) pageToDocument(page *cloudflaredocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", page.Title)
|
||||
if page.Product != "" {
|
||||
fmt.Fprintf(&content, "Product: %s\n\n", page.Product)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", page.Description)
|
||||
|
||||
if len(page.CodeBlocks) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Code Examples\n")
|
||||
for _, cb := range page.CodeBlocks {
|
||||
fmt.Fprintf(&content, "\n```%s\n%s\n```\n", cb.Language, cb.Code)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"title": page.Title,
|
||||
"product": page.Product,
|
||||
"doc_url": page.URL,
|
||||
"doc_type": "cloudflare-docs",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(page.URL),
|
||||
Source: sourceName,
|
||||
Type: "cloudflare-docs",
|
||||
Title: page.Title,
|
||||
Content: content.String(),
|
||||
URL: page.URL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) sectionToDocument(section *cloudflaredocs.Section, page *cloudflaredocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", section.Title)
|
||||
fmt.Fprintf(&content, "%s\n", section.Content)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"page_title": page.Title,
|
||||
"product": page.Product,
|
||||
"section_id": section.ID,
|
||||
"doc_url": section.DocURL,
|
||||
"doc_type": "cloudflare-section",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(section.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "cloudflare-section",
|
||||
Title: fmt.Sprintf("%s - %s", page.Title, section.Title),
|
||||
Content: content.String(),
|
||||
URL: section.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) apiToDocument(api *cloudflaredocs.API, page *cloudflaredocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s %s\n\n", api.Method, api.Endpoint)
|
||||
fmt.Fprintf(&content, "%s\n", api.Description)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"page_title": page.Title,
|
||||
"product": page.Product,
|
||||
"method": api.Method,
|
||||
"endpoint": api.Endpoint,
|
||||
"doc_url": api.DocURL,
|
||||
"doc_type": "cloudflare-api",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(api.DocURL + "#" + api.Endpoint),
|
||||
Source: sourceName,
|
||||
Type: "cloudflare-api",
|
||||
Title: fmt.Sprintf("%s %s", api.Method, api.Endpoint),
|
||||
Content: content.String(),
|
||||
URL: api.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
+156
@@ -0,0 +1,156 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/dockerdocs"
|
||||
)
|
||||
|
||||
type DockerDocsScraper struct {
|
||||
config *Config
|
||||
parser *dockerdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewDockerDocsScraper(config *Config) *DockerDocsScraper {
|
||||
return &DockerDocsScraper{
|
||||
config: config,
|
||||
parser: dockerdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Docker docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
page, err := s.parser.ParsePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse Docker docs page: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.pageToDocument(page, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, section := range page.Sections {
|
||||
doc := s.sectionToDocument(section, page, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) pageToDocument(page *dockerdocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", page.Title)
|
||||
fmt.Fprintf(&content, "%s\n", page.Description)
|
||||
|
||||
if len(page.CodeBlocks) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Code Examples\n")
|
||||
for _, cb := range page.CodeBlocks {
|
||||
fmt.Fprintf(&content, "\n```%s\n%s\n```\n", cb.Language, cb.Code)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"title": page.Title,
|
||||
"doc_url": page.URL,
|
||||
"doc_type": "docker-docs",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(page.URL),
|
||||
Source: sourceName,
|
||||
Type: "docker-docs",
|
||||
Title: page.Title,
|
||||
Content: content.String(),
|
||||
URL: page.URL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) sectionToDocument(section *dockerdocs.Section, page *dockerdocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", section.Title)
|
||||
fmt.Fprintf(&content, "%s\n", section.Content)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"page_title": page.Title,
|
||||
"section_id": section.ID,
|
||||
"doc_url": section.DocURL,
|
||||
"doc_type": "docker-section",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(section.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "docker-section",
|
||||
Title: fmt.Sprintf("%s - %s", page.Title, section.Title),
|
||||
Content: content.String(),
|
||||
URL: section.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
Vendored
+423
@@ -0,0 +1,423 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/godocs"
|
||||
)
|
||||
|
||||
type GoDocsScraper struct {
|
||||
config *Config
|
||||
parser *godocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewGoDocsScraper(config *Config) *GoDocsScraper {
|
||||
return &GoDocsScraper{
|
||||
config: config,
|
||||
parser: godocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Go docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
pkg, err := s.parser.ParsePackagePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse package: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.packageToDocument(pkg, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, fn := range pkg.Functions {
|
||||
doc := s.functionToDocument(fn, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, t := range pkg.Types {
|
||||
doc := s.typeToDocument(t, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
|
||||
for _, m := range t.Methods {
|
||||
methodDoc := s.methodToDocument(m, t, pkg, source.Name)
|
||||
documents = append(documents, methodDoc)
|
||||
}
|
||||
}
|
||||
|
||||
for _, c := range pkg.Constants {
|
||||
doc := s.constantToDocument(c, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, v := range pkg.Variables {
|
||||
doc := s.variableToDocument(v, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) packageToDocument(pkg *godocs.Package, sourceName string) *Document {
|
||||
content := s.buildPackageContent(pkg)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"import_path": pkg.ImportPath,
|
||||
"version": pkg.Version,
|
||||
"imported_by": pkg.ImportedBy,
|
||||
"repository": pkg.Repository,
|
||||
"doc_url": pkg.DocURL,
|
||||
}
|
||||
|
||||
if pkg.Module != nil {
|
||||
metadata["module_path"] = pkg.Module.Path
|
||||
metadata["module_version"] = pkg.Module.Version
|
||||
}
|
||||
|
||||
if len(pkg.Licenses) > 0 {
|
||||
var licenses []string
|
||||
for _, l := range pkg.Licenses {
|
||||
licenses = append(licenses, l.Name)
|
||||
}
|
||||
metadata["licenses"] = licenses
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(pkg.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "go-package",
|
||||
Title: fmt.Sprintf("%s - %s", pkg.Name, pkg.ImportPath),
|
||||
Content: content,
|
||||
URL: pkg.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) buildPackageContent(pkg *godocs.Package) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# Package %s\n", pkg.ImportPath))
|
||||
|
||||
if pkg.Synopsis != "" {
|
||||
parts = append(parts, pkg.Synopsis)
|
||||
}
|
||||
|
||||
if pkg.Doc != "" {
|
||||
parts = append(parts, "\n## Documentation\n")
|
||||
parts = append(parts, pkg.Doc)
|
||||
}
|
||||
|
||||
if len(pkg.Functions) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(pkg.Functions)))
|
||||
for _, fn := range pkg.Functions {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", fn.Signature))
|
||||
}
|
||||
}
|
||||
|
||||
if len(pkg.Types) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Types (%d)\n", len(pkg.Types)))
|
||||
for _, t := range pkg.Types {
|
||||
parts = append(parts, fmt.Sprintf("- `%s` (%s)", t.Name, t.Kind))
|
||||
}
|
||||
}
|
||||
|
||||
if len(pkg.Constants) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Constants (%d)\n", len(pkg.Constants)))
|
||||
}
|
||||
|
||||
if len(pkg.Variables) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Variables (%d)\n", len(pkg.Variables)))
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) functionToDocument(fn *godocs.Function, pkg *godocs.Package, sourceName string) *Document {
|
||||
content := s.buildFunctionContent(fn, pkg)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"import_path": pkg.ImportPath,
|
||||
"package": pkg.Name,
|
||||
"symbol": fn.Name,
|
||||
"signature": fn.Signature,
|
||||
"kind": "function",
|
||||
}
|
||||
|
||||
examplesJSON, _ := json.Marshal(fn.Examples)
|
||||
metadata["examples"] = string(examplesJSON)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fmt.Sprintf("%s#%s", pkg.DocURL, fn.Name)),
|
||||
Source: sourceName,
|
||||
Type: "go-function",
|
||||
Title: fmt.Sprintf("%s.%s", pkg.Name, fn.Name),
|
||||
Content: content,
|
||||
URL: fmt.Sprintf("%s#%s", pkg.DocURL, fn.Name),
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) buildFunctionContent(fn *godocs.Function, pkg *godocs.Package) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# %s.%s\n", pkg.Name, fn.Name))
|
||||
parts = append(parts, fmt.Sprintf("```\n%s\n```", fn.Signature))
|
||||
|
||||
if fn.Doc != "" {
|
||||
parts = append(parts, "\n"+fn.Doc)
|
||||
}
|
||||
|
||||
for _, ex := range fn.Examples {
|
||||
parts = append(parts, fmt.Sprintf("\n### Example: %s\n", ex.Name))
|
||||
if ex.Doc != "" {
|
||||
parts = append(parts, ex.Doc)
|
||||
}
|
||||
parts = append(parts, fmt.Sprintf("```go\n%s\n```", ex.Code))
|
||||
if ex.Output != "" {
|
||||
parts = append(parts, fmt.Sprintf("Output:\n```\n%s\n```", ex.Output))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) typeToDocument(t *godocs.Type, pkg *godocs.Package, sourceName string) *Document {
|
||||
content := s.buildTypeContent(t, pkg)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"import_path": pkg.ImportPath,
|
||||
"package": pkg.Name,
|
||||
"symbol": t.Name,
|
||||
"kind": "type",
|
||||
"type_kind": t.Kind,
|
||||
"underlying": t.Underlying,
|
||||
"method_count": len(t.Methods),
|
||||
}
|
||||
|
||||
fieldsJSON, _ := json.Marshal(t.Fields)
|
||||
metadata["fields"] = string(fieldsJSON)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fmt.Sprintf("%s#%s", pkg.DocURL, t.Name)),
|
||||
Source: sourceName,
|
||||
Type: "go-type",
|
||||
Title: fmt.Sprintf("%s.%s", pkg.Name, t.Name),
|
||||
Content: content,
|
||||
URL: fmt.Sprintf("%s#%s", pkg.DocURL, t.Name),
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) buildTypeContent(t *godocs.Type, pkg *godocs.Package) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# type %s.%s\n", pkg.Name, t.Name))
|
||||
parts = append(parts, fmt.Sprintf("```\n%s\n```", t.Underlying))
|
||||
|
||||
if t.Doc != "" {
|
||||
parts = append(parts, "\n"+t.Doc)
|
||||
}
|
||||
|
||||
if len(t.Fields) > 0 {
|
||||
parts = append(parts, "\n### Fields\n")
|
||||
for _, f := range t.Fields {
|
||||
if f.Doc != "" {
|
||||
parts = append(parts, fmt.Sprintf("- `%s %s` - %s", f.Name, f.Type, f.Doc))
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("- `%s %s`", f.Name, f.Type))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(t.Methods) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Methods (%d)\n", len(t.Methods)))
|
||||
for _, m := range t.Methods {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Signature))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) methodToDocument(m *godocs.Method, t *godocs.Type, pkg *godocs.Package, sourceName string) *Document {
|
||||
content := s.buildMethodContent(m, t, pkg)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"import_path": pkg.ImportPath,
|
||||
"package": pkg.Name,
|
||||
"type": t.Name,
|
||||
"symbol": m.Name,
|
||||
"receiver": m.Receiver,
|
||||
"signature": m.Signature,
|
||||
"kind": "method",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fmt.Sprintf("%s#%s.%s", pkg.DocURL, t.Name, m.Name)),
|
||||
Source: sourceName,
|
||||
Type: "go-method",
|
||||
Title: fmt.Sprintf("%s.%s.%s", pkg.Name, t.Name, m.Name),
|
||||
Content: content,
|
||||
URL: fmt.Sprintf("%s#%s.%s", pkg.DocURL, t.Name, m.Name),
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) buildMethodContent(m *godocs.Method, t *godocs.Type, pkg *godocs.Package) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# func (%s) %s\n", m.Receiver, m.Name))
|
||||
parts = append(parts, fmt.Sprintf("```\n%s\n```", m.Signature))
|
||||
|
||||
if m.Doc != "" {
|
||||
parts = append(parts, "\n"+m.Doc)
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) constantToDocument(c *godocs.Value, pkg *godocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Constants\n\n")
|
||||
|
||||
if c.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n\n", c.Doc)
|
||||
}
|
||||
|
||||
if len(c.Names) > 1 {
|
||||
fmt.Fprintf(&content, "```go\nconst (\n")
|
||||
for _, name := range c.Names {
|
||||
fmt.Fprintf(&content, "\t%s\n", name)
|
||||
}
|
||||
fmt.Fprintf(&content, ")\n```")
|
||||
} else {
|
||||
fmt.Fprintf(&content, "```go\nconst %s = %s\n```", c.Name, c.Value)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"import_path": pkg.ImportPath,
|
||||
"package": pkg.Name,
|
||||
"names": c.Names,
|
||||
"kind": "constant",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fmt.Sprintf("%s#const-%s", pkg.DocURL, c.Name)),
|
||||
Source: sourceName,
|
||||
Type: "go-constant",
|
||||
Title: fmt.Sprintf("%s.%s (const)", pkg.Name, c.Name),
|
||||
Content: content.String(),
|
||||
URL: pkg.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) variableToDocument(v *godocs.Value, pkg *godocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Variables\n\n")
|
||||
|
||||
if v.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n\n", v.Doc)
|
||||
}
|
||||
|
||||
fmt.Fprintf(&content, "```go\nvar %s", v.Name)
|
||||
if v.Type != "" {
|
||||
fmt.Fprintf(&content, " %s", v.Type)
|
||||
}
|
||||
if v.Value != "" {
|
||||
fmt.Fprintf(&content, " = %s", v.Value)
|
||||
}
|
||||
fmt.Fprintf(&content, "\n```")
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"import_path": pkg.ImportPath,
|
||||
"package": pkg.Name,
|
||||
"name": v.Name,
|
||||
"type": v.Type,
|
||||
"kind": "variable",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fmt.Sprintf("%s#var-%s", pkg.DocURL, v.Name)),
|
||||
Source: sourceName,
|
||||
Type: "go-variable",
|
||||
Title: fmt.Sprintf("%s.%s (var)", pkg.Name, v.Name),
|
||||
Content: content.String(),
|
||||
URL: pkg.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
+254
@@ -0,0 +1,254 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/javadocs"
|
||||
)
|
||||
|
||||
type JavaDocsScraper struct {
|
||||
config *Config
|
||||
parser *javadocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewJavaDocsScraper(config *Config) *JavaDocsScraper {
|
||||
return &JavaDocsScraper{
|
||||
config: config,
|
||||
parser: javadocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Java docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
pkg, err := s.parser.ParsePackagePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse package: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.packageToDocument(pkg, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, class := range pkg.Classes {
|
||||
doc := s.classToDocument(class, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, iface := range pkg.Interfaces {
|
||||
doc := s.interfaceToDocument(iface, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, enum := range pkg.Enums {
|
||||
doc := s.enumToDocument(enum, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, exc := range pkg.Exceptions {
|
||||
doc := s.exceptionToDocument(exc, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) packageToDocument(pkg *javadocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Package %s\n\n", pkg.Name)
|
||||
fmt.Fprintf(&content, "%s\n", pkg.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"package": pkg.Name,
|
||||
"doc_url": pkg.DocURL,
|
||||
"doc_type": "java-package",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(pkg.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "java-package",
|
||||
Title: pkg.Name,
|
||||
Content: content.String(),
|
||||
URL: pkg.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) classToDocument(class *javadocs.Class, pkg *javadocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", class.QualifiedName)
|
||||
fmt.Fprintf(&content, "Kind: %s\n\n", class.Kind)
|
||||
fmt.Fprintf(&content, "%s\n", class.Doc)
|
||||
|
||||
if len(class.Methods) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Methods\n")
|
||||
for _, m := range class.Methods {
|
||||
fmt.Fprintf(&content, "- `%s`\n", m.Signature)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"package": pkg.Name,
|
||||
"qualified_name": class.QualifiedName,
|
||||
"kind": string(class.Kind),
|
||||
"doc_url": class.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(class.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "java-class",
|
||||
Title: class.QualifiedName,
|
||||
Content: content.String(),
|
||||
URL: class.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) interfaceToDocument(iface *javadocs.Class, pkg *javadocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s (interface)\n\n", iface.QualifiedName)
|
||||
fmt.Fprintf(&content, "%s\n", iface.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"package": pkg.Name,
|
||||
"qualified_name": iface.QualifiedName,
|
||||
"kind": "interface",
|
||||
"doc_url": iface.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(iface.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "java-interface",
|
||||
Title: iface.QualifiedName,
|
||||
Content: content.String(),
|
||||
URL: iface.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) enumToDocument(enum *javadocs.Enum, pkg *javadocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s (enum)\n\n", enum.QualifiedName)
|
||||
fmt.Fprintf(&content, "%s\n", enum.Doc)
|
||||
|
||||
if len(enum.Constants) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Constants\n")
|
||||
for _, c := range enum.Constants {
|
||||
fmt.Fprintf(&content, "- `%s`\n", c.Name)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"package": pkg.Name,
|
||||
"qualified_name": enum.QualifiedName,
|
||||
"kind": "enum",
|
||||
"doc_url": enum.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(enum.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "java-enum",
|
||||
Title: enum.QualifiedName,
|
||||
Content: content.String(),
|
||||
URL: enum.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) exceptionToDocument(exc *javadocs.Class, pkg *javadocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s (exception)\n\n", exc.QualifiedName)
|
||||
fmt.Fprintf(&content, "%s\n", exc.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"package": pkg.Name,
|
||||
"qualified_name": exc.QualifiedName,
|
||||
"kind": "exception",
|
||||
"doc_url": exc.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(exc.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "java-exception",
|
||||
Title: exc.QualifiedName,
|
||||
Content: content.String(),
|
||||
URL: exc.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
Vendored
+222
@@ -0,0 +1,222 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/mcpdocs"
|
||||
)
|
||||
|
||||
type MCPDocsScraper struct {
|
||||
config *Config
|
||||
parser *mcpdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewMCPDocsScraper(config *Config) *MCPDocsScraper {
|
||||
return &MCPDocsScraper{
|
||||
config: config,
|
||||
parser: mcpdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for MCP docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
server, err := s.parser.ParseServerPage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse MCP server page: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.serverToDocument(server, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, tool := range server.Tools {
|
||||
doc := s.toolToDocument(tool, server, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, res := range server.Resources {
|
||||
doc := s.resourceToDocument(res, server, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, prompt := range server.Prompts {
|
||||
doc := s.promptToDocument(prompt, server, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) serverToDocument(server *mcpdocs.Server, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", server.Name)
|
||||
fmt.Fprintf(&content, "%s\n", server.Description)
|
||||
|
||||
if len(server.Tools) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Tools (%d)\n", len(server.Tools))
|
||||
for _, t := range server.Tools {
|
||||
fmt.Fprintf(&content, "- `%s`: %s\n", t.Name, t.Description)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"server": server.Name,
|
||||
"category": server.Category,
|
||||
"doc_url": server.DocURL,
|
||||
"doc_type": "mcp-server",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(server.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "mcp-server",
|
||||
Title: server.Name,
|
||||
Content: content.String(),
|
||||
URL: server.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) toolToDocument(tool *mcpdocs.Tool, server *mcpdocs.Server, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", tool.Name)
|
||||
fmt.Fprintf(&content, "Server: %s\n\n", server.Name)
|
||||
fmt.Fprintf(&content, "%s\n", tool.Description)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"server": server.Name,
|
||||
"tool": tool.Name,
|
||||
"doc_url": tool.DocURL,
|
||||
"doc_type": "mcp-tool",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(tool.DocURL + "#" + tool.Name),
|
||||
Source: sourceName,
|
||||
Type: "mcp-tool",
|
||||
Title: fmt.Sprintf("%s.%s", server.Name, tool.Name),
|
||||
Content: content.String(),
|
||||
URL: tool.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) resourceToDocument(res *mcpdocs.Resource, server *mcpdocs.Server, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", res.Name)
|
||||
fmt.Fprintf(&content, "Server: %s\n", server.Name)
|
||||
fmt.Fprintf(&content, "URI: %s\n\n", res.URI)
|
||||
fmt.Fprintf(&content, "%s\n", res.Description)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"server": server.Name,
|
||||
"resource": res.Name,
|
||||
"uri": res.URI,
|
||||
"doc_url": res.DocURL,
|
||||
"doc_type": "mcp-resource",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(res.DocURL + "#" + res.Name),
|
||||
Source: sourceName,
|
||||
Type: "mcp-resource",
|
||||
Title: res.Name,
|
||||
Content: content.String(),
|
||||
URL: res.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) promptToDocument(prompt *mcpdocs.Prompt, server *mcpdocs.Server, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", prompt.Name)
|
||||
fmt.Fprintf(&content, "Server: %s\n\n", server.Name)
|
||||
fmt.Fprintf(&content, "%s\n", prompt.Description)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"server": server.Name,
|
||||
"prompt": prompt.Name,
|
||||
"doc_url": prompt.DocURL,
|
||||
"doc_type": "mcp-prompt",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(prompt.DocURL + "#" + prompt.Name),
|
||||
Source: sourceName,
|
||||
Type: "mcp-prompt",
|
||||
Title: prompt.Name,
|
||||
Content: content.String(),
|
||||
URL: prompt.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
+298
@@ -0,0 +1,298 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/nuxtdocs"
|
||||
)
|
||||
|
||||
type NuxtDocsScraper struct {
|
||||
config *Config
|
||||
parser *nuxtdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewNuxtDocsScraper(config *Config) *NuxtDocsScraper {
|
||||
return &NuxtDocsScraper{
|
||||
config: config,
|
||||
parser: nuxtdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Nuxt docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
ref, err := s.parser.ParseReferencePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse reference: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.referenceToDocument(ref, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, comp := range ref.Components {
|
||||
doc := s.componentToDocument(comp, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, comp := range ref.Composables {
|
||||
doc := s.composableToDocument(comp, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, util := range ref.Utilities {
|
||||
doc := s.utilityToDocument(util, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, cfg := range ref.Configs {
|
||||
doc := s.configToDocument(cfg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, cmd := range ref.Commands {
|
||||
doc := s.commandToDocument(cmd, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) referenceToDocument(ref *nuxtdocs.Reference, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Nuxt API Reference\n\n")
|
||||
fmt.Fprintf(&content, "Components: %d, Composables: %d, Utilities: %d, Configs: %d, Commands: %d\n",
|
||||
len(ref.Components), len(ref.Composables), len(ref.Utilities), len(ref.Configs), len(ref.Commands))
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(ref.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-reference",
|
||||
Title: "Nuxt API Reference",
|
||||
Content: content.String(),
|
||||
URL: ref.DocURL,
|
||||
Metadata: map[string]interface{}{"doc_type": "nuxt-reference"},
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) componentToDocument(comp *nuxtdocs.Component, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# <%s />\n\n", comp.Name)
|
||||
fmt.Fprintf(&content, "%s\n", comp.Doc)
|
||||
|
||||
if len(comp.Props) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Props\n")
|
||||
for _, p := range comp.Props {
|
||||
fmt.Fprintf(&content, "- `%s: %s`\n", p.Name, p.Type)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": comp.Name,
|
||||
"category": comp.Category,
|
||||
"doc_url": comp.DocURL,
|
||||
"doc_type": "nuxt-component",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(comp.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-component",
|
||||
Title: comp.Name,
|
||||
Content: content.String(),
|
||||
URL: comp.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) composableToDocument(comp *nuxtdocs.Composable, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", comp.Name)
|
||||
if comp.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", comp.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", comp.Doc)
|
||||
|
||||
if comp.Returns != "" {
|
||||
fmt.Fprintf(&content, "\n**Returns:** `%s`\n", comp.Returns)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": comp.Name,
|
||||
"category": comp.Category,
|
||||
"doc_url": comp.DocURL,
|
||||
"doc_type": "nuxt-composable",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(comp.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-composable",
|
||||
Title: comp.Name,
|
||||
Content: content.String(),
|
||||
URL: comp.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) utilityToDocument(util *nuxtdocs.Utility, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", util.Name)
|
||||
if util.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", util.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", util.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": util.Name,
|
||||
"doc_url": util.DocURL,
|
||||
"doc_type": "nuxt-utility",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(util.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-utility",
|
||||
Title: util.Name,
|
||||
Content: content.String(),
|
||||
URL: util.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) configToDocument(cfg *nuxtdocs.Config, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", cfg.Name)
|
||||
if cfg.Type != "" {
|
||||
fmt.Fprintf(&content, "Type: `%s`\n\n", cfg.Type)
|
||||
}
|
||||
if cfg.Default != "" {
|
||||
fmt.Fprintf(&content, "Default: `%s`\n\n", cfg.Default)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", cfg.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": cfg.Name,
|
||||
"type": cfg.Type,
|
||||
"default": cfg.Default,
|
||||
"category": cfg.Category,
|
||||
"doc_url": cfg.DocURL,
|
||||
"doc_type": "nuxt-config",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(cfg.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-config",
|
||||
Title: cfg.Name,
|
||||
Content: content.String(),
|
||||
URL: cfg.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) commandToDocument(cmd *nuxtdocs.Command, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", cmd.Name)
|
||||
if cmd.Usage != "" {
|
||||
fmt.Fprintf(&content, "```\n%s\n```\n\n", cmd.Usage)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", cmd.Doc)
|
||||
|
||||
if len(cmd.Flags) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Flags\n")
|
||||
for _, f := range cmd.Flags {
|
||||
fmt.Fprintf(&content, "- `--%s`: %s\n", f.Name, f.Doc)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": cmd.Name,
|
||||
"usage": cmd.Usage,
|
||||
"doc_url": cmd.DocURL,
|
||||
"doc_type": "nuxt-command",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(cmd.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-command",
|
||||
Title: cmd.Name,
|
||||
Content: content.String(),
|
||||
URL: cmd.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
+463
@@ -0,0 +1,463 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/pythondocs"
|
||||
)
|
||||
|
||||
type PythonDocsScraper struct {
|
||||
config *Config
|
||||
parser *pythondocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewPythonDocsScraper(config *Config) *PythonDocsScraper {
|
||||
return &PythonDocsScraper{
|
||||
config: config,
|
||||
parser: pythondocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Python docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
module, err := s.parser.ParseModulePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse module: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.moduleToDocument(module, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, class := range module.Classes {
|
||||
doc := s.classToDocument(class, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
|
||||
for _, method := range class.Methods {
|
||||
methodDoc := s.methodToDocument(method, class, module, source.Name)
|
||||
documents = append(documents, methodDoc)
|
||||
}
|
||||
|
||||
for _, method := range class.ClassMethods {
|
||||
methodDoc := s.classMethodToDocument(method, class, module, source.Name)
|
||||
documents = append(documents, methodDoc)
|
||||
}
|
||||
|
||||
for _, attr := range class.Attributes {
|
||||
attrDoc := s.attributeToDocument(attr, class, module, source.Name)
|
||||
documents = append(documents, attrDoc)
|
||||
}
|
||||
}
|
||||
|
||||
for _, fn := range module.Functions {
|
||||
doc := s.functionToDocument(fn, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, exc := range module.Exceptions {
|
||||
doc := s.exceptionToDocument(exc, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, data := range module.Constants {
|
||||
doc := s.dataToDocument(data, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) moduleToDocument(module *pythondocs.Module, sourceName string) *Document {
|
||||
content := s.buildModuleContent(module)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": module.Name,
|
||||
"path": module.Path,
|
||||
"version": module.Version,
|
||||
"doc_url": module.DocURL,
|
||||
"class_count": len(module.Classes),
|
||||
"function_count": len(module.Functions),
|
||||
"exception_count": len(module.Exceptions),
|
||||
"data_count": len(module.Constants),
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(module.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-module",
|
||||
Title: fmt.Sprintf("%s - Python", module.Name),
|
||||
Content: content,
|
||||
URL: module.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) buildModuleContent(module *pythondocs.Module) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# Module %s\n", module.Name))
|
||||
|
||||
if module.Synopsis != "" {
|
||||
parts = append(parts, module.Synopsis)
|
||||
}
|
||||
|
||||
if module.Doc != "" {
|
||||
parts = append(parts, "\n"+module.Doc)
|
||||
}
|
||||
|
||||
if len(module.Classes) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Classes (%d)\n", len(module.Classes)))
|
||||
for _, class := range module.Classes {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", class.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(module.Functions) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(module.Functions)))
|
||||
for _, fn := range module.Functions {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", fn.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(module.Exceptions) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Exceptions (%d)\n", len(module.Exceptions)))
|
||||
for _, exc := range module.Exceptions {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", exc.Name))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) classToDocument(class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
|
||||
content := s.buildClassContent(class, module)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"class": class.Name,
|
||||
"qual_name": class.QualName,
|
||||
"bases": class.Bases,
|
||||
"method_count": len(class.Methods),
|
||||
"attribute_count": len(class.Attributes),
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(class.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-class",
|
||||
Title: fmt.Sprintf("%s.%s - Python", module.Name, class.Name),
|
||||
Content: content,
|
||||
URL: class.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) buildClassContent(class *pythondocs.Class, module *pythondocs.Module) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# class %s.%s\n", module.Name, class.Name))
|
||||
|
||||
if class.Signature != "" {
|
||||
parts = append(parts, fmt.Sprintf("```python\n%s\n```", class.Signature))
|
||||
}
|
||||
|
||||
if class.Doc != "" {
|
||||
parts = append(parts, "\n"+class.Doc)
|
||||
}
|
||||
|
||||
if len(class.Bases) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n**Bases:** %s\n", strings.Join(class.Bases, ", ")))
|
||||
}
|
||||
|
||||
if len(class.Methods) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Methods (%d)\n", len(class.Methods)))
|
||||
for _, m := range class.Methods {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(class.ClassMethods) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Class Methods (%d)\n", len(class.ClassMethods)))
|
||||
for _, m := range class.ClassMethods {
|
||||
parts = append(parts, fmt.Sprintf("- `%s` (classmethod)", m.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(class.Attributes) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Attributes (%d)\n", len(class.Attributes)))
|
||||
for _, a := range class.Attributes {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", a.Name))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) methodToDocument(method *pythondocs.Method, class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s.%s.%s\n\n", module.Name, class.Name, method.Name)
|
||||
|
||||
if method.Signature != "" {
|
||||
fmt.Fprintf(&content, "```python\n%s\n```\n", method.Signature)
|
||||
}
|
||||
|
||||
if method.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", method.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"class": class.Name,
|
||||
"method": method.Name,
|
||||
"qual_name": method.QualName,
|
||||
"is_static": method.IsStatic,
|
||||
"is_async": method.IsAsync,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(method.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-method",
|
||||
Title: fmt.Sprintf("%s.%s.%s - Python", module.Name, class.Name, method.Name),
|
||||
Content: content.String(),
|
||||
URL: method.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) classMethodToDocument(method *pythondocs.Method, class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
|
||||
content := s.buildMethodContent(method, class, module)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"class": class.Name,
|
||||
"method": method.Name,
|
||||
"qual_name": method.QualName,
|
||||
"is_classmethod": true,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(method.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-classmethod",
|
||||
Title: fmt.Sprintf("%s.%s.%s (classmethod) - Python", module.Name, class.Name, method.Name),
|
||||
Content: content,
|
||||
URL: method.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) buildMethodContent(method *pythondocs.Method, class *pythondocs.Class, module *pythondocs.Module) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# %s.%s.%s\n", module.Name, class.Name, method.Name))
|
||||
|
||||
if method.Signature != "" {
|
||||
parts = append(parts, fmt.Sprintf("```python\n%s\n```", method.Signature))
|
||||
}
|
||||
|
||||
if method.Doc != "" {
|
||||
parts = append(parts, "\n"+method.Doc)
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) attributeToDocument(attr *pythondocs.Attribute, class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s.%s.%s\n\n", module.Name, class.Name, attr.Name)
|
||||
|
||||
if attr.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", attr.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"class": class.Name,
|
||||
"attr": attr.Name,
|
||||
"type": attr.Type,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(attr.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-attribute",
|
||||
Title: fmt.Sprintf("%s.%s.%s - Python", module.Name, class.Name, attr.Name),
|
||||
Content: content.String(),
|
||||
URL: attr.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) functionToDocument(fn *pythondocs.Function, module *pythondocs.Module, sourceName string) *Document {
|
||||
content := s.buildFunctionContent(fn, module)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"function": fn.Name,
|
||||
"qual_name": fn.QualName,
|
||||
"signature": fn.Signature,
|
||||
"is_async": fn.IsAsync,
|
||||
"is_generator": fn.IsGenerator,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fn.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-function",
|
||||
Title: fmt.Sprintf("%s.%s - Python", module.Name, fn.Name),
|
||||
Content: content,
|
||||
URL: fn.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) buildFunctionContent(fn *pythondocs.Function, module *pythondocs.Module) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# %s.%s\n", module.Name, fn.Name))
|
||||
|
||||
if fn.Signature != "" {
|
||||
parts = append(parts, fmt.Sprintf("```python\n%s\n```", fn.Signature))
|
||||
}
|
||||
|
||||
if fn.Doc != "" {
|
||||
parts = append(parts, "\n"+fn.Doc)
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) exceptionToDocument(exc *pythondocs.Exception, module *pythondocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s.%s\n\n", module.Name, exc.Name)
|
||||
|
||||
if exc.Signature != "" {
|
||||
fmt.Fprintf(&content, "```python\n%s\n```\n", exc.Signature)
|
||||
}
|
||||
|
||||
if exc.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", exc.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"exception": exc.Name,
|
||||
"qual_name": exc.QualName,
|
||||
"bases": exc.Bases,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(exc.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-exception",
|
||||
Title: fmt.Sprintf("%s.%s - Python", module.Name, exc.Name),
|
||||
Content: content.String(),
|
||||
URL: exc.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) dataToDocument(data *pythondocs.Data, module *pythondocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s.%s\n\n", module.Name, data.Name)
|
||||
|
||||
if data.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", data.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"data": data.Name,
|
||||
"type": data.Type,
|
||||
"value": data.Value,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(data.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-data",
|
||||
Title: fmt.Sprintf("%s.%s - Python", module.Name, data.Name),
|
||||
Content: content.String(),
|
||||
URL: data.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
+214
@@ -0,0 +1,214 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/reactdocs"
|
||||
)
|
||||
|
||||
type ReactDocsScraper struct {
|
||||
config *Config
|
||||
parser *reactdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewReactDocsScraper(config *Config) *ReactDocsScraper {
|
||||
return &ReactDocsScraper{
|
||||
config: config,
|
||||
parser: reactdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for React docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
ref, err := s.parser.ParseReferencePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse reference: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.referenceToDocument(ref, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, hook := range ref.Hooks {
|
||||
doc := s.hookToDocument(hook, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, comp := range ref.Components {
|
||||
doc := s.componentToDocument(comp, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, api := range ref.APIs {
|
||||
doc := s.apiToDocument(api, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) referenceToDocument(ref *reactdocs.Reference, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# React API Reference\n\n")
|
||||
fmt.Fprintf(&content, "Hooks: %d, Components: %d, APIs: %d\n", len(ref.Hooks), len(ref.Components), len(ref.APIs))
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(ref.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "react-reference",
|
||||
Title: "React API Reference",
|
||||
Content: content.String(),
|
||||
URL: ref.DocURL,
|
||||
Metadata: map[string]interface{}{"doc_type": "react-reference"},
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) hookToDocument(hook *reactdocs.Hook, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", hook.Name)
|
||||
if hook.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", hook.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", hook.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": hook.Name,
|
||||
"category": hook.Category,
|
||||
"doc_url": hook.DocURL,
|
||||
"doc_type": "react-hook",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(hook.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "react-hook",
|
||||
Title: hook.Name,
|
||||
Content: content.String(),
|
||||
URL: hook.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) componentToDocument(comp *reactdocs.Component, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# <%s />\n\n", comp.Name)
|
||||
fmt.Fprintf(&content, "%s\n", comp.Doc)
|
||||
|
||||
if len(comp.Props) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Props\n")
|
||||
for _, p := range comp.Props {
|
||||
fmt.Fprintf(&content, "- `%s: %s`\n", p.Name, p.Type)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": comp.Name,
|
||||
"doc_url": comp.DocURL,
|
||||
"doc_type": "react-component",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(comp.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "react-component",
|
||||
Title: comp.Name,
|
||||
Content: content.String(),
|
||||
URL: comp.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) apiToDocument(api *reactdocs.API, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s()\n\n", api.Name)
|
||||
if api.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", api.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", api.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": api.Name,
|
||||
"doc_url": api.DocURL,
|
||||
"doc_type": "react-api",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(api.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "react-api",
|
||||
Title: api.Name,
|
||||
Content: content.String(),
|
||||
URL: api.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
+563
@@ -0,0 +1,563 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/rustdocs"
|
||||
)
|
||||
|
||||
type RustDocsScraper struct {
|
||||
config *Config
|
||||
parser *rustdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewRustDocsScraper(config *Config) *RustDocsScraper {
|
||||
return &RustDocsScraper{
|
||||
config: config,
|
||||
parser: rustdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Rust docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
crate, err := s.parser.ParseCratePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse crate: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.crateToDocument(crate, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, m := range crate.Modules {
|
||||
doc := s.moduleToDocument(m, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, st := range crate.Structs {
|
||||
doc := s.structToDocument(st, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, e := range crate.Enums {
|
||||
doc := s.enumToDocument(e, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, t := range crate.Traits {
|
||||
doc := s.traitToDocument(t, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, f := range crate.Functions {
|
||||
doc := s.funcToDocument(f, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, m := range crate.Macros {
|
||||
doc := s.macroToDocument(m, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, c := range crate.Constants {
|
||||
doc := s.constToDocument(c, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, st := range crate.Statics {
|
||||
doc := s.staticToDocument(st, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) crateToDocument(crate *rustdocs.Crate, sourceName string) *Document {
|
||||
content := s.buildCrateContent(crate)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": crate.Name,
|
||||
"version": crate.Version,
|
||||
"repository": crate.Repository,
|
||||
"license": crate.License,
|
||||
"doc_url": crate.DocURL,
|
||||
"module_count": len(crate.Modules),
|
||||
"struct_count": len(crate.Structs),
|
||||
"enum_count": len(crate.Enums),
|
||||
"trait_count": len(crate.Traits),
|
||||
"function_count": len(crate.Functions),
|
||||
"macro_count": len(crate.Macros),
|
||||
"constant_count": len(crate.Constants),
|
||||
"static_count": len(crate.Statics),
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(crate.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-crate",
|
||||
Title: fmt.Sprintf("%s - Rust", crate.Name),
|
||||
Content: content,
|
||||
URL: crate.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) buildCrateContent(crate *rustdocs.Crate) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# Crate %s\n", crate.Name))
|
||||
|
||||
if crate.Version != "" {
|
||||
parts = append(parts, fmt.Sprintf("Version: %s\n", crate.Version))
|
||||
}
|
||||
|
||||
if crate.Description != "" {
|
||||
parts = append(parts, crate.Description)
|
||||
}
|
||||
|
||||
if len(crate.Modules) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Modules (%d)\n", len(crate.Modules)))
|
||||
for _, m := range crate.Modules {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Structs) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Structs (%d)\n", len(crate.Structs)))
|
||||
for _, st := range crate.Structs {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", st.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Enums) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Enums (%d)\n", len(crate.Enums)))
|
||||
for _, e := range crate.Enums {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", e.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Traits) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Traits (%d)\n", len(crate.Traits)))
|
||||
for _, t := range crate.Traits {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", t.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Functions) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(crate.Functions)))
|
||||
for _, f := range crate.Functions {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", f.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Macros) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Macros (%d)\n", len(crate.Macros)))
|
||||
for _, m := range crate.Macros {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) moduleToDocument(m *rustdocs.Module, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Module %s::%s\n\n", crate.Name, m.Name)
|
||||
|
||||
if m.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", m.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"module": m.Name,
|
||||
"path": m.Path,
|
||||
"is_experimental": m.IsExperimental,
|
||||
"kind": "module",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(m.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-module",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, m.Name),
|
||||
Content: content.String(),
|
||||
URL: m.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) structToDocument(st *rustdocs.Struct, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
content := s.buildStructContent(st, crate)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"struct": st.Name,
|
||||
"path": st.Path,
|
||||
"is_experimental": st.IsExperimental,
|
||||
"kind": "struct",
|
||||
"declaration": st.Declaration,
|
||||
}
|
||||
|
||||
fieldsJSON, _ := json.Marshal(st.Fields)
|
||||
metadata["fields"] = string(fieldsJSON)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(st.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-struct",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, st.Name),
|
||||
Content: content,
|
||||
URL: st.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) buildStructContent(st *rustdocs.Struct, crate *rustdocs.Crate) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# struct %s::%s\n", crate.Name, st.Name))
|
||||
|
||||
if st.Declaration != "" {
|
||||
parts = append(parts, fmt.Sprintf("```rust\n%s\n```", st.Declaration))
|
||||
}
|
||||
|
||||
if st.Doc != "" {
|
||||
parts = append(parts, "\n"+st.Doc)
|
||||
}
|
||||
|
||||
if len(st.Fields) > 0 {
|
||||
parts = append(parts, "\n### Fields\n")
|
||||
for _, f := range st.Fields {
|
||||
if f.Doc != "" {
|
||||
parts = append(parts, fmt.Sprintf("- `%s: %s` - %s", f.Name, f.Type, f.Doc))
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("- `%s: %s`", f.Name, f.Type))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(st.Methods) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Methods (%d)\n", len(st.Methods)))
|
||||
for _, m := range st.Methods {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) enumToDocument(e *rustdocs.Enum, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
content := s.buildEnumContent(e, crate)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"enum": e.Name,
|
||||
"path": e.Path,
|
||||
"is_experimental": e.IsExperimental,
|
||||
"kind": "enum",
|
||||
"declaration": e.Declaration,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(e.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-enum",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, e.Name),
|
||||
Content: content,
|
||||
URL: e.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) buildEnumContent(e *rustdocs.Enum, crate *rustdocs.Crate) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# enum %s::%s\n", crate.Name, e.Name))
|
||||
|
||||
if e.Declaration != "" {
|
||||
parts = append(parts, fmt.Sprintf("```rust\n%s\n```", e.Declaration))
|
||||
}
|
||||
|
||||
if e.Doc != "" {
|
||||
parts = append(parts, "\n"+e.Doc)
|
||||
}
|
||||
|
||||
if len(e.Variants) > 0 {
|
||||
parts = append(parts, "\n### Variants\n")
|
||||
for _, v := range e.Variants {
|
||||
if v.Doc != "" {
|
||||
parts = append(parts, fmt.Sprintf("- `%s` - %s", v.Name, v.Doc))
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", v.Name))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) traitToDocument(t *rustdocs.Trait, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
content := s.buildTraitContent(t, crate)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"trait": t.Name,
|
||||
"path": t.Path,
|
||||
"is_experimental": t.IsExperimental,
|
||||
"kind": "trait",
|
||||
"declaration": t.Declaration,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(t.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-trait",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, t.Name),
|
||||
Content: content,
|
||||
URL: t.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) buildTraitContent(t *rustdocs.Trait, crate *rustdocs.Crate) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# trait %s::%s\n", crate.Name, t.Name))
|
||||
|
||||
if t.Declaration != "" {
|
||||
parts = append(parts, fmt.Sprintf("```rust\n%s\n```", t.Declaration))
|
||||
}
|
||||
|
||||
if t.Doc != "" {
|
||||
parts = append(parts, "\n"+t.Doc)
|
||||
}
|
||||
|
||||
if len(t.Methods) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Required Methods (%d)\n", len(t.Methods)))
|
||||
for _, m := range t.Methods {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Signature))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) funcToDocument(f *rustdocs.Func, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# fn %s::%s\n\n", crate.Name, f.Name)
|
||||
|
||||
if f.Signature != "" {
|
||||
fmt.Fprintf(&content, "```rust\n%s\n```\n", f.Signature)
|
||||
}
|
||||
|
||||
if f.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", f.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"function": f.Name,
|
||||
"path": f.Path,
|
||||
"is_experimental": f.IsExperimental,
|
||||
"is_unsafe": f.IsUnsafe,
|
||||
"is_const": f.IsConst,
|
||||
"is_async": f.IsAsync,
|
||||
"kind": "fn",
|
||||
"signature": f.Signature,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(f.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-fn",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, f.Name),
|
||||
Content: content.String(),
|
||||
URL: f.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) macroToDocument(m *rustdocs.Macro, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# macro %s::%s!\n\n", crate.Name, m.Name)
|
||||
|
||||
if m.Signature != "" {
|
||||
fmt.Fprintf(&content, "```rust\n%s\n```\n", m.Signature)
|
||||
}
|
||||
|
||||
if m.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", m.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"macro": m.Name,
|
||||
"path": m.Path,
|
||||
"is_experimental": m.IsExperimental,
|
||||
"kind": "macro",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(m.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-macro",
|
||||
Title: fmt.Sprintf("%s::%s! - Rust", crate.Name, m.Name),
|
||||
Content: content.String(),
|
||||
URL: m.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) constToDocument(c *rustdocs.Const, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# const %s::%s\n\n", crate.Name, c.Name)
|
||||
|
||||
if c.Type != "" {
|
||||
fmt.Fprintf(&content, "Type: `%s`\n", c.Type)
|
||||
}
|
||||
if c.Value != "" {
|
||||
fmt.Fprintf(&content, "Value: `%s`\n", c.Value)
|
||||
}
|
||||
|
||||
if c.Doc != "" {
|
||||
fmt.Fprintf(&content, "\n%s\n", c.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"const": c.Name,
|
||||
"path": c.Path,
|
||||
"is_experimental": c.IsExperimental,
|
||||
"type": c.Type,
|
||||
"value": c.Value,
|
||||
"kind": "const",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(c.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-const",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, c.Name),
|
||||
Content: content.String(),
|
||||
URL: c.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) staticToDocument(st *rustdocs.Static, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# static %s::%s\n\n", crate.Name, st.Name)
|
||||
|
||||
if st.Type != "" {
|
||||
fmt.Fprintf(&content, "Type: `%s`\n", st.Type)
|
||||
}
|
||||
if st.IsMutable {
|
||||
fmt.Fprintf(&content, "Mutability: mutable\n")
|
||||
}
|
||||
|
||||
if st.Doc != "" {
|
||||
fmt.Fprintf(&content, "\n%s\n", st.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"static": st.Name,
|
||||
"path": st.Path,
|
||||
"is_experimental": st.IsExperimental,
|
||||
"is_mutable": st.IsMutable,
|
||||
"type": st.Type,
|
||||
"kind": "static",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(st.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-static",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, st.Name),
|
||||
Content: content.String(),
|
||||
URL: st.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
+221
@@ -0,0 +1,221 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/springdocs"
|
||||
)
|
||||
|
||||
type SpringDocsScraper struct {
|
||||
config *Config
|
||||
parser *springdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewSpringDocsScraper(config *Config) *SpringDocsScraper {
|
||||
return &SpringDocsScraper{
|
||||
config: config,
|
||||
parser: springdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Spring docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
module, err := s.parser.ParseModulePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse module: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.moduleToDocument(module, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, class := range module.Classes {
|
||||
doc := s.classToDocument(class, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, prop := range module.Properties {
|
||||
doc := s.propertyToDocument(prop, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, guide := range module.Guides {
|
||||
doc := s.guideToDocument(guide, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) moduleToDocument(module *springdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", module.Name)
|
||||
fmt.Fprintf(&content, "%s\n", module.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"version": module.Version,
|
||||
"doc_url": module.DocURL,
|
||||
"doc_type": "spring-module",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(module.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "spring-module",
|
||||
Title: module.Name,
|
||||
Content: content.String(),
|
||||
URL: module.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) classToDocument(class *springdocs.Class, module *springdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", class.QualifiedName)
|
||||
fmt.Fprintf(&content, "%s\n", class.Doc)
|
||||
|
||||
if len(class.Methods) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Methods\n")
|
||||
for _, m := range class.Methods {
|
||||
fmt.Fprintf(&content, "- `%s`\n", m.Signature)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"qualified_name": class.QualifiedName,
|
||||
"kind": class.Kind,
|
||||
"doc_url": class.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(class.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "spring-class",
|
||||
Title: class.QualifiedName,
|
||||
Content: content.String(),
|
||||
URL: class.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) propertyToDocument(prop *springdocs.Property, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", prop.Name)
|
||||
fmt.Fprintf(&content, "Type: %s\n", prop.Type)
|
||||
if prop.Default != "" {
|
||||
fmt.Fprintf(&content, "Default: `%s`\n", prop.Default)
|
||||
}
|
||||
fmt.Fprintf(&content, "\n%s\n", prop.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"property": prop.Name,
|
||||
"type": prop.Type,
|
||||
"default": prop.Default,
|
||||
"doc_url": prop.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(prop.Name),
|
||||
Source: sourceName,
|
||||
Type: "spring-property",
|
||||
Title: prop.Name,
|
||||
Content: content.String(),
|
||||
URL: prop.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) guideToDocument(guide *springdocs.Guide, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", guide.Title)
|
||||
fmt.Fprintf(&content, "%s\n", guide.Description)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"title": guide.Title,
|
||||
"doc_url": guide.DocURL,
|
||||
"level": guide.Level,
|
||||
"doc_type": "spring-guide",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(guide.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "spring-guide",
|
||||
Title: guide.Title,
|
||||
Content: content.String(),
|
||||
URL: guide.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
Vendored
+261
@@ -0,0 +1,261 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/tsdocs"
|
||||
)
|
||||
|
||||
type TSDocsScraper struct {
|
||||
config *Config
|
||||
parser *tsdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewTSDocsScraper(config *Config) *TSDocsScraper {
|
||||
return &TSDocsScraper{
|
||||
config: config,
|
||||
parser: tsdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for TypeScript docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
module, err := s.parser.ParseModulePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse module: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.moduleToDocument(module, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, iface := range module.Interfaces {
|
||||
doc := s.interfaceToDocument(iface, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, fn := range module.Functions {
|
||||
doc := s.functionToDocument(fn, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, class := range module.Classes {
|
||||
doc := s.classToDocument(class, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, ta := range module.Types {
|
||||
doc := s.typeAliasToDocument(ta, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) moduleToDocument(module *tsdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", module.Name)
|
||||
fmt.Fprintf(&content, "%s\n", module.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"version": module.Version,
|
||||
"doc_url": module.DocURL,
|
||||
"doc_type": "ts-module",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(module.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-module",
|
||||
Title: module.Name,
|
||||
Content: content.String(),
|
||||
URL: module.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) interfaceToDocument(iface *tsdocs.Interface, module *tsdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s (interface)\n\n", iface.Name)
|
||||
if iface.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n\n", iface.Doc)
|
||||
}
|
||||
if len(iface.Properties) > 0 {
|
||||
fmt.Fprintf(&content, "## Properties\n")
|
||||
for _, p := range iface.Properties {
|
||||
fmt.Fprintf(&content, "- `%s: %s`\n", p.Name, p.Type)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": iface.Name,
|
||||
"doc_url": iface.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(iface.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-interface",
|
||||
Title: iface.Name,
|
||||
Content: content.String(),
|
||||
URL: iface.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) functionToDocument(fn *tsdocs.Function, module *tsdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s()\n\n", fn.Name)
|
||||
if fn.Signature != "" {
|
||||
fmt.Fprintf(&content, "```typescript\n%s\n```\n\n", fn.Signature)
|
||||
}
|
||||
if fn.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", fn.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": fn.Name,
|
||||
"return_type": fn.ReturnType,
|
||||
"doc_url": fn.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fn.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-function",
|
||||
Title: fn.Name,
|
||||
Content: content.String(),
|
||||
URL: fn.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) classToDocument(class *tsdocs.Class, module *tsdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s (class)\n\n", class.Name)
|
||||
if class.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n\n", class.Doc)
|
||||
}
|
||||
if len(class.Methods) > 0 {
|
||||
fmt.Fprintf(&content, "## Methods\n")
|
||||
for _, m := range class.Methods {
|
||||
fmt.Fprintf(&content, "- `%s()`\n", m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": class.Name,
|
||||
"doc_url": class.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(class.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-class",
|
||||
Title: class.Name,
|
||||
Content: content.String(),
|
||||
URL: class.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) typeAliasToDocument(ta *tsdocs.TypeAlias, module *tsdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s (type)\n\n", ta.Name)
|
||||
fmt.Fprintf(&content, "```typescript\ntype %s = %s\n```\n\n", ta.Name, ta.Type)
|
||||
if ta.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", ta.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": ta.Name,
|
||||
"doc_url": ta.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(ta.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-type",
|
||||
Title: ta.Name,
|
||||
Content: content.String(),
|
||||
URL: ta.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
Vendored
+244
@@ -0,0 +1,244 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/vuedocs"
|
||||
)
|
||||
|
||||
type VueDocsScraper struct {
|
||||
config *Config
|
||||
parser *vuedocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewVueDocsScraper(config *Config) *VueDocsScraper {
|
||||
return &VueDocsScraper{
|
||||
config: config,
|
||||
parser: vuedocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Vue docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
ref, err := s.parser.ParseReferencePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse reference: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.referenceToDocument(ref, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, comp := range ref.Composition {
|
||||
doc := s.compositionToDocument(comp, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, dir := range ref.Directives {
|
||||
doc := s.directiveToDocument(dir, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, comp := range ref.Components {
|
||||
doc := s.componentToDocument(comp, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, api := range ref.GlobalAPI {
|
||||
doc := s.globalAPIToDocument(api, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) referenceToDocument(ref *vuedocs.Reference, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Vue API Reference\n\n")
|
||||
fmt.Fprintf(&content, "Composition API: %d, Directives: %d, Components: %d\n", len(ref.Composition), len(ref.Directives), len(ref.Components))
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(ref.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "vue-reference",
|
||||
Title: "Vue API Reference",
|
||||
Content: content.String(),
|
||||
URL: ref.DocURL,
|
||||
Metadata: map[string]interface{}{"doc_type": "vue-reference"},
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) compositionToDocument(comp *vuedocs.Composition, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", comp.Name)
|
||||
if comp.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", comp.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", comp.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": comp.Name,
|
||||
"kind": comp.Kind,
|
||||
"doc_url": comp.DocURL,
|
||||
"doc_type": "vue-composition",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(comp.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "vue-composition",
|
||||
Title: comp.Name,
|
||||
Content: content.String(),
|
||||
URL: comp.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) directiveToDocument(dir *vuedocs.Directive, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", dir.Name)
|
||||
fmt.Fprintf(&content, "%s\n", dir.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": dir.Name,
|
||||
"doc_url": dir.DocURL,
|
||||
"doc_type": "vue-directive",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(dir.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "vue-directive",
|
||||
Title: dir.Name,
|
||||
Content: content.String(),
|
||||
URL: dir.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) componentToDocument(comp *vuedocs.Component, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# <%s />\n\n", comp.Name)
|
||||
fmt.Fprintf(&content, "%s\n", comp.Doc)
|
||||
|
||||
if len(comp.Props) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Props\n")
|
||||
for _, p := range comp.Props {
|
||||
fmt.Fprintf(&content, "- `%s: %s`\n", p.Name, p.Type)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": comp.Name,
|
||||
"doc_url": comp.DocURL,
|
||||
"doc_type": "vue-component",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(comp.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "vue-component",
|
||||
Title: comp.Name,
|
||||
Content: content.String(),
|
||||
URL: comp.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) globalAPIToDocument(api *vuedocs.API, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", api.Name)
|
||||
if api.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", api.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", api.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": api.Name,
|
||||
"category": api.Category,
|
||||
"doc_url": api.DocURL,
|
||||
"doc_type": "vue-api",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(api.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "vue-api",
|
||||
Title: api.Name,
|
||||
Content: content.String(),
|
||||
URL: api.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user