mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
first commit
This commit is contained in:
@@ -0,0 +1,156 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/astrodocs"
|
||||
)
|
||||
|
||||
// AstroDocsScraper fetches a documentation page over HTTP and turns the page
// produced by astrodocs.Parser into Document values.
type AstroDocsScraper struct {
	config *Config           // scraper settings; Timeout and UserAgent are read from it
	parser *astrodocs.Parser // parses fetched HTML via ParsePage
	client *http.Client      // HTTP client used by fetchPage
}
|
||||
|
||||
func NewAstroDocsScraper(config *Config) *AstroDocsScraper {
|
||||
return &AstroDocsScraper{
|
||||
config: config,
|
||||
parser: astrodocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Astro docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
page, err := s.parser.ParsePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse Astro docs page: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.pageToDocument(page, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, section := range page.Sections {
|
||||
doc := s.sectionToDocument(section, page, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) pageToDocument(page *astrodocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", page.Title)
|
||||
fmt.Fprintf(&content, "%s\n", page.Description)
|
||||
|
||||
if len(page.CodeBlocks) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Code Examples\n")
|
||||
for _, cb := range page.CodeBlocks {
|
||||
fmt.Fprintf(&content, "\n```%s\n%s\n```\n", cb.Language, cb.Code)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"title": page.Title,
|
||||
"doc_url": page.URL,
|
||||
"doc_type": "astro-docs",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(page.URL),
|
||||
Source: sourceName,
|
||||
Type: "astro-docs",
|
||||
Title: page.Title,
|
||||
Content: content.String(),
|
||||
URL: page.URL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *AstroDocsScraper) sectionToDocument(section *astrodocs.Section, page *astrodocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", section.Title)
|
||||
fmt.Fprintf(&content, "%s\n", section.Content)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"page_title": page.Title,
|
||||
"section_id": section.ID,
|
||||
"doc_url": section.DocURL,
|
||||
"doc_type": "astro-section",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(section.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "astro-section",
|
||||
Title: fmt.Sprintf("%s - %s", page.Title, section.Title),
|
||||
Content: content.String(),
|
||||
URL: section.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,193 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/cloudflaredocs"
|
||||
)
|
||||
|
||||
// CloudflareDocsScraper fetches a documentation page over HTTP and turns the
// page produced by cloudflaredocs.Parser into Document values.
type CloudflareDocsScraper struct {
	config *Config                // scraper settings; Timeout and UserAgent are read from it
	parser *cloudflaredocs.Parser // parses fetched HTML via ParsePage
	client *http.Client           // HTTP client used by fetchPage
}
|
||||
|
||||
func NewCloudflareDocsScraper(config *Config) *CloudflareDocsScraper {
|
||||
return &CloudflareDocsScraper{
|
||||
config: config,
|
||||
parser: cloudflaredocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Cloudflare docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
page, err := s.parser.ParsePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse Cloudflare docs page: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.pageToDocument(page, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, section := range page.Sections {
|
||||
doc := s.sectionToDocument(section, page, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, api := range page.APIs {
|
||||
doc := s.apiToDocument(api, page, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) pageToDocument(page *cloudflaredocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", page.Title)
|
||||
if page.Product != "" {
|
||||
fmt.Fprintf(&content, "Product: %s\n\n", page.Product)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", page.Description)
|
||||
|
||||
if len(page.CodeBlocks) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Code Examples\n")
|
||||
for _, cb := range page.CodeBlocks {
|
||||
fmt.Fprintf(&content, "\n```%s\n%s\n```\n", cb.Language, cb.Code)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"title": page.Title,
|
||||
"product": page.Product,
|
||||
"doc_url": page.URL,
|
||||
"doc_type": "cloudflare-docs",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(page.URL),
|
||||
Source: sourceName,
|
||||
Type: "cloudflare-docs",
|
||||
Title: page.Title,
|
||||
Content: content.String(),
|
||||
URL: page.URL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) sectionToDocument(section *cloudflaredocs.Section, page *cloudflaredocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", section.Title)
|
||||
fmt.Fprintf(&content, "%s\n", section.Content)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"page_title": page.Title,
|
||||
"product": page.Product,
|
||||
"section_id": section.ID,
|
||||
"doc_url": section.DocURL,
|
||||
"doc_type": "cloudflare-section",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(section.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "cloudflare-section",
|
||||
Title: fmt.Sprintf("%s - %s", page.Title, section.Title),
|
||||
Content: content.String(),
|
||||
URL: section.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *CloudflareDocsScraper) apiToDocument(api *cloudflaredocs.API, page *cloudflaredocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s %s\n\n", api.Method, api.Endpoint)
|
||||
fmt.Fprintf(&content, "%s\n", api.Description)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"page_title": page.Title,
|
||||
"product": page.Product,
|
||||
"method": api.Method,
|
||||
"endpoint": api.Endpoint,
|
||||
"doc_url": api.DocURL,
|
||||
"doc_type": "cloudflare-api",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(api.DocURL + "#" + api.Endpoint),
|
||||
Source: sourceName,
|
||||
Type: "cloudflare-api",
|
||||
Title: fmt.Sprintf("%s %s", api.Method, api.Endpoint),
|
||||
Content: content.String(),
|
||||
URL: api.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,156 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/dockerdocs"
|
||||
)
|
||||
|
||||
// DockerDocsScraper fetches a documentation page over HTTP and turns the page
// produced by dockerdocs.Parser into Document values.
type DockerDocsScraper struct {
	config *Config            // scraper settings; Timeout and UserAgent are read from it
	parser *dockerdocs.Parser // parses fetched HTML via ParsePage
	client *http.Client       // HTTP client used by fetchPage
}
|
||||
|
||||
func NewDockerDocsScraper(config *Config) *DockerDocsScraper {
|
||||
return &DockerDocsScraper{
|
||||
config: config,
|
||||
parser: dockerdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Docker docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
page, err := s.parser.ParsePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse Docker docs page: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.pageToDocument(page, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, section := range page.Sections {
|
||||
doc := s.sectionToDocument(section, page, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) pageToDocument(page *dockerdocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", page.Title)
|
||||
fmt.Fprintf(&content, "%s\n", page.Description)
|
||||
|
||||
if len(page.CodeBlocks) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Code Examples\n")
|
||||
for _, cb := range page.CodeBlocks {
|
||||
fmt.Fprintf(&content, "\n```%s\n%s\n```\n", cb.Language, cb.Code)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"title": page.Title,
|
||||
"doc_url": page.URL,
|
||||
"doc_type": "docker-docs",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(page.URL),
|
||||
Source: sourceName,
|
||||
Type: "docker-docs",
|
||||
Title: page.Title,
|
||||
Content: content.String(),
|
||||
URL: page.URL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DockerDocsScraper) sectionToDocument(section *dockerdocs.Section, page *dockerdocs.Page, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", section.Title)
|
||||
fmt.Fprintf(&content, "%s\n", section.Content)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"page_title": page.Title,
|
||||
"section_id": section.ID,
|
||||
"doc_url": section.DocURL,
|
||||
"doc_type": "docker-section",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(section.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "docker-section",
|
||||
Title: fmt.Sprintf("%s - %s", page.Title, section.Title),
|
||||
Content: content.String(),
|
||||
URL: section.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
)
|
||||
|
||||
// GitHubScraper scrapes documentation from GitHub repositories.
//
// NOTE: Scrape and DetectChanges are currently placeholders; see the TODO
// comments in their bodies.
type GitHubScraper struct {
	config *Config // scraper settings supplied to NewGitHubScraper
}
|
||||
|
||||
// NewGitHubScraper creates a new GitHub scraper.
|
||||
func NewGitHubScraper(config *Config) *GitHubScraper {
|
||||
return &GitHubScraper{config: config}
|
||||
}
|
||||
|
||||
// Scrape clones and parses documents from a GitHub repository.
//
// NOTE: not implemented yet. The current body ignores ctx and source and
// returns (nil, nil), so callers receive no documents and no error.
func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	// TODO: Implement GitHub scraping
	// 1. Clone repository (shallow)
	// 2. Find markdown files in specified paths
	// 3. Parse README, docs/, wiki
	// 4. Extract code structure
	return nil, nil
}
|
||||
|
||||
// DetectChanges checks if the repository has new commits.
//
// NOTE: not implemented yet. The current body ignores its arguments and
// always returns (false, "", nil), i.e. "unchanged" with an empty hash.
func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	// TODO: Check latest commit hash
	return false, "", nil
}
|
||||
@@ -0,0 +1,423 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/godocs"
|
||||
)
|
||||
|
||||
// GoDocsScraper fetches a Go package documentation page over HTTP and turns
// the package produced by godocs.Parser into Document values.
type GoDocsScraper struct {
	config *Config        // scraper settings; Timeout and UserAgent are read from it
	parser *godocs.Parser // parses fetched HTML via ParsePackagePage
	client *http.Client   // HTTP client used by fetchPage
}
|
||||
|
||||
func NewGoDocsScraper(config *Config) *GoDocsScraper {
|
||||
return &GoDocsScraper{
|
||||
config: config,
|
||||
parser: godocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Go docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
pkg, err := s.parser.ParsePackagePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse package: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.packageToDocument(pkg, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, fn := range pkg.Functions {
|
||||
doc := s.functionToDocument(fn, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, t := range pkg.Types {
|
||||
doc := s.typeToDocument(t, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
|
||||
for _, m := range t.Methods {
|
||||
methodDoc := s.methodToDocument(m, t, pkg, source.Name)
|
||||
documents = append(documents, methodDoc)
|
||||
}
|
||||
}
|
||||
|
||||
for _, c := range pkg.Constants {
|
||||
doc := s.constantToDocument(c, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, v := range pkg.Variables {
|
||||
doc := s.variableToDocument(v, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) packageToDocument(pkg *godocs.Package, sourceName string) *Document {
|
||||
content := s.buildPackageContent(pkg)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"import_path": pkg.ImportPath,
|
||||
"version": pkg.Version,
|
||||
"imported_by": pkg.ImportedBy,
|
||||
"repository": pkg.Repository,
|
||||
"doc_url": pkg.DocURL,
|
||||
}
|
||||
|
||||
if pkg.Module != nil {
|
||||
metadata["module_path"] = pkg.Module.Path
|
||||
metadata["module_version"] = pkg.Module.Version
|
||||
}
|
||||
|
||||
if len(pkg.Licenses) > 0 {
|
||||
var licenses []string
|
||||
for _, l := range pkg.Licenses {
|
||||
licenses = append(licenses, l.Name)
|
||||
}
|
||||
metadata["licenses"] = licenses
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(pkg.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "go-package",
|
||||
Title: fmt.Sprintf("%s - %s", pkg.Name, pkg.ImportPath),
|
||||
Content: content,
|
||||
URL: pkg.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) buildPackageContent(pkg *godocs.Package) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# Package %s\n", pkg.ImportPath))
|
||||
|
||||
if pkg.Synopsis != "" {
|
||||
parts = append(parts, pkg.Synopsis)
|
||||
}
|
||||
|
||||
if pkg.Doc != "" {
|
||||
parts = append(parts, "\n## Documentation\n")
|
||||
parts = append(parts, pkg.Doc)
|
||||
}
|
||||
|
||||
if len(pkg.Functions) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(pkg.Functions)))
|
||||
for _, fn := range pkg.Functions {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", fn.Signature))
|
||||
}
|
||||
}
|
||||
|
||||
if len(pkg.Types) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Types (%d)\n", len(pkg.Types)))
|
||||
for _, t := range pkg.Types {
|
||||
parts = append(parts, fmt.Sprintf("- `%s` (%s)", t.Name, t.Kind))
|
||||
}
|
||||
}
|
||||
|
||||
if len(pkg.Constants) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Constants (%d)\n", len(pkg.Constants)))
|
||||
}
|
||||
|
||||
if len(pkg.Variables) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Variables (%d)\n", len(pkg.Variables)))
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) functionToDocument(fn *godocs.Function, pkg *godocs.Package, sourceName string) *Document {
|
||||
content := s.buildFunctionContent(fn, pkg)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"import_path": pkg.ImportPath,
|
||||
"package": pkg.Name,
|
||||
"symbol": fn.Name,
|
||||
"signature": fn.Signature,
|
||||
"kind": "function",
|
||||
}
|
||||
|
||||
examplesJSON, _ := json.Marshal(fn.Examples)
|
||||
metadata["examples"] = string(examplesJSON)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fmt.Sprintf("%s#%s", pkg.DocURL, fn.Name)),
|
||||
Source: sourceName,
|
||||
Type: "go-function",
|
||||
Title: fmt.Sprintf("%s.%s", pkg.Name, fn.Name),
|
||||
Content: content,
|
||||
URL: fmt.Sprintf("%s#%s", pkg.DocURL, fn.Name),
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) buildFunctionContent(fn *godocs.Function, pkg *godocs.Package) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# %s.%s\n", pkg.Name, fn.Name))
|
||||
parts = append(parts, fmt.Sprintf("```\n%s\n```", fn.Signature))
|
||||
|
||||
if fn.Doc != "" {
|
||||
parts = append(parts, "\n"+fn.Doc)
|
||||
}
|
||||
|
||||
for _, ex := range fn.Examples {
|
||||
parts = append(parts, fmt.Sprintf("\n### Example: %s\n", ex.Name))
|
||||
if ex.Doc != "" {
|
||||
parts = append(parts, ex.Doc)
|
||||
}
|
||||
parts = append(parts, fmt.Sprintf("```go\n%s\n```", ex.Code))
|
||||
if ex.Output != "" {
|
||||
parts = append(parts, fmt.Sprintf("Output:\n```\n%s\n```", ex.Output))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) typeToDocument(t *godocs.Type, pkg *godocs.Package, sourceName string) *Document {
|
||||
content := s.buildTypeContent(t, pkg)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"import_path": pkg.ImportPath,
|
||||
"package": pkg.Name,
|
||||
"symbol": t.Name,
|
||||
"kind": "type",
|
||||
"type_kind": t.Kind,
|
||||
"underlying": t.Underlying,
|
||||
"method_count": len(t.Methods),
|
||||
}
|
||||
|
||||
fieldsJSON, _ := json.Marshal(t.Fields)
|
||||
metadata["fields"] = string(fieldsJSON)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fmt.Sprintf("%s#%s", pkg.DocURL, t.Name)),
|
||||
Source: sourceName,
|
||||
Type: "go-type",
|
||||
Title: fmt.Sprintf("%s.%s", pkg.Name, t.Name),
|
||||
Content: content,
|
||||
URL: fmt.Sprintf("%s#%s", pkg.DocURL, t.Name),
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) buildTypeContent(t *godocs.Type, pkg *godocs.Package) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# type %s.%s\n", pkg.Name, t.Name))
|
||||
parts = append(parts, fmt.Sprintf("```\n%s\n```", t.Underlying))
|
||||
|
||||
if t.Doc != "" {
|
||||
parts = append(parts, "\n"+t.Doc)
|
||||
}
|
||||
|
||||
if len(t.Fields) > 0 {
|
||||
parts = append(parts, "\n### Fields\n")
|
||||
for _, f := range t.Fields {
|
||||
if f.Doc != "" {
|
||||
parts = append(parts, fmt.Sprintf("- `%s %s` - %s", f.Name, f.Type, f.Doc))
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("- `%s %s`", f.Name, f.Type))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(t.Methods) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Methods (%d)\n", len(t.Methods)))
|
||||
for _, m := range t.Methods {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Signature))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) methodToDocument(m *godocs.Method, t *godocs.Type, pkg *godocs.Package, sourceName string) *Document {
|
||||
content := s.buildMethodContent(m, t, pkg)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"import_path": pkg.ImportPath,
|
||||
"package": pkg.Name,
|
||||
"type": t.Name,
|
||||
"symbol": m.Name,
|
||||
"receiver": m.Receiver,
|
||||
"signature": m.Signature,
|
||||
"kind": "method",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fmt.Sprintf("%s#%s.%s", pkg.DocURL, t.Name, m.Name)),
|
||||
Source: sourceName,
|
||||
Type: "go-method",
|
||||
Title: fmt.Sprintf("%s.%s.%s", pkg.Name, t.Name, m.Name),
|
||||
Content: content,
|
||||
URL: fmt.Sprintf("%s#%s.%s", pkg.DocURL, t.Name, m.Name),
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) buildMethodContent(m *godocs.Method, t *godocs.Type, pkg *godocs.Package) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# func (%s) %s\n", m.Receiver, m.Name))
|
||||
parts = append(parts, fmt.Sprintf("```\n%s\n```", m.Signature))
|
||||
|
||||
if m.Doc != "" {
|
||||
parts = append(parts, "\n"+m.Doc)
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) constantToDocument(c *godocs.Value, pkg *godocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Constants\n\n")
|
||||
|
||||
if c.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n\n", c.Doc)
|
||||
}
|
||||
|
||||
if len(c.Names) > 1 {
|
||||
fmt.Fprintf(&content, "```go\nconst (\n")
|
||||
for _, name := range c.Names {
|
||||
fmt.Fprintf(&content, "\t%s\n", name)
|
||||
}
|
||||
fmt.Fprintf(&content, ")\n```")
|
||||
} else {
|
||||
fmt.Fprintf(&content, "```go\nconst %s = %s\n```", c.Name, c.Value)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"import_path": pkg.ImportPath,
|
||||
"package": pkg.Name,
|
||||
"names": c.Names,
|
||||
"kind": "constant",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fmt.Sprintf("%s#const-%s", pkg.DocURL, c.Name)),
|
||||
Source: sourceName,
|
||||
Type: "go-constant",
|
||||
Title: fmt.Sprintf("%s.%s (const)", pkg.Name, c.Name),
|
||||
Content: content.String(),
|
||||
URL: pkg.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *GoDocsScraper) variableToDocument(v *godocs.Value, pkg *godocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Variables\n\n")
|
||||
|
||||
if v.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n\n", v.Doc)
|
||||
}
|
||||
|
||||
fmt.Fprintf(&content, "```go\nvar %s", v.Name)
|
||||
if v.Type != "" {
|
||||
fmt.Fprintf(&content, " %s", v.Type)
|
||||
}
|
||||
if v.Value != "" {
|
||||
fmt.Fprintf(&content, " = %s", v.Value)
|
||||
}
|
||||
fmt.Fprintf(&content, "\n```")
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"import_path": pkg.ImportPath,
|
||||
"package": pkg.Name,
|
||||
"name": v.Name,
|
||||
"type": v.Type,
|
||||
"kind": "variable",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fmt.Sprintf("%s#var-%s", pkg.DocURL, v.Name)),
|
||||
Source: sourceName,
|
||||
Type: "go-variable",
|
||||
Title: fmt.Sprintf("%s.%s (var)", pkg.Name, v.Name),
|
||||
Content: content.String(),
|
||||
URL: pkg.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,254 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/javadocs"
|
||||
)
|
||||
|
||||
// JavaDocsScraper fetches a Java package documentation page over HTTP and
// turns the package produced by javadocs.Parser into Document values.
type JavaDocsScraper struct {
	config *Config          // scraper settings; Timeout and UserAgent are read from it
	parser *javadocs.Parser // parses fetched HTML via ParsePackagePage
	client *http.Client     // HTTP client used by fetchPage
}
|
||||
|
||||
func NewJavaDocsScraper(config *Config) *JavaDocsScraper {
|
||||
return &JavaDocsScraper{
|
||||
config: config,
|
||||
parser: javadocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Java docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
pkg, err := s.parser.ParsePackagePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse package: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.packageToDocument(pkg, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, class := range pkg.Classes {
|
||||
doc := s.classToDocument(class, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, iface := range pkg.Interfaces {
|
||||
doc := s.interfaceToDocument(iface, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, enum := range pkg.Enums {
|
||||
doc := s.enumToDocument(enum, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, exc := range pkg.Exceptions {
|
||||
doc := s.exceptionToDocument(exc, pkg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) packageToDocument(pkg *javadocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Package %s\n\n", pkg.Name)
|
||||
fmt.Fprintf(&content, "%s\n", pkg.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"package": pkg.Name,
|
||||
"doc_url": pkg.DocURL,
|
||||
"doc_type": "java-package",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(pkg.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "java-package",
|
||||
Title: pkg.Name,
|
||||
Content: content.String(),
|
||||
URL: pkg.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) classToDocument(class *javadocs.Class, pkg *javadocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", class.QualifiedName)
|
||||
fmt.Fprintf(&content, "Kind: %s\n\n", class.Kind)
|
||||
fmt.Fprintf(&content, "%s\n", class.Doc)
|
||||
|
||||
if len(class.Methods) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Methods\n")
|
||||
for _, m := range class.Methods {
|
||||
fmt.Fprintf(&content, "- `%s`\n", m.Signature)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"package": pkg.Name,
|
||||
"qualified_name": class.QualifiedName,
|
||||
"kind": string(class.Kind),
|
||||
"doc_url": class.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(class.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "java-class",
|
||||
Title: class.QualifiedName,
|
||||
Content: content.String(),
|
||||
URL: class.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) interfaceToDocument(iface *javadocs.Class, pkg *javadocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s (interface)\n\n", iface.QualifiedName)
|
||||
fmt.Fprintf(&content, "%s\n", iface.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"package": pkg.Name,
|
||||
"qualified_name": iface.QualifiedName,
|
||||
"kind": "interface",
|
||||
"doc_url": iface.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(iface.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "java-interface",
|
||||
Title: iface.QualifiedName,
|
||||
Content: content.String(),
|
||||
URL: iface.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) enumToDocument(enum *javadocs.Enum, pkg *javadocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s (enum)\n\n", enum.QualifiedName)
|
||||
fmt.Fprintf(&content, "%s\n", enum.Doc)
|
||||
|
||||
if len(enum.Constants) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Constants\n")
|
||||
for _, c := range enum.Constants {
|
||||
fmt.Fprintf(&content, "- `%s`\n", c.Name)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"package": pkg.Name,
|
||||
"qualified_name": enum.QualifiedName,
|
||||
"kind": "enum",
|
||||
"doc_url": enum.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(enum.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "java-enum",
|
||||
Title: enum.QualifiedName,
|
||||
Content: content.String(),
|
||||
URL: enum.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JavaDocsScraper) exceptionToDocument(exc *javadocs.Class, pkg *javadocs.Package, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s (exception)\n\n", exc.QualifiedName)
|
||||
fmt.Fprintf(&content, "%s\n", exc.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"package": pkg.Name,
|
||||
"qualified_name": exc.QualifiedName,
|
||||
"kind": "exception",
|
||||
"doc_url": exc.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(exc.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "java-exception",
|
||||
Title: exc.QualifiedName,
|
||||
Content: content.String(),
|
||||
URL: exc.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
)
|
||||
|
||||
// LocalScraper scrapes documentation from local filesystem.
|
||||
type LocalScraper struct {
|
||||
config *Config
|
||||
}
|
||||
|
||||
// NewLocalScraper creates a new local scraper.
|
||||
func NewLocalScraper(config *Config) *LocalScraper {
|
||||
return &LocalScraper{config: config}
|
||||
}
|
||||
|
||||
// Scrape scans and parses documents from a local directory.
|
||||
func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
// TODO: Implement local scraping
|
||||
// 1. Walk directory tree
|
||||
// 2. Filter by include/exclude patterns
|
||||
// 3. Parse markdown, text, code files
|
||||
// 4. Extract structure and content
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if files have been modified.
|
||||
func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
// TODO: Check file modification times
|
||||
return false, "", nil
|
||||
}
|
||||
@@ -0,0 +1,222 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/mcpdocs"
|
||||
)
|
||||
|
||||
type MCPDocsScraper struct {
|
||||
config *Config
|
||||
parser *mcpdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewMCPDocsScraper(config *Config) *MCPDocsScraper {
|
||||
return &MCPDocsScraper{
|
||||
config: config,
|
||||
parser: mcpdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for MCP docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
server, err := s.parser.ParseServerPage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse MCP server page: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.serverToDocument(server, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, tool := range server.Tools {
|
||||
doc := s.toolToDocument(tool, server, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, res := range server.Resources {
|
||||
doc := s.resourceToDocument(res, server, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, prompt := range server.Prompts {
|
||||
doc := s.promptToDocument(prompt, server, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) serverToDocument(server *mcpdocs.Server, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", server.Name)
|
||||
fmt.Fprintf(&content, "%s\n", server.Description)
|
||||
|
||||
if len(server.Tools) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Tools (%d)\n", len(server.Tools))
|
||||
for _, t := range server.Tools {
|
||||
fmt.Fprintf(&content, "- `%s`: %s\n", t.Name, t.Description)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"server": server.Name,
|
||||
"category": server.Category,
|
||||
"doc_url": server.DocURL,
|
||||
"doc_type": "mcp-server",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(server.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "mcp-server",
|
||||
Title: server.Name,
|
||||
Content: content.String(),
|
||||
URL: server.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) toolToDocument(tool *mcpdocs.Tool, server *mcpdocs.Server, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", tool.Name)
|
||||
fmt.Fprintf(&content, "Server: %s\n\n", server.Name)
|
||||
fmt.Fprintf(&content, "%s\n", tool.Description)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"server": server.Name,
|
||||
"tool": tool.Name,
|
||||
"doc_url": tool.DocURL,
|
||||
"doc_type": "mcp-tool",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(tool.DocURL + "#" + tool.Name),
|
||||
Source: sourceName,
|
||||
Type: "mcp-tool",
|
||||
Title: fmt.Sprintf("%s.%s", server.Name, tool.Name),
|
||||
Content: content.String(),
|
||||
URL: tool.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) resourceToDocument(res *mcpdocs.Resource, server *mcpdocs.Server, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", res.Name)
|
||||
fmt.Fprintf(&content, "Server: %s\n", server.Name)
|
||||
fmt.Fprintf(&content, "URI: %s\n\n", res.URI)
|
||||
fmt.Fprintf(&content, "%s\n", res.Description)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"server": server.Name,
|
||||
"resource": res.Name,
|
||||
"uri": res.URI,
|
||||
"doc_url": res.DocURL,
|
||||
"doc_type": "mcp-resource",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(res.DocURL + "#" + res.Name),
|
||||
Source: sourceName,
|
||||
Type: "mcp-resource",
|
||||
Title: res.Name,
|
||||
Content: content.String(),
|
||||
URL: res.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *MCPDocsScraper) promptToDocument(prompt *mcpdocs.Prompt, server *mcpdocs.Server, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", prompt.Name)
|
||||
fmt.Fprintf(&content, "Server: %s\n\n", server.Name)
|
||||
fmt.Fprintf(&content, "%s\n", prompt.Description)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"server": server.Name,
|
||||
"prompt": prompt.Name,
|
||||
"doc_url": prompt.DocURL,
|
||||
"doc_type": "mcp-prompt",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(prompt.DocURL + "#" + prompt.Name),
|
||||
Source: sourceName,
|
||||
Type: "mcp-prompt",
|
||||
Title: prompt.Name,
|
||||
Content: content.String(),
|
||||
URL: prompt.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,298 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/nuxtdocs"
|
||||
)
|
||||
|
||||
type NuxtDocsScraper struct {
|
||||
config *Config
|
||||
parser *nuxtdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewNuxtDocsScraper(config *Config) *NuxtDocsScraper {
|
||||
return &NuxtDocsScraper{
|
||||
config: config,
|
||||
parser: nuxtdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Nuxt docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
ref, err := s.parser.ParseReferencePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse reference: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.referenceToDocument(ref, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, comp := range ref.Components {
|
||||
doc := s.componentToDocument(comp, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, comp := range ref.Composables {
|
||||
doc := s.composableToDocument(comp, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, util := range ref.Utilities {
|
||||
doc := s.utilityToDocument(util, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, cfg := range ref.Configs {
|
||||
doc := s.configToDocument(cfg, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, cmd := range ref.Commands {
|
||||
doc := s.commandToDocument(cmd, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) referenceToDocument(ref *nuxtdocs.Reference, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Nuxt API Reference\n\n")
|
||||
fmt.Fprintf(&content, "Components: %d, Composables: %d, Utilities: %d, Configs: %d, Commands: %d\n",
|
||||
len(ref.Components), len(ref.Composables), len(ref.Utilities), len(ref.Configs), len(ref.Commands))
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(ref.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-reference",
|
||||
Title: "Nuxt API Reference",
|
||||
Content: content.String(),
|
||||
URL: ref.DocURL,
|
||||
Metadata: map[string]interface{}{"doc_type": "nuxt-reference"},
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) componentToDocument(comp *nuxtdocs.Component, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# <%s />\n\n", comp.Name)
|
||||
fmt.Fprintf(&content, "%s\n", comp.Doc)
|
||||
|
||||
if len(comp.Props) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Props\n")
|
||||
for _, p := range comp.Props {
|
||||
fmt.Fprintf(&content, "- `%s: %s`\n", p.Name, p.Type)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": comp.Name,
|
||||
"category": comp.Category,
|
||||
"doc_url": comp.DocURL,
|
||||
"doc_type": "nuxt-component",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(comp.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-component",
|
||||
Title: comp.Name,
|
||||
Content: content.String(),
|
||||
URL: comp.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) composableToDocument(comp *nuxtdocs.Composable, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", comp.Name)
|
||||
if comp.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", comp.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", comp.Doc)
|
||||
|
||||
if comp.Returns != "" {
|
||||
fmt.Fprintf(&content, "\n**Returns:** `%s`\n", comp.Returns)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": comp.Name,
|
||||
"category": comp.Category,
|
||||
"doc_url": comp.DocURL,
|
||||
"doc_type": "nuxt-composable",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(comp.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-composable",
|
||||
Title: comp.Name,
|
||||
Content: content.String(),
|
||||
URL: comp.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) utilityToDocument(util *nuxtdocs.Utility, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", util.Name)
|
||||
if util.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", util.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", util.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": util.Name,
|
||||
"doc_url": util.DocURL,
|
||||
"doc_type": "nuxt-utility",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(util.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-utility",
|
||||
Title: util.Name,
|
||||
Content: content.String(),
|
||||
URL: util.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) configToDocument(cfg *nuxtdocs.Config, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", cfg.Name)
|
||||
if cfg.Type != "" {
|
||||
fmt.Fprintf(&content, "Type: `%s`\n\n", cfg.Type)
|
||||
}
|
||||
if cfg.Default != "" {
|
||||
fmt.Fprintf(&content, "Default: `%s`\n\n", cfg.Default)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", cfg.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": cfg.Name,
|
||||
"type": cfg.Type,
|
||||
"default": cfg.Default,
|
||||
"category": cfg.Category,
|
||||
"doc_url": cfg.DocURL,
|
||||
"doc_type": "nuxt-config",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(cfg.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-config",
|
||||
Title: cfg.Name,
|
||||
Content: content.String(),
|
||||
URL: cfg.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *NuxtDocsScraper) commandToDocument(cmd *nuxtdocs.Command, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", cmd.Name)
|
||||
if cmd.Usage != "" {
|
||||
fmt.Fprintf(&content, "```\n%s\n```\n\n", cmd.Usage)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", cmd.Doc)
|
||||
|
||||
if len(cmd.Flags) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Flags\n")
|
||||
for _, f := range cmd.Flags {
|
||||
fmt.Fprintf(&content, "- `--%s`: %s\n", f.Name, f.Doc)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": cmd.Name,
|
||||
"usage": cmd.Usage,
|
||||
"doc_url": cmd.DocURL,
|
||||
"doc_type": "nuxt-command",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(cmd.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "nuxt-command",
|
||||
Title: cmd.Name,
|
||||
Content: content.String(),
|
||||
URL: cmd.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
)
|
||||
|
||||
// OpenAPIScraper parses OpenAPI/Swagger specifications.
|
||||
type OpenAPIScraper struct {
|
||||
config *Config
|
||||
}
|
||||
|
||||
// NewOpenAPIScraper creates a new OpenAPI scraper.
|
||||
func NewOpenAPIScraper(config *Config) *OpenAPIScraper {
|
||||
return &OpenAPIScraper{config: config}
|
||||
}
|
||||
|
||||
// Scrape fetches and parses an OpenAPI specification.
|
||||
func (s *OpenAPIScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
// TODO: Implement OpenAPI parsing
|
||||
// 1. Fetch spec from URL
|
||||
// 2. Parse endpoints, schemas, descriptions
|
||||
// 3. Create documents per endpoint
|
||||
// 4. Include authentication, parameters
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if the spec has been updated.
|
||||
func (s *OpenAPIScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
// TODO: Check spec content hash
|
||||
return false, "", nil
|
||||
}
|
||||
@@ -0,0 +1,463 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/pythondocs"
|
||||
)
|
||||
|
||||
type PythonDocsScraper struct {
|
||||
config *Config
|
||||
parser *pythondocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewPythonDocsScraper(config *Config) *PythonDocsScraper {
|
||||
return &PythonDocsScraper{
|
||||
config: config,
|
||||
parser: pythondocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Python docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
module, err := s.parser.ParseModulePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse module: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.moduleToDocument(module, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, class := range module.Classes {
|
||||
doc := s.classToDocument(class, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
|
||||
for _, method := range class.Methods {
|
||||
methodDoc := s.methodToDocument(method, class, module, source.Name)
|
||||
documents = append(documents, methodDoc)
|
||||
}
|
||||
|
||||
for _, method := range class.ClassMethods {
|
||||
methodDoc := s.classMethodToDocument(method, class, module, source.Name)
|
||||
documents = append(documents, methodDoc)
|
||||
}
|
||||
|
||||
for _, attr := range class.Attributes {
|
||||
attrDoc := s.attributeToDocument(attr, class, module, source.Name)
|
||||
documents = append(documents, attrDoc)
|
||||
}
|
||||
}
|
||||
|
||||
for _, fn := range module.Functions {
|
||||
doc := s.functionToDocument(fn, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, exc := range module.Exceptions {
|
||||
doc := s.exceptionToDocument(exc, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, data := range module.Constants {
|
||||
doc := s.dataToDocument(data, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) moduleToDocument(module *pythondocs.Module, sourceName string) *Document {
|
||||
content := s.buildModuleContent(module)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": module.Name,
|
||||
"path": module.Path,
|
||||
"version": module.Version,
|
||||
"doc_url": module.DocURL,
|
||||
"class_count": len(module.Classes),
|
||||
"function_count": len(module.Functions),
|
||||
"exception_count": len(module.Exceptions),
|
||||
"data_count": len(module.Constants),
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(module.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-module",
|
||||
Title: fmt.Sprintf("%s - Python", module.Name),
|
||||
Content: content,
|
||||
URL: module.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) buildModuleContent(module *pythondocs.Module) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# Module %s\n", module.Name))
|
||||
|
||||
if module.Synopsis != "" {
|
||||
parts = append(parts, module.Synopsis)
|
||||
}
|
||||
|
||||
if module.Doc != "" {
|
||||
parts = append(parts, "\n"+module.Doc)
|
||||
}
|
||||
|
||||
if len(module.Classes) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Classes (%d)\n", len(module.Classes)))
|
||||
for _, class := range module.Classes {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", class.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(module.Functions) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(module.Functions)))
|
||||
for _, fn := range module.Functions {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", fn.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(module.Exceptions) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Exceptions (%d)\n", len(module.Exceptions)))
|
||||
for _, exc := range module.Exceptions {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", exc.Name))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) classToDocument(class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
|
||||
content := s.buildClassContent(class, module)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"class": class.Name,
|
||||
"qual_name": class.QualName,
|
||||
"bases": class.Bases,
|
||||
"method_count": len(class.Methods),
|
||||
"attribute_count": len(class.Attributes),
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(class.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-class",
|
||||
Title: fmt.Sprintf("%s.%s - Python", module.Name, class.Name),
|
||||
Content: content,
|
||||
URL: class.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) buildClassContent(class *pythondocs.Class, module *pythondocs.Module) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# class %s.%s\n", module.Name, class.Name))
|
||||
|
||||
if class.Signature != "" {
|
||||
parts = append(parts, fmt.Sprintf("```python\n%s\n```", class.Signature))
|
||||
}
|
||||
|
||||
if class.Doc != "" {
|
||||
parts = append(parts, "\n"+class.Doc)
|
||||
}
|
||||
|
||||
if len(class.Bases) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n**Bases:** %s\n", strings.Join(class.Bases, ", ")))
|
||||
}
|
||||
|
||||
if len(class.Methods) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Methods (%d)\n", len(class.Methods)))
|
||||
for _, m := range class.Methods {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(class.ClassMethods) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Class Methods (%d)\n", len(class.ClassMethods)))
|
||||
for _, m := range class.ClassMethods {
|
||||
parts = append(parts, fmt.Sprintf("- `%s` (classmethod)", m.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(class.Attributes) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Attributes (%d)\n", len(class.Attributes)))
|
||||
for _, a := range class.Attributes {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", a.Name))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) methodToDocument(method *pythondocs.Method, class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s.%s.%s\n\n", module.Name, class.Name, method.Name)
|
||||
|
||||
if method.Signature != "" {
|
||||
fmt.Fprintf(&content, "```python\n%s\n```\n", method.Signature)
|
||||
}
|
||||
|
||||
if method.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", method.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"class": class.Name,
|
||||
"method": method.Name,
|
||||
"qual_name": method.QualName,
|
||||
"is_static": method.IsStatic,
|
||||
"is_async": method.IsAsync,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(method.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-method",
|
||||
Title: fmt.Sprintf("%s.%s.%s - Python", module.Name, class.Name, method.Name),
|
||||
Content: content.String(),
|
||||
URL: method.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) classMethodToDocument(method *pythondocs.Method, class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
|
||||
content := s.buildMethodContent(method, class, module)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"class": class.Name,
|
||||
"method": method.Name,
|
||||
"qual_name": method.QualName,
|
||||
"is_classmethod": true,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(method.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-classmethod",
|
||||
Title: fmt.Sprintf("%s.%s.%s (classmethod) - Python", module.Name, class.Name, method.Name),
|
||||
Content: content,
|
||||
URL: method.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) buildMethodContent(method *pythondocs.Method, class *pythondocs.Class, module *pythondocs.Module) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# %s.%s.%s\n", module.Name, class.Name, method.Name))
|
||||
|
||||
if method.Signature != "" {
|
||||
parts = append(parts, fmt.Sprintf("```python\n%s\n```", method.Signature))
|
||||
}
|
||||
|
||||
if method.Doc != "" {
|
||||
parts = append(parts, "\n"+method.Doc)
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) attributeToDocument(attr *pythondocs.Attribute, class *pythondocs.Class, module *pythondocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s.%s.%s\n\n", module.Name, class.Name, attr.Name)
|
||||
|
||||
if attr.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", attr.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"class": class.Name,
|
||||
"attr": attr.Name,
|
||||
"type": attr.Type,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(attr.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-attribute",
|
||||
Title: fmt.Sprintf("%s.%s.%s - Python", module.Name, class.Name, attr.Name),
|
||||
Content: content.String(),
|
||||
URL: attr.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) functionToDocument(fn *pythondocs.Function, module *pythondocs.Module, sourceName string) *Document {
|
||||
content := s.buildFunctionContent(fn, module)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"function": fn.Name,
|
||||
"qual_name": fn.QualName,
|
||||
"signature": fn.Signature,
|
||||
"is_async": fn.IsAsync,
|
||||
"is_generator": fn.IsGenerator,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fn.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-function",
|
||||
Title: fmt.Sprintf("%s.%s - Python", module.Name, fn.Name),
|
||||
Content: content,
|
||||
URL: fn.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) buildFunctionContent(fn *pythondocs.Function, module *pythondocs.Module) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# %s.%s\n", module.Name, fn.Name))
|
||||
|
||||
if fn.Signature != "" {
|
||||
parts = append(parts, fmt.Sprintf("```python\n%s\n```", fn.Signature))
|
||||
}
|
||||
|
||||
if fn.Doc != "" {
|
||||
parts = append(parts, "\n"+fn.Doc)
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) exceptionToDocument(exc *pythondocs.Exception, module *pythondocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s.%s\n\n", module.Name, exc.Name)
|
||||
|
||||
if exc.Signature != "" {
|
||||
fmt.Fprintf(&content, "```python\n%s\n```\n", exc.Signature)
|
||||
}
|
||||
|
||||
if exc.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", exc.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"exception": exc.Name,
|
||||
"qual_name": exc.QualName,
|
||||
"bases": exc.Bases,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(exc.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-exception",
|
||||
Title: fmt.Sprintf("%s.%s - Python", module.Name, exc.Name),
|
||||
Content: content.String(),
|
||||
URL: exc.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PythonDocsScraper) dataToDocument(data *pythondocs.Data, module *pythondocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s.%s\n\n", module.Name, data.Name)
|
||||
|
||||
if data.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", data.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"data": data.Name,
|
||||
"type": data.Type,
|
||||
"value": data.Value,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(data.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "python-data",
|
||||
Title: fmt.Sprintf("%s.%s - Python", module.Name, data.Name),
|
||||
Content: content.String(),
|
||||
URL: data.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,214 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/reactdocs"
|
||||
)
|
||||
|
||||
type ReactDocsScraper struct {
|
||||
config *Config
|
||||
parser *reactdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewReactDocsScraper(config *Config) *ReactDocsScraper {
|
||||
return &ReactDocsScraper{
|
||||
config: config,
|
||||
parser: reactdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for React docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
ref, err := s.parser.ParseReferencePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse reference: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.referenceToDocument(ref, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, hook := range ref.Hooks {
|
||||
doc := s.hookToDocument(hook, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, comp := range ref.Components {
|
||||
doc := s.componentToDocument(comp, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, api := range ref.APIs {
|
||||
doc := s.apiToDocument(api, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) referenceToDocument(ref *reactdocs.Reference, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# React API Reference\n\n")
|
||||
fmt.Fprintf(&content, "Hooks: %d, Components: %d, APIs: %d\n", len(ref.Hooks), len(ref.Components), len(ref.APIs))
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(ref.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "react-reference",
|
||||
Title: "React API Reference",
|
||||
Content: content.String(),
|
||||
URL: ref.DocURL,
|
||||
Metadata: map[string]interface{}{"doc_type": "react-reference"},
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) hookToDocument(hook *reactdocs.Hook, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", hook.Name)
|
||||
if hook.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", hook.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", hook.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": hook.Name,
|
||||
"category": hook.Category,
|
||||
"doc_url": hook.DocURL,
|
||||
"doc_type": "react-hook",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(hook.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "react-hook",
|
||||
Title: hook.Name,
|
||||
Content: content.String(),
|
||||
URL: hook.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) componentToDocument(comp *reactdocs.Component, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# <%s />\n\n", comp.Name)
|
||||
fmt.Fprintf(&content, "%s\n", comp.Doc)
|
||||
|
||||
if len(comp.Props) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Props\n")
|
||||
for _, p := range comp.Props {
|
||||
fmt.Fprintf(&content, "- `%s: %s`\n", p.Name, p.Type)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": comp.Name,
|
||||
"doc_url": comp.DocURL,
|
||||
"doc_type": "react-component",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(comp.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "react-component",
|
||||
Title: comp.Name,
|
||||
Content: content.String(),
|
||||
URL: comp.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ReactDocsScraper) apiToDocument(api *reactdocs.API, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s()\n\n", api.Name)
|
||||
if api.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", api.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", api.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": api.Name,
|
||||
"doc_url": api.DocURL,
|
||||
"doc_type": "react-api",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(api.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "react-api",
|
||||
Title: api.Name,
|
||||
Content: content.String(),
|
||||
URL: api.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,563 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/rustdocs"
|
||||
)
|
||||
|
||||
type RustDocsScraper struct {
|
||||
config *Config
|
||||
parser *rustdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewRustDocsScraper(config *Config) *RustDocsScraper {
|
||||
return &RustDocsScraper{
|
||||
config: config,
|
||||
parser: rustdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Rust docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
crate, err := s.parser.ParseCratePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse crate: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.crateToDocument(crate, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, m := range crate.Modules {
|
||||
doc := s.moduleToDocument(m, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, st := range crate.Structs {
|
||||
doc := s.structToDocument(st, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, e := range crate.Enums {
|
||||
doc := s.enumToDocument(e, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, t := range crate.Traits {
|
||||
doc := s.traitToDocument(t, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, f := range crate.Functions {
|
||||
doc := s.funcToDocument(f, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, m := range crate.Macros {
|
||||
doc := s.macroToDocument(m, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, c := range crate.Constants {
|
||||
doc := s.constToDocument(c, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, st := range crate.Statics {
|
||||
doc := s.staticToDocument(st, crate, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) crateToDocument(crate *rustdocs.Crate, sourceName string) *Document {
|
||||
content := s.buildCrateContent(crate)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": crate.Name,
|
||||
"version": crate.Version,
|
||||
"repository": crate.Repository,
|
||||
"license": crate.License,
|
||||
"doc_url": crate.DocURL,
|
||||
"module_count": len(crate.Modules),
|
||||
"struct_count": len(crate.Structs),
|
||||
"enum_count": len(crate.Enums),
|
||||
"trait_count": len(crate.Traits),
|
||||
"function_count": len(crate.Functions),
|
||||
"macro_count": len(crate.Macros),
|
||||
"constant_count": len(crate.Constants),
|
||||
"static_count": len(crate.Statics),
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(crate.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-crate",
|
||||
Title: fmt.Sprintf("%s - Rust", crate.Name),
|
||||
Content: content,
|
||||
URL: crate.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) buildCrateContent(crate *rustdocs.Crate) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# Crate %s\n", crate.Name))
|
||||
|
||||
if crate.Version != "" {
|
||||
parts = append(parts, fmt.Sprintf("Version: %s\n", crate.Version))
|
||||
}
|
||||
|
||||
if crate.Description != "" {
|
||||
parts = append(parts, crate.Description)
|
||||
}
|
||||
|
||||
if len(crate.Modules) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Modules (%d)\n", len(crate.Modules)))
|
||||
for _, m := range crate.Modules {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Structs) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Structs (%d)\n", len(crate.Structs)))
|
||||
for _, st := range crate.Structs {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", st.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Enums) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Enums (%d)\n", len(crate.Enums)))
|
||||
for _, e := range crate.Enums {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", e.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Traits) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Traits (%d)\n", len(crate.Traits)))
|
||||
for _, t := range crate.Traits {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", t.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Functions) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Functions (%d)\n", len(crate.Functions)))
|
||||
for _, f := range crate.Functions {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", f.Name))
|
||||
}
|
||||
}
|
||||
|
||||
if len(crate.Macros) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n## Macros (%d)\n", len(crate.Macros)))
|
||||
for _, m := range crate.Macros {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) moduleToDocument(m *rustdocs.Module, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Module %s::%s\n\n", crate.Name, m.Name)
|
||||
|
||||
if m.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", m.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"module": m.Name,
|
||||
"path": m.Path,
|
||||
"is_experimental": m.IsExperimental,
|
||||
"kind": "module",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(m.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-module",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, m.Name),
|
||||
Content: content.String(),
|
||||
URL: m.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) structToDocument(st *rustdocs.Struct, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
content := s.buildStructContent(st, crate)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"struct": st.Name,
|
||||
"path": st.Path,
|
||||
"is_experimental": st.IsExperimental,
|
||||
"kind": "struct",
|
||||
"declaration": st.Declaration,
|
||||
}
|
||||
|
||||
fieldsJSON, _ := json.Marshal(st.Fields)
|
||||
metadata["fields"] = string(fieldsJSON)
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(st.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-struct",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, st.Name),
|
||||
Content: content,
|
||||
URL: st.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) buildStructContent(st *rustdocs.Struct, crate *rustdocs.Crate) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# struct %s::%s\n", crate.Name, st.Name))
|
||||
|
||||
if st.Declaration != "" {
|
||||
parts = append(parts, fmt.Sprintf("```rust\n%s\n```", st.Declaration))
|
||||
}
|
||||
|
||||
if st.Doc != "" {
|
||||
parts = append(parts, "\n"+st.Doc)
|
||||
}
|
||||
|
||||
if len(st.Fields) > 0 {
|
||||
parts = append(parts, "\n### Fields\n")
|
||||
for _, f := range st.Fields {
|
||||
if f.Doc != "" {
|
||||
parts = append(parts, fmt.Sprintf("- `%s: %s` - %s", f.Name, f.Type, f.Doc))
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("- `%s: %s`", f.Name, f.Type))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(st.Methods) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Methods (%d)\n", len(st.Methods)))
|
||||
for _, m := range st.Methods {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Name))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) enumToDocument(e *rustdocs.Enum, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
content := s.buildEnumContent(e, crate)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"enum": e.Name,
|
||||
"path": e.Path,
|
||||
"is_experimental": e.IsExperimental,
|
||||
"kind": "enum",
|
||||
"declaration": e.Declaration,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(e.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-enum",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, e.Name),
|
||||
Content: content,
|
||||
URL: e.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) buildEnumContent(e *rustdocs.Enum, crate *rustdocs.Crate) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# enum %s::%s\n", crate.Name, e.Name))
|
||||
|
||||
if e.Declaration != "" {
|
||||
parts = append(parts, fmt.Sprintf("```rust\n%s\n```", e.Declaration))
|
||||
}
|
||||
|
||||
if e.Doc != "" {
|
||||
parts = append(parts, "\n"+e.Doc)
|
||||
}
|
||||
|
||||
if len(e.Variants) > 0 {
|
||||
parts = append(parts, "\n### Variants\n")
|
||||
for _, v := range e.Variants {
|
||||
if v.Doc != "" {
|
||||
parts = append(parts, fmt.Sprintf("- `%s` - %s", v.Name, v.Doc))
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", v.Name))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) traitToDocument(t *rustdocs.Trait, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
content := s.buildTraitContent(t, crate)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"trait": t.Name,
|
||||
"path": t.Path,
|
||||
"is_experimental": t.IsExperimental,
|
||||
"kind": "trait",
|
||||
"declaration": t.Declaration,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(t.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-trait",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, t.Name),
|
||||
Content: content,
|
||||
URL: t.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) buildTraitContent(t *rustdocs.Trait, crate *rustdocs.Crate) string {
|
||||
var parts []string
|
||||
|
||||
parts = append(parts, fmt.Sprintf("# trait %s::%s\n", crate.Name, t.Name))
|
||||
|
||||
if t.Declaration != "" {
|
||||
parts = append(parts, fmt.Sprintf("```rust\n%s\n```", t.Declaration))
|
||||
}
|
||||
|
||||
if t.Doc != "" {
|
||||
parts = append(parts, "\n"+t.Doc)
|
||||
}
|
||||
|
||||
if len(t.Methods) > 0 {
|
||||
parts = append(parts, fmt.Sprintf("\n### Required Methods (%d)\n", len(t.Methods)))
|
||||
for _, m := range t.Methods {
|
||||
parts = append(parts, fmt.Sprintf("- `%s`", m.Signature))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) funcToDocument(f *rustdocs.Func, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# fn %s::%s\n\n", crate.Name, f.Name)
|
||||
|
||||
if f.Signature != "" {
|
||||
fmt.Fprintf(&content, "```rust\n%s\n```\n", f.Signature)
|
||||
}
|
||||
|
||||
if f.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", f.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"function": f.Name,
|
||||
"path": f.Path,
|
||||
"is_experimental": f.IsExperimental,
|
||||
"is_unsafe": f.IsUnsafe,
|
||||
"is_const": f.IsConst,
|
||||
"is_async": f.IsAsync,
|
||||
"kind": "fn",
|
||||
"signature": f.Signature,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(f.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-fn",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, f.Name),
|
||||
Content: content.String(),
|
||||
URL: f.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) macroToDocument(m *rustdocs.Macro, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# macro %s::%s!\n\n", crate.Name, m.Name)
|
||||
|
||||
if m.Signature != "" {
|
||||
fmt.Fprintf(&content, "```rust\n%s\n```\n", m.Signature)
|
||||
}
|
||||
|
||||
if m.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", m.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"macro": m.Name,
|
||||
"path": m.Path,
|
||||
"is_experimental": m.IsExperimental,
|
||||
"kind": "macro",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(m.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-macro",
|
||||
Title: fmt.Sprintf("%s::%s! - Rust", crate.Name, m.Name),
|
||||
Content: content.String(),
|
||||
URL: m.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) constToDocument(c *rustdocs.Const, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# const %s::%s\n\n", crate.Name, c.Name)
|
||||
|
||||
if c.Type != "" {
|
||||
fmt.Fprintf(&content, "Type: `%s`\n", c.Type)
|
||||
}
|
||||
if c.Value != "" {
|
||||
fmt.Fprintf(&content, "Value: `%s`\n", c.Value)
|
||||
}
|
||||
|
||||
if c.Doc != "" {
|
||||
fmt.Fprintf(&content, "\n%s\n", c.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"const": c.Name,
|
||||
"path": c.Path,
|
||||
"is_experimental": c.IsExperimental,
|
||||
"type": c.Type,
|
||||
"value": c.Value,
|
||||
"kind": "const",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(c.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-const",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, c.Name),
|
||||
Content: content.String(),
|
||||
URL: c.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *RustDocsScraper) staticToDocument(st *rustdocs.Static, crate *rustdocs.Crate, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# static %s::%s\n\n", crate.Name, st.Name)
|
||||
|
||||
if st.Type != "" {
|
||||
fmt.Fprintf(&content, "Type: `%s`\n", st.Type)
|
||||
}
|
||||
if st.IsMutable {
|
||||
fmt.Fprintf(&content, "Mutability: mutable\n")
|
||||
}
|
||||
|
||||
if st.Doc != "" {
|
||||
fmt.Fprintf(&content, "\n%s\n", st.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"crate": crate.Name,
|
||||
"static": st.Name,
|
||||
"path": st.Path,
|
||||
"is_experimental": st.IsExperimental,
|
||||
"is_mutable": st.IsMutable,
|
||||
"type": st.Type,
|
||||
"kind": "static",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(st.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "rust-static",
|
||||
Title: fmt.Sprintf("%s::%s - Rust", crate.Name, st.Name),
|
||||
Content: content.String(),
|
||||
URL: st.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,129 @@
|
||||
// Package scraper provides document scraping capabilities for various sources.
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
)
|
||||
|
||||
// SourceType identifies the kind of documentation source a Source points at.
// NewScraper switches on it to choose the scraper implementation.
type SourceType string

// Supported source types. The string values are what a Source's "type" field
// holds.
const (
	// General-purpose sources.
	SourceTypeWeb     SourceType = "url"
	SourceTypeGitHub  SourceType = "github"
	SourceTypeOpenAPI SourceType = "openapi"
	SourceTypeLocal   SourceType = "local"

	// Documentation-site specific sources.
	SourceTypeGoDocs         SourceType = "godocs"
	SourceTypeRustDocs       SourceType = "rustdocs"
	SourceTypePythonDocs     SourceType = "pythondocs"
	SourceTypeJavaDocs       SourceType = "javadocs"
	SourceTypeSpringDocs     SourceType = "springdocs"
	SourceTypeSpringAIDocs   SourceType = "springaidocs"
	SourceTypeTSDocs         SourceType = "tsdocs"
	SourceTypeReactDocs      SourceType = "reactdocs"
	SourceTypeVueDocs        SourceType = "vuedocs"
	SourceTypeNuxtDocs       SourceType = "nuxtdocs"
	SourceTypeMCPDocs        SourceType = "mcpdocs"
	SourceTypeDockerDocs     SourceType = "dockerdocs"
	SourceTypeCloudflareDocs SourceType = "cloudflaredocs"
	SourceTypeAstroDocs      SourceType = "astrodocs"
)
|
||||
|
||||
// Source represents a documentation source to scrape.
|
||||
type Source struct {
|
||||
Name string `yaml:"name"`
|
||||
Type SourceType `yaml:"type"`
|
||||
URL string `yaml:"url,omitempty"`
|
||||
Repo string `yaml:"repo,omitempty"`
|
||||
Branch string `yaml:"branch,omitempty"`
|
||||
Path string `yaml:"path,omitempty"`
|
||||
Include []string `yaml:"include,omitempty"`
|
||||
Exclude []string `yaml:"exclude,omitempty"`
|
||||
Schedule string `yaml:"schedule,omitempty"`
|
||||
}
|
||||
|
||||
// Document is a single scraped unit of documentation, serialized as JSON.
type Document struct {
	ID     string `json:"id"`
	Source string `json:"source"` // name of the Source it was scraped from
	Type   string `json:"type"`   // scraper-specific kind, e.g. "rust-struct"
	Title  string `json:"title"`
	// Content is the rendered text of the document.
	Content string `json:"content"`
	URL     string `json:"url,omitempty"`
	// Metadata holds scraper-specific key/value details.
	Metadata map[string]interface{} `json:"metadata"`
	// Hash is a digest of the document produced by the scraper.
	Hash      string    `json:"hash"`
	Timestamp time.Time `json:"timestamp"`
}
|
||||
|
||||
// Config carries the settings shared by all scrapers, as loaded from YAML.
type Config struct {
	// UserAgent is sent as the User-Agent of outgoing requests.
	UserAgent string `yaml:"user_agent"`
	// Timeout bounds each HTTP request.
	Timeout time.Duration `yaml:"timeout"`
	// RetryCount and RetryDelay are not read by the scrapers in this part of
	// the package; presumably they configure retries elsewhere (TODO confirm).
	RetryCount int           `yaml:"retry_count"`
	RetryDelay time.Duration `yaml:"retry_delay"`
	// Concurrency is the web crawler's parallelism; it is only applied
	// together with a positive RateLimit.
	Concurrency int `yaml:"concurrency"`
	// RateLimit is the delay the web crawler inserts between requests.
	RateLimit time.Duration `yaml:"rate_limit"`
	// MaxDepth is passed to the web crawler as its maximum link depth.
	MaxDepth int `yaml:"max_depth"`
	// CacheDir, when non-empty, enables the web crawler's on-disk cache.
	CacheDir string `yaml:"cache_dir"`
}
|
||||
|
||||
// Scraper defines the interface for document scrapers.
|
||||
type Scraper interface {
|
||||
// Scrape fetches and parses documents from the source.
|
||||
Scrape(ctx context.Context, source *Source) ([]*Document, error)
|
||||
|
||||
// DetectChanges checks if the source has changed since last scrape.
|
||||
DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error)
|
||||
}
|
||||
|
||||
// NewScraper creates a new scraper for the given source type.
|
||||
func NewScraper(sourceType SourceType, config *Config) Scraper {
|
||||
switch sourceType {
|
||||
case SourceTypeWeb:
|
||||
return NewWebScraper(config)
|
||||
case SourceTypeGitHub:
|
||||
return NewGitHubScraper(config)
|
||||
case SourceTypeOpenAPI:
|
||||
return NewOpenAPIScraper(config)
|
||||
case SourceTypeLocal:
|
||||
return NewLocalScraper(config)
|
||||
case SourceTypeGoDocs:
|
||||
return NewGoDocsScraper(config)
|
||||
case SourceTypeRustDocs:
|
||||
return NewRustDocsScraper(config)
|
||||
case SourceTypePythonDocs:
|
||||
return NewPythonDocsScraper(config)
|
||||
case SourceTypeJavaDocs:
|
||||
return NewJavaDocsScraper(config)
|
||||
case SourceTypeSpringDocs:
|
||||
return NewSpringDocsScraper(config)
|
||||
case SourceTypeTSDocs:
|
||||
return NewTSDocsScraper(config)
|
||||
case SourceTypeReactDocs:
|
||||
return NewReactDocsScraper(config)
|
||||
case SourceTypeVueDocs:
|
||||
return NewVueDocsScraper(config)
|
||||
case SourceTypeNuxtDocs:
|
||||
return NewNuxtDocsScraper(config)
|
||||
case SourceTypeMCPDocs:
|
||||
return NewMCPDocsScraper(config)
|
||||
case SourceTypeDockerDocs:
|
||||
return NewDockerDocsScraper(config)
|
||||
case SourceTypeCloudflareDocs:
|
||||
return NewCloudflareDocsScraper(config)
|
||||
case SourceTypeAstroDocs:
|
||||
return NewAstroDocsScraper(config)
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// DetectSourceType determines the source type from a URL or path.
|
||||
func DetectSourceType(input string) SourceType {
|
||||
// TODO: Implement detection logic
|
||||
if len(input) > 4 && input[:4] == "http" {
|
||||
return SourceTypeWeb
|
||||
}
|
||||
return SourceTypeLocal
|
||||
}
|
||||
@@ -0,0 +1,221 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/springdocs"
|
||||
)
|
||||
|
||||
type SpringDocsScraper struct {
|
||||
config *Config
|
||||
parser *springdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewSpringDocsScraper(config *Config) *SpringDocsScraper {
|
||||
return &SpringDocsScraper{
|
||||
config: config,
|
||||
parser: springdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Spring docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
module, err := s.parser.ParseModulePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse module: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.moduleToDocument(module, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, class := range module.Classes {
|
||||
doc := s.classToDocument(class, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, prop := range module.Properties {
|
||||
doc := s.propertyToDocument(prop, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, guide := range module.Guides {
|
||||
doc := s.guideToDocument(guide, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) moduleToDocument(module *springdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", module.Name)
|
||||
fmt.Fprintf(&content, "%s\n", module.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"version": module.Version,
|
||||
"doc_url": module.DocURL,
|
||||
"doc_type": "spring-module",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(module.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "spring-module",
|
||||
Title: module.Name,
|
||||
Content: content.String(),
|
||||
URL: module.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) classToDocument(class *springdocs.Class, module *springdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", class.QualifiedName)
|
||||
fmt.Fprintf(&content, "%s\n", class.Doc)
|
||||
|
||||
if len(class.Methods) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Methods\n")
|
||||
for _, m := range class.Methods {
|
||||
fmt.Fprintf(&content, "- `%s`\n", m.Signature)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"qualified_name": class.QualifiedName,
|
||||
"kind": class.Kind,
|
||||
"doc_url": class.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(class.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "spring-class",
|
||||
Title: class.QualifiedName,
|
||||
Content: content.String(),
|
||||
URL: class.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) propertyToDocument(prop *springdocs.Property, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", prop.Name)
|
||||
fmt.Fprintf(&content, "Type: %s\n", prop.Type)
|
||||
if prop.Default != "" {
|
||||
fmt.Fprintf(&content, "Default: `%s`\n", prop.Default)
|
||||
}
|
||||
fmt.Fprintf(&content, "\n%s\n", prop.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"property": prop.Name,
|
||||
"type": prop.Type,
|
||||
"default": prop.Default,
|
||||
"doc_url": prop.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(prop.Name),
|
||||
Source: sourceName,
|
||||
Type: "spring-property",
|
||||
Title: prop.Name,
|
||||
Content: content.String(),
|
||||
URL: prop.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SpringDocsScraper) guideToDocument(guide *springdocs.Guide, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", guide.Title)
|
||||
fmt.Fprintf(&content, "%s\n", guide.Description)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"title": guide.Title,
|
||||
"doc_url": guide.DocURL,
|
||||
"level": guide.Level,
|
||||
"doc_type": "spring-guide",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(guide.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "spring-guide",
|
||||
Title: guide.Title,
|
||||
Content: content.String(),
|
||||
URL: guide.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,261 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/tsdocs"
|
||||
)
|
||||
|
||||
type TSDocsScraper struct {
|
||||
config *Config
|
||||
parser *tsdocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewTSDocsScraper(config *Config) *TSDocsScraper {
|
||||
return &TSDocsScraper{
|
||||
config: config,
|
||||
parser: tsdocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for TypeScript docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
module, err := s.parser.ParseModulePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse module: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.moduleToDocument(module, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, iface := range module.Interfaces {
|
||||
doc := s.interfaceToDocument(iface, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, fn := range module.Functions {
|
||||
doc := s.functionToDocument(fn, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, class := range module.Classes {
|
||||
doc := s.classToDocument(class, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, ta := range module.Types {
|
||||
doc := s.typeAliasToDocument(ta, module, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) moduleToDocument(module *tsdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", module.Name)
|
||||
fmt.Fprintf(&content, "%s\n", module.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"version": module.Version,
|
||||
"doc_url": module.DocURL,
|
||||
"doc_type": "ts-module",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(module.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-module",
|
||||
Title: module.Name,
|
||||
Content: content.String(),
|
||||
URL: module.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) interfaceToDocument(iface *tsdocs.Interface, module *tsdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s (interface)\n\n", iface.Name)
|
||||
if iface.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n\n", iface.Doc)
|
||||
}
|
||||
if len(iface.Properties) > 0 {
|
||||
fmt.Fprintf(&content, "## Properties\n")
|
||||
for _, p := range iface.Properties {
|
||||
fmt.Fprintf(&content, "- `%s: %s`\n", p.Name, p.Type)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": iface.Name,
|
||||
"doc_url": iface.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(iface.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-interface",
|
||||
Title: iface.Name,
|
||||
Content: content.String(),
|
||||
URL: iface.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) functionToDocument(fn *tsdocs.Function, module *tsdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s()\n\n", fn.Name)
|
||||
if fn.Signature != "" {
|
||||
fmt.Fprintf(&content, "```typescript\n%s\n```\n\n", fn.Signature)
|
||||
}
|
||||
if fn.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", fn.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": fn.Name,
|
||||
"return_type": fn.ReturnType,
|
||||
"doc_url": fn.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(fn.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-function",
|
||||
Title: fn.Name,
|
||||
Content: content.String(),
|
||||
URL: fn.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) classToDocument(class *tsdocs.Class, module *tsdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s (class)\n\n", class.Name)
|
||||
if class.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n\n", class.Doc)
|
||||
}
|
||||
if len(class.Methods) > 0 {
|
||||
fmt.Fprintf(&content, "## Methods\n")
|
||||
for _, m := range class.Methods {
|
||||
fmt.Fprintf(&content, "- `%s()`\n", m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": class.Name,
|
||||
"doc_url": class.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(class.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-class",
|
||||
Title: class.Name,
|
||||
Content: content.String(),
|
||||
URL: class.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *TSDocsScraper) typeAliasToDocument(ta *tsdocs.TypeAlias, module *tsdocs.Module, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s (type)\n\n", ta.Name)
|
||||
fmt.Fprintf(&content, "```typescript\ntype %s = %s\n```\n\n", ta.Name, ta.Type)
|
||||
if ta.Doc != "" {
|
||||
fmt.Fprintf(&content, "%s\n", ta.Doc)
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"module": module.Name,
|
||||
"name": ta.Name,
|
||||
"doc_url": ta.DocURL,
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(ta.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "ts-type",
|
||||
Title: ta.Name,
|
||||
Content: content.String(),
|
||||
URL: ta.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,244 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/pkg/vuedocs"
|
||||
)
|
||||
|
||||
type VueDocsScraper struct {
|
||||
config *Config
|
||||
parser *vuedocs.Parser
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewVueDocsScraper(config *Config) *VueDocsScraper {
|
||||
return &VueDocsScraper{
|
||||
config: config,
|
||||
parser: vuedocs.NewParser(),
|
||||
client: &http.Client{
|
||||
Timeout: config.Timeout,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
|
||||
if source.URL == "" {
|
||||
return nil, fmt.Errorf("URL is required for Vue docs scraper")
|
||||
}
|
||||
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
ref, err := s.parser.ParseReferencePage(html, source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse reference: %w", err)
|
||||
}
|
||||
|
||||
mainDoc := s.referenceToDocument(ref, source.Name)
|
||||
documents = append(documents, mainDoc)
|
||||
|
||||
for _, comp := range ref.Composition {
|
||||
doc := s.compositionToDocument(comp, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, dir := range ref.Directives {
|
||||
doc := s.directiveToDocument(dir, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, comp := range ref.Components {
|
||||
doc := s.componentToDocument(comp, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
for _, api := range ref.GlobalAPI {
|
||||
doc := s.globalAPIToDocument(api, source.Name)
|
||||
documents = append(documents, doc)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
html, err := s.fetchPage(ctx, source.URL)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := s.generateHash(html)
|
||||
changed := hash != lastHash
|
||||
|
||||
return changed, hash, nil
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) referenceToDocument(ref *vuedocs.Reference, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# Vue API Reference\n\n")
|
||||
fmt.Fprintf(&content, "Composition API: %d, Directives: %d, Components: %d\n", len(ref.Composition), len(ref.Directives), len(ref.Components))
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(ref.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "vue-reference",
|
||||
Title: "Vue API Reference",
|
||||
Content: content.String(),
|
||||
URL: ref.DocURL,
|
||||
Metadata: map[string]interface{}{"doc_type": "vue-reference"},
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) compositionToDocument(comp *vuedocs.Composition, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", comp.Name)
|
||||
if comp.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", comp.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", comp.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": comp.Name,
|
||||
"kind": comp.Kind,
|
||||
"doc_url": comp.DocURL,
|
||||
"doc_type": "vue-composition",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(comp.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "vue-composition",
|
||||
Title: comp.Name,
|
||||
Content: content.String(),
|
||||
URL: comp.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) directiveToDocument(dir *vuedocs.Directive, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", dir.Name)
|
||||
fmt.Fprintf(&content, "%s\n", dir.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": dir.Name,
|
||||
"doc_url": dir.DocURL,
|
||||
"doc_type": "vue-directive",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(dir.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "vue-directive",
|
||||
Title: dir.Name,
|
||||
Content: content.String(),
|
||||
URL: dir.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) componentToDocument(comp *vuedocs.Component, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# <%s />\n\n", comp.Name)
|
||||
fmt.Fprintf(&content, "%s\n", comp.Doc)
|
||||
|
||||
if len(comp.Props) > 0 {
|
||||
fmt.Fprintf(&content, "\n## Props\n")
|
||||
for _, p := range comp.Props {
|
||||
fmt.Fprintf(&content, "- `%s: %s`\n", p.Name, p.Type)
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": comp.Name,
|
||||
"doc_url": comp.DocURL,
|
||||
"doc_type": "vue-component",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(comp.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "vue-component",
|
||||
Title: comp.Name,
|
||||
Content: content.String(),
|
||||
URL: comp.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *VueDocsScraper) globalAPIToDocument(api *vuedocs.API, sourceName string) *Document {
|
||||
var content strings.Builder
|
||||
fmt.Fprintf(&content, "# %s\n\n", api.Name)
|
||||
if api.Signature != "" {
|
||||
fmt.Fprintf(&content, "```javascript\n%s\n```\n\n", api.Signature)
|
||||
}
|
||||
fmt.Fprintf(&content, "%s\n", api.Doc)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"name": api.Name,
|
||||
"category": api.Category,
|
||||
"doc_url": api.DocURL,
|
||||
"doc_type": "vue-api",
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(api.DocURL),
|
||||
Source: sourceName,
|
||||
Type: "vue-api",
|
||||
Title: api.Name,
|
||||
Content: content.String(),
|
||||
URL: api.DocURL,
|
||||
Metadata: metadata,
|
||||
Hash: s.generateHash(content.String()),
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,296 @@
|
||||
package scraper
|
||||
|
||||
import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)
|
||||
|
||||
// WebScraper scrapes documentation from web URLs.
|
||||
type WebScraper struct {
|
||||
config *Config
|
||||
}
|
||||
|
||||
// NewWebScraper creates a new web scraper.
|
||||
func NewWebScraper(config *Config) *WebScraper {
|
||||
return &WebScraper{config: config}
|
||||
}
|
||||
|
||||
// Scrape fetches and parses documents from a web source.
|
||||
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
visited := make(map[string]bool)
|
||||
|
||||
// Parse base URL for domain restrictions
|
||||
baseURL, err := url.Parse(source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid URL: %w", err)
|
||||
}
|
||||
|
||||
// Create Colly collector
|
||||
c := colly.NewCollector(
|
||||
colly.AllowedDomains(baseURL.Host),
|
||||
colly.MaxDepth(s.config.MaxDepth),
|
||||
colly.Async(true),
|
||||
colly.UserAgent(s.config.UserAgent),
|
||||
)
|
||||
|
||||
// Set rate limiting
|
||||
if s.config.RateLimit > 0 {
|
||||
c.Limit(&colly.LimitRule{
|
||||
DomainGlob: "*",
|
||||
Parallelism: s.config.Concurrency,
|
||||
Delay: s.config.RateLimit,
|
||||
})
|
||||
}
|
||||
|
||||
// Set timeout
|
||||
if s.config.Timeout > 0 {
|
||||
c.SetRequestTimeout(s.config.Timeout)
|
||||
}
|
||||
|
||||
// Enable caching if cache directory is set
|
||||
if s.config.CacheDir != "" {
|
||||
c.CacheDir = s.config.CacheDir
|
||||
}
|
||||
|
||||
// Handle errors
|
||||
c.OnError(func(r *colly.Response, err error) {
|
||||
fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
|
||||
})
|
||||
|
||||
// Extract content from pages
|
||||
c.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
pageURL := e.Request.URL.String()
|
||||
|
||||
// Skip if already visited
|
||||
if visited[pageURL] {
|
||||
return
|
||||
}
|
||||
visited[pageURL] = true
|
||||
|
||||
// Check include/exclude patterns
|
||||
if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
|
||||
return
|
||||
}
|
||||
|
||||
// Extract title
|
||||
title := e.ChildText("title")
|
||||
if title == "" {
|
||||
title = e.ChildText("h1")
|
||||
}
|
||||
|
||||
// Extract main content
|
||||
content := s.extractContent(e)
|
||||
|
||||
// Skip if content is too short
|
||||
if len(content) < 100 {
|
||||
return
|
||||
}
|
||||
|
||||
// Generate hash for change detection
|
||||
hash := s.generateHash(content)
|
||||
|
||||
// Extract metadata
|
||||
metadata := map[string]interface{}{
|
||||
"headings": s.extractHeadings(e),
|
||||
"links": s.extractLinks(e),
|
||||
"images": s.extractImages(e),
|
||||
"description": e.ChildAttr(`meta[name="description"]`, "content"),
|
||||
}
|
||||
|
||||
doc := &Document{
|
||||
ID: generateDocID(pageURL),
|
||||
Source: source.Name,
|
||||
Type: "html",
|
||||
Title: strings.TrimSpace(title),
|
||||
Content: content,
|
||||
URL: pageURL,
|
||||
Metadata: metadata,
|
||||
Hash: hash,
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
|
||||
documents = append(documents, doc)
|
||||
})
|
||||
|
||||
// Follow links
|
||||
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||
link := e.Attr("href")
|
||||
absoluteURL := e.Request.AbsoluteURL(link)
|
||||
|
||||
// Skip if already visited
|
||||
if visited[absoluteURL] {
|
||||
return
|
||||
}
|
||||
|
||||
// Check include/exclude patterns
|
||||
if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
|
||||
return
|
||||
}
|
||||
|
||||
c.Visit(absoluteURL)
|
||||
})
|
||||
|
||||
// Start scraping
|
||||
if err := c.Visit(source.URL); err != nil {
|
||||
return nil, fmt.Errorf("failed to start scraping: %w", err)
|
||||
}
|
||||
|
||||
// Wait for async scraping to complete
|
||||
c.Wait()
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if the web source has changed.
|
||||
func (s *WebScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
// Quick check by fetching just the main page
|
||||
c := colly.NewCollector(
|
||||
colly.UserAgent(s.config.UserAgent),
|
||||
)
|
||||
c.SetRequestTimeout(s.config.Timeout)
|
||||
|
||||
var content string
|
||||
c.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
content = s.extractContent(e)
|
||||
})
|
||||
|
||||
if err := c.Visit(source.URL); err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
currentHash := s.generateHash(content)
|
||||
changed := currentHash != lastHash
|
||||
|
||||
return changed, currentHash, nil
|
||||
}
|
||||
|
||||
// extractContent extracts the main text content from a page.
|
||||
func (s *WebScraper) extractContent(e *colly.HTMLElement) string {
|
||||
// Try common content selectors
|
||||
selectors := []string{
|
||||
"article",
|
||||
"main",
|
||||
".content",
|
||||
".documentation",
|
||||
".docs",
|
||||
".markdown-body",
|
||||
"[role='main']",
|
||||
"#content",
|
||||
"#main",
|
||||
}
|
||||
|
||||
var content string
|
||||
for _, selector := range selectors {
|
||||
content = e.ChildText(selector)
|
||||
if len(content) > 200 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to body if no content found
|
||||
if content == "" {
|
||||
content = e.ChildText("body")
|
||||
}
|
||||
|
||||
// Clean up content
|
||||
content = cleanText(content)
|
||||
|
||||
return content
|
||||
}
|
||||
|
||||
// extractHeadings extracts heading structure.
|
||||
func (s *WebScraper) extractHeadings(e *colly.HTMLElement) []string {
|
||||
var headings []string
|
||||
e.ForEach("h1, h2, h3, h4, h5, h6", func(_ int, h *colly.HTMLElement) {
|
||||
text := strings.TrimSpace(h.Text)
|
||||
if text != "" {
|
||||
headings = append(headings, text)
|
||||
}
|
||||
})
|
||||
return headings
|
||||
}
|
||||
|
||||
// extractLinks extracts internal links.
|
||||
func (s *WebScraper) extractLinks(e *colly.HTMLElement) []string {
|
||||
var links []string
|
||||
seen := make(map[string]bool)
|
||||
e.ForEach("a[href]", func(_ int, a *colly.HTMLElement) {
|
||||
href := a.Attr("href")
|
||||
if href != "" && !seen[href] && !strings.HasPrefix(href, "#") {
|
||||
links = append(links, href)
|
||||
seen[href] = true
|
||||
}
|
||||
})
|
||||
return links
|
||||
}
|
||||
|
||||
// extractImages extracts image URLs.
|
||||
func (s *WebScraper) extractImages(e *colly.HTMLElement) []string {
|
||||
var images []string
|
||||
e.ForEach("img[src]", func(_ int, img *colly.HTMLElement) {
|
||||
src := img.Attr("src")
|
||||
if src != "" {
|
||||
images = append(images, src)
|
||||
}
|
||||
})
|
||||
return images
|
||||
}
|
||||
|
||||
// shouldInclude checks if a URL should be included based on patterns.
|
||||
func (s *WebScraper) shouldInclude(urlStr string, include, exclude []string) bool {
|
||||
// Check exclude patterns first
|
||||
for _, pattern := range exclude {
|
||||
matched, _ := regexp.MatchString(pattern, urlStr)
|
||||
if matched {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// If no include patterns, include all
|
||||
if len(include) == 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
// Check include patterns
|
||||
for _, pattern := range include {
|
||||
matched, _ := regexp.MatchString(pattern, urlStr)
|
||||
if matched {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// generateHash generates a SHA256 hash of content.
|
||||
func (s *WebScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
// whitespaceRunRE matches one or more consecutive whitespace characters. It
// is compiled once at package level because cleanText runs for every page.
var whitespaceRunRE = regexp.MustCompile(`\s+`)

// cleanText collapses every run of whitespace in text into a single space and
// trims leading and trailing whitespace.
func cleanText(text string) string {
	return strings.TrimSpace(whitespaceRunRE.ReplaceAllString(text, " "))
}
|
||||
|
||||
// generateDocID derives a document ID from urlStr: the first 12 bytes of its
// SHA-256 digest, hex-encoded into 24 characters. The same input always
// yields the same ID.
func generateDocID(urlStr string) string {
	const idBytes = 12

	hasher := sha256.New()
	hasher.Write([]byte(urlStr)) // hash.Hash.Write never returns an error
	return hex.EncodeToString(hasher.Sum(nil)[:idBytes])
}
|
||||
Reference in New Issue
Block a user