mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
195 lines
5.8 KiB
Go
195 lines
5.8 KiB
Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"net/url"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// GitHubScraper scrapes documentation from GitHub repositories.
|
|
type GitHubScraper struct {
|
|
config *Config
|
|
}
|
|
|
|
// NewGitHubScraper creates a new GitHub scraper.
|
|
func NewGitHubScraper(config *Config) *GitHubScraper {
|
|
return &GitHubScraper{config: config}
|
|
}
|
|
|
|
// Scrape clones and parses documents from a GitHub repository.
|
|
func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
|
if source == nil {
|
|
return nil, fmt.Errorf("source is required")
|
|
}
|
|
|
|
repoURL, repoName, err := s.resolveRepo(source)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("resolve github repository: %w", err)
|
|
}
|
|
|
|
tmpDir, err := os.MkdirTemp("", "devour-github-*")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("create temporary clone directory: %w", err)
|
|
}
|
|
defer os.RemoveAll(tmpDir)
|
|
|
|
cloneArgs := []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse", repoURL, tmpDir}
|
|
if branch := strings.TrimSpace(source.Branch); branch != "" {
|
|
cloneArgs = []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse", "--branch", branch, repoURL, tmpDir}
|
|
}
|
|
|
|
cmd := exec.CommandContext(ctx, "git", cloneArgs...)
|
|
output, err := cmd.CombinedOutput()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("git clone failed: %v (%s)", err, strings.TrimSpace(string(output)))
|
|
}
|
|
|
|
if len(source.Include) == 0 {
|
|
// Try sparse checkout for common docs locations to reduce clone and parse cost.
|
|
sparse := exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "set", "--skip-checks",
|
|
"docs", "doc", "src/routes", "website/docs", "packages/*/docs")
|
|
if sparseOut, sparseErr := sparse.CombinedOutput(); sparseErr != nil {
|
|
_ = sparseOut
|
|
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
|
|
}
|
|
} else {
|
|
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
|
|
}
|
|
|
|
localSource := &Source{
|
|
Name: coalesce(source.Name, repoName),
|
|
Type: SourceTypeLocal,
|
|
Path: tmpDir,
|
|
Include: append([]string(nil), source.Include...),
|
|
Exclude: append([]string(nil), source.Exclude...),
|
|
Schedule: source.Schedule,
|
|
}
|
|
|
|
if len(localSource.Include) == 0 {
|
|
localSource.Include = []string{
|
|
`(?i)(^|/)readme\.md$`,
|
|
`(?i)(^|/)docs?/`,
|
|
`(?i)\.md$`,
|
|
`(?i)\.mdx$`,
|
|
}
|
|
}
|
|
|
|
local := NewLocalScraper(s.config)
|
|
docs, err := local.Scrape(ctx, localSource)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("scrape repository docs: %w", err)
|
|
}
|
|
if len(docs) == 0 && len(source.Include) == 0 {
|
|
// Sparse patterns did not match this repository layout; retry full checkout.
|
|
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
|
|
docs, err = local.Scrape(ctx, localSource)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("scrape repository docs after sparse fallback: %w", err)
|
|
}
|
|
}
|
|
|
|
for _, doc := range docs {
|
|
if doc == nil {
|
|
continue
|
|
}
|
|
branchForURL := strings.TrimSpace(source.Branch)
|
|
if branchForURL == "" {
|
|
branchForURL = "HEAD"
|
|
}
|
|
if doc.Metadata == nil {
|
|
doc.Metadata = map[string]interface{}{}
|
|
}
|
|
if rawPath, ok := doc.Metadata["path"].(string); ok {
|
|
if relPath, relErr := filepath.Rel(tmpDir, rawPath); relErr == nil {
|
|
relPath = filepath.ToSlash(relPath)
|
|
relPath = strings.TrimPrefix(relPath, "./")
|
|
if relPath != "" && relPath != "." {
|
|
doc.URL = fmt.Sprintf("https://github.com/%s/blob/%s/%s", repoName, branchForURL, relPath)
|
|
doc.ID = generateDocID(doc.URL)
|
|
doc.Metadata["path"] = relPath
|
|
}
|
|
}
|
|
}
|
|
doc.Type = "github-document"
|
|
doc.Metadata["repo"] = repoName
|
|
doc.Metadata["repo_url"] = repoURL
|
|
doc.Metadata["source_type"] = "github"
|
|
}
|
|
return docs, nil
|
|
}
|
|
|
|
// DetectChanges checks if the repository has new commits.
|
|
func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
|
if source == nil {
|
|
return false, "", fmt.Errorf("source is required")
|
|
}
|
|
_, repoName, err := s.resolveRepo(source)
|
|
if err != nil {
|
|
return false, "", fmt.Errorf("resolve github repository: %w", err)
|
|
}
|
|
|
|
remote := "https://github.com/" + strings.TrimSuffix(repoName, ".git") + ".git"
|
|
branch := strings.TrimSpace(source.Branch)
|
|
if branch == "" {
|
|
branch = "HEAD"
|
|
}
|
|
|
|
cmd := exec.CommandContext(ctx, "git", "ls-remote", remote, branch)
|
|
output, err := cmd.Output()
|
|
if err != nil {
|
|
return false, "", fmt.Errorf("run git ls-remote for %s (%s): %w", remote, branch, err)
|
|
}
|
|
line := strings.TrimSpace(string(output))
|
|
if line == "" {
|
|
return false, "", fmt.Errorf("empty ls-remote output")
|
|
}
|
|
parts := strings.Fields(line)
|
|
if len(parts) == 0 {
|
|
return false, "", fmt.Errorf("unexpected ls-remote output")
|
|
}
|
|
hash := parts[0]
|
|
return hash != lastHash, hash, nil
|
|
}
|
|
|
|
func (s *GitHubScraper) resolveRepo(source *Source) (repoURL string, repoName string, err error) {
|
|
if strings.TrimSpace(source.Repo) != "" {
|
|
repoName = strings.Trim(strings.TrimSpace(source.Repo), "/")
|
|
repoName = strings.TrimSuffix(repoName, ".git")
|
|
return "https://github.com/" + repoName + ".git", repoName, nil
|
|
}
|
|
|
|
raw := strings.TrimSpace(source.URL)
|
|
if raw == "" {
|
|
return "", "", fmt.Errorf("github source requires repo or url")
|
|
}
|
|
|
|
u, err := url.Parse(raw)
|
|
if err != nil {
|
|
return "", "", fmt.Errorf("parse github url %q: %w", raw, err)
|
|
}
|
|
if !strings.Contains(strings.ToLower(u.Host), "github.com") {
|
|
return "", "", fmt.Errorf("not a github url: %s", raw)
|
|
}
|
|
parts := strings.Split(strings.Trim(u.Path, "/"), "/")
|
|
if len(parts) < 2 {
|
|
return "", "", fmt.Errorf("invalid github repo url: %s", raw)
|
|
}
|
|
repoName = parts[0] + "/" + strings.TrimSuffix(parts[1], ".git")
|
|
repoURL = "https://github.com/" + repoName + ".git"
|
|
return repoURL, repoName, nil
|
|
}
|
|
|
|
// coalesce picks a display name: primary when it is non-blank, otherwise the
// last path element of fallback (for "owner/repo" that is "repo"), otherwise
// the literal "github". Surrounding whitespace is ignored when deciding
// blankness and is stripped from the returned value, and a fallback with no
// usable last element (such as "/" or ".") is treated as blank.
func coalesce(primary, fallback string) string {
	if name := strings.TrimSpace(primary); name != "" {
		return name
	}
	if repo := strings.TrimSpace(fallback); repo != "" {
		base := filepath.Base(repo)
		// filepath.Base yields "." or a bare separator when there is no real
		// final element; neither makes a meaningful name.
		if base != "." && base != string(filepath.Separator) {
			return base
		}
	}
	return "github"
}
|