This commit is contained in:
Tomas Dvorak
2026-02-24 10:33:59 +01:00
parent 409acd2e08
commit 898a3c303f
1374 changed files with 290409 additions and 29187 deletions
+171 -8
View File
@@ -2,6 +2,12 @@ package scraper
import (
"context"
"fmt"
"net/url"
"os"
"os/exec"
"path/filepath"
"strings"
)
// GitHubScraper scrapes documentation from GitHub repositories.
@@ -16,16 +22,173 @@ func NewGitHubScraper(config *Config) *GitHubScraper {
// Scrape clones and parses documents from a GitHub repository.
//
// The repository named by source (see resolveRepo) is shallow-cloned into a
// temporary directory that is removed before Scrape returns. When the source
// has no Include patterns, the checkout is first narrowed to a few common
// documentation directories and the default markdown patterns are applied; if
// that yields no documents the full tree is checked out and scanned once more.
// When Include patterns are supplied, the full tree is always checked out.
//
// The clone is handed to a LocalScraper. Every returned document is then
// rewritten to point at GitHub: URL and ID are derived from the blob URL of the
// file, Metadata["path"] becomes the repository-relative path, and the "repo",
// "repo_url" and "source_type" metadata keys are set.
//
// It returns an error when source is nil, the repository cannot be resolved,
// a required git command fails, or the local scrape fails.
func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}

	repoURL, repoName, err := s.resolveRepo(source)
	if err != nil {
		return nil, err
	}

	tmpDir, err := os.MkdirTemp("", "devour-github-*")
	if err != nil {
		return nil, fmt.Errorf("creating clone directory: %w", err)
	}
	// Cleanup is best-effort: a leftover temp dir must not fail the scrape.
	defer os.RemoveAll(tmpDir)

	branch := strings.TrimSpace(source.Branch)

	cloneArgs := []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse"}
	if branch != "" {
		cloneArgs = append(cloneArgs, "--branch", branch)
	}
	cloneArgs = append(cloneArgs, repoURL, tmpDir)
	if output, cloneErr := exec.CommandContext(ctx, "git", cloneArgs...).CombinedOutput(); cloneErr != nil {
		return nil, fmt.Errorf("git clone failed: %w (%s)", cloneErr, strings.TrimSpace(string(output)))
	}

	// disableSparse materialises the full working tree. A failure here would
	// otherwise surface as a silently incomplete result, so it is reported.
	disableSparse := func() error {
		out, runErr := exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").CombinedOutput()
		if runErr != nil {
			return fmt.Errorf("git sparse-checkout disable failed: %w (%s)", runErr, strings.TrimSpace(string(out)))
		}
		return nil
	}

	userIncludes := len(source.Include) > 0
	// sparse records whether the working tree is still limited to the docs
	// directories, i.e. whether a full-checkout retry can find anything new.
	sparse := false
	if userIncludes {
		if err := disableSparse(); err != nil {
			return nil, err
		}
	} else {
		// Try sparse checkout for common docs locations to reduce clone and parse cost.
		set := exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "set", "--skip-checks",
			"docs", "doc", "src/routes", "website/docs", "packages/*/docs")
		if setErr := set.Run(); setErr == nil {
			sparse = true
		} else if err := disableSparse(); err != nil {
			// Narrowing is only an optimisation, but the fallback is not.
			return nil, err
		}
	}

	localSource := &Source{
		Name:     coalesce(source.Name, repoName),
		Type:     SourceTypeLocal,
		Path:     tmpDir,
		Include:  append([]string(nil), source.Include...),
		Exclude:  append([]string(nil), source.Exclude...),
		Schedule: source.Schedule,
	}
	if !userIncludes {
		localSource.Include = []string{
			`(?i)(^|/)readme\.md$`,
			`(?i)(^|/)docs?/`,
			`(?i)\.md$`,
			`(?i)\.mdx$`,
		}
	}

	local := NewLocalScraper(s.config)
	docs, err := local.Scrape(ctx, localSource)
	if err != nil {
		return nil, err
	}
	if len(docs) == 0 && sparse {
		// Sparse patterns did not match this repository layout; retry full checkout.
		if err := disableSparse(); err != nil {
			return nil, err
		}
		docs, err = local.Scrape(ctx, localSource)
		if err != nil {
			return nil, err
		}
	}

	// escapePath percent-encodes each slash-separated segment so names with
	// spaces, '#' or '?' still produce a valid link.
	escapePath := func(p string) string {
		segments := strings.Split(p, "/")
		for i, segment := range segments {
			segments[i] = url.PathEscape(segment)
		}
		return strings.Join(segments, "/")
	}

	ref := branch
	if ref == "" {
		ref = "HEAD"
	}
	blobPrefix := fmt.Sprintf("https://github.com/%s/blob/%s/", repoName, escapePath(ref))

	for _, doc := range docs {
		if doc == nil {
			continue
		}
		if doc.Metadata == nil {
			doc.Metadata = map[string]interface{}{}
		}
		if rawPath, ok := doc.Metadata["path"].(string); ok {
			if relPath, relErr := filepath.Rel(tmpDir, rawPath); relErr == nil {
				relPath = strings.TrimPrefix(filepath.ToSlash(relPath), "./")
				// Skip the clone root itself and anything that resolves outside it.
				outside := relPath == ".." || strings.HasPrefix(relPath, "../")
				if relPath != "" && relPath != "." && !outside {
					doc.URL = blobPrefix + escapePath(relPath)
					doc.ID = generateDocID(doc.URL)
					doc.Metadata["path"] = relPath
				}
			}
		}
		doc.Type = "github-document"
		doc.Metadata["repo"] = repoName
		doc.Metadata["repo_url"] = repoURL
		doc.Metadata["source_type"] = "github"
	}
	return docs, nil
}
// DetectChanges checks if the repository has new commits.
//
// It runs `git ls-remote` against the repository resolved from source, asking
// for source.Branch (or HEAD when no branch is set), and compares the first
// hash in the output with lastHash. It returns whether the hash differs,
// together with the hash that was found.
//
// It returns an error when source is nil, the repository cannot be resolved,
// the git command fails, or the remote reports no matching ref.
func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("source is required")
	}

	repoURL, _, err := s.resolveRepo(source)
	if err != nil {
		return false, "", err
	}

	ref := strings.TrimSpace(source.Branch)
	if ref == "" {
		ref = "HEAD"
	}

	output, err := exec.CommandContext(ctx, "git", "ls-remote", repoURL, ref).Output()
	if err != nil {
		return false, "", fmt.Errorf("git ls-remote %s %s: %w", repoURL, ref, err)
	}

	// Each output line is "<hash>\t<ref>"; the first field of the first line
	// is the hash. No fields at all means nothing matched ref.
	fields := strings.Fields(string(output))
	if len(fields) == 0 {
		return false, "", fmt.Errorf("empty ls-remote output")
	}

	hash := fields[0]
	return hash != lastHash, hash, nil
}
// resolveRepo derives the HTTPS clone URL and the "owner/name" slug for source.
//
// source.Repo takes precedence when it is non-blank. It must be an
// "owner/name" slug; surrounding whitespace, surrounding slashes and a
// trailing ".git" are ignored. Otherwise source.URL is parsed: its host must
// be github.com (optionally with "www." or a port), and the first two path
// segments are taken as owner and name, so links to a file or tree inside a
// repository resolve to that repository. A URL written without a scheme, such
// as "github.com/owner/name", is accepted.
//
// It returns an error when neither field is set, the URL cannot be parsed,
// the host is not GitHub, or the owner or name is missing.
func (s *GitHubScraper) resolveRepo(source *Source) (repoURL string, repoName string, err error) {
	const base = "https://github.com/"

	if slug := strings.TrimSpace(source.Repo); slug != "" {
		slug = strings.TrimSuffix(strings.Trim(slug, "/"), ".git")
		parts := strings.Split(slug, "/")
		if len(parts) != 2 || parts[0] == "" || parts[1] == "" {
			return "", "", fmt.Errorf("invalid github repo: %s", source.Repo)
		}
		return base + slug + ".git", slug, nil
	}

	raw := strings.TrimSpace(source.URL)
	if raw == "" {
		return "", "", fmt.Errorf("github source requires repo or url")
	}

	u, err := url.Parse(raw)
	if err != nil {
		return "", "", err
	}
	if u.Scheme == "" && u.Host == "" {
		// Without a scheme the host is parsed as part of the path; retry
		// with one so "github.com/owner/name" is understood.
		if u, err = url.Parse("https://" + raw); err != nil {
			return "", "", err
		}
	}

	// Compare the exact host: a substring match would also accept hosts such
	// as github.company.com and then clone an unrelated github.com repo.
	switch strings.ToLower(u.Hostname()) {
	case "github.com", "www.github.com":
	default:
		return "", "", fmt.Errorf("not a github url: %s", raw)
	}

	parts := strings.Split(strings.Trim(u.Path, "/"), "/")
	if len(parts) < 2 {
		return "", "", fmt.Errorf("invalid github repo url: %s", raw)
	}
	owner, name := parts[0], strings.TrimSuffix(parts[1], ".git")
	if owner == "" || name == "" {
		return "", "", fmt.Errorf("invalid github repo url: %s", raw)
	}

	repoName = owner + "/" + name
	return base + repoName + ".git", repoName, nil
}
// coalesce chooses a display name: primary, unchanged, when it contains
// anything other than whitespace; otherwise the last path element of fallback
// when that is non-blank; otherwise the literal "github".
func coalesce(primary, fallback string) string {
	switch {
	case strings.TrimSpace(primary) != "":
		return primary
	case strings.TrimSpace(fallback) != "":
		return filepath.Base(fallback)
	default:
		return "github"
	}
}