package scraper import ( "context" "fmt" "net/url" "os" "os/exec" "path/filepath" "strings" ) // GitHubScraper scrapes documentation from GitHub repositories. type GitHubScraper struct { config *Config } // NewGitHubScraper creates a new GitHub scraper. func NewGitHubScraper(config *Config) *GitHubScraper { return &GitHubScraper{config: config} } // Scrape clones and parses documents from a GitHub repository. func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) { if source == nil { return nil, fmt.Errorf("source is required") } repoURL, repoName, err := s.resolveRepo(source) if err != nil { return nil, fmt.Errorf("resolve github repository: %w", err) } tmpDir, err := os.MkdirTemp("", "devour-github-*") if err != nil { return nil, fmt.Errorf("create temporary clone directory: %w", err) } defer os.RemoveAll(tmpDir) cloneArgs := []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse", repoURL, tmpDir} if branch := strings.TrimSpace(source.Branch); branch != "" { cloneArgs = []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse", "--branch", branch, repoURL, tmpDir} } cmd := exec.CommandContext(ctx, "git", cloneArgs...) output, err := cmd.CombinedOutput() if err != nil { return nil, fmt.Errorf("git clone failed: %v (%s)", err, strings.TrimSpace(string(output))) } if len(source.Include) == 0 { // Try sparse checkout for common docs locations to reduce clone and parse cost. sparse := exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "set", "--skip-checks", "docs", "doc", "src/routes", "website/docs", "packages/*/docs") if sparseOut, sparseErr := sparse.CombinedOutput(); sparseErr != nil { _ = sparseOut _ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run() } } else { _ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run() } localSource := &Source{ Name: coalesce(source.Name, repoName), Type: SourceTypeLocal, Path: tmpDir, Include: append([]string(nil), source.Include...), Exclude: append([]string(nil), source.Exclude...), Schedule: source.Schedule, } if len(localSource.Include) == 0 { localSource.Include = []string{ `(?i)(^|/)readme\.md$`, `(?i)(^|/)docs?/`, `(?i)\.md$`, `(?i)\.mdx$`, } } local := NewLocalScraper(s.config) docs, err := local.Scrape(ctx, localSource) if err != nil { return nil, fmt.Errorf("scrape repository docs: %w", err) } if len(docs) == 0 && len(source.Include) == 0 { // Sparse patterns did not match this repository layout; retry full checkout. _ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run() docs, err = local.Scrape(ctx, localSource) if err != nil { return nil, fmt.Errorf("scrape repository docs after sparse fallback: %w", err) } } for _, doc := range docs { if doc == nil { continue } branchForURL := strings.TrimSpace(source.Branch) if branchForURL == "" { branchForURL = "HEAD" } if doc.Metadata == nil { doc.Metadata = map[string]interface{}{} } if rawPath, ok := doc.Metadata["path"].(string); ok { if relPath, relErr := filepath.Rel(tmpDir, rawPath); relErr == nil { relPath = filepath.ToSlash(relPath) relPath = strings.TrimPrefix(relPath, "./") if relPath != "" && relPath != "." { doc.URL = fmt.Sprintf("https://github.com/%s/blob/%s/%s", repoName, branchForURL, relPath) doc.ID = generateDocID(doc.URL) doc.Metadata["path"] = relPath } } } doc.Type = "github-document" doc.Metadata["repo"] = repoName doc.Metadata["repo_url"] = repoURL doc.Metadata["source_type"] = "github" } return docs, nil } // DetectChanges checks if the repository has new commits. func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) { if source == nil { return false, "", fmt.Errorf("source is required") } _, repoName, err := s.resolveRepo(source) if err != nil { return false, "", fmt.Errorf("resolve github repository: %w", err) } remote := "https://github.com/" + strings.TrimSuffix(repoName, ".git") + ".git" branch := strings.TrimSpace(source.Branch) if branch == "" { branch = "HEAD" } cmd := exec.CommandContext(ctx, "git", "ls-remote", remote, branch) output, err := cmd.Output() if err != nil { return false, "", fmt.Errorf("run git ls-remote for %s (%s): %w", remote, branch, err) } line := strings.TrimSpace(string(output)) if line == "" { return false, "", fmt.Errorf("empty ls-remote output") } parts := strings.Fields(line) if len(parts) == 0 { return false, "", fmt.Errorf("unexpected ls-remote output") } hash := parts[0] return hash != lastHash, hash, nil } func (s *GitHubScraper) resolveRepo(source *Source) (repoURL string, repoName string, err error) { if strings.TrimSpace(source.Repo) != "" { repoName = strings.Trim(strings.TrimSpace(source.Repo), "/") repoName = strings.TrimSuffix(repoName, ".git") return "https://github.com/" + repoName + ".git", repoName, nil } raw := strings.TrimSpace(source.URL) if raw == "" { return "", "", fmt.Errorf("github source requires repo or url") } u, err := url.Parse(raw) if err != nil { return "", "", fmt.Errorf("parse github url %q: %w", raw, err) } if !strings.Contains(strings.ToLower(u.Host), "github.com") { return "", "", fmt.Errorf("not a github url: %s", raw) } parts := strings.Split(strings.Trim(u.Path, "/"), "/") if len(parts) < 2 { return "", "", fmt.Errorf("invalid github repo url: %s", raw) } repoName = parts[0] + "/" + strings.TrimSuffix(parts[1], ".git") repoURL = "https://github.com/" + repoName + ".git" return repoURL, repoName, nil } func coalesce(primary, fallback string) string { if strings.TrimSpace(primary) != "" { return primary } if strings.TrimSpace(fallback) != "" { return filepath.Base(fallback) } return "github" }