mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
+171
-8
@@ -2,6 +2,12 @@ package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// GitHubScraper scrapes documentation from GitHub repositories.
|
||||
@@ -16,16 +22,173 @@ func NewGitHubScraper(config *Config) *GitHubScraper {
|
||||
|
||||
// Scrape clones and parses documents from a GitHub repository.
|
||||
func (s *GitHubScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
// TODO: Implement GitHub scraping
|
||||
// 1. Clone repository (shallow)
|
||||
// 2. Find markdown files in specified paths
|
||||
// 3. Parse README, docs/, wiki
|
||||
// 4. Extract code structure
|
||||
return nil, nil
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
repoURL, repoName, err := s.resolveRepo(source)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tmpDir, err := os.MkdirTemp("", "devour-github-*")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
cloneArgs := []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse", repoURL, tmpDir}
|
||||
if branch := strings.TrimSpace(source.Branch); branch != "" {
|
||||
cloneArgs = []string{"clone", "--depth", "1", "--filter=blob:none", "--sparse", "--branch", branch, repoURL, tmpDir}
|
||||
}
|
||||
|
||||
cmd := exec.CommandContext(ctx, "git", cloneArgs...)
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("git clone failed: %v (%s)", err, strings.TrimSpace(string(output)))
|
||||
}
|
||||
|
||||
if len(source.Include) == 0 {
|
||||
// Try sparse checkout for common docs locations to reduce clone and parse cost.
|
||||
sparse := exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "set", "--skip-checks",
|
||||
"docs", "doc", "src/routes", "website/docs", "packages/*/docs")
|
||||
if sparseOut, sparseErr := sparse.CombinedOutput(); sparseErr != nil {
|
||||
_ = sparseOut
|
||||
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
|
||||
}
|
||||
} else {
|
||||
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
|
||||
}
|
||||
|
||||
localSource := &Source{
|
||||
Name: coalesce(source.Name, repoName),
|
||||
Type: SourceTypeLocal,
|
||||
Path: tmpDir,
|
||||
Include: append([]string(nil), source.Include...),
|
||||
Exclude: append([]string(nil), source.Exclude...),
|
||||
Schedule: source.Schedule,
|
||||
}
|
||||
|
||||
if len(localSource.Include) == 0 {
|
||||
localSource.Include = []string{
|
||||
`(?i)(^|/)readme\.md$`,
|
||||
`(?i)(^|/)docs?/`,
|
||||
`(?i)\.md$`,
|
||||
`(?i)\.mdx$`,
|
||||
}
|
||||
}
|
||||
|
||||
local := NewLocalScraper(s.config)
|
||||
docs, err := local.Scrape(ctx, localSource)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(docs) == 0 && len(source.Include) == 0 {
|
||||
// Sparse patterns did not match this repository layout; retry full checkout.
|
||||
_ = exec.CommandContext(ctx, "git", "-C", tmpDir, "sparse-checkout", "disable").Run()
|
||||
docs, err = local.Scrape(ctx, localSource)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
for _, doc := range docs {
|
||||
if doc == nil {
|
||||
continue
|
||||
}
|
||||
branchForURL := strings.TrimSpace(source.Branch)
|
||||
if branchForURL == "" {
|
||||
branchForURL = "HEAD"
|
||||
}
|
||||
if doc.Metadata == nil {
|
||||
doc.Metadata = map[string]interface{}{}
|
||||
}
|
||||
if rawPath, ok := doc.Metadata["path"].(string); ok {
|
||||
if relPath, relErr := filepath.Rel(tmpDir, rawPath); relErr == nil {
|
||||
relPath = filepath.ToSlash(relPath)
|
||||
relPath = strings.TrimPrefix(relPath, "./")
|
||||
if relPath != "" && relPath != "." {
|
||||
doc.URL = fmt.Sprintf("https://github.com/%s/blob/%s/%s", repoName, branchForURL, relPath)
|
||||
doc.ID = generateDocID(doc.URL)
|
||||
doc.Metadata["path"] = relPath
|
||||
}
|
||||
}
|
||||
}
|
||||
doc.Type = "github-document"
|
||||
doc.Metadata["repo"] = repoName
|
||||
doc.Metadata["repo_url"] = repoURL
|
||||
doc.Metadata["source_type"] = "github"
|
||||
}
|
||||
return docs, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if the repository has new commits.
|
||||
func (s *GitHubScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
// TODO: Check latest commit hash
|
||||
return false, "", nil
|
||||
if source == nil {
|
||||
return false, "", fmt.Errorf("source is required")
|
||||
}
|
||||
_, repoName, err := s.resolveRepo(source)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
remote := "https://github.com/" + strings.TrimSuffix(repoName, ".git") + ".git"
|
||||
branch := strings.TrimSpace(source.Branch)
|
||||
if branch == "" {
|
||||
branch = "HEAD"
|
||||
}
|
||||
|
||||
cmd := exec.CommandContext(ctx, "git", "ls-remote", remote, branch)
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
line := strings.TrimSpace(string(output))
|
||||
if line == "" {
|
||||
return false, "", fmt.Errorf("empty ls-remote output")
|
||||
}
|
||||
parts := strings.Fields(line)
|
||||
if len(parts) == 0 {
|
||||
return false, "", fmt.Errorf("unexpected ls-remote output")
|
||||
}
|
||||
hash := parts[0]
|
||||
return hash != lastHash, hash, nil
|
||||
}
|
||||
|
||||
func (s *GitHubScraper) resolveRepo(source *Source) (repoURL string, repoName string, err error) {
|
||||
if strings.TrimSpace(source.Repo) != "" {
|
||||
repoName = strings.Trim(strings.TrimSpace(source.Repo), "/")
|
||||
repoName = strings.TrimSuffix(repoName, ".git")
|
||||
return "https://github.com/" + repoName + ".git", repoName, nil
|
||||
}
|
||||
|
||||
raw := strings.TrimSpace(source.URL)
|
||||
if raw == "" {
|
||||
return "", "", fmt.Errorf("github source requires repo or url")
|
||||
}
|
||||
|
||||
u, err := url.Parse(raw)
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
}
|
||||
if !strings.Contains(strings.ToLower(u.Host), "github.com") {
|
||||
return "", "", fmt.Errorf("not a github url: %s", raw)
|
||||
}
|
||||
parts := strings.Split(strings.Trim(u.Path, "/"), "/")
|
||||
if len(parts) < 2 {
|
||||
return "", "", fmt.Errorf("invalid github repo url: %s", raw)
|
||||
}
|
||||
repoName = parts[0] + "/" + strings.TrimSuffix(parts[1], ".git")
|
||||
repoURL = "https://github.com/" + repoName + ".git"
|
||||
return repoURL, repoName, nil
|
||||
}
|
||||
|
||||
// coalesce chooses a display name for a source.
//
// It returns primary when primary is not blank, otherwise the last path
// element of fallback, otherwise the literal "github". Surrounding whitespace
// is stripped from whichever value is used, and a fallback that reduces to a
// bare separator or "." is treated as blank.
func coalesce(primary, fallback string) string {
	if name := strings.TrimSpace(primary); name != "" {
		return name
	}
	if name := strings.TrimSpace(fallback); name != "" {
		base := filepath.Base(name)
		// filepath.Base yields the separator for a path made only of
		// separators and "." for an empty path; neither is a usable name.
		if base != "." && base != string(filepath.Separator) {
			return base
		}
	}
	return "github"
}
|
||||
|
||||
Reference in New Issue
Block a user