mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
263 lines
6.3 KiB
Go
263 lines
6.3 KiB
Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"errors"
|
|
"fmt"
|
|
"io/fs"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
var (
	// reLocalBlankLines matches runs of three or more consecutive newlines so
	// that they can be collapsed into a single blank line.
	reLocalBlankLines = regexp.MustCompile(`\n{3,}`)

	// reFrontMatterBlock matches a front matter block anchored at the very
	// start of the text: an opening `---` line, an optional body, and a closing
	// `---` line followed by a newline.
	//
	// The body group is lazily optional (`??`) so that an empty block
	// ("---\n---\n") closes on the fence immediately after the opening one.
	// Requiring a newline *before* the closing fence would fail in that case,
	// because the opening fence has already consumed it, and the lazy body
	// would then run on to the next `---` line in the document and swallow
	// real content.
	reFrontMatterBlock = regexp.MustCompile(`(?s)\A---\s*\n(?:.*?\n)??---\s*\n`)
)
|
|
|
|
// LocalScraper scrapes documentation from local filesystem.
|
|
type LocalScraper struct {
|
|
config *Config
|
|
}
|
|
|
|
// NewLocalScraper creates a new local scraper.
|
|
func NewLocalScraper(config *Config) *LocalScraper {
|
|
return &LocalScraper{config: config}
|
|
}
|
|
|
|
// Scrape scans and parses documents from a local directory.
|
|
func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
|
if source == nil {
|
|
return nil, fmt.Errorf("source is required")
|
|
}
|
|
|
|
root := strings.TrimSpace(source.Path)
|
|
if root == "" {
|
|
root = strings.TrimSpace(source.URL)
|
|
}
|
|
if root == "" {
|
|
return nil, fmt.Errorf("path or url is required for local source")
|
|
}
|
|
|
|
info, err := os.Stat(root)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("stat local source root %q: %w", root, err)
|
|
}
|
|
|
|
docs := make([]*Document, 0)
|
|
nonFatalErrors := make([]error, 0)
|
|
if !info.IsDir() {
|
|
doc, err := s.fileToDocument(root, source)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("convert local source file %q: %w", root, err)
|
|
}
|
|
return []*Document{doc}, nil
|
|
}
|
|
|
|
web := NewWebScraper(s.config)
|
|
err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
|
|
if err != nil {
|
|
return err
|
|
}
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
default:
|
|
}
|
|
|
|
if d.IsDir() {
|
|
name := d.Name()
|
|
if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
|
|
return filepath.SkipDir
|
|
}
|
|
return nil
|
|
}
|
|
|
|
relPath := path
|
|
if rel, relErr := filepath.Rel(root, path); relErr == nil {
|
|
relPath = rel
|
|
}
|
|
normalized := filepath.ToSlash(relPath)
|
|
if !web.shouldInclude(normalized, source.Include, source.Exclude) {
|
|
return nil
|
|
}
|
|
if !isDocumentationFile(path) {
|
|
return nil
|
|
}
|
|
|
|
doc, err := s.fileToDocument(path, source)
|
|
if err != nil {
|
|
if len(nonFatalErrors) < 20 {
|
|
nonFatalErrors = append(nonFatalErrors, fmt.Errorf("%s: %w", path, err))
|
|
}
|
|
return nil
|
|
}
|
|
docs = append(docs, doc)
|
|
return nil
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("walk local source root %q: %w", root, err)
|
|
}
|
|
if len(nonFatalErrors) > 0 {
|
|
log.Printf("local scraper skipped %d files due to conversion errors (sample: %v)", len(nonFatalErrors), nonFatalErrors[0])
|
|
if len(docs) == 0 {
|
|
return nil, fmt.Errorf("local scrape failed while converting files: %w", errors.Join(nonFatalErrors...))
|
|
}
|
|
}
|
|
|
|
return docs, nil
|
|
}
|
|
|
|
// DetectChanges checks if files have been modified.
|
|
func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
|
if source == nil {
|
|
return false, "", fmt.Errorf("source is required")
|
|
}
|
|
|
|
root := strings.TrimSpace(source.Path)
|
|
if root == "" {
|
|
root = strings.TrimSpace(source.URL)
|
|
}
|
|
if root == "" {
|
|
return false, "", fmt.Errorf("path or url is required for local source")
|
|
}
|
|
|
|
h := sha256.New()
|
|
err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
|
|
if err != nil {
|
|
return fmt.Errorf("walk local source path %q: %w", path, err)
|
|
}
|
|
if d.IsDir() {
|
|
name := d.Name()
|
|
if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
|
|
return filepath.SkipDir
|
|
}
|
|
return nil
|
|
}
|
|
if !isDocumentationFile(path) {
|
|
return nil
|
|
}
|
|
|
|
info, infoErr := d.Info()
|
|
if infoErr != nil {
|
|
return fmt.Errorf("stat local source file %q: %w", path, infoErr)
|
|
}
|
|
fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
|
|
return nil
|
|
})
|
|
if err != nil {
|
|
return false, "", fmt.Errorf("walk local source root %q for change detection: %w", root, err)
|
|
}
|
|
|
|
hash := hex.EncodeToString(h.Sum(nil))
|
|
return hash != lastHash, hash, nil
|
|
}
|
|
|
|
func (s *LocalScraper) fileToDocument(path string, source *Source) (*Document, error) {
|
|
b, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read local source file %q: %w", path, err)
|
|
}
|
|
|
|
ext := strings.ToLower(filepath.Ext(path))
|
|
content := normalizeLocalContent(string(b), ext)
|
|
if content == "" {
|
|
return nil, fmt.Errorf("empty file")
|
|
}
|
|
|
|
title := strings.TrimSuffix(filepath.Base(path), filepath.Ext(path))
|
|
hash := sha256.Sum256(b)
|
|
uri := "file://" + filepath.ToSlash(path)
|
|
|
|
docType := "local-document"
|
|
switch ext {
|
|
case ".md", ".mdx":
|
|
docType = "local-markdown"
|
|
case ".txt":
|
|
docType = "local-text"
|
|
case ".json", ".yaml", ".yml":
|
|
docType = "local-data"
|
|
case ".go", ".js", ".ts", ".tsx", ".py", ".java", ".rs", ".rb", ".php":
|
|
docType = "local-code"
|
|
}
|
|
|
|
name := source.Name
|
|
if strings.TrimSpace(name) == "" {
|
|
name = filepath.Base(filepath.Dir(path))
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(uri),
|
|
Source: name,
|
|
Type: docType,
|
|
Title: title,
|
|
Content: content,
|
|
URL: uri,
|
|
Metadata: map[string]interface{}{
|
|
"path": path,
|
|
"size": len(b),
|
|
},
|
|
Hash: hex.EncodeToString(hash[:]),
|
|
Timestamp: time.Now(),
|
|
}, nil
|
|
}
|
|
|
|
func normalizeLocalContent(content, ext string) string {
|
|
content = strings.TrimSpace(content)
|
|
if content == "" {
|
|
return ""
|
|
}
|
|
|
|
switch ext {
|
|
case ".md", ".mdx":
|
|
content = stripMarkdownFrontmatter(content)
|
|
content = stripMDXPreamble(content)
|
|
}
|
|
|
|
// Collapse excessive blank lines to reduce indexing noise.
|
|
content = reLocalBlankLines.ReplaceAllString(content, "\n\n")
|
|
return strings.TrimSpace(content)
|
|
}
|
|
|
|
func stripMarkdownFrontmatter(content string) string {
|
|
if !strings.HasPrefix(content, "---\n") && !strings.HasPrefix(content, "---\r\n") {
|
|
return content
|
|
}
|
|
|
|
trimmed := reFrontMatterBlock.ReplaceAllString(content, "")
|
|
return trimmed
|
|
}
|
|
|
|
// stripMDXPreamble removes the leading run of blank lines and import/export
// statements from content and returns the remainder.
//
// A statement starts on a line whose trimmed text begins with "import " or
// "export ". Statements spanning several lines (for example a braced import
// list or an exported object literal) are removed as a whole rather than only
// their first line. Scanning stops at the first non-blank line that does not
// start such a statement.
func stripMDXPreamble(content string) string {
	lines := strings.Split(content, "\n")
	i := 0
	for i < len(lines) {
		line := strings.TrimSpace(lines[i])
		if line == "" {
			i++
			continue
		}
		if !strings.HasPrefix(line, "import ") && !strings.HasPrefix(line, "export ") {
			break
		}
		i = mdxStatementEnd(lines, i)
	}
	return strings.Join(lines[i:], "\n")
}

// mdxStatementEnd returns the index of the first line after the import/export
// statement that begins at lines[start].
//
// The statement is considered finished on the first line where every opened
// bracket has been closed. This is a plain character count, not a parser, so
// brackets inside string literals are counted too. If the brackets never
// balance before the end of the input, only the starting line is treated as
// the statement so that the rest of the document is not discarded.
func mdxStatementEnd(lines []string, start int) int {
	depth := 0
	for j := start; j < len(lines); j++ {
		for _, r := range lines[j] {
			switch r {
			case '{', '(', '[':
				depth++
			case '}', ')', ']':
				depth--
			}
		}
		if depth <= 0 {
			return j + 1
		}
	}
	return start + 1
}
|
|
|
|
// isDocumentationFile reports whether path has one of the file extensions the
// local scraper indexes. The comparison is case-insensitive and looks only at
// the extension of the final path element.
func isDocumentationFile(path string) bool {
	switch strings.ToLower(filepath.Ext(path)) {
	case ".md", ".mdx", ".txt", ".rst", ".adoc", ".html":
		// Prose and markup.
		return true
	case ".json", ".yaml", ".yml":
		// Structured data.
		return true
	case ".go", ".py", ".js", ".ts", ".tsx", ".java", ".rs", ".rb", ".php":
		// Source code.
		return true
	}
	return false
}
|