Files
Devour/internal/scraper/local.go
T
Tomas Dvorak 898a3c303f update
2026-02-24 10:33:59 +01:00

251 lines
5.5 KiB
Go

package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"io/fs"
"os"
"path/filepath"
"regexp"
"strings"
"time"
)
var (
// reLocalBlankLines matches runs of three or more consecutive newlines;
// normalizeLocalContent replaces each such run with "\n\n".
reLocalBlankLines = regexp.MustCompile(`\n{3,}`)
// reFrontMatterBlock matches a block anchored at the very start of the text
// that opens with a `---` line and extends (lazily, with `.` spanning
// newlines) through the first following `---` line and its trailing newline.
reFrontMatterBlock = regexp.MustCompile(`(?s)\A---\s*\n.*?\n---\s*\n`)
)
// LocalScraper scrapes documentation from the local filesystem, either from a
// single file or by walking a directory tree.
type LocalScraper struct {
// config is the scraper configuration supplied at construction; Scrape
// hands it to NewWebScraper to reuse that scraper's include/exclude check.
config *Config
}
// NewLocalScraper returns a LocalScraper that keeps a reference to the given
// configuration. The config pointer is stored as-is, without copying.
func NewLocalScraper(config *Config) *LocalScraper {
	scraper := new(LocalScraper)
	scraper.config = config
	return scraper
}
// Scrape loads documents from the local location described by source.
//
// The location is source.Path, falling back to source.URL when Path is blank.
// If it names a regular file, that single file is converted and returned
// (conversion errors are reported). If it names a directory, the tree is
// walked and every file that passes the source's include/exclude patterns and
// isDocumentationFile is converted.
//
// During a directory walk, sub-directories whose name starts with "." or is
// one of node_modules, vendor, dist or build are skipped. The root directory
// itself is never skipped, so roots such as ".", ".." or "./build" work.
//
// Files that fail to convert during a walk (unreadable, empty after
// normalization, ...) are skipped rather than aborting the whole scrape.
// The walk stops with ctx.Err() once ctx is cancelled.
func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}

	root := strings.TrimSpace(source.Path)
	if root == "" {
		root = strings.TrimSpace(source.URL)
	}
	if root == "" {
		return nil, fmt.Errorf("path or url is required for local source")
	}

	info, err := os.Stat(root)
	if err != nil {
		return nil, err
	}

	// Single-file source: no walking or filtering, just convert it.
	if !info.IsDir() {
		doc, err := s.fileToDocument(root, source)
		if err != nil {
			return nil, err
		}
		return []*Document{doc}, nil
	}

	docs := make([]*Document, 0)
	// The web scraper is only used for its include/exclude pattern matching.
	web := NewWebScraper(s.config)

	err = filepath.WalkDir(root, func(path string, d fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}

		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		if d.IsDir() {
			// WalkDir's first callback is for root itself. Its base name may
			// legitimately be ".", "..", hidden, or "build"/"vendor"/...;
			// applying the skip filter there would discard the entire tree.
			if path == root {
				return nil
			}
			name := d.Name()
			if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
				return filepath.SkipDir
			}
			return nil
		}

		// Patterns are matched against the slash-separated path relative to
		// root; fall back to the full path if it cannot be made relative.
		relPath := path
		if rel, relErr := filepath.Rel(root, path); relErr == nil {
			relPath = rel
		}
		normalized := filepath.ToSlash(relPath)
		if !web.shouldInclude(normalized, source.Include, source.Exclude) {
			return nil
		}
		if !isDocumentationFile(path) {
			return nil
		}

		doc, docErr := s.fileToDocument(path, source)
		if docErr != nil {
			// Best effort: one unreadable or empty file must not fail the
			// whole directory scrape.
			return nil
		}
		docs = append(docs, doc)
		return nil
	})
	if err != nil {
		return nil, err
	}
	return docs, nil
}
// DetectChanges reports whether the documentation files under the source's
// location differ from the state summarized by lastHash.
//
// The location is source.Path, falling back to source.URL when Path is blank.
// The tree is walked and, for every file accepted by isDocumentationFile, a
// line "path|size|mtime-in-nanoseconds" is fed into a SHA-256 digest. File
// contents are not read. The source's include/exclude patterns are not
// consulted here.
//
// Sub-directories whose name starts with "." or is one of node_modules,
// vendor, dist or build are skipped; the root directory itself is never
// skipped, so roots such as "." or "./build" are hashed correctly.
//
// It returns (hash != lastHash, hash, nil) on success, where hash is the
// hex-encoded digest. The walk stops with ctx.Err() once ctx is cancelled.
func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("source is required")
	}

	root := strings.TrimSpace(source.Path)
	if root == "" {
		root = strings.TrimSpace(source.URL)
	}
	if root == "" {
		return false, "", fmt.Errorf("path or url is required for local source")
	}

	h := sha256.New()
	err := filepath.WalkDir(root, func(path string, d fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}

		// Honor cancellation the same way Scrape does.
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		if d.IsDir() {
			// WalkDir's first callback is for root itself; filtering it by
			// name would skip the whole tree and always hash empty input.
			if path == root {
				return nil
			}
			name := d.Name()
			if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
				return filepath.SkipDir
			}
			return nil
		}

		if !isDocumentationFile(path) {
			return nil
		}

		info, infoErr := d.Info()
		if infoErr != nil {
			return infoErr
		}
		// Path, size and modification time stand in for the file content.
		fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
		return nil
	})
	if err != nil {
		return false, "", err
	}

	hash := hex.EncodeToString(h.Sum(nil))
	return hash != lastHash, hash, nil
}
// fileToDocument reads the file at path and builds a Document from it.
//
// The text is passed through normalizeLocalContent; a file whose normalized
// content is empty yields an "empty file" error. The document type is derived
// from the lower-cased file extension, the title is the file name without its
// extension, and the URL is "file://" followed by the slash-separated path.
// Hash is the hex SHA-256 of the raw file bytes (before normalization).
// When source.Name is blank, the name of the file's parent directory is used
// as the document source.
func (s *LocalScraper) fileToDocument(path string, source *Source) (*Document, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}

	rawExt := filepath.Ext(path)
	ext := strings.ToLower(rawExt)

	body := normalizeLocalContent(string(raw), ext)
	if body == "" {
		return nil, fmt.Errorf("empty file")
	}

	// Classify by extension; anything unlisted is a generic local document.
	var kind string
	switch ext {
	case ".md", ".mdx":
		kind = "local-markdown"
	case ".txt":
		kind = "local-text"
	case ".json", ".yaml", ".yml":
		kind = "local-data"
	case ".go", ".js", ".ts", ".tsx", ".py", ".java", ".rs", ".rb", ".php":
		kind = "local-code"
	default:
		kind = "local-document"
	}

	sourceName := source.Name
	if strings.TrimSpace(sourceName) == "" {
		sourceName = filepath.Base(filepath.Dir(path))
	}

	location := "file://" + filepath.ToSlash(path)
	digest := sha256.Sum256(raw)

	doc := &Document{
		ID:      generateDocID(location),
		Source:  sourceName,
		Type:    kind,
		Title:   strings.TrimSuffix(filepath.Base(path), rawExt),
		Content: body,
		URL:     location,
		Metadata: map[string]interface{}{
			"path": path,
			"size": len(raw),
		},
		Hash:      hex.EncodeToString(digest[:]),
		Timestamp: time.Now(),
	}
	return doc, nil
}
// normalizeLocalContent cleans raw file text before it is stored in a
// Document.
//
// Windows line endings are converted to "\n", surrounding whitespace is
// trimmed, and runs of blank lines are collapsed. For ".md" and ".mdx" (ext is
// expected to be lower-cased, including the dot) a leading front matter block
// and any leading import/export lines are removed as well. It returns "" when
// nothing but whitespace remains.
func normalizeLocalContent(content, ext string) string {
	// reLocalBlankLines only recognizes "\n" runs, so CRLF files would never
	// have their blank lines collapsed unless line endings are unified first.
	content = strings.ReplaceAll(content, "\r\n", "\n")
	content = strings.TrimSpace(content)
	if content == "" {
		return ""
	}

	switch ext {
	case ".md", ".mdx":
		content = stripMarkdownFrontmatter(content)
		content = stripMDXPreamble(content)
	}

	// Collapse excessive blank lines to reduce indexing noise.
	content = reLocalBlankLines.ReplaceAllString(content, "\n\n")
	return strings.TrimSpace(content)
}
// stripMarkdownFrontmatter removes a leading `---`-delimited front matter
// block from content. Text that does not begin with a `---` line (LF or CRLF
// terminated) is returned unchanged; otherwise whatever reFrontMatterBlock
// matches at the start is deleted.
func stripMarkdownFrontmatter(content string) string {
	opensWithFence := strings.HasPrefix(content, "---\n") || strings.HasPrefix(content, "---\r\n")
	if opensWithFence {
		return reFrontMatterBlock.ReplaceAllString(content, "")
	}
	return content
}
// stripMDXPreamble drops the leading run of lines that are blank or that,
// once trimmed, begin with "import " or "export ". Everything from the first
// other line onward is returned verbatim; if every line is skippable the
// result is the empty string.
func stripMDXPreamble(content string) string {
	lines := strings.Split(content, "\n")

	// Default to "past the end" so an all-preamble input yields "".
	start := len(lines)
	for idx, raw := range lines {
		trimmed := strings.TrimSpace(raw)
		skippable := trimmed == "" ||
			strings.HasPrefix(trimmed, "import ") ||
			strings.HasPrefix(trimmed, "export ")
		if !skippable {
			start = idx
			break
		}
	}
	return strings.Join(lines[start:], "\n")
}
// isDocumentationFile reports whether path has one of the file extensions the
// local scraper accepts. The extension is compared case-insensitively.
func isDocumentationFile(path string) bool {
	switch strings.ToLower(filepath.Ext(path)) {
	case ".md", ".mdx", ".txt", ".rst", ".adoc", ".html": // prose and markup
		return true
	case ".json", ".yaml", ".yml": // structured data
		return true
	case ".go", ".py", ".js", ".ts", ".tsx", ".java", ".rs", ".rb", ".php": // source code
		return true
	}
	return false
}