mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
+227
-8
@@ -2,6 +2,20 @@ package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var (
	// reLocalBlankLines matches runs of three or more consecutive newlines so
	// they can be collapsed into a single blank line.
	reLocalBlankLines = regexp.MustCompile(`\n{3,}`)

	// reFrontMatterBlock matches a front-matter block anchored at the very
	// start of the text: an opening "---" line, an optional body, and a
	// closing "---" line.
	//
	// The body group is lazily optional (`??`) so that an empty block
	// ("---\n---\n") matches on its own instead of running on to a later
	// horizontal rule. The closing delimiter may be followed by a newline or
	// by the end of the text, because callers trim the content before
	// matching and a file that is only front matter has no trailing newline.
	reFrontMatterBlock = regexp.MustCompile(`(?s)\A---[ \t]*\r?\n(?:.*?\r?\n)??---[ \t]*(?:\r?\n|\z)`)
)
|
||||
|
||||
// LocalScraper scrapes documentation from the local filesystem.
|
||||
@@ -16,16 +30,221 @@ func NewLocalScraper(config *Config) *LocalScraper {
|
||||
|
||||
// Scrape scans and parses documents from a local directory.
|
||||
func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
// TODO: Implement local scraping
|
||||
// 1. Walk directory tree
|
||||
// 2. Filter by include/exclude patterns
|
||||
// 3. Parse markdown, text, code files
|
||||
// 4. Extract structure and content
|
||||
return nil, nil
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
root := strings.TrimSpace(source.Path)
|
||||
if root == "" {
|
||||
root = strings.TrimSpace(source.URL)
|
||||
}
|
||||
if root == "" {
|
||||
return nil, fmt.Errorf("path or url is required for local source")
|
||||
}
|
||||
|
||||
info, err := os.Stat(root)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docs := make([]*Document, 0)
|
||||
if !info.IsDir() {
|
||||
doc, err := s.fileToDocument(root, source)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return []*Document{doc}, nil
|
||||
}
|
||||
|
||||
web := NewWebScraper(s.config)
|
||||
err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
default:
|
||||
}
|
||||
|
||||
if d.IsDir() {
|
||||
name := d.Name()
|
||||
if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
relPath := path
|
||||
if rel, relErr := filepath.Rel(root, path); relErr == nil {
|
||||
relPath = rel
|
||||
}
|
||||
normalized := filepath.ToSlash(relPath)
|
||||
if !web.shouldInclude(normalized, source.Include, source.Exclude) {
|
||||
return nil
|
||||
}
|
||||
if !isDocumentationFile(path) {
|
||||
return nil
|
||||
}
|
||||
|
||||
doc, err := s.fileToDocument(path, source)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
docs = append(docs, doc)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return docs, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if files have been modified.
|
||||
func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
// TODO: Check file modification times
|
||||
return false, "", nil
|
||||
if source == nil {
|
||||
return false, "", fmt.Errorf("source is required")
|
||||
}
|
||||
|
||||
root := strings.TrimSpace(source.Path)
|
||||
if root == "" {
|
||||
root = strings.TrimSpace(source.URL)
|
||||
}
|
||||
if root == "" {
|
||||
return false, "", fmt.Errorf("path or url is required for local source")
|
||||
}
|
||||
|
||||
h := sha256.New()
|
||||
err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if d.IsDir() {
|
||||
name := d.Name()
|
||||
if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if !isDocumentationFile(path) {
|
||||
return nil
|
||||
}
|
||||
|
||||
info, infoErr := d.Info()
|
||||
if infoErr != nil {
|
||||
return infoErr
|
||||
}
|
||||
fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
hash := hex.EncodeToString(h.Sum(nil))
|
||||
return hash != lastHash, hash, nil
|
||||
}
|
||||
|
||||
func (s *LocalScraper) fileToDocument(path string, source *Source) (*Document, error) {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
content := normalizeLocalContent(string(b), ext)
|
||||
if content == "" {
|
||||
return nil, fmt.Errorf("empty file")
|
||||
}
|
||||
|
||||
title := strings.TrimSuffix(filepath.Base(path), filepath.Ext(path))
|
||||
hash := sha256.Sum256(b)
|
||||
uri := "file://" + filepath.ToSlash(path)
|
||||
|
||||
docType := "local-document"
|
||||
switch ext {
|
||||
case ".md", ".mdx":
|
||||
docType = "local-markdown"
|
||||
case ".txt":
|
||||
docType = "local-text"
|
||||
case ".json", ".yaml", ".yml":
|
||||
docType = "local-data"
|
||||
case ".go", ".js", ".ts", ".tsx", ".py", ".java", ".rs", ".rb", ".php":
|
||||
docType = "local-code"
|
||||
}
|
||||
|
||||
name := source.Name
|
||||
if strings.TrimSpace(name) == "" {
|
||||
name = filepath.Base(filepath.Dir(path))
|
||||
}
|
||||
|
||||
return &Document{
|
||||
ID: generateDocID(uri),
|
||||
Source: name,
|
||||
Type: docType,
|
||||
Title: title,
|
||||
Content: content,
|
||||
URL: uri,
|
||||
Metadata: map[string]interface{}{
|
||||
"path": path,
|
||||
"size": len(b),
|
||||
},
|
||||
Hash: hex.EncodeToString(hash[:]),
|
||||
Timestamp: time.Now(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func normalizeLocalContent(content, ext string) string {
|
||||
content = strings.TrimSpace(content)
|
||||
if content == "" {
|
||||
return ""
|
||||
}
|
||||
|
||||
switch ext {
|
||||
case ".md", ".mdx":
|
||||
content = stripMarkdownFrontmatter(content)
|
||||
content = stripMDXPreamble(content)
|
||||
}
|
||||
|
||||
// Collapse excessive blank lines to reduce indexing noise.
|
||||
content = reLocalBlankLines.ReplaceAllString(content, "\n\n")
|
||||
return strings.TrimSpace(content)
|
||||
}
|
||||
|
||||
func stripMarkdownFrontmatter(content string) string {
|
||||
if !strings.HasPrefix(content, "---\n") && !strings.HasPrefix(content, "---\r\n") {
|
||||
return content
|
||||
}
|
||||
|
||||
trimmed := reFrontMatterBlock.ReplaceAllString(content, "")
|
||||
return trimmed
|
||||
}
|
||||
|
||||
// stripMDXPreamble drops the leading run of blank lines and import/export
// statements from content and returns the remainder.
//
// A statement that opens a bracket it does not close on the same line
// (for example "import {" or "export const meta = {") is treated as
// continuing until its brackets balance, so multi-line statements are removed
// as a whole instead of leaving their tail in the text. If the brackets never
// balance before the end of the content, the function falls back to removing
// only lines that themselves start with "import " or "export ", so a stray
// bracket cannot swallow the whole document.
func stripMDXPreamble(content string) string {
	lines := strings.Split(content, "\n")
	start, balanced := mdxPreambleEnd(lines, true)
	if !balanced {
		start, _ = mdxPreambleEnd(lines, false)
	}
	return strings.Join(lines[start:], "\n")
}

// mdxPreambleEnd returns the index of the first line that is not part of the
// leading blank/import/export preamble. When multiline is true, statements
// are followed across lines until their brackets balance; the boolean result
// is false if the input ended inside such an unbalanced statement.
func mdxPreambleEnd(lines []string, multiline bool) (int, bool) {
	depth := 0
	for i, raw := range lines {
		line := strings.TrimSpace(raw)
		switch {
		case depth > 0:
			// Continuation line of a multi-line statement.
			depth += mdxBracketDelta(line)
		case line == "":
			// Blank lines between statements belong to the preamble.
		case strings.HasPrefix(line, "import "), strings.HasPrefix(line, "export "):
			if multiline {
				depth = mdxBracketDelta(line)
			}
		default:
			return i, true
		}
		// Surplus closing brackets must not offset a later opening one.
		if depth < 0 {
			depth = 0
		}
	}
	return len(lines), depth == 0
}

// mdxBracketDelta returns the number of opening brackets minus the number of
// closing brackets on line. Brackets inside string literals are counted too;
// the caller's fallback covers the cases where that leaves a statement
// unbalanced.
func mdxBracketDelta(line string) int {
	delta := 0
	for _, r := range line {
		switch r {
		case '{', '(', '[':
			delta++
		case '}', ')', ']':
			delta--
		}
	}
	return delta
}
|
||||
|
||||
// isDocumentationFile reports whether path has one of the file extensions
// the local scraper indexes. The comparison ignores case, and only the
// extension is inspected; the file is never opened.
func isDocumentationFile(path string) bool {
	switch strings.ToLower(filepath.Ext(path)) {
	// Prose and markup.
	case ".md", ".mdx", ".txt", ".rst", ".adoc", ".html":
		return true
	// Structured data.
	case ".json", ".yaml", ".yml":
		return true
	// Source code.
	case ".go", ".py", ".js", ".ts", ".tsx", ".java", ".rs", ".rb", ".php":
		return true
	}
	return false
}
|
||||
|
||||
Reference in New Issue
Block a user