package scraper import ( "context" "crypto/sha256" "encoding/hex" "fmt" "io/fs" "os" "path/filepath" "regexp" "strings" "time" ) var ( reLocalBlankLines = regexp.MustCompile(`\n{3,}`) reFrontMatterBlock = regexp.MustCompile(`(?s)\A---\s*\n.*?\n---\s*\n`) ) // LocalScraper scrapes documentation from local filesystem. type LocalScraper struct { config *Config } // NewLocalScraper creates a new local scraper. func NewLocalScraper(config *Config) *LocalScraper { return &LocalScraper{config: config} } // Scrape scans and parses documents from a local directory. func (s *LocalScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) { if source == nil { return nil, fmt.Errorf("source is required") } root := strings.TrimSpace(source.Path) if root == "" { root = strings.TrimSpace(source.URL) } if root == "" { return nil, fmt.Errorf("path or url is required for local source") } info, err := os.Stat(root) if err != nil { return nil, err } docs := make([]*Document, 0) if !info.IsDir() { doc, err := s.fileToDocument(root, source) if err != nil { return nil, err } return []*Document{doc}, nil } web := NewWebScraper(s.config) err = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error { if err != nil { return err } select { case <-ctx.Done(): return ctx.Err() default: } if d.IsDir() { name := d.Name() if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" { return filepath.SkipDir } return nil } relPath := path if rel, relErr := filepath.Rel(root, path); relErr == nil { relPath = rel } normalized := filepath.ToSlash(relPath) if !web.shouldInclude(normalized, source.Include, source.Exclude) { return nil } if !isDocumentationFile(path) { return nil } doc, err := s.fileToDocument(path, source) if err != nil { return nil } docs = append(docs, doc) return nil }) if err != nil { return nil, err } return docs, nil } // DetectChanges checks if files have been modified. func (s *LocalScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) { if source == nil { return false, "", fmt.Errorf("source is required") } root := strings.TrimSpace(source.Path) if root == "" { root = strings.TrimSpace(source.URL) } if root == "" { return false, "", fmt.Errorf("path or url is required for local source") } h := sha256.New() err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error { if err != nil { return err } if d.IsDir() { name := d.Name() if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" || name == "dist" || name == "build" { return filepath.SkipDir } return nil } if !isDocumentationFile(path) { return nil } info, infoErr := d.Info() if infoErr != nil { return infoErr } fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano()) return nil }) if err != nil { return false, "", err } hash := hex.EncodeToString(h.Sum(nil)) return hash != lastHash, hash, nil } func (s *LocalScraper) fileToDocument(path string, source *Source) (*Document, error) { b, err := os.ReadFile(path) if err != nil { return nil, err } ext := strings.ToLower(filepath.Ext(path)) content := normalizeLocalContent(string(b), ext) if content == "" { return nil, fmt.Errorf("empty file") } title := strings.TrimSuffix(filepath.Base(path), filepath.Ext(path)) hash := sha256.Sum256(b) uri := "file://" + filepath.ToSlash(path) docType := "local-document" switch ext { case ".md", ".mdx": docType = "local-markdown" case ".txt": docType = "local-text" case ".json", ".yaml", ".yml": docType = "local-data" case ".go", ".js", ".ts", ".tsx", ".py", ".java", ".rs", ".rb", ".php": docType = "local-code" } name := source.Name if strings.TrimSpace(name) == "" { name = filepath.Base(filepath.Dir(path)) } return &Document{ ID: generateDocID(uri), Source: name, Type: docType, Title: title, Content: content, URL: uri, Metadata: map[string]interface{}{ "path": path, "size": len(b), }, Hash: hex.EncodeToString(hash[:]), Timestamp: time.Now(), }, nil } func normalizeLocalContent(content, ext string) string { content = strings.TrimSpace(content) if content == "" { return "" } switch ext { case ".md", ".mdx": content = stripMarkdownFrontmatter(content) content = stripMDXPreamble(content) } // Collapse excessive blank lines to reduce indexing noise. content = reLocalBlankLines.ReplaceAllString(content, "\n\n") return strings.TrimSpace(content) } func stripMarkdownFrontmatter(content string) string { if !strings.HasPrefix(content, "---\n") && !strings.HasPrefix(content, "---\r\n") { return content } trimmed := reFrontMatterBlock.ReplaceAllString(content, "") return trimmed } func stripMDXPreamble(content string) string { lines := strings.Split(content, "\n") i := 0 for i < len(lines) { line := strings.TrimSpace(lines[i]) if line == "" { i++ continue } if strings.HasPrefix(line, "import ") || strings.HasPrefix(line, "export ") { i++ continue } break } return strings.Join(lines[i:], "\n") } func isDocumentationFile(path string) bool { ext := strings.ToLower(filepath.Ext(path)) switch ext { case ".md", ".mdx", ".txt", ".rst", ".adoc", ".json", ".yaml", ".yml", ".go", ".py", ".js", ".ts", ".tsx", ".java", ".rs", ".rb", ".php", ".html": return true default: return false } }