// Package storage writes scraped documents to disk as JSON or Markdown files.
package storage

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"strings"

	"github.com/yourorg/devour/internal/markdown"
	"github.com/yourorg/devour/internal/scraper"
)

// SaveOptions controls how SaveDocuments writes its output.
type SaveOptions struct {
	// Format is "json" or "markdown" (case-insensitive, surrounding
	// whitespace ignored). An empty value selects "json".
	Format string
	// OutputDir is the directory files are written to. It is required and
	// is created (with parents) if it does not exist.
	OutputDir string
	// AllowEmpty makes SaveDocuments return an empty result instead of an
	// error when there is nothing to write.
	AllowEmpty bool
	// PrintWriter, when non-nil, is called printf-style once per file
	// written.
	PrintWriter func(string, ...any)
}

// SaveResult reports what SaveDocuments wrote.
type SaveResult struct {
	// Count is the number of files written.
	Count int
	// Files holds the path of every file written, in write order.
	Files []string
}

var (
	// slugUnsafe matches every run of characters not allowed in a slug.
	slugUnsafe = regexp.MustCompile(`[^a-z0-9._-]+`)

	// slugReplacer applies the readable substitutions slugify performs
	// before slugUnsafe strips whatever is left.
	slugReplacer = strings.NewReplacer(
		" ", "-",
		"/", "-",
		"\\", "-",
		":", "-",
		"?", "",
		"&", "and",
	)
)

// SaveDocuments writes each non-nil document in docs to its own file inside
// opts.OutputDir, encoded according to opts.Format, and returns the paths it
// wrote.
//
// File names are derived from the document title (or "document_<index>" when
// the title is blank) and are made unique within a single call. Nil entries in
// docs are skipped.
//
// It returns an error when docs is empty or contains only nil entries (unless
// opts.AllowEmpty is set), when the format is unsupported, when OutputDir is
// blank, or when creating the directory, encoding, or writing a file fails.
// On a write failure, files written earlier in the call are left in place.
func SaveDocuments(docs []*scraper.Document, opts SaveOptions) (*SaveResult, error) {
	if len(docs) == 0 {
		if opts.AllowEmpty {
			return &SaveResult{}, nil
		}
		return nil, fmt.Errorf("no documents scraped")
	}

	// Resolve the format and its file extension once, up front.
	format := strings.ToLower(strings.TrimSpace(opts.Format))
	var ext string
	switch format {
	case "", "json":
		format, ext = "json", ".json"
	case "markdown":
		ext = ".md"
	default:
		return nil, fmt.Errorf("unsupported format: %s", opts.Format)
	}

	if strings.TrimSpace(opts.OutputDir) == "" {
		return nil, fmt.Errorf("output directory is required")
	}
	if err := os.MkdirAll(opts.OutputDir, 0o755); err != nil {
		return nil, fmt.Errorf("creating output directory %q: %w", opts.OutputDir, err)
	}

	used := make(map[string]int, len(docs))
	files := make([]string, 0, len(docs))
	formatter := markdown.NewFormatter()

	for i, doc := range docs {
		if doc == nil {
			continue
		}

		name := uniqueName(slugify(defaultTitle(doc.Title, i)), ext, used, doc.ID)
		path := filepath.Join(opts.OutputDir, name)

		var data []byte
		if format == "markdown" {
			md := &markdown.Document{
				ID:        doc.ID,
				Source:    doc.Source,
				Type:      doc.Type,
				Title:     doc.Title,
				Content:   doc.Content,
				URL:       doc.URL,
				Metadata:  doc.Metadata,
				Hash:      doc.Hash,
				Timestamp: doc.Timestamp,
			}
			data = []byte(formatter.FormatWithTOC(md))
		} else {
			var err error
			data, err = json.MarshalIndent(doc, "", " ")
			if err != nil {
				return nil, fmt.Errorf("encoding document %q as JSON: %w", doc.ID, err)
			}
		}

		if err := os.WriteFile(path, data, 0o644); err != nil {
			return nil, fmt.Errorf("writing %q: %w", path, err)
		}
		files = append(files, path)

		if opts.PrintWriter != nil {
			opts.PrintWriter(" 📄 %s (%s)\n", filepath.Base(path), doc.Type)
		}
	}

	// Every entry was nil: treat it the same as an empty input.
	if len(files) == 0 && !opts.AllowEmpty {
		return nil, fmt.Errorf("no documents scraped")
	}
	return &SaveResult{Count: len(files), Files: files}, nil
}

// defaultTitle returns title with surrounding whitespace removed, or
// "document_<idx>" when nothing is left.
func defaultTitle(title string, idx int) string {
	if trimmed := strings.TrimSpace(title); trimmed != "" {
		return trimmed
	}
	return fmt.Sprintf("document_%d", idx)
}

// slugify converts name into a lowercase file-name stem containing only
// [a-z0-9._-]. Spaces, slashes, backslashes and colons become "-", "?" is
// dropped, "&" becomes "and", and every remaining run of other characters
// becomes a single "-". Leading and trailing "-" and "." are removed, the
// result is capped at 80 bytes, and "document" is returned when nothing
// usable remains.
func slugify(name string) string {
	const maxLen = 80

	slug := strings.ToLower(strings.TrimSpace(name))
	slug = slugReplacer.Replace(slug)
	slug = slugUnsafe.ReplaceAllString(slug, "-")
	slug = strings.Trim(slug, "-.")
	if slug == "" {
		return "document"
	}
	if len(slug) > maxLen {
		// The slug is pure ASCII here, so byte slicing cannot split a rune.
		// Its first byte is neither '-' nor '.', so trimming the cut cannot
		// empty it.
		slug = strings.Trim(slug[:maxLen], "-.")
	}
	return slug
}

// uniqueName returns a file name built from base and ext that has not been
// handed out before according to used, and records it in used.
//
// The first request for a given base+ext gets exactly that name. Later
// requests get "<base>-<id prefix><ext>" when id (trimmed) has at least 8
// bytes and that name is still free, otherwise "<base>-<n><ext>" with the
// first free counter n >= 2. The id prefix is restricted to [A-Za-z0-9_-] so
// an ID can never inject path separators or ".." into the file name.
func uniqueName(base, ext string, used map[string]int, id string) string {
	key := base + ext
	if used[key] == 0 {
		used[key] = 1
		return key
	}
	used[key]++

	if id = strings.TrimSpace(id); len(id) >= 8 {
		// Any byte outside the safe set (including a rune split by the
		// 8-byte cut) is replaced with '-'.
		tag := strings.Map(func(r rune) rune {
			switch {
			case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z', '0' <= r && r <= '9', r == '-', r == '_':
				return r
			default:
				return '-'
			}
		}, id[:8])

		// Skip a tag that carries no identifying characters at all.
		if strings.Trim(tag, "-_") != "" {
			candidate := base + "-" + tag + ext
			if used[candidate] == 0 {
				used[candidate] = 1
				return candidate
			}
		}
	}

	// Numeric fallback: advance the counter until the name is free, so a
	// generated name never collides with one already handed out.
	for {
		candidate := fmt.Sprintf("%s-%d%s", base, used[key], ext)
		if used[candidate] == 0 {
			used[candidate] = 1
			return candidate
		}
		used[key]++
	}
}