Files
Devour/internal/storage/writer.go
T
Tomas Dvorak 898a3c303f update
2026-02-24 10:33:59 +01:00

150 lines
3.3 KiB
Go

package storage
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
"github.com/yourorg/devour/internal/markdown"
"github.com/yourorg/devour/internal/scraper"
)
// SaveOptions configures how SaveDocuments writes documents to disk.
type SaveOptions struct {
// Format selects the output encoding: "json" or "markdown". It is matched
// case-insensitively after trimming whitespace; empty means "json".
Format string
// OutputDir is the directory files are written into. It is required and
// is created (including parents) if it does not exist.
OutputDir string
// AllowEmpty makes "nothing to save" a successful, empty result instead
// of an error.
AllowEmpty bool
// PrintWriter, when non-nil, is called once per written file with a
// printf-style format string and its arguments (file base name and
// document type).
PrintWriter func(string, ...any)
}
// SaveResult summarizes a SaveDocuments call.
type SaveResult struct {
// Count is the number of files written (len(Files)).
Count int
// Files holds the path of every written file, in write order. Each path
// is OutputDir joined with the generated file name.
Files []string
}
// slugUnsafe matches every run of characters that are not lowercase ASCII
// letters, digits, '.', '_' or '-'. slugify replaces each such run with "-".
var slugUnsafe = regexp.MustCompile(`[^a-z0-9._-]+`)
// SaveDocuments writes every non-nil document in docs to its own file under
// opts.OutputDir and returns the paths that were written.
//
// opts.Format selects the encoding: "json" (also used when Format is empty)
// or "markdown"; it is compared case-insensitively after trimming whitespace.
// File names come from slugify applied to the document title (or
// "document_<index>" when the title is blank) and are de-duplicated with
// uniqueName.
//
// An error is returned when there is nothing to save and opts.AllowEmpty is
// false, when the format is unsupported, when OutputDir is blank, or when
// creating the directory, encoding a document, or writing a file fails.
// Underlying errors are wrapped with %w so callers can still inspect them.
// Files written before a failure are left on disk.
func SaveDocuments(docs []*scraper.Document, opts SaveOptions) (*SaveResult, error) {
	if len(docs) == 0 {
		if opts.AllowEmpty {
			return &SaveResult{}, nil
		}
		return nil, fmt.Errorf("no documents scraped")
	}

	format := strings.ToLower(strings.TrimSpace(opts.Format))
	if format == "" {
		format = "json"
	}
	if format != "json" && format != "markdown" {
		return nil, fmt.Errorf("unsupported format: %s", opts.Format)
	}

	if strings.TrimSpace(opts.OutputDir) == "" {
		return nil, fmt.Errorf("output directory is required")
	}
	if err := os.MkdirAll(opts.OutputDir, 0o755); err != nil {
		return nil, fmt.Errorf("create output directory: %w", err)
	}

	// The extension depends only on the format, so resolve it once.
	ext := ".json"
	if format == "markdown" {
		ext = ".md"
	}

	used := map[string]int{}
	files := make([]string, 0, len(docs))
	formatter := markdown.NewFormatter()

	for i, doc := range docs {
		if doc == nil {
			continue
		}

		base := slugify(defaultTitle(doc.Title, i))
		name := uniqueName(base, ext, used, doc.ID)
		path := filepath.Join(opts.OutputDir, name)

		var data []byte
		if format == "markdown" {
			md := &markdown.Document{
				ID:        doc.ID,
				Source:    doc.Source,
				Type:      doc.Type,
				Title:     doc.Title,
				Content:   doc.Content,
				URL:       doc.URL,
				Metadata:  doc.Metadata,
				Hash:      doc.Hash,
				Timestamp: doc.Timestamp,
			}
			data = []byte(formatter.FormatWithTOC(md))
		} else {
			encoded, err := json.MarshalIndent(doc, "", " ")
			if err != nil {
				return nil, fmt.Errorf("marshal document %q: %w", doc.ID, err)
			}
			data = encoded
		}

		if err := os.WriteFile(path, data, 0o644); err != nil {
			return nil, fmt.Errorf("write document %q: %w", doc.ID, err)
		}
		files = append(files, path)

		if opts.PrintWriter != nil {
			opts.PrintWriter(" 📄 %s (%s)\n", filepath.Base(path), doc.Type)
		}
	}

	// docs may have contained only nil entries.
	if len(files) == 0 && !opts.AllowEmpty {
		return nil, fmt.Errorf("no documents scraped")
	}
	return &SaveResult{Count: len(files), Files: files}, nil
}
// defaultTitle returns title with surrounding whitespace removed, or the
// placeholder "document_<idx>" when nothing is left after trimming.
func defaultTitle(title string, idx int) string {
	if trimmed := strings.TrimSpace(title); trimmed != "" {
		return trimmed
	}
	return fmt.Sprintf("document_%d", idx)
}
// slugify converts name into a lowercase slug suitable for use as a file name
// (without extension).
//
// The input is trimmed and lowercased; spaces, slashes, backslashes and
// colons each become "-", "?" is removed and "&" becomes "and". Every
// remaining run of characters matched by slugUnsafe is replaced with a single
// "-", and leading/trailing "-" and "." are stripped. Slugs longer than 80
// bytes are cut to 80 and stripped again. If nothing is left, "document" is
// returned.
func slugify(name string) string {
	const (
		fallback = "document"
		maxLen   = 80
		edgeCut  = "-."
	)

	// All patterns are single characters and no replacement contains one of
	// them, so a single pass is equivalent to applying them one after another.
	cleaned := strings.NewReplacer(
		" ", "-",
		"/", "-",
		"\\", "-",
		":", "-",
		"?", "",
		"&", "and",
	).Replace(strings.ToLower(strings.TrimSpace(name)))

	slug := strings.Trim(slugUnsafe.ReplaceAllString(cleaned, "-"), edgeCut)
	if len(slug) > maxLen {
		// The slug is pure ASCII at this point, so byte slicing is safe.
		slug = strings.Trim(slug[:maxLen], edgeCut)
	}
	if slug == "" {
		return fallback
	}
	return slug
}
// uniqueName returns a file name built from base and ext that has not been
// returned before for the same used map, and records it in used.
//
// The first request for base+ext gets exactly that name. On a collision the
// name is disambiguated, in order of preference, with:
//
//  1. the first 8 bytes of the trimmed id (only when id is at least 8 bytes
//     long), reduced to ASCII letters, digits, '.', '_' and '-';
//  2. a numeric suffix, starting at the number of requests seen for base+ext.
//
// Every returned name is marked in used, so a generated name is never handed
// out twice and cannot clash with a later document whose own slug happens to
// equal a generated name such as "<base>-2".
func uniqueName(base, ext string, used map[string]int, id string) string {
	key := base + ext
	if used[key] == 0 {
		used[key] = 1
		return key
	}
	used[key]++

	id = strings.TrimSpace(id)
	if len(id) >= 8 {
		// id is not validated here; drop anything that is not filename-safe so
		// the suffix can never introduce path separators. Bytes of a rune cut
		// in half by the slice decode as U+FFFD and are dropped as well.
		tag := strings.Map(func(r rune) rune {
			switch {
			case r >= 'a' && r <= 'z',
				r >= 'A' && r <= 'Z',
				r >= '0' && r <= '9',
				r == '.', r == '_', r == '-':
				return r
			}
			return -1
		}, id[:8])
		tag = strings.Trim(tag, "-.")
		if tag != "" {
			candidate := fmt.Sprintf("%s-%s%s", base, tag, ext)
			if used[candidate] == 0 {
				used[candidate] = 1
				return candidate
			}
		}
	}

	// Fall back to a counter, skipping any name that is already taken.
	for n := used[key]; ; n++ {
		candidate := fmt.Sprintf("%s-%d%s", base, n, ext)
		if used[candidate] == 0 {
			used[candidate] = 1
			used[key] = n
			return candidate
		}
	}
}