mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 20:43:05 +00:00
150 lines
3.3 KiB
Go
150 lines
3.3 KiB
Go
package storage
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"github.com/yourorg/devour/internal/markdown"
|
|
"github.com/yourorg/devour/internal/scraper"
|
|
)
|
|
|
|
// SaveOptions controls how SaveDocuments writes scraped documents to disk.
type SaveOptions struct {
	// Format selects the output encoding: "json" or "markdown". SaveDocuments
	// compares it case-insensitively after trimming and treats an empty value
	// as "json".
	Format string

	// OutputDir is the directory that receives the files. It is required;
	// SaveDocuments creates it (including parents) when it does not exist.
	OutputDir string

	// AllowEmpty makes SaveDocuments return an empty result instead of an
	// error when there is nothing to save.
	AllowEmpty bool

	// PrintWriter, when non-nil, is called once per written file with a
	// printf-style format string and its arguments.
	PrintWriter func(string, ...any)
}
|
|
|
|
// SaveResult summarizes what a SaveDocuments call wrote.
type SaveResult struct {
	// Count is the number of files written; it equals len(Files).
	Count int

	// Files lists the path of every written file, in the order they were
	// written. Each path is the output directory joined with the file name.
	Files []string
}
|
|
|
|
// slugUnsafe matches every run of characters other than lowercase ASCII
// letters, digits, '.', '_' and '-'. slugify replaces each such run with a
// single "-". It is compiled once at package scope.
var slugUnsafe = regexp.MustCompile(`[^a-z0-9._-]+`)
|
|
|
|
func SaveDocuments(docs []*scraper.Document, opts SaveOptions) (*SaveResult, error) {
|
|
if len(docs) == 0 {
|
|
if opts.AllowEmpty {
|
|
return &SaveResult{}, nil
|
|
}
|
|
return nil, fmt.Errorf("no documents scraped")
|
|
}
|
|
|
|
format := strings.ToLower(strings.TrimSpace(opts.Format))
|
|
if format == "" {
|
|
format = "json"
|
|
}
|
|
if format != "json" && format != "markdown" {
|
|
return nil, fmt.Errorf("unsupported format: %s", opts.Format)
|
|
}
|
|
|
|
if strings.TrimSpace(opts.OutputDir) == "" {
|
|
return nil, fmt.Errorf("output directory is required")
|
|
}
|
|
if err := os.MkdirAll(opts.OutputDir, 0o755); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
used := map[string]int{}
|
|
files := make([]string, 0, len(docs))
|
|
formatter := markdown.NewFormatter()
|
|
|
|
for i, doc := range docs {
|
|
if doc == nil {
|
|
continue
|
|
}
|
|
base := slugify(defaultTitle(doc.Title, i))
|
|
ext := ".json"
|
|
if format == "markdown" {
|
|
ext = ".md"
|
|
}
|
|
name := uniqueName(base, ext, used, doc.ID)
|
|
path := filepath.Join(opts.OutputDir, name)
|
|
|
|
var b []byte
|
|
var err error
|
|
if format == "markdown" {
|
|
md := &markdown.Document{
|
|
ID: doc.ID,
|
|
Source: doc.Source,
|
|
Type: doc.Type,
|
|
Title: doc.Title,
|
|
Content: doc.Content,
|
|
URL: doc.URL,
|
|
Metadata: doc.Metadata,
|
|
Hash: doc.Hash,
|
|
Timestamp: doc.Timestamp,
|
|
}
|
|
b = []byte(formatter.FormatWithTOC(md))
|
|
} else {
|
|
b, err = json.MarshalIndent(doc, "", " ")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
if err := os.WriteFile(path, b, 0o644); err != nil {
|
|
return nil, err
|
|
}
|
|
files = append(files, path)
|
|
if opts.PrintWriter != nil {
|
|
opts.PrintWriter(" 📄 %s (%s)\n", filepath.Base(path), doc.Type)
|
|
}
|
|
}
|
|
|
|
if len(files) == 0 && !opts.AllowEmpty {
|
|
return nil, fmt.Errorf("no documents scraped")
|
|
}
|
|
|
|
return &SaveResult{Count: len(files), Files: files}, nil
|
|
}
|
|
|
|
// defaultTitle returns title with surrounding whitespace removed, or the
// placeholder "document_<idx>" when nothing is left after trimming.
func defaultTitle(title string, idx int) string {
	if trimmed := strings.TrimSpace(title); trimmed != "" {
		return trimmed
	}
	return fmt.Sprintf("document_%d", idx)
}
|
|
|
|
func slugify(name string) string {
|
|
name = strings.ToLower(strings.TrimSpace(name))
|
|
name = strings.ReplaceAll(name, " ", "-")
|
|
name = strings.ReplaceAll(name, "/", "-")
|
|
name = strings.ReplaceAll(name, "\\", "-")
|
|
name = strings.ReplaceAll(name, ":", "-")
|
|
name = strings.ReplaceAll(name, "?", "")
|
|
name = strings.ReplaceAll(name, "&", "and")
|
|
name = slugUnsafe.ReplaceAllString(name, "-")
|
|
name = strings.Trim(name, "-.")
|
|
if name == "" {
|
|
name = "document"
|
|
}
|
|
if len(name) > 80 {
|
|
name = strings.Trim(name[:80], "-.")
|
|
}
|
|
if name == "" {
|
|
name = "document"
|
|
}
|
|
return name
|
|
}
|
|
|
|
// uniqueName returns a file name built from base and ext that has not been
// returned before for the same used map.
//
// The first request for base+ext gets exactly that name. Later requests get
// "<base>-<id prefix><ext>" when id (trimmed) has at least 8 bytes, falling
// back to "<base>-<n><ext>" with an increasing counter. Every returned name
// is recorded in used, so a generated name can never be handed out twice,
// whether through a later title that happens to slug to it or through two
// ids sharing the same 8-byte prefix.
//
// Characters in the id prefix other than ASCII letters, digits, '.', '_' and
// '-' are replaced with '-' so the id cannot introduce path separators into
// the file name.
//
// used must be non-nil; it is updated in place.
func uniqueName(base, ext string, used map[string]int, id string) string {
	key := base + ext
	if used[key] == 0 {
		used[key] = 1
		return key
	}
	used[key]++

	// claim reserves name if no earlier call has handed it out.
	claim := func(name string) bool {
		if used[name] != 0 {
			return false
		}
		used[name] = 1
		return true
	}

	if id = strings.TrimSpace(id); len(id) >= 8 {
		// A byte cut may split a multi-byte rune; strings.Map decodes the
		// fragment as an invalid rune, which is replaced like any other
		// disallowed character.
		prefix := strings.Map(func(r rune) rune {
			switch {
			case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9',
				r == '.', r == '_', r == '-':
				return r
			}
			return '-'
		}, id[:8])
		if name := fmt.Sprintf("%s-%s%s", base, prefix, ext); claim(name) {
			return name
		}
	}

	// The counter only grows and used is finite, so a free name is always
	// reached.
	for {
		if name := fmt.Sprintf("%s-%d%s", base, used[key], ext); claim(name) {
			return name
		}
		used[key]++
	}
}
|