mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
@@ -0,0 +1,149 @@
|
||||
package storage
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/yourorg/devour/internal/markdown"
|
||||
"github.com/yourorg/devour/internal/scraper"
|
||||
)
|
||||
|
||||
// SaveOptions configures how SaveDocuments writes documents to disk.
type SaveOptions struct {
	// Format selects the output encoding, "json" or "markdown". It is
	// trimmed and lower-cased before use; a blank value means "json".
	Format string

	// OutputDir is the directory that receives the files. It must not be
	// blank and is created, including parents, when it does not exist.
	OutputDir string

	// AllowEmpty turns "nothing to save" into a successful empty result
	// instead of an error.
	AllowEmpty bool

	// PrintWriter, when non-nil, is called once per written file with a
	// printf-style format string followed by the file's base name and the
	// document type.
	PrintWriter func(string, ...any)
}
|
||||
|
||||
// SaveResult reports what SaveDocuments wrote.
type SaveResult struct {
	// Count is the number of files written; it equals len(Files).
	Count int

	// Files holds the path of every written file, in the order the files
	// were written.
	Files []string
}
|
||||
|
||||
// slugUnsafe matches every run of characters other than lowercase ASCII
// letters, digits, '.', '_' and '-'. slugify replaces each such run with "-".
var slugUnsafe = regexp.MustCompile(`[^a-z0-9._-]+`)
|
||||
|
||||
func SaveDocuments(docs []*scraper.Document, opts SaveOptions) (*SaveResult, error) {
|
||||
if len(docs) == 0 {
|
||||
if opts.AllowEmpty {
|
||||
return &SaveResult{}, nil
|
||||
}
|
||||
return nil, fmt.Errorf("no documents scraped")
|
||||
}
|
||||
|
||||
format := strings.ToLower(strings.TrimSpace(opts.Format))
|
||||
if format == "" {
|
||||
format = "json"
|
||||
}
|
||||
if format != "json" && format != "markdown" {
|
||||
return nil, fmt.Errorf("unsupported format: %s", opts.Format)
|
||||
}
|
||||
|
||||
if strings.TrimSpace(opts.OutputDir) == "" {
|
||||
return nil, fmt.Errorf("output directory is required")
|
||||
}
|
||||
if err := os.MkdirAll(opts.OutputDir, 0o755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
used := map[string]int{}
|
||||
files := make([]string, 0, len(docs))
|
||||
formatter := markdown.NewFormatter()
|
||||
|
||||
for i, doc := range docs {
|
||||
if doc == nil {
|
||||
continue
|
||||
}
|
||||
base := slugify(defaultTitle(doc.Title, i))
|
||||
ext := ".json"
|
||||
if format == "markdown" {
|
||||
ext = ".md"
|
||||
}
|
||||
name := uniqueName(base, ext, used, doc.ID)
|
||||
path := filepath.Join(opts.OutputDir, name)
|
||||
|
||||
var b []byte
|
||||
var err error
|
||||
if format == "markdown" {
|
||||
md := &markdown.Document{
|
||||
ID: doc.ID,
|
||||
Source: doc.Source,
|
||||
Type: doc.Type,
|
||||
Title: doc.Title,
|
||||
Content: doc.Content,
|
||||
URL: doc.URL,
|
||||
Metadata: doc.Metadata,
|
||||
Hash: doc.Hash,
|
||||
Timestamp: doc.Timestamp,
|
||||
}
|
||||
b = []byte(formatter.FormatWithTOC(md))
|
||||
} else {
|
||||
b, err = json.MarshalIndent(doc, "", " ")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if err := os.WriteFile(path, b, 0o644); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
files = append(files, path)
|
||||
if opts.PrintWriter != nil {
|
||||
opts.PrintWriter(" 📄 %s (%s)\n", filepath.Base(path), doc.Type)
|
||||
}
|
||||
}
|
||||
|
||||
if len(files) == 0 && !opts.AllowEmpty {
|
||||
return nil, fmt.Errorf("no documents scraped")
|
||||
}
|
||||
|
||||
return &SaveResult{Count: len(files), Files: files}, nil
|
||||
}
|
||||
|
||||
// defaultTitle returns title with surrounding whitespace removed, or the
// placeholder "document_<idx>" when nothing remains after trimming.
func defaultTitle(title string, idx int) string {
	if trimmed := strings.TrimSpace(title); len(trimmed) > 0 {
		return trimmed
	}
	return fmt.Sprintf("document_%d", idx)
}
|
||||
|
||||
func slugify(name string) string {
|
||||
name = strings.ToLower(strings.TrimSpace(name))
|
||||
name = strings.ReplaceAll(name, " ", "-")
|
||||
name = strings.ReplaceAll(name, "/", "-")
|
||||
name = strings.ReplaceAll(name, "\\", "-")
|
||||
name = strings.ReplaceAll(name, ":", "-")
|
||||
name = strings.ReplaceAll(name, "?", "")
|
||||
name = strings.ReplaceAll(name, "&", "and")
|
||||
name = slugUnsafe.ReplaceAllString(name, "-")
|
||||
name = strings.Trim(name, "-.")
|
||||
if name == "" {
|
||||
name = "document"
|
||||
}
|
||||
if len(name) > 80 {
|
||||
name = strings.Trim(name[:80], "-.")
|
||||
}
|
||||
if name == "" {
|
||||
name = "document"
|
||||
}
|
||||
return name
|
||||
}
|
||||
|
||||
// uniqueName returns a file name built from base and ext that has not been
// handed out before according to used, and records the returned name in used.
//
// The first request for a given base+ext gets exactly that name. Later
// requests get "<base>-<id8><ext>", where id8 is the first eight file-name-safe
// characters of id. When id has fewer than eight such characters, or that
// name is already taken, the result is "<base>-<n><ext>" with n counting up
// from 2 until a free name is found.
//
// used maps every handed-out name to a positive count and is updated in
// place; it must be non-nil.
func uniqueName(base, ext string, used map[string]int, id string) string {
	key := base + ext
	if used[key] == 0 {
		used[key] = 1
		return key
	}

	// Keep only characters that are safe inside a single path element, so an
	// ID containing separators or ".." cannot move the file out of the
	// output directory.
	safeID := strings.Map(func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9', r == '-', r == '_':
			return r
		}
		return -1
	}, strings.TrimSpace(id))

	if len(safeID) >= 8 {
		candidate := fmt.Sprintf("%s-%s%s", base, safeID[:8], ext)
		if used[candidate] == 0 {
			used[key]++
			// Reserve the generated name too, so a later document whose
			// own slug equals it does not overwrite this file.
			used[candidate] = 1
			return candidate
		}
	}

	// Numeric fallback: keep counting until the name is actually free.
	for {
		used[key]++
		candidate := fmt.Sprintf("%s-%d%s", base, used[key], ext)
		if used[candidate] == 0 {
			used[candidate] = 1
			return candidate
		}
	}
}
|
||||
Reference in New Issue
Block a user