mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
183 lines
5.6 KiB
Go
183 lines
5.6 KiB
Go
package markdown
|
|
|
|
import (
	"bytes"
	"fmt"
	"regexp"
	"sort"
	"strings"
	"time"

	"github.com/yuin/goldmark"
	"github.com/yuin/goldmark/extension"
	"github.com/yuin/goldmark/parser"
	"github.com/yuin/goldmark/renderer/html"
)
|
|
|
|
// Document is a single scraped page or resource, carrying everything the
// Formatter needs to render it as markdown. The JSON tags define its
// serialized field names.
type Document struct {
	// ID identifies the document; it is not read by the formatter in this file.
	ID string `json:"id"`
	// Source is not read by the formatter in this file (the "Source" table
	// row is filled from URL). Presumably names the originating scraper or
	// site — confirm against the producer.
	Source string `json:"source"`
	// Type is rendered as inline code in the information table.
	Type string `json:"type"`
	// Title becomes the top-level heading of the rendered markdown.
	Title string `json:"title"`
	// Content is the document body; it is cleaned up before being emitted.
	Content string `json:"content"`
	// URL is shown in the information table and linked in the footer.
	URL string `json:"url"`
	// Metadata holds free-form extra properties; each non-empty entry is
	// rendered as an additional row of the information table.
	Metadata map[string]interface{} `json:"metadata"`
	// Hash is not read by the formatter in this file. Presumably a content
	// digest — confirm against the producer.
	Hash string `json:"hash"`
	// Timestamp is rendered as the "Scraped" time in the table and footer.
	Timestamp time.Time `json:"timestamp"`
}
|
|
|
|
// Formatter handles markdown conversion and enhancement
|
|
type Formatter struct {
|
|
md goldmark.Markdown
|
|
}
|
|
|
|
// NewFormatter creates a new markdown formatter with extensions
|
|
func NewFormatter() *Formatter {
|
|
md := goldmark.New(
|
|
goldmark.WithExtensions(extension.GFM, extension.Table, extension.Strikethrough),
|
|
goldmark.WithParserOptions(
|
|
parser.WithAutoHeadingID(),
|
|
),
|
|
goldmark.WithRendererOptions(
|
|
html.WithHardWraps(),
|
|
html.WithXHTML(),
|
|
),
|
|
)
|
|
|
|
return &Formatter{md: md}
|
|
}
|
|
|
|
// ToMarkdown converts a document to enhanced markdown format
|
|
func (f *Formatter) ToMarkdown(doc *Document) string {
|
|
var buf bytes.Buffer
|
|
|
|
// Header with metadata
|
|
buf.WriteString(fmt.Sprintf("# %s\n\n", doc.Title))
|
|
|
|
// Document metadata table
|
|
buf.WriteString("## 📋 Document Information\n\n")
|
|
buf.WriteString("| Property | Value |\n")
|
|
buf.WriteString("|----------|-------|\n")
|
|
buf.WriteString(fmt.Sprintf("| **Source** | %s |\n", doc.URL))
|
|
buf.WriteString(fmt.Sprintf("| **Type** | `%s` |\n", doc.Type))
|
|
buf.WriteString(fmt.Sprintf("| **Scraped** | %s |\n", doc.Timestamp.Format("2006-01-02 15:04:05")))
|
|
|
|
// Add metadata if available
|
|
if doc.Metadata != nil {
|
|
for key, value := range doc.Metadata {
|
|
if strValue := fmt.Sprintf("%v", value); strValue != "" && strValue != "<nil>" {
|
|
buf.WriteString(fmt.Sprintf("| **%s** | %s |\n", strings.Title(key), strValue))
|
|
}
|
|
}
|
|
}
|
|
buf.WriteString("\n")
|
|
|
|
// Process and enhance content
|
|
enhancedContent := f.enhanceContent(doc.Content)
|
|
buf.WriteString("## 📚 Content\n\n")
|
|
buf.WriteString(enhancedContent)
|
|
buf.WriteString("\n")
|
|
|
|
// Footer
|
|
buf.WriteString("---\n\n")
|
|
buf.WriteString(fmt.Sprintf("*Document scraped by [Devour](https://github.com/yourorg/devour) on %s*\n", doc.Timestamp.Format("2006-01-02 15:04:05")))
|
|
buf.WriteString(fmt.Sprintf("*Source: [%s](%s)*\n", doc.URL, doc.URL))
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
// enhanceContent improves the readability of scraped content
|
|
func (f *Formatter) enhanceContent(content string) string {
|
|
// Clean up common issues
|
|
content = strings.TrimSpace(content)
|
|
|
|
// Fix multiple consecutive newlines
|
|
content = regexp.MustCompile(`\n{3,}`).ReplaceAllString(content, "\n\n")
|
|
|
|
// Ensure proper heading spacing
|
|
content = regexp.MustCompile(`([^\n])\n(#{1,6})`).ReplaceAllString(content, "$1\n\n$2")
|
|
|
|
// Fix code blocks that might be malformed
|
|
content = regexp.MustCompile("```(\\w+)?\\n([^`]*)\\n```").ReplaceAllStringFunc(content, func(match string) string {
|
|
// Ensure code blocks are properly formatted
|
|
lines := strings.Split(match, "\n")
|
|
if len(lines) < 3 {
|
|
return match
|
|
}
|
|
|
|
lang := strings.TrimPrefix(lines[0], "```")
|
|
codeContent := strings.Join(lines[1:len(lines)-1], "\n")
|
|
|
|
return fmt.Sprintf("```%s\n%s\n```", lang, codeContent)
|
|
})
|
|
|
|
// Convert plain URLs to markdown links
|
|
urlRegex := regexp.MustCompile(`(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s\!()\[\]{};:'".,<>?«»""'']))`)
|
|
content = urlRegex.ReplaceAllStringFunc(content, func(url string) string {
|
|
if strings.HasPrefix(url, "http") {
|
|
return fmt.Sprintf("[%s](%s)", url, url)
|
|
}
|
|
return url
|
|
})
|
|
|
|
// Add emoji indicators for common patterns
|
|
content = strings.ReplaceAll(content, "Example:", "💡 **Example:**")
|
|
content = strings.ReplaceAll(content, "Note:", "📝 **Note:**")
|
|
content = strings.ReplaceAll(content, "Warning:", "⚠️ **Warning:**")
|
|
content = strings.ReplaceAll(content, "Important:", "❗ **Important:**")
|
|
content = strings.ReplaceAll(content, "TODO:", "📋 **TODO:**")
|
|
|
|
return content
|
|
}
|
|
|
|
// ToTableOfContents generates a TOC for the document
|
|
func (f *Formatter) ToTableOfContents(content string) string {
|
|
lines := strings.Split(content, "\n")
|
|
var toc []string
|
|
var level int
|
|
|
|
for _, line := range lines {
|
|
line = strings.TrimSpace(line)
|
|
if strings.HasPrefix(line, "#") {
|
|
// Count heading level
|
|
level = 0
|
|
for _, char := range line {
|
|
if char == '#' {
|
|
level++
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
|
|
if level <= 3 { // Only include up to H3 in TOC
|
|
title := strings.TrimSpace(line[level:])
|
|
anchor := strings.ToLower(strings.ReplaceAll(title, " ", "-"))
|
|
anchor = regexp.MustCompile(`[^a-z0-9\-]`).ReplaceAllString(anchor, "")
|
|
|
|
indent := strings.Repeat(" ", level-1)
|
|
toc = append(toc, fmt.Sprintf("%s- [%s](#%s)", indent, title, anchor))
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(toc) == 0 {
|
|
return ""
|
|
}
|
|
|
|
return "## 📑 Table of Contents\n\n" + strings.Join(toc, "\n") + "\n\n"
|
|
}
|
|
|
|
// FormatWithTOC formats a document with an automatically generated table of contents
|
|
func (f *Formatter) FormatWithTOC(doc *Document) string {
|
|
markdown := f.ToMarkdown(doc)
|
|
|
|
// Insert TOC after the header but before the content
|
|
parts := strings.SplitN(markdown, "## 📚 Content", 2)
|
|
if len(parts) == 2 {
|
|
toc := f.ToTableOfContents(parts[1])
|
|
return parts[0] + toc + "## 📚 Content" + parts[1]
|
|
}
|
|
|
|
return markdown
|
|
}
|