mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
first commit
This commit is contained in:
@@ -0,0 +1,182 @@
|
||||
package markdown
|
||||
|
||||
import (
	"bytes"
	"fmt"
	"regexp"
	"sort"
	"strings"
	"time"

	"github.com/yuin/goldmark"
	"github.com/yuin/goldmark/extension"
	"github.com/yuin/goldmark/parser"
	"github.com/yuin/goldmark/renderer/html"
)
|
||||
|
||||
// Document is one scraped item together with the details the formatter
// prints around it. The JSON tags give the lower-case key for each field.
type Document struct {
	// ID is not read by the formatter in this file.
	ID string `json:"id"`

	// Source is not read by the formatter in this file.
	Source string `json:"source"`

	// Type is printed in the information table as inline code.
	Type string `json:"type"`

	// Title becomes the top-level heading of the rendered page.
	Title string `json:"title"`

	// Content is the body text; it is run through enhanceContent before
	// being written out.
	Content string `json:"content"`

	// URL is printed as the "Source" table row and as the footer link.
	URL string `json:"url"`

	// Metadata holds extra key/value pairs; each entry with a non-empty
	// value gets its own row in the information table.
	Metadata map[string]interface{} `json:"metadata"`

	// Hash is not read by the formatter in this file.
	Hash string `json:"hash"`

	// Timestamp is printed as the "Scraped" time in the table and footer.
	Timestamp time.Time `json:"timestamp"`
}
|
||||
|
||||
// Formatter handles markdown conversion and enhancement
|
||||
type Formatter struct {
|
||||
md goldmark.Markdown
|
||||
}
|
||||
|
||||
// NewFormatter creates a new markdown formatter with extensions
|
||||
func NewFormatter() *Formatter {
|
||||
md := goldmark.New(
|
||||
goldmark.WithExtensions(extension.GFM, extension.Table, extension.Strikethrough),
|
||||
goldmark.WithParserOptions(
|
||||
parser.WithAutoHeadingID(),
|
||||
),
|
||||
goldmark.WithRendererOptions(
|
||||
html.WithHardWraps(),
|
||||
html.WithXHTML(),
|
||||
),
|
||||
)
|
||||
|
||||
return &Formatter{md: md}
|
||||
}
|
||||
|
||||
// ToMarkdown converts a document to enhanced markdown format
|
||||
func (f *Formatter) ToMarkdown(doc *Document) string {
|
||||
var buf bytes.Buffer
|
||||
|
||||
// Header with metadata
|
||||
buf.WriteString(fmt.Sprintf("# %s\n\n", doc.Title))
|
||||
|
||||
// Document metadata table
|
||||
buf.WriteString("## 📋 Document Information\n\n")
|
||||
buf.WriteString("| Property | Value |\n")
|
||||
buf.WriteString("|----------|-------|\n")
|
||||
buf.WriteString(fmt.Sprintf("| **Source** | %s |\n", doc.URL))
|
||||
buf.WriteString(fmt.Sprintf("| **Type** | `%s` |\n", doc.Type))
|
||||
buf.WriteString(fmt.Sprintf("| **Scraped** | %s |\n", doc.Timestamp.Format("2006-01-02 15:04:05")))
|
||||
|
||||
// Add metadata if available
|
||||
if doc.Metadata != nil {
|
||||
for key, value := range doc.Metadata {
|
||||
if strValue := fmt.Sprintf("%v", value); strValue != "" && strValue != "<nil>" {
|
||||
buf.WriteString(fmt.Sprintf("| **%s** | %s |\n", strings.Title(key), strValue))
|
||||
}
|
||||
}
|
||||
}
|
||||
buf.WriteString("\n")
|
||||
|
||||
// Process and enhance content
|
||||
enhancedContent := f.enhanceContent(doc.Content)
|
||||
buf.WriteString("## 📚 Content\n\n")
|
||||
buf.WriteString(enhancedContent)
|
||||
buf.WriteString("\n")
|
||||
|
||||
// Footer
|
||||
buf.WriteString("---\n\n")
|
||||
buf.WriteString(fmt.Sprintf("*Document scraped by [Devour](https://github.com/yourorg/devour) on %s*\n", doc.Timestamp.Format("2006-01-02 15:04:05")))
|
||||
buf.WriteString(fmt.Sprintf("*Source: [%s](%s)*\n", doc.URL, doc.URL))
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// enhanceContent improves the readability of scraped content
|
||||
func (f *Formatter) enhanceContent(content string) string {
|
||||
// Clean up common issues
|
||||
content = strings.TrimSpace(content)
|
||||
|
||||
// Fix multiple consecutive newlines
|
||||
content = regexp.MustCompile(`\n{3,}`).ReplaceAllString(content, "\n\n")
|
||||
|
||||
// Ensure proper heading spacing
|
||||
content = regexp.MustCompile(`([^\n])\n(#{1,6})`).ReplaceAllString(content, "$1\n\n$2")
|
||||
|
||||
// Fix code blocks that might be malformed
|
||||
content = regexp.MustCompile("```(\\w+)?\\n([^`]*)\\n```").ReplaceAllStringFunc(content, func(match string) string {
|
||||
// Ensure code blocks are properly formatted
|
||||
lines := strings.Split(match, "\n")
|
||||
if len(lines) < 3 {
|
||||
return match
|
||||
}
|
||||
|
||||
lang := strings.TrimPrefix(lines[0], "```")
|
||||
codeContent := strings.Join(lines[1:len(lines)-1], "\n")
|
||||
|
||||
return fmt.Sprintf("```%s\n%s\n```", lang, codeContent)
|
||||
})
|
||||
|
||||
// Convert plain URLs to markdown links
|
||||
urlRegex := regexp.MustCompile(`(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s\!()\[\]{};:'".,<>?«»""'']))`)
|
||||
content = urlRegex.ReplaceAllStringFunc(content, func(url string) string {
|
||||
if strings.HasPrefix(url, "http") {
|
||||
return fmt.Sprintf("[%s](%s)", url, url)
|
||||
}
|
||||
return url
|
||||
})
|
||||
|
||||
// Add emoji indicators for common patterns
|
||||
content = strings.ReplaceAll(content, "Example:", "💡 **Example:**")
|
||||
content = strings.ReplaceAll(content, "Note:", "📝 **Note:**")
|
||||
content = strings.ReplaceAll(content, "Warning:", "⚠️ **Warning:**")
|
||||
content = strings.ReplaceAll(content, "Important:", "❗ **Important:**")
|
||||
content = strings.ReplaceAll(content, "TODO:", "📋 **TODO:**")
|
||||
|
||||
return content
|
||||
}
|
||||
|
||||
// ToTableOfContents generates a TOC for the document
|
||||
func (f *Formatter) ToTableOfContents(content string) string {
|
||||
lines := strings.Split(content, "\n")
|
||||
var toc []string
|
||||
var level int
|
||||
|
||||
for _, line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if strings.HasPrefix(line, "#") {
|
||||
// Count heading level
|
||||
level = 0
|
||||
for _, char := range line {
|
||||
if char == '#' {
|
||||
level++
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if level <= 3 { // Only include up to H3 in TOC
|
||||
title := strings.TrimSpace(line[level:])
|
||||
anchor := strings.ToLower(strings.ReplaceAll(title, " ", "-"))
|
||||
anchor = regexp.MustCompile(`[^a-z0-9\-]`).ReplaceAllString(anchor, "")
|
||||
|
||||
indent := strings.Repeat(" ", level-1)
|
||||
toc = append(toc, fmt.Sprintf("%s- [%s](#%s)", indent, title, anchor))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(toc) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
return "## 📑 Table of Contents\n\n" + strings.Join(toc, "\n") + "\n\n"
|
||||
}
|
||||
|
||||
// FormatWithTOC formats a document with an automatically generated table of contents
|
||||
func (f *Formatter) FormatWithTOC(doc *Document) string {
|
||||
markdown := f.ToMarkdown(doc)
|
||||
|
||||
// Insert TOC after the header but before the content
|
||||
parts := strings.SplitN(markdown, "## 📚 Content", 2)
|
||||
if len(parts) == 2 {
|
||||
toc := f.ToTableOfContents(parts[1])
|
||||
return parts[0] + toc + "## 📚 Content" + parts[1]
|
||||
}
|
||||
|
||||
return markdown
|
||||
}
|
||||
Reference in New Issue
Block a user