first commit

This commit is contained in:
Tomas Dvorak
2026-02-22 10:42:17 +01:00
commit 55885a0e8f
239 changed files with 103690 additions and 0 deletions
+182
View File
@@ -0,0 +1,182 @@
package markdown
import (
	"bytes"
	"fmt"
	"regexp"
	"sort"
	"strings"
	"time"

	"github.com/yuin/goldmark"
	"github.com/yuin/goldmark/extension"
	"github.com/yuin/goldmark/parser"
	"github.com/yuin/goldmark/renderer/html"
)
// Document represents a scraped document to be formatted as markdown.
// Of its fields, ToMarkdown reads Title, URL, Type, Timestamp, Metadata and
// Content; ID, Source and Hash are not referenced by the formatter code
// visible in this part of the file.
type Document struct {
// ID is serialized as "id". Presumably a unique identifier for the document —
// confirm with the code that produces Documents.
ID string `json:"id"`
// Source is serialized as "source". Note that the "Source" row of the rendered
// info table is filled from URL, not from this field.
Source string `json:"source"`
// Type is rendered inside backticks in the "Type" row of the info table.
Type string `json:"type"`
// Title becomes the top-level "# ..." heading of the rendered markdown.
Title string `json:"title"`
// Content is the document body; ToMarkdown passes it through enhanceContent
// before writing it under the content heading.
Content string `json:"content"`
// URL is rendered in the "Source" row of the info table and as the footer link.
URL string `json:"url"`
// Metadata holds extra key/value pairs. Each entry whose %v formatting is
// neither empty nor "<nil>" becomes an additional row of the info table.
Metadata map[string]interface{} `json:"metadata"`
// Hash is serialized as "hash". Presumably a digest of the content — TODO
// confirm; nothing in this part of the file computes or reads it.
Hash string `json:"hash"`
// Timestamp is formatted as "2006-01-02 15:04:05" for the "Scraped" row and
// the footer line.
Timestamp time.Time `json:"timestamp"`
}
// Formatter handles markdown conversion and enhancement.
// NewFormatter returns a Formatter with md set.
type Formatter struct {
// md is the goldmark instance built by NewFormatter. None of the methods
// visible in this part of the file read it.
md goldmark.Markdown
}
// NewFormatter creates a Formatter whose goldmark instance is configured with
// the GitHub Flavored Markdown extension set, automatic heading IDs, hard
// line wraps and XHTML-style output.
func NewFormatter() *Formatter {
	md := goldmark.New(
		// extension.GFM already bundles Table and Strikethrough (together with
		// Linkify and TaskList), so listing them again would only register the
		// same parsers and renderers a second time.
		goldmark.WithExtensions(extension.GFM),
		goldmark.WithParserOptions(
			parser.WithAutoHeadingID(),
		),
		goldmark.WithRendererOptions(
			html.WithHardWraps(),
			html.WithXHTML(),
		),
	)
	return &Formatter{md: md}
}
// ToMarkdown renders doc as an enhanced markdown document made of a title
// heading, a "Document Information" table, the enhanced content and a footer.
//
// Metadata rows are emitted in sorted key order so that the same Document
// always produces the same output (ranging over the map directly yields a
// random order on every call). Metadata entries whose %v formatting is empty
// or "<nil>" are skipped.
//
// A nil doc yields an empty string instead of a nil-pointer panic.
func (f *Formatter) ToMarkdown(doc *Document) string {
	if doc == nil {
		return ""
	}

	// The scrape time appears both in the table and in the footer.
	scrapedAt := doc.Timestamp.Format("2006-01-02 15:04:05")

	var buf bytes.Buffer

	// Header with metadata
	fmt.Fprintf(&buf, "# %s\n\n", doc.Title)

	// Document metadata table
	// NOTE(review): values are inserted verbatim; a "|" or a newline inside a
	// value will break the table row.
	buf.WriteString("## 📋 Document Information\n\n")
	buf.WriteString("| Property | Value |\n")
	buf.WriteString("|----------|-------|\n")
	fmt.Fprintf(&buf, "| **Source** | %s |\n", doc.URL)
	fmt.Fprintf(&buf, "| **Type** | `%s` |\n", doc.Type)
	fmt.Fprintf(&buf, "| **Scraped** | %s |\n", scrapedAt)

	// Extra metadata rows, in deterministic (sorted) key order. Ranging over a
	// nil map is a no-op, so no explicit nil check is needed.
	keys := make([]string, 0, len(doc.Metadata))
	for key := range doc.Metadata {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		strValue := fmt.Sprintf("%v", doc.Metadata[key])
		if strValue == "" || strValue == "<nil>" {
			continue
		}
		// strings.Title is deprecated, but its replacement lives in
		// golang.org/x/text, which this file does not import.
		fmt.Fprintf(&buf, "| **%s** | %s |\n", strings.Title(key), strValue)
	}
	buf.WriteString("\n")

	// Process and enhance content
	buf.WriteString("## 📚 Content\n\n")
	buf.WriteString(f.enhanceContent(doc.Content))
	buf.WriteString("\n")

	// Footer
	// NOTE(review): the project link points at the placeholder org "yourorg" —
	// confirm the real repository URL.
	buf.WriteString("---\n\n")
	fmt.Fprintf(&buf, "*Document scraped by [Devour](https://github.com/yourorg/devour) on %s*\n", scrapedAt)
	fmt.Fprintf(&buf, "*Source: [%s](%s)*\n", doc.URL, doc.URL)

	return buf.String()
}
// Patterns and tables used by enhanceContent. They are built once at package
// initialisation instead of being recompiled on every call.
var (
	// enhanceFenceRe matches a ``` fenced code block, fences included.
	enhanceFenceRe = regexp.MustCompile("(?s)```.*?```")
	// enhanceBlankRunRe matches three or more consecutive newlines.
	enhanceBlankRunRe = regexp.MustCompile(`\n{3,}`)
	// enhanceHeadingRe matches a run of '#' that starts the line directly
	// below a non-empty line.
	enhanceHeadingRe = regexp.MustCompile(`([^\n])\n(#{1,6})`)
	// enhanceURLRe matches bare URLs and URL-like host/path strings.
	enhanceURLRe = regexp.MustCompile(`(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s\!()\[\]{};:'".,<>?«»""'']))`)
	// enhanceLabelRe matches a call-out label, either already bold
	// ("**Note:**") or plain ("Note:").
	enhanceLabelRe = regexp.MustCompile(`\*\*(?:Example|Note|Warning|Important|TODO):\*\*|(?:Example|Note|Warning|Important|TODO):`)
	// enhanceLabelEmoji maps each call-out label to the emoji placed before it.
	enhanceLabelEmoji = map[string]string{
		"Example":   "💡",
		"Note":      "📝",
		"Warning":   "⚠️",
		"Important": "❗",
		"TODO":      "📋",
	}
)

// enhanceContent improves the readability of scraped content.
//
// The content is trimmed and then split into ``` fenced code blocks and the
// prose around them. Fenced blocks are copied through verbatim so that code
// is never rewritten; every prose segment goes through enhanceProse. An
// unterminated fence is treated as prose.
func (f *Formatter) enhanceContent(content string) string {
	content = strings.TrimSpace(content)

	var out strings.Builder
	out.Grow(len(content))

	pos := 0
	for _, loc := range enhanceFenceRe.FindAllStringIndex(content, -1) {
		out.WriteString(enhanceProse(content[pos:loc[0]]))
		out.WriteString(content[loc[0]:loc[1]]) // code block: leave untouched
		pos = loc[1]
	}
	out.WriteString(enhanceProse(content[pos:]))

	return out.String()
}

// enhanceProse applies the readability fixes to text that lies outside fenced
// code blocks: it collapses runs of blank lines, puts a blank line before a
// heading that directly follows text, links bare URLs and decorates call-out
// labels with an emoji and bold text.
func enhanceProse(text string) string {
	if text == "" {
		return text
	}

	// Fix multiple consecutive newlines
	text = enhanceBlankRunRe.ReplaceAllString(text, "\n\n")

	// Ensure proper heading spacing
	text = enhanceHeadingRe.ReplaceAllString(text, "$1\n\n$2")

	// Convert plain URLs to markdown links
	text = linkBareURLs(text)

	// Add emoji indicators for common patterns. A label that is already bold
	// is normalised instead of being wrapped in a second pair of asterisks.
	return enhanceLabelRe.ReplaceAllStringFunc(text, func(m string) string {
		label := strings.TrimSuffix(strings.Trim(m, "*"), ":")
		return enhanceLabelEmoji[label] + " **" + label + ":**"
	})
}

// linkBareURLs wraps every bare http(s) URL in text as a markdown link
// "[url](url)". Matches that do not start with "http" are left as they are,
// and so are URLs that already sit inside markup: a markdown link target or
// link text, an autolink "<url>", a code span or a quoted attribute value.
func linkBareURLs(text string) string {
	locs := enhanceURLRe.FindAllStringIndex(text, -1)
	if len(locs) == 0 {
		return text
	}

	var b strings.Builder
	b.Grow(len(text))

	pos := 0
	for _, loc := range locs {
		start, end := loc[0], loc[1]
		url := text[start:end]

		b.WriteString(text[pos:start])
		if strings.HasPrefix(url, "http") && !urlInsideMarkup(text, start) {
			b.WriteString("[" + url + "](" + url + ")")
		} else {
			b.WriteString(url)
		}
		pos = end
	}
	b.WriteString(text[pos:])

	return b.String()
}

// urlInsideMarkup reports whether the URL starting at byte offset start of
// text is directly preceded by markup that already gives it a role: "](",
// "[", "<", a backtick or a double quote.
func urlInsideMarkup(text string, start int) bool {
	if start == 0 {
		return false
	}
	switch text[start-1] {
	case '[', '<', '`', '"':
		return true
	case '(':
		return start >= 2 && text[start-2] == ']'
	}
	return false
}
// tocAnchorStripRe matches every character that is dropped from a heading
// anchor. It is compiled once instead of once per heading.
var tocAnchorStripRe = regexp.MustCompile(`[^a-z0-9\-]`)

// ToTableOfContents generates a markdown table of contents for content.
//
// Only ATX headings of level 1 to 3 are listed. A line counts as a heading
// when its leading run of '#' is followed by a space, a tab or the end of the
// line, so "#hashtag" and "#!/bin/sh" are ignored. Lines inside fenced code
// blocks (``` or ~~~) are skipped, which keeps shell or Python comments out
// of the TOC. Headings without text are skipped as well.
//
// Entries are nested with two spaces per level, relative to the shallowest
// listed heading, so the list always starts at column 0 and each deeper level
// is indented far enough to render as a nested list.
//
// It returns an empty string when content has no listable heading; otherwise
// the result starts with a "Table of Contents" heading and ends with a blank
// line.
func (f *Formatter) ToTableOfContents(content string) string {
	const maxLevel = 3 // Only include up to H3 in TOC

	type tocEntry struct {
		level  int
		title  string
		anchor string
	}

	// leadingRun counts how many times c repeats at the start of s.
	leadingRun := func(s string, c byte) int {
		n := 0
		for n < len(s) && s[n] == c {
			n++
		}
		return n
	}

	var (
		entries  []tocEntry
		fence    string // opening fence while inside a fenced code block, else ""
		minLevel = maxLevel
	)
	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		if fence != "" {
			// Only a bare run of the fence character, at least as long as the
			// opening fence, closes the block.
			if strings.HasPrefix(line, fence) && strings.Trim(line, fence[:1]) == "" {
				fence = ""
			}
			continue
		}
		if c := line[0]; c == '`' || c == '~' {
			// A backtick fence may not have backticks after the opening run;
			// such a line is an inline code span, not a fence.
			if n := leadingRun(line, c); n >= 3 && (c == '~' || !strings.Contains(line[n:], "`")) {
				fence = line[:n]
				continue
			}
		}

		level := leadingRun(line, '#')
		if level == 0 || level > maxLevel {
			continue
		}
		rest := line[level:]
		if rest != "" && rest[0] != ' ' && rest[0] != '\t' {
			continue // e.g. "#hashtag": no separator, so not a heading
		}
		title := strings.TrimSpace(rest)
		if title == "" {
			continue
		}

		anchor := strings.ToLower(strings.ReplaceAll(title, " ", "-"))
		anchor = tocAnchorStripRe.ReplaceAllString(anchor, "")

		entries = append(entries, tocEntry{level: level, title: title, anchor: anchor})
		if level < minLevel {
			minLevel = level
		}
	}

	if len(entries) == 0 {
		return ""
	}

	var b strings.Builder
	b.WriteString("## 📑 Table of Contents\n\n")
	for _, e := range entries {
		fmt.Fprintf(&b, "%s- [%s](#%s)\n", strings.Repeat("  ", e.level-minLevel), e.title, e.anchor)
	}
	b.WriteString("\n")
	return b.String()
}
// FormatWithTOC renders doc with ToMarkdown and places a generated table of
// contents directly in front of the content heading. The TOC is built from
// the text that follows that heading. If the rendered markdown does not
// contain the heading, it is returned unchanged.
func (f *Formatter) FormatWithTOC(doc *Document) string {
	const contentHeading = "## 📚 Content"

	rendered := f.ToMarkdown(doc)

	// Locate the first occurrence of the content heading; everything before
	// it is the header section, everything after it is the body.
	at := strings.Index(rendered, contentHeading)
	if at < 0 {
		return rendered
	}
	header := rendered[:at]
	body := rendered[at+len(contentHeading):]

	return header + f.ToTableOfContents(body) + contentHeading + body
}