first commit

2026-07-29 07:33:48 +00:00 · 2026-02-22 10:42:17 +01:00
commit 55885a0e8f
239 changed files with 103690 additions and 0 deletions
@@ -0,0 +1,182 @@
+package markdown
+
+import (
+	"bytes"
+	"fmt"
+	"regexp"
+	"sort"
+	"strings"
+	"time"
+
+	"github.com/yuin/goldmark"
+	"github.com/yuin/goldmark/extension"
+	"github.com/yuin/goldmark/parser"
+	"github.com/yuin/goldmark/renderer/html"
+)
+
+// Document represents a scraped document to be formatted as markdown; ToMarkdown reads Title, URL, Type, Timestamp, Metadata and Content.
+type Document struct {
+	ID        string                 `json:"id"`        // not read by ToMarkdown
+	Source    string                 `json:"source"`    // not read by ToMarkdown (the "Source" table row shows URL instead)
+	Type      string                 `json:"type"`      // shown in backticks in the "Type" table row
+	Title     string                 `json:"title"`     // rendered as the top-level "# " heading
+	Content   string                 `json:"content"`   // body text; passed through enhanceContent before being written
+	URL       string                 `json:"url"`       // shown in the "Source" table row and linked in the footer
+	Metadata  map[string]interface{} `json:"metadata"`  // extra table rows; values that format to "" or "<nil>" are skipped
+	Hash      string                 `json:"hash"`      // not read by ToMarkdown
+	Timestamp time.Time              `json:"timestamp"` // formatted as "2006-01-02 15:04:05" in the "Scraped" row and the footer
+}
+
+// Formatter handles markdown conversion and enhancement. Construct it with NewFormatter, which populates md.
+type Formatter struct {
+	md goldmark.Markdown // set by NewFormatter; ToMarkdown, enhanceContent, ToTableOfContents and FormatWithTOC do not read it
+}
+
+// NewFormatter creates a Formatter whose goldmark instance has GitHub
+// Flavored Markdown enabled.
+//
+// extension.GFM already bundles the Table, Strikethrough, Linkify and
+// TaskList extensions, so Table and Strikethrough are not listed again;
+// listing them would only register the same parsers and renderers twice.
+//
+// The parser assigns an automatic ID to every heading, and the renderer is
+// configured with hard line wraps and XHTML output.
+func NewFormatter() *Formatter {
+	md := goldmark.New(
+		goldmark.WithExtensions(extension.GFM),
+		goldmark.WithParserOptions(
+			parser.WithAutoHeadingID(),
+		),
+		goldmark.WithRendererOptions(
+			html.WithHardWraps(),
+			html.WithXHTML(),
+		),
+	)
+
+	return &Formatter{md: md}
+}
+
+// ToMarkdown renders doc as a markdown page and returns it as a string.
+//
+// The page consists of, in order: an H1 with doc.Title, a "Document
+// Information" table (URL, type, scrape time, then one row per metadata
+// entry), a "Content" section holding doc.Content after enhanceContent has
+// processed it, and a footer that repeats the scrape time and source URL.
+//
+// Metadata rows are written in ascending key order so that the same document
+// always produces the same output; ranging over the map directly would yield
+// a different row order from run to run. Entries whose value formats to ""
+// or "<nil>" are omitted.
+//
+// A nil doc yields an empty string instead of a panic.
+func (f *Formatter) ToMarkdown(doc *Document) string {
+	if doc == nil {
+		return ""
+	}
+
+	scrapedAt := doc.Timestamp.Format("2006-01-02 15:04:05")
+
+	var buf bytes.Buffer
+
+	// Header with metadata
+	fmt.Fprintf(&buf, "# %s\n\n", doc.Title)
+
+	// Document metadata table
+	buf.WriteString("## 📋 Document Information\n\n")
+	buf.WriteString("| Property | Value |\n")
+	buf.WriteString("|----------|-------|\n")
+	fmt.Fprintf(&buf, "| **Source** | %s |\n", doc.URL)
+	fmt.Fprintf(&buf, "| **Type** | `%s` |\n", doc.Type)
+	fmt.Fprintf(&buf, "| **Scraped** | %s |\n", scrapedAt)
+
+	// Sort the keys first: map iteration order is deliberately randomised.
+	// Ranging over a nil map is a no-op, so no explicit nil check is needed.
+	keys := make([]string, 0, len(doc.Metadata))
+	for key := range doc.Metadata {
+		keys = append(keys, key)
+	}
+	sort.Strings(keys)
+
+	for _, key := range keys {
+		value := fmt.Sprintf("%v", doc.Metadata[key])
+		if value == "" || value == "<nil>" {
+			continue
+		}
+		// strings.Title is deprecated, but its replacement lives outside the
+		// standard library; it is kept so the labels stay unchanged.
+		fmt.Fprintf(&buf, "| **%s** | %s |\n", strings.Title(key), value)
+	}
+	buf.WriteString("\n")
+
+	// Process and enhance content
+	buf.WriteString("## 📚 Content\n\n")
+	buf.WriteString(f.enhanceContent(doc.Content))
+	buf.WriteString("\n")
+
+	// Footer
+	buf.WriteString("---\n\n")
+	fmt.Fprintf(&buf, "*Document scraped by [Devour](https://github.com/yourorg/devour) on %s*\n", scrapedAt)
+	fmt.Fprintf(&buf, "*Source: [%s](%s)*\n", doc.URL, doc.URL)
+
+	return buf.String()
+}
+
+// enhanceContent improves the readability of scraped content
+func (f *Formatter) enhanceContent(content string) string {
+	// Clean up common issues
+	content = strings.TrimSpace(content)
+
+	// Fix multiple consecutive newlines
+	content = regexp.MustCompile(`\n{3,}`).ReplaceAllString(content, "\n\n")
+
+	// Ensure proper heading spacing
+	content = regexp.MustCompile(`([^\n])\n(#{1,6})`).ReplaceAllString(content, "$1\n\n$2")
+
+	// Fix code blocks that might be malformed
+	content = regexp.MustCompile("```(\\w+)?\\n([^`]*)\\n```").ReplaceAllStringFunc(content, func(match string) string {
+		// Ensure code blocks are properly formatted
+		lines := strings.Split(match, "\n")
+		if len(lines) < 3 {
+			return match
+		}
+
+		lang := strings.TrimPrefix(lines[0], "```")
+		codeContent := strings.Join(lines[1:len(lines)-1], "\n")
+
+		return fmt.Sprintf("```%s\n%s\n```", lang, codeContent)
+	})
+
+	// Convert plain URLs to markdown links
+	urlRegex := regexp.MustCompile(`(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s\!()\[\]{};:'".,<>?«»""'']))`)
+	content = urlRegex.ReplaceAllStringFunc(content, func(url string) string {
+		if strings.HasPrefix(url, "http") {
+			return fmt.Sprintf("[%s](%s)", url, url)
+		}
+		return url
+	})
+
+	// Add emoji indicators for common patterns
+	content = strings.ReplaceAll(content, "Example:", "💡 **Example:**")
+	content = strings.ReplaceAll(content, "Note:", "📝 **Note:**")
+	content = strings.ReplaceAll(content, "Warning:", "⚠️ **Warning:**")
+	content = strings.ReplaceAll(content, "Important:", "❗ **Important:**")
+	content = strings.ReplaceAll(content, "TODO:", "📋 **TODO:**")
+
+	return content
+}
+
+// tocAnchorStripPattern matches every character that may not appear in a
+// generated anchor: anything other than lowercase ASCII letters, digits and
+// '-'. It is compiled once rather than for every heading.
+var tocAnchorStripPattern = regexp.MustCompile(`[^a-z0-9\-]`)
+
+// ToTableOfContents generates a TOC for the document.
+//
+// content is scanned line by line for ATX headings of level 1 to 3: one to
+// three '#' characters followed by whitespace and a non-empty title. Lines
+// inside fenced code blocks (``` or ~~~) are ignored, so shell comments or
+// preprocessor directives in code samples are not mistaken for headings.
+//
+// Each heading becomes a bullet indented two spaces per level below H1 that
+// links to "#anchor". The anchor is the title with spaces replaced by '-',
+// lowercased, and stripped of everything except a-z, 0-9 and '-'. When
+// several headings produce the same anchor, the second and later ones get
+// the suffix "-1", "-2", ... so that every entry has a distinct target.
+//
+// The result starts with a "Table of Contents" heading and ends with a blank
+// line. An empty string is returned when no heading qualifies.
+func (f *Formatter) ToTableOfContents(content string) string {
+	// fenceRun returns the run of three or more backticks or tildes that
+	// starts s, or "" when s does not begin with one.
+	fenceRun := func(s string) string {
+		if s == "" || (s[0] != '`' && s[0] != '~') {
+			return ""
+		}
+		run := s[:len(s)-len(strings.TrimLeft(s, s[:1]))]
+		if len(run) < 3 {
+			return ""
+		}
+		return run
+	}
+
+	var toc []string
+	seen := make(map[string]int) // base anchor -> number of headings that produced it
+	openFence := ""              // fence run that opened the current code block; "" outside code
+
+	for _, line := range strings.Split(content, "\n") {
+		line = strings.TrimSpace(line)
+
+		run := fenceRun(line)
+		if openFence != "" {
+			// A closing fence uses the same character, is at least as long
+			// as the opening fence and has nothing after it.
+			if run != "" && run[0] == openFence[0] && len(run) >= len(openFence) && len(run) == len(line) {
+				openFence = ""
+			}
+			continue
+		}
+		// A backtick fence followed by text containing a backtick is inline
+		// code, not the start of a block.
+		if run != "" && !(run[0] == '`' && strings.Contains(line[len(run):], "`")) {
+			openFence = run
+			continue
+		}
+
+		// Count heading level
+		level := len(line) - len(strings.TrimLeft(line, "#"))
+		if level == 0 || level > 3 { // Only include up to H3 in TOC
+			continue
+		}
+
+		// "#tag" without a separating space is not a heading.
+		rest := line[level:]
+		if rest != "" && rest[0] != ' ' && rest[0] != '\t' {
+			continue
+		}
+		title := strings.TrimSpace(rest)
+		if title == "" {
+			continue
+		}
+
+		anchor := strings.ToLower(strings.ReplaceAll(title, " ", "-"))
+		anchor = tocAnchorStripPattern.ReplaceAllString(anchor, "")
+		if n := seen[anchor]; n > 0 {
+			seen[anchor] = n + 1
+			anchor = fmt.Sprintf("%s-%d", anchor, n)
+		} else {
+			seen[anchor] = 1
+		}
+
+		indent := strings.Repeat("  ", level-1)
+		toc = append(toc, fmt.Sprintf("%s- [%s](#%s)", indent, title, anchor))
+	}
+
+	if len(toc) == 0 {
+		return ""
+	}
+
+	return "## 📑 Table of Contents\n\n" + strings.Join(toc, "\n") + "\n\n"
+}
+
+// FormatWithTOC formats a document with an automatically generated table of contents
+func (f *Formatter) FormatWithTOC(doc *Document) string {
+	markdown := f.ToMarkdown(doc)
+
+	// Insert TOC after the header but before the content
+	parts := strings.SplitN(markdown, "## 📚 Content", 2)
+	if len(parts) == 2 {
+		toc := f.ToTableOfContents(parts[1])
+		return parts[0] + toc + "## 📚 Content" + parts[1]
+	}
+
+	return markdown
+}