// Devour/internal/markdown/formatter.go

package markdown

import (
	"bytes"
	"fmt"
	"regexp"
	"sort"
	"strings"
	"time"

	"github.com/yuin/goldmark"
	"github.com/yuin/goldmark/extension"
	"github.com/yuin/goldmark/parser"
	"github.com/yuin/goldmark/renderer/html"
)

// Document represents a scraped document to be formatted as markdown.
//
// The struct tags give the JSON field names. Within this file only Title,
// URL, Type, Timestamp, Metadata and Content are read (by ToMarkdown).
type Document struct {
	// ID is not read by the formatter in this file.
	// NOTE(review): presumably a unique document identifier; confirm against the producer.
	ID        string                 `json:"id"`
	// Source is not read by the formatter in this file; note that the
	// "Source" row of the rendered table is filled from URL, not from this field.
	Source    string                 `json:"source"`
	// Type is rendered as inline code in the "Type" row of the info table.
	Type      string                 `json:"type"`
	// Title becomes the top-level "# ..." heading of the rendered document.
	Title     string                 `json:"title"`
	// Content is the body text; it is passed through enhanceContent and
	// emitted under the "Content" heading.
	Content   string                 `json:"content"`
	// URL is shown in the "Source" table row and as the footer link.
	URL       string                 `json:"url"`
	// Metadata holds extra key/value pairs; each entry whose "%v" rendering
	// is neither empty nor "<nil>" becomes one additional table row.
	Metadata  map[string]interface{} `json:"metadata"`
	// Hash is not read by the formatter in this file.
	// NOTE(review): presumably a content hash; confirm against the producer.
	Hash      string                 `json:"hash"`
	// Timestamp is formatted as "2006-01-02 15:04:05" in the "Scraped" row
	// and in the footer.
	Timestamp time.Time              `json:"timestamp"`
}

// Formatter handles markdown conversion and enhancement.
//
// Create one with NewFormatter so that md is populated.
type Formatter struct {
	// md is the goldmark instance configured by NewFormatter. None of the
	// methods visible in this file call it; they build markdown text directly.
	md goldmark.Markdown
}

// NewFormatter returns a Formatter backed by a goldmark instance configured
// with the GFM, Table and Strikethrough extensions, automatic heading IDs on
// the parser, and hard-wrap plus XHTML output on the HTML renderer.
func NewFormatter() *Formatter {
	extensions := goldmark.WithExtensions(extension.GFM, extension.Table, extension.Strikethrough)
	parserOpts := goldmark.WithParserOptions(parser.WithAutoHeadingID())
	rendererOpts := goldmark.WithRendererOptions(html.WithHardWraps(), html.WithXHTML())

	return &Formatter{
		md: goldmark.New(extensions, parserOpts, rendererOpts),
	}
}

// ToMarkdown converts a document to enhanced markdown format.
//
// The result consists of, in order: an H1 with the title, a "Document
// Information" table (source URL, type, scrape time, then one row per
// metadata entry), the enhanced content under a "Content" heading, and a
// footer crediting the scraper and linking back to the source URL.
//
// Metadata rows are emitted in sorted key order so that the same document
// always renders to the same bytes; entries whose "%v" rendering is empty or
// "<nil>" are skipped. URL and metadata cells are escaped so that pipes or
// line breaks in scraped values cannot break the table.
//
// A nil doc yields the empty string.
func (f *Formatter) ToMarkdown(doc *Document) string {
	if doc == nil {
		return ""
	}

	scraped := doc.Timestamp.Format("2006-01-02 15:04:05")

	var buf bytes.Buffer

	// Header with the document title.
	fmt.Fprintf(&buf, "# %s\n\n", doc.Title)

	// Document metadata table.
	buf.WriteString("## 📋 Document Information\n\n")
	buf.WriteString("| Property | Value |\n")
	buf.WriteString("|----------|-------|\n")
	fmt.Fprintf(&buf, "| **Source** | %s |\n", escapeTableCell(doc.URL))
	fmt.Fprintf(&buf, "| **Type** | `%s` |\n", doc.Type)
	fmt.Fprintf(&buf, "| **Scraped** | %s |\n", scraped)

	// Go map iteration order is unspecified, so sort the keys first.
	// Ranging over a nil map is a no-op, so no nil check is needed.
	keys := make([]string, 0, len(doc.Metadata))
	for key := range doc.Metadata {
		keys = append(keys, key)
	}
	sort.Strings(keys)

	for _, key := range keys {
		strValue := fmt.Sprintf("%v", doc.Metadata[key])
		if strValue == "" || strValue == "<nil>" {
			continue
		}
		// strings.Title is deprecated, but its replacement lives outside the
		// standard library; it is kept so the row labels stay unchanged.
		label := strings.Title(key)
		fmt.Fprintf(&buf, "| **%s** | %s |\n", escapeTableCell(label), escapeTableCell(strValue))
	}
	buf.WriteString("\n")

	// Process and enhance content.
	buf.WriteString("## 📚 Content\n\n")
	buf.WriteString(f.enhanceContent(doc.Content))
	// A blank line must separate the content from the rule below: a text
	// line directly followed by "---" is parsed as a setext H2 heading.
	buf.WriteString("\n\n")

	// Footer.
	buf.WriteString("---\n\n")
	fmt.Fprintf(&buf, "*Document scraped by [Devour](https://github.com/yourorg/devour) on %s*\n", scraped)
	fmt.Fprintf(&buf, "*Source: [%s](%s)*\n", doc.URL, doc.URL)

	return buf.String()
}

// tableCellEscaper rewrites the characters that would end a markdown table
// cell early: line breaks become spaces and pipes are backslash-escaped.
var tableCellEscaper = strings.NewReplacer("\r\n", " ", "\n", " ", "\r", " ", "|", `\|`)

// escapeTableCell returns s in a form that stays inside a single table cell.
func escapeTableCell(s string) string {
	return tableCellEscaper.Replace(s)
}

// enhanceContent improves the readability of scraped content.
//
// The content is trimmed and then processed line by line:
//
//   - Lines inside ``` fenced code blocks (including the fence lines) are
//     copied verbatim, so code is never reformatted, linked or decorated.
//   - Outside code, runs of empty lines collapse to a single empty line.
//   - Outside code, a line starting with "#" gets an empty line inserted
//     before it when the preceding line is not already empty.
//   - Outside fenced and inline code, bare http(s) URLs become
//     "[url](url)" links and the labels "Example:", "Note:", "Warning:",
//     "Important:" and "TODO:" receive an emoji and bold styling.
//
// URLs that are already a markdown link target, link text or autolink, and
// labels that are already bold, are left alone; running the function on its
// own output therefore changes nothing.
func (f *Formatter) enhanceContent(content string) string {
	lines := strings.Split(strings.TrimSpace(content), "\n")
	out := make([]string, 0, len(lines))
	inFence := false

	for _, line := range lines {
		if isCodeFence(line) {
			inFence = !inFence
			out = append(out, line)
			continue
		}
		if inFence {
			out = append(out, line)
			continue
		}

		prevBlank := len(out) > 0 && out[len(out)-1] == ""
		if line == "" {
			// Keep only the first empty line of a run.
			if !prevBlank {
				out = append(out, line)
			}
			continue
		}
		if strings.HasPrefix(line, "#") && len(out) > 0 && !prevBlank {
			out = append(out, "")
		}
		out = append(out, decorateProse(line))
	}

	return strings.Join(out, "\n")
}

// Patterns used by enhanceContent, compiled once instead of on every call.
var (
	// plainURLPattern finds bare URLs and domain-like tokens in running text.
	plainURLPattern = regexp.MustCompile(`(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s\!()\[\]{};:'".,<>?«»""'']))`)

	// inlineCodePattern finds backtick-delimited code spans within one line.
	inlineCodePattern = regexp.MustCompile("`+[^`]+`+")

	// calloutPattern finds the whole-word labels that get an emoji prefix.
	calloutPattern = regexp.MustCompile(`\b(?:Example|Note|Warning|Important|TODO):`)
)

// isCodeFence reports whether line opens or closes a ``` fenced code block.
// A line with further backticks after the leading run (such as a one-line
// "```code```" span) is not treated as a fence.
func isCodeFence(line string) bool {
	trimmed := strings.TrimSpace(line)
	if !strings.HasPrefix(trimmed, "```") {
		return false
	}
	return !strings.Contains(strings.TrimLeft(trimmed, "`"), "`")
}

// decorateProse links URLs and decorates callout labels in one line of
// prose, leaving inline code spans exactly as they were.
func decorateProse(line string) string {
	var b strings.Builder
	last := 0
	for _, loc := range inlineCodePattern.FindAllStringIndex(line, -1) {
		b.WriteString(decorateCallouts(linkifyURLs(line[last:loc[0]])))
		b.WriteString(line[loc[0]:loc[1]])
		last = loc[1]
	}
	b.WriteString(decorateCallouts(linkifyURLs(line[last:])))
	return b.String()
}

// linkifyURLs wraps each bare http:// or https:// URL in text as a markdown
// link. Matches without such a scheme (bare domains) and URLs that directly
// follow "](", "[" or "<" are returned unchanged.
func linkifyURLs(text string) string {
	return rewriteMatches(plainURLPattern, text, func(start, end int) string {
		url := text[start:end]
		lower := strings.ToLower(url)
		if !strings.HasPrefix(lower, "http://") && !strings.HasPrefix(lower, "https://") {
			return url
		}
		before := text[:start]
		if strings.HasSuffix(before, "](") || strings.HasSuffix(before, "[") || strings.HasSuffix(before, "<") {
			return url
		}
		return "[" + url + "](" + url + ")"
	})
}

// decorateCallouts prefixes each callout label in text with its emoji and
// makes it bold. A label directly preceded by "**" is already bold and is
// returned unchanged.
func decorateCallouts(text string) string {
	return rewriteMatches(calloutPattern, text, func(start, end int) string {
		label := text[start:end]
		if strings.HasSuffix(text[:start], "**") {
			return label
		}
		return calloutIcon(label) + " **" + label + "**"
	})
}

// calloutIcon returns the emoji for a label matched by calloutPattern.
func calloutIcon(label string) string {
	switch label {
	case "Example:":
		return "💡"
	case "Note:":
		return "📝"
	case "Warning:":
		return "⚠️"
	case "Important:":
		return "❗"
	default: // "TODO:" is the only remaining alternative of calloutPattern.
		return "📋"
	}
}

// rewriteMatches returns s with every match of re replaced by the string
// that rewrite returns for the match's [start, end) byte range in s.
func rewriteMatches(re *regexp.Regexp, s string, rewrite func(start, end int) string) string {
	locs := re.FindAllStringIndex(s, -1)
	if locs == nil {
		return s
	}
	var b strings.Builder
	last := 0
	for _, loc := range locs {
		b.WriteString(s[last:loc[0]])
		b.WriteString(rewrite(loc[0], loc[1]))
		last = loc[1]
	}
	b.WriteString(s[last:])
	return b.String()
}

// ToTableOfContents generates a TOC for the document.
//
// Every heading of level 1 to 3 in content becomes one
// "- [title](#anchor)" bullet, indented two spaces per level below H1. A
// line counts as a heading only when its run of "#" characters is followed
// by whitespace, so "#hashtag" is ignored; lines inside ``` fenced code
// blocks and headings without a title are skipped as well.
//
// The anchor is the lower-cased title with spaces turned into hyphens and
// every character other than a-z, 0-9 and "-" removed. When the same anchor
// occurs again it receives a "-1", "-2", ... suffix so that each entry has
// its own target.
//
// The returned block starts with a "Table of Contents" H2 and ends with a
// blank line; it is the empty string when no heading qualifies.
func (f *Formatter) ToTableOfContents(content string) string {
	var toc []string
	seen := make(map[string]int) // anchor -> number of earlier uses
	inFence := false

	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)

		if strings.HasPrefix(line, "```") {
			// More backticks after the leading run mean a one-line code
			// span rather than a fence, so the fence state stays as it is.
			if !strings.Contains(strings.TrimLeft(line, "`"), "`") {
				inFence = !inFence
			}
			continue
		}
		if inFence || !strings.HasPrefix(line, "#") {
			continue
		}

		level := len(line) - len(strings.TrimLeft(line, "#"))
		if level > 3 { // Only include up to H3 in TOC
			continue
		}
		rest := line[level:]
		if rest != "" && rest[0] != ' ' && rest[0] != '\t' {
			continue // e.g. "#hashtag": no whitespace after the hashes
		}
		title := strings.TrimSpace(rest)
		if title == "" {
			continue
		}

		anchor := strings.ToLower(strings.ReplaceAll(title, " ", "-"))
		anchor = tocAnchorInvalid.ReplaceAllString(anchor, "")
		uses := seen[anchor]
		seen[anchor] = uses + 1
		if uses > 0 {
			anchor = fmt.Sprintf("%s-%d", anchor, uses)
		}

		indent := strings.Repeat("  ", level-1)
		toc = append(toc, fmt.Sprintf("%s- [%s](#%s)", indent, title, anchor))
	}

	if len(toc) == 0 {
		return ""
	}

	return "## 📑 Table of Contents\n\n" + strings.Join(toc, "\n") + "\n\n"
}

// tocAnchorInvalid matches every character that is dropped from a TOC
// anchor; it is compiled once rather than once per heading.
var tocAnchorInvalid = regexp.MustCompile(`[^a-z0-9\-]`)

// FormatWithTOC renders doc with ToMarkdown and places a table of contents,
// built from everything after the first "Content" heading, immediately in
// front of that heading. When the rendered text has no such heading it is
// returned unchanged.
func (f *Formatter) FormatWithTOC(doc *Document) string {
	const contentHeading = "## 📚 Content"

	rendered := f.ToMarkdown(doc)

	at := strings.Index(rendered, contentHeading)
	if at < 0 {
		return rendered
	}

	head := rendered[:at]
	body := rendered[at+len(contentHeading):]

	return head + f.ToTableOfContents(body) + contentHeading + body
}