// Package markdown renders scraped documents as enhanced Markdown.
package markdown

import (
	"bytes"
	"fmt"
	"regexp"
	"sort"
	"strings"
	"time"
	"unicode"
	"unicode/utf8"

	"github.com/yuin/goldmark"
	"github.com/yuin/goldmark/extension"
	"github.com/yuin/goldmark/parser"
	"github.com/yuin/goldmark/renderer/html"
)

const (
	// timestampLayout is the layout used for every timestamp written to the output.
	timestampLayout = "2006-01-02 15:04:05"

	// contentHeading introduces the document body; FormatWithTOC splits on it.
	contentHeading = "## 📚 Content"
)

// Patterns are compiled once at package scope instead of on every call.
var (
	// plainURLPattern finds bare URLs, including ones that contain balanced
	// parentheses such as https://en.wikipedia.org/wiki/Go_(programming_language).
	plainURLPattern = regexp.MustCompile(`(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s\!()\[\]{};:'".,<>?«»]))`)

	// calloutPattern finds the labels that receive an emoji marker.
	calloutPattern = regexp.MustCompile(`\b(Example|Note|Warning|Important|TODO):`)

	// tocAnchorStripPattern removes every character that is not kept in a TOC anchor.
	tocAnchorStripPattern = regexp.MustCompile(`[^a-z0-9\-]`)

	// tableCellEscaper makes a value safe to place inside a one-line table cell.
	tableCellEscaper = strings.NewReplacer("\r\n", " ", "\n", " ", "\r", " ", "|", `\|`)
)

// calloutIcons maps each callout label to the emoji placed in front of it.
var calloutIcons = map[string]string{
	"Example":   "💡",
	"Note":      "📝",
	"Warning":   "⚠️",
	"Important": "❗",
	"TODO":      "📋",
}

// titleCase upper-cases the first letter of every whitespace-separated word
// and lower-cases the rest. Runs of whitespace collapse to a single space.
// The first letter is handled as a rune, so multi-byte letters stay intact.
func titleCase(s string) string {
	words := strings.Fields(s)
	for i, word := range words {
		first, size := utf8.DecodeRuneInString(word)
		head := word[:size]
		if first != utf8.RuneError {
			head = string(unicode.ToUpper(first))
		}
		words[i] = head + strings.ToLower(word[size:])
	}
	return strings.Join(words, " ")
}

// escapeTableCell flattens line breaks and escapes pipes so that value cannot
// break out of a Markdown table cell.
func escapeTableCell(value string) string {
	return tableCellEscaper.Replace(value)
}

// Document represents a scraped document to be formatted as markdown.
// ToMarkdown reads Title, URL, Type, Timestamp, Metadata and Content.
type Document struct {
	ID        string                 `json:"id"`
	Source    string                 `json:"source"`
	Type      string                 `json:"type"`
	Title     string                 `json:"title"`
	Content   string                 `json:"content"`
	URL       string                 `json:"url"`
	Metadata  map[string]interface{} `json:"metadata"`
	Hash      string                 `json:"hash"`
	Timestamp time.Time              `json:"timestamp"`
}

// Formatter handles markdown conversion and enhancement.
type Formatter struct {
	md goldmark.Markdown
}

// NewFormatter creates a new markdown formatter with extensions.
func NewFormatter() *Formatter {
	md := goldmark.New(
		goldmark.WithExtensions(extension.GFM, extension.Table, extension.Strikethrough),
		goldmark.WithParserOptions(
			parser.WithAutoHeadingID(),
		),
		goldmark.WithRendererOptions(
			html.WithHardWraps(),
			html.WithXHTML(),
		),
	)
	return &Formatter{md: md}
}

// ToMarkdown converts a document to enhanced markdown format: a title, an
// information table, the enhanced content and a footer.
//
// Metadata rows are written in sorted key order so the output is
// deterministic; entries whose formatted value is empty or nil are skipped.
// A nil doc yields an empty string.
func (f *Formatter) ToMarkdown(doc *Document) string {
	if doc == nil {
		return ""
	}

	scraped := doc.Timestamp.Format(timestampLayout)

	var buf bytes.Buffer

	// Header with metadata.
	fmt.Fprintf(&buf, "# %s\n\n", doc.Title)

	// Document metadata table.
	buf.WriteString("## 📋 Document Information\n\n")
	buf.WriteString("| Property | Value |\n")
	buf.WriteString("|----------|-------|\n")
	fmt.Fprintf(&buf, "| **Source** | %s |\n", escapeTableCell(doc.URL))
	fmt.Fprintf(&buf, "| **Type** | `%s` |\n", escapeTableCell(doc.Type))
	fmt.Fprintf(&buf, "| **Scraped** | %s |\n", scraped)

	// Map iteration order is random; sort the keys for stable output.
	keys := make([]string, 0, len(doc.Metadata))
	for key := range doc.Metadata {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		value := fmt.Sprintf("%v", doc.Metadata[key])
		// "%v" renders a nil value as "<nil>"; neither that nor "" is worth a row.
		if value == "" || value == "<nil>" {
			continue
		}
		fmt.Fprintf(&buf, "| **%s** | %s |\n", escapeTableCell(titleCase(key)), escapeTableCell(value))
	}
	buf.WriteString("\n")

	// Process and enhance content.
	buf.WriteString(contentHeading + "\n\n")
	buf.WriteString(f.enhanceContent(doc.Content))

	// The blank line before the rule matters: "text\n---" would be parsed as
	// a setext heading and turn the last paragraph into an H2.
	buf.WriteString("\n\n---\n\n")

	// Footer.
	fmt.Fprintf(&buf, "*Document scraped by [Devour](https://github.com/yourorg/devour) on %s*\n", scraped)
	fmt.Fprintf(&buf, "*Source: [%s](%s)*\n", doc.URL, doc.URL)

	return buf.String()
}

// fenceTracker follows fenced code blocks while a document is scanned line by
// line. open holds the marker of the fence currently open, or "" outside one.
type fenceTracker struct {
	open string
}

// observe consumes the next line and reports whether it belongs to a fenced
// code block, delimiter lines included.
func (t *fenceTracker) observe(line string) bool {
	marker := fenceMarker(line)
	if t.open == "" {
		if marker == "" {
			return false
		}
		t.open = marker
		return true
	}
	if marker == t.open {
		t.open = ""
	}
	return true
}

// fenceMarker returns "```" or "~~~" when line starts with that fence marker
// (ignoring surrounding whitespace), and "" otherwise.
func fenceMarker(line string) string {
	trimmed := strings.TrimSpace(line)
	switch {
	case strings.HasPrefix(trimmed, "```"):
		// A backtick after the opening run means an inline span such as
		// "```x```", not a fence.
		if strings.Contains(strings.TrimLeft(trimmed, "`"), "`") {
			return ""
		}
		return "```"
	case strings.HasPrefix(trimmed, "~~~"):
		return "~~~"
	}
	return ""
}

// headingLevel returns the ATX heading level (1-6) of line, or 0 when line is
// not a heading. The run of '#' must be followed by a space, a tab or the end
// of the line, so "#hashtag" and "#!/bin/sh" are not headings.
func headingLevel(line string) int {
	level := 0
	for level < len(line) && line[level] == '#' {
		level++
	}
	if level == 0 || level > 6 {
		return 0
	}
	if level < len(line) && line[level] != ' ' && line[level] != '\t' {
		return 0
	}
	return level
}

// isAlreadyLinked reports whether the URL starting at text[start] is already
// part of a Markdown link: link text "[url", a destination "](url" or an
// autolink "<url".
func isAlreadyLinked(text string, start int) bool {
	if start == 0 {
		return false
	}
	switch text[start-1] {
	case '[', '<':
		return true
	case '(':
		return start >= 2 && text[start-2] == ']'
	}
	return false
}

// linkifyURLs wraps each bare http(s) URL in text as "[url](url)". Matches
// without an "http" prefix and URLs that are already linked are left alone.
func linkifyURLs(text string) string {
	matches := plainURLPattern.FindAllStringIndex(text, -1)
	if len(matches) == 0 {
		return text
	}

	var b strings.Builder
	last := 0
	for _, m := range matches {
		start, end := m[0], m[1]
		url := text[start:end]
		if !strings.HasPrefix(url, "http") || isAlreadyLinked(text, start) {
			continue
		}
		b.WriteString(text[last:start])
		fmt.Fprintf(&b, "[%s](%s)", url, url)
		last = end
	}
	b.WriteString(text[last:])
	return b.String()
}

// markCallouts rewrites labels such as "Note:" to "📝 **Note:**". A label
// directly preceded by '*' is treated as already emphasised and is skipped,
// which keeps the function idempotent.
func markCallouts(text string) string {
	matches := calloutPattern.FindAllStringSubmatchIndex(text, -1)
	if len(matches) == 0 {
		return text
	}

	var b strings.Builder
	last := 0
	for _, m := range matches {
		start, end := m[0], m[1]
		if start > 0 && text[start-1] == '*' {
			continue
		}
		label := text[m[2]:m[3]]
		b.WriteString(text[last:start])
		b.WriteString(calloutIcons[label])
		b.WriteString(" **")
		b.WriteString(label)
		b.WriteString(":**")
		last = end
	}
	b.WriteString(text[last:])
	return b.String()
}

// enhanceContent improves the readability of scraped content. Outside fenced
// code blocks it:
//   - collapses runs of blank lines to a single blank line,
//   - inserts a blank line before a heading that directly follows text,
//   - turns bare http(s) URLs into Markdown links,
//   - prefixes callout labels ("Note:", "Warning:", ...) with an emoji.
//
// Lines inside fenced code blocks are copied through unchanged so that code
// samples (shell comments, URLs in source) are not rewritten.
func (f *Formatter) enhanceContent(content string) string {
	content = strings.TrimSpace(content)
	if content == "" {
		return ""
	}

	lines := strings.Split(content, "\n")
	out := make([]string, 0, len(lines))

	var fence fenceTracker
	blankRun := 0
	for _, line := range lines {
		if fence.observe(line) {
			blankRun = 0
			out = append(out, line)
			continue
		}

		if line == "" {
			blankRun++
			if blankRun == 1 {
				out = append(out, line)
			}
			continue
		}
		blankRun = 0

		// A heading needs a blank line between it and the preceding text.
		if headingLevel(line) > 0 && len(out) > 0 && out[len(out)-1] != "" {
			out = append(out, "")
		}
		out = append(out, markCallouts(linkifyURLs(line)))
	}

	return strings.Join(out, "\n")
}

// ToTableOfContents generates a TOC for the document from its H1-H3 ATX
// headings. Headings inside fenced code blocks and headings with no title are
// ignored. It returns "" when there is nothing to list.
func (f *Formatter) ToTableOfContents(content string) string {
	var toc []string
	var fence fenceTracker

	for _, line := range strings.Split(content, "\n") {
		if fence.observe(line) {
			continue
		}

		line = strings.TrimSpace(line)
		level := headingLevel(line)
		if level == 0 || level > 3 { // Only include up to H3 in TOC.
			continue
		}

		title := strings.TrimSpace(line[level:])
		if title == "" {
			continue
		}

		anchor := strings.ToLower(strings.ReplaceAll(title, " ", "-"))
		anchor = tocAnchorStripPattern.ReplaceAllString(anchor, "")

		// Two spaces per level: a "- " item nests only when indented to the
		// parent's content column.
		indent := strings.Repeat("  ", level-1)
		toc = append(toc, fmt.Sprintf("%s- [%s](#%s)", indent, title, anchor))
	}

	if len(toc) == 0 {
		return ""
	}
	return "## 📑 Table of Contents\n\n" + strings.Join(toc, "\n") + "\n\n"
}

// FormatWithTOC formats a document with an automatically generated table of
// contents, placed after the information table and before the content
// section. If the content section cannot be located, the plain ToMarkdown
// output is returned.
func (f *Formatter) FormatWithTOC(doc *Document) string {
	rendered := f.ToMarkdown(doc)

	parts := strings.SplitN(rendered, contentHeading, 2)
	if len(parts) != 2 {
		return rendered
	}

	head, body := parts[0], parts[1]
	return head + f.ToTableOfContents(body) + contentHeading + body
}