mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
89 lines
2.2 KiB
Go
89 lines
2.2 KiB
Go
package scraper
|
|
|
|
import (
|
|
"net/url"
|
|
"path"
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// Precompiled patterns used by the normalization helpers in this file.
// They are compiled once at package initialization.
var (
	// titleNoiseRe matches, case-insensitively and on word boundaries, the
	// phrases "added in goN", "added in goN.M", "added in goN.M.P", ... and
	// the word "deprecated". Any number of dotted version segments is
	// accepted so that a full version such as "go1.21.0" is matched as a
	// whole instead of leaving a stray ".0" behind.
	titleNoiseRe = regexp.MustCompile(`(?i)\b(added in go[0-9]+(\.[0-9]+)*|deprecated)\b`)

	// titleSpaceRe matches a run of one or more whitespace characters.
	titleSpaceRe = regexp.MustCompile(`\s+`)

	// contentSpaceRe matches spaces and tabs that directly precede a newline.
	contentSpaceRe = regexp.MustCompile(`[ \t]+\n`)

	// multiNewlineRe matches three or more consecutive newlines.
	multiNewlineRe = regexp.MustCompile(`\n{3,}`)

	// nonPrintableTitle matches a single ASCII control character.
	nonPrintableTitle = regexp.MustCompile(`[[:cntrl:]]`)
)
|
|
|
|
// NormalizeDocuments applies normalization to a list of scraped documents.
|
|
func NormalizeDocuments(docs []*Document) []*Document {
|
|
for _, doc := range docs {
|
|
NormalizeDocument(doc)
|
|
}
|
|
return docs
|
|
}
|
|
|
|
// NormalizeDocument applies cross-scraper output cleanup.
|
|
func NormalizeDocument(doc *Document) {
|
|
if doc == nil {
|
|
return
|
|
}
|
|
|
|
doc.URL = strings.TrimSpace(doc.URL)
|
|
doc.Type = strings.TrimSpace(doc.Type)
|
|
doc.Title = normalizeTitle(doc.Title)
|
|
doc.Content = normalizeContent(doc.Content)
|
|
|
|
if doc.Title == "" {
|
|
doc.Title = inferTitleFromURL(doc.URL)
|
|
}
|
|
}
|
|
|
|
func normalizeTitle(title string) string {
|
|
title = strings.ReplaceAll(title, "¶", " ")
|
|
title = strings.ReplaceAll(title, "_", " ")
|
|
title = nonPrintableTitle.ReplaceAllString(title, " ")
|
|
title = titleNoiseRe.ReplaceAllString(title, " ")
|
|
title = titleSpaceRe.ReplaceAllString(strings.TrimSpace(title), " ")
|
|
|
|
// Remove dangling punctuation if it became a suffix after cleanup.
|
|
title = strings.TrimSpace(strings.Trim(title, "-:.,;"))
|
|
return title
|
|
}
|
|
|
|
func normalizeContent(content string) string {
|
|
content = strings.ReplaceAll(content, "\r\n", "\n")
|
|
content = strings.TrimSpace(content)
|
|
content = contentSpaceRe.ReplaceAllString(content, "\n")
|
|
content = multiNewlineRe.ReplaceAllString(content, "\n\n")
|
|
return content
|
|
}
|
|
|
|
func inferTitleFromURL(rawURL string) string {
|
|
if rawURL == "" {
|
|
return "Documentation"
|
|
}
|
|
|
|
u, err := url.Parse(rawURL)
|
|
if err != nil {
|
|
return "Documentation"
|
|
}
|
|
|
|
base := path.Base(strings.Trim(u.Path, "/"))
|
|
if base == "" || base == "." || base == "/" {
|
|
if u.Host != "" {
|
|
return u.Host
|
|
}
|
|
return "Documentation"
|
|
}
|
|
|
|
base = strings.TrimSuffix(base, ".html")
|
|
base = strings.ReplaceAll(base, "-", " ")
|
|
base = strings.ReplaceAll(base, "_", " ")
|
|
base = titleSpaceRe.ReplaceAllString(strings.TrimSpace(base), " ")
|
|
if base == "" {
|
|
return "Documentation"
|
|
}
|
|
return base
|
|
}
|