mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
@@ -0,0 +1,88 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Precompiled patterns used by the normalization helpers in this file.
// They are compiled once at package initialization so the helpers never pay
// the compilation cost per call.
var (
	// titleNoiseRe matches, case-insensitively and on word boundaries, the
	// phrases "added in go<version>" and "deprecated". The version accepts
	// any number of dotted numeric components ("go1", "go1.21", "go1.21.0")
	// so that a three-part version is consumed whole instead of leaving a
	// dangling ".0" behind in the title.
	titleNoiseRe = regexp.MustCompile(`(?i)\b(added in go[0-9]+(\.[0-9]+)*|deprecated)\b`)

	// titleSpaceRe matches a run of one or more whitespace characters.
	titleSpaceRe = regexp.MustCompile(`\s+`)

	// contentSpaceRe matches spaces and tabs that sit directly before a
	// newline, i.e. trailing horizontal whitespace on a line.
	contentSpaceRe = regexp.MustCompile(`[ \t]+\n`)

	// multiNewlineRe matches three or more consecutive newlines.
	multiNewlineRe = regexp.MustCompile(`\n{3,}`)

	// nonPrintableTitle matches a single ASCII control character.
	nonPrintableTitle = regexp.MustCompile(`[[:cntrl:]]`)
)
|
||||
|
||||
// NormalizeDocuments applies normalization to a list of scraped documents.
|
||||
func NormalizeDocuments(docs []*Document) []*Document {
|
||||
for _, doc := range docs {
|
||||
NormalizeDocument(doc)
|
||||
}
|
||||
return docs
|
||||
}
|
||||
|
||||
// NormalizeDocument applies cross-scraper output cleanup.
|
||||
func NormalizeDocument(doc *Document) {
|
||||
if doc == nil {
|
||||
return
|
||||
}
|
||||
|
||||
doc.URL = strings.TrimSpace(doc.URL)
|
||||
doc.Type = strings.TrimSpace(doc.Type)
|
||||
doc.Title = normalizeTitle(doc.Title)
|
||||
doc.Content = normalizeContent(doc.Content)
|
||||
|
||||
if doc.Title == "" {
|
||||
doc.Title = inferTitleFromURL(doc.URL)
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeTitle(title string) string {
|
||||
title = strings.ReplaceAll(title, "¶", " ")
|
||||
title = strings.ReplaceAll(title, "_", " ")
|
||||
title = nonPrintableTitle.ReplaceAllString(title, " ")
|
||||
title = titleNoiseRe.ReplaceAllString(title, " ")
|
||||
title = titleSpaceRe.ReplaceAllString(strings.TrimSpace(title), " ")
|
||||
|
||||
// Remove dangling punctuation if it became a suffix after cleanup.
|
||||
title = strings.TrimSpace(strings.Trim(title, "-:.,;"))
|
||||
return title
|
||||
}
|
||||
|
||||
func normalizeContent(content string) string {
|
||||
content = strings.ReplaceAll(content, "\r\n", "\n")
|
||||
content = strings.TrimSpace(content)
|
||||
content = contentSpaceRe.ReplaceAllString(content, "\n")
|
||||
content = multiNewlineRe.ReplaceAllString(content, "\n\n")
|
||||
return content
|
||||
}
|
||||
|
||||
func inferTitleFromURL(rawURL string) string {
|
||||
if rawURL == "" {
|
||||
return "Documentation"
|
||||
}
|
||||
|
||||
u, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return "Documentation"
|
||||
}
|
||||
|
||||
base := path.Base(strings.Trim(u.Path, "/"))
|
||||
if base == "" || base == "." || base == "/" {
|
||||
if u.Host != "" {
|
||||
return u.Host
|
||||
}
|
||||
return "Documentation"
|
||||
}
|
||||
|
||||
base = strings.TrimSuffix(base, ".html")
|
||||
base = strings.ReplaceAll(base, "-", " ")
|
||||
base = strings.ReplaceAll(base, "_", " ")
|
||||
base = titleSpaceRe.ReplaceAllString(strings.TrimSpace(base), " ")
|
||||
if base == "" {
|
||||
return "Documentation"
|
||||
}
|
||||
return base
|
||||
}
|
||||
Reference in New Issue
Block a user