This commit is contained in:
Tomas Dvorak
2026-02-24 10:33:59 +01:00
parent 409acd2e08
commit 898a3c303f
1374 changed files with 290409 additions and 29187 deletions
+88
View File
@@ -0,0 +1,88 @@
package scraper
import (
"net/url"
"path"
"regexp"
"strings"
)
// Precompiled patterns used by the normalization helpers in this file.
var (
	// titleNoiseRe matches, case-insensitively and on word boundaries, the
	// annotations "added in go<N>" followed by any number of dotted numeric
	// components (go1, go1.21, go1.21.0, ...) and the word "deprecated".
	// Accepting every dotted component keeps a trailing ".0" from being
	// left behind in the title once the annotation is removed.
	titleNoiseRe = regexp.MustCompile(`(?i)\b(added in go[0-9]+(\.[0-9]+)*|deprecated)\b`)

	// titleSpaceRe matches a run of one or more whitespace characters.
	titleSpaceRe = regexp.MustCompile(`\s+`)

	// contentSpaceRe matches spaces/tabs that directly precede a newline,
	// i.e. trailing whitespace at the end of a line.
	contentSpaceRe = regexp.MustCompile(`[ \t]+\n`)

	// multiNewlineRe matches three or more consecutive newlines.
	multiNewlineRe = regexp.MustCompile(`\n{3,}`)

	// nonPrintableTitle matches a single ASCII control character.
	nonPrintableTitle = regexp.MustCompile(`[[:cntrl:]]`)
)
// NormalizeDocuments runs NormalizeDocument on every element of docs.
// Elements are modified in place through their pointers and the same
// slice is returned to allow call chaining.
func NormalizeDocuments(docs []*Document) []*Document {
	for i := 0; i < len(docs); i++ {
		NormalizeDocument(docs[i])
	}
	return docs
}
// NormalizeDocument cleans up a single scraped document in place.
//
// URL and Type are stripped of surrounding whitespace, Title and Content
// are passed through normalizeTitle and normalizeContent, and when the
// cleaned title is empty a replacement is derived from the (already
// trimmed) URL. A nil doc is ignored.
func NormalizeDocument(doc *Document) {
	if doc == nil {
		return
	}

	doc.URL, doc.Type = strings.TrimSpace(doc.URL), strings.TrimSpace(doc.Type)
	doc.Content = normalizeContent(doc.Content)

	cleaned := normalizeTitle(doc.Title)
	if cleaned == "" {
		// Nothing usable survived the cleanup; fall back to the URL.
		cleaned = inferTitleFromURL(doc.URL)
	}
	doc.Title = cleaned
}
// normalizeTitle cleans a scraped title and returns the result, which may
// be empty.
//
// It replaces pilcrow signs, underscores and control characters with
// spaces, removes everything matched by titleNoiseRe, collapses whitespace
// runs to a single space, and finally strips the punctuation "-:.,;" and
// whitespace from both ends of the title.
func normalizeTitle(title string) string {
	title = strings.ReplaceAll(title, "¶", " ")
	title = strings.ReplaceAll(title, "_", " ")
	title = nonPrintableTitle.ReplaceAllString(title, " ")
	title = titleNoiseRe.ReplaceAllString(title, " ")
	title = titleSpaceRe.ReplaceAllString(strings.TrimSpace(title), " ")

	// Remove punctuation left dangling at either end by the cleanup above.
	// A single pass is not enough: "Foo - ." would lose only the "." and
	// come out as "Foo -", so repeat until nothing more can be stripped.
	// Each pass that changes the title shortens it, so the loop terminates.
	for {
		trimmed := strings.TrimSpace(strings.Trim(title, "-:.,;"))
		if trimmed == title {
			return title
		}
		title = trimmed
	}
}
// normalizeContent tidies whitespace in scraped body text: CRLF line
// endings become LF, leading and trailing whitespace is dropped, spaces
// and tabs at the end of a line are removed, and runs of three or more
// newlines are reduced to exactly two.
func normalizeContent(content string) string {
	unixEOL := strings.ReplaceAll(content, "\r\n", "\n")
	trimmed := strings.TrimSpace(unixEOL)
	// Trailing blanks must go first so that whitespace-only lines count as
	// empty lines when the newline runs are collapsed.
	noTrailing := contentSpaceRe.ReplaceAllString(trimmed, "\n")
	return multiNewlineRe.ReplaceAllString(noTrailing, "\n\n")
}
// inferTitleFromURL builds a fallback title from rawURL.
//
// The last path segment is used, with a ".html" suffix removed, hyphens
// and underscores turned into spaces, and whitespace collapsed. When the
// URL has no usable path segment the host is returned instead. If the URL
// is empty, cannot be parsed, or yields nothing, the result is
// "Documentation".
func inferTitleFromURL(rawURL string) string {
	const fallback = "Documentation"

	if rawURL == "" {
		return fallback
	}
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return fallback
	}

	segment := path.Base(strings.Trim(parsed.Path, "/"))
	switch segment {
	case "", ".", "/":
		// No path segment to work with; prefer the host when there is one.
		if parsed.Host == "" {
			return fallback
		}
		return parsed.Host
	}

	segment = strings.TrimSuffix(segment, ".html")
	segment = strings.NewReplacer("-", " ", "_", " ").Replace(segment)
	segment = strings.TrimSpace(segment)
	if name := titleSpaceRe.ReplaceAllString(segment, " "); name != "" {
		return name
	}
	return fallback
}