package scraper import ( "net/url" "path" "regexp" "strings" ) var ( titleNoiseRe = regexp.MustCompile(`(?i)\b(added in go[0-9]+(\.[0-9]+)?|deprecated)\b`) titleSpaceRe = regexp.MustCompile(`\s+`) contentSpaceRe = regexp.MustCompile(`[ \t]+\n`) multiNewlineRe = regexp.MustCompile(`\n{3,}`) nonPrintableTitle = regexp.MustCompile(`[[:cntrl:]]`) ) // NormalizeDocuments applies normalization to a list of scraped documents. func NormalizeDocuments(docs []*Document) []*Document { for _, doc := range docs { NormalizeDocument(doc) } return docs } // NormalizeDocument applies cross-scraper output cleanup. func NormalizeDocument(doc *Document) { if doc == nil { return } doc.URL = strings.TrimSpace(doc.URL) doc.Type = strings.TrimSpace(doc.Type) doc.Title = normalizeTitle(doc.Title) doc.Content = normalizeContent(doc.Content) if doc.Title == "" { doc.Title = inferTitleFromURL(doc.URL) } } func normalizeTitle(title string) string { title = strings.ReplaceAll(title, "ΒΆ", " ") title = strings.ReplaceAll(title, "_", " ") title = nonPrintableTitle.ReplaceAllString(title, " ") title = titleNoiseRe.ReplaceAllString(title, " ") title = titleSpaceRe.ReplaceAllString(strings.TrimSpace(title), " ") // Remove dangling punctuation if it became a suffix after cleanup. title = strings.TrimSpace(strings.Trim(title, "-:.,;")) return title } func normalizeContent(content string) string { content = strings.ReplaceAll(content, "\r\n", "\n") content = strings.TrimSpace(content) content = contentSpaceRe.ReplaceAllString(content, "\n") content = multiNewlineRe.ReplaceAllString(content, "\n\n") return content } func inferTitleFromURL(rawURL string) string { if rawURL == "" { return "Documentation" } u, err := url.Parse(rawURL) if err != nil { return "Documentation" } base := path.Base(strings.Trim(u.Path, "/")) if base == "" || base == "." || base == "/" { if u.Host != "" { return u.Host } return "Documentation" } base = strings.TrimSuffix(base, ".html") base = strings.ReplaceAll(base, "-", " ") base = strings.ReplaceAll(base, "_", " ") base = titleSpaceRe.ReplaceAllString(strings.TrimSpace(base), " ") if base == "" { return "Documentation" } return base }