dev day #79

2026-07-29 05:03:49 +00:00 · 2025-11-02 01:04:02 +01:00
parent ac886502e0
commit b9cea0cd77
153 changed files with 43713 additions and 1700 deletions
@@ -0,0 +1,197 @@
+package services
+
+import (
+	"regexp"
+	"sort"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+// badWordMap maps lowercase Czech and English bad words (or space-separated phrases) to family-friendly replacements; init compiles every key into a case-insensitive pattern.
+// Note: This is a lightweight, non-exhaustive list intended for community sites, and several keys overlap (for example "fuck"/"fucking" and "prdel"/"do prdele").
+var badWordMap = map[string]string{
+	// Czech
+	"kráva": "osobo",
+	"debil": "nezdvořák",
+	"idiot": "nešika",
+	"blbec": "popleta",
+	"pitomec": "nezbeda",
+	"trouba": "popleta",
+	"sprostý": "nevhodný",
+	"sráč": "strašpytel",
+	"čůrák": "šibal",
+	"kokot": "popleta",
+	"kretén": "nešika",
+	"hovno": "ťuťo",
+	"nasrat": "naštvat",
+	"nasr**": "naštv**",
+	"prdel": "zadek",
+	"píča": "potížistka",
+	"piča": "potížistka",
+	"zmrd": "nezbeda",
+	"sračka": "nepěknost",
+	"sračky": "nepěknosti",
+	"posrat": "pokazit",
+	"posranej": "zkalený",
+	"šukat": "láskovat",
+	"mrdat": "lumpačit",
+	"mrdka": "neplecha",
+	"kurva": "mrška",
+	"zasran": "nepříjemn",
+	"do prdele": "sakryš",
+	"čubka": "neposedná",
+	"svině": "nezdárná",
+	
+	// English
+	"shit": "shoot",
+	"fuck": "flip",
+	"fucking": "flipping",
+	"asshole": "meanie",
+	"bitch": "rascal",
+	"bastard": "rascal",
+	"dick": "goof",
+	"dickhead": "goof",
+	"cock": "goof",
+	"pussy": "rascal",
+	"cunt": "rascal",
+	"crap": "crud",
+	"damn": "darn",
+}
+
+// compiledReplacement pairs a compiled pattern with the friendlier text that
+// is substituted for whatever the pattern matches.
+type compiledReplacement struct {
+	re          *regexp.Regexp
+	replacement string
+}
+
+// Pattern tables; both start out nil and are filled once by init.
+var (
+	// compiledRepls is walked in order by FilterBadWords.
+	compiledRepls []compiledReplacement
+	// sensitiveRegexps is consulted by ContainsSensitiveWords.
+	sensitiveRegexps []*regexp.Regexp
+)
+
+// badWordTail matches the optional run of ASCII letters, digits and Latin
+// letters in the á–ž range that may follow a stem, so inflected forms are
+// consumed together with the stem.
+const badWordTail = `[a-zá-ž0-9]*`
+
+// init compiles every replacement and moderation pattern once at start-up.
+//
+// Generated patterns deliberately carry no \b anchors: Go's regexp treats \b
+// as an ASCII-only word boundary, so it never fires next to letters such as
+// "č", "š" or "ě", and words beginning or ending with them would never match
+// on their own. Whole-word matching is enforced by hasWordEdges when the
+// patterns are applied.
+func init() {
+	// Compile explicit words/phrases in a fixed order, longest key first.
+	// Map iteration order is random and several keys overlap ("fuck" vs.
+	// "fucking", "prdel" vs. "do prdele"); without an explicit order the
+	// replacement chosen for such input would differ from run to run.
+	words := make([]string, 0, len(badWordMap))
+	for w := range badWordMap {
+		words = append(words, w)
+	}
+	sort.Slice(words, func(i, j int) bool {
+		if len(words[i]) != len(words[j]) {
+			return len(words[i]) > len(words[j])
+		}
+		return words[i] < words[j]
+	})
+	for _, w := range words {
+		pat := regexp.QuoteMeta(w)
+		if strings.Contains(w, " ") {
+			// phrase: allow flexible spacing
+			pat = strings.ReplaceAll(pat, " ", `\s+`)
+		} else {
+			pat += badWordTail
+		}
+		compiledRepls = append(compiledRepls, compiledReplacement{re: regexp.MustCompile("(?i)" + pat), replacement: badWordMap[w]})
+	}
+
+	// Add Czech stems with diacritic + leet tolerant patterns
+	czStems := []struct{ stem, rep string }{
+		{"kurv", "mrška"}, {"píc", "potížistka"}, {"pic", "potížistka"}, {"mrd", "lumpačit"}, {"šuk", "láskovat"}, {"srač", "nepěknost"}, {"hovn", "ťuťo"}, {"zmrd", "nezbeda"}, {"čubk", "neposedná"}, {"svin", "nezdárná"}, {"kokot", "popleta"}, {"čur", "šibal"}, {"cur", "šibal"},
+		{"debil", "nezdvořák"}, {"idiot", "nešika"}, {"kretén", "nešika"}, {"blbec", "popleta"}, {"prdel", "zadek"},
+	}
+	for _, it := range czStems {
+		pat := "(?i)" + diacriticLeetPattern(it.stem) + badWordTail
+		compiledRepls = append(compiledRepls, compiledReplacement{re: regexp.MustCompile(pat), replacement: it.rep})
+	}
+
+	// English stems (simple suffix handling). These are pure ASCII, so their
+	// built-in \b anchors work; hasWordEdges is still applied on top.
+	en := []struct{ rawPattern, rep string }{
+		{`(?i)\bshit(ty|head|s|ting)?\b`, "shoot"},
+		{`(?i)\bfuck(ing|er|ers|ed|s)?\b`, "flip"},
+		{`(?i)\bass(hole|hat|es)?\b`, "meanie"},
+		{`(?i)\bbitch(es|y)?\b`, "rascal"},
+		{`(?i)\bbastard(s)?\b`, "rascal"},
+		{`(?i)\bdick(head|s)?\b`, "goof"},
+		{`(?i)\bcock(s|ing)?\b`, "goof"},
+		{`(?i)\bpussy\b`, "rascal"},
+		{`(?i)\bcunt(s)?\b`, "rascal"},
+		{`(?i)\bcrap(py|s)?\b`, "crud"},
+		{`(?i)\bdamn(ed|s|ing)?\b`, "darn"},
+	}
+	for _, e := range en {
+		compiledRepls = append(compiledRepls, compiledReplacement{re: regexp.MustCompile(e.rawPattern), replacement: e.rep})
+	}
+
+	// Sensitive stems (trigger moderation)
+	sensStems := []string{"kurv", "píc", "pic", "mrd", "šuk", "čur", "cur", "kokot", "cunt", "fuck"}
+	for _, s := range sensStems {
+		// Stems containing diacritics get the diacritic+leet tolerant
+		// pattern; pure-ASCII stems are matched literally.
+		stemPat := regexp.QuoteMeta(s)
+		if !isASCII(s) {
+			stemPat = diacriticLeetPattern(s)
+		}
+		// Every stem uses the wide tail: with Unicode-aware edges an
+		// ASCII-only tail would stop in front of an accented ending
+		// ("kurvě") and the match would then be rejected as mid-word.
+		sensitiveRegexps = append(sensitiveRegexps, regexp.MustCompile("(?i)"+stemPat+badWordTail))
+	}
+}
+
+// FilterBadWords replaces bad words with friendlier counterparts while preserving approximate case.
+//
+// Every compiled replacement is applied in order to the result of the
+// previous one. It returns the filtered text and whether anything was
+// replaced; blank input is returned unchanged with false.
+func FilterBadWords(s string) (string, bool) {
+	if strings.TrimSpace(s) == "" {
+		return s, false
+	}
+	out := s
+	replaced := false
+	for _, cr := range compiledRepls {
+		var hit bool
+		out, hit = replaceWholeWords(out, cr)
+		replaced = replaced || hit
+	}
+	return out, replaced
+}
+
+// ContainsSensitiveWords returns true and the matched words if content contains strong/explicit terms.
+//
+// At most one word (the first whole-word match) is reported per sensitive
+// pattern, so the same word can appear more than once when several patterns
+// match it. It returns (false, nil) when nothing matches.
+func ContainsSensitiveWords(s string) (bool, []string) {
+	if strings.TrimSpace(s) == "" {
+		return false, nil
+	}
+	var found []string
+	for _, re := range sensitiveRegexps {
+		for _, loc := range re.FindAllStringIndex(s, -1) {
+			if hasWordEdges(s, loc[0], loc[1]) {
+				found = append(found, s[loc[0]:loc[1]])
+				break
+			}
+		}
+	}
+	return len(found) > 0, found
+}
+
+// replaceWholeWords substitutes cr.replacement for every whole-word match of
+// cr.re in s, adapting the replacement to the basic case style of the match.
+// It reports whether at least one substitution was made.
+func replaceWholeWords(s string, cr compiledReplacement) (string, bool) {
+	var b strings.Builder
+	last := 0
+	hit := false
+	for _, loc := range cr.re.FindAllStringIndex(s, -1) {
+		start, end := loc[0], loc[1]
+		if !hasWordEdges(s, start, end) {
+			// Match sits inside a longer word (e.g. "pic" in "epic").
+			continue
+		}
+		m := s[start:end]
+		b.WriteString(s[last:start])
+		// preserve basic case style
+		switch {
+		case isTitle(m):
+			b.WriteString(title(cr.replacement))
+		case isUpper(m):
+			b.WriteString(strings.ToUpper(cr.replacement))
+		default:
+			b.WriteString(cr.replacement)
+		}
+		last = end
+		hit = true
+	}
+	if !hit {
+		return s, false
+	}
+	b.WriteString(s[last:])
+	return b.String(), true
+}
+
+// hasWordEdges reports whether s[start:end] is delimited on both sides by the
+// text boundary or by a rune that is not a letter, digit or underscore.
+// Unlike the regexp \b assertion it is Unicode-aware.
+func hasWordEdges(s string, start, end int) bool {
+	if start > 0 {
+		if r, _ := utf8.DecodeLastRuneInString(s[:start]); isWordRune(r) {
+			return false
+		}
+	}
+	if end < len(s) {
+		if r, _ := utf8.DecodeRuneInString(s[end:]); isWordRune(r) {
+			return false
+		}
+	}
+	return true
+}
+
+// isWordRune reports whether r counts as part of a word for edge detection.
+func isWordRune(r rune) bool {
+	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
+}
+
+// isUpper reports whether upper-casing s leaves it unchanged.
+func isUpper(s string) bool { return s == strings.ToUpper(s) }
+
+// isTitle reports whether s is non-empty, its first rune is unchanged by
+// upper-casing and the remainder is unchanged by lower-casing.
+//
+// The split is made on a rune boundary: slicing the first byte would cut a
+// multi-byte letter such as "Č" in half and the comparison would always fail.
+func isTitle(s string) bool {
+	if s == "" {
+		return false
+	}
+	n := firstRuneLen(s)
+	head, rest := s[:n], s[n:]
+	return strings.ToUpper(head) == head && strings.ToLower(rest) == rest
+}
+
+// title returns s with its first rune upper-cased and the rest untouched.
+// Working on the whole first rune keeps replacements that start with a
+// multi-byte letter ("šibal", "ťuťo") valid UTF-8.
+func title(s string) string {
+	n := firstRuneLen(s)
+	return strings.ToUpper(s[:n]) + s[n:]
+}
+
+// firstRuneLen returns the number of bytes occupied by the first rune of s
+// (0 for the empty string).
+func firstRuneLen(s string) int {
+	for i := range s {
+		if i > 0 {
+			// i is the byte offset at which the second rune starts.
+			return i
+		}
+	}
+	return len(s)
+}
+
+// Helpers for Czech diacritics + simple leetspeak
+func diacriticLeetPattern(stem string) string {
+	var b strings.Builder
+	for _, r := range stem {
+		b.WriteString(expandRune(r))
+	}
+	return b.String()
+}
+
+// expandRune returns the regular-expression fragment used for one stem rune.
+// ASCII letters with known accented or leetspeak look-alikes become a
+// character class (upper and lower case map to the same class); any other
+// rune is returned as an escaped literal.
+func expandRune(r rune) string {
+	// Fold ASCII upper case so every class is listed only once below.
+	lower := r
+	if 'A' <= r && r <= 'Z' {
+		lower = r + ('a' - 'A')
+	}
+	switch lower {
+	case 'a':
+		return "[aá@4]"
+	case 'e':
+		return "[eéě3]"
+	case 'i', 'l':
+		return "[iíl1!]"
+	case 'o':
+		return "[oó0]"
+	case 'u':
+		return "[uúů]"
+	case 'y':
+		return "[yý]"
+	case 'c':
+		return "[cč]"
+	case 's':
+		return "[sš5]"
+	case 'z':
+		return "[zž2]"
+	case 'r':
+		return "[rř]"
+	case 't':
+		return "[tť7]"
+	case 'n':
+		return "[nň]"
+	case 'd':
+		return "[dď]"
+	case 'p', 'k', 'm', 'v', 'h', 'g':
+		// Letters without look-alikes still get a one-element class.
+		return "[" + string(lower) + "]"
+	}
+	// escape everything else
+	return regexp.QuoteMeta(string(r))
+}
+
+// isASCII reports whether every byte of s is below 128; the empty string
+// counts as ASCII.
+func isASCII(s string) bool {
+	for _, b := range []byte(s) {
+		if b > 127 {
+			return false
+		}
+	}
+	return true
+}