This commit is contained in:
Tomas Dvorak
2025-11-11 10:29:30 +01:00
parent d5b4faea61
commit 8762bde4bf
139 changed files with 7240 additions and 2870 deletions
+31 -2
View File
@@ -17,8 +17,7 @@ func EvaluateSpamScore(s string) (float64, []string) {
rules = append(rules, "too_short")
}
// Excessive repeated characters like 'aaaaaa' or '!!!!'
repeatRe := regexp.MustCompile(`([a-zA-Z!?.])\1{4,}`)
if repeatRe.MatchString(content) {
if hasExcessiveRepetition(content, 5) {
rules = append(rules, "repeated_chars")
}
// Low vowel ratio suggests gibberish in Czech/English latin text
@@ -53,3 +52,33 @@ func EvaluateSpamScore(s string) (float64, []string) {
if score > 1.0 { score = 1.0 }
return score, rules
}
// hasExcessiveRepetition reports whether s contains at least minRun consecutive
// occurrences of one and the same character.
//
// Only ASCII letters and the punctuation marks '!', '?' and '.' can build up a
// run; every other character (digits, spaces, non-ASCII letters, ...) restarts
// the count. Characters are compared exactly, so "aAaA" is not a run. A minRun
// smaller than 2 is treated as 2.
func hasExcessiveRepetition(s string, minRun int) bool {
	if minRun < 2 {
		minRun = 2
	}

	var last rune
	seen := 0 // length of the current run; stays 0 only until the first rune is read
	for _, cur := range s {
		// A rune extends the run only if it repeats the previous rune and
		// belongs to the countable set.
		extends := seen > 0 && cur == last &&
			(cur == '!' || cur == '?' || cur == '.' || isAsciiLetter(cur))
		if !extends {
			last, seen = cur, 1
			continue
		}

		seen++
		if seen >= minRun {
			return true
		}
	}
	return false
}
// isAsciiLetter reports whether r is one of the 52 unaccented Latin letters
// in the ASCII range: 'a' through 'z' or 'A' through 'Z'.
func isAsciiLetter(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z':
		return true
	case 'A' <= r && r <= 'Z':
		return true
	default:
		return false
	}
}