Files
MyClub/internal/services/spam.go
T
Tomas Dvorak 8762bde4bf dev day #89
2025-11-11 10:29:30 +01:00

85 lines
2.3 KiB
Go

package services
import (
"regexp"
"strings"
)
// Precompiled patterns and rule weights used by EvaluateSpamScore. They are
// built once at package initialization instead of on every call.
var (
	// spamLetterRe matches a single Latin letter, including Czech diacritics.
	spamLetterRe = regexp.MustCompile(`[A-Za-zÁáÉéĚěÍíÓóÚúŮůÝýŽžŠšČčŘřŤťŇňĎď]`)
	// spamVowelRe matches a single vowel (y counts as a vowel), including accented forms.
	spamVowelRe = regexp.MustCompile(`[AaEeIiOoUuYyÁáÉéĚěÍíÓóÚúŮůÝý]`)
	// spamLinkRe matches the scheme prefix of an http or https URL.
	spamLinkRe = regexp.MustCompile(`https?://`)
	// spamRuleWeights maps each rule name to its contribution to the score.
	// It is read-only after initialization.
	spamRuleWeights = map[string]float64{
		"empty":           1.0,
		"too_short":       0.4,
		"repeated_chars":  0.3,
		"low_vowel_ratio": 0.3,
		"too_many_links":  0.5,
		"all_caps":        0.2,
	}
)

// EvaluateSpamScore applies simple heuristics to decide how spammy s looks.
//
// It returns a score in the range 0..1 (the sum of the weights of all
// triggered rules, capped at 1.0) together with the names of the triggered
// rules in evaluation order. Input that is empty after trimming surrounding
// whitespace short-circuits to a score of 1.0 with the single rule "empty".
// When no rule triggers, the returned slice is nil.
//
// Rules:
//   - "too_short":       fewer than 6 characters (runes).
//   - "repeated_chars":  a run of 5 or more identical characters, as decided
//     by hasExcessiveRepetition.
//   - "low_vowel_ratio": at least 8 letters, of which fewer than 18% are vowels.
//   - "too_many_links":  3 or more "http://" / "https://" occurrences.
//   - "all_caps":        at least 8 characters (runes), at least one cased
//     letter, and no character that upper-casing would change.
func EvaluateSpamScore(s string) (float64, []string) {
	content := strings.TrimSpace(s)
	if content == "" {
		return 1.0, []string{"empty"}
	}

	// Lengths are measured in runes so multi-byte (e.g. Czech) letters count
	// as one character for every length-based rule.
	runeCount := len([]rune(content))

	var rules []string

	// Too short.
	if runeCount < 6 {
		rules = append(rules, "too_short")
	}

	// Excessive repeated characters like 'aaaaaa' or '!!!!!'.
	if hasExcessiveRepetition(content, 5) {
		rules = append(rules, "repeated_chars")
	}

	// A low vowel ratio suggests gibberish in Czech/English Latin text. Only
	// evaluated when there are enough letters for the ratio to be meaningful.
	if letters := len(spamLetterRe.FindAllStringIndex(content, -1)); letters >= 8 {
		vowels := len(spamVowelRe.FindAllStringIndex(content, -1))
		if float64(vowels)/float64(letters) < 0.18 {
			rules = append(rules, "low_vowel_ratio")
		}
	}

	// Too many links.
	if len(spamLinkRe.FindAllStringIndex(content, -1)) >= 3 {
		rules = append(rules, "too_many_links")
	}

	// All-caps shouting. Comparing against ToLower as well ensures the text
	// contains at least one cased letter; without it, digit- or
	// punctuation-only text would trivially equal its upper-cased form.
	if runeCount >= 8 &&
		content == strings.ToUpper(content) &&
		content != strings.ToLower(content) {
		rules = append(rules, "all_caps")
	}

	// Sum the weights of the triggered rules and clamp to the 0..1 range.
	score := 0.0
	for _, rule := range rules {
		score += spamRuleWeights[rule]
	}
	if score > 1.0 {
		score = 1.0
	}
	return score, rules
}
// hasExcessiveRepetition reports whether s contains at least minRun
// consecutive occurrences of the same character. Only ASCII letters and the
// punctuation marks '!', '?' and '.' are counted; a repeat of any other
// character restarts the count. The comparison is case-sensitive, and a
// minRun below 2 is treated as 2.
func hasExcessiveRepetition(s string, minRun int) bool {
	threshold := minRun
	if threshold < 2 {
		threshold = 2
	}

	var (
		last   rune
		streak int
	)
	for i, c := range s {
		// Decide whether c extends the current streak: it must not be the
		// very first character, must equal its predecessor, and must belong
		// to the counted character set.
		extends := false
		if i > 0 && c == last {
			switch c {
			case '!', '?', '.':
				extends = true
			default:
				extends = isAsciiLetter(c)
			}
		}

		if !extends {
			last, streak = c, 1
			continue
		}

		streak++
		if streak >= threshold {
			return true
		}
	}
	return false
}
// isAsciiLetter reports whether r is an unaccented Latin letter in the
// ranges 'a'..'z' or 'A'..'Z'.
func isAsciiLetter(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z':
		return true
	case 'A' <= r && r <= 'Z':
		return true
	default:
		return false
	}
}