mirror of
https://github.com/Dvorinka/MyClubServer.git
synced 2026-06-04 10:42:57 +00:00
56 lines
1.7 KiB
Go
56 lines
1.7 KiB
Go
package services
|
|
|
|
import (
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// Patterns used by EvaluateSpamScore, compiled once at package scope instead
// of on every call.
var (
	// spamLetterRe matches a single English or Czech letter.
	spamLetterRe = regexp.MustCompile(`[A-Za-zÁáÉéĚěÍíÓóÚúŮůÝýŽžŠšČčŘřŤťŇňĎď]`)
	// spamVowelRe matches a single English or Czech vowel (y counts as a vowel).
	spamVowelRe = regexp.MustCompile(`[AaEeIiOoUuYyÁáÉéĚěÍíÓóÚúŮůÝý]`)
	// spamLinkRe matches the scheme prefix of an http(s) URL.
	spamLinkRe = regexp.MustCompile(`https?://`)
)

// spamRuleWeights maps each rule name to its contribution to the final score.
var spamRuleWeights = map[string]float64{
	"empty":           1.0,
	"too_short":       0.4,
	"repeated_chars":  0.3,
	"low_vowel_ratio": 0.3,
	"too_many_links":  0.5,
	"all_caps":        0.2,
}

// hasRepeatedRun reports whether s contains an ASCII letter or one of '!',
// '?', '.' repeated at least minRun times in a row.
//
// Go's regexp package (RE2) has no backreferences, so a pattern such as
// `(x)\1{4,}` cannot be compiled; the run is detected by hand instead.
func hasRepeatedRun(s string, minRun int) bool {
	var prev rune
	run := 0
	for _, r := range s {
		switch {
		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r == '!', r == '?', r == '.':
			if run > 0 && r == prev {
				run++
			} else {
				prev, run = r, 1
			}
			if run >= minRun {
				return true
			}
		default:
			// Any other character breaks the current run.
			run = 0
		}
	}
	return false
}

// EvaluateSpamScore applies simple heuristics to estimate how spammy s is.
//
// It returns a score in the range 0..1 (the sum of the weights of all
// triggered rules, capped at 1) together with the names of the triggered
// rules, in evaluation order. Input that is empty after trimming whitespace
// scores 1.0 with the single rule "empty".
//
// Rules:
//   - too_short:       fewer than 6 characters
//   - repeated_chars:  the same ASCII letter or one of "!?." 5+ times in a row
//     (e.g. "aaaaa" or "!!!!!")
//   - low_vowel_ratio: at least 8 letters, of which under 18% are vowels
//   - too_many_links:  3 or more http:// or https:// occurrences
//   - all_caps:        at least 8 characters, at least one uppercase letter
//     and no lowercase letters
func EvaluateSpamScore(s string) (float64, []string) {
	content := strings.TrimSpace(s)
	if content == "" {
		return 1.0, []string{"empty"}
	}

	var rules []string
	// Lengths are measured in runes so Czech diacritics count as one character.
	runeCount := len([]rune(content))

	// Too short.
	if runeCount < 6 {
		rules = append(rules, "too_short")
	}

	// Excessive repeated characters like "aaaaa" or "!!!!!".
	if hasRepeatedRun(content, 5) {
		rules = append(rules, "repeated_chars")
	}

	// A low vowel ratio suggests gibberish in Czech/English Latin text.
	letters := len(spamLetterRe.FindAllStringIndex(content, -1))
	if letters >= 8 {
		vowels := len(spamVowelRe.FindAllStringIndex(content, -1))
		if float64(vowels)/float64(letters) < 0.18 {
			rules = append(rules, "low_vowel_ratio")
		}
	}

	// Too many links.
	if len(spamLinkRe.FindAllStringIndex(content, -1)) >= 3 {
		rules = append(rules, "too_many_links")
	}

	// All-caps shouting. Comparing against ToLower as well ensures the text
	// contains at least one uppercase letter, so digit- or punctuation-only
	// input (which is trivially equal to its upper-cased form) is not flagged.
	if runeCount >= 8 &&
		content == strings.ToUpper(content) &&
		content != strings.ToLower(content) {
		rules = append(rules, "all_caps")
	}

	// Sum the weights of the triggered rules and cap the result at 1.
	score := 0.0
	for _, r := range rules {
		score += spamRuleWeights[r]
	}
	if score > 1.0 {
		score = 1.0
	}
	return score, rules
}
|