mirror of
https://github.com/Dvorinka/MyClubServer.git
synced 2026-06-04 02:32:57 +00:00
85 lines
2.3 KiB
Go
85 lines
2.3 KiB
Go
package services
|
|
|
|
import (
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// Simple heuristics to evaluate spammy text. Returns score 0..1 and triggered rules.
|
|
func EvaluateSpamScore(s string) (float64, []string) {
|
|
var rules []string
|
|
content := strings.TrimSpace(s)
|
|
if content == "" {
|
|
return 1.0, []string{"empty"}
|
|
}
|
|
// Too short
|
|
if len([]rune(content)) < 6 {
|
|
rules = append(rules, "too_short")
|
|
}
|
|
// Excessive repeated characters like 'aaaaaa' or '!!!!'
|
|
if hasExcessiveRepetition(content, 5) {
|
|
rules = append(rules, "repeated_chars")
|
|
}
|
|
// Low vowel ratio suggests gibberish in Czech/English latin text
|
|
letters := regexp.MustCompile(`[A-Za-zÁáÉéĚěÍíÓóÚúŮůÝýŽžŠšČčŘřŤťŇňĎď]`).FindAllString(content, -1)
|
|
if len(letters) >= 8 {
|
|
vowels := regexp.MustCompile(`[AaEeIiOoUuYyÁáÉéĚěÍíÓóÚúŮůÝý]`).FindAllString(content, -1)
|
|
ratio := float64(len(vowels)) / float64(len(letters))
|
|
if ratio < 0.18 { // very low vowel ratio
|
|
rules = append(rules, "low_vowel_ratio")
|
|
}
|
|
}
|
|
// Too many links
|
|
linkCount := len(regexp.MustCompile(`https?://`).FindAllStringIndex(content, -1))
|
|
if linkCount >= 3 {
|
|
rules = append(rules, "too_many_links")
|
|
}
|
|
// All-caps shouting
|
|
if content == strings.ToUpper(content) && len(content) >= 8 {
|
|
rules = append(rules, "all_caps")
|
|
}
|
|
// Compute score by rules weight
|
|
weights := map[string]float64{
|
|
"empty": 1.0,
|
|
"too_short": 0.4,
|
|
"repeated_chars": 0.3,
|
|
"low_vowel_ratio": 0.3,
|
|
"too_many_links": 0.5,
|
|
"all_caps": 0.2,
|
|
}
|
|
score := 0.0
|
|
for _, r := range rules { score += weights[r] }
|
|
if score > 1.0 { score = 1.0 }
|
|
return score, rules
|
|
}
|
|
|
|
// hasExcessiveRepetition checks if s contains a run of the same character of length >= minRun
|
|
// Limited to ASCII letters and the punctuation characters ! ? . to mirror the previous intent.
|
|
func hasExcessiveRepetition(s string, minRun int) bool {
|
|
if minRun < 2 { minRun = 2 }
|
|
run := 1
|
|
var prev rune
|
|
first := true
|
|
for _, r := range s {
|
|
if first {
|
|
prev = r
|
|
first = false
|
|
continue
|
|
}
|
|
if r == prev && (isAsciiLetter(r) || r == '!' || r == '?' || r == '.') {
|
|
run++
|
|
if run >= minRun {
|
|
return true
|
|
}
|
|
} else {
|
|
prev = r
|
|
run = 1
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// isAsciiLetter reports whether r is an unaccented Latin letter in the ASCII
// range, i.e. 'a'..'z' or 'A'..'Z'.
func isAsciiLetter(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z':
		return true
	case 'A' <= r && r <= 'Z':
		return true
	default:
		return false
	}
}
|