package services import ( "regexp" "strings" ) // Simple heuristics to evaluate spammy text. Returns score 0..1 and triggered rules. func EvaluateSpamScore(s string) (float64, []string) { var rules []string content := strings.TrimSpace(s) if content == "" { return 1.0, []string{"empty"} } // Too short if len([]rune(content)) < 6 { rules = append(rules, "too_short") } // Excessive repeated characters like 'aaaaaa' or '!!!!' if hasExcessiveRepetition(content, 5) { rules = append(rules, "repeated_chars") } // Low vowel ratio suggests gibberish in Czech/English latin text letters := regexp.MustCompile(`[A-Za-zÁáÉéĚěÍíÓóÚúŮůÝýŽžŠšČčŘřŤťŇňĎď]`).FindAllString(content, -1) if len(letters) >= 8 { vowels := regexp.MustCompile(`[AaEeIiOoUuYyÁáÉéĚěÍíÓóÚúŮůÝý]`).FindAllString(content, -1) ratio := float64(len(vowels)) / float64(len(letters)) if ratio < 0.18 { // very low vowel ratio rules = append(rules, "low_vowel_ratio") } } // Too many links linkCount := len(regexp.MustCompile(`https?://`).FindAllStringIndex(content, -1)) if linkCount >= 3 { rules = append(rules, "too_many_links") } // All-caps shouting if content == strings.ToUpper(content) && len(content) >= 8 { rules = append(rules, "all_caps") } // Compute score by rules weight weights := map[string]float64{ "empty": 1.0, "too_short": 0.4, "repeated_chars": 0.3, "low_vowel_ratio": 0.3, "too_many_links": 0.5, "all_caps": 0.2, } score := 0.0 for _, r := range rules { score += weights[r] } if score > 1.0 { score = 1.0 } return score, rules } // hasExcessiveRepetition checks if s contains a run of the same character of length >= minRun // Limited to ASCII letters and the punctuation characters ! ? . to mirror the previous intent. func hasExcessiveRepetition(s string, minRun int) bool { if minRun < 2 { minRun = 2 } run := 1 var prev rune first := true for _, r := range s { if first { prev = r first = false continue } if r == prev && (isAsciiLetter(r) || r == '!' || r == '?' || r == '.') { run++ if run >= minRun { return true } } else { prev = r run = 1 } } return false } func isAsciiLetter(r rune) bool { return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') }