mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
359 lines
8.6 KiB
Go
359 lines
8.6 KiB
Go
package detectors
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"github.com/yourorg/devour/internal/quality"
|
|
)
|
|
|
|
// DuplicationDetector detects duplicate and near-duplicate code
|
|
type DuplicationDetector struct {
|
|
*quality.BaseDetector
|
|
similarityThreshold float64
|
|
}
|
|
|
|
// DuplicateCluster represents a cluster of similar functions
|
|
type DuplicateCluster struct {
|
|
Functions []quality.FunctionInfo `json:"functions"`
|
|
Similarity float64 `json:"similarity"`
|
|
Representative string `json:"representative"`
|
|
}
|
|
|
|
// NewDuplicationDetector creates a new duplication detector
|
|
func NewDuplicationDetector(finder quality.FileFinder) *DuplicationDetector {
|
|
return &DuplicationDetector{
|
|
BaseDetector: quality.NewBaseDetector("duplication", quality.SeverityT3, finder),
|
|
similarityThreshold: 0.8,
|
|
}
|
|
}
|
|
|
|
// Name returns the detector name
|
|
func (d *DuplicationDetector) Name() string {
|
|
return "duplication"
|
|
}
|
|
|
|
// Severity returns the default severity
|
|
func (d *DuplicationDetector) Severity() quality.Severity {
|
|
return quality.SeverityT3
|
|
}
|
|
|
|
// Detect runs duplication detection on the given path
|
|
func (d *DuplicationDetector) Detect(ctx context.Context, path string, config *quality.Config) ([]quality.Finding, error) {
|
|
files, err := d.FindFiles(path, config.Language)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to find files: %w", err)
|
|
}
|
|
|
|
// Extract functions from all files
|
|
var allFunctions []quality.FunctionInfo
|
|
for _, file := range files {
|
|
if quality.ShouldExclude(file, config.Exclude) {
|
|
continue
|
|
}
|
|
|
|
functions, err := d.extractFunctions(file)
|
|
if err != nil {
|
|
log.Printf("Failed to extract functions from %s: %v", file, err)
|
|
continue
|
|
}
|
|
|
|
allFunctions = append(allFunctions, functions...)
|
|
}
|
|
|
|
// Find duplicates
|
|
clusters := d.findDuplicates(allFunctions)
|
|
|
|
// Convert clusters to findings
|
|
var findings []quality.Finding
|
|
for i, cluster := range clusters {
|
|
if len(cluster.Functions) < 2 {
|
|
continue
|
|
}
|
|
|
|
finding := quality.Finding{
|
|
ID: fmt.Sprintf("duplication-cluster-%d", i),
|
|
Type: "duplication",
|
|
Title: "Code duplication detected",
|
|
Description: fmt.Sprintf("Found %d similar functions with %.2f similarity",
|
|
len(cluster.Functions), cluster.Similarity),
|
|
File: cluster.Functions[0].File,
|
|
Line: cluster.Functions[0].Line,
|
|
Severity: d.Severity(),
|
|
Score: len(cluster.Functions) * 2, // Score based on cluster size
|
|
Status: quality.StatusOpen,
|
|
Metadata: map[string]string{
|
|
"cluster_size": fmt.Sprintf("%d", len(cluster.Functions)),
|
|
"similarity": fmt.Sprintf("%.2f", cluster.Similarity),
|
|
"functions": d.formatFunctionList(cluster.Functions),
|
|
},
|
|
}
|
|
findings = append(findings, finding)
|
|
}
|
|
|
|
return findings, nil
|
|
}
|
|
|
|
// extractFunctions extracts functions from a source file
|
|
func (d *DuplicationDetector) extractFunctions(filePath string) ([]quality.FunctionInfo, error) {
|
|
content, err := os.ReadFile(filePath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
contentStr := string(content)
|
|
lines := strings.Split(contentStr, "\n")
|
|
|
|
var functions []quality.FunctionInfo
|
|
|
|
// Simple function extraction for Go (can be enhanced with AST parsing)
|
|
for i, line := range lines {
|
|
trimmed := strings.TrimSpace(line)
|
|
if strings.HasPrefix(trimmed, "func ") {
|
|
funcInfo := d.parseFunctionLine(trimmed, filePath, i+1, contentStr)
|
|
if funcInfo != nil {
|
|
functions = append(functions, *funcInfo)
|
|
}
|
|
}
|
|
}
|
|
|
|
return functions, nil
|
|
}
|
|
|
|
// parseFunctionLine parses a function declaration line
|
|
func (d *DuplicationDetector) parseFunctionLine(line, filePath string, lineNum int, content string) *quality.FunctionInfo {
|
|
// Extract function name
|
|
parts := strings.Fields(line)
|
|
if len(parts) < 2 {
|
|
return nil
|
|
}
|
|
|
|
funcName := parts[1]
|
|
// Remove parentheses and receiver if present
|
|
if idx := strings.Index(funcName, "("); idx != -1 {
|
|
funcName = funcName[:idx]
|
|
}
|
|
|
|
// Find function body
|
|
lines := strings.Split(content, "\n")
|
|
startLine := lineNum - 1
|
|
endLine := d.findFunctionEnd(lines, startLine)
|
|
|
|
if endLine <= startLine {
|
|
return nil
|
|
}
|
|
|
|
// Extract function body
|
|
bodyLines := lines[startLine:endLine]
|
|
body := strings.Join(bodyLines, "\n")
|
|
loc := endLine - startLine
|
|
|
|
// Create normalized version for comparison
|
|
normalized := d.normalizeFunction(body)
|
|
bodyHash := d.hashFunction(normalized)
|
|
|
|
return &quality.FunctionInfo{
|
|
Name: funcName,
|
|
File: filePath,
|
|
Line: lineNum,
|
|
EndLine: endLine,
|
|
LOC: loc,
|
|
Body: body,
|
|
Normalized: normalized,
|
|
BodyHash: bodyHash,
|
|
}
|
|
}
|
|
|
|
// findFunctionEnd finds the end line of a function
|
|
func (d *DuplicationDetector) findFunctionEnd(lines []string, startLine int) int {
|
|
if startLine >= len(lines) {
|
|
return startLine
|
|
}
|
|
|
|
braceCount := 0
|
|
for i := startLine; i < len(lines); i++ {
|
|
line := lines[i]
|
|
braceCount += strings.Count(line, "{")
|
|
braceCount += strings.Count(line, "}")
|
|
|
|
if braceCount == 0 && i > startLine {
|
|
return i
|
|
}
|
|
}
|
|
|
|
return len(lines)
|
|
}
|
|
|
|
// normalizeFunction normalizes a function for comparison
|
|
func (d *DuplicationDetector) normalizeFunction(body string) string {
|
|
// Remove comments
|
|
body = regexp.MustCompile(`//.*`).ReplaceAllString(body, "")
|
|
body = regexp.MustCompile(`/\*[\s\S]*?\*/`).ReplaceAllString(body, "")
|
|
|
|
// Normalize whitespace
|
|
body = regexp.MustCompile(`\s+`).ReplaceAllString(body, " ")
|
|
body = strings.TrimSpace(body)
|
|
|
|
// Normalize variable names (simple approach)
|
|
body = regexp.MustCompile(`\b[a-z][a-zA-Z0-9]*\b`).ReplaceAllString(body, "VAR")
|
|
|
|
return body
|
|
}
|
|
|
|
// hashFunction creates a hash of the normalized function
|
|
func (d *DuplicationDetector) hashFunction(normalized string) string {
|
|
hash := sha256.Sum256([]byte(normalized))
|
|
return fmt.Sprintf("%x", hash)
|
|
}
|
|
|
|
// findDuplicates finds duplicate functions using similarity analysis
|
|
func (d *DuplicationDetector) findDuplicates(functions []quality.FunctionInfo) []DuplicateCluster {
|
|
var clusters []DuplicateCluster
|
|
|
|
// Group by exact hash first
|
|
hashGroups := make(map[string][]quality.FunctionInfo)
|
|
for _, fn := range functions {
|
|
hashGroups[fn.BodyHash] = append(hashGroups[fn.BodyHash], fn)
|
|
}
|
|
|
|
// Create clusters from exact duplicates
|
|
for _, group := range hashGroups {
|
|
if len(group) >= 2 {
|
|
cluster := DuplicateCluster{
|
|
Functions: group,
|
|
Similarity: 1.0,
|
|
Representative: group[0].Name,
|
|
}
|
|
clusters = append(clusters, cluster)
|
|
}
|
|
}
|
|
|
|
// Find near-duplicates using similarity
|
|
processed := make(map[int]bool)
|
|
for i, fn1 := range functions {
|
|
if processed[i] {
|
|
continue
|
|
}
|
|
|
|
var similar []quality.FunctionInfo
|
|
similar = append(similar, fn1)
|
|
|
|
for j, fn2 := range functions {
|
|
if i == j || processed[j] {
|
|
continue
|
|
}
|
|
|
|
similarity := d.calculateSimilarity(fn1.Normalized, fn2.Normalized)
|
|
if similarity >= d.similarityThreshold {
|
|
similar = append(similar, fn2)
|
|
processed[j] = true
|
|
}
|
|
}
|
|
|
|
if len(similar) >= 2 {
|
|
cluster := DuplicateCluster{
|
|
Functions: similar,
|
|
Similarity: d.similarityThreshold,
|
|
Representative: similar[0].Name,
|
|
}
|
|
clusters = append(clusters, cluster)
|
|
}
|
|
|
|
processed[i] = true
|
|
}
|
|
|
|
return clusters
|
|
}
|
|
|
|
// calculateSimilarity calculates similarity between two strings
|
|
func (d *DuplicationDetector) calculateSimilarity(s1, s2 string) float64 {
|
|
if s1 == s2 {
|
|
return 1.0
|
|
}
|
|
|
|
// Simple Levenshtein distance-based similarity
|
|
distance := d.levenshteinDistance(s1, s2)
|
|
maxLen := max(len(s1), len(s2))
|
|
if maxLen == 0 {
|
|
return 1.0
|
|
}
|
|
|
|
return 1.0 - float64(distance)/float64(maxLen)
|
|
}
|
|
|
|
// levenshteinDistance calculates the Levenshtein distance between two strings
|
|
func (d *DuplicationDetector) levenshteinDistance(s1, s2 string) int {
|
|
m, n := len(s1), len(s2)
|
|
if m < n {
|
|
s1, s2 = s2, s1
|
|
m, n = n, m
|
|
}
|
|
|
|
if n == 0 {
|
|
return m
|
|
}
|
|
|
|
prev := make([]int, n+1)
|
|
for i := range prev {
|
|
prev[i] = i
|
|
}
|
|
|
|
for i := 1; i <= m; i++ {
|
|
current := make([]int, n+1)
|
|
current[0] = i
|
|
|
|
for j := 1; j <= n; j++ {
|
|
cost := 0
|
|
if s1[i-1] != s2[j-1] {
|
|
cost = 1
|
|
}
|
|
|
|
current[j] = min(
|
|
prev[j]+1, // deletion
|
|
current[j-1]+1, // insertion
|
|
prev[j-1]+cost, // substitution
|
|
)
|
|
}
|
|
|
|
prev = current
|
|
}
|
|
|
|
return prev[n]
|
|
}
|
|
|
|
// formatFunctionList formats a list of functions for metadata
|
|
func (d *DuplicationDetector) formatFunctionList(functions []quality.FunctionInfo) string {
|
|
var names []string
|
|
for _, fn := range functions {
|
|
names = append(names, fmt.Sprintf("%s:%d", fn.Name, fn.Line))
|
|
}
|
|
return strings.Join(names, ",")
|
|
}
|
|
|
|
// min returns the smallest of a, b, and c.
func min(a, b, c int) int {
	// Reduce to the smaller of the first two, then compare with the third.
	smallest := a
	if !(a < b) {
		smallest = b
	}
	if smallest < c {
		return smallest
	}
	return c
}
|
|
|
|
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
|