package detectors

import (
	"context"
	"crypto/sha256"
	"fmt"
	"log"
	"os"
	"regexp"
	"strings"

	"github.com/yourorg/devour/internal/quality"
)

// Regular expressions used by normalizeFunction. They are compiled once at
// package scope instead of on every call.
var (
	dupLineCommentRe  = regexp.MustCompile(`//.*`)
	dupBlockCommentRe = regexp.MustCompile(`/\*[\s\S]*?\*/`)
	dupWhitespaceRe   = regexp.MustCompile(`\s+`)
	dupIdentifierRe   = regexp.MustCompile(`\b[a-z][a-zA-Z0-9]*\b`)
)

// DuplicationDetector detects duplicate and near-duplicate code.
type DuplicationDetector struct {
	*quality.BaseDetector

	// similarityThreshold is the minimum similarity (0..1) at which two
	// normalized function bodies are placed in the same cluster.
	similarityThreshold float64
}

// DuplicateCluster represents a cluster of similar functions.
type DuplicateCluster struct {
	// Functions holds the cluster members; the first entry is the function
	// every other member was compared against.
	Functions []quality.FunctionInfo `json:"functions"`
	// Similarity is the lowest similarity measured between the first member
	// and any other member (1.0 when all members normalize identically).
	Similarity float64 `json:"similarity"`
	// Representative is the name of the first member.
	Representative string `json:"representative"`
}

// NewDuplicationDetector creates a new duplication detector with a
// similarity threshold of 0.8.
func NewDuplicationDetector(finder quality.FileFinder) *DuplicationDetector {
	return &DuplicationDetector{
		BaseDetector:        quality.NewBaseDetector("duplication", quality.SeverityT3, finder),
		similarityThreshold: 0.8,
	}
}

// Name returns the detector name.
func (d *DuplicationDetector) Name() string {
	return "duplication"
}

// Severity returns the default severity.
func (d *DuplicationDetector) Severity() quality.Severity {
	return quality.SeverityT3
}

// Detect runs duplication detection on the given path.
//
// It collects the files reported by FindFiles for config.Language, skips
// those matched by config.Exclude, extracts functions from the rest and
// returns one finding per cluster of similar functions. Files whose functions
// cannot be extracted are logged and skipped. An error is returned when
// config is nil, when file discovery fails, or when ctx is cancelled while
// files are being read.
func (d *DuplicationDetector) Detect(ctx context.Context, path string, config *quality.Config) ([]quality.Finding, error) {
	if config == nil {
		return nil, fmt.Errorf("duplication detector: nil config")
	}

	files, err := d.FindFiles(path, config.Language)
	if err != nil {
		return nil, fmt.Errorf("failed to find files: %w", err)
	}

	// Extract functions from all files.
	var allFunctions []quality.FunctionInfo
	for _, file := range files {
		// Earlier versions never touched ctx, so a nil context is still
		// tolerated here rather than causing a panic.
		if ctx != nil {
			if cerr := ctx.Err(); cerr != nil {
				return nil, cerr
			}
		}
		if quality.ShouldExclude(file, config.Exclude) {
			continue
		}
		functions, err := d.extractFunctions(file)
		if err != nil {
			log.Printf("Failed to extract functions from %s: %v", file, err)
			continue
		}
		allFunctions = append(allFunctions, functions...)
	}

	// Find duplicates.
	clusters := d.findDuplicates(allFunctions)

	// Convert clusters to findings.
	var findings []quality.Finding
	for i, cluster := range clusters {
		if len(cluster.Functions) < 2 {
			continue
		}
		first := cluster.Functions[0]
		findings = append(findings, quality.Finding{
			ID:          fmt.Sprintf("duplication-cluster-%d", i),
			Type:        "duplication",
			Title:       "Code duplication detected",
			Description: fmt.Sprintf("Found %d similar functions with %.2f similarity", len(cluster.Functions), cluster.Similarity),
			File:        first.File,
			Line:        first.Line,
			Severity:    d.Severity(),
			Score:       len(cluster.Functions) * 2, // Score based on cluster size
			Status:      quality.StatusOpen,
			Metadata: map[string]string{
				"cluster_size": fmt.Sprintf("%d", len(cluster.Functions)),
				"similarity":   fmt.Sprintf("%.2f", cluster.Similarity),
				"functions":    d.formatFunctionList(cluster.Functions),
			},
		})
	}

	return findings, nil
}

// extractFunctions extracts functions from a source file.
//
// Extraction is line based: every line whose trimmed text starts with
// "func " is treated as a declaration (can be enhanced with AST parsing).
// The file is split into lines once and the slice is shared with the parser.
func (d *DuplicationDetector) extractFunctions(filePath string) ([]quality.FunctionInfo, error) {
	content, err := os.ReadFile(filePath)
	if err != nil {
		return nil, err
	}

	lines := strings.Split(string(content), "\n")

	var functions []quality.FunctionInfo
	for i, line := range lines {
		trimmed := strings.TrimSpace(line)
		if !strings.HasPrefix(trimmed, "func ") {
			continue
		}
		if info := d.parseFunctionAt(trimmed, filePath, i+1, lines); info != nil {
			functions = append(functions, *info)
		}
	}

	return functions, nil
}

// parseFunctionLine parses a function declaration line.
//
// line is the trimmed declaration, lineNum its 1-based position in content.
// It returns nil when no function name or body can be determined.
func (d *DuplicationDetector) parseFunctionLine(line, filePath string, lineNum int, content string) *quality.FunctionInfo {
	return d.parseFunctionAt(line, filePath, lineNum, strings.Split(content, "\n"))
}

// parseFunctionAt builds the FunctionInfo for the declaration found at the
// 1-based lineNum of lines. It returns nil when the declaration has no name
// (for example a function literal), when lineNum is out of range, or when no
// body is found.
func (d *DuplicationDetector) parseFunctionAt(line, filePath string, lineNum int, lines []string) *quality.FunctionInfo {
	funcName := declaredFuncName(line)
	if funcName == "" {
		return nil
	}

	startLine := lineNum - 1
	if startLine < 0 {
		return nil
	}
	endLine := d.findFunctionEnd(lines, startLine)
	if endLine <= startLine {
		return nil
	}

	// lines[startLine:endLine] runs from the declaration through the line
	// holding the closing brace, so endLine is also that line's 1-based number.
	body := strings.Join(lines[startLine:endLine], "\n")

	// Create normalized version for comparison.
	normalized := d.normalizeFunction(body)

	return &quality.FunctionInfo{
		Name:       funcName,
		File:       filePath,
		Line:       lineNum,
		EndLine:    endLine,
		LOC:        endLine - startLine,
		Body:       body,
		Normalized: normalized,
		BodyHash:   d.hashFunction(normalized),
	}
}

// declaredFuncName returns the function or method name from a trimmed
// declaration line such as "func (r *T) Name(args) {" or "func Name[T any](".
// It returns "" when the line does not start with "func" or declares no name.
func declaredFuncName(decl string) string {
	if !strings.HasPrefix(decl, "func") {
		return ""
	}
	rest := strings.TrimSpace(decl[len("func"):])

	// Skip a method receiver: "(r *T) Name(...)".
	if strings.HasPrefix(rest, "(") {
		closeIdx := strings.Index(rest, ")")
		if closeIdx == -1 {
			return ""
		}
		rest = strings.TrimSpace(rest[closeIdx+1:])
	}

	// The name ends at the parameter list, a type-parameter list, or a space.
	if idx := strings.IndexAny(rest, "([ \t"); idx != -1 {
		rest = rest[:idx]
	}
	return rest
}

// findFunctionEnd finds the end of the function that starts at lines[startLine].
//
// It returns the exclusive end index of the body: one past the line on which
// the brace depth returns to zero after the first "{". It returns startLine
// when startLine is out of range or when another "func " line appears before
// any "{" (a declaration without a body), and len(lines) when the braces
// never balance.
//
// Braces are counted textually per line, so braces inside strings, rune
// literals or comments are counted as well.
func (d *DuplicationDetector) findFunctionEnd(lines []string, startLine int) int {
	if startLine < 0 || startLine >= len(lines) {
		return startLine
	}

	depth := 0
	opened := false
	for i := startLine; i < len(lines); i++ {
		line := lines[i]

		if !opened && i > startLine && strings.HasPrefix(strings.TrimSpace(line), "func ") {
			return startLine
		}

		opens := strings.Count(line, "{")
		depth += opens - strings.Count(line, "}")
		if opens > 0 {
			opened = true
		}
		if opened && depth <= 0 {
			return i + 1
		}
	}

	return len(lines)
}

// normalizeFunction normalizes a function for comparison.
//
// It strips "//" and "/* */" comments, collapses every whitespace run to a
// single space, trims the result, and replaces each token matching
// `\b[a-z][a-zA-Z0-9]*\b` (which also covers keywords such as "func" and
// "return") with "VAR".
func (d *DuplicationDetector) normalizeFunction(body string) string {
	// Remove comments.
	body = dupLineCommentRe.ReplaceAllString(body, "")
	body = dupBlockCommentRe.ReplaceAllString(body, "")

	// Normalize whitespace.
	body = dupWhitespaceRe.ReplaceAllString(body, " ")
	body = strings.TrimSpace(body)

	// Normalize variable names (simple approach).
	return dupIdentifierRe.ReplaceAllString(body, "VAR")
}

// hashFunction returns the hex-encoded SHA-256 digest of the normalized function.
func (d *DuplicationDetector) hashFunction(normalized string) string {
	hash := sha256.Sum256([]byte(normalized))
	return fmt.Sprintf("%x", hash)
}

// findDuplicates groups functions into clusters of exact and near duplicates.
//
// Functions are visited in input order. Each function that is not yet part of
// a cluster becomes a seed and collects every later unclustered function
// whose similarity to it reaches the threshold. A function therefore belongs
// to at most one cluster, exact duplicates are reported once, and the result
// order is deterministic for a given input order.
func (d *DuplicationDetector) findDuplicates(functions []quality.FunctionInfo) []DuplicateCluster {
	var clusters []DuplicateCluster

	clustered := make([]bool, len(functions))
	for i := range functions {
		if clustered[i] {
			continue
		}

		var members []quality.FunctionInfo
		lowest := 1.0
		for j := i + 1; j < len(functions); j++ {
			if clustered[j] {
				continue
			}
			similarity := d.pairSimilarity(&functions[i], &functions[j])
			if similarity < d.similarityThreshold {
				continue
			}
			if members == nil {
				members = append(members, functions[i])
			}
			members = append(members, functions[j])
			clustered[j] = true
			if similarity < lowest {
				lowest = similarity
			}
		}
		clustered[i] = true

		if len(members) >= 2 {
			clusters = append(clusters, DuplicateCluster{
				Functions:      members,
				Similarity:     lowest,
				Representative: members[0].Name,
			})
		}
	}

	return clusters
}

// pairSimilarity returns the similarity of two functions' normalized bodies.
//
// Equal body hashes short-circuit to 1.0. When the length difference alone
// keeps the similarity below the threshold, that upper bound is returned
// without running the edit-distance computation.
func (d *DuplicationDetector) pairSimilarity(a, b *quality.FunctionInfo) float64 {
	if a.BodyHash != "" && a.BodyHash == b.BodyHash {
		return 1.0
	}

	longer, shorter := len(a.Normalized), len(b.Normalized)
	if shorter > longer {
		longer, shorter = shorter, longer
	}
	if longer > 0 {
		// The edit distance is at least the length difference, so this value
		// can never be smaller than what calculateSimilarity would return.
		bound := 1.0 - float64(longer-shorter)/float64(longer)
		if bound < d.similarityThreshold {
			return bound
		}
	}

	return d.calculateSimilarity(a.Normalized, b.Normalized)
}

// calculateSimilarity calculates similarity between two strings as
// 1 - distance/maxLen, where distance is the byte-wise Levenshtein distance
// and maxLen the length of the longer string. Equal strings score 1.0.
func (d *DuplicationDetector) calculateSimilarity(s1, s2 string) float64 {
	if s1 == s2 {
		return 1.0
	}

	// Simple Levenshtein distance-based similarity.
	distance := d.levenshteinDistance(s1, s2)
	maxLen := max(len(s1), len(s2))
	if maxLen == 0 {
		return 1.0
	}

	return 1.0 - float64(distance)/float64(maxLen)
}

// levenshteinDistance calculates the Levenshtein distance between two
// strings, comparing them byte by byte. Two rows sized by the shorter string
// are allocated once and swapped after each row.
func (d *DuplicationDetector) levenshteinDistance(s1, s2 string) int {
	m, n := len(s1), len(s2)
	if m < n {
		s1, s2 = s2, s1
		m, n = n, m
	}
	if n == 0 {
		return m
	}

	prev := make([]int, n+1)
	curr := make([]int, n+1)
	for j := range prev {
		prev[j] = j
	}

	for i := 1; i <= m; i++ {
		curr[0] = i
		for j := 1; j <= n; j++ {
			cost := 0
			if s1[i-1] != s2[j-1] {
				cost = 1
			}
			curr[j] = min(
				prev[j]+1,      // deletion
				curr[j-1]+1,    // insertion
				prev[j-1]+cost, // substitution
			)
		}
		prev, curr = curr, prev
	}

	return prev[n]
}

// formatFunctionList formats a list of functions for metadata as a
// comma-separated list of "name:line" entries.
func (d *DuplicationDetector) formatFunctionList(functions []quality.FunctionInfo) string {
	names := make([]string, 0, len(functions))
	for _, fn := range functions {
		names = append(names, fmt.Sprintf("%s:%d", fn.Name, fn.Line))
	}
	return strings.Join(names, ",")
}

// min returns the minimum of three integers.
func min(a, b, c int) int {
	smallest := a
	if b < smallest {
		smallest = b
	}
	if c < smallest {
		smallest = c
	}
	return smallest
}

// max returns the maximum of two integers.
func max(a, b int) int {
	if a > b {
		return a
	}
	return b
}