// Package scaling implements metric-driven automatic scaling of services.
package scaling

import (
	"context"
	"fmt"
	"log"
	"math"
	"sync"
	"time"

	"containr/internal/deployment"
	"containr/internal/metrics"
)

// AutoScaler manages automatic scaling of services.
//
// mu guards policies, services, enabled and the fields of every
// ServiceScalingState that executeScaling and SetScalingPolicy write.
type AutoScaler struct {
	scheduler        *deployment.Scheduler
	metricsCollector *metrics.MetricsCollector
	policies         map[string]*ScalingPolicy
	services         map[string]*ServiceScalingState
	mu               sync.RWMutex
	checkInterval    time.Duration
	cooldownPeriod   time.Duration
	enabled          bool
}

// ScalingPolicy defines how a service should scale.
type ScalingPolicy struct {
	ServiceID         string             `json:"service_id"`
	MinReplicas       int                `json:"min_replicas"`
	MaxReplicas       int                `json:"max_replicas"`
	TargetCPU         float64            `json:"target_cpu"`    // Target CPU utilization percentage
	TargetMemory      float64            `json:"target_memory"` // Target memory utilization, compared against average GB per instance
	ScaleUpCooldown   time.Duration      `json:"scale_up_cooldown"`
	ScaleDownCooldown time.Duration      `json:"scale_down_cooldown"`
	ScaleUpStep       int                `json:"scale_up_step"`   // How many replicas to add at most per scale-up
	ScaleDownStep     int                `json:"scale_down_step"` // How many replicas to remove at most per scale-down
	Metrics           []string           `json:"metrics"`    // Which metrics to consider
	Thresholds        map[string]float64 `json:"thresholds"` // Custom thresholds for metrics
	Enabled           bool               `json:"enabled"`
	CostOptimization  *CostOptimization  `json:"cost_optimization"`
}

// CostOptimization defines cost-related scaling parameters.
//
// A MaxCostPerHour of zero (or less) means "no cost cap"; a positive value
// blocks scale-up actions whose estimated hourly cost would exceed it.
type CostOptimization struct {
	MaxCostPerHour   float64       `json:"max_cost_per_hour"`
	PreferEfficiency bool          `json:"prefer_efficiency"`
	IdleTimeout      time.Duration `json:"idle_timeout"`
}

// ServiceScalingState tracks the current scaling state of a service.
type ServiceScalingState struct {
	ServiceID          string
	CurrentReplicas    int
	DesiredReplicas    int
	LastScaleAction    time.Time
	LastScaleDirection string // "scale_up" or "scale_down" (the last applied ScalingDecision.Action)
	ScaleUpCooldown    time.Time
	ScaleDownCooldown  time.Time
	MetricsHistory     []MetricsSnapshot
	Policy             *ScalingPolicy
}

// MetricsSnapshot captures metrics at a point in time.
type MetricsSnapshot struct {
	Timestamp time.Time
	CPU       float64
	Memory    float64
	Requests  float64
	Errors    float64
}

// ScaleEvent represents a scaling action.
type ScaleEvent struct {
	ServiceID    string             `json:"service_id"`
	Action       string             `json:"action"` // "scale_up" or "scale_down"
	FromReplicas int                `json:"from_replicas"`
	ToReplicas   int                `json:"to_replicas"`
	Reason       string             `json:"reason"`
	Timestamp    time.Time          `json:"timestamp"`
	Metrics      map[string]float64 `json:"metrics"`
	CostImpact   float64            `json:"cost_impact"`
}

// ScalingDecision contains the decision made by the autoscaler.
type ScalingDecision struct {
	ShouldScale     bool               `json:"should_scale"`
	Action          string             `json:"action"`
	CurrentReplicas int                `json:"current_replicas"`
	DesiredReplicas int                `json:"desired_replicas"`
	Reason          string             `json:"reason"`
	Metrics         map[string]float64 `json:"metrics"`
	CostEstimate    float64            `json:"cost_estimate"`
}

// NewAutoScaler creates a new auto-scaler that is enabled, checks every
// 30 seconds and has no policies registered yet.
func NewAutoScaler(scheduler *deployment.Scheduler, metricsCollector *metrics.MetricsCollector) *AutoScaler {
	return &AutoScaler{
		scheduler:        scheduler,
		metricsCollector: metricsCollector,
		policies:         make(map[string]*ScalingPolicy),
		services:         make(map[string]*ServiceScalingState),
		checkInterval:    30 * time.Second,
		cooldownPeriod:   5 * time.Minute,
		enabled:          true,
	}
}

// Start runs the auto-scaling loop, evaluating all services once per check
// interval. It blocks until ctx is cancelled and then returns ctx.Err().
func (as *AutoScaler) Start(ctx context.Context) error {
	ticker := time.NewTicker(as.checkInterval)
	defer ticker.Stop()

	log.Printf("AutoScaler started with check interval: %v", as.checkInterval)

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			// Read the flag through IsEnabled so the access is synchronized
			// with concurrent Enable/Disable calls.
			if !as.IsEnabled() {
				continue
			}
			if err := as.checkAndScale(ctx); err != nil {
				log.Printf("Error during auto-scaling check: %v", err)
			}
		}
	}
}

// checkAndScale evaluates every service that has an enabled policy and
// executes the resulting scaling decisions. Per-service failures are logged
// and do not stop the pass; the only error returned is ctx.Err() when the
// context is cancelled part-way through.
func (as *AutoScaler) checkAndScale(ctx context.Context) error {
	// Snapshot the candidates under the read lock so evaluation, which takes
	// the write lock in executeScaling, runs without holding it.
	as.mu.RLock()
	servicesToCheck := make([]*ServiceScalingState, 0, len(as.services))
	for _, state := range as.services {
		if state.Policy != nil && state.Policy.Enabled {
			servicesToCheck = append(servicesToCheck, state)
		}
	}
	as.mu.RUnlock()

	for _, state := range servicesToCheck {
		if err := ctx.Err(); err != nil {
			return err
		}

		decision, err := as.evaluateScaling(ctx, state)
		if err != nil {
			log.Printf("Error evaluating scaling for service %s: %v", state.ServiceID, err)
			continue
		}
		if !decision.ShouldScale {
			continue
		}
		if err := as.executeScaling(ctx, state, decision); err != nil {
			log.Printf("Error executing scaling for service %s: %v", state.ServiceID, err)
		}
	}

	return nil
}

// evaluateScaling determines whether a service needs to scale.
//
// The desired replica count from calculateDesiredReplicas is clamped to the
// policy's [MinReplicas, MaxReplicas], checked against the cooldown for its
// direction, limited to one scaling step, and finally (for scale-up only)
// checked against the policy's cost cap. It returns an error when the state
// has no policy or metrics retrieval fails.
func (as *AutoScaler) evaluateScaling(ctx context.Context, state *ServiceScalingState) (*ScalingDecision, error) {
	policy := state.Policy
	if policy == nil {
		return nil, fmt.Errorf("no scaling policy attached to service: %s", state.ServiceID)
	}

	now := time.Now()
	current := state.CurrentReplicas

	// hold builds a "do not scale" decision.
	hold := func(desired int, reason string, observed map[string]float64) *ScalingDecision {
		return &ScalingDecision{
			ShouldScale:     false,
			CurrentReplicas: current,
			DesiredReplicas: desired,
			Reason:          reason,
			Metrics:         observed,
		}
	}

	// With both cooldowns active no action is possible, so skip the metrics
	// lookup entirely.
	if now.Before(state.ScaleUpCooldown) && now.Before(state.ScaleDownCooldown) {
		return hold(current, "In cooldown period", nil), nil
	}

	observed, err := as.getServiceMetrics(ctx, state.ServiceID)
	if err != nil {
		return nil, fmt.Errorf("failed to get service metrics: %w", err)
	}

	desired := as.calculateDesiredReplicas(state, observed, policy)
	if desired < policy.MinReplicas {
		desired = policy.MinReplicas
	}
	if desired > policy.MaxReplicas {
		desired = policy.MaxReplicas
	}

	if desired == current {
		return hold(desired, "No scaling needed", observed), nil
	}

	action := "scale_down"
	if desired > current {
		action = "scale_up"
	}

	switch {
	case action == "scale_up" && now.Before(state.ScaleUpCooldown):
		return hold(desired, "Scale up cooldown active", observed), nil
	case action == "scale_down" && now.Before(state.ScaleDownCooldown):
		return hold(desired, "Scale down cooldown active", observed), nil
	}

	// Move at most one step per evaluation; a non-positive step counts as 1.
	if action == "scale_up" {
		step := policy.ScaleUpStep
		if step <= 0 {
			step = 1
		}
		if desired-current > step {
			desired = current + step
		}
	} else {
		step := policy.ScaleDownStep
		if step <= 0 {
			step = 1
		}
		if current-desired > step {
			desired = current - step
		}
	}

	costEstimate := as.estimateScalingCost(state, desired)

	// The cost cap only ever blocks growth: refusing a scale-down would keep
	// the service at a higher cost. A non-positive cap means "not set".
	if co := policy.CostOptimization; co != nil && co.MaxCostPerHour > 0 &&
		action == "scale_up" && costEstimate > co.MaxCostPerHour {
		blocked := hold(current, fmt.Sprintf("Cost estimate %.2f exceeds maximum %.2f", costEstimate, co.MaxCostPerHour), observed)
		blocked.CostEstimate = costEstimate
		return blocked, nil
	}

	return &ScalingDecision{
		ShouldScale:     true,
		Action:          action,
		CurrentReplicas: current,
		DesiredReplicas: desired,
		Reason:          as.generateScalingReason(state, observed, desired),
		Metrics:         observed,
		CostEstimate:    costEstimate,
	}, nil
}

// calculateDesiredReplicas calculates the desired number of replicas based on
// metrics. The result is not clamped to the policy's replica bounds; the
// caller does that.
//
// CPU and memory use a ±20% band around their targets. Request rate and error
// rate can only raise the result.
func (as *AutoScaler) calculateDesiredReplicas(state *ServiceScalingState, metrics map[string]float64, policy *ScalingPolicy) int {
	const (
		scaleUpRatio       = 1.2   // usage/target above this scales up
		scaleDownRatio     = 0.8   // usage/target below this scales down
		requestsPerReplica = 100.0 // assumed request/s capacity of one replica
		maxErrorRate       = 0.05  // error rate above this adds one replica
	)

	current := state.CurrentReplicas
	desired := current

	// CPU-based scaling.
	if cpu, ok := metrics["cpu"]; ok && policy.TargetCPU > 0 {
		ratio := cpu / policy.TargetCPU
		switch {
		case ratio > scaleUpRatio:
			desired = int(math.Ceil(float64(current) * ratio))
		case ratio < scaleDownRatio:
			desired = int(math.Floor(float64(current) * ratio))
		}
	}

	// Memory-based scaling.
	if mem, ok := metrics["memory"]; ok && policy.TargetMemory > 0 {
		ratio := mem / policy.TargetMemory
		switch {
		case ratio > scaleUpRatio:
			if want := int(math.Ceil(float64(current) * ratio)); want > desired {
				desired = want
			}
		case ratio < scaleDownRatio && desired <= current:
			// Only shrink when CPU has not already asked for more replicas.
			if want := int(math.Floor(float64(current) * ratio)); want < desired {
				desired = want
			}
		}
	}

	// Request-rate scaling: never go below what the request rate needs.
	if rate, ok := metrics["requests_per_second"]; ok {
		if want := int(math.Ceil(rate / requestsPerReplica)); want > desired {
			desired = want
		}
	}

	// Error-rate scaling: add one replica when errors are high.
	if errorRate, ok := metrics["error_rate"]; ok && errorRate > maxErrorRate {
		if want := current + 1; want > desired {
			desired = want
		}
	}

	return desired
}

// getServiceMetrics gets current metrics for a service.
//
// The returned map holds "cpu" (average across instances), "memory" (average
// across instances, converted from bytes to GB), "requests_per_second" and
// "error_rate". It is empty when the collector returns an error or reports no
// instances; the error itself is deliberately not propagated, so this
// function currently always returns a nil error.
func (as *AutoScaler) getServiceMetrics(ctx context.Context, serviceID string) (map[string]float64, error) {
	serviceMetrics, err := as.metricsCollector.GetServiceMetrics(serviceID)
	if err != nil {
		// Best effort: no metrics means "nothing to act on", not a failure.
		return make(map[string]float64), nil
	}

	result := make(map[string]float64)

	instanceCount := len(serviceMetrics.Instances)
	if instanceCount == 0 {
		return result, nil
	}

	var totalCPU, totalMemory float64
	for _, instance := range serviceMetrics.Instances {
		totalCPU += instance.CPU
		totalMemory += float64(instance.Memory)
	}

	n := float64(instanceCount)
	result["cpu"] = totalCPU / n
	result["memory"] = totalMemory / n / (1024 * 1024 * 1024) // bytes -> GB

	// Requests and Errors live on the service, not on the instance, so they
	// are used as-is rather than accumulated once per instance.
	// NOTE(review): assumes Requests.Throughput and Errors.Total are
	// service-wide figures — confirm against the metrics collector.
	result["requests_per_second"] = serviceMetrics.Requests.Throughput
	if serviceMetrics.Requests.Total > 0 {
		result["error_rate"] = float64(serviceMetrics.Errors.Total) / float64(serviceMetrics.Requests.Total)
	} else {
		result["error_rate"] = 0
	}

	return result, nil
}

// executeScaling performs the actual scaling action: it records the new
// replica count on state, starts the cooldown for the direction taken and
// logs a ScaleEvent. It currently always returns nil.
func (as *AutoScaler) executeScaling(ctx context.Context, state *ServiceScalingState, decision *ScalingDecision) error {
	serviceID := state.ServiceID
	fromReplicas := state.CurrentReplicas
	toReplicas := decision.DesiredReplicas

	log.Printf("Executing scaling for service %s: %d -> %d replicas (%s)",
		serviceID, fromReplicas, toReplicas, decision.Reason)

	// In a real implementation, this would call the deployment engine
	// to scale the service (add/remove containers).

	now := time.Now()

	as.mu.Lock()
	state.CurrentReplicas = toReplicas
	state.DesiredReplicas = toReplicas
	state.LastScaleAction = now
	state.LastScaleDirection = decision.Action
	if decision.Action == "scale_up" {
		state.ScaleUpCooldown = now.Add(state.Policy.ScaleUpCooldown)
	} else {
		state.ScaleDownCooldown = now.Add(state.Policy.ScaleDownCooldown)
	}
	as.mu.Unlock()

	event := &ScaleEvent{
		ServiceID:    serviceID,
		Action:       decision.Action,
		FromReplicas: fromReplicas,
		ToReplicas:   toReplicas,
		Reason:       decision.Reason,
		Timestamp:    now,
		Metrics:      decision.Metrics,
		CostImpact:   decision.CostEstimate,
	}

	// TODO: Store scaling event in database
	log.Printf("Scaling event: %+v", event)

	return nil
}

// generateScalingReason creates a human-readable reason for scaling from
// state.CurrentReplicas to desiredReplicas.
func (as *AutoScaler) generateScalingReason(state *ServiceScalingState, metrics map[string]float64, desiredReplicas int) string {
	policy := state.Policy
	var reasons []string

	if cpu, ok := metrics["cpu"]; ok && policy.TargetCPU > 0 {
		switch {
		case cpu > policy.TargetCPU*1.2:
			reasons = append(reasons, fmt.Sprintf("CPU usage %.1f%% above target %.1f%%", cpu, policy.TargetCPU))
		case cpu < policy.TargetCPU*0.8:
			reasons = append(reasons, fmt.Sprintf("CPU usage %.1f%% below target %.1f%%", cpu, policy.TargetCPU))
		}
	}

	if mem, ok := metrics["memory"]; ok && policy.TargetMemory > 0 {
		if mem > policy.TargetMemory*1.2 {
			reasons = append(reasons, fmt.Sprintf("Memory usage %.1fGB above target %.1fGB", mem, policy.TargetMemory))
		}
	}

	if rate, ok := metrics["requests_per_second"]; ok {
		reasons = append(reasons, fmt.Sprintf("Request rate %.0f/s requires %d replicas", rate, desiredReplicas))
	}

	if len(reasons) == 0 {
		return "Automatic scaling based on metrics"
	}

	// Describe the action being decided now; state.LastScaleDirection still
	// holds the previous action at this point.
	direction := "up"
	if desiredReplicas < state.CurrentReplicas {
		direction = "down"
	}

	return fmt.Sprintf("Scale %s: %v", direction, reasons)
}

// estimateScalingCost estimates the hourly cost of running the given number
// of replicas.
func (as *AutoScaler) estimateScalingCost(state *ServiceScalingState, replicas int) float64 {
	// Simple cost model: $0.01 per replica per hour.
	// In a real implementation, this would consider actual instance costs.
	const costPerReplicaHour = 0.01
	return float64(replicas) * costPerReplicaHour
}

// SetScalingPolicy sets or updates a scaling policy for a service.
//
// Zero-valued cooldowns, steps and replica bounds are replaced with defaults
// on the passed policy itself, and the policy pointer is retained. A service
// seen for the first time starts at MinReplicas. It returns an error when
// policy is nil, MinReplicas is negative, or MaxReplicas is below MinReplicas.
func (as *AutoScaler) SetScalingPolicy(policy *ScalingPolicy) error {
	if policy == nil {
		return fmt.Errorf("scaling policy must not be nil")
	}

	as.mu.Lock()
	defer as.mu.Unlock()

	// Set default values if not specified.
	if policy.ScaleUpCooldown == 0 {
		policy.ScaleUpCooldown = 3 * time.Minute
	}
	if policy.ScaleDownCooldown == 0 {
		policy.ScaleDownCooldown = 5 * time.Minute
	}
	if policy.ScaleUpStep <= 0 {
		policy.ScaleUpStep = 1
	}
	if policy.ScaleDownStep <= 0 {
		policy.ScaleDownStep = 1
	}
	if policy.MinReplicas == 0 {
		policy.MinReplicas = 1
	}
	if policy.MaxReplicas == 0 {
		policy.MaxReplicas = 10
	}

	// Reject bounds the evaluation loop could never satisfy.
	if policy.MinReplicas < 0 {
		return fmt.Errorf("invalid scaling policy for service %s: min replicas %d is negative",
			policy.ServiceID, policy.MinReplicas)
	}
	if policy.MaxReplicas < policy.MinReplicas {
		return fmt.Errorf("invalid scaling policy for service %s: max replicas %d is below min replicas %d",
			policy.ServiceID, policy.MaxReplicas, policy.MinReplicas)
	}

	as.policies[policy.ServiceID] = policy

	if state, exists := as.services[policy.ServiceID]; exists {
		state.Policy = policy
	} else {
		as.services[policy.ServiceID] = &ServiceScalingState{
			ServiceID:       policy.ServiceID,
			CurrentReplicas: policy.MinReplicas,
			DesiredReplicas: policy.MinReplicas,
			Policy:          policy,
			MetricsHistory:  make([]MetricsSnapshot, 0),
		}
	}

	return nil
}

// GetScalingPolicy returns the scaling policy for a service, or an error when
// none is registered. The returned pointer is the stored policy, not a copy.
func (as *AutoScaler) GetScalingPolicy(serviceID string) (*ScalingPolicy, error) {
	as.mu.RLock()
	defer as.mu.RUnlock()

	policy, exists := as.policies[serviceID]
	if !exists {
		return nil, fmt.Errorf("no scaling policy found for service: %s", serviceID)
	}

	return policy, nil
}

// GetServiceState returns the current scaling state of a service, or an error
// when the service is unknown. The returned pointer is the live state that
// the scaling loop keeps updating, not a snapshot.
func (as *AutoScaler) GetServiceState(serviceID string) (*ServiceScalingState, error) {
	as.mu.RLock()
	defer as.mu.RUnlock()

	state, exists := as.services[serviceID]
	if !exists {
		return nil, fmt.Errorf("no scaling state found for service: %s", serviceID)
	}

	return state, nil
}

// GetAllServiceStates returns all service scaling states keyed by service ID.
// The map is a fresh copy; the states it points to are the live ones.
func (as *AutoScaler) GetAllServiceStates() map[string]*ServiceScalingState {
	as.mu.RLock()
	defer as.mu.RUnlock()

	result := make(map[string]*ServiceScalingState, len(as.services))
	for id, state := range as.services {
		result[id] = state
	}

	return result
}

// Enable enables the auto-scaler.
func (as *AutoScaler) Enable() {
	as.mu.Lock()
	defer as.mu.Unlock()
	as.enabled = true
}

// Disable disables the auto-scaler. The loop started by Start keeps running
// but skips its checks until Enable is called.
func (as *AutoScaler) Disable() {
	as.mu.Lock()
	defer as.mu.Unlock()
	as.enabled = false
}

// IsEnabled returns whether the auto-scaler is enabled.
func (as *AutoScaler) IsEnabled() bool {
	as.mu.RLock()
	defer as.mu.RUnlock()
	return as.enabled
}

// GetScalingSummary returns a summary of scaling activities: service and
// replica totals, how many services scaled up or down within the last hour,
// and the scaler's enabled flag and check interval.
func (as *AutoScaler) GetScalingSummary() map[string]interface{} {
	as.mu.RLock()
	defer as.mu.RUnlock()

	enabledServices := 0
	totalReplicas := 0
	scalingUp := 0
	scalingDown := 0

	for _, state := range as.services {
		if state.Policy != nil && state.Policy.Enabled {
			enabledServices++
		}
		totalReplicas += state.CurrentReplicas

		if time.Since(state.LastScaleAction) >= time.Hour {
			continue
		}
		switch state.LastScaleDirection {
		case "scale_up":
			scalingUp++
		case "scale_down":
			scalingDown++
		}
	}

	return map[string]interface{}{
		"total_services":   len(as.services),
		"enabled_services": enabledServices,
		"total_replicas":   totalReplicas,
		"scaling_up":       scalingUp,
		"scaling_down":     scalingDown,
		"enabled":          as.enabled,
		"check_interval":   as.checkInterval.String(),
	}
}