// Package metrics collects, aggregates, and exports metrics for nodes and
// services.
package metrics

import (
	"context"
	"encoding/json"
	"fmt"
	"sort"
	"strings"
	"sync"
	"time"

	"containr/internal/deployment"
)

const (
	// defaultCollectInterval is the collection period used by
	// NewMetricsCollector, and by Start when the configured interval is not
	// positive.
	defaultCollectInterval = 30 * time.Second

	// healthyCPUThresholdPercent is the CPU usage percentage below which
	// GetMetricsSummary counts a node as healthy.
	healthyCPUThresholdPercent = 80

	// shortContainerIDLen is the maximum number of leading bytes of a
	// container ID used in the simulated container name.
	shortContainerIDLen = 8
)

// promLabelEscaper escapes the characters that must be escaped inside a
// Prometheus text-format label value: backslash, double quote, and line feed.
var promLabelEscaper = strings.NewReplacer(`\`, `\\`, `"`, `\"`, "\n", `\n`)

// MetricsCollector collects and aggregates metrics from nodes and services.
//
// The nodes and services maps are guarded by mu.
type MetricsCollector struct {
	nodes           map[string]*NodeMetrics
	services        map[string]*ServiceMetrics
	scheduler       *deployment.Scheduler
	mu              sync.RWMutex
	collectInterval time.Duration
	storage         MetricsStorage
}

// NodeMetrics represents metrics for a node.
type NodeMetrics struct {
	NodeID     string             `json:"node_id"`
	Timestamp  time.Time          `json:"timestamp"`
	CPU        CPUMetrics         `json:"cpu"`
	Memory     MemoryMetrics      `json:"memory"`
	Storage    StorageMetrics     `json:"storage"`
	Network    NetworkMetrics     `json:"network"`
	Containers []ContainerMetrics `json:"containers"`
	System     SystemMetrics      `json:"system"`
}

// ServiceMetrics represents metrics for a service.
type ServiceMetrics struct {
	ServiceID   string             `json:"service_id"`
	ServiceName string             `json:"service_name"`
	ProjectID   string             `json:"project_id"`
	Timestamp   time.Time          `json:"timestamp"`
	Instances   []InstanceMetrics  `json:"instances"`
	Requests    RequestMetrics     `json:"requests"`
	Errors      ErrorMetrics       `json:"errors"`
	Performance PerformanceMetrics `json:"performance"`
	Resources   ResourceMetrics    `json:"resources"`
}

// InstanceMetrics represents metrics for a service instance.
type InstanceMetrics struct {
	InstanceID string         `json:"instance_id"`
	NodeID     string         `json:"node_id"`
	Status     string         `json:"status"`
	CPU        float64        `json:"cpu"`    // CPU usage percentage
	Memory     int64          `json:"memory"` // Memory usage in bytes
	Network    NetworkMetrics `json:"network"`
	StartTime  time.Time      `json:"start_time"`
	LastSeen   time.Time      `json:"last_seen"`
	Health     HealthMetrics  `json:"health"`
}

// CPUMetrics represents CPU metrics.
type CPUMetrics struct {
	UsagePercent  float64 `json:"usage_percent"`
	UsageCores    float64 `json:"usage_cores"`
	LoadAverage1  float64 `json:"load_average_1"`
	LoadAverage5  float64 `json:"load_average_5"`
	LoadAverage15 float64 `json:"load_average_15"`
}

// MemoryMetrics represents memory metrics.
type MemoryMetrics struct {
	Total        int64   `json:"total"`
	Used         int64   `json:"used"`
	Available    int64   `json:"available"`
	UsagePercent float64 `json:"usage_percent"`
	SwapTotal    int64   `json:"swap_total"`
	SwapUsed     int64   `json:"swap_used"`
}

// StorageMetrics represents storage metrics.
type StorageMetrics struct {
	Total        int64   `json:"total"`
	Used         int64   `json:"used"`
	Available    int64   `json:"available"`
	UsagePercent float64 `json:"usage_percent"`
	IOPS         int64   `json:"iops"`
	Throughput   int64   `json:"throughput"`
}

// NetworkMetrics represents network metrics.
type NetworkMetrics struct {
	BytesIn        int64 `json:"bytes_in"`
	BytesOut       int64 `json:"bytes_out"`
	PacketsIn      int64 `json:"packets_in"`
	PacketsOut     int64 `json:"packets_out"`
	ConnectionsIn  int64 `json:"connections_in"`
	ConnectionsOut int64 `json:"connections_out"`
	ErrorsIn       int64 `json:"errors_in"`
	ErrorsOut      int64 `json:"errors_out"`
}

// ContainerMetrics represents metrics for containers.
type ContainerMetrics struct {
	ContainerID string         `json:"container_id"`
	Name        string         `json:"name"`
	State       string         `json:"state"`
	CPU         float64        `json:"cpu"`
	Memory      int64          `json:"memory"`
	Network     NetworkMetrics `json:"network"`
	StartTime   time.Time      `json:"start_time"`
}

// SystemMetrics represents system-level metrics.
type SystemMetrics struct {
	Uptime       time.Duration `json:"uptime"`
	Processes    int           `json:"processes"`
	OS           string        `json:"os"`
	Kernel       string        `json:"kernel"`
	Architecture string        `json:"architecture"`
}

// RequestMetrics represents HTTP/request metrics.
type RequestMetrics struct {
	Total      int64   `json:"total"`
	Success    int64   `json:"success"`
	Errors     int64   `json:"errors"`
	AvgLatency float64 `json:"avg_latency"`
	P95Latency float64 `json:"p95_latency"`
	P99Latency float64 `json:"p99_latency"`
	Throughput float64 `json:"throughput"`
}

// ErrorMetrics represents error metrics.
type ErrorMetrics struct {
	Total        int64            `json:"total"`
	ByType       map[string]int64 `json:"by_type"`
	ByStatusCode map[string]int64 `json:"by_status_code"`
	Rate         float64          `json:"rate"`
}

// PerformanceMetrics represents performance metrics.
type PerformanceMetrics struct {
	ResponseTime float64 `json:"response_time"`
	Throughput   float64 `json:"throughput"`
	Concurrency  int64   `json:"concurrency"`
	Saturation   float64 `json:"saturation"`
	Utilization  float64 `json:"utilization"`
}

// ResourceMetrics represents resource utilization metrics.
type ResourceMetrics struct {
	CPUUsage      float64 `json:"cpu_usage"`
	MemoryUsage   int64   `json:"memory_usage"`
	StorageUsage  int64   `json:"storage_usage"`
	NetworkUsage  int64   `json:"network_usage"`
	ResourceScore float64 `json:"resource_score"`
}

// HealthMetrics represents health metrics.
type HealthMetrics struct {
	Status       string        `json:"status"`
	LastCheck    time.Time     `json:"last_check"`
	CheckCount   int           `json:"check_count"`
	FailureCount int           `json:"failure_count"`
	Uptime       time.Duration `json:"uptime"`
}

// MetricsStorage defines the interface for metrics storage.
type MetricsStorage interface {
	StoreNodeMetrics(ctx context.Context, metrics *NodeMetrics) error
	StoreServiceMetrics(ctx context.Context, metrics *ServiceMetrics) error
	GetNodeMetrics(ctx context.Context, nodeID string, from, to time.Time) ([]*NodeMetrics, error)
	GetServiceMetrics(ctx context.Context, serviceID string, from, to time.Time) ([]*ServiceMetrics, error)
	GetAggregatedMetrics(ctx context.Context, query MetricsQuery) (*AggregatedMetrics, error)
}

// MetricsQuery represents a query for aggregated metrics.
type MetricsQuery struct {
	Type     string            `json:"type"`    // node, service, project
	ID       string            `json:"id"`      // node_id, service_id, project_id
	Metrics  []string          `json:"metrics"` // cpu, memory, network, etc.
	From     time.Time         `json:"from"`
	To       time.Time         `json:"to"`
	Interval time.Duration     `json:"interval"`
	GroupBy  []string          `json:"group_by"`
	Filters  map[string]string `json:"filters"`
}

// AggregatedMetrics represents aggregated metrics data.
type AggregatedMetrics struct {
	Query      MetricsQuery             `json:"query"`
	TimeSeries []TimeSeriesPoint        `json:"time_series"`
	Summary    map[string]MetricSummary `json:"summary"`
}

// TimeSeriesPoint represents a point in a time series.
type TimeSeriesPoint struct {
	Timestamp time.Time          `json:"timestamp"`
	Values    map[string]float64 `json:"values"`
}

// MetricSummary represents summary statistics for a metric.
type MetricSummary struct {
	Min   float64 `json:"min"`
	Max   float64 `json:"max"`
	Avg   float64 `json:"avg"`
	P50   float64 `json:"p50"`
	P95   float64 `json:"p95"`
	P99   float64 `json:"p99"`
	Count int64   `json:"count"`
}

// NewMetricsCollector creates a new metrics collector that reads nodes from
// scheduler and persists collected metrics to storage. The collection interval
// is defaultCollectInterval.
func NewMetricsCollector(scheduler *deployment.Scheduler, storage MetricsStorage) *MetricsCollector {
	return &MetricsCollector{
		nodes:           make(map[string]*NodeMetrics),
		services:        make(map[string]*ServiceMetrics),
		scheduler:       scheduler,
		collectInterval: defaultCollectInterval,
		storage:         storage,
	}
}

// Start runs the metrics collection loop, collecting once per collect interval
// until ctx is done. It blocks, and always returns ctx.Err().
//
// Collection errors are printed to standard output and do not stop the loop.
func (mc *MetricsCollector) Start(ctx context.Context) error {
	interval := mc.collectInterval
	if interval <= 0 {
		// time.NewTicker panics on a non-positive duration.
		interval = defaultCollectInterval
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			if err := mc.collectMetrics(ctx); err != nil {
				if ctx.Err() != nil {
					// Cancelled mid-collection: stop without logging noise.
					return ctx.Err()
				}
				fmt.Printf("Error collecting metrics: %v\n", err)
			}
		}
	}
}

// collectMetrics collects metrics from all nodes reported by the scheduler,
// caches the latest sample per node, and stores each sample.
//
// Per-node collection and storage failures are printed and skipped. The only
// error returned is ctx.Err() when ctx is done before all nodes are processed.
func (mc *MetricsCollector) collectMetrics(ctx context.Context) error {
	// Collect node metrics
	for _, node := range mc.scheduler.GetNodes() {
		if err := ctx.Err(); err != nil {
			return err
		}
		if node == nil {
			continue
		}

		metrics, err := mc.collectNodeMetrics(ctx, node)
		if err != nil {
			fmt.Printf("Error collecting metrics for node %s: %v\n", node.ID, err)
			continue
		}

		mc.mu.Lock()
		mc.nodes[node.ID] = metrics
		mc.mu.Unlock()

		// Store metrics
		if mc.storage == nil {
			continue
		}
		if err := mc.storage.StoreNodeMetrics(ctx, metrics); err != nil {
			fmt.Printf("Error storing node metrics: %v\n", err)
		}
	}

	// TODO: Collect service metrics
	// This would involve querying service instances and collecting their metrics
	return nil
}

// usagePercent returns used as a percentage of total. It returns 0 when total
// is not positive, so a node reporting zero capacity never yields NaN or Inf,
// values that encoding/json refuses to marshal.
func usagePercent(used, total int64) float64 {
	if total <= 0 {
		return 0
	}
	return float64(used) / float64(total) * 100
}

// collectNodeMetrics builds a metrics sample for a specific node.
//
// In a real implementation this would collect actual metrics from the node.
// For now the sample is derived from the node's reported usage and capacity,
// and the remaining values are simulated constants. It returns an error only
// when node is nil.
func (mc *MetricsCollector) collectNodeMetrics(ctx context.Context, node *deployment.Node) (*NodeMetrics, error) {
	if node == nil {
		return nil, fmt.Errorf("node is nil")
	}

	metrics := &NodeMetrics{
		NodeID:    node.ID,
		Timestamp: time.Now(),
		CPU: CPUMetrics{
			UsagePercent:  node.Usage.CPU,
			UsageCores:    node.Usage.CPU * float64(node.Capacity.CPU) / 100,
			LoadAverage1:  1.5,
			LoadAverage5:  1.8,
			LoadAverage15: 2.1,
		},
		Memory: MemoryMetrics{
			Total:        node.Capacity.Memory,
			Used:         node.Usage.Memory,
			Available:    node.Capacity.Memory - node.Usage.Memory,
			UsagePercent: usagePercent(node.Usage.Memory, node.Capacity.Memory),
			SwapTotal:    1024 * 1024 * 1024, // 1GB
			SwapUsed:     512 * 1024 * 1024,  // 512MB
		},
		Storage: StorageMetrics{
			Total:        node.Capacity.Storage,
			Used:         node.Usage.Storage,
			Available:    node.Capacity.Storage - node.Usage.Storage,
			UsagePercent: usagePercent(node.Usage.Storage, node.Capacity.Storage),
			IOPS:         1000,
			Throughput:   1024 * 1024 * 100, // 100MB/s
		},
		Network: NetworkMetrics{
			BytesIn:        node.Usage.Network,
			BytesOut:       node.Usage.Network,
			PacketsIn:      10000,
			PacketsOut:     8000,
			ConnectionsIn:  50,
			ConnectionsOut: 30,
			ErrorsIn:       0,
			ErrorsOut:      0,
		},
		// Non-nil even when empty so the JSON encoding is [] rather than null.
		Containers: make([]ContainerMetrics, 0, len(node.Containers)),
		System: SystemMetrics{
			Uptime:       time.Since(node.LastHeartbeat),
			Processes:    150,
			OS:           "linux",
			Kernel:       "5.15.0",
			Architecture: "x86_64",
		},
	}

	// Collect container metrics for this node
	for _, containerID := range node.Containers {
		metrics.Containers = append(metrics.Containers, mc.collectContainerMetrics(containerID))
	}

	return metrics, nil
}

// collectContainerMetrics returns simulated metrics for a specific container.
//
// In a real implementation this would query Docker/the container runtime. The
// container name uses at most the first shortContainerIDLen bytes of
// containerID, so IDs shorter than that are used whole.
func (mc *MetricsCollector) collectContainerMetrics(containerID string) ContainerMetrics {
	shortID := containerID
	if len(shortID) > shortContainerIDLen {
		shortID = shortID[:shortContainerIDLen]
	}

	return ContainerMetrics{
		ContainerID: containerID,
		Name:        fmt.Sprintf("container-%s", shortID),
		State:       "running",
		CPU:         25.5,
		Memory:      512 * 1024 * 1024, // 512MB
		Network: NetworkMetrics{
			BytesIn:    1024 * 1024 * 10, // 10MB
			BytesOut:   1024 * 1024 * 8,  // 8MB
			PacketsIn:  1000,
			PacketsOut: 800,
		},
		StartTime: time.Now().Add(-1 * time.Hour),
	}
}

// GetNodeMetrics returns the latest cached metrics for a node, or an error if
// none have been collected for nodeID.
func (mc *MetricsCollector) GetNodeMetrics(nodeID string) (*NodeMetrics, error) {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	metrics, exists := mc.nodes[nodeID]
	if !exists {
		return nil, fmt.Errorf("no metrics found for node: %s", nodeID)
	}
	return metrics, nil
}

// GetAllNodeMetrics returns the latest cached metrics for all nodes, keyed by
// node ID.
//
// The returned map is a new map, so callers may modify it freely. The copy is
// shallow: the *NodeMetrics values are shared with the collector.
func (mc *MetricsCollector) GetAllNodeMetrics() map[string]*NodeMetrics {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	result := make(map[string]*NodeMetrics, len(mc.nodes))
	for id, metrics := range mc.nodes {
		result[id] = metrics
	}
	return result
}

// GetServiceMetrics returns the latest cached metrics for a service, or an
// error if none are available for serviceID.
func (mc *MetricsCollector) GetServiceMetrics(serviceID string) (*ServiceMetrics, error) {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	metrics, exists := mc.services[serviceID]
	if !exists {
		return nil, fmt.Errorf("no metrics found for service: %s", serviceID)
	}
	return metrics, nil
}

// GetAggregatedMetrics returns aggregated metrics for query by delegating to
// the configured storage. It returns an error if no storage is configured.
func (mc *MetricsCollector) GetAggregatedMetrics(ctx context.Context, query MetricsQuery) (*AggregatedMetrics, error) {
	if mc.storage == nil {
		return nil, fmt.Errorf("metrics storage is not configured")
	}
	return mc.storage.GetAggregatedMetrics(ctx, query)
}

// GetMetricsSummary returns a summary of the cached metrics.
//
// The result contains the keys total_nodes, healthy_nodes, total_services,
// avg_cpu_usage, total_memory, collect_interval, and last_collection. A node
// counts as healthy when its CPU usage is below healthyCPUThresholdPercent.
// last_collection is the RFC 3339 timestamp of the newest cached node sample,
// or the empty string when no node metrics have been collected yet.
func (mc *MetricsCollector) GetMetricsSummary() map[string]interface{} {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	var (
		healthyNodes   int
		totalCPU       float64
		totalMemory    int64
		lastCollection time.Time
	)
	for _, metrics := range mc.nodes {
		if metrics.CPU.UsagePercent < healthyCPUThresholdPercent {
			healthyNodes++
		}
		totalCPU += metrics.CPU.UsagePercent
		totalMemory += metrics.Memory.Used
		if metrics.Timestamp.After(lastCollection) {
			lastCollection = metrics.Timestamp
		}
	}

	totalNodes := len(mc.nodes)
	var avgCPU float64
	if totalNodes > 0 {
		avgCPU = totalCPU / float64(totalNodes)
	}

	var lastCollectionText string
	if !lastCollection.IsZero() {
		lastCollectionText = lastCollection.Format(time.RFC3339)
	}

	return map[string]interface{}{
		"total_nodes":      totalNodes,
		"healthy_nodes":    healthyNodes,
		"total_services":   len(mc.services),
		"avg_cpu_usage":    avgCPU,
		"total_memory":     totalMemory,
		"collect_interval": mc.collectInterval.String(),
		"last_collection":  lastCollectionText,
	}
}

// ExportMetrics exports the cached metrics in the given format.
//
// Supported formats are "json" (indented JSON with nodes, services, and the
// export timestamp) and "prometheus" (Prometheus text format for node
// metrics). Any other format yields an error.
func (mc *MetricsCollector) ExportMetrics(format string) ([]byte, error) {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	switch format {
	case "json":
		data := map[string]interface{}{
			"nodes":     mc.nodes,
			"services":  mc.services,
			"timestamp": time.Now(),
		}
		return json.MarshalIndent(data, "", " ")
	case "prometheus":
		return mc.exportPrometheusFormat()
	default:
		return nil, fmt.Errorf("unsupported export format: %s", format)
	}
}

// exportPrometheusFormat renders the cached node metrics in the Prometheus
// text exposition format. The caller must hold mc.mu.
//
// Each metric family is written once (HELP, TYPE, then one sample per node),
// nodes are emitted in sorted ID order so the output is deterministic, label
// values are escaped, and the output ends with a newline. With no cached
// nodes the result is empty.
func (mc *MetricsCollector) exportPrometheusFormat() ([]byte, error) {
	if len(mc.nodes) == 0 {
		return []byte{}, nil
	}

	nodeIDs := make([]string, 0, len(mc.nodes))
	for nodeID := range mc.nodes {
		nodeIDs = append(nodeIDs, nodeID)
	}
	sort.Strings(nodeIDs)

	var out strings.Builder

	// writeFamily emits the header for one metric family followed by one
	// sample line per node, using value to render the sample.
	writeFamily := func(name, help, kind string, value func(*NodeMetrics) string) {
		fmt.Fprintf(&out, "# HELP %s %s\n", name, help)
		fmt.Fprintf(&out, "# TYPE %s %s\n", name, kind)
		for _, nodeID := range nodeIDs {
			fmt.Fprintf(&out, "%s{node=\"%s\"} %s\n",
				name, promLabelEscaper.Replace(nodeID), value(mc.nodes[nodeID]))
		}
	}

	// Node CPU metrics
	writeFamily("node_cpu_usage_percent", "CPU usage percentage for node", "gauge",
		func(m *NodeMetrics) string { return fmt.Sprintf("%f", m.CPU.UsagePercent) })

	// Node memory metrics
	writeFamily("node_memory_usage_bytes", "Memory usage in bytes for node", "gauge",
		func(m *NodeMetrics) string { return fmt.Sprintf("%d", m.Memory.Used) })

	// Node network metrics
	writeFamily("node_network_bytes_in", "Total bytes received for node", "counter",
		func(m *NodeMetrics) string { return fmt.Sprintf("%d", m.Network.BytesIn) })

	return []byte(out.String()), nil
}