Files
Containr/internal/metrics/collector.go
T

474 lines
15 KiB
Go

package metrics
import (
"context"
"encoding/json"
"fmt"
"strings"
"sync"
"time"
"containr/internal/deployment"
)
// MetricsCollector periodically samples node metrics via the scheduler, keeps
// the latest snapshot per node in memory, and forwards every snapshot to a
// MetricsStorage backend.
type MetricsCollector struct {
// nodes holds the most recent snapshot per node ID; collectMetrics replaces entries.
nodes map[string]*NodeMetrics
// services holds the most recent snapshot per service ID.
// NOTE(review): nothing in the code visible here writes to this map yet
// (service collection is still a TODO in collectMetrics).
services map[string]*ServiceMetrics
// scheduler supplies the list of nodes to sample (GetNodes).
scheduler *deployment.Scheduler
// mu guards nodes and services.
mu sync.RWMutex
// collectInterval is the ticker period used by Start; NewMetricsCollector sets it to 30s.
collectInterval time.Duration
// storage receives each collected node snapshot and answers aggregated queries.
storage MetricsStorage
}
// NodeMetrics is one point-in-time snapshot of a single node. It is serialized
// to JSON using the field tags below.
type NodeMetrics struct {
NodeID string `json:"node_id"`
// Timestamp is the moment the snapshot was built (time.Now() in collectNodeMetrics).
Timestamp time.Time `json:"timestamp"`
CPU CPUMetrics `json:"cpu"`
Memory MemoryMetrics `json:"memory"`
Storage StorageMetrics `json:"storage"`
Network NetworkMetrics `json:"network"`
// Containers has one entry per container ID listed on the node.
Containers []ContainerMetrics `json:"containers"`
System SystemMetrics `json:"system"`
}
// ServiceMetrics is one snapshot of a service: its identity, its instances,
// and request, error, performance and resource figures.
// NOTE(review): no code visible in this file populates this type yet.
type ServiceMetrics struct {
ServiceID string `json:"service_id"`
ServiceName string `json:"service_name"`
ProjectID string `json:"project_id"`
Timestamp time.Time `json:"timestamp"`
// Instances carries per-instance figures for the service.
Instances []InstanceMetrics `json:"instances"`
Requests RequestMetrics `json:"requests"`
Errors ErrorMetrics `json:"errors"`
Performance PerformanceMetrics `json:"performance"`
Resources ResourceMetrics `json:"resources"`
}
// InstanceMetrics describes a single running instance of a service, including
// the node it is placed on (NodeID) and its health record.
type InstanceMetrics struct {
InstanceID string `json:"instance_id"`
NodeID string `json:"node_id"`
// Status is a free-form string; the set of valid values is not defined in this file.
Status string `json:"status"`
CPU float64 `json:"cpu"` // CPU usage percentage
Memory int64 `json:"memory"` // Memory usage in bytes
Network NetworkMetrics `json:"network"`
StartTime time.Time `json:"start_time"`
LastSeen time.Time `json:"last_seen"`
Health HealthMetrics `json:"health"`
}
// CPUMetrics holds CPU utilisation and load-average figures for a node.
type CPUMetrics struct {
// UsagePercent is copied from the node's reported CPU usage.
UsagePercent float64 `json:"usage_percent"`
// UsageCores is UsagePercent scaled by the node's CPU capacity (percent * capacity / 100).
UsageCores float64 `json:"usage_cores"`
// The load averages are currently hard-coded placeholders in collectNodeMetrics.
LoadAverage1 float64 `json:"load_average_1"`
LoadAverage5 float64 `json:"load_average_5"`
LoadAverage15 float64 `json:"load_average_15"`
}
// MemoryMetrics holds memory and swap figures for a node.
// Units are presumably bytes (the swap placeholders are byte counts) — confirm
// against the deployment package.
type MemoryMetrics struct {
Total int64 `json:"total"`
Used int64 `json:"used"`
// Available is computed as Total - Used.
Available int64 `json:"available"`
// UsagePercent is computed as Used / Total * 100.
UsagePercent float64 `json:"usage_percent"`
// SwapTotal and SwapUsed are currently hard-coded placeholders (1GB / 512MB).
SwapTotal int64 `json:"swap_total"`
SwapUsed int64 `json:"swap_used"`
}
// StorageMetrics holds disk capacity and I/O figures for a node.
// Capacity units are presumably bytes — confirm against the deployment package.
type StorageMetrics struct {
Total int64 `json:"total"`
Used int64 `json:"used"`
// Available is computed as Total - Used.
Available int64 `json:"available"`
// UsagePercent is computed as Used / Total * 100.
UsagePercent float64 `json:"usage_percent"`
// IOPS and Throughput are currently hard-coded placeholders (1000 and 100MB/s).
IOPS int64 `json:"iops"`
Throughput int64 `json:"throughput"`
}
// NetworkMetrics holds traffic, connection and error counts split by direction.
// It is reused for nodes, containers and service instances.
// NOTE(review): BytesIn is exported to Prometheus as a counter, which implies
// cumulative totals rather than rates — confirm with the data source.
type NetworkMetrics struct {
BytesIn int64 `json:"bytes_in"`
BytesOut int64 `json:"bytes_out"`
PacketsIn int64 `json:"packets_in"`
PacketsOut int64 `json:"packets_out"`
ConnectionsIn int64 `json:"connections_in"`
ConnectionsOut int64 `json:"connections_out"`
ErrorsIn int64 `json:"errors_in"`
ErrorsOut int64 `json:"errors_out"`
}
// ContainerMetrics describes one container running on a node.
// collectContainerMetrics currently fills every field except ContainerID and
// Name with fixed placeholder values.
type ContainerMetrics struct {
ContainerID string `json:"container_id"`
// Name is derived from the container ID ("container-" plus an ID prefix).
Name string `json:"name"`
State string `json:"state"`
CPU float64 `json:"cpu"`
Memory int64 `json:"memory"`
Network NetworkMetrics `json:"network"`
StartTime time.Time `json:"start_time"`
}
// SystemMetrics holds host-level facts about a node.
type SystemMetrics struct {
// Uptime is a time.Duration, so encoding/json writes it as an integer number of nanoseconds.
// NOTE(review): collectNodeMetrics fills this with the time elapsed since the
// node's last heartbeat, which may not be the host's actual uptime — confirm.
Uptime time.Duration `json:"uptime"`
// Processes, OS, Kernel and Architecture are currently hard-coded placeholders.
Processes int `json:"processes"`
OS string `json:"os"`
Kernel string `json:"kernel"`
Architecture string `json:"architecture"`
}
// RequestMetrics holds request counts, latency figures and throughput for a service.
// NOTE(review): latency and throughput units are not established anywhere in
// this file — confirm with the producer before displaying them.
type RequestMetrics struct {
Total int64 `json:"total"`
Success int64 `json:"success"`
Errors int64 `json:"errors"`
AvgLatency float64 `json:"avg_latency"`
P95Latency float64 `json:"p95_latency"`
P99Latency float64 `json:"p99_latency"`
Throughput float64 `json:"throughput"`
}
// ErrorMetrics holds an error total plus breakdowns keyed by error type and by
// status code (both keyed by string).
type ErrorMetrics struct {
Total int64 `json:"total"`
// ByType and ByStatusCode are maps; a nil map is marshalled as JSON null.
ByType map[string]int64 `json:"by_type"`
ByStatusCode map[string]int64 `json:"by_status_code"`
// Rate's denominator (per second, per request, ...) is not defined in this file.
Rate float64 `json:"rate"`
}
// PerformanceMetrics holds service-level performance indicators.
// NOTE(review): units and value ranges are not established in this file.
type PerformanceMetrics struct {
ResponseTime float64 `json:"response_time"`
Throughput float64 `json:"throughput"`
Concurrency int64 `json:"concurrency"`
Saturation float64 `json:"saturation"`
Utilization float64 `json:"utilization"`
}
// ResourceMetrics holds the resource consumption attributed to a service.
// NOTE(review): how ResourceScore is derived is not shown in this file.
type ResourceMetrics struct {
CPUUsage float64 `json:"cpu_usage"`
MemoryUsage int64 `json:"memory_usage"`
StorageUsage int64 `json:"storage_usage"`
NetworkUsage int64 `json:"network_usage"`
ResourceScore float64 `json:"resource_score"`
}
// HealthMetrics records the health-check history of a service instance.
type HealthMetrics struct {
// Status is a free-form string; valid values are not defined in this file.
Status string `json:"status"`
LastCheck time.Time `json:"last_check"`
CheckCount int `json:"check_count"`
FailureCount int `json:"failure_count"`
// Uptime is a time.Duration, so encoding/json writes it as an integer number of nanoseconds.
Uptime time.Duration `json:"uptime"`
}
// MetricsStorage is the persistence backend used by MetricsCollector. The
// collector calls StoreNodeMetrics after every node sample and delegates
// GetAggregatedMetrics to it; the remaining methods are not called from the
// code visible in this file.
type MetricsStorage interface {
// StoreNodeMetrics persists one node snapshot.
StoreNodeMetrics(ctx context.Context, metrics *NodeMetrics) error
// StoreServiceMetrics persists one service snapshot.
StoreServiceMetrics(ctx context.Context, metrics *ServiceMetrics) error
// GetNodeMetrics returns the stored snapshots for nodeID between from and to.
// NOTE(review): whether the bounds are inclusive is up to the implementation.
GetNodeMetrics(ctx context.Context, nodeID string, from, to time.Time) ([]*NodeMetrics, error)
// GetServiceMetrics returns the stored snapshots for serviceID between from and to.
GetServiceMetrics(ctx context.Context, serviceID string, from, to time.Time) ([]*ServiceMetrics, error)
// GetAggregatedMetrics evaluates query and returns its time series and summaries.
GetAggregatedMetrics(ctx context.Context, query MetricsQuery) (*AggregatedMetrics, error)
}
// MetricsQuery describes an aggregation request passed to
// MetricsStorage.GetAggregatedMetrics. This file only forwards it; how each
// field is interpreted is decided by the storage implementation.
type MetricsQuery struct {
Type string `json:"type"` // node, service, project
ID string `json:"id"` // node_id, service_id, project_id
Metrics []string `json:"metrics"` // cpu, memory, network, etc.
// From and To bound the queried time range.
From time.Time `json:"from"`
To time.Time `json:"to"`
// Interval is a time.Duration, so encoding/json reads and writes it as integer nanoseconds.
Interval time.Duration `json:"interval"`
GroupBy []string `json:"group_by"`
Filters map[string]string `json:"filters"`
}
// AggregatedMetrics is the result of a MetricsQuery: the query itself, the
// resulting time series, and summary statistics keyed by string.
type AggregatedMetrics struct {
Query MetricsQuery `json:"query"`
TimeSeries []TimeSeriesPoint `json:"time_series"`
// Summary is keyed by string — presumably the metric name; confirm with the storage implementation.
Summary map[string]MetricSummary `json:"summary"`
}
// TimeSeriesPoint is a single sample in an aggregated time series: a timestamp
// plus a set of named float values.
type TimeSeriesPoint struct {
Timestamp time.Time `json:"timestamp"`
// Values is keyed by string — presumably the metric name; confirm with the storage implementation.
Values map[string]float64 `json:"values"`
}
// MetricSummary holds summary statistics for one metric over a query's range.
// P50, P95 and P99 are presumably percentiles, and Count the number of samples —
// the computation lives in the storage implementation, not in this file.
type MetricSummary struct {
Min float64 `json:"min"`
Max float64 `json:"max"`
Avg float64 `json:"avg"`
P50 float64 `json:"p50"`
P95 float64 `json:"p95"`
P99 float64 `json:"p99"`
Count int64 `json:"count"`
}
// NewMetricsCollector builds a collector that samples the nodes known to
// scheduler and writes every snapshot to storage. The in-memory maps start
// empty and the collection interval is fixed at 30 seconds. Neither argument
// is validated here.
func NewMetricsCollector(scheduler *deployment.Scheduler, storage MetricsStorage) *MetricsCollector {
	mc := new(MetricsCollector)
	mc.scheduler = scheduler
	mc.storage = storage
	mc.collectInterval = 30 * time.Second
	mc.nodes = map[string]*NodeMetrics{}
	mc.services = map[string]*ServiceMetrics{}
	return mc
}
// Start runs the collection loop and blocks until ctx is cancelled, at which
// point it returns ctx.Err(). One collection pass runs per tick of
// collectInterval; the first pass happens after the first tick, not
// immediately. A failed pass is printed to stdout and does not stop the loop.
func (mc *MetricsCollector) Start(ctx context.Context) error {
	tick := time.NewTicker(mc.collectInterval)
	defer tick.Stop()

	done := ctx.Done()
	for {
		select {
		case <-tick.C:
			err := mc.collectMetrics(ctx)
			if err == nil {
				continue
			}
			fmt.Printf("Error collecting metrics: %v\n", err)
		case <-done:
			return ctx.Err()
		}
	}
}
// collectMetrics performs one collection pass over every node reported by the
// scheduler. For each node it builds a snapshot, publishes it as the node's
// latest entry in mc.nodes (under the write lock), and hands it to storage.
// Per-node failures are printed and skipped, so the pass itself always
// returns nil.
func (mc *MetricsCollector) collectMetrics(ctx context.Context) error {
	for _, n := range mc.scheduler.GetNodes() {
		snapshot, collectErr := mc.collectNodeMetrics(ctx, n)
		if collectErr != nil {
			fmt.Printf("Error collecting metrics for node %s: %v\n", n.ID, collectErr)
			continue
		}

		// Hold the lock only for the map write, not for the storage call.
		mc.mu.Lock()
		mc.nodes[n.ID] = snapshot
		mc.mu.Unlock()

		// A storage failure is reported but does not undo the in-memory update.
		storeErr := mc.storage.StoreNodeMetrics(ctx, snapshot)
		if storeErr != nil {
			fmt.Printf("Error storing node metrics: %v\n", storeErr)
		}
	}

	// TODO: Collect service metrics
	// This would involve querying service instances and collecting their metrics
	return nil
}
// collectNodeMetrics builds a NodeMetrics snapshot for node.
//
// Only the usage/capacity figures come from the node itself; load averages,
// swap, IOPS, throughput, packet/connection counts and the system facts are
// fixed placeholder values until real collection is implemented. ctx is
// accepted for that future implementation and is not used yet.
//
// It returns an error when node is nil. Usage percentages are reported as 0
// when the corresponding capacity is not positive, so the snapshot never
// contains NaN or Inf (which encoding/json refuses to marshal).
func (mc *MetricsCollector) collectNodeMetrics(ctx context.Context, node *deployment.Node) (*NodeMetrics, error) {
	if node == nil {
		return nil, fmt.Errorf("cannot collect metrics: node is nil")
	}

	// percentOf returns used/total as a percentage, or 0 when total is not
	// positive. An unguarded division would yield NaN (0/0) or +Inf (x/0) for a
	// node that reports no capacity, and a single such value makes the JSON
	// export of all metrics fail.
	percentOf := func(used, total int64) float64 {
		if total <= 0 {
			return 0
		}
		return float64(used) / float64(total) * 100
	}

	now := time.Now()
	metrics := &NodeMetrics{
		NodeID:    node.ID,
		Timestamp: now,
		CPU: CPUMetrics{
			UsagePercent:  node.Usage.CPU,
			UsageCores:    node.Usage.CPU * float64(node.Capacity.CPU) / 100,
			LoadAverage1:  1.5,
			LoadAverage5:  1.8,
			LoadAverage15: 2.1,
		},
		Memory: MemoryMetrics{
			Total:        node.Capacity.Memory,
			Used:         node.Usage.Memory,
			Available:    node.Capacity.Memory - node.Usage.Memory,
			UsagePercent: percentOf(node.Usage.Memory, node.Capacity.Memory),
			SwapTotal:    1024 * 1024 * 1024, // 1GB
			SwapUsed:     512 * 1024 * 1024,  // 512MB
		},
		Storage: StorageMetrics{
			Total:        node.Capacity.Storage,
			Used:         node.Usage.Storage,
			Available:    node.Capacity.Storage - node.Usage.Storage,
			UsagePercent: percentOf(node.Usage.Storage, node.Capacity.Storage),
			IOPS:         1000,
			Throughput:   1024 * 1024 * 100, // 100MB/s
		},
		Network: NetworkMetrics{
			// The node reports a single network figure; it is used for both directions.
			BytesIn:        node.Usage.Network,
			BytesOut:       node.Usage.Network,
			PacketsIn:      10000,
			PacketsOut:     8000,
			ConnectionsIn:  50,
			ConnectionsOut: 30,
			ErrorsIn:       0,
			ErrorsOut:      0,
		},
		// Non-nil even when empty so the JSON field is [] rather than null.
		Containers: make([]ContainerMetrics, 0, len(node.Containers)),
		System: SystemMetrics{
			// NOTE(review): this is the time since the last heartbeat, not the
			// host's uptime — confirm which one consumers expect.
			Uptime:       time.Since(node.LastHeartbeat),
			Processes:    150,
			OS:           "linux",
			Kernel:       "5.15.0",
			Architecture: "x86_64",
		},
	}

	// One ContainerMetrics entry per container ID listed on the node.
	for _, containerID := range node.Containers {
		metrics.Containers = append(metrics.Containers, mc.collectContainerMetrics(containerID))
	}
	return metrics, nil
}
// collectContainerMetrics returns the metrics entry for containerID.
//
// Every figure is a fixed placeholder until the container runtime is actually
// queried; only ContainerID and Name depend on the argument. Name is
// "container-" followed by at most the first 8 bytes of the ID, so IDs shorter
// than 8 bytes (including the empty string) are used whole instead of causing
// a slice-bounds panic.
func (mc *MetricsCollector) collectContainerMetrics(containerID string) ContainerMetrics {
	const shortIDLen = 8

	shortID := containerID
	if len(shortID) > shortIDLen {
		shortID = shortID[:shortIDLen]
	}

	// In a real implementation, this would query Docker/container runtime
	return ContainerMetrics{
		ContainerID: containerID,
		Name:        fmt.Sprintf("container-%s", shortID),
		State:       "running",
		CPU:         25.5,
		Memory:      512 * 1024 * 1024, // 512MB
		Network: NetworkMetrics{
			BytesIn:    1024 * 1024 * 10, // 10MB
			BytesOut:   1024 * 1024 * 8,  // 8MB
			PacketsIn:  1000,
			PacketsOut: 800,
		},
		StartTime: time.Now().Add(-1 * time.Hour),
	}
}
// GetNodeMetrics looks up the most recent snapshot collected for nodeID.
// It returns an error when no snapshot exists for that node. The returned
// pointer is the collector's own stored value, not a copy.
func (mc *MetricsCollector) GetNodeMetrics(nodeID string) (*NodeMetrics, error) {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	if snapshot, ok := mc.nodes[nodeID]; ok {
		return snapshot, nil
	}
	return nil, fmt.Errorf("no metrics found for node: %s", nodeID)
}
// GetAllNodeMetrics returns the latest snapshot of every node, keyed by node ID.
// The map is a fresh copy, so callers can iterate it without holding the
// collector's lock; the *NodeMetrics values are shared with the collector.
func (mc *MetricsCollector) GetAllNodeMetrics() map[string]*NodeMetrics {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	snapshot := make(map[string]*NodeMetrics, len(mc.nodes))
	for nodeID := range mc.nodes {
		snapshot[nodeID] = mc.nodes[nodeID]
	}
	return snapshot
}
// GetServiceMetrics looks up the most recent snapshot stored for serviceID.
// It returns an error when the service has no entry. The returned pointer is
// the collector's own stored value, not a copy.
func (mc *MetricsCollector) GetServiceMetrics(serviceID string) (*ServiceMetrics, error) {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	if snapshot, ok := mc.services[serviceID]; ok {
		return snapshot, nil
	}
	return nil, fmt.Errorf("no metrics found for service: %s", serviceID)
}
// GetAggregatedMetrics answers query by delegating to the storage backend.
// The collector's in-memory snapshots are not consulted, and the backend's
// result and error are returned unchanged.
func (mc *MetricsCollector) GetAggregatedMetrics(ctx context.Context, query MetricsQuery) (*AggregatedMetrics, error) {
	backend := mc.storage
	return backend.GetAggregatedMetrics(ctx, query)
}
// GetMetricsSummary returns a cluster-wide overview computed from the latest
// in-memory node snapshots. The result contains these keys:
//
//   - "total_nodes" (int): number of nodes with a snapshot.
//   - "healthy_nodes" (int): nodes whose CPU usage is below 80%.
//   - "total_services" (int): number of services with a snapshot.
//   - "avg_cpu_usage" (float64): mean CPU usage percent, 0 when there are no nodes.
//   - "total_memory" (int64): sum of the memory *used* across nodes.
//   - "collect_interval" (string): the collection period, e.g. "30s".
//   - "last_collection" (string): RFC 3339 timestamp of the newest node
//     snapshot, or "" when nothing has been collected yet.
func (mc *MetricsCollector) GetMetricsSummary() map[string]interface{} {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	totalNodes := len(mc.nodes)
	totalServices := len(mc.services)

	var (
		healthyNodes   int
		totalCPU       float64
		totalMemory    int64
		lastCollection time.Time
	)
	for _, metrics := range mc.nodes {
		if metrics.CPU.UsagePercent < 80 {
			healthyNodes++
		}
		totalCPU += metrics.CPU.UsagePercent
		totalMemory += metrics.Memory.Used
		// Track the newest snapshot so the summary reports when data was
		// actually collected rather than when the summary was requested.
		if metrics.Timestamp.After(lastCollection) {
			lastCollection = metrics.Timestamp
		}
	}

	avgCPU := float64(0)
	if totalNodes > 0 {
		avgCPU = totalCPU / float64(totalNodes)
	}

	lastCollectionText := ""
	if !lastCollection.IsZero() {
		lastCollectionText = lastCollection.Format(time.RFC3339)
	}

	return map[string]interface{}{
		"total_nodes":      totalNodes,
		"healthy_nodes":    healthyNodes,
		"total_services":   totalServices,
		"avg_cpu_usage":    avgCPU,
		"total_memory":     totalMemory,
		"collect_interval": mc.collectInterval.String(),
		"last_collection":  lastCollectionText,
	}
}
// ExportMetrics serializes the collector's current in-memory metrics.
//
// Supported values of format:
//   - "json": an indented JSON object with "nodes", "services" and "timestamp"
//     (the time of the export).
//   - "prometheus": the output of exportPrometheusFormat.
//
// Any other format yields an error. The read lock is held for the whole call,
// so the export sees a consistent view of both maps.
func (mc *MetricsCollector) ExportMetrics(format string) ([]byte, error) {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	if format == "prometheus" {
		return mc.exportPrometheusFormat()
	}
	if format != "json" {
		return nil, fmt.Errorf("unsupported export format: %s", format)
	}

	payload := map[string]interface{}{
		"nodes":     mc.nodes,
		"services":  mc.services,
		"timestamp": time.Now(),
	}
	return json.MarshalIndent(payload, "", " ")
}
// exportPrometheusFormat renders the latest node snapshots in the Prometheus
// text exposition format. The caller must hold mc.mu (ExportMetrics does).
//
// Three metric families are written: node_cpu_usage_percent (gauge),
// node_memory_usage_bytes (gauge) and node_network_bytes_in (counter), each
// with one sample per node labelled node="<id>". Each family gets exactly one
// HELP/TYPE header followed by all of its samples, label values are escaped,
// and the output ends with a newline, as the format requires. With no nodes
// the result is empty. Node order follows map iteration and is therefore not
// stable between calls, but it is the same for all families within one export.
// The returned error is always nil.
func (mc *MetricsCollector) exportPrometheusFormat() ([]byte, error) {
	if len(mc.nodes) == 0 {
		return []byte{}, nil
	}

	// Label values must escape backslash, double quote and line feed.
	labelEscaper := strings.NewReplacer(`\`, `\\`, `"`, `\"`, "\n", `\n`)

	// Snapshot the IDs once so every family lists the nodes in the same order.
	nodeIDs := make([]string, 0, len(mc.nodes))
	labels := make([]string, 0, len(mc.nodes))
	for nodeID := range mc.nodes {
		nodeIDs = append(nodeIDs, nodeID)
		labels = append(labels, labelEscaper.Replace(nodeID))
	}

	var out strings.Builder
	writeHeader := func(name, help, kind string) {
		fmt.Fprintf(&out, "# HELP %s %s\n# TYPE %s %s\n", name, help, name, kind)
	}

	// Node CPU metrics
	writeHeader("node_cpu_usage_percent", "CPU usage percentage for node", "gauge")
	for i, nodeID := range nodeIDs {
		fmt.Fprintf(&out, "node_cpu_usage_percent{node=\"%s\"} %f\n", labels[i], mc.nodes[nodeID].CPU.UsagePercent)
	}

	// Node memory metrics
	writeHeader("node_memory_usage_bytes", "Memory usage in bytes for node", "gauge")
	for i, nodeID := range nodeIDs {
		fmt.Fprintf(&out, "node_memory_usage_bytes{node=\"%s\"} %d\n", labels[i], mc.nodes[nodeID].Memory.Used)
	}

	// Node network metrics
	writeHeader("node_network_bytes_in", "Total bytes received for node", "counter")
	for i, nodeID := range nodeIDs {
		fmt.Fprintf(&out, "node_network_bytes_in{node=\"%s\"} %d\n", labels[i], mc.nodes[nodeID].Network.BytesIn)
	}

	return []byte(out.String()), nil
}