mirror of
https://github.com/Dvorinka/Containr.git
synced 2026-06-03 20:12:58 +00:00
474 lines
15 KiB
Go
474 lines
15 KiB
Go
package metrics
|
|
|
|
import (
	"context"
	"encoding/json"
	"fmt"
	"sort"
	"strings"
	"sync"
	"time"

	"containr/internal/deployment"
)
|
|
|
|
// MetricsCollector collects and aggregates metrics from nodes and services
|
|
type MetricsCollector struct {
|
|
nodes map[string]*NodeMetrics
|
|
services map[string]*ServiceMetrics
|
|
scheduler *deployment.Scheduler
|
|
mu sync.RWMutex
|
|
collectInterval time.Duration
|
|
storage MetricsStorage
|
|
}
|
|
|
|
// NodeMetrics represents metrics for a node
|
|
type NodeMetrics struct {
|
|
NodeID string `json:"node_id"`
|
|
Timestamp time.Time `json:"timestamp"`
|
|
CPU CPUMetrics `json:"cpu"`
|
|
Memory MemoryMetrics `json:"memory"`
|
|
Storage StorageMetrics `json:"storage"`
|
|
Network NetworkMetrics `json:"network"`
|
|
Containers []ContainerMetrics `json:"containers"`
|
|
System SystemMetrics `json:"system"`
|
|
}
|
|
|
|
// ServiceMetrics represents metrics for a service
|
|
type ServiceMetrics struct {
|
|
ServiceID string `json:"service_id"`
|
|
ServiceName string `json:"service_name"`
|
|
ProjectID string `json:"project_id"`
|
|
Timestamp time.Time `json:"timestamp"`
|
|
Instances []InstanceMetrics `json:"instances"`
|
|
Requests RequestMetrics `json:"requests"`
|
|
Errors ErrorMetrics `json:"errors"`
|
|
Performance PerformanceMetrics `json:"performance"`
|
|
Resources ResourceMetrics `json:"resources"`
|
|
}
|
|
|
|
// InstanceMetrics represents metrics for a service instance
|
|
type InstanceMetrics struct {
|
|
InstanceID string `json:"instance_id"`
|
|
NodeID string `json:"node_id"`
|
|
Status string `json:"status"`
|
|
CPU float64 `json:"cpu"` // CPU usage percentage
|
|
Memory int64 `json:"memory"` // Memory usage in bytes
|
|
Network NetworkMetrics `json:"network"`
|
|
StartTime time.Time `json:"start_time"`
|
|
LastSeen time.Time `json:"last_seen"`
|
|
Health HealthMetrics `json:"health"`
|
|
}
|
|
|
|
// CPUMetrics holds CPU usage, expressed both as a percentage and as a number
// of cores, together with the 1, 5 and 15 minute load averages.
type CPUMetrics struct {
	UsagePercent  float64 `json:"usage_percent"`
	UsageCores    float64 `json:"usage_cores"`
	LoadAverage1  float64 `json:"load_average_1"`
	LoadAverage5  float64 `json:"load_average_5"`
	LoadAverage15 float64 `json:"load_average_15"`
}
|
|
|
|
// MemoryMetrics holds memory totals and usage, the usage expressed as a
// percentage, and swap totals.
type MemoryMetrics struct {
	Total        int64   `json:"total"`
	Used         int64   `json:"used"`
	Available    int64   `json:"available"`
	UsagePercent float64 `json:"usage_percent"`
	SwapTotal    int64   `json:"swap_total"`
	SwapUsed     int64   `json:"swap_used"`
}
|
|
|
|
// StorageMetrics holds storage totals and usage, the usage expressed as a
// percentage, and I/O rate figures (IOPS and throughput).
type StorageMetrics struct {
	Total        int64   `json:"total"`
	Used         int64   `json:"used"`
	Available    int64   `json:"available"`
	UsagePercent float64 `json:"usage_percent"`
	IOPS         int64   `json:"iops"`
	Throughput   int64   `json:"throughput"`
}
|
|
|
|
// NetworkMetrics holds inbound and outbound counters for bytes, packets,
// connections and errors.
type NetworkMetrics struct {
	BytesIn        int64 `json:"bytes_in"`
	BytesOut       int64 `json:"bytes_out"`
	PacketsIn      int64 `json:"packets_in"`
	PacketsOut     int64 `json:"packets_out"`
	ConnectionsIn  int64 `json:"connections_in"`
	ConnectionsOut int64 `json:"connections_out"`
	ErrorsIn       int64 `json:"errors_in"`
	ErrorsOut      int64 `json:"errors_out"`
}
|
|
|
|
// ContainerMetrics represents metrics for containers
|
|
type ContainerMetrics struct {
|
|
ContainerID string `json:"container_id"`
|
|
Name string `json:"name"`
|
|
State string `json:"state"`
|
|
CPU float64 `json:"cpu"`
|
|
Memory int64 `json:"memory"`
|
|
Network NetworkMetrics `json:"network"`
|
|
StartTime time.Time `json:"start_time"`
|
|
}
|
|
|
|
// SystemMetrics holds host-level information: uptime, process count, and the
// operating system, kernel and architecture identifiers.
type SystemMetrics struct {
	Uptime       time.Duration `json:"uptime"`
	Processes    int           `json:"processes"`
	OS           string        `json:"os"`
	Kernel       string        `json:"kernel"`
	Architecture string        `json:"architecture"`
}
|
|
|
|
// RequestMetrics holds request counters (total, successful, failed) together
// with latency figures (average, 95th and 99th percentile) and throughput.
type RequestMetrics struct {
	Total      int64   `json:"total"`
	Success    int64   `json:"success"`
	Errors     int64   `json:"errors"`
	AvgLatency float64 `json:"avg_latency"`
	P95Latency float64 `json:"p95_latency"`
	P99Latency float64 `json:"p99_latency"`
	Throughput float64 `json:"throughput"`
}
|
|
|
|
// ErrorMetrics holds the total error count, breakdowns keyed by error type
// and by status code, and an error rate.
type ErrorMetrics struct {
	Total        int64            `json:"total"`
	ByType       map[string]int64 `json:"by_type"`
	ByStatusCode map[string]int64 `json:"by_status_code"`
	Rate         float64          `json:"rate"`
}
|
|
|
|
// PerformanceMetrics holds response time, throughput, concurrency,
// saturation and utilization figures.
type PerformanceMetrics struct {
	ResponseTime float64 `json:"response_time"`
	Throughput   float64 `json:"throughput"`
	Concurrency  int64   `json:"concurrency"`
	Saturation   float64 `json:"saturation"`
	Utilization  float64 `json:"utilization"`
}
|
|
|
|
// ResourceMetrics holds CPU, memory, storage and network usage figures plus
// an overall resource score.
type ResourceMetrics struct {
	CPUUsage      float64 `json:"cpu_usage"`
	MemoryUsage   int64   `json:"memory_usage"`
	StorageUsage  int64   `json:"storage_usage"`
	NetworkUsage  int64   `json:"network_usage"`
	ResourceScore float64 `json:"resource_score"`
}
|
|
|
|
// HealthMetrics holds a health status, the time of the last check, counters
// for checks and failures, and an uptime duration.
type HealthMetrics struct {
	Status       string        `json:"status"`
	LastCheck    time.Time     `json:"last_check"`
	CheckCount   int           `json:"check_count"`
	FailureCount int           `json:"failure_count"`
	Uptime       time.Duration `json:"uptime"`
}
|
|
|
|
// MetricsStorage defines the interface for metrics storage
|
|
type MetricsStorage interface {
|
|
StoreNodeMetrics(ctx context.Context, metrics *NodeMetrics) error
|
|
StoreServiceMetrics(ctx context.Context, metrics *ServiceMetrics) error
|
|
GetNodeMetrics(ctx context.Context, nodeID string, from, to time.Time) ([]*NodeMetrics, error)
|
|
GetServiceMetrics(ctx context.Context, serviceID string, from, to time.Time) ([]*ServiceMetrics, error)
|
|
GetAggregatedMetrics(ctx context.Context, query MetricsQuery) (*AggregatedMetrics, error)
|
|
}
|
|
|
|
// MetricsQuery describes a request for aggregated metrics: what entity to
// query, which metrics to include, the time window and bucket interval, and
// optional grouping and filtering.
type MetricsQuery struct {
	Type     string            `json:"type"`    // node, service, project
	ID       string            `json:"id"`      // node_id, service_id, project_id
	Metrics  []string          `json:"metrics"` // cpu, memory, network, etc.
	From     time.Time         `json:"from"`
	To       time.Time         `json:"to"`
	Interval time.Duration     `json:"interval"`
	GroupBy  []string          `json:"group_by"`
	Filters  map[string]string `json:"filters"`
}
|
|
|
|
// AggregatedMetrics represents aggregated metrics data
|
|
type AggregatedMetrics struct {
|
|
Query MetricsQuery `json:"query"`
|
|
TimeSeries []TimeSeriesPoint `json:"time_series"`
|
|
Summary map[string]MetricSummary `json:"summary"`
|
|
}
|
|
|
|
// TimeSeriesPoint is a single entry of a time series: a timestamp and the
// values recorded at that instant, keyed by string.
type TimeSeriesPoint struct {
	Timestamp time.Time          `json:"timestamp"`
	Values    map[string]float64 `json:"values"`
}
|
|
|
|
// MetricSummary holds summary statistics for one metric: minimum, maximum,
// average, the 50th, 95th and 99th percentiles, and the sample count.
type MetricSummary struct {
	Min   float64 `json:"min"`
	Max   float64 `json:"max"`
	Avg   float64 `json:"avg"`
	P50   float64 `json:"p50"`
	P95   float64 `json:"p95"`
	P99   float64 `json:"p99"`
	Count int64   `json:"count"`
}
|
|
|
|
// NewMetricsCollector creates a new metrics collector
|
|
func NewMetricsCollector(scheduler *deployment.Scheduler, storage MetricsStorage) *MetricsCollector {
|
|
return &MetricsCollector{
|
|
nodes: make(map[string]*NodeMetrics),
|
|
services: make(map[string]*ServiceMetrics),
|
|
scheduler: scheduler,
|
|
collectInterval: 30 * time.Second,
|
|
storage: storage,
|
|
}
|
|
}
|
|
|
|
// Start starts the metrics collection process
|
|
func (mc *MetricsCollector) Start(ctx context.Context) error {
|
|
ticker := time.NewTicker(mc.collectInterval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
case <-ticker.C:
|
|
if err := mc.collectMetrics(ctx); err != nil {
|
|
fmt.Printf("Error collecting metrics: %v\n", err)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// collectMetrics collects metrics from all nodes and services
|
|
func (mc *MetricsCollector) collectMetrics(ctx context.Context) error {
|
|
// Collect node metrics
|
|
nodes := mc.scheduler.GetNodes()
|
|
for _, node := range nodes {
|
|
metrics, err := mc.collectNodeMetrics(ctx, node)
|
|
if err != nil {
|
|
fmt.Printf("Error collecting metrics for node %s: %v\n", node.ID, err)
|
|
continue
|
|
}
|
|
|
|
mc.mu.Lock()
|
|
mc.nodes[node.ID] = metrics
|
|
mc.mu.Unlock()
|
|
|
|
// Store metrics
|
|
if err := mc.storage.StoreNodeMetrics(ctx, metrics); err != nil {
|
|
fmt.Printf("Error storing node metrics: %v\n", err)
|
|
}
|
|
}
|
|
|
|
// TODO: Collect service metrics
|
|
// This would involve querying service instances and collecting their metrics
|
|
|
|
return nil
|
|
}
|
|
|
|
// collectNodeMetrics collects metrics from a specific node
|
|
func (mc *MetricsCollector) collectNodeMetrics(ctx context.Context, node *deployment.Node) (*NodeMetrics, error) {
|
|
// In a real implementation, this would collect actual metrics from the node
|
|
// For now, we'll simulate metrics collection
|
|
now := time.Now()
|
|
|
|
metrics := &NodeMetrics{
|
|
NodeID: node.ID,
|
|
Timestamp: now,
|
|
CPU: CPUMetrics{
|
|
UsagePercent: node.Usage.CPU,
|
|
UsageCores: node.Usage.CPU * float64(node.Capacity.CPU) / 100,
|
|
LoadAverage1: 1.5,
|
|
LoadAverage5: 1.8,
|
|
LoadAverage15: 2.1,
|
|
},
|
|
Memory: MemoryMetrics{
|
|
Total: node.Capacity.Memory,
|
|
Used: node.Usage.Memory,
|
|
Available: node.Capacity.Memory - node.Usage.Memory,
|
|
UsagePercent: float64(node.Usage.Memory) / float64(node.Capacity.Memory) * 100,
|
|
SwapTotal: 1024 * 1024 * 1024, // 1GB
|
|
SwapUsed: 512 * 1024 * 1024, // 512MB
|
|
},
|
|
Storage: StorageMetrics{
|
|
Total: node.Capacity.Storage,
|
|
Used: node.Usage.Storage,
|
|
Available: node.Capacity.Storage - node.Usage.Storage,
|
|
UsagePercent: float64(node.Usage.Storage) / float64(node.Capacity.Storage) * 100,
|
|
IOPS: 1000,
|
|
Throughput: 1024 * 1024 * 100, // 100MB/s
|
|
},
|
|
Network: NetworkMetrics{
|
|
BytesIn: node.Usage.Network,
|
|
BytesOut: node.Usage.Network,
|
|
PacketsIn: 10000,
|
|
PacketsOut: 8000,
|
|
ConnectionsIn: 50,
|
|
ConnectionsOut: 30,
|
|
ErrorsIn: 0,
|
|
ErrorsOut: 0,
|
|
},
|
|
Containers: []ContainerMetrics{},
|
|
System: SystemMetrics{
|
|
Uptime: time.Since(node.LastHeartbeat),
|
|
Processes: 150,
|
|
OS: "linux",
|
|
Kernel: "5.15.0",
|
|
Architecture: "x86_64",
|
|
},
|
|
}
|
|
|
|
// Collect container metrics for this node
|
|
for _, containerID := range node.Containers {
|
|
containerMetrics := mc.collectContainerMetrics(containerID)
|
|
metrics.Containers = append(metrics.Containers, containerMetrics)
|
|
}
|
|
|
|
return metrics, nil
|
|
}
|
|
|
|
// collectContainerMetrics collects metrics for a specific container
|
|
func (mc *MetricsCollector) collectContainerMetrics(containerID string) ContainerMetrics {
|
|
// In a real implementation, this would query Docker/container runtime
|
|
return ContainerMetrics{
|
|
ContainerID: containerID,
|
|
Name: fmt.Sprintf("container-%s", containerID[:8]),
|
|
State: "running",
|
|
CPU: 25.5,
|
|
Memory: 512 * 1024 * 1024, // 512MB
|
|
Network: NetworkMetrics{
|
|
BytesIn: 1024 * 1024 * 10, // 10MB
|
|
BytesOut: 1024 * 1024 * 8, // 8MB
|
|
PacketsIn: 1000,
|
|
PacketsOut: 800,
|
|
},
|
|
StartTime: time.Now().Add(-1 * time.Hour),
|
|
}
|
|
}
|
|
|
|
// GetNodeMetrics returns the latest metrics for a node
|
|
func (mc *MetricsCollector) GetNodeMetrics(nodeID string) (*NodeMetrics, error) {
|
|
mc.mu.RLock()
|
|
defer mc.mu.RUnlock()
|
|
|
|
metrics, exists := mc.nodes[nodeID]
|
|
if !exists {
|
|
return nil, fmt.Errorf("no metrics found for node: %s", nodeID)
|
|
}
|
|
|
|
return metrics, nil
|
|
}
|
|
|
|
// GetAllNodeMetrics returns metrics for all nodes
|
|
func (mc *MetricsCollector) GetAllNodeMetrics() map[string]*NodeMetrics {
|
|
mc.mu.RLock()
|
|
defer mc.mu.RUnlock()
|
|
|
|
// Return a copy to avoid race conditions
|
|
result := make(map[string]*NodeMetrics)
|
|
for id, metrics := range mc.nodes {
|
|
result[id] = metrics
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// GetServiceMetrics returns the latest metrics for a service
|
|
func (mc *MetricsCollector) GetServiceMetrics(serviceID string) (*ServiceMetrics, error) {
|
|
mc.mu.RLock()
|
|
defer mc.mu.RUnlock()
|
|
|
|
metrics, exists := mc.services[serviceID]
|
|
if !exists {
|
|
return nil, fmt.Errorf("no metrics found for service: %s", serviceID)
|
|
}
|
|
|
|
return metrics, nil
|
|
}
|
|
|
|
// GetAggregatedMetrics returns aggregated metrics based on a query
|
|
func (mc *MetricsCollector) GetAggregatedMetrics(ctx context.Context, query MetricsQuery) (*AggregatedMetrics, error) {
|
|
return mc.storage.GetAggregatedMetrics(ctx, query)
|
|
}
|
|
|
|
// GetMetricsSummary returns a summary of all metrics
|
|
func (mc *MetricsCollector) GetMetricsSummary() map[string]interface{} {
|
|
mc.mu.RLock()
|
|
defer mc.mu.RUnlock()
|
|
|
|
totalNodes := len(mc.nodes)
|
|
totalServices := len(mc.services)
|
|
healthyNodes := 0
|
|
totalCPU := 0.0
|
|
totalMemory := int64(0)
|
|
|
|
for _, metrics := range mc.nodes {
|
|
if metrics.CPU.UsagePercent < 80 {
|
|
healthyNodes++
|
|
}
|
|
totalCPU += metrics.CPU.UsagePercent
|
|
totalMemory += metrics.Memory.Used
|
|
}
|
|
|
|
avgCPU := float64(0)
|
|
if totalNodes > 0 {
|
|
avgCPU = totalCPU / float64(totalNodes)
|
|
}
|
|
|
|
return map[string]interface{}{
|
|
"total_nodes": totalNodes,
|
|
"healthy_nodes": healthyNodes,
|
|
"total_services": totalServices,
|
|
"avg_cpu_usage": avgCPU,
|
|
"total_memory": totalMemory,
|
|
"collect_interval": mc.collectInterval.String(),
|
|
"last_collection": time.Now().Format(time.RFC3339),
|
|
}
|
|
}
|
|
|
|
// ExportMetrics exports metrics in various formats
|
|
func (mc *MetricsCollector) ExportMetrics(format string) ([]byte, error) {
|
|
mc.mu.RLock()
|
|
defer mc.mu.RUnlock()
|
|
|
|
data := map[string]interface{}{
|
|
"nodes": mc.nodes,
|
|
"services": mc.services,
|
|
"timestamp": time.Now(),
|
|
}
|
|
|
|
switch format {
|
|
case "json":
|
|
return json.MarshalIndent(data, "", " ")
|
|
case "prometheus":
|
|
return mc.exportPrometheusFormat()
|
|
default:
|
|
return nil, fmt.Errorf("unsupported export format: %s", format)
|
|
}
|
|
}
|
|
|
|
// exportPrometheusFormat exports metrics in Prometheus format
|
|
func (mc *MetricsCollector) exportPrometheusFormat() ([]byte, error) {
|
|
var output []string
|
|
|
|
for nodeID, metrics := range mc.nodes {
|
|
// Node CPU metrics
|
|
output = append(output, fmt.Sprintf("# HELP node_cpu_usage_percent CPU usage percentage for node"))
|
|
output = append(output, fmt.Sprintf("# TYPE node_cpu_usage_percent gauge"))
|
|
output = append(output, fmt.Sprintf("node_cpu_usage_percent{node=\"%s\"} %f", nodeID, metrics.CPU.UsagePercent))
|
|
|
|
// Node memory metrics
|
|
output = append(output, fmt.Sprintf("# HELP node_memory_usage_bytes Memory usage in bytes for node"))
|
|
output = append(output, fmt.Sprintf("# TYPE node_memory_usage_bytes gauge"))
|
|
output = append(output, fmt.Sprintf("node_memory_usage_bytes{node=\"%s\"} %d", nodeID, metrics.Memory.Used))
|
|
|
|
// Node network metrics
|
|
output = append(output, fmt.Sprintf("# HELP node_network_bytes_in Total bytes received for node"))
|
|
output = append(output, fmt.Sprintf("# TYPE node_network_bytes_in counter"))
|
|
output = append(output, fmt.Sprintf("node_network_bytes_in{node=\"%s\"} %d", nodeID, metrics.Network.BytesIn))
|
|
}
|
|
|
|
result := []byte(strings.Join(output, "\n"))
|
|
return result, nil
|
|
}
|