// Containr/internal/metrics/collector.go

package metrics

import (
	"context"
	"encoding/json"
	"fmt"
	"sort"
	"strings"
	"sync"
	"time"

	"containr/internal/deployment"
)

// MetricsCollector periodically gathers node metrics from the scheduler,
// keeps the most recent snapshot per node in memory, and forwards every
// snapshot to a MetricsStorage backend.
//
// mu guards the nodes and services maps. Within this file the services map is
// only read, never populated (service collection is still a TODO in
// collectMetrics). Create instances with NewMetricsCollector so that both maps
// and the collection interval are initialised.
type MetricsCollector struct {
	nodes           map[string]*NodeMetrics
	services        map[string]*ServiceMetrics
	scheduler       *deployment.Scheduler
	mu              sync.RWMutex
	collectInterval time.Duration
	storage         MetricsStorage
}

// NodeMetrics is a snapshot of a single node's resource usage taken at
// Timestamp. It is the value cached per node by MetricsCollector, handed to
// MetricsStorage.StoreNodeMetrics, and serialised by ExportMetrics.
//
// Containers holds one entry per container ID listed on the node at
// collection time.
type NodeMetrics struct {
	NodeID     string             `json:"node_id"`
	Timestamp  time.Time          `json:"timestamp"`
	CPU        CPUMetrics         `json:"cpu"`
	Memory     MemoryMetrics      `json:"memory"`
	Storage    StorageMetrics     `json:"storage"`
	Network    NetworkMetrics     `json:"network"`
	Containers []ContainerMetrics `json:"containers"`
	System     SystemMetrics      `json:"system"`
}

// ServiceMetrics is a snapshot of a single service taken at Timestamp,
// combining per-instance data (Instances) with service-wide request, error,
// performance and resource figures.
//
// Nothing in this file produces values of this type yet; it is consumed by
// MetricsStorage and returned by MetricsCollector.GetServiceMetrics.
type ServiceMetrics struct {
	ServiceID   string             `json:"service_id"`
	ServiceName string             `json:"service_name"`
	ProjectID   string             `json:"project_id"`
	Timestamp   time.Time          `json:"timestamp"`
	Instances   []InstanceMetrics  `json:"instances"`
	Requests    RequestMetrics     `json:"requests"`
	Errors      ErrorMetrics       `json:"errors"`
	Performance PerformanceMetrics `json:"performance"`
	Resources   ResourceMetrics    `json:"resources"`
}

// InstanceMetrics holds the metrics of one service instance; it is the
// element type of ServiceMetrics.Instances.
//
// NodeID presumably identifies the node the instance runs on, and Status is a
// free-form string whose allowed values are not defined in this file
// (TODO confirm both against the producer).
type InstanceMetrics struct {
	InstanceID string         `json:"instance_id"`
	NodeID     string         `json:"node_id"`
	Status     string         `json:"status"`
	CPU        float64        `json:"cpu"`    // CPU usage percentage
	Memory     int64          `json:"memory"` // Memory usage in bytes
	Network    NetworkMetrics `json:"network"`
	StartTime  time.Time      `json:"start_time"`
	LastSeen   time.Time      `json:"last_seen"`
	Health     HealthMetrics  `json:"health"`
}

// CPUMetrics describes CPU usage of a node.
//
// UsagePercent is on a 0-100 scale: collectNodeMetrics derives UsageCores from
// it as UsagePercent * capacity / 100. The three LoadAverage fields presumably
// correspond to the 1, 5 and 15 minute load averages (currently filled with
// placeholder values by collectNodeMetrics).
type CPUMetrics struct {
	UsagePercent  float64 `json:"usage_percent"`
	UsageCores    float64 `json:"usage_cores"`
	LoadAverage1  float64 `json:"load_average_1"`
	LoadAverage5  float64 `json:"load_average_5"`
	LoadAverage15 float64 `json:"load_average_15"`
}

// MemoryMetrics describes memory usage of a node.
//
// As filled in by collectNodeMetrics, Available is Total - Used and
// UsagePercent is Used / Total * 100. The swap placeholders in this file are
// written as byte counts, so all sizes are presumably in bytes (TODO confirm
// against the scheduler's node capacity units).
type MemoryMetrics struct {
	Total        int64   `json:"total"`
	Used         int64   `json:"used"`
	Available    int64   `json:"available"`
	UsagePercent float64 `json:"usage_percent"`
	SwapTotal    int64   `json:"swap_total"`
	SwapUsed     int64   `json:"swap_used"`
}

// StorageMetrics describes disk usage and I/O of a node.
//
// As filled in by collectNodeMetrics, Available is Total - Used and
// UsagePercent is Used / Total * 100. The Throughput placeholder in this file
// is annotated as 100MB/s, so it is presumably bytes per second; the units of
// Total/Used/Available follow the scheduler's node capacity (TODO confirm).
type StorageMetrics struct {
	Total        int64   `json:"total"`
	Used         int64   `json:"used"`
	Available    int64   `json:"available"`
	UsagePercent float64 `json:"usage_percent"`
	IOPS         int64   `json:"iops"`
	Throughput   int64   `json:"throughput"`
}

// NetworkMetrics holds inbound/outbound traffic figures. The same type is
// reused at node level (NodeMetrics), container level (ContainerMetrics) and
// service-instance level (InstanceMetrics).
//
// NOTE(review): whether the values are cumulative counters or per-interval
// figures is not established here; the Prometheus export labels BytesIn as a
// counter — confirm against the real collector.
type NetworkMetrics struct {
	BytesIn        int64 `json:"bytes_in"`
	BytesOut       int64 `json:"bytes_out"`
	PacketsIn      int64 `json:"packets_in"`
	PacketsOut     int64 `json:"packets_out"`
	ConnectionsIn  int64 `json:"connections_in"`
	ConnectionsOut int64 `json:"connections_out"`
	ErrorsIn       int64 `json:"errors_in"`
	ErrorsOut      int64 `json:"errors_out"`
}

// ContainerMetrics holds the metrics of one container; it is the element type
// of NodeMetrics.Containers and is produced by collectContainerMetrics.
//
// The units of CPU and Memory are not stated on this type; the placeholder
// values in collectContainerMetrics suggest a percentage and a byte count
// respectively (TODO confirm).
type ContainerMetrics struct {
	ContainerID string         `json:"container_id"`
	Name        string         `json:"name"`
	State       string         `json:"state"`
	CPU         float64        `json:"cpu"`
	Memory      int64          `json:"memory"`
	Network     NetworkMetrics `json:"network"`
	StartTime   time.Time      `json:"start_time"`
}

// SystemMetrics holds host-level information about a node.
//
// Uptime is a time.Duration, so encoding/json serialises it as an integer
// number of nanoseconds rather than a human-readable string.
type SystemMetrics struct {
	Uptime       time.Duration `json:"uptime"`
	Processes    int           `json:"processes"`
	OS           string        `json:"os"`
	Kernel       string        `json:"kernel"`
	Architecture string        `json:"architecture"`
}

// RequestMetrics holds request counts and latency figures for a service
// (ServiceMetrics.Requests).
//
// NOTE(review): the latency unit and the time window behind Throughput are
// not established anywhere in this file — confirm with the producer before
// relying on them.
type RequestMetrics struct {
	Total      int64   `json:"total"`
	Success    int64   `json:"success"`
	Errors     int64   `json:"errors"`
	AvgLatency float64 `json:"avg_latency"`
	P95Latency float64 `json:"p95_latency"`
	P99Latency float64 `json:"p99_latency"`
	Throughput float64 `json:"throughput"`
}

// ErrorMetrics holds error counts for a service (ServiceMetrics.Errors),
// in total and broken down by error type and by status code.
//
// ByType and ByStatusCode are maps: the zero value of ErrorMetrics leaves them
// nil, so they must be created with make before entries are written.
type ErrorMetrics struct {
	Total        int64            `json:"total"`
	ByType       map[string]int64 `json:"by_type"`
	ByStatusCode map[string]int64 `json:"by_status_code"`
	Rate         float64          `json:"rate"`
}

// PerformanceMetrics holds service-level performance indicators
// (ServiceMetrics.Performance).
//
// NOTE(review): units and scales (e.g. whether Saturation and Utilization are
// fractions or percentages) are not defined in this file — confirm with the
// producer.
type PerformanceMetrics struct {
	ResponseTime float64 `json:"response_time"`
	Throughput   float64 `json:"throughput"`
	Concurrency  int64   `json:"concurrency"`
	Saturation   float64 `json:"saturation"`
	Utilization  float64 `json:"utilization"`
}

// ResourceMetrics holds the aggregate resource consumption of a service
// (ServiceMetrics.Resources).
//
// NOTE(review): how ResourceScore is computed and the units of the usage
// fields are not defined in this file — confirm with the producer.
type ResourceMetrics struct {
	CPUUsage      float64 `json:"cpu_usage"`
	MemoryUsage   int64   `json:"memory_usage"`
	StorageUsage  int64   `json:"storage_usage"`
	NetworkUsage  int64   `json:"network_usage"`
	ResourceScore float64 `json:"resource_score"`
}

// HealthMetrics holds the health-check state of a service instance
// (InstanceMetrics.Health).
//
// Uptime is a time.Duration, so encoding/json serialises it as an integer
// number of nanoseconds. The set of valid Status values is not defined in
// this file.
type HealthMetrics struct {
	Status       string        `json:"status"`
	LastCheck    time.Time     `json:"last_check"`
	CheckCount   int           `json:"check_count"`
	FailureCount int           `json:"failure_count"`
	Uptime       time.Duration `json:"uptime"`
}

// MetricsStorage is the persistence backend used by MetricsCollector.
//
// StoreNodeMetrics and StoreServiceMetrics persist a single snapshot.
// GetNodeMetrics and GetServiceMetrics return the stored snapshots for one ID
// within the from/to time range (whether the bounds are inclusive is left to
// the implementation). GetAggregatedMetrics evaluates a MetricsQuery;
// MetricsCollector.GetAggregatedMetrics delegates to it directly.
type MetricsStorage interface {
	StoreNodeMetrics(ctx context.Context, metrics *NodeMetrics) error
	StoreServiceMetrics(ctx context.Context, metrics *ServiceMetrics) error
	GetNodeMetrics(ctx context.Context, nodeID string, from, to time.Time) ([]*NodeMetrics, error)
	GetServiceMetrics(ctx context.Context, serviceID string, from, to time.Time) ([]*ServiceMetrics, error)
	GetAggregatedMetrics(ctx context.Context, query MetricsQuery) (*AggregatedMetrics, error)
}

// MetricsQuery describes a request for aggregated metrics. It is passed
// unchanged to MetricsStorage.GetAggregatedMetrics, so the interpretation of
// every field (including Interval, GroupBy and Filters) is up to the storage
// implementation.
//
// Interval is a time.Duration and is therefore encoded by encoding/json as an
// integer number of nanoseconds.
type MetricsQuery struct {
	Type     string            `json:"type"`    // node, service, project
	ID       string            `json:"id"`      // node_id, service_id, project_id
	Metrics  []string          `json:"metrics"` // cpu, memory, network, etc.
	From     time.Time         `json:"from"`
	To       time.Time         `json:"to"`
	Interval time.Duration     `json:"interval"`
	GroupBy  []string          `json:"group_by"`
	Filters  map[string]string `json:"filters"`
}

// AggregatedMetrics is the result of a MetricsQuery: the query itself, the
// resulting time series, and summary statistics in a map keyed by string
// (presumably the metric name — confirm against the storage implementation).
type AggregatedMetrics struct {
	Query      MetricsQuery             `json:"query"`
	TimeSeries []TimeSeriesPoint        `json:"time_series"`
	Summary    map[string]MetricSummary `json:"summary"`
}

// TimeSeriesPoint is one sample of AggregatedMetrics.TimeSeries: a timestamp
// plus a map of float values keyed by string (presumably the metric name —
// confirm against the storage implementation).
type TimeSeriesPoint struct {
	Timestamp time.Time          `json:"timestamp"`
	Values    map[string]float64 `json:"values"`
}

// MetricSummary holds summary statistics for one metric in
// AggregatedMetrics.Summary: minimum, maximum, average, the 50th/95th/99th
// percentiles, and a Count (presumably the number of samples summarised).
type MetricSummary struct {
	Min   float64 `json:"min"`
	Max   float64 `json:"max"`
	Avg   float64 `json:"avg"`
	P50   float64 `json:"p50"`
	P95   float64 `json:"p95"`
	P99   float64 `json:"p99"`
	Count int64   `json:"count"`
}

// NewMetricsCollector builds a collector that reads nodes from scheduler and
// persists snapshots through storage. Both in-memory maps start out empty and
// the collection interval is fixed at 30 seconds.
func NewMetricsCollector(scheduler *deployment.Scheduler, storage MetricsStorage) *MetricsCollector {
	mc := new(MetricsCollector)
	mc.scheduler = scheduler
	mc.storage = storage
	mc.collectInterval = 30 * time.Second
	mc.nodes = map[string]*NodeMetrics{}
	mc.services = map[string]*ServiceMetrics{}
	return mc
}

// Start runs the collection loop and blocks until ctx is cancelled, at which
// point it returns ctx.Err(). One collection pass is triggered per tick of
// collectInterval, so the first pass happens one full interval after Start is
// called. A failed pass is printed to stdout and does not stop the loop.
func (mc *MetricsCollector) Start(ctx context.Context) error {
	tick := time.NewTicker(mc.collectInterval)
	defer tick.Stop()

	done := ctx.Done()
	for {
		select {
		case <-tick.C:
			err := mc.collectMetrics(ctx)
			if err == nil {
				continue
			}
			fmt.Printf("Error collecting metrics: %v\n", err)
		case <-done:
			return ctx.Err()
		}
	}
}

// collectMetrics performs one collection pass over every node the scheduler
// reports. For each node it collects a snapshot, caches it under the node ID
// (holding mu only for the map write), and hands it to the storage backend.
//
// Per-node collection and storage failures are printed to stdout and do not
// abort the pass; a node whose collection fails keeps its previously cached
// snapshot. The function currently always returns nil.
//
// NOTE(review): nothing in this function evicts cached entries for nodes the
// scheduler no longer reports.
func (mc *MetricsCollector) collectMetrics(ctx context.Context) error {
	for _, n := range mc.scheduler.GetNodes() {
		snapshot, collectErr := mc.collectNodeMetrics(ctx, n)
		if collectErr != nil {
			fmt.Printf("Error collecting metrics for node %s: %v\n", n.ID, collectErr)
			continue
		}

		// Publish the fresh snapshot to readers before persisting it.
		func() {
			mc.mu.Lock()
			defer mc.mu.Unlock()
			mc.nodes[n.ID] = snapshot
		}()

		storeErr := mc.storage.StoreNodeMetrics(ctx, snapshot)
		if storeErr != nil {
			fmt.Printf("Error storing node metrics: %v\n", storeErr)
		}
	}

	// TODO: Collect service metrics
	// This would involve querying service instances and collecting their metrics

	return nil
}

// collectNodeMetrics builds a NodeMetrics snapshot for node, timestamped with
// the current time.
//
// This is still a simulation: CPU, memory, storage and network usage are
// taken from the scheduler's view of the node (node.Usage / node.Capacity),
// while load averages, swap, IOPS, throughput, packet/connection counts and
// the system description are hard-coded placeholders. One ContainerMetrics
// entry is appended per ID in node.Containers.
//
// Memory and storage usage percentages are reported as 0 when the
// corresponding capacity is not positive. Dividing by a zero capacity would
// otherwise yield NaN or +Inf, which encoding/json refuses to marshal and
// which would make the JSON export fail.
//
// ctx is currently unused and the returned error is always nil.
//
// NOTE(review): System.Uptime is set to the time elapsed since
// node.LastHeartbeat, which looks like heartbeat age rather than host uptime —
// confirm the intended meaning.
func (mc *MetricsCollector) collectNodeMetrics(ctx context.Context, node *deployment.Node) (*NodeMetrics, error) {
	// In a real implementation, this would collect actual metrics from the node
	// For now, we'll simulate metrics collection
	now := time.Now()

	// percentOf returns used/total as a 0-100 percentage, or 0 when total is
	// not positive so the result is always a finite number.
	percentOf := func(used, total int64) float64 {
		if total <= 0 {
			return 0
		}
		return float64(used) / float64(total) * 100
	}

	metrics := &NodeMetrics{
		NodeID:    node.ID,
		Timestamp: now,
		CPU: CPUMetrics{
			UsagePercent:  node.Usage.CPU,
			UsageCores:    node.Usage.CPU * float64(node.Capacity.CPU) / 100,
			LoadAverage1:  1.5,
			LoadAverage5:  1.8,
			LoadAverage15: 2.1,
		},
		Memory: MemoryMetrics{
			Total:        node.Capacity.Memory,
			Used:         node.Usage.Memory,
			Available:    node.Capacity.Memory - node.Usage.Memory,
			UsagePercent: percentOf(node.Usage.Memory, node.Capacity.Memory),
			SwapTotal:    1024 * 1024 * 1024, // 1GB
			SwapUsed:     512 * 1024 * 1024,  // 512MB
		},
		Storage: StorageMetrics{
			Total:        node.Capacity.Storage,
			Used:         node.Usage.Storage,
			Available:    node.Capacity.Storage - node.Usage.Storage,
			UsagePercent: percentOf(node.Usage.Storage, node.Capacity.Storage),
			IOPS:         1000,
			Throughput:   1024 * 1024 * 100, // 100MB/s
		},
		Network: NetworkMetrics{
			BytesIn:        node.Usage.Network,
			BytesOut:       node.Usage.Network,
			PacketsIn:      10000,
			PacketsOut:     8000,
			ConnectionsIn:  50,
			ConnectionsOut: 30,
			ErrorsIn:       0,
			ErrorsOut:      0,
		},
		// Non-nil even when empty so the JSON encoding is [] rather than null.
		Containers: make([]ContainerMetrics, 0, len(node.Containers)),
		System: SystemMetrics{
			Uptime:       time.Since(node.LastHeartbeat),
			Processes:    150,
			OS:           "linux",
			Kernel:       "5.15.0",
			Architecture: "x86_64",
		},
	}

	// Collect container metrics for this node
	for _, containerID := range node.Containers {
		metrics.Containers = append(metrics.Containers, mc.collectContainerMetrics(containerID))
	}

	return metrics, nil
}

// collectContainerMetrics returns simulated metrics for the container with
// the given ID. Apart from ContainerID and Name, every value is a hard-coded
// placeholder; StartTime is reported as one hour before the call.
//
// Name is "container-" followed by at most the first 8 bytes of containerID.
// IDs shorter than 8 bytes (including the empty string) are used as-is
// instead of being sliced out of range.
func (mc *MetricsCollector) collectContainerMetrics(containerID string) ContainerMetrics {
	// In a real implementation, this would query Docker/container runtime
	const shortIDLen = 8
	shortID := containerID
	if len(shortID) > shortIDLen {
		shortID = shortID[:shortIDLen]
	}

	return ContainerMetrics{
		ContainerID: containerID,
		Name:        "container-" + shortID,
		State:       "running",
		CPU:         25.5,
		Memory:      512 * 1024 * 1024, // 512MB
		Network: NetworkMetrics{
			BytesIn:    1024 * 1024 * 10, // 10MB
			BytesOut:   1024 * 1024 * 8,  // 8MB
			PacketsIn:  1000,
			PacketsOut: 800,
		},
		StartTime: time.Now().Add(-1 * time.Hour),
	}
}

// GetNodeMetrics looks up the most recently cached snapshot for nodeID.
// It returns an error when no snapshot has been cached for that node. The
// returned pointer is the cached value itself, not a copy.
func (mc *MetricsCollector) GetNodeMetrics(nodeID string) (*NodeMetrics, error) {
	mc.mu.RLock()
	latest, ok := mc.nodes[nodeID]
	mc.mu.RUnlock()

	if ok {
		return latest, nil
	}
	return nil, fmt.Errorf("no metrics found for node: %s", nodeID)
}

// GetAllNodeMetrics returns the cached snapshot of every node, keyed by node
// ID. The map is a fresh shallow copy, so callers may add or remove entries
// freely; the *NodeMetrics values themselves are shared with the collector.
func (mc *MetricsCollector) GetAllNodeMetrics() map[string]*NodeMetrics {
	mc.mu.RLock()
	snapshot := make(map[string]*NodeMetrics, len(mc.nodes))
	for nodeID := range mc.nodes {
		snapshot[nodeID] = mc.nodes[nodeID]
	}
	mc.mu.RUnlock()

	return snapshot
}

// GetServiceMetrics looks up the most recently cached snapshot for serviceID.
// It returns an error when no snapshot has been cached for that service. The
// returned pointer is the cached value itself, not a copy.
func (mc *MetricsCollector) GetServiceMetrics(serviceID string) (*ServiceMetrics, error) {
	mc.mu.RLock()
	latest, ok := mc.services[serviceID]
	mc.mu.RUnlock()

	if ok {
		return latest, nil
	}
	return nil, fmt.Errorf("no metrics found for service: %s", serviceID)
}

// GetAggregatedMetrics evaluates query against the storage backend and
// returns its result and error unchanged. The in-memory cache is not
// consulted.
func (mc *MetricsCollector) GetAggregatedMetrics(ctx context.Context, query MetricsQuery) (*AggregatedMetrics, error) {
	aggregated, err := mc.storage.GetAggregatedMetrics(ctx, query)
	return aggregated, err
}

// GetMetricsSummary condenses the cached node snapshots into a flat map with
// these keys:
//
//	total_nodes      number of cached nodes (int)
//	healthy_nodes    nodes whose CPU usage is below 80 percent (int)
//	total_services   number of cached services (int)
//	avg_cpu_usage    mean CPU usage percent across nodes, 0 with no nodes (float64)
//	total_memory     sum of used memory across nodes (int64)
//	collect_interval the collection interval as a duration string
//	last_collection  RFC 3339 timestamp
//
// NOTE(review): last_collection is the time of this call, not the time of the
// last completed collection pass.
func (mc *MetricsCollector) GetMetricsSummary() map[string]interface{} {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	var (
		cpuSum    float64
		memorySum int64
		healthy   int
	)
	for _, snapshot := range mc.nodes {
		cpuSum += snapshot.CPU.UsagePercent
		memorySum += snapshot.Memory.Used
		if snapshot.CPU.UsagePercent < 80 {
			healthy++
		}
	}

	nodeCount := len(mc.nodes)
	var cpuMean float64
	if nodeCount != 0 {
		cpuMean = cpuSum / float64(nodeCount)
	}

	summary := make(map[string]interface{}, 7)
	summary["total_nodes"] = nodeCount
	summary["healthy_nodes"] = healthy
	summary["total_services"] = len(mc.services)
	summary["avg_cpu_usage"] = cpuMean
	summary["total_memory"] = memorySum
	summary["collect_interval"] = mc.collectInterval.String()
	summary["last_collection"] = time.Now().Format(time.RFC3339)
	return summary
}

// ExportMetrics serialises the cached metrics in the requested format.
//
// Supported formats:
//   - "json": an indented JSON object with the keys "nodes", "services" and
//     "timestamp" (the time of the call).
//   - "prometheus": the output of exportPrometheusFormat.
//
// Any other format yields an error. The read lock is held for the whole
// serialisation.
func (mc *MetricsCollector) ExportMetrics(format string) ([]byte, error) {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	if format == "prometheus" {
		return mc.exportPrometheusFormat()
	}
	if format != "json" {
		return nil, fmt.Errorf("unsupported export format: %s", format)
	}

	payload := map[string]interface{}{
		"nodes":     mc.nodes,
		"services":  mc.services,
		"timestamp": time.Now(),
	}
	return json.MarshalIndent(payload, "", "  ")
}

// exportPrometheusFormat renders the cached node metrics in the Prometheus
// text exposition format.
//
// It does not take mc.mu itself; ExportMetrics calls it with the read lock
// already held.
//
// Each metric family is written as one group: a single HELP line, a single
// TYPE line, then one sample per node. Nodes are emitted in sorted ID order so
// the output is deterministic, node IDs are escaped for use as label values,
// and non-empty output ends with a newline. With no cached nodes the result is
// empty. The returned error is always nil.
func (mc *MetricsCollector) exportPrometheusFormat() ([]byte, error) {
	if len(mc.nodes) == 0 {
		return []byte{}, nil
	}

	// Map iteration order is random; sort the IDs for stable output.
	nodeIDs := make([]string, 0, len(mc.nodes))
	for nodeID := range mc.nodes {
		nodeIDs = append(nodeIDs, nodeID)
	}
	sort.Strings(nodeIDs)

	// Label values must have backslash, double quote and line feed escaped.
	labelEscaper := strings.NewReplacer(`\`, `\\`, `"`, `\"`, "\n", `\n`)

	families := []struct {
		name, help, kind string
		value            func(m *NodeMetrics) string
	}{
		{
			name:  "node_cpu_usage_percent",
			help:  "CPU usage percentage for node",
			kind:  "gauge",
			value: func(m *NodeMetrics) string { return fmt.Sprintf("%f", m.CPU.UsagePercent) },
		},
		{
			name:  "node_memory_usage_bytes",
			help:  "Memory usage in bytes for node",
			kind:  "gauge",
			value: func(m *NodeMetrics) string { return fmt.Sprintf("%d", m.Memory.Used) },
		},
		{
			name:  "node_network_bytes_in",
			help:  "Total bytes received for node",
			kind:  "counter",
			value: func(m *NodeMetrics) string { return fmt.Sprintf("%d", m.Network.BytesIn) },
		},
	}

	var out strings.Builder
	for _, family := range families {
		// HELP and TYPE may appear only once per metric name, and all samples
		// of a family have to follow them as one contiguous group.
		fmt.Fprintf(&out, "# HELP %s %s\n", family.name, family.help)
		fmt.Fprintf(&out, "# TYPE %s %s\n", family.name, family.kind)
		for _, nodeID := range nodeIDs {
			fmt.Fprintf(&out, "%s{node=\"%s\"} %s\n",
				family.name, labelEscaper.Replace(nodeID), family.value(mc.nodes[nodeID]))
		}
	}

	return []byte(out.String()), nil
}