package monitoring

import (
	"context"
	"strings"
	"sync"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/internal/monitoring/errors"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
	"github.com/rs/zerolog/log"
)
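
// pollContainersWithNodes fetches LXC containers from every online node in
// parallel, converts them into models.Container values, checks alerts, and
// publishes the merged result as the container state for this instance.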
func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName string, clusterName string, isCluster bool, client PVEClientInterface, nodes []proxmox.Node, nodeEffectiveStatus map[string]string) {
	startTime := time.Now()

	// Channel to collect container results from each node
	type nodeResult struct {
		node       string
		containers []models.Container
		err        error
	}

	resultChan := make(chan nodeResult, len(nodes))
	var wg sync.WaitGroup

	// Count online nodes for logging
	onlineNodes := 0
	for _, node := range nodes {
		if nodeEffectiveStatus[node.Node] == "online" {
			onlineNodes++
		}
	}

	// Capture the previous guest context once per poll cycle so fallback behavior
	// is based on a consistent pre-poll snapshot.
	prevGuests := m.previousGuestContextForInstance(instanceName)
	prevContainerIsOCI := prevGuests.containerOCIByVMID

	log.Debug().
		Str("instance", instanceName).
		Int("totalNodes", len(nodes)).
		Int("onlineNodes", onlineNodes).
		Msg("Starting parallel container polling")

	// Launch a goroutine for each online node
	for _, node := range nodes {
		// Skip offline nodes
		if nodeEffectiveStatus[node.Node] != "online" {
			log.Debug().
				Str("node", node.Node).
				Str("status", node.Status).
				Msg("Skipping offline node for container polling")
			continue
		}

		wg.Add(1)
		go func(n proxmox.Node) {
			defer wg.Done()

			nodeStart := time.Now()

			// Fetch containers for this node
			containers, err := client.GetContainers(ctx, n.Node)
			if err != nil {
				monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_containers", instanceName, err).WithNode(n.Node)
				log.Error().Err(monErr).Str("node", n.Node).Msg("failed to get containers")
				resultChan <- nodeResult{node: n.Node, err: err}
				return
			}
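
			// Collect the VMIDs of non-template containers so their root
			// filesystem usage can be looked up in one pass below.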
			vmIDs := make([]int, 0, len(containers))
			for _, ct := range containers {
				if ct.Template == 1 {
					continue
				}
				vmIDs = append(vmIDs, int(ct.VMID))
			}

			rootUsageOverrides := m.collectContainerRootUsage(ctx, client, n.Node, vmIDs)

			var nodeContainers []models.Container

			// Process each container
			for _, container := range containers {
				// Skip templates
				if container.Template == 1 {
					continue
				}

				// Parse tags
				var tags []string
				if container.Tags != "" {
					tags = strings.Split(container.Tags, ";")
				}

				// Generate canonical guest ID: instance:node:vmid
				guestID := makeGuestID(instanceName, n.Node, int(container.VMID))

				// Calculate I/O rates
				currentMetrics := IOMetrics{
					DiskRead:   int64(container.DiskRead),
					DiskWrite:  int64(container.DiskWrite),
					NetworkIn:  int64(container.NetIn),
					NetworkOut: int64(container.NetOut),
					Timestamp:  time.Now(),
				}
				diskReadRate, diskWriteRate, netInRate, netOutRate := m.rateTracker.CalculateRates(guestID, currentMetrics)

				// Set CPU to 0 for non-running containers
				cpuUsage := safeFloat(container.CPU)
				if container.Status != "running" {
					cpuUsage = 0
				}

				memTotal, memUsed, memorySource, guestRaw := m.calculateLXCMemory(ctx, instanceName, proxmox.ClusterResource{
					Type:   "lxc",
					Node:   n.Node,
					Name:   container.Name,
					Status: container.Status,
					VMID:   int(container.VMID),
					MaxMem: container.MaxMem,
					Mem:    container.Mem,
				}, client)

				memTotalBytes := clampToInt64(memTotal)
				memUsedBytes := clampToInt64(memUsed)
				if memTotalBytes > 0 && memUsedBytes > memTotalBytes {
					memUsedBytes = memTotalBytes
				}
				memFreeBytes := memTotalBytes - memUsedBytes
				if memFreeBytes < 0 {
					memFreeBytes = 0
				}
				memUsagePercent := safePercentage(float64(memUsedBytes), float64(memTotalBytes))

				diskTotalBytes := clampToInt64(container.MaxDisk)
				diskUsedBytes := clampToInt64(container.Disk)
				if diskTotalBytes > 0 && diskUsedBytes > diskTotalBytes {
					diskUsedBytes = diskTotalBytes
				}
				diskFreeBytes := diskTotalBytes - diskUsedBytes
				if diskFreeBytes < 0 {
					diskFreeBytes = 0
				}
				diskUsagePercent := safePercentage(float64(diskUsedBytes), float64(diskTotalBytes))

				// Create container model
				modelContainer := models.Container{
					ID:       guestID,
					VMID:     int(container.VMID),
					Name:     container.Name,
					Node:     n.Node,
					Pool:     strings.TrimSpace(container.Pool),
					Instance: instanceName,
					Status:   container.Status,
					Type:     "lxc",
					CPU:      cpuUsage,
					CPUs:     int(container.CPUs),
					Memory: models.Memory{
						Total: memTotalBytes,
						Used:  memUsedBytes,
						Free:  memFreeBytes,
						Usage: memUsagePercent,
					},
					Disk: models.Disk{
						Total: diskTotalBytes,
						Used:  diskUsedBytes,
						Free:  diskFreeBytes,
						Usage: diskUsagePercent,
					},
					NetworkIn:  max(0, int64(netInRate)),
					NetworkOut: max(0, int64(netOutRate)),
					DiskRead:   max(0, int64(diskReadRate)),
					DiskWrite:  max(0, int64(diskWriteRate)),
					Uptime:     int64(container.Uptime),
					Template:   container.Template == 1,
					LastSeen:   time.Now(),
					Tags:       tags,
				}
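
				// Carry forward the OCI flag recorded for this VMID on the previous poll.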
				if prevContainerIsOCI[modelContainer.VMID] {
					modelContainer.IsOCI = true
					modelContainer.Type = "oci"
				}
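
				// Apply the per-container root filesystem usage collected above,
				// when available, in place of the summary disk values.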
				if override, ok := rootUsageOverrides[int(container.VMID)]; ok {
					overrideUsed := clampToInt64(override.Used)
					overrideTotal := clampToInt64(override.Total)

					if overrideUsed > 0 && (modelContainer.Disk.Used == 0 || overrideUsed < modelContainer.Disk.Used) {
						modelContainer.Disk.Used = overrideUsed
					}

					if overrideTotal > 0 {
						modelContainer.Disk.Total = overrideTotal
					}

					if modelContainer.Disk.Total > 0 && modelContainer.Disk.Used > modelContainer.Disk.Total {
						modelContainer.Disk.Used = modelContainer.Disk.Total
					}

					modelContainer.Disk.Free = modelContainer.Disk.Total - modelContainer.Disk.Used
					if modelContainer.Disk.Free < 0 {
						modelContainer.Disk.Free = 0
					}

					modelContainer.Disk.Usage = safePercentage(float64(modelContainer.Disk.Used), float64(modelContainer.Disk.Total))
				}
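
				// Enrich the container with any additional metadata available for it.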
				m.enrichContainerMetadata(ctx, client, instanceName, n.Node, &modelContainer)

				// Zero out metrics for non-running containers
				if container.Status != "running" {
					modelContainer.CPU = 0
					modelContainer.Memory.Usage = 0
					modelContainer.Disk.Usage = 0
					modelContainer.NetworkIn = 0
					modelContainer.NetworkOut = 0
					modelContainer.DiskRead = 0
					modelContainer.DiskWrite = 0
				}

				// Trigger guest metadata migration if old format exists
				if m.guestMetadataStore != nil {
					m.guestMetadataStore.GetWithLegacyMigration(guestID, instanceName, n.Node, int(container.VMID))
				}
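
				// Record a memory snapshot for this guest, including the memory
				// source and any fallback reason.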
				m.recordGuestSnapshot(instanceName, modelContainer.Type, n.Node, int(container.VMID), GuestMemorySnapshot{
					Name:           modelContainer.Name,
					Status:         modelContainer.Status,
					RetrievedAt:    modelContainer.LastSeen,
					MemorySource:   memorySource,
					FallbackReason: guestMemoryFallbackReason(memorySource),
					Memory:         modelContainer.Memory,
					Raw:            guestRaw,
				})

				nodeContainers = append(nodeContainers, modelContainer)

				// Check alerts
				m.alertManager.CheckGuest(modelContainer, instanceName)
			}

			nodeDuration := time.Since(nodeStart)
			log.Debug().
				Str("node", n.Node).
				Int("containers", len(nodeContainers)).
				Dur("duration", nodeDuration).
				Msg("Node container polling completed")

			resultChan <- nodeResult{node: n.Node, containers: nodeContainers}
		}(node)
	}

	// Close channel when all goroutines complete
	go func() {
		wg.Wait()
		close(resultChan)
	}()

	// Collect results from all nodes
	var allContainers []models.Container
	successfulNodes := 0
	failedNodes := 0

	for result := range resultChan {
		if result.err != nil {
			failedNodes++
		} else {
			successfulNodes++
			allContainers = append(allContainers, result.containers...)
		}
	}

	// If we got zero containers but had containers before (likely a cluster health
	// issue), preserve the previous containers instead of clearing them.
	if len(allContainers) == 0 && len(nodes) > 0 {
		allContainers = append(allContainers, prevGuests.containers...)
		prevContainerCount := len(prevGuests.containers)
		if prevContainerCount > 0 {
			log.Warn().
				Str("instance", instanceName).
				Int("prevContainers", prevContainerCount).
				Int("successfulNodes", successfulNodes).
				Int("totalNodes", len(nodes)).
				Msg("Traditional polling returned zero containers but had containers before - preserving previous containers")
		}
	}

	// Check Docker presence for containers that need it (new, restarted, started)
	allContainers = m.CheckContainersForDocker(ctx, allContainers)

	// Update state with all containers
	m.state.UpdateContainersForInstance(instanceName, allContainers)

	// Record guest metrics history for running containers (enables sparkline/trends view)
	if !shouldSkipNativeMockStateMetricWrites() {
		now := time.Now()
		for _, ct := range allContainers {
			if ct.Status != "running" {
				continue
			}
			m.metricsHistory.AddGuestMetric(ct.ID, "cpu", ct.CPU*100, now)
			m.metricsHistory.AddGuestMetric(ct.ID, "memory", ct.Memory.Usage, now)
			if ct.Disk.Usage >= 0 {
				m.metricsHistory.AddGuestMetric(ct.ID, "disk", ct.Disk.Usage, now)
			}
			// Also write to persistent store
			if m.metricsStore != nil {
				m.metricsStore.Write("container", ct.ID, "cpu", ct.CPU*100, now)
				m.metricsStore.Write("container", ct.ID, "memory", ct.Memory.Usage, now)
				if ct.Disk.Usage >= 0 {
					m.metricsStore.Write("container", ct.ID, "disk", ct.Disk.Usage, now)
				}
			}
		}
	}

	duration := time.Since(startTime)
	log.Debug().
		Str("instance", instanceName).
		Int("totalContainers", len(allContainers)).
		Int("successfulNodes", successfulNodes).
		Int("failedNodes", failedNodes).
		Dur("duration", duration).
		Msg("Parallel container polling completed")
}

// pollStorageWithNodes polls storage from all nodes in parallel using goroutines