Pulse/internal/monitoring/monitor_polling_node.go
2026-04-01 11:51:19 +01:00

248 lines
8.3 KiB
Go

package monitoring
import (
"context"
"strconv"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
"github.com/rs/zerolog/log"
)
// pollPVENode collects one PVE node's metrics for a single polling cycle and
// converts them into a models.Node. It seeds the model from the coarse data
// in the /nodes listing (the node argument), then — for online nodes only —
// refines memory, load average, kernel/PVE version, CPU info, and disk
// figures via client.GetNodeStatus. When detailed status is unavailable it
// keeps the /nodes-endpoint values or, failing that, preserves the previous
// cycle's memory metrics from prevNodeMemory. Every cycle records a
// NodeMemorySnapshot describing which source the memory numbers came from.
//
// Parameters:
//   - ctx: cancellation/deadline context for the Proxmox API calls.
//   - instanceName: logical name of the PVE instance this node belongs to.
//   - instanceCfg: per-instance configuration (display name overrides,
//     cluster membership/name, temperature-monitoring flag).
//   - client: Proxmox API client used for the node-status call.
//   - node: this node's entry from the /nodes endpoint.
//   - connectionHealthStr: pre-determined connection health, recorded as-is.
//   - prevNodeMemory: last cycle's memory metrics keyed by node ID; fallback
//     when detailed status cannot be fetched this cycle.
//   - prevInstanceNodes: last cycle's node models, passed through to the
//     temperature collector.
//
// Returns the populated models.Node, the effective status string, a label
// naming the source of the disk metrics, and an error (always nil in the
// current implementation; the slot is reserved for callers).
func (m *Monitor) pollPVENode(
	ctx context.Context,
	instanceName string,
	instanceCfg *config.PVEInstance,
	client PVEClientInterface,
	node proxmox.Node,
	connectionHealthStr string,
	prevNodeMemory map[string]models.Memory,
	prevInstanceNodes []models.Node,
) (models.Node, string, string, error) {
	nodeStart := time.Now() // start of this poll; fed to recordNodePollMetrics below
	displayName := getNodeDisplayName(instanceCfg, node.Node)
	connectionHost, guestURL := resolveNodeConnectionInfo(instanceCfg, monitorDiscoveryConfig(m), node.Node)
	nodeID, effectiveStatus := m.determineNodeIDAndStatus(instanceName, instanceCfg, node)
	// Seed the model from the coarse /nodes listing; the online branch below
	// may overwrite memory, load, CPU info, and disk with more accurate data.
	modelNode := models.Node{
		ID:          nodeID,
		Name:        node.Node,
		DisplayName: displayName,
		Instance:    instanceName,
		Host:        connectionHost,
		GuestURL:    guestURL,
		Status:      effectiveStatus,
		Type:        "node",
		CPU:         safeFloat(node.CPU), // Proxmox returns 0-1 ratio (e.g., 0.15 = 15%)
		Memory: models.Memory{
			Total: int64(node.MaxMem),
			Used:  int64(node.Mem),
			Free:  int64(node.MaxMem - node.Mem),
			Usage: safePercentage(float64(node.Mem), float64(node.MaxMem)),
		},
		Uptime:                       int64(node.Uptime),
		LoadAverage:                  []float64{},
		LastSeen:                     time.Now(),
		ConnectionHealth:             connectionHealthStr, // Use the determined health status
		IsClusterMember:              instanceCfg.IsCluster,
		ClusterName:                  instanceCfg.ClusterName,
		TemperatureMonitoringEnabled: instanceCfg.TemperatureMonitoringEnabled,
	}
	// First disk resolution pass without node-status data (nil); may be
	// superseded below once GetNodeStatus succeeds.
	var nodeDiskSource string
	modelNode.Disk, nodeDiskSource = m.resolveNodeDisk(instanceName, nodeID, node.Node, node, nil)
	// Raw memory snapshot starts as /nodes-endpoint values; the fallback
	// fields mirror them so a later failure still leaves usable numbers.
	nodeSnapshotRaw := NodeMemoryRaw{
		Total:               node.MaxMem,
		Used:                node.Mem,
		Free:                node.MaxMem - node.Mem,
		FallbackTotal:       node.MaxMem,
		FallbackUsed:        node.Mem,
		FallbackFree:        node.MaxMem - node.Mem,
		FallbackCalculated:  true,
		ProxmoxMemorySource: "nodes-endpoint",
	}
	nodeMemorySource := "nodes-endpoint"
	var nodeFallbackReason string
	// Debug logging for disk metrics - note that these values can fluctuate
	// due to thin provisioning and dynamic allocation
	if node.Disk > 0 && node.MaxDisk > 0 {
		log.Debug().
			Str("node", node.Node).
			Uint64("disk", node.Disk).
			Uint64("maxDisk", node.MaxDisk).
			Float64("diskUsage", safePercentage(float64(node.Disk), float64(node.MaxDisk))).
			Msg("Node disk metrics from /nodes endpoint")
	}
	// Track whether we successfully replaced memory metrics with detailed status data
	memoryUpdated := false
	// Get detailed node info if available (skip for offline nodes)
	if effectiveStatus == "online" {
		nodeInfo, nodeErr := client.GetNodeStatus(ctx, node.Node)
		if nodeErr != nil {
			nodeFallbackReason = "node-status-unavailable"
			// If we can't get node status, log but continue with data from /nodes endpoint
			if node.Disk > 0 && node.MaxDisk > 0 {
				log.Warn().
					Str("instance", instanceName).
					Str("node", node.Node).
					Err(nodeErr).
					Uint64("usingDisk", node.Disk).
					Uint64("usingMaxDisk", node.MaxDisk).
					Msg("Could not get node status - using fallback metrics (memory will include cache/buffers)")
			} else {
				log.Warn().
					Str("instance", instanceName).
					Str("node", node.Node).
					Err(nodeErr).
					Uint64("disk", node.Disk).
					Uint64("maxDisk", node.MaxDisk).
					Msg("Could not get node status - no fallback metrics available (memory will include cache/buffers)")
			}
		} else if nodeInfo != nil {
			// Copy the detailed memory breakdown into the raw snapshot so
			// recordNodeSnapshot captures exactly what Proxmox reported.
			if nodeInfo.Memory != nil {
				nodeSnapshotRaw.Total = nodeInfo.Memory.Total
				nodeSnapshotRaw.Used = nodeInfo.Memory.Used
				nodeSnapshotRaw.Free = nodeInfo.Memory.Free
				nodeSnapshotRaw.Available = nodeInfo.Memory.Available
				nodeSnapshotRaw.Avail = nodeInfo.Memory.Avail
				nodeSnapshotRaw.Buffers = nodeInfo.Memory.Buffers
				nodeSnapshotRaw.Cached = nodeInfo.Memory.Cached
				nodeSnapshotRaw.Shared = nodeInfo.Memory.Shared
				nodeSnapshotRaw.EffectiveAvailable = nodeInfo.Memory.EffectiveAvailable()
				nodeSnapshotRaw.ProxmoxMemorySource = "node-status"
				nodeSnapshotRaw.FallbackCalculated = false
			}
			// Convert LoadAvg from interface{} to float64
			// (Proxmox returns loadavg entries as either numbers or strings;
			// unparseable entries are silently skipped.)
			loadAvg := make([]float64, 0, len(nodeInfo.LoadAvg))
			for _, val := range nodeInfo.LoadAvg {
				switch v := val.(type) {
				case float64:
					loadAvg = append(loadAvg, v)
				case string:
					if f, err := strconv.ParseFloat(v, 64); err == nil {
						loadAvg = append(loadAvg, f)
					}
				}
			}
			modelNode.LoadAverage = loadAvg
			modelNode.KernelVersion = nodeInfo.KernelVersion
			modelNode.PVEVersion = nodeInfo.PVEVersion
			// Second disk resolution pass, now with node-status data; an
			// empty source means no valid metrics and keeps the first pass.
			if resolvedDisk, diskSource := m.resolveNodeDisk(instanceName, nodeID, node.Node, node, nodeInfo); diskSource != "" {
				modelNode.Disk = resolvedDisk
				nodeDiskSource = diskSource
			} else {
				log.Warn().
					Str("node", node.Node).
					Bool("rootfsNil", nodeInfo.RootFS == nil).
					Uint64("nodeDisk", node.Disk).
					Uint64("nodeMaxDisk", node.MaxDisk).
					Msg("No valid disk metrics available for node")
			}
			// Update memory metrics to use Available field for more accurate usage
			if nodeInfo.Memory != nil && nodeInfo.Memory.Total > 0 {
				resolvedMemory, resolvedSource, resolvedFallback, resolvedRaw, ok := m.resolveNodeMemory(
					ctx,
					client,
					instanceName,
					node.Node,
					nodeInfo.Memory,
					nodeSnapshotRaw,
				)
				if ok {
					modelNode.Memory = resolvedMemory
					nodeMemorySource = resolvedSource
					if resolvedFallback != "" {
						nodeFallbackReason = resolvedFallback
					}
					nodeSnapshotRaw = resolvedRaw // resolver may have refined the raw snapshot
					memoryUpdated = true
				}
			}
			if nodeInfo.CPUInfo != nil {
				// Use MaxCPU from node data for logical CPU count (includes hyperthreading)
				// If MaxCPU is not available or 0, fall back to physical cores
				logicalCores := node.MaxCPU
				if logicalCores == 0 {
					logicalCores = nodeInfo.CPUInfo.Cores
				}
				mhzStr := nodeInfo.CPUInfo.GetMHzString()
				log.Debug().
					Str("node", node.Node).
					Str("model", nodeInfo.CPUInfo.Model).
					Int("cores", nodeInfo.CPUInfo.Cores).
					Int("logicalCores", logicalCores).
					Int("sockets", nodeInfo.CPUInfo.Sockets).
					Str("mhz", mhzStr).
					Msg("Node CPU info from Proxmox")
				modelNode.CPUInfo = models.CPUInfo{
					Model:   nodeInfo.CPUInfo.Model,
					Cores:   logicalCores, // Use logical cores for display
					Sockets: nodeInfo.CPUInfo.Sockets,
					MHz:     mhzStr,
				}
			}
		}
	}
	// If we couldn't update memory metrics using detailed status, preserve previous accurate values if available
	if !memoryUpdated && effectiveStatus == "online" {
		if prevMem, exists := prevNodeMemory[modelNode.ID]; exists && prevMem.Total > 0 {
			// Prefer the current cycle's total (it is cheap and reliable from
			// /nodes), but clamp the preserved Used so Used <= Total and
			// Free never goes negative.
			total := int64(node.MaxMem)
			if total == 0 {
				total = prevMem.Total
			}
			used := prevMem.Used
			if total > 0 && used > total {
				used = total
			}
			free := total - used
			if free < 0 {
				free = 0
			}
			preserved := prevMem
			preserved.Total = total
			preserved.Used = used
			preserved.Free = free
			preserved.Usage = safePercentage(float64(used), float64(total))
			modelNode.Memory = preserved
			log.Debug().
				Str("instance", instanceName).
				Str("node", node.Node).
				Msg("Preserving previous memory metrics - node status unavailable this cycle")
			if nodeFallbackReason == "" {
				nodeFallbackReason = "preserved-previous-snapshot"
			}
			nodeMemorySource = "previous-snapshot"
			// Only relabel the raw source when node-status claimed the data
			// but reported a zero total (i.e. it was effectively empty).
			if nodeSnapshotRaw.ProxmoxMemorySource == "node-status" && nodeSnapshotRaw.Total == 0 {
				nodeSnapshotRaw.ProxmoxMemorySource = "previous-snapshot"
			}
		}
	}
	// Persist this cycle's memory provenance for diagnostics/UI.
	m.recordNodeSnapshot(instanceName, node.Node, NodeMemorySnapshot{
		RetrievedAt:    time.Now(),
		MemorySource:   nodeMemorySource,
		FallbackReason: nodeFallbackReason,
		Memory:         modelNode.Memory,
		Raw:            nodeSnapshotRaw,
	})
	m.collectNodeTemperatureData(ctx, instanceName, instanceCfg, node, &modelNode, prevInstanceNodes, effectiveStatus)
	m.applyNodePendingUpdates(ctx, instanceName, client, node, nodeID, effectiveStatus, &modelNode)
	m.recordNodePollMetrics(instanceName, node, &modelNode, nodeStart)
	return modelNode, effectiveStatus, nodeDiskSource, nil
}