// mirror of https://github.com/rcourtman/Pulse.git
// synced 2026-05-07 00:37:36 +00:00
package monitoring
|
|
|
|
import (
|
|
"context"
|
|
"strconv"
|
|
"time"
|
|
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/config"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/models"
|
|
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
func (m *Monitor) pollPVENode(
|
|
ctx context.Context,
|
|
instanceName string,
|
|
instanceCfg *config.PVEInstance,
|
|
client PVEClientInterface,
|
|
node proxmox.Node,
|
|
connectionHealthStr string,
|
|
prevNodeMemory map[string]models.Memory,
|
|
prevInstanceNodes []models.Node,
|
|
) (models.Node, string, string, error) {
|
|
nodeStart := time.Now()
|
|
displayName := getNodeDisplayName(instanceCfg, node.Node)
|
|
connectionHost, guestURL := resolveNodeConnectionInfo(instanceCfg, monitorDiscoveryConfig(m), node.Node)
|
|
nodeID, effectiveStatus := m.determineNodeIDAndStatus(instanceName, instanceCfg, node)
|
|
|
|
modelNode := models.Node{
|
|
ID: nodeID,
|
|
Name: node.Node,
|
|
DisplayName: displayName,
|
|
Instance: instanceName,
|
|
Host: connectionHost,
|
|
GuestURL: guestURL,
|
|
Status: effectiveStatus,
|
|
Type: "node",
|
|
CPU: safeFloat(node.CPU), // Proxmox returns 0-1 ratio (e.g., 0.15 = 15%)
|
|
Memory: models.Memory{
|
|
Total: int64(node.MaxMem),
|
|
Used: int64(node.Mem),
|
|
Free: int64(node.MaxMem - node.Mem),
|
|
Usage: safePercentage(float64(node.Mem), float64(node.MaxMem)),
|
|
},
|
|
Uptime: int64(node.Uptime),
|
|
LoadAverage: []float64{},
|
|
LastSeen: time.Now(),
|
|
ConnectionHealth: connectionHealthStr, // Use the determined health status
|
|
IsClusterMember: instanceCfg.IsCluster,
|
|
ClusterName: instanceCfg.ClusterName,
|
|
TemperatureMonitoringEnabled: instanceCfg.TemperatureMonitoringEnabled,
|
|
}
|
|
var nodeDiskSource string
|
|
modelNode.Disk, nodeDiskSource = m.resolveNodeDisk(instanceName, nodeID, node.Node, node, nil)
|
|
|
|
nodeSnapshotRaw := NodeMemoryRaw{
|
|
Total: node.MaxMem,
|
|
Used: node.Mem,
|
|
Free: node.MaxMem - node.Mem,
|
|
FallbackTotal: node.MaxMem,
|
|
FallbackUsed: node.Mem,
|
|
FallbackFree: node.MaxMem - node.Mem,
|
|
FallbackCalculated: true,
|
|
ProxmoxMemorySource: "nodes-endpoint",
|
|
}
|
|
nodeMemorySource := "nodes-endpoint"
|
|
var nodeFallbackReason string
|
|
|
|
// Debug logging for disk metrics - note that these values can fluctuate
|
|
// due to thin provisioning and dynamic allocation
|
|
if node.Disk > 0 && node.MaxDisk > 0 {
|
|
log.Debug().
|
|
Str("node", node.Node).
|
|
Uint64("disk", node.Disk).
|
|
Uint64("maxDisk", node.MaxDisk).
|
|
Float64("diskUsage", safePercentage(float64(node.Disk), float64(node.MaxDisk))).
|
|
Msg("Node disk metrics from /nodes endpoint")
|
|
}
|
|
|
|
// Track whether we successfully replaced memory metrics with detailed status data
|
|
memoryUpdated := false
|
|
|
|
// Get detailed node info if available (skip for offline nodes)
|
|
if effectiveStatus == "online" {
|
|
nodeInfo, nodeErr := client.GetNodeStatus(ctx, node.Node)
|
|
if nodeErr != nil {
|
|
nodeFallbackReason = "node-status-unavailable"
|
|
// If we can't get node status, log but continue with data from /nodes endpoint
|
|
if node.Disk > 0 && node.MaxDisk > 0 {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Str("node", node.Node).
|
|
Err(nodeErr).
|
|
Uint64("usingDisk", node.Disk).
|
|
Uint64("usingMaxDisk", node.MaxDisk).
|
|
Msg("Could not get node status - using fallback metrics (memory will include cache/buffers)")
|
|
} else {
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Str("node", node.Node).
|
|
Err(nodeErr).
|
|
Uint64("disk", node.Disk).
|
|
Uint64("maxDisk", node.MaxDisk).
|
|
Msg("Could not get node status - no fallback metrics available (memory will include cache/buffers)")
|
|
}
|
|
} else if nodeInfo != nil {
|
|
if nodeInfo.Memory != nil {
|
|
nodeSnapshotRaw.Total = nodeInfo.Memory.Total
|
|
nodeSnapshotRaw.Used = nodeInfo.Memory.Used
|
|
nodeSnapshotRaw.Free = nodeInfo.Memory.Free
|
|
nodeSnapshotRaw.Available = nodeInfo.Memory.Available
|
|
nodeSnapshotRaw.Avail = nodeInfo.Memory.Avail
|
|
nodeSnapshotRaw.Buffers = nodeInfo.Memory.Buffers
|
|
nodeSnapshotRaw.Cached = nodeInfo.Memory.Cached
|
|
nodeSnapshotRaw.Shared = nodeInfo.Memory.Shared
|
|
nodeSnapshotRaw.EffectiveAvailable = nodeInfo.Memory.EffectiveAvailable()
|
|
nodeSnapshotRaw.ProxmoxMemorySource = "node-status"
|
|
nodeSnapshotRaw.FallbackCalculated = false
|
|
}
|
|
|
|
// Convert LoadAvg from interface{} to float64
|
|
loadAvg := make([]float64, 0, len(nodeInfo.LoadAvg))
|
|
for _, val := range nodeInfo.LoadAvg {
|
|
switch v := val.(type) {
|
|
case float64:
|
|
loadAvg = append(loadAvg, v)
|
|
case string:
|
|
if f, err := strconv.ParseFloat(v, 64); err == nil {
|
|
loadAvg = append(loadAvg, f)
|
|
}
|
|
}
|
|
}
|
|
modelNode.LoadAverage = loadAvg
|
|
modelNode.KernelVersion = nodeInfo.KernelVersion
|
|
modelNode.PVEVersion = nodeInfo.PVEVersion
|
|
|
|
if resolvedDisk, diskSource := m.resolveNodeDisk(instanceName, nodeID, node.Node, node, nodeInfo); diskSource != "" {
|
|
modelNode.Disk = resolvedDisk
|
|
nodeDiskSource = diskSource
|
|
} else {
|
|
log.Warn().
|
|
Str("node", node.Node).
|
|
Bool("rootfsNil", nodeInfo.RootFS == nil).
|
|
Uint64("nodeDisk", node.Disk).
|
|
Uint64("nodeMaxDisk", node.MaxDisk).
|
|
Msg("No valid disk metrics available for node")
|
|
}
|
|
|
|
// Update memory metrics to use Available field for more accurate usage
|
|
if nodeInfo.Memory != nil && nodeInfo.Memory.Total > 0 {
|
|
resolvedMemory, resolvedSource, resolvedFallback, resolvedRaw, ok := m.resolveNodeMemory(
|
|
ctx,
|
|
client,
|
|
instanceName,
|
|
node.Node,
|
|
nodeInfo.Memory,
|
|
nodeSnapshotRaw,
|
|
)
|
|
if ok {
|
|
modelNode.Memory = resolvedMemory
|
|
nodeMemorySource = resolvedSource
|
|
if resolvedFallback != "" {
|
|
nodeFallbackReason = resolvedFallback
|
|
}
|
|
nodeSnapshotRaw = resolvedRaw
|
|
memoryUpdated = true
|
|
}
|
|
}
|
|
|
|
if nodeInfo.CPUInfo != nil {
|
|
// Use MaxCPU from node data for logical CPU count (includes hyperthreading)
|
|
// If MaxCPU is not available or 0, fall back to physical cores
|
|
logicalCores := node.MaxCPU
|
|
if logicalCores == 0 {
|
|
logicalCores = nodeInfo.CPUInfo.Cores
|
|
}
|
|
|
|
mhzStr := nodeInfo.CPUInfo.GetMHzString()
|
|
log.Debug().
|
|
Str("node", node.Node).
|
|
Str("model", nodeInfo.CPUInfo.Model).
|
|
Int("cores", nodeInfo.CPUInfo.Cores).
|
|
Int("logicalCores", logicalCores).
|
|
Int("sockets", nodeInfo.CPUInfo.Sockets).
|
|
Str("mhz", mhzStr).
|
|
Msg("Node CPU info from Proxmox")
|
|
modelNode.CPUInfo = models.CPUInfo{
|
|
Model: nodeInfo.CPUInfo.Model,
|
|
Cores: logicalCores, // Use logical cores for display
|
|
Sockets: nodeInfo.CPUInfo.Sockets,
|
|
MHz: mhzStr,
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// If we couldn't update memory metrics using detailed status, preserve previous accurate values if available
|
|
if !memoryUpdated && effectiveStatus == "online" {
|
|
if prevMem, exists := prevNodeMemory[modelNode.ID]; exists && prevMem.Total > 0 {
|
|
total := int64(node.MaxMem)
|
|
if total == 0 {
|
|
total = prevMem.Total
|
|
}
|
|
used := prevMem.Used
|
|
if total > 0 && used > total {
|
|
used = total
|
|
}
|
|
free := total - used
|
|
if free < 0 {
|
|
free = 0
|
|
}
|
|
|
|
preserved := prevMem
|
|
preserved.Total = total
|
|
preserved.Used = used
|
|
preserved.Free = free
|
|
preserved.Usage = safePercentage(float64(used), float64(total))
|
|
|
|
modelNode.Memory = preserved
|
|
log.Debug().
|
|
Str("instance", instanceName).
|
|
Str("node", node.Node).
|
|
Msg("Preserving previous memory metrics - node status unavailable this cycle")
|
|
|
|
if nodeFallbackReason == "" {
|
|
nodeFallbackReason = "preserved-previous-snapshot"
|
|
}
|
|
nodeMemorySource = "previous-snapshot"
|
|
if nodeSnapshotRaw.ProxmoxMemorySource == "node-status" && nodeSnapshotRaw.Total == 0 {
|
|
nodeSnapshotRaw.ProxmoxMemorySource = "previous-snapshot"
|
|
}
|
|
}
|
|
}
|
|
|
|
m.recordNodeSnapshot(instanceName, node.Node, NodeMemorySnapshot{
|
|
RetrievedAt: time.Now(),
|
|
MemorySource: nodeMemorySource,
|
|
FallbackReason: nodeFallbackReason,
|
|
Memory: modelNode.Memory,
|
|
Raw: nodeSnapshotRaw,
|
|
})
|
|
|
|
m.collectNodeTemperatureData(ctx, instanceName, instanceCfg, node, &modelNode, prevInstanceNodes, effectiveStatus)
|
|
m.applyNodePendingUpdates(ctx, instanceName, client, node, nodeID, effectiveStatus, &modelNode)
|
|
m.recordNodePollMetrics(instanceName, node, &modelNode, nodeStart)
|
|
|
|
return modelNode, effectiveStatus, nodeDiskSource, nil
|
|
}
|