Pulse/internal/monitoring/monitor_polling_node_helpers.go

401 lines
13 KiB
Go

package monitoring
import (
"context"
stderrors "errors"
"fmt"
"strings"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
"github.com/rs/zerolog/log"
)
// resolveNodeConnectionInfo picks the connection host and guest URL to use for
// the node called nodeName.
//
// The instance-level Host and GuestURL are the defaults. When the instance is a
// cluster with configured endpoints, the first endpoint whose NodeName matches
// nodeName (case-insensitively) may override them: the host is replaced when
// clusterEndpointRuntimeURL yields a non-empty URL, and the guest URL is
// replaced when the endpoint declares one of its own.
//
// It returns (connectionHost, guestURL).
func resolveNodeConnectionInfo(instanceCfg *config.PVEInstance, discoveryCfg config.DiscoveryConfig, nodeName string) (string, string) {
	host, guest := instanceCfg.Host, instanceCfg.GuestURL
	if !instanceCfg.IsCluster || len(instanceCfg.ClusterEndpoints) == 0 {
		return host, guest
	}

	fingerprintSet := instanceCfg.Fingerprint != ""
	for _, endpoint := range instanceCfg.ClusterEndpoints {
		if !strings.EqualFold(endpoint.NodeName, nodeName) {
			continue
		}
		runtimeURL := clusterEndpointRuntimeURL(endpoint, instanceCfg.VerifySSL, fingerprintSet, discoveryCfg)
		if runtimeURL != "" {
			host = runtimeURL
		}
		if endpoint.GuestURL != "" {
			guest = endpoint.GuestURL
		}
		// Only the first matching endpoint is consulted.
		break
	}
	return host, guest
}
// determineNodeIDAndStatus returns the node's ID and the status that should be
// reported for it, applying an offline grace period to prevent flapping.
//
// ID: for clustered instances with a cluster name the ID is
// "<clusterName>-<node>", which deduplicates nodes when the same cluster is
// registered via multiple entry points (e.g. an agent installed with
// --enable-proxmox on several cluster nodes). Otherwise it is
// "<instanceName>-<node>".
//
// Status: when the node reports "online" (case-insensitive) the last-online
// timestamp is refreshed and the reported status is returned unchanged. When it
// reports anything else, the status is overridden to "online" only if the node
// was last seen online less than nodeOfflineGracePeriod ago; otherwise the
// reported status is returned as-is.
//
// m.mu is held only while m.nodeLastOnline is read or written; logging happens
// after the lock has been released.
func (m *Monitor) determineNodeIDAndStatus(instanceName string, instanceCfg *config.PVEInstance, node proxmox.Node) (string, string) {
	idPrefix := instanceName
	if instanceCfg.IsCluster && instanceCfg.ClusterName != "" {
		idPrefix = instanceCfg.ClusterName
	}
	nodeID := idPrefix + "-" + node.Node

	now := time.Now()
	reportedOnline := strings.EqualFold(node.Status, "online")

	var (
		lastOnline time.Time
		seenOnline bool
	)
	m.mu.Lock()
	if reportedOnline {
		// Writing to a nil map panics, so create the tracking map on first use.
		if m.nodeLastOnline == nil {
			m.nodeLastOnline = make(map[string]time.Time)
		}
		m.nodeLastOnline[nodeID] = now
	} else {
		lastOnline, seenOnline = m.nodeLastOnline[nodeID]
	}
	m.mu.Unlock()

	// Online, or never seen online: nothing to smooth over.
	if reportedOnline || !seenOnline {
		return nodeID, node.Status
	}

	sinceOnline := now.Sub(lastOnline)
	if sinceOnline < nodeOfflineGracePeriod {
		// Still within the grace period - keep presenting the node as online.
		log.Debug().
			Str("instance", instanceName).
			Str("node", node.Node).
			Dur("timeSinceOnline", sinceOnline).
			Dur("gracePeriod", nodeOfflineGracePeriod).
			Msg("Node offline but within grace period - preserving online status")
		return nodeID, "online"
	}

	// Grace period expired - report the status exactly as received.
	log.Info().
		Str("instance", instanceName).
		Str("node", node.Node).
		Dur("timeSinceOnline", sinceOnline).
		Msg("Node offline and grace period expired - marking as offline")
	return nodeID, node.Status
}
// collectNodeTemperatureData fills modelNode.Temperature for an online node
// when temperature monitoring is enabled. It is best effort: failures are only
// logged at debug level and never reported to the caller.
//
// Flow:
//  1. The per-instance TemperatureMonitoringEnabled setting, when set,
//     overrides the global one. Nothing happens unless monitoring is enabled
//     and effectiveStatus is "online".
//  2. Host agent temperatures are looked up by the node's unique ID. They are
//     preferred because they do not require SSH access.
//  3. If a temperature collector is configured and
//     shouldSkipTemperatureSSHCollection does not say otherwise, temperatures
//     are also collected from the resolved SSH host under a dedicated
//     10-second timeout.
//  4. Both sources are merged. When the merged result is available, the CPU
//     min/max records from the previous poll (prevInstanceNodes) are carried
//     forward and updated. Otherwise the previous available reading, if any,
//     is copied onto the node so the value does not flicker while collection
//     is temporarily failing.
//
// ctx is deliberately not used for collection: the SSH attempt runs on a
// context derived from context.Background() so its timeout is independent of
// the parent polling context. A nil modelNode is a no-op.
func (m *Monitor) collectNodeTemperatureData(
	ctx context.Context,
	instanceName string,
	instanceCfg *config.PVEInstance,
	node proxmox.Node,
	modelNode *models.Node,
	prevInstanceNodes []models.Node,
	effectiveStatus string,
) {
	if modelNode == nil {
		return
	}

	// Check the per-instance setting first, fall back to the global setting.
	tempMonitoringEnabled := m.config.TemperatureMonitoringEnabled
	if instanceCfg.TemperatureMonitoringEnabled != nil {
		tempMonitoringEnabled = *instanceCfg.TemperatureMonitoringEnabled
	}
	if effectiveStatus != "online" || !tempMonitoringEnabled {
		return
	}

	// previousTemperature returns the temperature stored for this node in the
	// previous poll, optionally requiring that reading to be marked available.
	previousTemperature := func(requireAvailable bool) *models.Temperature {
		for i := range prevInstanceNodes {
			prev := &prevInstanceNodes[i]
			if prev.ID != modelNode.ID || prev.Temperature == nil {
				continue
			}
			if requireAvailable && !prev.Temperature.Available {
				continue
			}
			return prev.Temperature
		}
		return nil
	}

	// Look up by unique node ID so duplicate hostnames (e.g. two "px1" nodes on
	// different IPs) resolve to the right host agent.
	hostAgentTemp := m.getHostAgentTemperatureByID(modelNode.ID, node.Node)
	if hostAgentTemp != nil {
		log.Debug().
			Str("node", node.Node).
			Float64("cpuPackage", hostAgentTemp.CPUPackage).
			Float64("cpuMax", hostAgentTemp.CPUMax).
			Int("nvmeCount", len(hostAgentTemp.NVMe)).
			Msg("Using temperature data from host agent")
	}

	// If there is no host agent data, or it lacks SMART data, try SSH/proxy
	// collection as well.
	var sshTemp *models.Temperature
	if m.tempCollector != nil {
		// Prefer the resolved host for this node, with cluster endpoint
		// overrides when available.
		sshHost := modelNode.Host
		if modelNode.IsClusterMember && instanceCfg.IsCluster {
			foundNodeEndpoint := false
			hasFingerprint := instanceCfg.Fingerprint != ""
			for _, ep := range instanceCfg.ClusterEndpoints {
				if !strings.EqualFold(ep.NodeName, node.Node) {
					continue
				}
				if effective := clusterEndpointEffectiveURL(ep, instanceCfg.VerifySSL, hasFingerprint); effective != "" {
					sshHost = effective
					foundNodeEndpoint = true
				}
				break
			}
			// No usable endpoint for this node - fall back to the node name.
			if !foundNodeEndpoint {
				sshHost = node.Node
				log.Debug().
					Str("node", node.Node).
					Str("instance", instanceCfg.Name).
					Msg("Node endpoint not found in cluster metadata - falling back to node name for temperature collection")
			}
		}
		if strings.TrimSpace(sshHost) == "" {
			sshHost = node.Node
		}

		// Skip SSH only when the host agent already has SMART data too. If the
		// host agent only has CPU/NVMe readings, SSH can still augment the node
		// with SMART disk temperatures.
		if !shouldSkipTemperatureSSHCollection(hostAgentTemp) {
			// Short, parent-independent timeout so a slow or unresponsive SSH
			// target cannot stall node polling; on failure the previous
			// temperature data is preserved further down.
			tempCtx, tempCancel := context.WithTimeout(context.Background(), 10*time.Second)
			var err error
			sshTemp, err = m.tempCollector.CollectTemperature(tempCtx, sshHost, node.Node)
			tempCancel()
			if err != nil && hostAgentTemp == nil {
				log.Debug().
					Err(err).
					Str("node", node.Node).
					Str("sshHost", sshHost).
					Bool("isCluster", modelNode.IsClusterMember).
					Int("endpointCount", len(instanceCfg.ClusterEndpoints)).
					Msg("Temperature collection failed - check SSH access")
			}
		}

		// Debug: log SSH temp details before merge.
		if sshTemp != nil {
			log.Debug().
				Str("node", node.Node).
				Bool("sshTempAvailable", sshTemp.Available).
				Bool("sshHasSMART", sshTemp.HasSMART).
				Int("sshSMARTCount", len(sshTemp.SMART)).
				Bool("sshHasNVMe", sshTemp.HasNVMe).
				Int("sshNVMeCount", len(sshTemp.NVMe)).
				Msg("SSH temperature data before merge")
		} else {
			log.Debug().
				Str("node", node.Node).
				Msg("SSH temperature data is nil")
		}
	}

	// Merge host agent and SSH temperatures.
	temp := mergeTemperatureData(hostAgentTemp, sshTemp)
	if temp == nil || !temp.Available {
		// Nothing usable from any source: keep the previous available reading,
		// if there is one, so the temperature does not flicker when collection
		// temporarily fails.
		prevTemp := previousTemperature(true)
		if prevTemp == nil {
			log.Debug().
				Str("node", node.Node).
				Bool("isCluster", modelNode.IsClusterMember).
				Msg("No temperature data available (collection failed, no previous data to preserve)")
			return
		}
		// Copy the struct so the node does not point at the historical value.
		// LastUpdate is carried over unchanged to indicate staleness.
		preserved := *prevTemp
		modelNode.Temperature = &preserved
		log.Debug().
			Str("node", node.Node).
			Bool("isCluster", modelNode.IsClusterMember).
			Float64("cpuPackage", preserved.CPUPackage).
			Time("lastUpdate", preserved.LastUpdate).
			Msg("Preserved previous temperature data (current collection failed or unavailable)")
		return
	}

	// Current CPU temperature: prefer the package reading, fall back to max.
	currentTemp := temp.CPUPackage
	if currentTemp == 0 && temp.CPUMax > 0 {
		currentTemp = temp.CPUMax
	}

	// One timestamp per reading so min and max records set together agree.
	now := time.Now()
	prevTemp := previousTemperature(false)
	switch {
	case prevTemp != nil && prevTemp.CPUMin > 0:
		// Carry the existing records forward, then update them if needed.
		temp.CPUMin = prevTemp.CPUMin
		temp.CPUMaxRecord = prevTemp.CPUMaxRecord
		temp.MinRecorded = prevTemp.MinRecorded
		temp.MaxRecorded = prevTemp.MaxRecorded
		if currentTemp > 0 && currentTemp < temp.CPUMin {
			temp.CPUMin = currentTemp
			temp.MinRecorded = now
		}
		if currentTemp > temp.CPUMaxRecord {
			temp.CPUMaxRecord = currentTemp
			temp.MaxRecorded = now
		}
	case currentTemp > 0:
		// First reading - initialize min/max to the current value.
		temp.CPUMin = currentTemp
		temp.CPUMaxRecord = currentTemp
		temp.MinRecorded = now
		temp.MaxRecorded = now
	}
	modelNode.Temperature = temp

	// Determine the source for logging.
	tempSource := "ssh"
	switch {
	case hostAgentTemp != nil && sshTemp == nil:
		tempSource = "agent"
	case hostAgentTemp != nil && sshTemp != nil:
		tempSource = "agent+ssh"
	}
	log.Debug().
		Str("node", node.Node).
		Str("source", tempSource).
		Float64("cpuPackage", temp.CPUPackage).
		Float64("cpuMax", temp.CPUMax).
		Float64("cpuMin", temp.CPUMin).
		Float64("cpuMaxRecord", temp.CPUMaxRecord).
		Int("nvmeCount", len(temp.NVMe)).
		Msg("Collected temperature data")
}
// applyNodePendingUpdates sets modelNode.PendingUpdates and
// modelNode.PendingUpdatesCheckedAt for an online node.
//
// Results are cached per nodeID in m.nodePendingUpdatesCache. While a cached
// entry is younger than pendingUpdatesCacheTTL it is reused; otherwise the
// count is fetched through client.GetNodePendingUpdates and the cache is
// refreshed. If that call fails, the stale cached entry (when one exists) is
// applied instead and the failure is logged at debug level only.
//
// Nothing is done for a nil modelNode or when effectiveStatus is not "online",
// which avoids spending API calls on offline nodes.
func (m *Monitor) applyNodePendingUpdates(ctx context.Context, instanceName string, client PVEClientInterface, node proxmox.Node, nodeID string, effectiveStatus string, modelNode *models.Node) {
	if modelNode == nil {
		return
	}
	if effectiveStatus != "online" {
		return
	}

	checkTime := time.Now()

	m.mu.RLock()
	if m.nodePendingUpdatesCache == nil {
		// Swap the read lock for the write lock to create the cache, then go
		// back to the read lock for the lookup below. The nil check is repeated
		// because the map may have been created while no lock was held.
		m.mu.RUnlock()
		m.mu.Lock()
		if m.nodePendingUpdatesCache == nil {
			m.nodePendingUpdatesCache = make(map[string]pendingUpdatesCache)
		}
		m.mu.Unlock()
		m.mu.RLock()
	}
	entry, found := m.nodePendingUpdatesCache[nodeID]
	m.mu.RUnlock()

	// Fresh cache entry: reuse it and skip the API call.
	if found && checkTime.Sub(entry.checkedAt) < pendingUpdatesCacheTTL {
		modelNode.PendingUpdates = entry.count
		modelNode.PendingUpdatesCheckedAt = entry.checkedAt
		return
	}

	packages, err := client.GetNodePendingUpdates(ctx, node.Node)
	if err != nil {
		// Keep the stale cached value if there is one; debug level keeps a
		// recurring failure from spamming the logs.
		log.Debug().
			Err(err).
			Str("node", node.Node).
			Str("instance", instanceName).
			Msg("Could not check pending apt updates (may require Sys.Audit permission)")
		if found {
			modelNode.PendingUpdates = entry.count
			modelNode.PendingUpdatesCheckedAt = entry.checkedAt
		}
		return
	}

	total := len(packages)
	modelNode.PendingUpdates = total
	modelNode.PendingUpdatesCheckedAt = checkTime

	// Remember the result for subsequent polls.
	m.mu.Lock()
	m.nodePendingUpdatesCache[nodeID] = pendingUpdatesCache{
		count:     total,
		checkedAt: checkTime,
	}
	m.mu.Unlock()

	log.Debug().
		Str("node", node.Node).
		Str("instance", instanceName).
		Int("pendingUpdates", total).
		Msg("Checked pending apt updates")
}
// recordNodePollMetrics reports the outcome of polling one PVE node to
// m.pollMetrics. It does nothing when m.pollMetrics or modelNode is nil.
//
// The node label is the trimmed node.Node, falling back to the trimmed
// modelNode.DisplayName and finally to "unknown-node".
//
// A poll counts as failed when the node's connection health (trimmed,
// lower-cased) is non-empty and not "healthy", or else when its status
// (trimmed, lower-cased) is non-empty and not "online". Connection health takes
// precedence when both apply. Failed polls carry an error describing the
// reason. The recorded interval runs from nodeStart to the time of this call.
func (m *Monitor) recordNodePollMetrics(instanceName string, node proxmox.Node, modelNode *models.Node, nodeStart time.Time) {
	if m.pollMetrics == nil || modelNode == nil {
		return
	}

	label := "unknown-node"
	if name := strings.TrimSpace(node.Node); name != "" {
		label = name
	} else if display := strings.TrimSpace(modelNode.DisplayName); display != "" {
		label = display
	}

	health := strings.ToLower(strings.TrimSpace(modelNode.ConnectionHealth))
	status := strings.ToLower(strings.TrimSpace(modelNode.Status))

	failed := false
	reason := ""
	switch {
	case health != "" && health != "healthy":
		failed = true
		reason = fmt.Sprintf("connection health %s", health)
	case status != "" && status != "online":
		failed = true
		reason = fmt.Sprintf("status %s", status)
	}

	var pollErr error
	if failed {
		if reason == "" {
			reason = "unknown node error"
		}
		pollErr = stderrors.New(reason)
	}

	m.pollMetrics.RecordNodeResult(NodePollResult{
		InstanceName: instanceName,
		InstanceType: "pve",
		NodeName:     label,
		Success:      !failed,
		Error:        pollErr,
		StartTime:    nodeStart,
		EndTime:      time.Now(),
	})
}