diff --git a/docs/release-control/v6/internal/subsystems/monitoring.md b/docs/release-control/v6/internal/subsystems/monitoring.md index e02663670..a3bd6b20f 100644 --- a/docs/release-control/v6/internal/subsystems/monitoring.md +++ b/docs/release-control/v6/internal/subsystems/monitoring.md @@ -328,6 +328,12 @@ canonical physical-disk model and the Proxmox polling runtime in `internal/monitoring/monitor_pve.go` must evaluate disk alerts only after that merged disk view exists, so controller-backed disks do not lose health and endurance coverage between collection and alerting. +That same host-agent temperature boundary must not suppress SSH SMART disk +collection just because the agent already reported CPU package or NVMe +temperatures. `internal/monitoring/monitor_polling_node_helpers.go` may skip +SSH only once the host-agent temperature payload already has SMART disk data, +so nodes keep their disk-temperature and SMART augmentation when the host agent +is present but lacks SMART support. That same Proxmox monitoring boundary also owns checked response parsing for polymorphic numeric fields. Shared client parsers such as `pkg/proxmox/replication.go` must use the package's checked integer conversion diff --git a/internal/monitoring/canonical_guardrails_test.go b/internal/monitoring/canonical_guardrails_test.go index a50188abd..ea828f50a 100644 --- a/internal/monitoring/canonical_guardrails_test.go +++ b/internal/monitoring/canonical_guardrails_test.go @@ -220,6 +220,31 @@ func TestProxmoxGuestDiskCarryForwardUsesCanonicalStabilityHelper(t *testing.T) } } +func TestMonitoringTemperatureFallbackUsesSMARTAwareSSHSkipRule(t *testing.T) { + requiredSnippets := map[string][]string{ + "host_agent_temps.go": { + "func shouldSkipTemperatureSSHCollection(hostAgentTemp *models.Temperature) bool {", + "return hostAgentTemp.HasSMART", + }, + "monitor_polling_node_helpers.go": { + "skipSSHCollection := shouldSkipTemperatureSSHCollection(hostAgentTemp)", + }, + } + + for file, snippets := range requiredSnippets { + data, err := os.ReadFile(file) + if err != nil { + t.Fatalf("failed to read %s: %v", file, err) + } + source := string(data) + for _, snippet := range snippets { + if !strings.Contains(source, snippet) { + t.Fatalf("%s must contain %q", file, snippet) + } + } + } +} + func TestMonitoringRuntimeAvoidsLegacyMockPartialHelpers(t *testing.T) { forbiddenSnippets := []string{ "mock.GetMockState(", diff --git a/internal/monitoring/host_agent_temps.go b/internal/monitoring/host_agent_temps.go index 84d2e037c..97667c1ce 100644 --- a/internal/monitoring/host_agent_temps.go +++ b/internal/monitoring/host_agent_temps.go @@ -22,6 +22,16 @@ func (m *Monitor) getHostAgentTemperature(nodeName string) *models.Temperature { return m.getHostAgentTemperatureByID("", nodeName) } +func shouldSkipTemperatureSSHCollection(hostAgentTemp *models.Temperature) bool { + if hostAgentTemp == nil { + return false + } + + // Host-agent CPU or NVMe readings are preferred, but SSH should still augment + // the node with SMART disk temperatures until the agent has reported SMART. + return hostAgentTemp.HasSMART +} + // getHostAgentTemperatureByID looks for a matching host agent by node ID first, // then falls back to hostname matching. This correctly handles clusters where // multiple nodes may have the same hostname (e.g., "px1" on different IPs). diff --git a/internal/monitoring/host_agent_temps_test.go b/internal/monitoring/host_agent_temps_test.go index a23442b57..6b22813bb 100644 --- a/internal/monitoring/host_agent_temps_test.go +++ b/internal/monitoring/host_agent_temps_test.go @@ -152,6 +152,37 @@ func TestConvertHostSensorsToTemperature_NoPackageUsesMaxCore(t *testing.T) { } } +func TestShouldSkipTemperatureSSHCollection(t *testing.T) { + t.Run("nil host agent temp does not skip", func(t *testing.T) { + if shouldSkipTemperatureSSHCollection(nil) { + t.Fatal("expected nil host agent temp not to skip SSH collection") + } + }) + + t.Run("cpu only host agent temp does not skip", func(t *testing.T) { + host := &models.Temperature{ + Available: true, + HasCPU: true, + CPUPackage: 55, + } + if shouldSkipTemperatureSSHCollection(host) { + t.Fatal("expected CPU-only host agent temp not to skip SSH collection") + } + }) + + t.Run("host agent smart data skips", func(t *testing.T) { + host := &models.Temperature{ + Available: true, + HasCPU: true, + HasSMART: true, + SMART: []models.DiskTemp{{Device: "/dev/sda", Temperature: 35}}, + } + if !shouldSkipTemperatureSSHCollection(host) { + t.Fatal("expected SMART-capable host agent temp to skip SSH collection") + } + }) +} + func TestIsHostAgentTemperatureRecent(t *testing.T) { tests := []struct { name string diff --git a/internal/monitoring/monitor_polling_node_helpers.go b/internal/monitoring/monitor_polling_node_helpers.go index 934414b76..ac94f715e 100644 --- a/internal/monitoring/monitor_polling_node_helpers.go +++ b/internal/monitoring/monitor_polling_node_helpers.go @@ -159,8 +159,10 @@ func (m *Monitor) collectNodeTemperatureData( sshHost = node.Node } - // Skip SSH collection if we already have host agent data. - skipSSHCollection := hostAgentTemp != nil + // Skip SSH only when the host agent already has SMART data too. + // If the host agent only has CPU/NVMe readings, SSH can still + // augment the node with SMART disk temperatures. + skipSSHCollection := shouldSkipTemperatureSSHCollection(hostAgentTemp) if !skipSSHCollection { sshTemp, err = m.tempCollector.CollectTemperature(tempCtx, sshHost, node.Node)