diff --git a/docs/release-control/v6/internal/subsystems/monitoring.md b/docs/release-control/v6/internal/subsystems/monitoring.md index 5ae4ba1c0..7ebe5fb33 100644 --- a/docs/release-control/v6/internal/subsystems/monitoring.md +++ b/docs/release-control/v6/internal/subsystems/monitoring.md @@ -25,16 +25,20 @@ truth for live infrastructure data. 1. `internal/monitoring/monitor.go` 2. `internal/monitoring/poll_providers.go` 3. `internal/monitoring/monitor_discovery_helpers.go` -4. `internal/monitoring/metrics.go` -5. `internal/monitoring/metrics_history.go` -6. `internal/unifiedresources/read_state.go` -7. `internal/unifiedresources/monitor_adapter.go` -8. `internal/unifiedresources/views.go` -9. `internal/monitoring/connected_infrastructure.go` -10. `internal/monitoring/reload.go` -11. `docker-entrypoint.sh` -12. `internal/monitoring/truenas_poller.go` -13. `internal/monitoring/vmware_poller.go` +4. `internal/monitoring/monitor_polling_node.go` +5. `internal/monitoring/monitor_pve.go` +6. `internal/monitoring/monitor_pve_storage.go` +7. `internal/monitoring/node_disk_sources.go` +8. `internal/monitoring/metrics.go` +9. `internal/monitoring/metrics_history.go` +10. `internal/unifiedresources/read_state.go` +11. `internal/unifiedresources/monitor_adapter.go` +12. `internal/unifiedresources/views.go` +13. `internal/monitoring/connected_infrastructure.go` +14. `internal/monitoring/reload.go` +15. `docker-entrypoint.sh` +16. `internal/monitoring/truenas_poller.go` +17. `internal/monitoring/vmware_poller.go` ## Shared Boundaries @@ -333,7 +337,14 @@ agent, the node summary must prefer the linked host's canonical disk view over Proxmox `rootfs` bytes because dataset-level `rootfs` can materially under-report ZFS-backed node capacity and usage. Proxmox `rootfs` and `/nodes` disk values remain fallback sources only when no linked host disk truth is -available. +available. When the runtime must fall back beyond the linked host and `rootfs` +paths, it must treat the raw `/nodes` disk figure as low-confidence and prefer +the canonical local system storage owner instead of whichever mounted storage +is merely present or largest. On multi-storage Proxmox hosts, fallback +selection must rank `local-zfs`, `local-lvm`, `local`, and other non-shared +guest-root storages ahead of backup-only mounts, and storage-derived disk +metrics may override the `/nodes` figure only when that figure is the active +source or node disk truth is otherwise absent. TrueNAS monitoring ownership now also includes provider rebind semantics in `internal/monitoring/truenas_poller.go`. 
When a stored TrueNAS connection's host, auth, TLS, or fingerprint settings change, the poller must replace the diff --git a/internal/monitoring/canonical_guardrails_test.go b/internal/monitoring/canonical_guardrails_test.go index 4996e3194..e80e394c9 100644 --- a/internal/monitoring/canonical_guardrails_test.go +++ b/internal/monitoring/canonical_guardrails_test.go @@ -247,8 +247,10 @@ func TestProxmoxNodeDiskUsesCanonicalResolver(t *testing.T) { } source := string(data) requiredSnippets := []string{ - "modelNode.Disk, _ = m.resolveNodeDisk(", + "var nodeDiskSource string", + "modelNode.Disk, nodeDiskSource = m.resolveNodeDisk(", "if resolvedDisk, diskSource := m.resolveNodeDisk(", + "return modelNode, effectiveStatus, nodeDiskSource, nil", } for _, snippet := range requiredSnippets { if !strings.Contains(source, snippet) { @@ -257,6 +259,40 @@ func TestProxmoxNodeDiskUsesCanonicalResolver(t *testing.T) { } } +func TestProxmoxNodeDiskFallbackPrefersCanonicalSystemStorage(t *testing.T) { + requiredSnippets := map[string][]string{ + "node_disk_sources.go": { + "func preferredNodeDiskFallbackRank(storage proxmox.Storage) (int, bool) {", + `case "local-zfs":`, + `case "local-lvm":`, + `case "local":`, + `supportsGuestRoots := storageContentIncludes(storage.Content, "images") || storageContentIncludes(storage.Content, "rootdir")`, + "if storage.Shared != 0 {", + }, + "monitor_pve.go": { + "modelNodes, nodeEffectiveStatus, nodeDiskSources := m.pollPVENodesParallel(", + "modelNodes = m.applyStorageFallbackAndRecordNodeMetrics(instanceName, client, modelNodes, nodeDiskSources, localStorageByNode)", + }, + "monitor_pve_storage.go": { + "rank, ok := preferredNodeDiskFallbackRank(storage)", + `(modelNodes[i].Disk.Total == 0 || currentDiskSource == "" || currentDiskSource == "nodes-endpoint")`, + }, + } + + for file, snippets := range requiredSnippets { + data, err := os.ReadFile(file) + if err != nil { + t.Fatalf("failed to read %s: %v", file, err) + } + source := string(data) + for _, snippet := range snippets { + if !strings.Contains(source, snippet) { + t.Fatalf("%s must contain %q", file, snippet) + } + } + } +} + func TestProxmoxGuestPollersCarryPoolIntoCanonicalModels(t *testing.T) { requiredSnippets := map[string][]string{ "monitor_pve_guest_builders.go": {"Pool: strings.TrimSpace(res.Pool)"}, diff --git a/internal/monitoring/memory_trust_characterization_test.go b/internal/monitoring/memory_trust_characterization_test.go index dc372d434..144088501 100644 --- a/internal/monitoring/memory_trust_characterization_test.go +++ b/internal/monitoring/memory_trust_characterization_test.go @@ -98,7 +98,7 @@ func TestPollPVENodeMemoryTrustCharacterization(t *testing.T) { MaxCPU: 8, } - modelNode, _, err := mon.pollPVENode(context.Background(), "test", &mon.config.PVEInstances[0], client, node, "healthy", nil, nil) + modelNode, _, _, err := mon.pollPVENode(context.Background(), "test", &mon.config.PVEInstances[0], client, node, "healthy", nil, nil) if err != nil { t.Fatalf("pollPVENode() error = %v", err) } @@ -147,7 +147,7 @@ func TestPollPVENodePreservesPreviousSnapshotDuringTransientFallback(t *testing. 
MaxCPU: 8, } - first, _, err := mon.pollPVENode(context.Background(), "test", &mon.config.PVEInstances[0], client, node, "healthy", nil, nil) + first, _, _, err := mon.pollPVENode(context.Background(), "test", &mon.config.PVEInstances[0], client, node, "healthy", nil, nil) if err != nil { t.Fatalf("first pollPVENode() error = %v", err) } @@ -156,7 +156,7 @@ func TestPollPVENodePreservesPreviousSnapshotDuringTransientFallback(t *testing. } client.nodeStatus = nil - second, _, err := mon.pollPVENode( + second, _, _, err := mon.pollPVENode( context.Background(), "test", &mon.config.PVEInstances[0], diff --git a/internal/monitoring/monitor_memory_test.go b/internal/monitoring/monitor_memory_test.go index 949ab8159..dc08b4534 100644 --- a/internal/monitoring/monitor_memory_test.go +++ b/internal/monitoring/monitor_memory_test.go @@ -14,9 +14,11 @@ import ( ) type stubPVEClient struct { - nodes []proxmox.Node - nodeStatus *proxmox.NodeStatus - rrdPoints []proxmox.NodeRRDPoint + nodes []proxmox.Node + nodeStatus *proxmox.NodeStatus + storageByNode map[string][]proxmox.Storage + allStorage []proxmox.Storage + rrdPoints []proxmox.NodeRRDPoint } var _ PVEClientInterface = (*stubPVEClient)(nil) @@ -50,11 +52,14 @@ func (s *stubPVEClient) GetContainers(ctx context.Context, node string) ([]proxm } func (s *stubPVEClient) GetStorage(ctx context.Context, node string) ([]proxmox.Storage, error) { - return nil, nil + if s.storageByNode == nil { + return nil, nil + } + return s.storageByNode[node], nil } func (s *stubPVEClient) GetAllStorage(ctx context.Context) ([]proxmox.Storage, error) { - return nil, nil + return s.allStorage, nil } func (s *stubPVEClient) GetBackupTasks(ctx context.Context) ([]proxmox.Task, error) { diff --git a/internal/monitoring/monitor_polling_node.go b/internal/monitoring/monitor_polling_node.go index 1dc1b5b43..b645d0c16 100644 --- a/internal/monitoring/monitor_polling_node.go +++ b/internal/monitoring/monitor_polling_node.go @@ -20,7 +20,7 @@ func (m *Monitor) pollPVENode( connectionHealthStr string, prevNodeMemory map[string]models.Memory, prevInstanceNodes []models.Node, -) (models.Node, string, error) { +) (models.Node, string, string, error) { nodeStart := time.Now() displayName := getNodeDisplayName(instanceCfg, node.Node) connectionHost, guestURL := resolveNodeConnectionInfo(instanceCfg, node.Node) @@ -50,7 +50,8 @@ func (m *Monitor) pollPVENode( ClusterName: instanceCfg.ClusterName, TemperatureMonitoringEnabled: instanceCfg.TemperatureMonitoringEnabled, } - modelNode.Disk, _ = m.resolveNodeDisk(instanceName, nodeID, node.Node, node, nil) + var nodeDiskSource string + modelNode.Disk, nodeDiskSource = m.resolveNodeDisk(instanceName, nodeID, node.Node, node, nil) nodeSnapshotRaw := NodeMemoryRaw{ Total: node.MaxMem, @@ -135,6 +136,7 @@ func (m *Monitor) pollPVENode( if resolvedDisk, diskSource := m.resolveNodeDisk(instanceName, nodeID, node.Node, node, nodeInfo); diskSource != "" { modelNode.Disk = resolvedDisk + nodeDiskSource = diskSource } else { log.Warn(). Str("node", node.Node). 
@@ -242,5 +244,5 @@ func (m *Monitor) pollPVENode( m.applyNodePendingUpdates(ctx, instanceName, client, node, nodeID, effectiveStatus, &modelNode) m.recordNodePollMetrics(instanceName, node, &modelNode, nodeStart) - return modelNode, effectiveStatus, nil + return modelNode, effectiveStatus, nodeDiskSource, nil } diff --git a/internal/monitoring/monitor_pve.go b/internal/monitoring/monitor_pve.go index 4646fd299..9dd33f12f 100644 --- a/internal/monitoring/monitor_pve.go +++ b/internal/monitoring/monitor_pve.go @@ -497,13 +497,15 @@ func (m *Monitor) pollPVENodesParallel( prevNodeMemory map[string]models.Memory, prevInstanceNodes []models.Node, debugEnabled bool, -) ([]models.Node, map[string]string) { +) ([]models.Node, map[string]string, map[string]string) { var modelNodes []models.Node nodeEffectiveStatus := make(map[string]string) + nodeDiskSources := make(map[string]string) type nodePollResult struct { node models.Node effectiveStatus string + diskSource string } resultChan := make(chan nodePollResult, len(nodes)) @@ -521,11 +523,12 @@ func (m *Monitor) pollPVENodesParallel( go func(node proxmox.Node) { defer wg.Done() - modelNode, effectiveStatus, _ := m.pollPVENode(ctx, instanceName, instanceCfg, client, node, connectionHealthStr, prevNodeMemory, prevInstanceNodes) + modelNode, effectiveStatus, diskSource, _ := m.pollPVENode(ctx, instanceName, instanceCfg, client, node, connectionHealthStr, prevNodeMemory, prevInstanceNodes) resultChan <- nodePollResult{ node: modelNode, effectiveStatus: effectiveStatus, + diskSource: diskSource, } }(node) } @@ -536,9 +539,10 @@ func (m *Monitor) pollPVENodesParallel( for res := range resultChan { modelNodes = append(modelNodes, res.node) nodeEffectiveStatus[res.node.Name] = res.effectiveStatus + nodeDiskSources[res.node.Name] = res.diskSource } - return modelNodes, nodeEffectiveStatus + return modelNodes, nodeEffectiveStatus, nodeDiskSources } func (m *Monitor) preserveNodesWhenEmpty(instanceName string, modelNodes []models.Node, prevInstanceNodes []models.Node) []models.Node { @@ -978,7 +982,7 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie prevNodeMemory, prevInstanceNodes := m.snapshotPrevNodes(instanceName) // Convert to models - modelNodes, nodeEffectiveStatus := m.pollPVENodesParallel( + modelNodes, nodeEffectiveStatus, nodeDiskSources := m.pollPVENodesParallel( ctx, instanceName, instanceCfg, @@ -995,7 +999,8 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie // Update state first so we have nodes available m.state.UpdateNodesForInstance(instanceName, modelNodes) - // Storage fallback is used to provide disk metrics when rootfs is not available. + // Storage fallback is used to provide disk metrics when the node summary only + // has the low-confidence /nodes figure or no disk truth at all. // We run this asynchronously with a short timeout so it doesn't block VM/container polling. // This addresses the issue where slow storage APIs (e.g., NFS mounts) can cause the entire // polling task to timeout before reaching VM/container polling. @@ -1024,7 +1029,7 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie // We give the storage fallback goroutine up to 2 additional seconds to finish if it's still running. 
localStorageByNode := m.awaitStorageFallback(instanceName, storageFallback, 2*time.Second) - modelNodes = m.applyStorageFallbackAndRecordNodeMetrics(instanceName, client, modelNodes, localStorageByNode) + modelNodes = m.applyStorageFallbackAndRecordNodeMetrics(instanceName, client, modelNodes, nodeDiskSources, localStorageByNode) // Periodically re-check cluster status for nodes marked as standalone // This addresses issue #437 where clusters aren't detected on first attempt diff --git a/internal/monitoring/monitor_pve_storage.go b/internal/monitoring/monitor_pve_storage.go index bfbfea938..2ff1b577d 100644 --- a/internal/monitoring/monitor_pve_storage.go +++ b/internal/monitoring/monitor_pve_storage.go @@ -94,7 +94,9 @@ func (m *Monitor) startStorageFallback( continue } - // Look for local or local-lvm storage as most stable disk metric + bestRank := 0 + bestFound := false + var bestStorage proxmox.Storage for _, storage := range nodeStorages { if reason, skip := readOnlyFilesystemReason(storage.Type, storage.Total, storage.Used); skip { log.Debug(). @@ -107,25 +109,35 @@ func (m *Monitor) startStorageFallback( Msg("Skipping read-only storage while building disk fallback") continue } - if storage.Storage == "local" || storage.Storage == "local-lvm" { - disk := models.Disk{ - Total: int64(storage.Total), - Used: int64(storage.Used), - Free: int64(storage.Available), - Usage: safePercentage(float64(storage.Used), float64(storage.Total)), - } - // Prefer "local" over "local-lvm" - sf.mu.Lock() - if _, exists := sf.byNode[node.Node]; !exists || storage.Storage == "local" { - sf.byNode[node.Node] = disk - log.Debug(). - Str("node", node.Node). - Str("storage", storage.Storage). - Float64("usage", disk.Usage). - Msg("Using storage for disk metrics fallback") - } - sf.mu.Unlock() + + rank, ok := preferredNodeDiskFallbackRank(storage) + if !ok { + continue } + if !bestFound || rank < bestRank || (rank == bestRank && storage.Total > bestStorage.Total) { + bestRank = rank + bestStorage = storage + bestFound = true + } + } + + if bestFound { + disk := models.Disk{ + Total: int64(bestStorage.Total), + Used: int64(bestStorage.Used), + Free: int64(bestStorage.Available), + Usage: safePercentage(float64(bestStorage.Used), float64(bestStorage.Total)), + } + sf.mu.Lock() + sf.byNode[node.Node] = disk + sf.mu.Unlock() + log.Debug(). + Str("node", node.Node). + Str("storage", bestStorage.Storage). + Str("type", bestStorage.Type). + Int("rank", bestRank). + Float64("usage", disk.Usage). + Msg("Using preferred storage for disk metrics fallback") } } }() @@ -164,17 +176,19 @@ func (m *Monitor) applyStorageFallbackAndRecordNodeMetrics( instanceName string, client PVEClientInterface, modelNodes []models.Node, + nodeDiskSources map[string]string, storageFallback map[string]models.Disk, ) []models.Node { for i := range modelNodes { - if modelNodes[i].Disk.Total == 0 { - if disk, exists := storageFallback[modelNodes[i].Name]; exists { - modelNodes[i].Disk = disk - log.Debug(). - Str("node", modelNodes[i].Name). - Float64("usage", disk.Usage). - Msg("Applied storage fallback for disk metrics") - } + currentDiskSource := nodeDiskSources[modelNodes[i].Name] + if disk, exists := storageFallback[modelNodes[i].Name]; exists && + (modelNodes[i].Disk.Total == 0 || currentDiskSource == "" || currentDiskSource == "nodes-endpoint") { + modelNodes[i].Disk = disk + log.Debug(). + Str("node", modelNodes[i].Name). + Str("previousSource", currentDiskSource). + Float64("usage", disk.Usage). 
+ Msg("Applied storage fallback for disk metrics") } if modelNodes[i].Status == "online" { diff --git a/internal/monitoring/node_disk_sources.go b/internal/monitoring/node_disk_sources.go index 62f2602a4..d4a560cdf 100644 --- a/internal/monitoring/node_disk_sources.go +++ b/internal/monitoring/node_disk_sources.go @@ -72,6 +72,56 @@ func (m *Monitor) resolveNodeDisk( return models.Disk{}, "" } +func preferredNodeDiskFallbackRank(storage proxmox.Storage) (int, bool) { + name := strings.ToLower(strings.TrimSpace(storage.Storage)) + storageType := strings.ToLower(strings.TrimSpace(storage.Type)) + path := strings.TrimSpace(storage.Path) + supportsGuestRoots := storageContentIncludes(storage.Content, "images") || storageContentIncludes(storage.Content, "rootdir") + + switch name { + case "local-zfs": + return 0, true + case "local-lvm": + return 1, true + case "local": + return 2, true + } + + if storage.Shared != 0 { + return 0, false + } + + if supportsGuestRoots { + switch storageType { + case "zfspool", "zfs", "local-zfs": + return 3, true + case "lvmthin", "lvm", "local-lvm": + return 4, true + } + + if storageType == "dir" && path == "/var/lib/vz" { + return 5, true + } + + if strings.HasPrefix(name, "local") { + return 6, true + } + + return 7, true + } + + return 0, false +} + +func storageContentIncludes(content, want string) bool { + for _, part := range strings.Split(content, ",") { + if strings.EqualFold(strings.TrimSpace(part), want) { + return true + } + } + return false +} + func (m *Monitor) linkedHostForNode(instanceName, nodeID, nodeName string) *models.Host { readState := m.GetUnifiedReadStateOrSnapshot() if readState == nil { diff --git a/internal/monitoring/node_memory_sources_test.go b/internal/monitoring/node_memory_sources_test.go index 6240f9fd6..0401a5d01 100644 --- a/internal/monitoring/node_memory_sources_test.go +++ b/internal/monitoring/node_memory_sources_test.go @@ -133,7 +133,7 @@ func TestPollPVENodePrefersLinkedHostDiskOverRootFS(t *testing.T) { }, } - modelNode, _, err := mon.pollPVENode( + modelNode, _, _, err := mon.pollPVENode( context.Background(), "test", &mon.config.PVEInstances[0], @@ -186,3 +186,78 @@ func TestResolveNodeDiskFallsBackToRootFSWithoutLinkedHost(t *testing.T) { t.Fatalf("disk = %+v, want rootfs values", disk) } } + +func TestPollPVEInstancePrefersCanonicalLocalStorageOverNodesEndpoint(t *testing.T) { + t.Setenv("PULSE_DATA_DIR", t.TempDir()) + + client := &stubPVEClient{ + nodes: []proxmox.Node{ + { + Node: "nuc", + Status: "online", + CPU: 0.12, + MaxCPU: 8, + Mem: 4 * 1024 * 1024 * 1024, + MaxMem: 8 * 1024 * 1024 * 1024, + Disk: 92 * 1024 * 1024 * 1024, + MaxDisk: 916 * 1024 * 1024 * 1024, + Uptime: 3600, + }, + }, + allStorage: []proxmox.Storage{{Storage: "local-zfs"}}, + storageByNode: map[string][]proxmox.Storage{ + "nuc": { + { + Storage: "local-zfs", + Type: "zfspool", + Content: "images,rootdir", + Shared: 0, + Total: 944 * 1024 * 1024 * 1024, + Used: 313 * 1024 * 1024 * 1024, + Available: 631 * 1024 * 1024 * 1024, + }, + { + Storage: "t7shield", + Type: "dir", + Content: "backup,iso,vztmpl", + Shared: 0, + Total: 916 * 1024 * 1024 * 1024, + Used: 92 * 1024 * 1024 * 1024, + Available: 824 * 1024 * 1024 * 1024, + }, + { + Storage: "hetzner", + Type: "dir", + Content: "backup", + Shared: 0, + Total: 711 * 1024 * 1024 * 1024, + Used: 109 * 1024 * 1024 * 1024, + Available: 602 * 1024 * 1024 * 1024, + }, + }, + }, + } + + mon := newTestPVEMonitor("test") + defer mon.alertManager.Stop() + defer mon.notificationMgr.Stop() + 
mon.config.PVEInstances[0].MonitorStorage = true + + mon.pollPVEInstance(context.Background(), "test", client) + + snapshot := mon.state.GetSnapshot() + if len(snapshot.Nodes) != 1 { + t.Fatalf("expected one node in state, got %d", len(snapshot.Nodes)) + } + + node := snapshot.Nodes[0] + wantTotal := int64(944 * 1024 * 1024 * 1024) + wantUsed := int64(313 * 1024 * 1024 * 1024) + wantFree := int64(631 * 1024 * 1024 * 1024) + if node.Disk.Total != wantTotal || node.Disk.Used != wantUsed || node.Disk.Free != wantFree { + t.Fatalf("node disk = %+v, want local-zfs fallback totals", node.Disk) + } + if node.Disk.Usage < 33.1 || node.Disk.Usage > 33.2 { + t.Fatalf("node disk usage = %.2f, want local-zfs usage around 33.16", node.Disk.Usage) + } +} diff --git a/scripts/release_control/canonical_completion_guard_test.py b/scripts/release_control/canonical_completion_guard_test.py index f9f2d97ee..789e4eb5b 100644 --- a/scripts/release_control/canonical_completion_guard_test.py +++ b/scripts/release_control/canonical_completion_guard_test.py @@ -2826,14 +2826,14 @@ class CanonicalCompletionGuardTest(unittest.TestCase): { "heading": "## Canonical Files", "path": "internal/unifiedresources/views.go", - "line": 32, + "line": 36, "heading_line": 23, }, { "heading": "## Extension Points", "path": "internal/unifiedresources/views.go", - "line": 47, - "heading_line": 43, + "line": 51, + "heading_line": 47, }, ], ) diff --git a/scripts/release_control/subsystem_lookup_test.py b/scripts/release_control/subsystem_lookup_test.py index a27c816ca..f4286d74d 100644 --- a/scripts/release_control/subsystem_lookup_test.py +++ b/scripts/release_control/subsystem_lookup_test.py @@ -4610,14 +4610,14 @@ class SubsystemLookupTest(unittest.TestCase): { "heading": "## Canonical Files", "path": "internal/unifiedresources/views.go", - "line": 32, + "line": 36, "heading_line": 23, }, { "heading": "## Extension Points", "path": "internal/unifiedresources/views.go", - "line": 47, - "heading_line": 43, + "line": 51, + "heading_line": 47, }, ], )
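
For readers skimming the patch, here is a minimal standalone sketch of the fallback-ranking behaviour it introduces. This is illustrative only: the real logic lives in `preferredNodeDiskFallbackRank` (`internal/monitoring/node_disk_sources.go`) and the best-candidate scan in `startStorageFallback` (`internal/monitoring/monitor_pve_storage.go`); the local `storage` struct below stands in for `proxmox.Storage`, and the selection loop condenses the original. Fed the three storages from the new `nuc` test fixture, it picks `local-zfs` ahead of the larger backup-only mounts:

```go
// Standalone sketch (not part of the patch): mirrors the fallback ranking that
// preferredNodeDiskFallbackRank applies, using a local struct in place of
// proxmox.Storage, and shows which storage the fallback would pick for the
// multi-storage node from the new test fixture.
package main

import (
	"fmt"
	"strings"
)

type storage struct {
	Name    string
	Type    string
	Path    string
	Content string
	Shared  int
	Total   uint64
}

// contentIncludes reports whether a comma-separated Proxmox content list
// (e.g. "images,rootdir") contains the wanted content type.
func contentIncludes(content, want string) bool {
	for _, part := range strings.Split(content, ",") {
		if strings.EqualFold(strings.TrimSpace(part), want) {
			return true
		}
	}
	return false
}

// fallbackRank is a condensed version of preferredNodeDiskFallbackRank:
// lower rank wins; shared and backup-only storages are excluded outright.
func fallbackRank(s storage) (int, bool) {
	name := strings.ToLower(strings.TrimSpace(s.Name))
	switch name {
	case "local-zfs":
		return 0, true
	case "local-lvm":
		return 1, true
	case "local":
		return 2, true
	}
	if s.Shared != 0 {
		return 0, false // shared storages never stand in for node-local disk
	}
	if !contentIncludes(s.Content, "images") && !contentIncludes(s.Content, "rootdir") {
		return 0, false // backup/ISO-only mounts are excluded
	}
	switch strings.ToLower(strings.TrimSpace(s.Type)) {
	case "zfspool", "zfs":
		return 3, true
	case "lvmthin", "lvm":
		return 4, true
	}
	if strings.ToLower(strings.TrimSpace(s.Type)) == "dir" && strings.TrimSpace(s.Path) == "/var/lib/vz" {
		return 5, true
	}
	if strings.HasPrefix(name, "local") {
		return 6, true
	}
	return 7, true
}

func main() {
	// The three storages attached to the "nuc" node in the new test fixture.
	candidates := []storage{
		{Name: "local-zfs", Type: "zfspool", Content: "images,rootdir", Total: 944 << 30},
		{Name: "t7shield", Type: "dir", Content: "backup,iso,vztmpl", Total: 916 << 30},
		{Name: "hetzner", Type: "dir", Content: "backup", Total: 711 << 30},
	}

	var best storage
	bestRank, found := 0, false
	for _, s := range candidates {
		rank, ok := fallbackRank(s)
		if !ok {
			continue
		}
		// Lowest rank wins; ties go to the larger storage, as in startStorageFallback.
		if !found || rank < bestRank || (rank == bestRank && s.Total > best.Total) {
			best, bestRank, found = s, rank, true
		}
	}
	if found {
		fmt.Printf("fallback picks %s (rank %d)\n", best.Name, bestRank) // local-zfs (rank 0)
	}
}
```

Note that the tie-break on `Total` only applies between storages of equal rank, so a large backup-only mount can never outrank a smaller guest-root pool.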
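Similarly, a tiny sketch of the override rule the monitoring.md change describes: the storage-derived fallback replaces a node's disk figure only when the node has no disk truth yet or its active source is the low-confidence `/nodes` endpoint figure. The helper name and the `"linked-host"` label are hypothetical; in the patch the check is inlined in `applyStorageFallbackAndRecordNodeMetrics`, and only `"nodes-endpoint"` is a source string taken from the code.

```go
// Hypothetical helper illustrating the override condition used in
// applyStorageFallbackAndRecordNodeMetrics; not part of the patch.
package main

import "fmt"

func fallbackMayOverride(currentSource string, currentTotal int64) bool {
	return currentTotal == 0 || currentSource == "" || currentSource == "nodes-endpoint"
}

func main() {
	fmt.Println(fallbackMayOverride("nodes-endpoint", 916<<30)) // true: /nodes figure is low-confidence
	fmt.Println(fallbackMayOverride("linked-host", 944<<30))    // false: higher-confidence disk truth stays
	fmt.Println(fallbackMayOverride("", 0))                     // true: no disk truth at all
}
```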