From fadd3ef2bdf704cab256c6f90f7ca260d02cecc0 Mon Sep 17 00:00:00 2001
From: rcourtman
Date: Wed, 25 Mar 2026 16:49:47 +0000
Subject: [PATCH] Prefer linked host disk metrics for Proxmox nodes

---
 .../v6/internal/subsystems/monitoring.md     |   7 ++
 internal/models/disk_summary.go              |  25 +++++
 internal/models/disk_summary_test.go         |  47 ++++++++
 .../monitoring/canonical_guardrails_test.go  |  17 +++
 internal/monitoring/monitor_polling_node.go  |  33 +-----
 internal/monitoring/node_disk_sources.go     | 104 ++++++++++++++++++
 .../monitoring/node_memory_sources_test.go   |  87 +++++++++++++++
 7 files changed, 290 insertions(+), 30 deletions(-)
 create mode 100644 internal/models/disk_summary.go
 create mode 100644 internal/models/disk_summary_test.go
 create mode 100644 internal/monitoring/node_disk_sources.go

diff --git a/docs/release-control/v6/internal/subsystems/monitoring.md b/docs/release-control/v6/internal/subsystems/monitoring.md
index 4fa272142..d674e6ccd 100644
--- a/docs/release-control/v6/internal/subsystems/monitoring.md
+++ b/docs/release-control/v6/internal/subsystems/monitoring.md
@@ -191,3 +191,10 @@ legacy aliases such as `rrd-available`, `rrd-data`, `node-status-available`,
 `calculated`, and `listing` must normalize onto the governed canonical labels
 before snapshots are returned to diagnostics consumers, not only when new
 snapshots are first recorded.
+Node disk-source selection now also routes through one canonical resolver
+under `internal/monitoring/`. When a Proxmox node has a linked Pulse host
+agent, the node summary must prefer the linked host's canonical disk view over
+Proxmox `rootfs` bytes because dataset-level `rootfs` can materially
+under-report ZFS-backed node capacity and usage. Proxmox `rootfs` and `/nodes`
+disk values remain fallback sources only when no linked host disk truth is
+available.
diff --git a/internal/models/disk_summary.go b/internal/models/disk_summary.go
new file mode 100644
index 000000000..0518a2f49
--- /dev/null
+++ b/internal/models/disk_summary.go
@@ -0,0 +1,25 @@
+package models
+
+import "strings"
+
+// SummaryDisk selects the canonical summary disk for host-level metrics.
+// Prefer the root-mounted filesystem when present; otherwise fall back to the
+// first disk with non-zero capacity in the existing collection order.
+func SummaryDisk(disks []Disk) (Disk, bool) {
+	for _, disk := range disks {
+		if disk.Total <= 0 {
+			continue
+		}
+		if strings.TrimSpace(disk.Mountpoint) == "/" {
+			return disk, true
+		}
+	}
+
+	for _, disk := range disks {
+		if disk.Total > 0 {
+			return disk, true
+		}
+	}
+
+	return Disk{}, false
+}
diff --git a/internal/models/disk_summary_test.go b/internal/models/disk_summary_test.go
new file mode 100644
index 000000000..189ce7f9f
--- /dev/null
+++ b/internal/models/disk_summary_test.go
@@ -0,0 +1,47 @@
+package models
+
+import "testing"
+
+func TestSummaryDiskPrefersRootMount(t *testing.T) {
+	disks := []Disk{
+		{Mountpoint: "/boot", Total: 1, Used: 1},
+		{Mountpoint: "/", Total: 100, Used: 40, Free: 60, Usage: 40},
+		{Mountpoint: "/var", Total: 200, Used: 50},
+	}
+
+	got, ok := SummaryDisk(disks)
+	if !ok {
+		t.Fatal("expected summary disk")
+	}
+	if got.Mountpoint != "/" {
+		t.Fatalf("summary mountpoint = %q, want /", got.Mountpoint)
+	}
+	if got.Total != 100 || got.Used != 40 {
+		t.Fatalf("summary disk = %+v, want root disk", got)
+	}
+}
+
+func TestSummaryDiskFallsBackToFirstNonZeroDisk(t *testing.T) {
+	disks := []Disk{
+		{Mountpoint: "", Total: 0},
+		{Mountpoint: "/tank", Total: 400, Used: 100, Free: 300, Usage: 25},
+		{Mountpoint: "/backup", Total: 800, Used: 200},
+	}
+
+	got, ok := SummaryDisk(disks)
+	if !ok {
+		t.Fatal("expected summary disk")
+	}
+	if got.Mountpoint != "/tank" {
+		t.Fatalf("summary mountpoint = %q, want /tank", got.Mountpoint)
+	}
+}
+
+func TestSummaryDiskReturnsFalseWhenNoUsableDiskExists(t *testing.T) {
+	if _, ok := SummaryDisk(nil); ok {
+		t.Fatal("expected no summary disk for nil input")
+	}
+	if _, ok := SummaryDisk([]Disk{{Mountpoint: "/", Total: 0}}); ok {
+		t.Fatal("expected no summary disk for zero-capacity disks")
+	}
+}
diff --git a/internal/monitoring/canonical_guardrails_test.go b/internal/monitoring/canonical_guardrails_test.go
index b67addb1c..43a7107c2 100644
--- a/internal/monitoring/canonical_guardrails_test.go
+++ b/internal/monitoring/canonical_guardrails_test.go
@@ -107,6 +107,23 @@ func TestLegacyMemorySourceAliasesRemainCanonicalized(t *testing.T) {
 	}
 }
 
+func TestProxmoxNodeDiskUsesCanonicalResolver(t *testing.T) {
+	data, err := os.ReadFile("monitor_polling_node.go")
+	if err != nil {
+		t.Fatalf("failed to read monitor_polling_node.go: %v", err)
+	}
+	source := string(data)
+	requiredSnippets := []string{
+		"modelNode.Disk, _ = m.resolveNodeDisk(",
+		"if resolvedDisk, diskSource := m.resolveNodeDisk(",
+	}
+	for _, snippet := range requiredSnippets {
+		if !strings.Contains(source, snippet) {
+			t.Fatalf("monitor_polling_node.go must contain %q", snippet)
+		}
+	}
+}
+
 func TestAlertLifecycleCanonicalChangesRemainWritable(t *testing.T) {
 	store := unifiedresources.NewMemoryStore()
 	incidentStore := memory.NewIncidentStore(memory.IncidentStoreConfig{})
diff --git a/internal/monitoring/monitor_polling_node.go b/internal/monitoring/monitor_polling_node.go
index c5d179ba7..1dc1b5b43 100644
--- a/internal/monitoring/monitor_polling_node.go
+++ b/internal/monitoring/monitor_polling_node.go
@@ -42,12 +42,6 @@ func (m *Monitor) pollPVENode(
 			Free:  int64(node.MaxMem - node.Mem),
 			Usage: safePercentage(float64(node.Mem), float64(node.MaxMem)),
 		},
-		Disk: models.Disk{
-			Total: int64(node.MaxDisk),
-			Used:  int64(node.Disk),
-			Free:  int64(node.MaxDisk - node.Disk),
-			Usage: safePercentage(float64(node.Disk), float64(node.MaxDisk)),
-		},
 		Uptime:      int64(node.Uptime),
 		LoadAverage: []float64{},
 		LastSeen:    time.Now(),
@@ -56,6 +50,7 @@ func (m *Monitor) pollPVENode(
 		ClusterName:                  instanceCfg.ClusterName,
 		TemperatureMonitoringEnabled: instanceCfg.TemperatureMonitoringEnabled,
 	}
+	modelNode.Disk, _ = m.resolveNodeDisk(instanceName, nodeID, node.Node, node, nil)
 
 	nodeSnapshotRaw := NodeMemoryRaw{
 		Total: node.MaxMem,
@@ -138,31 +133,9 @@ func (m *Monitor) pollPVENode(
 	modelNode.KernelVersion = nodeInfo.KernelVersion
 	modelNode.PVEVersion = nodeInfo.PVEVersion
 
-	// Prefer rootfs data for more accurate disk metrics, but ensure we have valid fallback
-	if nodeInfo.RootFS != nil && nodeInfo.RootFS.Total > 0 {
-		modelNode.Disk = models.Disk{
-			Total: int64(nodeInfo.RootFS.Total),
-			Used:  int64(nodeInfo.RootFS.Used),
-			Free:  int64(nodeInfo.RootFS.Free),
-			Usage: safePercentage(float64(nodeInfo.RootFS.Used), float64(nodeInfo.RootFS.Total)),
-		}
-		log.Debug().
-			Str("node", node.Node).
-			Uint64("rootfsUsed", nodeInfo.RootFS.Used).
-			Uint64("rootfsTotal", nodeInfo.RootFS.Total).
-			Float64("rootfsUsage", modelNode.Disk.Usage).
-			Msg("Using rootfs for disk metrics")
-	} else if node.Disk > 0 && node.MaxDisk > 0 {
-		// RootFS unavailable but we have valid disk data from /nodes endpoint
-		// Keep the values we already set from the nodes list
-		log.Debug().
-			Str("node", node.Node).
-			Bool("rootfsNil", nodeInfo.RootFS == nil).
-			Uint64("fallbackDisk", node.Disk).
-			Uint64("fallbackMaxDisk", node.MaxDisk).
-			Msg("RootFS data unavailable - using /nodes endpoint disk metrics")
+	if resolvedDisk, diskSource := m.resolveNodeDisk(instanceName, nodeID, node.Node, node, nodeInfo); diskSource != "" {
+		modelNode.Disk = resolvedDisk
 	} else {
-		// Neither rootfs nor valid node disk data available
 		log.Warn().
 			Str("node", node.Node).
 			Bool("rootfsNil", nodeInfo.RootFS == nil).
diff --git a/internal/monitoring/node_disk_sources.go b/internal/monitoring/node_disk_sources.go
new file mode 100644
index 000000000..62f2602a4
--- /dev/null
+++ b/internal/monitoring/node_disk_sources.go
@@ -0,0 +1,104 @@
+package monitoring
+
+import (
+	"strings"
+
+	"github.com/rcourtman/pulse-go-rewrite/internal/models"
+	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
+	"github.com/rs/zerolog/log"
+)
+
+func (m *Monitor) resolveNodeDisk(
+	instanceName string,
+	nodeID string,
+	nodeName string,
+	node proxmox.Node,
+	nodeInfo *proxmox.NodeStatus,
+) (models.Disk, string) {
+	if linkedHost := m.linkedHostForNode(instanceName, nodeID, nodeName); linkedHost != nil {
+		if disk, ok := models.SummaryDisk(linkedHost.Disks); ok {
+			resolved := models.Disk{
+				Total: disk.Total,
+				Used:  disk.Used,
+				Free:  disk.Free,
+				Usage: disk.Usage,
+			}
+			log.Debug().
+				Str("instance", instanceName).
+				Str("node", nodeName).
+				Str("hostAgentID", linkedHost.ID).
+				Int64("total", resolved.Total).
+				Int64("used", resolved.Used).
+				Float64("usage", resolved.Usage).
+				Msg("Node disk: using linked Pulse host agent disk summary")
+			return resolved, "agent"
+		}
+	}
+
+	if nodeInfo != nil && nodeInfo.RootFS != nil && nodeInfo.RootFS.Total > 0 {
+		resolved := models.Disk{
+			Total: int64(nodeInfo.RootFS.Total),
+			Used:  int64(nodeInfo.RootFS.Used),
+			Free:  int64(nodeInfo.RootFS.Free),
+			Usage: safePercentage(float64(nodeInfo.RootFS.Used), float64(nodeInfo.RootFS.Total)),
+		}
+		log.Debug().
+			Str("instance", instanceName).
+			Str("node", nodeName).
+			Uint64("rootfsUsed", nodeInfo.RootFS.Used).
+			Uint64("rootfsTotal", nodeInfo.RootFS.Total).
+			Float64("rootfsUsage", resolved.Usage).
+			Msg("Node disk: using Proxmox rootfs metrics")
+		return resolved, "node-status-rootfs"
+	}
+
+	if node.MaxDisk > 0 {
+		resolved := models.Disk{
+			Total: int64(node.MaxDisk),
+			Used:  int64(node.Disk),
+			Free:  int64(node.MaxDisk - node.Disk),
+			Usage: safePercentage(float64(node.Disk), float64(node.MaxDisk)),
+		}
+		log.Debug().
+			Str("instance", instanceName).
+			Str("node", nodeName).
+			Uint64("disk", node.Disk).
+			Uint64("maxDisk", node.MaxDisk).
+			Float64("usage", resolved.Usage).
+			Msg("Node disk: using /nodes endpoint metrics")
+		return resolved, "nodes-endpoint"
+	}
+
+	return models.Disk{}, ""
+}
+
+func (m *Monitor) linkedHostForNode(instanceName, nodeID, nodeName string) *models.Host {
+	readState := m.GetUnifiedReadStateOrSnapshot()
+	if readState == nil {
+		return nil
+	}
+
+	linkedAgentID := ""
+	for _, existingNode := range nodesForInstanceFromReadState(readState, instanceName) {
+		if existingNode.ID == nodeID || strings.EqualFold(existingNode.Name, nodeName) {
+			linkedAgentID = strings.TrimSpace(existingNode.LinkedAgentID)
+			break
+		}
+	}
+	if linkedAgentID == "" {
+		return nil
+	}
+
+	for _, host := range hostsFromReadState(readState) {
+		if strings.TrimSpace(host.ID) != linkedAgentID {
+			continue
+		}
+		if !strings.EqualFold(strings.TrimSpace(host.Status), "online") {
+			return nil
+		}
+		resolved := host
+		return &resolved
+	}
+
+	return nil
+}
diff --git a/internal/monitoring/node_memory_sources_test.go b/internal/monitoring/node_memory_sources_test.go
index 67a75acd8..6240f9fd6 100644
--- a/internal/monitoring/node_memory_sources_test.go
+++ b/internal/monitoring/node_memory_sources_test.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"testing"
 
+	"github.com/rcourtman/pulse-go-rewrite/internal/models"
 	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
 )
 
@@ -99,3 +100,89 @@ func TestResolveNodeMemoryCharacterization(t *testing.T) {
 		})
 	}
 }
+
+func TestPollPVENodePrefersLinkedHostDiskOverRootFS(t *testing.T) {
+	mon := newTestPVEMonitor("test")
+	defer mon.alertManager.Stop()
+	defer mon.notificationMgr.Stop()
+
+	mon.state.UpsertHost(models.Host{
+		ID:       "host-1",
+		Hostname: "ebringa",
+		Status:   "online",
+		Disks: []models.Disk{
+			{Mountpoint: "/", Type: "zfs", Total: 975, Used: 410, Free: 565, Usage: 42.0512820513},
+		},
+	})
+	mon.state.UpdateNodesForInstance("test", []models.Node{
+		{
+			ID:            "test-ebringa",
+			Name:          "ebringa",
+			Instance:      "test",
+			LinkedAgentID: "host-1",
+		},
+	})
+
+	client := &stubPVEClient{
+		nodeStatus: &proxmox.NodeStatus{
+			RootFS: &proxmox.RootFS{
+				Total: 975,
+				Used:  3,
+				Free:  972,
+			},
+		},
+	}
+
+	modelNode, _, err := mon.pollPVENode(
+		context.Background(),
+		"test",
+		&mon.config.PVEInstances[0],
+		client,
+		proxmox.Node{
+			Node:    "ebringa",
+			Status:  "online",
+			MaxDisk: 975,
+			Disk:    3,
+		},
+		"healthy",
+		nil,
+		nil,
+	)
+	if err != nil {
+		t.Fatalf("pollPVENode() error = %v", err)
+	}
+
+	if modelNode.Disk.Total != 975 || modelNode.Disk.Used != 410 || modelNode.Disk.Free != 565 {
+		t.Fatalf("node disk = %+v, want linked host summary", modelNode.Disk)
+	}
+	if modelNode.Disk.Usage < 42 || modelNode.Disk.Usage > 42.1 {
+		t.Fatalf("node disk usage = %.2f, want linked host usage around 42.05", modelNode.Disk.Usage)
+	}
+}
+
+func TestResolveNodeDiskFallsBackToRootFSWithoutLinkedHost(t *testing.T) {
+	mon := newTestPVEMonitor("test")
+	defer mon.alertManager.Stop()
+	defer mon.notificationMgr.Stop()
+
+	disk, source := mon.resolveNodeDisk(
+		"test",
+		"test-node1",
+		"node1",
+		proxmox.Node{Node: "node1", Disk: 10, MaxDisk: 100},
+		&proxmox.NodeStatus{
+			RootFS: &proxmox.RootFS{
+				Total: 200,
+				Used:  50,
+				Free:  150,
+			},
+		},
+	)
+
+	if source != "node-status-rootfs" {
+		t.Fatalf("source = %q, want node-status-rootfs", source)
+	}
+	if disk.Total != 200 || disk.Used != 50 || disk.Free != 150 {
+		t.Fatalf("disk = %+v, want rootfs values", disk)
+	}
+}