Prefer linked host disk metrics for Proxmox nodes

rcourtman 2026-03-25 16:49:47 +00:00
parent aef1bd41fa
commit fadd3ef2bd
7 changed files with 290 additions and 30 deletions

@ -191,3 +191,10 @@ legacy aliases such as `rrd-available`, `rrd-data`, `node-status-available`,
`calculated`, and `listing` must normalize onto the governed canonical labels
before snapshots are returned to diagnostics consumers, not only when new
snapshots are first recorded.
Node disk-source selection now also routes through one canonical resolver
under `internal/monitoring/`. When a Proxmox node has a linked Pulse host
agent, the node summary must prefer the linked host's canonical disk view over
Proxmox `rootfs` bytes, because dataset-level `rootfs` can materially
under-report the capacity and usage of ZFS-backed nodes. Proxmox `rootfs` and
`/nodes` disk values remain fallbacks, used only when no linked host disk data
is available.
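
A condensed sketch of that precedence (illustrative only; `pickDiskSource` is
a hypothetical name, and the authoritative implementation is the
`resolveNodeDisk` function added by this commit):

func pickDiskSource(linkedHost *models.Host, status *proxmox.NodeStatus, node proxmox.Node) string {
	// 1. A linked, online Pulse host agent with a usable summary disk wins.
	if linkedHost != nil {
		if _, ok := models.SummaryDisk(linkedHost.Disks); ok {
			return "agent"
		}
	}
	// 2. Proxmox node-status rootfs, when it reports a non-zero total.
	if status != nil && status.RootFS != nil && status.RootFS.Total > 0 {
		return "node-status-rootfs"
	}
	// 3. /nodes listing values as the last resort.
	if node.MaxDisk > 0 {
		return "nodes-endpoint"
	}
	return "" // no usable disk source
}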

@ -0,0 +1,25 @@
package models

import "strings"

// SummaryDisk selects the canonical summary disk for host-level metrics.
// Prefer the root-mounted filesystem when present; otherwise fall back to the
// first disk with non-zero capacity in the existing collection order.
func SummaryDisk(disks []Disk) (Disk, bool) {
	for _, disk := range disks {
		if disk.Total <= 0 {
			continue
		}
		if strings.TrimSpace(disk.Mountpoint) == "/" {
			return disk, true
		}
	}
	for _, disk := range disks {
		if disk.Total > 0 {
			return disk, true
		}
	}
	return Disk{}, false
}
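
A minimal usage sketch (hypothetical values; the `internal/...` import only
compiles from within this module):

package main

import (
	"fmt"

	"github.com/rcourtman/pulse-go-rewrite/internal/models"
)

func main() {
	disks := []models.Disk{
		{Mountpoint: "/boot", Total: 1},
		{Mountpoint: "/", Total: 975, Used: 410, Free: 565, Usage: 42.05},
	}
	// The root mount wins even though /boot appears first in collection order.
	if disk, ok := models.SummaryDisk(disks); ok {
		fmt.Printf("summary disk %s: %d of %d used\n", disk.Mountpoint, disk.Used, disk.Total)
	}
}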

@ -0,0 +1,47 @@
package models

import "testing"

func TestSummaryDiskPrefersRootMount(t *testing.T) {
	disks := []Disk{
		{Mountpoint: "/boot", Total: 1, Used: 1},
		{Mountpoint: "/", Total: 100, Used: 40, Free: 60, Usage: 40},
		{Mountpoint: "/var", Total: 200, Used: 50},
	}
	got, ok := SummaryDisk(disks)
	if !ok {
		t.Fatal("expected summary disk")
	}
	if got.Mountpoint != "/" {
		t.Fatalf("summary mountpoint = %q, want /", got.Mountpoint)
	}
	if got.Total != 100 || got.Used != 40 {
		t.Fatalf("summary disk = %+v, want root disk", got)
	}
}

func TestSummaryDiskFallsBackToFirstNonZeroDisk(t *testing.T) {
	disks := []Disk{
		{Mountpoint: "", Total: 0},
		{Mountpoint: "/tank", Total: 400, Used: 100, Free: 300, Usage: 25},
		{Mountpoint: "/backup", Total: 800, Used: 200},
	}
	got, ok := SummaryDisk(disks)
	if !ok {
		t.Fatal("expected summary disk")
	}
	if got.Mountpoint != "/tank" {
		t.Fatalf("summary mountpoint = %q, want /tank", got.Mountpoint)
	}
}

func TestSummaryDiskReturnsFalseWhenNoUsableDiskExists(t *testing.T) {
	if _, ok := SummaryDisk(nil); ok {
		t.Fatal("expected no summary disk for nil input")
	}
	if _, ok := SummaryDisk([]Disk{{Mountpoint: "/", Total: 0}}); ok {
		t.Fatal("expected no summary disk for zero-capacity disks")
	}
}

@ -107,6 +107,23 @@ func TestLegacyMemorySourceAliasesRemainCanonicalized(t *testing.T) {
	}
}

// Characterization guard: node polling must route disk selection through the
// canonical resolver instead of computing disk values inline.
func TestProxmoxNodeDiskUsesCanonicalResolver(t *testing.T) {
	data, err := os.ReadFile("monitor_polling_node.go")
	if err != nil {
		t.Fatalf("failed to read monitor_polling_node.go: %v", err)
	}
	source := string(data)

	requiredSnippets := []string{
		"modelNode.Disk, _ = m.resolveNodeDisk(",
		"if resolvedDisk, diskSource := m.resolveNodeDisk(",
	}
	for _, snippet := range requiredSnippets {
		if !strings.Contains(source, snippet) {
			t.Fatalf("monitor_polling_node.go must contain %q", snippet)
		}
	}
}

func TestAlertLifecycleCanonicalChangesRemainWritable(t *testing.T) {
	store := unifiedresources.NewMemoryStore()
	incidentStore := memory.NewIncidentStore(memory.IncidentStoreConfig{})

@ -42,12 +42,6 @@ func (m *Monitor) pollPVENode(
			Free:  int64(node.MaxMem - node.Mem),
			Usage: safePercentage(float64(node.Mem), float64(node.MaxMem)),
		},
		Disk: models.Disk{
			Total: int64(node.MaxDisk),
			Used:  int64(node.Disk),
			Free:  int64(node.MaxDisk - node.Disk),
			Usage: safePercentage(float64(node.Disk), float64(node.MaxDisk)),
		},
		Uptime:      int64(node.Uptime),
		LoadAverage: []float64{},
		LastSeen:    time.Now(),
@ -56,6 +50,7 @@ func (m *Monitor) pollPVENode(
		ClusterName:                  instanceCfg.ClusterName,
		TemperatureMonitoringEnabled: instanceCfg.TemperatureMonitoringEnabled,
	}
	modelNode.Disk, _ = m.resolveNodeDisk(instanceName, nodeID, node.Node, node, nil)

	nodeSnapshotRaw := NodeMemoryRaw{
		Total: node.MaxMem,
@ -138,31 +133,9 @@ func (m *Monitor) pollPVENode(
		modelNode.KernelVersion = nodeInfo.KernelVersion
		modelNode.PVEVersion = nodeInfo.PVEVersion

		// Prefer rootfs data for more accurate disk metrics, but ensure we have valid fallback
		if nodeInfo.RootFS != nil && nodeInfo.RootFS.Total > 0 {
			modelNode.Disk = models.Disk{
				Total: int64(nodeInfo.RootFS.Total),
				Used:  int64(nodeInfo.RootFS.Used),
				Free:  int64(nodeInfo.RootFS.Free),
				Usage: safePercentage(float64(nodeInfo.RootFS.Used), float64(nodeInfo.RootFS.Total)),
			}
			log.Debug().
				Str("node", node.Node).
				Uint64("rootfsUsed", nodeInfo.RootFS.Used).
				Uint64("rootfsTotal", nodeInfo.RootFS.Total).
				Float64("rootfsUsage", modelNode.Disk.Usage).
				Msg("Using rootfs for disk metrics")
		} else if node.Disk > 0 && node.MaxDisk > 0 {
			// RootFS unavailable but we have valid disk data from /nodes endpoint
			// Keep the values we already set from the nodes list
			log.Debug().
				Str("node", node.Node).
				Bool("rootfsNil", nodeInfo.RootFS == nil).
				Uint64("fallbackDisk", node.Disk).
				Uint64("fallbackMaxDisk", node.MaxDisk).
				Msg("RootFS data unavailable - using /nodes endpoint disk metrics")
		if resolvedDisk, diskSource := m.resolveNodeDisk(instanceName, nodeID, node.Node, node, nodeInfo); diskSource != "" {
			modelNode.Disk = resolvedDisk
		} else {
			// Neither rootfs nor valid node disk data available
			log.Warn().
				Str("node", node.Node).
				Bool("rootfsNil", nodeInfo.RootFS == nil).

@ -0,0 +1,104 @@
package monitoring

import (
	"strings"

	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
	"github.com/rs/zerolog/log"
)

// resolveNodeDisk selects the canonical disk metrics for a Proxmox node,
// preferring a linked Pulse host agent's summary disk, then Proxmox rootfs,
// then the /nodes listing values. The second return value names the source
// ("agent", "node-status-rootfs", or "nodes-endpoint") and is empty when no
// usable source exists.
func (m *Monitor) resolveNodeDisk(
	instanceName string,
	nodeID string,
	nodeName string,
	node proxmox.Node,
	nodeInfo *proxmox.NodeStatus,
) (models.Disk, string) {
	if linkedHost := m.linkedHostForNode(instanceName, nodeID, nodeName); linkedHost != nil {
		if disk, ok := models.SummaryDisk(linkedHost.Disks); ok {
			resolved := models.Disk{
				Total: disk.Total,
				Used:  disk.Used,
				Free:  disk.Free,
				Usage: disk.Usage,
			}
			log.Debug().
				Str("instance", instanceName).
				Str("node", nodeName).
				Str("hostAgentID", linkedHost.ID).
				Int64("total", resolved.Total).
				Int64("used", resolved.Used).
				Float64("usage", resolved.Usage).
				Msg("Node disk: using linked Pulse host agent disk summary")
			return resolved, "agent"
		}
	}

	if nodeInfo != nil && nodeInfo.RootFS != nil && nodeInfo.RootFS.Total > 0 {
		resolved := models.Disk{
			Total: int64(nodeInfo.RootFS.Total),
			Used:  int64(nodeInfo.RootFS.Used),
			Free:  int64(nodeInfo.RootFS.Free),
			Usage: safePercentage(float64(nodeInfo.RootFS.Used), float64(nodeInfo.RootFS.Total)),
		}
		log.Debug().
			Str("instance", instanceName).
			Str("node", nodeName).
			Uint64("rootfsUsed", nodeInfo.RootFS.Used).
			Uint64("rootfsTotal", nodeInfo.RootFS.Total).
			Float64("rootfsUsage", resolved.Usage).
			Msg("Node disk: using Proxmox rootfs metrics")
		return resolved, "node-status-rootfs"
	}

	if node.MaxDisk > 0 {
		resolved := models.Disk{
			Total: int64(node.MaxDisk),
			Used:  int64(node.Disk),
			Free:  int64(node.MaxDisk - node.Disk),
			Usage: safePercentage(float64(node.Disk), float64(node.MaxDisk)),
		}
		log.Debug().
			Str("instance", instanceName).
			Str("node", nodeName).
			Uint64("disk", node.Disk).
			Uint64("maxDisk", node.MaxDisk).
			Float64("usage", resolved.Usage).
			Msg("Node disk: using /nodes endpoint metrics")
		return resolved, "nodes-endpoint"
	}

	return models.Disk{}, ""
}

// linkedHostForNode returns the online Pulse host agent linked to the node
// identified by nodeID or nodeName, or nil when no such host is known.
func (m *Monitor) linkedHostForNode(instanceName, nodeID, nodeName string) *models.Host {
	readState := m.GetUnifiedReadStateOrSnapshot()
	if readState == nil {
		return nil
	}
	linkedAgentID := ""
	for _, existingNode := range nodesForInstanceFromReadState(readState, instanceName) {
		if existingNode.ID == nodeID || strings.EqualFold(existingNode.Name, nodeName) {
			linkedAgentID = strings.TrimSpace(existingNode.LinkedAgentID)
			break
		}
	}
	if linkedAgentID == "" {
		return nil
	}
	for _, host := range hostsFromReadState(readState) {
		if strings.TrimSpace(host.ID) != linkedAgentID {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(host.Status), "online") {
			return nil
		}
		resolved := host
		return &resolved
	}
	return nil
}

@ -4,6 +4,7 @@ import (
"context"
"testing"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
)
@ -99,3 +100,89 @@ func TestResolveNodeMemoryCharacterization(t *testing.T) {
		})
	}
}

func TestPollPVENodePrefersLinkedHostDiskOverRootFS(t *testing.T) {
	mon := newTestPVEMonitor("test")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()

	mon.state.UpsertHost(models.Host{
		ID:       "host-1",
		Hostname: "ebringa",
		Status:   "online",
		Disks: []models.Disk{
			{Mountpoint: "/", Type: "zfs", Total: 975, Used: 410, Free: 565, Usage: 42.0512820513},
		},
	})
	mon.state.UpdateNodesForInstance("test", []models.Node{
		{
			ID:            "test-ebringa",
			Name:          "ebringa",
			Instance:      "test",
			LinkedAgentID: "host-1",
		},
	})

	// Proxmox rootfs under-reports usage (3 of 975) relative to the linked
	// host's ZFS view (410 of 975); the resolver must prefer the latter.
	client := &stubPVEClient{
		nodeStatus: &proxmox.NodeStatus{
			RootFS: &proxmox.RootFS{
				Total: 975,
				Used:  3,
				Free:  972,
			},
		},
	}

	modelNode, _, err := mon.pollPVENode(
		context.Background(),
		"test",
		&mon.config.PVEInstances[0],
		client,
		proxmox.Node{
			Node:    "ebringa",
			Status:  "online",
			MaxDisk: 975,
			Disk:    3,
		},
		"healthy",
		nil,
		nil,
	)
	if err != nil {
		t.Fatalf("pollPVENode() error = %v", err)
	}
	if modelNode.Disk.Total != 975 || modelNode.Disk.Used != 410 || modelNode.Disk.Free != 565 {
		t.Fatalf("node disk = %+v, want linked host summary", modelNode.Disk)
	}
	if modelNode.Disk.Usage < 42 || modelNode.Disk.Usage > 42.1 {
		t.Fatalf("node disk usage = %.2f, want linked host usage around 42.05", modelNode.Disk.Usage)
	}
}

func TestResolveNodeDiskFallsBackToRootFSWithoutLinkedHost(t *testing.T) {
	mon := newTestPVEMonitor("test")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()

	disk, source := mon.resolveNodeDisk(
		"test",
		"test-node1",
		"node1",
		proxmox.Node{Node: "node1", Disk: 10, MaxDisk: 100},
		&proxmox.NodeStatus{
			RootFS: &proxmox.RootFS{
				Total: 200,
				Used:  50,
				Free:  150,
			},
		},
	)
	if source != "node-status-rootfs" {
		t.Fatalf("source = %q, want node-status-rootfs", source)
	}
	if disk.Total != 200 || disk.Used != 50 || disk.Free != 150 {
		t.Fatalf("disk = %+v, want rootfs values", disk)
	}
}