// Pulse/internal/monitoring/monitor_memory_test.go

package monitoring
import (
"context"
"fmt"
"math"
"testing"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/internal/notifications"
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
)
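
// stubPVEClient is a canned PVEClientInterface implementation for these tests:
// it serves the configured nodes, node status, and RRD points, can be made to
// fail RRD fetches via rrdErr, and counts guest-agent meminfo calls.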
type stubPVEClient struct {
nodes []proxmox.Node
nodeStatus *proxmox.NodeStatus
rrdPoints []proxmox.NodeRRDPoint
rrdErr error // if set, GetNodeRRDData returns this error
vmMemAvail uint64
vmMemErr error
vmMemCalls int
}
var _ PVEClientInterface = (*stubPVEClient)(nil)
func (s *stubPVEClient) GetNodes(ctx context.Context) ([]proxmox.Node, error) {
return s.nodes, nil
}
func (s *stubPVEClient) GetNodeStatus(ctx context.Context, node string) (*proxmox.NodeStatus, error) {
return s.nodeStatus, nil
}
func (s *stubPVEClient) GetNodeRRDData(ctx context.Context, node, timeframe, cf string, ds []string) ([]proxmox.NodeRRDPoint, error) {
if s.rrdErr != nil {
return nil, s.rrdErr
}
return s.rrdPoints, nil
}
func (s *stubPVEClient) GetLXCRRDData(ctx context.Context, node string, vmid int, timeframe, cf string, ds []string) ([]proxmox.GuestRRDPoint, error) {
return nil, nil
}
func (s *stubPVEClient) GetVMRRDData(ctx context.Context, node string, vmid int, timeframe, cf string, ds []string) ([]proxmox.GuestRRDPoint, error) {
return nil, nil
}
func (s *stubPVEClient) GetVMs(ctx context.Context, node string) ([]proxmox.VM, error) {
return nil, nil
}
func (s *stubPVEClient) GetContainers(ctx context.Context, node string) ([]proxmox.Container, error) {
return nil, nil
}
func (s *stubPVEClient) GetStorage(ctx context.Context, node string) ([]proxmox.Storage, error) {
return nil, nil
}
func (s *stubPVEClient) GetAllStorage(ctx context.Context) ([]proxmox.Storage, error) {
return nil, nil
}
func (s *stubPVEClient) GetBackupTasks(ctx context.Context) ([]proxmox.Task, error) {
return nil, nil
}
func (s *stubPVEClient) GetReplicationStatus(ctx context.Context) ([]proxmox.ReplicationJob, error) {
return nil, nil
}
func (s *stubPVEClient) GetStorageContent(ctx context.Context, node, storage string) ([]proxmox.StorageContent, error) {
return nil, nil
}
func (s *stubPVEClient) GetVMSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error) {
return nil, nil
}
func (s *stubPVEClient) GetContainerSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error) {
return nil, nil
}
func (s *stubPVEClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
return nil, nil
}
func (s *stubPVEClient) GetContainerStatus(ctx context.Context, node string, vmid int) (*proxmox.Container, error) {
return nil, nil
}
func (s *stubPVEClient) GetContainerConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
return nil, nil
}
func (s *stubPVEClient) GetContainerInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.ContainerInterface, error) {
return nil, nil
}
func (s *stubPVEClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
return nil, nil
}
func (s *stubPVEClient) IsClusterMember(ctx context.Context) (bool, error) {
return false, nil
}
func (s *stubPVEClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
return nil, nil
}
func (s *stubPVEClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error) {
return nil, nil
}
func (s *stubPVEClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
return map[string]interface{}{}, nil
}
func (s *stubPVEClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
return "", nil
}
func (s *stubPVEClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
s.vmMemCalls++
if s.vmMemErr != nil {
return 0, s.vmMemErr
}
return s.vmMemAvail, nil
}
func (s *stubPVEClient) GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error) {
return nil, nil
}
func (s *stubPVEClient) GetZFSPoolsWithDetails(ctx context.Context, node string) ([]proxmox.ZFSPoolInfo, error) {
return nil, nil
}
func (s *stubPVEClient) GetDisks(ctx context.Context, node string) ([]proxmox.Disk, error) {
return nil, nil
}
func (s *stubPVEClient) GetCephStatus(ctx context.Context) (*proxmox.CephStatus, error) {
return nil, nil
}
func (s *stubPVEClient) GetCephDF(ctx context.Context) (*proxmox.CephDF, error) {
return nil, nil
}
func (s *stubPVEClient) GetNodePendingUpdates(ctx context.Context, node string) ([]proxmox.AptPackage, error) {
return nil, nil
}
func floatPtr(v float64) *float64 { return &v }
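
// newTestPVEMonitor builds a Monitor with in-memory state, fresh alert and
// notification managers, and empty caches so PVE polling can run in isolation.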
func newTestPVEMonitor(instanceName string) *Monitor {
return &Monitor{
config: &config.Config{
PVEInstances: []config.PVEInstance{{
Name: instanceName,
Host: "https://pve",
}},
},
state: models.NewState(),
alertManager: alerts.NewManager(),
notificationMgr: notifications.NewNotificationManager(""),
metricsHistory: NewMetricsHistory(32, time.Hour),
nodeSnapshots: make(map[string]NodeMemorySnapshot),
guestSnapshots: make(map[string]GuestMemorySnapshot),
nodeRRDMemCache: make(map[string]rrdMemCacheEntry),
lastClusterCheck: make(map[string]time.Time),
lastPhysicalDiskPoll: make(map[string]time.Time),
failureCounts: make(map[string]int),
lastOutcome: make(map[string]taskOutcome),
pollStatusMap: make(map[string]*pollStatus),
dlqInsightMap: make(map[string]*dlqInsight),
authFailures: make(map[string]int),
lastAuthAttempt: make(map[string]time.Time),
nodeLastOnline: make(map[string]time.Time),
}
}
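
// TestPollPVEInstanceUsesRRDMemUsedFallback covers a node whose status API
// reports all memory as used: the poller should fall back to the RRD memused
// series and record "rrd-memused" as the memory source.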
func TestPollPVEInstanceUsesRRDMemUsedFallback(t *testing.T) {
t.Setenv("PULSE_DATA_DIR", t.TempDir())
total := uint64(16 * 1024 * 1024 * 1024)
actualUsed := total / 3
client := &stubPVEClient{
nodes: []proxmox.Node{
{
Node: "node1",
Status: "online",
CPU: 0.15,
MaxCPU: 8,
Mem: total,
MaxMem: total,
Disk: 0,
MaxDisk: 0,
Uptime: 3600,
},
},
nodeStatus: &proxmox.NodeStatus{
Memory: &proxmox.MemoryStatus{
Total: total,
Used: total,
Free: 0,
},
},
rrdPoints: []proxmox.NodeRRDPoint{
{
MemTotal: floatPtr(float64(total)),
MemUsed: floatPtr(float64(actualUsed)),
},
},
}
mon := newTestPVEMonitor("test")
defer mon.alertManager.Stop()
defer mon.notificationMgr.Stop()
mon.pollPVEInstance(context.Background(), "test", client)
snapshot := mon.state.GetSnapshot()
if len(snapshot.Nodes) != 1 {
t.Fatalf("expected one node in state, got %d", len(snapshot.Nodes))
}
node := snapshot.Nodes[0]
expectedUsage := (float64(actualUsed) / float64(total)) * 100
if diff := math.Abs(node.Memory.Usage - expectedUsage); diff > 0.5 {
t.Fatalf("memory usage mismatch: got %.2f want %.2f (diff %.2f)", node.Memory.Usage, expectedUsage, diff)
}
if node.Memory.Used != int64(actualUsed) {
t.Fatalf("memory used mismatch: got %d want %d", node.Memory.Used, actualUsed)
}
snapKey := makeNodeSnapshotKey("test", "node1")
mon.diagMu.RLock()
snap, ok := mon.nodeSnapshots[snapKey]
mon.diagMu.RUnlock()
if !ok {
t.Fatal("expected node snapshot entry to be recorded")
}
if snap.MemorySource != "rrd-memused" {
t.Fatalf("expected memory source rrd-memused, got %q", snap.MemorySource)
}
if snap.Raw.ProxmoxMemorySource != "rrd-memused" {
t.Fatalf("expected proxmox memory source rrd-memused, got %q", snap.Raw.ProxmoxMemorySource)
}
if snap.Raw.RRDUsed != actualUsed {
t.Fatalf("expected snapshot RRD used %d, got %d", actualUsed, snap.Raw.RRDUsed)
}
}
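
// TestGetVMAgentMemAvailableRetriesKnownNonWindowsGuestSoonerAfterNegativeCache
// verifies that an expired negative cache entry for a known non-Windows guest
// (Ubuntu) triggers a fresh guest-agent meminfo call instead of being served
// from cache.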
func TestGetVMAgentMemAvailableRetriesKnownNonWindowsGuestSoonerAfterNegativeCache(t *testing.T) {
t.Parallel()
client := &stubPVEClient{vmMemAvail: 2 << 30}
monitor := &Monitor{
guestMetadataCache: map[string]guestMetadataCacheEntry{
guestMetadataCacheKey("pve1", "node1", 100): {
osName: "Ubuntu",
fetchedAt: time.Now(),
},
},
vmAgentMemCache: map[string]agentMemCacheEntry{
"pve1/node1/100": {
negative: true,
fetchedAt: time.Now().Add(-vmAgentMemNegativeKnownGuestTTL - time.Second),
},
},
}
available, err := monitor.getVMAgentMemAvailable(context.Background(), client, "pve1", "node1", 100)
if err != nil {
t.Fatalf("getVMAgentMemAvailable() error = %v", err)
}
if available != 2<<30 {
t.Fatalf("getVMAgentMemAvailable() available = %d, want %d", available, uint64(2<<30))
}
if client.vmMemCalls != 1 {
t.Fatalf("expected guest-agent meminfo retry, got %d calls", client.vmMemCalls)
}
}
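
// TestGetVMAgentMemAvailableKeepsLongNegativeCacheForWindowsGuest verifies that
// a negative cache entry for a Windows guest is still honored past the
// known-guest TTL, so no guest-agent meminfo call is made.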
func TestGetVMAgentMemAvailableKeepsLongNegativeCacheForWindowsGuest(t *testing.T) {
t.Parallel()
client := &stubPVEClient{vmMemAvail: 2 << 30}
monitor := &Monitor{
guestMetadataCache: map[string]guestMetadataCacheEntry{
guestMetadataCacheKey("pve1", "node1", 100): {
osName: "Microsoft Windows",
fetchedAt: time.Now(),
},
},
vmAgentMemCache: map[string]agentMemCacheEntry{
"pve1/node1/100": {
negative: true,
fetchedAt: time.Now().Add(-vmAgentMemNegativeKnownGuestTTL - time.Second),
},
},
}
available, err := monitor.getVMAgentMemAvailable(context.Background(), client, "pve1", "node1", 100)
if err == nil {
t.Fatal("expected cached negative result for Windows guest")
}
if available != 0 {
t.Fatalf("expected no memavailable result, got %d", available)
}
if client.vmMemCalls != 0 {
t.Fatalf("expected Windows guest negative cache to suppress retry, got %d calls", client.vmMemCalls)
}
}
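
// TestPollPVENodePrefersLinkedHostDiskOverRootFS verifies that a node linked to
// a host agent reports the agent's root filesystem summary instead of the
// near-empty rootfs figures returned by the Proxmox API.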
func TestPollPVENodePrefersLinkedHostDiskOverRootFS(t *testing.T) {
mon := newTestPVEMonitor("test")
defer mon.alertManager.Stop()
defer mon.notificationMgr.Stop()
mon.state.UpsertHost(models.Host{
ID: "host-1",
Hostname: "ebringa",
Status: "online",
Disks: []models.Disk{
{Mountpoint: "/", Type: "zfs", Total: 975, Used: 410, Free: 565, Usage: 42.0512820513},
},
})
mon.state.UpdateNodesForInstance("test", []models.Node{
{
ID: "test-ebringa",
Name: "ebringa",
Instance: "test",
LinkedHostAgentID: "host-1",
},
})
client := &stubPVEClient{
nodeStatus: &proxmox.NodeStatus{
RootFS: &proxmox.RootFS{
Total: 975,
Used: 3,
Free: 972,
},
},
}
modelNode, _, err := mon.pollPVENode(
context.Background(),
"test",
&mon.config.PVEInstances[0],
client,
proxmox.Node{
Node: "ebringa",
Status: "online",
MaxDisk: 975,
Disk: 3,
},
"healthy",
nil,
nil,
)
if err != nil {
t.Fatalf("pollPVENode() error = %v", err)
}
if modelNode.Disk.Total != 975 || modelNode.Disk.Used != 410 || modelNode.Disk.Free != 565 {
t.Fatalf("node disk = %+v, want linked host summary", modelNode.Disk)
}
if modelNode.Disk.Usage < 42 || modelNode.Disk.Usage > 42.1 {
t.Fatalf("node disk usage = %.2f, want linked host usage around 42.05", modelNode.Disk.Usage)
}
}
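
// TestResolveNodeDiskFallsBackToRootFSWithoutLinkedHost verifies that, without
// a linked host agent, resolveNodeDisk uses the rootfs values from node status.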
func TestResolveNodeDiskFallsBackToRootFSWithoutLinkedHost(t *testing.T) {
mon := newTestPVEMonitor("test")
defer mon.alertManager.Stop()
defer mon.notificationMgr.Stop()
disk, source := mon.resolveNodeDisk(
"test",
"test-node1",
"node1",
proxmox.Node{Node: "node1", Disk: 10, MaxDisk: 100},
&proxmox.NodeStatus{
RootFS: &proxmox.RootFS{
Total: 200,
Used: 50,
Free: 150,
},
},
)
if source != "rootfs" {
t.Fatalf("source = %q, want rootfs", source)
}
if disk.Total != 200 || disk.Used != 50 || disk.Free != 150 {
t.Fatalf("disk = %+v, want rootfs values", disk)
}
}
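
// TestPollPVEInstancePreservesRecentNodesWhenGetNodesReturnsEmpty verifies that
// a recently seen node is not marked offline while GetNodes briefly returns an
// empty list.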
func TestPollPVEInstancePreservesRecentNodesWhenGetNodesReturnsEmpty(t *testing.T) {
t.Setenv("PULSE_DATA_DIR", t.TempDir())
client := &stubPVEClient{
nodes: []proxmox.Node{
{
Node: "node1",
Status: "online",
CPU: 0.10,
MaxCPU: 8,
Mem: 4 * 1024 * 1024 * 1024,
MaxMem: 8 * 1024 * 1024 * 1024,
Uptime: 7200,
MaxDisk: 100 * 1024 * 1024 * 1024,
Disk: 40 * 1024 * 1024 * 1024,
},
},
nodeStatus: &proxmox.NodeStatus{
Memory: &proxmox.MemoryStatus{
Total: 8 * 1024 * 1024 * 1024,
Used: 4 * 1024 * 1024 * 1024,
Free: 4 * 1024 * 1024 * 1024,
},
},
}
mon := newTestPVEMonitor("test")
defer mon.alertManager.Stop()
defer mon.notificationMgr.Stop()
mon.pollPVEInstance(context.Background(), "test", client)
// Simulate transient API gap: node list temporarily empty.
client.nodes = nil
mon.pollPVEInstance(context.Background(), "test", client)
snapshot := mon.state.GetSnapshot()
if len(snapshot.Nodes) != 1 {
t.Fatalf("expected one node in state, got %d", len(snapshot.Nodes))
}
node := snapshot.Nodes[0]
if node.Status == "offline" {
t.Fatalf("expected recent node to remain non-offline during grace window, got %q", node.Status)
}
}
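
// TestPollPVEInstanceMarksStaleNodesOfflineWhenGetNodesReturnsEmpty verifies
// that a node last seen before the offline grace period is marked offline with
// connection health "error" once GetNodes returns an empty list.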
func TestPollPVEInstanceMarksStaleNodesOfflineWhenGetNodesReturnsEmpty(t *testing.T) {
t.Setenv("PULSE_DATA_DIR", t.TempDir())
client := &stubPVEClient{
nodes: []proxmox.Node{
{
Node: "node1",
Status: "online",
CPU: 0.10,
MaxCPU: 8,
Mem: 4 * 1024 * 1024 * 1024,
MaxMem: 8 * 1024 * 1024 * 1024,
Uptime: 7200,
MaxDisk: 100 * 1024 * 1024 * 1024,
Disk: 40 * 1024 * 1024 * 1024,
},
},
nodeStatus: &proxmox.NodeStatus{
Memory: &proxmox.MemoryStatus{
Total: 8 * 1024 * 1024 * 1024,
Used: 4 * 1024 * 1024 * 1024,
Free: 4 * 1024 * 1024 * 1024,
},
},
}
mon := newTestPVEMonitor("test")
defer mon.alertManager.Stop()
defer mon.notificationMgr.Stop()
mon.pollPVEInstance(context.Background(), "test", client)
first := mon.state.GetSnapshot()
if len(first.Nodes) != 1 {
t.Fatalf("expected one node after first poll, got %d", len(first.Nodes))
}
staleNode := first.Nodes[0]
staleNode.LastSeen = time.Now().Add(-nodeOfflineGracePeriod - 2*time.Second)
mon.state.UpdateNodesForInstance("test", []models.Node{staleNode})
client.nodes = nil
mon.pollPVEInstance(context.Background(), "test", client)
second := mon.state.GetSnapshot()
if len(second.Nodes) != 1 {
t.Fatalf("expected one node after fallback poll, got %d", len(second.Nodes))
}
node := second.Nodes[0]
if node.Status != "offline" {
t.Fatalf("expected stale node to be marked offline, got %q", node.Status)
}
if node.ConnectionHealth != "error" {
t.Fatalf("expected stale node connection health error, got %q", node.ConnectionHealth)
}
}
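
// TestPollPVEInstanceUsesRRDMemAvailableWhenCacheMetricsMissing verifies that
// when node status omits Available/Buffers/Cached, the poller derives used
// memory from the RRD memavailable series and records "rrd-memavailable".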
func TestPollPVEInstanceUsesRRDMemAvailableWhenCacheMetricsMissing(t *testing.T) {
t.Setenv("PULSE_DATA_DIR", t.TempDir())
total := uint64(16 * 1024 * 1024 * 1024) // 16 GB
reportedUsed := uint64(14 * 1024 * 1024 * 1024) // 14 GB (inflated, includes cache)
reportedFree := uint64(2 * 1024 * 1024 * 1024) // 2 GB
rrdAvailable := uint64(10 * 1024 * 1024 * 1024) // 10 GB (cache-aware)
client := &stubPVEClient{
nodes: []proxmox.Node{
{
Node: "node1",
Status: "online",
CPU: 0.15,
MaxCPU: 8,
Mem: reportedUsed,
MaxMem: total,
Disk: 0,
MaxDisk: 0,
Uptime: 3600,
},
},
nodeStatus: &proxmox.NodeStatus{
Memory: &proxmox.MemoryStatus{
Total: total,
Used: reportedUsed,
Free: reportedFree,
// Available, Buffers, Cached all zero — triggers missingCacheMetrics
},
},
rrdPoints: []proxmox.NodeRRDPoint{
{
MemTotal: floatPtr(float64(total)),
MemUsed: floatPtr(float64(reportedUsed)),
MemAvailable: floatPtr(float64(rrdAvailable)),
},
},
}
mon := newTestPVEMonitor("test")
defer mon.alertManager.Stop()
defer mon.notificationMgr.Stop()
mon.pollPVEInstance(context.Background(), "test", client)
snapshot := mon.state.GetSnapshot()
if len(snapshot.Nodes) != 1 {
t.Fatalf("expected one node in state, got %d", len(snapshot.Nodes))
}
node := snapshot.Nodes[0]
expectedUsed := total - rrdAvailable // 6 GB
if node.Memory.Used != int64(expectedUsed) {
t.Fatalf("memory used mismatch: got %d want %d (should exclude reclaimable cache)", node.Memory.Used, expectedUsed)
}
expectedUsage := (float64(expectedUsed) / float64(total)) * 100
if diff := math.Abs(node.Memory.Usage - expectedUsage); diff > 0.5 {
t.Fatalf("memory usage mismatch: got %.2f want %.2f (diff %.2f)", node.Memory.Usage, expectedUsage, diff)
}
snapKey := makeNodeSnapshotKey("test", "node1")
mon.diagMu.RLock()
snap, ok := mon.nodeSnapshots[snapKey]
mon.diagMu.RUnlock()
if !ok {
t.Fatal("expected node snapshot entry to be recorded")
}
if snap.MemorySource != "rrd-memavailable" {
t.Fatalf("expected memory source rrd-memavailable, got %q", snap.MemorySource)
}
if snap.Raw.RRDAvailable != rrdAvailable {
t.Fatalf("expected snapshot RRD available %d, got %d", rrdAvailable, snap.Raw.RRDAvailable)
}
}
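
// TestPollPVEInstanceNodeMemoryUnchangedWhenCacheFieldsPresent verifies that
// when Buffers and Cached are reported, used memory is derived from those
// fields and the RRD memavailable series is ignored.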
func TestPollPVEInstanceNodeMemoryUnchangedWhenCacheFieldsPresent(t *testing.T) {
t.Setenv("PULSE_DATA_DIR", t.TempDir())
total := uint64(16 * 1024 * 1024 * 1024) // 16 GB
reportedUsed := uint64(6 * 1024 * 1024 * 1024) // 6 GB
reportedFree := uint64(2 * 1024 * 1024 * 1024) // 2 GB
buffers := uint64(2 * 1024 * 1024 * 1024) // 2 GB
cached := uint64(6 * 1024 * 1024 * 1024) // 6 GB
// derived available = free + buffers + cached = 10 GB
rrdAvailable := uint64(9 * 1024 * 1024 * 1024) // 9 GB (slightly stale)
client := &stubPVEClient{
nodes: []proxmox.Node{
{
Node: "node1",
Status: "online",
CPU: 0.15,
MaxCPU: 8,
Mem: reportedUsed,
MaxMem: total,
Disk: 0,
MaxDisk: 0,
Uptime: 3600,
},
},
nodeStatus: &proxmox.NodeStatus{
Memory: &proxmox.MemoryStatus{
Total: total,
Used: reportedUsed,
Free: reportedFree,
Buffers: buffers,
Cached: cached,
},
},
rrdPoints: []proxmox.NodeRRDPoint{
{
MemTotal: floatPtr(float64(total)),
MemUsed: floatPtr(float64(reportedUsed)),
MemAvailable: floatPtr(float64(rrdAvailable)),
},
},
}
mon := newTestPVEMonitor("test")
defer mon.alertManager.Stop()
defer mon.notificationMgr.Stop()
mon.pollPVEInstance(context.Background(), "test", client)
snapshot := mon.state.GetSnapshot()
if len(snapshot.Nodes) != 1 {
t.Fatalf("expected one node in state, got %d", len(snapshot.Nodes))
}
node := snapshot.Nodes[0]
// With cache fields present, derived available = free + buffers + cached = 10 GB
// actualUsed = total - derivedAvailable = 16 - 10 = 6 GB
expectedUsed := reportedUsed
if node.Memory.Used != int64(expectedUsed) {
t.Fatalf("memory used mismatch: got %d want %d (should use derived available, not RRD)", node.Memory.Used, expectedUsed)
}
snapKey := makeNodeSnapshotKey("test", "node1")
mon.diagMu.RLock()
snap, ok := mon.nodeSnapshots[snapKey]
mon.diagMu.RUnlock()
if !ok {
t.Fatal("expected node snapshot entry to be recorded")
}
if snap.MemorySource == "rrd-memavailable" {
t.Fatalf("memory source should NOT be rrd-memavailable when cache fields are present, got %q", snap.MemorySource)
}
if snap.MemorySource != "derived-free-buffers-cached" {
t.Fatalf("expected memory source derived-free-buffers-cached, got %q", snap.MemorySource)
}
}
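
// TestPollPVEInstanceRRDFailFallsBackGracefully verifies that an RRD fetch
// failure combined with missing cache metrics still yields non-zero memory
// figures and does not record an RRD-based memory source.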
func TestPollPVEInstanceRRDFailFallsBackGracefully(t *testing.T) {
t.Setenv("PULSE_DATA_DIR", t.TempDir())
total := uint64(16 * 1024 * 1024 * 1024) // 16 GB
reportedUsed := uint64(14 * 1024 * 1024 * 1024) // 14 GB
reportedFree := uint64(2 * 1024 * 1024 * 1024) // 2 GB
client := &stubPVEClient{
nodes: []proxmox.Node{
{
Node: "node1",
Status: "online",
CPU: 0.15,
MaxCPU: 8,
Mem: reportedUsed,
MaxMem: total,
Disk: 0,
MaxDisk: 0,
Uptime: 3600,
},
},
nodeStatus: &proxmox.NodeStatus{
Memory: &proxmox.MemoryStatus{
Total: total,
Used: reportedUsed,
Free: reportedFree,
// No Available/Buffers/Cached — missingCacheMetrics is true
},
},
rrdErr: fmt.Errorf("connection refused"),
}
mon := newTestPVEMonitor("test")
defer mon.alertManager.Stop()
defer mon.notificationMgr.Stop()
mon.pollPVEInstance(context.Background(), "test", client)
snapshot := mon.state.GetSnapshot()
if len(snapshot.Nodes) != 1 {
t.Fatalf("expected one node in state, got %d", len(snapshot.Nodes))
}
node := snapshot.Nodes[0]
// With RRD unavailable and no cache fields, the best the poller can do is
// rely on the reported Used and Free values; memory will appear inflated,
// but it must not crash or report zero.
if node.Memory.Total != int64(total) {
t.Fatalf("total mismatch: got %d want %d", node.Memory.Total, total)
}
if node.Memory.Used <= 0 {
t.Fatal("expected non-zero memory used even when RRD fails")
}
snapKey := makeNodeSnapshotKey("test", "node1")
mon.diagMu.RLock()
snap, ok := mon.nodeSnapshots[snapKey]
mon.diagMu.RUnlock()
if !ok {
t.Fatal("expected node snapshot entry to be recorded")
}
if snap.MemorySource == "rrd-memavailable" || snap.MemorySource == "rrd-memused" {
t.Fatalf("should not use RRD source when RRD fetch failed, got %q", snap.MemorySource)
}
}
func TestPollPVEInstanceUsesRRDMemUsedWhenMemAvailableMissingAndFreeZero(t *testing.T) {
// When cache metrics are missing, Free=0, and RRD only has memused (no
// memavailable), the rrd-memused fallback should kick in. This is the
// same scenario as TestPollPVEInstanceUsesRRDMemUsedFallback but with
// the widened gate — verifying no regression.
t.Setenv("PULSE_DATA_DIR", t.TempDir())
total := uint64(16 * 1024 * 1024 * 1024) // 16 GB
rrdMemUsed := uint64(6 * 1024 * 1024 * 1024) // 6 GB
client := &stubPVEClient{
nodes: []proxmox.Node{
{
Node: "node1",
Status: "online",
CPU: 0.15,
MaxCPU: 8,
Mem: total,
MaxMem: total,
Disk: 0,
MaxDisk: 0,
Uptime: 3600,
},
},
nodeStatus: &proxmox.NodeStatus{
Memory: &proxmox.MemoryStatus{
Total: total,
Used: total, // All memory reported as used
Free: 0, // Free is zero
// No Available/Buffers/Cached
},
},
rrdPoints: []proxmox.NodeRRDPoint{
{
MemTotal: floatPtr(float64(total)),
MemUsed: floatPtr(float64(rrdMemUsed)),
// No MemAvailable
},
},
}
mon := newTestPVEMonitor("test")
defer mon.alertManager.Stop()
defer mon.notificationMgr.Stop()
mon.pollPVEInstance(context.Background(), "test", client)
snapshot := mon.state.GetSnapshot()
if len(snapshot.Nodes) != 1 {
t.Fatalf("expected one node in state, got %d", len(snapshot.Nodes))
}
node := snapshot.Nodes[0]
if node.Memory.Used != int64(rrdMemUsed) {
t.Fatalf("memory used mismatch: got %d want %d (should use RRD memused fallback)", node.Memory.Used, rrdMemUsed)
}
snapKey := makeNodeSnapshotKey("test", "node1")
mon.diagMu.RLock()
snap, ok := mon.nodeSnapshots[snapKey]
mon.diagMu.RUnlock()
if !ok {
t.Fatal("expected node snapshot entry to be recorded")
}
if snap.MemorySource != "rrd-memused" {
t.Fatalf("expected memory source rrd-memused, got %q", snap.MemorySource)
}
}