Linux VM page cache (#1270): QEMU VM memory now falls back to Proxmox RRD's memavailable metric (which excludes reclaimable page cache) when the qemu-guest-agent doesn't provide MemInfo.Available. Previously the fallback was detailedStatus.Mem (total - MemFree), inflating usage to 80%+ on VMs with normal Linux page cache. Mirrors the existing LXC rrd-memavailable path.

FreeBSD ZFS ARC (#1264, #1051): The host agent now reads kstat.zfs.misc.arcstats.size via SysctlRaw on FreeBSD and subtracts the ARC size from reported memory usage. ZFS ARC is reclaimable under memory pressure (like Linux SReclaimable), but gopsutil counts it as wired/non-reclaimable, causing false 90%+ memory alerts on TrueNAS and FreeBSD hosts. Build-tagged so it compiles cleanly on all platforms.

Fixes #1270
Fixes #1264
Fixes #1051

(cherry picked from commit 94502f83ff9ffc6da28aaadc946a2f7d8b4e9bac)
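A minimal sketch of the FreeBSD-only ARC read described above, not the repo's actual implementation: it assumes golang.org/x/sys/unix for SysctlRaw, Go 1.21+ for binary.NativeEndian, and a hypothetical adjustUsedMemory helper; package name and helper names are illustrative only.

//go:build freebsd

package meminfo // hypothetical package, for illustration

import (
	"encoding/binary"

	"golang.org/x/sys/unix"
)

// arcSizeBytes reads the current ZFS ARC size from the kernel, the same
// value reported by `sysctl kstat.zfs.misc.arcstats.size`.
func arcSizeBytes() (uint64, error) {
	raw, err := unix.SysctlRaw("kstat.zfs.misc.arcstats.size")
	if err != nil {
		return 0, err // e.g. ZFS not loaded
	}
	if len(raw) < 8 {
		return 0, nil
	}
	return binary.NativeEndian.Uint64(raw[:8]), nil
}

// adjustUsedMemory (hypothetical helper) subtracts the reclaimable ARC from
// the "used" figure gopsutil reports, leaving the value unchanged on error.
func adjustUsedMemory(used uint64) uint64 {
	arc, err := arcSizeBytes()
	if err != nil || arc > used {
		return used
	}
	return used - arc
}
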
package monitoring

import (
	"context"
	"math"
	"testing"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
	"github.com/rcourtman/pulse-go-rewrite/internal/config"
	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/internal/notifications"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
)

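// stubPVEClient is a minimal PVEClientInterface stub: only the node list,
// node status, and node RRD data are configurable; every other method
// returns zero values.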
type stubPVEClient struct {
	nodes      []proxmox.Node
	nodeStatus *proxmox.NodeStatus
	rrdPoints  []proxmox.NodeRRDPoint
}

var _ PVEClientInterface = (*stubPVEClient)(nil)

func (s *stubPVEClient) GetNodes(ctx context.Context) ([]proxmox.Node, error) {
	return s.nodes, nil
}

func (s *stubPVEClient) GetNodeStatus(ctx context.Context, node string) (*proxmox.NodeStatus, error) {
	return s.nodeStatus, nil
}

func (s *stubPVEClient) GetNodeRRDData(ctx context.Context, node, timeframe, cf string, ds []string) ([]proxmox.NodeRRDPoint, error) {
	return s.rrdPoints, nil
}

func (s *stubPVEClient) GetLXCRRDData(ctx context.Context, node string, vmid int, timeframe, cf string, ds []string) ([]proxmox.GuestRRDPoint, error) {
	return nil, nil
}

func (s *stubPVEClient) GetVMRRDData(ctx context.Context, node string, vmid int, timeframe, cf string, ds []string) ([]proxmox.GuestRRDPoint, error) {
	return nil, nil
}

func (s *stubPVEClient) GetVMs(ctx context.Context, node string) ([]proxmox.VM, error) {
	return nil, nil
}

func (s *stubPVEClient) GetContainers(ctx context.Context, node string) ([]proxmox.Container, error) {
	return nil, nil
}

func (s *stubPVEClient) GetStorage(ctx context.Context, node string) ([]proxmox.Storage, error) {
	return nil, nil
}

func (s *stubPVEClient) GetAllStorage(ctx context.Context) ([]proxmox.Storage, error) {
	return nil, nil
}

func (s *stubPVEClient) GetBackupTasks(ctx context.Context) ([]proxmox.Task, error) {
	return nil, nil
}

func (s *stubPVEClient) GetReplicationStatus(ctx context.Context) ([]proxmox.ReplicationJob, error) {
	return nil, nil
}

func (s *stubPVEClient) GetStorageContent(ctx context.Context, node, storage string) ([]proxmox.StorageContent, error) {
	return nil, nil
}

func (s *stubPVEClient) GetVMSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error) {
	return nil, nil
}

func (s *stubPVEClient) GetContainerSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error) {
	return nil, nil
}

func (s *stubPVEClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	return nil, nil
}

func (s *stubPVEClient) GetContainerStatus(ctx context.Context, node string, vmid int) (*proxmox.Container, error) {
	return nil, nil
}

func (s *stubPVEClient) GetContainerConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	return nil, nil
}

func (s *stubPVEClient) GetContainerInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.ContainerInterface, error) {
	return nil, nil
}

func (s *stubPVEClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return nil, nil
}

func (s *stubPVEClient) IsClusterMember(ctx context.Context) (bool, error) {
	return false, nil
}

func (s *stubPVEClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
	return nil, nil
}

func (s *stubPVEClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error) {
	return nil, nil
}

func (s *stubPVEClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	return map[string]interface{}{}, nil
}

func (s *stubPVEClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
	return "", nil
}

func (s *stubPVEClient) GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error) {
	return nil, nil
}

func (s *stubPVEClient) GetZFSPoolsWithDetails(ctx context.Context, node string) ([]proxmox.ZFSPoolInfo, error) {
	return nil, nil
}

func (s *stubPVEClient) GetDisks(ctx context.Context, node string) ([]proxmox.Disk, error) {
	return nil, nil
}

func (s *stubPVEClient) GetCephStatus(ctx context.Context) (*proxmox.CephStatus, error) {
	return nil, nil
}

func (s *stubPVEClient) GetCephDF(ctx context.Context) (*proxmox.CephDF, error) {
	return nil, nil
}

func (s *stubPVEClient) GetNodePendingUpdates(ctx context.Context, node string) ([]proxmox.AptPackage, error) {
	return nil, nil
}

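// floatPtr returns a pointer to v, for populating the optional *float64
// fields on proxmox.NodeRRDPoint.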
func floatPtr(v float64) *float64 { return &v }

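// newTestPVEMonitor builds a Monitor wired with a single PVE instance, an
// in-memory state, fresh alert/notification managers, and the bookkeeping
// maps that pollPVEInstance touches, so the tests below can poll a stub
// client directly.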
func newTestPVEMonitor(instanceName string) *Monitor {
	return &Monitor{
		config: &config.Config{
			PVEInstances: []config.PVEInstance{{
				Name: instanceName,
				Host: "https://pve",
			}},
		},
		state:                models.NewState(),
		alertManager:         alerts.NewManager(),
		notificationMgr:      notifications.NewNotificationManager(""),
		metricsHistory:       NewMetricsHistory(32, time.Hour),
		nodeSnapshots:        make(map[string]NodeMemorySnapshot),
		guestSnapshots:       make(map[string]GuestMemorySnapshot),
		nodeRRDMemCache:      make(map[string]rrdMemCacheEntry),
		lastClusterCheck:     make(map[string]time.Time),
		lastPhysicalDiskPoll: make(map[string]time.Time),
		failureCounts:        make(map[string]int),
		lastOutcome:          make(map[string]taskOutcome),
		pollStatusMap:        make(map[string]*pollStatus),
		dlqInsightMap:        make(map[string]*dlqInsight),
		authFailures:         make(map[string]int),
		lastAuthAttempt:      make(map[string]time.Time),
		nodeLastOnline:       make(map[string]time.Time),
	}
}

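// TestPollPVEInstanceUsesRRDMemUsedFallback exercises the node memory
// fallback: the stubbed node status reports memory as fully used, while the
// RRD data carries a lower memused value. The poller should use the RRD
// figure and record "rrd-memused" as the memory source in the node snapshot
// diagnostics.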
func TestPollPVEInstanceUsesRRDMemUsedFallback(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	total := uint64(16 * 1024 * 1024 * 1024)
	actualUsed := total / 3

	client := &stubPVEClient{
		nodes: []proxmox.Node{
			{
				Node:    "node1",
				Status:  "online",
				CPU:     0.15,
				MaxCPU:  8,
				Mem:     total,
				MaxMem:  total,
				Disk:    0,
				MaxDisk: 0,
				Uptime:  3600,
			},
		},
		nodeStatus: &proxmox.NodeStatus{
			Memory: &proxmox.MemoryStatus{
				Total: total,
				Used:  total,
				Free:  0,
			},
		},
		rrdPoints: []proxmox.NodeRRDPoint{
			{
				MemTotal: floatPtr(float64(total)),
				MemUsed:  floatPtr(float64(actualUsed)),
			},
		},
	}

	mon := newTestPVEMonitor("test")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()

	mon.pollPVEInstance(context.Background(), "test", client)

	snapshot := mon.state.GetSnapshot()
	if len(snapshot.Nodes) != 1 {
		t.Fatalf("expected one node in state, got %d", len(snapshot.Nodes))
	}

	node := snapshot.Nodes[0]
	expectedUsage := (float64(actualUsed) / float64(total)) * 100
	if diff := math.Abs(node.Memory.Usage - expectedUsage); diff > 0.5 {
		t.Fatalf("memory usage mismatch: got %.2f want %.2f (diff %.2f)", node.Memory.Usage, expectedUsage, diff)
	}
	if node.Memory.Used != int64(actualUsed) {
		t.Fatalf("memory used mismatch: got %d want %d", node.Memory.Used, actualUsed)
	}

	snapKey := makeNodeSnapshotKey("test", "node1")
	mon.diagMu.RLock()
	snap, ok := mon.nodeSnapshots[snapKey]
	mon.diagMu.RUnlock()
	if !ok {
		t.Fatal("expected node snapshot entry to be recorded")
	}
	if snap.MemorySource != "rrd-memused" {
		t.Fatalf("expected memory source rrd-memused, got %q", snap.MemorySource)
	}
	if snap.Raw.ProxmoxMemorySource != "rrd-memused" {
		t.Fatalf("expected proxmox memory source rrd-memused, got %q", snap.Raw.ProxmoxMemorySource)
	}
	if snap.Raw.RRDUsed != actualUsed {
		t.Fatalf("expected snapshot RRD used %d, got %d", actualUsed, snap.Raw.RRDUsed)
	}
}

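// TestPollPVEInstancePreservesRecentNodesWhenGetNodesReturnsEmpty verifies
// that a node seen on the previous poll is not marked offline when a
// subsequent poll returns an empty node list (a transient API gap) while the
// node is still within the grace window.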
func TestPollPVEInstancePreservesRecentNodesWhenGetNodesReturnsEmpty(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	client := &stubPVEClient{
		nodes: []proxmox.Node{
			{
				Node:    "node1",
				Status:  "online",
				CPU:     0.10,
				MaxCPU:  8,
				Mem:     4 * 1024 * 1024 * 1024,
				MaxMem:  8 * 1024 * 1024 * 1024,
				Uptime:  7200,
				MaxDisk: 100 * 1024 * 1024 * 1024,
				Disk:    40 * 1024 * 1024 * 1024,
			},
		},
		nodeStatus: &proxmox.NodeStatus{
			Memory: &proxmox.MemoryStatus{
				Total: 8 * 1024 * 1024 * 1024,
				Used:  4 * 1024 * 1024 * 1024,
				Free:  4 * 1024 * 1024 * 1024,
			},
		},
	}

	mon := newTestPVEMonitor("test")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()

	mon.pollPVEInstance(context.Background(), "test", client)

	// Simulate transient API gap: node list temporarily empty.
	client.nodes = nil
	mon.pollPVEInstance(context.Background(), "test", client)

	snapshot := mon.state.GetSnapshot()
	if len(snapshot.Nodes) != 1 {
		t.Fatalf("expected one node in state, got %d", len(snapshot.Nodes))
	}

	node := snapshot.Nodes[0]
	if node.Status == "offline" {
		t.Fatalf("expected recent node to remain non-offline during grace window, got %q", node.Status)
	}
}

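// TestPollPVEInstanceMarksStaleNodesOfflineWhenGetNodesReturnsEmpty is the
// counterpart: once a node's LastSeen is older than nodeOfflineGracePeriod,
// an empty node list should mark it offline and flag its connection health
// as "error".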
func TestPollPVEInstanceMarksStaleNodesOfflineWhenGetNodesReturnsEmpty(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	client := &stubPVEClient{
		nodes: []proxmox.Node{
			{
				Node:    "node1",
				Status:  "online",
				CPU:     0.10,
				MaxCPU:  8,
				Mem:     4 * 1024 * 1024 * 1024,
				MaxMem:  8 * 1024 * 1024 * 1024,
				Uptime:  7200,
				MaxDisk: 100 * 1024 * 1024 * 1024,
				Disk:    40 * 1024 * 1024 * 1024,
			},
		},
		nodeStatus: &proxmox.NodeStatus{
			Memory: &proxmox.MemoryStatus{
				Total: 8 * 1024 * 1024 * 1024,
				Used:  4 * 1024 * 1024 * 1024,
				Free:  4 * 1024 * 1024 * 1024,
			},
		},
	}

	mon := newTestPVEMonitor("test")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()

	mon.pollPVEInstance(context.Background(), "test", client)

	first := mon.state.GetSnapshot()
	if len(first.Nodes) != 1 {
		t.Fatalf("expected one node after first poll, got %d", len(first.Nodes))
	}

	staleNode := first.Nodes[0]
	staleNode.LastSeen = time.Now().Add(-nodeOfflineGracePeriod - 2*time.Second)
	mon.state.UpdateNodesForInstance("test", []models.Node{staleNode})

	client.nodes = nil
	mon.pollPVEInstance(context.Background(), "test", client)

	second := mon.state.GetSnapshot()
	if len(second.Nodes) != 1 {
		t.Fatalf("expected one node after fallback poll, got %d", len(second.Nodes))
	}

	node := second.Nodes[0]
	if node.Status != "offline" {
		t.Fatalf("expected stale node to be marked offline, got %q", node.Status)
	}
	if node.ConnectionHealth != "error" {
		t.Fatalf("expected stale node connection health error, got %q", node.ConnectionHealth)
	}
}