Pulse/internal/monitoring/monitor_storage_test.go
rcourtman 0ae2806f18 fix(memory): add guest agent /proc/meminfo fallback to avoid VM memory inflation (#1270)
Proxmox status.Mem includes page cache as "used" memory, inflating
reported VM usage. The existing fallbacks (balloon meminfo, RRD, linked
host agent) were frequently unavailable, causing most VMs to fall
through to the inflated status-mem source.
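
As a rough illustration of the difference (a sketch only, not Pulse's actual code; the helper name is hypothetical), deriving "used" from MemAvailable excludes reclaimable page cache, whereas status.Mem counts it:

	// usedFromMemAvailable is an illustrative helper: "used" derived from
	// MemAvailable does not count reclaimable page cache, unlike status.Mem.
	func usedFromMemAvailable(totalBytes, memAvailableBytes uint64) uint64 {
		if memAvailableBytes > totalBytes { // guard against a bogus agent reading
			return 0
		}
		return totalBytes - memAvailableBytes
	}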

Adds a new last-resort fallback that reads /proc/meminfo via the QEMU
guest agent file-read endpoint to get accurate MemAvailable. Results
are cached (60s positive, 5min negative backoff for unsupported VMs).
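
A minimal sketch of that flow, assuming the agent returns /proc/meminfo as plain text; the identifiers below are illustrative rather than the names used in this codebase, the TTLs mirror the 60s/5min values above, and the kB-to-bytes conversion is sketched after the fix list below:

	import (
		"strconv"
		"strings"
		"time"
	)

	// parseMemAvailableKB extracts the MemAvailable value (in kB, as the kernel
	// reports it) from /proc/meminfo text, e.g. "MemAvailable:    8012345 kB".
	func parseMemAvailableKB(meminfo string) (uint64, bool) {
		for _, line := range strings.Split(meminfo, "\n") {
			if !strings.HasPrefix(line, "MemAvailable:") {
				continue
			}
			fields := strings.Fields(line)
			if len(fields) < 2 {
				return 0, false
			}
			kb, err := strconv.ParseUint(fields[1], 10, 64)
			if err != nil {
				return 0, false
			}
			return kb, true
		}
		return 0, false
	}

	// agentMemCacheEntry models the caching described above: reuse a good
	// reading for 60s, back off for 5 minutes when the read fails or the
	// guest agent does not support file reads.
	type agentMemCacheEntry struct {
		memAvailableKB uint64
		ok             bool // false => negative entry
		expires        time.Time
	}

	func newAgentMemCacheEntry(memAvailableKB uint64, err error, now time.Time) agentMemCacheEntry {
		if err != nil {
			return agentMemCacheEntry{ok: false, expires: now.Add(5 * time.Minute)}
		}
		return agentMemCacheEntry{memAvailableKB: memAvailableKB, ok: true, expires: now.Add(60 * time.Second)}
	}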

Also fixes: RRD memavailable fallback missing from traditional polling
path, cache key collisions in multi-PVE setups, FreeMem underflow
guard inconsistency, and integer overflow in kB-to-bytes conversion.
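
Sketches of those smaller fixes, again with illustrative names rather than the real identifiers: an instance-scoped cache key, a saturating kB-to-bytes conversion, and an underflow-guarded free-memory calculation.

	import (
		"fmt"
		"math"
	)

	// Cache key scoped by PVE instance so two clusters that reuse the same
	// node name and VMID do not collide on a shared cache entry.
	func agentMemCacheKey(instance, node string, vmid int) string {
		return fmt.Sprintf("%s/%s/%d", instance, node, vmid)
	}

	// kB-to-bytes conversion that saturates instead of overflowing uint64.
	func kbToBytes(kb uint64) uint64 {
		if kb > math.MaxUint64/1024 {
			return math.MaxUint64
		}
		return kb * 1024
	}

	// Free memory derived with an underflow guard so used > total never wraps
	// around to a huge unsigned value.
	func freeMem(total, used uint64) uint64 {
		if used > total {
			return 0
		}
		return total - used
	}
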
2026-02-20 13:31:52 +00:00


package monitoring

import (
	"context"
	"fmt"
	"math"
	"testing"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
)
// fakeStorageClient provides minimal PVE responses needed by the optimized storage poller.
type fakeStorageClient struct {
	allStorage    []proxmox.Storage
	storageByNode map[string][]proxmox.Storage
}

func (f *fakeStorageClient) GetNodes(ctx context.Context) ([]proxmox.Node, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetNodeStatus(ctx context.Context, node string) (*proxmox.NodeStatus, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetNodeRRDData(ctx context.Context, node string, timeframe string, cf string, ds []string) ([]proxmox.NodeRRDPoint, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetLXCRRDData(ctx context.Context, node string, vmid int, timeframe string, cf string, ds []string) ([]proxmox.GuestRRDPoint, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetVMRRDData(ctx context.Context, node string, vmid int, timeframe string, cf string, ds []string) ([]proxmox.GuestRRDPoint, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetVMs(ctx context.Context, node string) ([]proxmox.VM, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetContainers(ctx context.Context, node string) ([]proxmox.Container, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetStorage(ctx context.Context, node string) ([]proxmox.Storage, error) {
	if storages, ok := f.storageByNode[node]; ok {
		return storages, nil
	}
	return nil, fmt.Errorf("unexpected node: %s", node)
}

func (f *fakeStorageClient) GetAllStorage(ctx context.Context) ([]proxmox.Storage, error) {
	return f.allStorage, nil
}

func (f *fakeStorageClient) GetBackupTasks(ctx context.Context) ([]proxmox.Task, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetReplicationStatus(ctx context.Context) ([]proxmox.ReplicationJob, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetStorageContent(ctx context.Context, node, storage string) ([]proxmox.StorageContent, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetVMSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetContainerSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetContainerStatus(ctx context.Context, node string, vmid int) (*proxmox.Container, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetContainerConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetContainerInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.ContainerInterface, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return nil, nil
}

func (f *fakeStorageClient) IsClusterMember(ctx context.Context) (bool, error) {
	return false, nil
}

func (f *fakeStorageClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
	return "", nil
}

func (f *fakeStorageClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
	return 0, fmt.Errorf("not implemented")
}

func (f *fakeStorageClient) GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetZFSPoolsWithDetails(ctx context.Context, node string) ([]proxmox.ZFSPoolInfo, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetDisks(ctx context.Context, node string) ([]proxmox.Disk, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetCephStatus(ctx context.Context) (*proxmox.CephStatus, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetCephDF(ctx context.Context) (*proxmox.CephDF, error) {
	return nil, nil
}

func (f *fakeStorageClient) GetNodePendingUpdates(ctx context.Context, node string) ([]proxmox.AptPackage, error) {
	return nil, nil
}
func TestPollStorageWithNodesOptimizedRecordsMetricsAndAlerts(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	monitor := &Monitor{
		state:          &models.State{},
		metricsHistory: NewMetricsHistory(16, time.Hour),
		alertManager:   alerts.NewManager(),
	}
	t.Cleanup(func() {
		monitor.alertManager.Stop()
	})

	// Ensure storage alerts trigger immediately for the test.
	cfg := monitor.alertManager.GetConfig()
	cfg.MinimumDelta = 0
	if cfg.TimeThresholds == nil {
		cfg.TimeThresholds = make(map[string]int)
	}
	cfg.TimeThresholds["storage"] = 0
	monitor.alertManager.UpdateConfig(cfg)

	storage := proxmox.Storage{
		Storage:   "local",
		Type:      "dir",
		Content:   "images",
		Active:    1,
		Enabled:   1,
		Shared:    0,
		Total:     1000,
		Used:      900,
		Available: 100,
	}
	client := &fakeStorageClient{
		allStorage: []proxmox.Storage{storage},
		storageByNode: map[string][]proxmox.Storage{
			"node1": {storage},
		},
	}
	nodes := []proxmox.Node{
		{Node: "node1", Status: "online"},
	}

	monitor.pollStorageWithNodes(context.Background(), "inst1", client, nodes)

	metrics := monitor.metricsHistory.GetAllStorageMetrics("inst1-node1-local", time.Minute)
	if len(metrics["usage"]) != 1 {
		t.Fatalf("expected one usage metric entry, got %d", len(metrics["usage"]))
	}
	if len(metrics["used"]) != 1 {
		t.Fatalf("expected one used metric entry, got %d", len(metrics["used"]))
	}
	if len(metrics["total"]) != 1 {
		t.Fatalf("expected one total metric entry, got %d", len(metrics["total"]))
	}
	if len(metrics["avail"]) != 1 {
		t.Fatalf("expected one avail metric entry, got %d", len(metrics["avail"]))
	}
	if diff := math.Abs(metrics["usage"][0].Value - 90); diff > 0.001 {
		t.Fatalf("expected usage metric 90, diff %.4f", diff)
	}
	if diff := math.Abs(metrics["used"][0].Value - 900); diff > 0.001 {
		t.Fatalf("expected used metric 900, diff %.4f", diff)
	}
	if diff := math.Abs(metrics["total"][0].Value - 1000); diff > 0.001 {
		t.Fatalf("expected total metric 1000, diff %.4f", diff)
	}
	if diff := math.Abs(metrics["avail"][0].Value - 100); diff > 0.001 {
		t.Fatalf("expected avail metric 100, diff %.4f", diff)
	}

	alerts := monitor.alertManager.GetActiveAlerts()
	found := false
	for _, alert := range alerts {
		if alert.ID == "inst1-node1-local-usage" {
			found = true
			break
		}
	}
	if !found {
		t.Fatalf("expected storage usage alert to be active")
	}
}