// Pulse/internal/monitoring/proxmox_vm_cluster_resource_test.go

package monitoring

import (
	"context"
	"testing"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
)
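
// The stub clients below embed stubPVEClient and override only the calls each
// scenario needs, so pollVMsAndContainersEfficient can be driven through
// specific cluster-resource, VM-status, and guest-agent responses.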
type slowGuestAgentClusterClient struct {
	stubPVEClient
	resources []proxmox.ClusterResource
	fsDelay   time.Duration
}

type emptyFSInfoClusterClient struct {
	stubPVEClient
	resources []proxmox.ClusterResource
}

type repeatedLowTrustMemoryClusterClient struct {
	stubPVEClient
	resources  []proxmox.ClusterResource
	vmStatuses map[int]*proxmox.VMStatus
}

type rotatingGuestAgentClusterClient struct {
	stubPVEClient
	resources []proxmox.ClusterResource
	fsDelay   time.Duration
}

type transientStatusFailureClusterClient struct {
	stubPVEClient
	resources []proxmox.ClusterResource
}

type healthyGuestLowTrustMemoryClusterClient struct {
	stubPVEClient
	resources []proxmox.ClusterResource
}

type windowsDriveClusterClient struct {
	stubPVEClient
	resources []proxmox.ClusterResource
}

func (c *slowGuestAgentClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *slowGuestAgentClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	return &proxmox.VMStatus{
		MaxMem: 8 * 1024,
		Mem:    4 * 1024,
		Agent:  proxmox.VMAgentField{Value: 1},
	}, nil
}

func (c *slowGuestAgentClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
	select {
	case <-time.After(c.fsDelay):
	case <-ctx.Done():
		return nil, ctx.Err()
	}
	return []proxmox.VMFileSystem{{
		Mountpoint: "/",
		Type:       "ext4",
		TotalBytes: 100 * 1024 * 1024 * 1024,
		UsedBytes:  40 * 1024 * 1024 * 1024,
		Disk:       "/dev/vda",
	}}, nil
}

func (c *emptyFSInfoClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *emptyFSInfoClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	return &proxmox.VMStatus{
		MaxMem: 8 * 1024,
		Mem:    4 * 1024,
		Agent:  proxmox.VMAgentField{Value: 1},
	}, nil
}

func (c *emptyFSInfoClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
	return []proxmox.VMFileSystem{}, nil
}

func (c *repeatedLowTrustMemoryClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *repeatedLowTrustMemoryClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	if status, ok := c.vmStatuses[vmid]; ok {
		return status, nil
	}
	return nil, nil
}

func (c *rotatingGuestAgentClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *rotatingGuestAgentClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	return &proxmox.VMStatus{
		MaxMem: 8 * 1024,
		Mem:    4 * 1024,
		Agent:  proxmox.VMAgentField{Value: 1},
	}, nil
}

func (c *rotatingGuestAgentClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
	select {
	case <-time.After(c.fsDelay):
	case <-ctx.Done():
		return nil, ctx.Err()
	}
	return []proxmox.VMFileSystem{{
		Mountpoint: "/",
		Type:       "ext4",
		TotalBytes: 100 * 1024 * 1024 * 1024,
		UsedBytes:  40 * 1024 * 1024 * 1024,
		Disk:       "/dev/vda",
	}}, nil
}

func (c *rotatingGuestAgentClusterClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error) {
	return nil, nil
}

func (c *rotatingGuestAgentClusterClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	return nil, nil
}

func (c *rotatingGuestAgentClusterClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
	return "", nil
}

func (c *transientStatusFailureClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *transientStatusFailureClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	return nil, context.DeadlineExceeded
}

func (c *transientStatusFailureClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
	return []proxmox.VMFileSystem{{
		Mountpoint: "/",
		Type:       "ext4",
		TotalBytes: 100 * 1024 * 1024 * 1024,
		UsedBytes:  40 * 1024 * 1024 * 1024,
		Disk:       "/dev/vda",
	}}, nil
}

func (c *transientStatusFailureClusterClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error) {
	return []proxmox.VMNetworkInterface{
		{
			Name:         "Ethernet0",
			HardwareAddr: "00:11:22:33:44:55",
			IPAddresses: []proxmox.VMIpAddress{
				{Address: "192.168.1.50", Prefix: 24},
			},
		},
	}, nil
}

func (c *transientStatusFailureClusterClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	return map[string]interface{}{
		"pretty-name": "Ubuntu 24.04",
		"version":     "24.04",
	}, nil
}

func (c *transientStatusFailureClusterClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
	return "8.2.0", nil
}

func (c *transientStatusFailureClusterClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
	return 5 * 1024, nil
}

func (c *healthyGuestLowTrustMemoryClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *healthyGuestLowTrustMemoryClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	const total = uint64(8 << 30)
	return &proxmox.VMStatus{
		Status: "running",
		Agent:  proxmox.VMAgentField{Value: 1},
		MaxMem: total,
		Mem:    total,
	}, nil
}

func (c *healthyGuestLowTrustMemoryClusterClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error) {
	return []proxmox.VMNetworkInterface{
		{
			Name:         "Ethernet0",
			HardwareAddr: "00:11:22:33:44:55",
			IPAddresses: []proxmox.VMIpAddress{
				{Address: "192.168.1.50", Prefix: 24},
			},
		},
	}, nil
}

func (c *healthyGuestLowTrustMemoryClusterClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	return map[string]interface{}{
		"name":           "Ubuntu",
		"version-id":     "24.04",
		"pretty-name":    "Ubuntu 24.04",
		"version":        "24.04",
		"kernel-release": "6.8.0",
	}, nil
}

func (c *healthyGuestLowTrustMemoryClusterClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
	return "8.2.0", nil
}

func (c *healthyGuestLowTrustMemoryClusterClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
	return 0, context.DeadlineExceeded
}

func (c *windowsDriveClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *windowsDriveClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	return &proxmox.VMStatus{
		Status: "running",
		MaxMem: 8 * 1024,
		Mem:    4 * 1024,
		Agent:  proxmox.VMAgentField{Value: 1},
	}, nil
}

func (c *windowsDriveClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
	return []proxmox.VMFileSystem{
		{
			Mountpoint: "C:",
			Type:       "NTFS",
			TotalBytes: 100 * 1024 * 1024 * 1024,
			UsedBytes:  57 * 1024 * 1024 * 1024,
			Disk:       "C:",
		},
		{
			Mountpoint: "System Reserved",
			Type:       "NTFS",
			TotalBytes: 500 * 1024 * 1024,
			UsedBytes:  150 * 1024 * 1024,
			Disk:       "system-reserved",
		},
	}, nil
}
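
// TestGuestAgentFSInfoBudgetHonorsConfiguredTimeouts checks that the derived
// FS-info budget covers at least two 15-second attempts when one retry is
// configured.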
func TestGuestAgentFSInfoBudgetHonorsConfiguredTimeouts(t *testing.T) {
	t.Parallel()
	m := &Monitor{
		guestAgentFSInfoTimeout: 15 * time.Second,
		guestAgentRetries:       1,
	}
	budget := m.guestAgentFSInfoBudget()
	if budget < 30*time.Second {
		t.Fatalf("guestAgentFSInfoBudget() = %s, want at least 30s", budget)
	}
}
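
// TestRotateIndexedClusterResources verifies that rotating by one offset
// reorders the VMIDs as expected without mutating the input slice.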
func TestRotateIndexedClusterResources(t *testing.T) {
	t.Parallel()
	original := []indexedClusterResource{
		{order: 0, resource: proxmox.ClusterResource{VMID: 100}},
		{order: 1, resource: proxmox.ClusterResource{VMID: 101}},
		{order: 2, resource: proxmox.ClusterResource{VMID: 102}},
	}
	rotated := rotateIndexedClusterResources(original, 1)
	if got := []int{rotated[0].resource.VMID, rotated[1].resource.VMID, rotated[2].resource.VMID}; got[0] != 101 || got[1] != 102 || got[2] != 100 {
		t.Fatalf("rotateIndexedClusterResources(..., 1) VMIDs = %v, want [101 102 100]", got)
	}
	if original[0].resource.VMID != 100 || original[1].resource.VMID != 101 || original[2].resource.VMID != 102 {
		t.Fatal("rotateIndexedClusterResources should not mutate the original slice")
	}
}
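
// TestPollVMsAndContainersEfficientCompletesDiskQueriesWithinPollBudget polls
// four VMs whose guest agent answers slowly and expects every VM to still end
// up with filesystem-backed disk data before the poll deadline expires.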
func TestPollVMsAndContainersEfficientCompletesDiskQueriesWithinPollBudget(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())
	client := &slowGuestAgentClusterClient{
		fsDelay: 60 * time.Millisecond,
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 101, Name: "vm101", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 102, Name: "vm102", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 103, Name: "vm103", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
		},
	}
	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentFSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentNetworkTimeout = 250 * time.Millisecond
	mon.guestAgentOSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentVersionTimeout = 250 * time.Millisecond
	mon.guestAgentRetries = 0
	mon.guestAgentWorkSlots = make(chan struct{}, 4)
	ctx, cancel := context.WithTimeout(context.Background(), 180*time.Millisecond)
	defer cancel()
	if ok := mon.pollVMsAndContainersEfficient(ctx, "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}
	state := mon.state.GetSnapshot()
	if len(state.VMs) != 4 {
		t.Fatalf("expected 4 VMs, got %d", len(state.VMs))
	}
	for _, vm := range state.VMs {
		if vm.Disk.Total <= 0 || vm.Disk.Usage <= 0 {
			t.Fatalf("expected guest-agent disk data for %s, got total=%d usage=%.2f", vm.Name, vm.Disk.Total, vm.Disk.Usage)
		}
	}
}
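
// TestPollVMsAndContainersEfficientRotatesGuestAgentPriorityAcrossPolls limits
// the agent work slots to one and uses short poll deadlines, expecting a
// different VM to obtain a real disk reading on each successive poll.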
func TestPollVMsAndContainersEfficientRotatesGuestAgentPriorityAcrossPolls(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())
	client := &rotatingGuestAgentClusterClient{
		fsDelay: 60 * time.Millisecond,
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 101, Name: "vm101", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 102, Name: "vm102", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
		},
	}
	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 1)
	mon.guestAgentFSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentNetworkTimeout = 250 * time.Millisecond
	mon.guestAgentOSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentVersionTimeout = 250 * time.Millisecond
	mon.guestAgentRetries = 0
	checkResolved := func(expectedVMID int) {
		state := mon.state.GetSnapshot()
		if len(state.VMs) != 3 {
			t.Fatalf("expected 3 VMs, got %d", len(state.VMs))
		}
		vmByID := make(map[int]models.VM, len(state.VMs))
		for _, vm := range state.VMs {
			vmByID[vm.VMID] = vm
		}
		if vmByID[expectedVMID].Disk.Usage <= 0 {
			t.Fatalf("expected VM %d to get a real disk reading, got usage=%.2f reason=%q", expectedVMID, vmByID[expectedVMID].Disk.Usage, vmByID[expectedVMID].DiskStatusReason)
		}
	}
	for _, expectedVMID := range []int{100, 101, 102} {
		ctx, cancel := context.WithTimeout(context.Background(), 75*time.Millisecond)
		if ok := mon.pollVMsAndContainersEfficient(ctx, "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
			cancel()
			t.Fatal("pollVMsAndContainersEfficient() returned false")
		}
		cancel()
		checkResolved(expectedVMID)
	}
}
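
// TestPollVMsAndContainersEfficientPreservesCachedGuestMetadataWhenStatusUnavailable
// seeds the guest metadata cache and fails GetVMStatus, expecting the cached
// IPs, interfaces, OS info, and agent version to survive the poll.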
func TestPollVMsAndContainersEfficientPreservesCachedGuestMetadataWhenStatusUnavailable(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())
	client := &transientStatusFailureClusterClient{
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
		},
	}
	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = map[string]guestMetadataCacheEntry{
		guestMetadataCacheKey("pve1", "node1", 100): {
			ipAddresses: []string{"192.168.1.50"},
			networkInterfaces: []models.GuestNetworkInterface{
				{Name: "Ethernet0", MAC: "00:11:22:33:44:55", Addresses: []string{"192.168.1.50"}},
			},
			osName:       "Windows",
			osVersion:    "Server 2022",
			agentVersion: "8.2.0",
			fetchedAt:    time.Now(),
		},
	}
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentFSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentNetworkTimeout = 250 * time.Millisecond
	mon.guestAgentOSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentVersionTimeout = 250 * time.Millisecond
	mon.guestAgentRetries = 0
	mon.guestAgentWorkSlots = make(chan struct{}, 1)
	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}
	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if len(vm.IPAddresses) != 1 || vm.IPAddresses[0] != "192.168.1.50" {
		t.Fatalf("expected cached IPs to be preserved, got %#v", vm.IPAddresses)
	}
	if len(vm.NetworkInterfaces) != 1 || vm.NetworkInterfaces[0].Name != "Ethernet0" {
		t.Fatalf("expected cached interfaces to be preserved, got %#v", vm.NetworkInterfaces)
	}
	if vm.OSName != "Windows" || vm.OSVersion != "Server 2022" {
		t.Fatalf("expected cached OS info to be preserved, got %q %q", vm.OSName, vm.OSVersion)
	}
	if vm.AgentVersion != "8.2.0" {
		t.Fatalf("expected cached agent version to be preserved, got %q", vm.AgentVersion)
	}
}
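
// TestPollVMsAndContainersEfficientContinuesGuestAgentQueriesAfterTransientStatusFailure
// fails GetVMStatus but keeps the guest agent reachable, expecting the poll to
// still refresh memory, disks, interfaces, and the agent version from the agent.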
func TestPollVMsAndContainersEfficientContinuesGuestAgentQueriesAfterTransientStatusFailure(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())
	client := &transientStatusFailureClusterClient{
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: 8 * 1024, Mem: 8 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
		},
	}
	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentFSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentNetworkTimeout = 250 * time.Millisecond
	mon.guestAgentOSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentVersionTimeout = 250 * time.Millisecond
	mon.guestAgentRetries = 0
	mon.guestAgentWorkSlots = make(chan struct{}, 1)
	mon.state.UpdateVMsForInstance("pve1", []models.VM{
		{
			ID:           makeGuestID("pve1", "node1", 100),
			VMID:         100,
			Name:         "vm100",
			Node:         "node1",
			Instance:     "pve1",
			Type:         "qemu",
			Status:       "running",
			AgentVersion: "8.1.0",
			NetworkInterfaces: []models.GuestNetworkInterface{
				{Name: "Ethernet0", MAC: "00:11:22:33:44:55", Addresses: []string{"192.168.1.50"}},
			},
			LastSeen: time.Now(),
		},
	})
	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}
	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if vm.MemorySource != "guest-agent-meminfo" {
		t.Fatalf("expected guest-agent memory fallback after status failure, got %q", vm.MemorySource)
	}
	if vm.Disk.Usage != 40 {
		t.Fatalf("expected live guest-agent disk usage after status failure, got %.2f", vm.Disk.Usage)
	}
	if vm.DiskStatusReason != "" {
		t.Fatalf("expected empty disk status reason, got %q", vm.DiskStatusReason)
	}
	if len(vm.Disks) != 1 || vm.Disks[0].Device != "/dev/vda" {
		t.Fatalf("expected live guest-agent disk inventory, got %#v", vm.Disks)
	}
	if len(vm.NetworkInterfaces) != 1 || vm.NetworkInterfaces[0].Name != "Ethernet0" {
		t.Fatalf("expected refreshed network interfaces, got %#v", vm.NetworkInterfaces)
	}
	if vm.AgentVersion != "8.2.0" {
		t.Fatalf("expected refreshed agent version, got %q", vm.AgentVersion)
	}
}
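
// TestPollVMsAndContainersEfficientKeepsPreviousMemoryForHealthyGuestAfterRepeatedLowTrustFullUsage
// reports 100% memory usage from a guest whose agent meminfo call fails, and
// expects the previously trusted reading to be carried forward across two polls.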
func TestPollVMsAndContainersEfficientKeepsPreviousMemoryForHealthyGuestAfterRepeatedLowTrustFullUsage(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())
	const total = uint64(8 << 30)
	const trustedUsed = uint64(3 << 30)
	client := &healthyGuestLowTrustMemoryClusterClient{
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: total, Mem: total, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
		},
	}
	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentFSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentNetworkTimeout = 250 * time.Millisecond
	mon.guestAgentOSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentVersionTimeout = 250 * time.Millisecond
	mon.guestAgentRetries = 0
	mon.guestAgentWorkSlots = make(chan struct{}, 1)
	mon.state.UpdateVMsForInstance("pve1", []models.VM{
		{
			ID:           makeGuestID("pve1", "node1", 100),
			VMID:         100,
			Name:         "vm100",
			Node:         "node1",
			Instance:     "pve1",
			Type:         "qemu",
			Status:       "running",
			MemorySource: "guest-agent-meminfo",
			Memory: models.Memory{
				Total: int64(total),
				Used:  int64(trustedUsed),
				Free:  int64(total - trustedUsed),
				Usage: safePercentage(float64(trustedUsed), float64(total)),
			},
			LastSeen: time.Now(),
		},
	})
	for i := 0; i < 2; i++ {
		if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
			t.Fatalf("pollVMsAndContainersEfficient() returned false on pass %d", i+1)
		}
	}
	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if vm.MemorySource != "previous-snapshot" {
		t.Fatalf("memory source = %q, want previous-snapshot", vm.MemorySource)
	}
	if vm.Memory.Used != int64(trustedUsed) {
		t.Fatalf("memory used = %d, want preserved %d", vm.Memory.Used, trustedUsed)
	}
	if len(vm.NetworkInterfaces) != 1 || vm.NetworkInterfaces[0].Name != "Ethernet0" {
		t.Fatalf("expected guest agent network metadata to confirm healthy guest, got %#v", vm.NetworkInterfaces)
	}
}
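
// TestPollVMsAndContainersEfficientCarriesForwardPreviousIndividualDisks polls a
// guest whose agent returns no filesystems and expects the previously recorded
// per-disk inventory and aggregate usage to be carried forward with the
// "prev-no-filesystems" reason.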
func TestPollVMsAndContainersEfficientCarriesForwardPreviousIndividualDisks(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())
	client := &emptyFSInfoClusterClient{
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
		},
	}
	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 2)
	prevVM := models.VM{
		ID:       makeGuestID("pve1", "node1", 100),
		VMID:     100,
		Name:     "vm100",
		Node:     "node1",
		Instance: "pve1",
		Type:     "qemu",
		Status:   "running",
		Disk: models.Disk{
			Total: 100 * 1024 * 1024 * 1024,
			Used:  40 * 1024 * 1024 * 1024,
			Free:  60 * 1024 * 1024 * 1024,
			Usage: 40,
		},
		Disks: []models.Disk{
			{
				Total:      100 * 1024 * 1024 * 1024,
				Used:       40 * 1024 * 1024 * 1024,
				Free:       60 * 1024 * 1024 * 1024,
				Usage:      40,
				Mountpoint: "/",
				Type:       "ext4",
				Device:     "/dev/vda",
			},
		},
	}
	mon.state.UpdateVMs([]models.VM{prevVM})
	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}
	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if len(vm.Disks) != 1 {
		t.Fatalf("expected previous individual disks to be preserved, got %#v", vm.Disks)
	}
	if vm.Disks[0].Mountpoint != "/" || vm.Disks[0].Device != "/dev/vda" {
		t.Fatalf("unexpected carried-forward disk data: %#v", vm.Disks[0])
	}
	if vm.Disk.Usage != 40 {
		t.Fatalf("expected aggregate disk usage to be carried forward, got %.2f", vm.Disk.Usage)
	}
	if vm.DiskStatusReason != "prev-no-filesystems" {
		t.Fatalf("expected carried-forward disk status reason, got %q", vm.DiskStatusReason)
	}
}
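
// TestPollVMsAndContainersEfficientMarksDiskUnknownUntilGuestAgentFilesystemDataArrives
// expects a guest with no filesystem data and no prior snapshot to report an
// unknown disk usage (-1) with the "no-filesystems" reason and no disk metric samples.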
func TestPollVMsAndContainersEfficientMarksDiskUnknownUntilGuestAgentFilesystemDataArrives(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())
	client := &emptyFSInfoClusterClient{
		resources: []proxmox.ClusterResource{
			{
				Type:    "qemu",
				Node:    "node1",
				VMID:    100,
				Name:    "vm100",
				Status:  "running",
				MaxMem:  8 * 1024,
				Mem:     4 * 1024,
				Disk:    57 * 1024 * 1024 * 1024,
				MaxDisk: 100 * 1024 * 1024 * 1024,
				MaxCPU:  4,
			},
		},
	}
	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 2)
	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}
	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if vm.Disk.Usage != -1 {
		t.Fatalf("expected aggregate disk usage to remain unknown, got %.2f", vm.Disk.Usage)
	}
	if vm.DiskStatusReason != "no-filesystems" {
		t.Fatalf("expected disk status reason %q, got %q", "no-filesystems", vm.DiskStatusReason)
	}
	guestMetrics := mon.metricsHistory.GetGuestMetrics(vm.ID, "disk", time.Hour)
	if len(guestMetrics) != 0 {
		t.Fatalf("expected no disk metric samples while disk usage is unknown, got %#v", guestMetrics)
	}
}
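
// TestPollVMsAndContainersEfficientUsesLinkedHostAgentDiskFallback registers a
// host agent linked to the VM and expects its disk inventory to fill in when
// the guest agent returns no filesystems.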
func TestPollVMsAndContainersEfficientUsesLinkedHostAgentDiskFallback(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())
	client := &emptyFSInfoClusterClient{
		resources: []proxmox.ClusterResource{
			{
				Type:    "qemu",
				Node:    "node1",
				VMID:    100,
				Name:    "vm100",
				Status:  "running",
				MaxMem:  8 * 1024,
				Mem:     4 * 1024,
				MaxDisk: 100 * 1024 * 1024 * 1024,
				MaxCPU:  4,
			},
		},
	}
	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 2)
	mon.state.UpsertHost(models.Host{
		ID:         "host-100",
		Hostname:   "vm100-agent",
		Status:     "online",
		LinkedVMID: makeGuestID("pve1", "node1", 100),
		Disks: []models.Disk{
			{
				Total:      100 * 1024 * 1024 * 1024,
				Used:       57 * 1024 * 1024 * 1024,
				Free:       43 * 1024 * 1024 * 1024,
				Usage:      57,
				Mountpoint: "C:",
				Type:       "NTFS",
				Device:     "C:",
			},
		},
	})
	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}
	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if vm.Disk.Usage != 57 {
		t.Fatalf("expected linked host-agent disk usage, got %.2f", vm.Disk.Usage)
	}
	if vm.DiskStatusReason != "" {
		t.Fatalf("expected cleared disk status reason, got %q", vm.DiskStatusReason)
	}
	if len(vm.Disks) != 1 || vm.Disks[0].Mountpoint != "C:" {
		t.Fatalf("expected linked host-agent disk inventory, got %#v", vm.Disks)
	}
}
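
// TestPollVMsAndContainersEfficientKeepsNormalizedWindowsDriveRoots feeds a
// Windows guest reporting "C:" plus a System Reserved volume, expecting only the
// usable drive root to be kept and its usage to be populated.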
func TestPollVMsAndContainersEfficientKeepsNormalizedWindowsDriveRoots(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())
	client := &windowsDriveClusterClient{
		resources: []proxmox.ClusterResource{
			{
				Type:    "qemu",
				Node:    "node1",
				VMID:    100,
				Name:    "win100",
				Status:  "running",
				MaxMem:  8 * 1024,
				Mem:     4 * 1024,
				Disk:    0,
				MaxDisk: 100 * 1024 * 1024 * 1024,
				MaxCPU:  4,
			},
		},
	}
	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 2)
	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}
	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if vm.DiskStatusReason != "" {
		t.Fatalf("expected empty disk status reason, got %q", vm.DiskStatusReason)
	}
	if len(vm.Disks) != 1 {
		t.Fatalf("expected 1 usable Windows disk, got %#v", vm.Disks)
	}
	if vm.Disks[0].Mountpoint != "C:" {
		t.Fatalf("expected normalized Windows drive root to be preserved, got %q", vm.Disks[0].Mountpoint)
	}
	if vm.Disk.Usage <= 0 {
		t.Fatalf("expected Windows guest disk usage to be populated, got %.2f", vm.Disk.Usage)
	}
}
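
// TestPollVMsAndContainersEfficientStabilizesSuspiciousRepeatedLowTrustMemory
// reports suspicious 100% memory usage from several guests and expects their
// previously trusted readings to be preserved (while balloon data stays
// current), leaving a guest with a normal reading untouched.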
func TestPollVMsAndContainersEfficientStabilizesSuspiciousRepeatedLowTrustMemory(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())
	const total = uint64(8 << 30)
	client := &repeatedLowTrustMemoryClusterClient{
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: total, Mem: total, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 101, Name: "vm101", Status: "running", MaxMem: total, Mem: total, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 102, Name: "vm102", Status: "running", MaxMem: total, Mem: total, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 103, Name: "vm103", Status: "running", MaxMem: total, Mem: 2 << 30, MaxCPU: 4},
		},
		vmStatuses: map[int]*proxmox.VMStatus{
			100: {Status: "running", MaxMem: total, Mem: total, Balloon: 2 << 30, Agent: proxmox.VMAgentField{Value: 1}},
			101: {Status: "running", MaxMem: total, Mem: total, Agent: proxmox.VMAgentField{Value: 1}},
			102: {Status: "running", MaxMem: total, Mem: total, Agent: proxmox.VMAgentField{Value: 1}},
			103: {Status: "running", MaxMem: total, Mem: 2 << 30, Agent: proxmox.VMAgentField{Value: 0}},
		},
	}
	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 4)
	now := time.Now()
	mon.state.UpdateVMs([]models.VM{
		{
			ID:           makeGuestID("pve1", "node1", 100),
			VMID:         100,
			Name:         "vm100",
			Node:         "node1",
			Instance:     "pve1",
			Type:         "qemu",
			Status:       "running",
			MemorySource: "rrd-memavailable",
			Memory:       models.Memory{Total: int64(total), Used: 3 << 30, Free: 5 << 30, Usage: safePercentage(float64(3<<30), float64(total))},
			LastSeen:     now,
		},
		{
			ID:           makeGuestID("pve1", "node1", 101),
			VMID:         101,
			Name:         "vm101",
			Node:         "node1",
			Instance:     "pve1",
			Type:         "qemu",
			Status:       "running",
			MemorySource: "guest-agent-meminfo",
			Memory:       models.Memory{Total: int64(total), Used: 4 << 30, Free: 4 << 30, Usage: 50},
			LastSeen:     now,
		},
		{
			ID:           makeGuestID("pve1", "node1", 102),
			VMID:         102,
			Name:         "vm102",
			Node:         "node1",
			Instance:     "pve1",
			Type:         "qemu",
			Status:       "running",
			MemorySource: "previous-snapshot",
			Memory:       models.Memory{Total: int64(total), Used: 5 << 30, Free: 3 << 30, Usage: 62.5},
			LastSeen:     now,
		},
	})
	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}
	state := mon.state.GetSnapshot()
	if len(state.VMs) != 4 {
		t.Fatalf("expected 4 VMs, got %d", len(state.VMs))
	}
	vmByID := make(map[int]models.VM, len(state.VMs))
	for _, vm := range state.VMs {
		vmByID[vm.VMID] = vm
	}
	if vmByID[100].MemorySource != "previous-snapshot" || vmByID[100].Memory.Used != 3<<30 {
		t.Fatalf("vm100 memory = %#v source=%q, want preserved previous reading", vmByID[100].Memory, vmByID[100].MemorySource)
	}
	if vmByID[100].Memory.Balloon != 2<<30 {
		t.Fatalf("vm100 balloon = %d, want current balloon", vmByID[100].Memory.Balloon)
	}
	if vmByID[101].MemorySource != "previous-snapshot" || vmByID[101].Memory.Used != 4<<30 {
		t.Fatalf("vm101 memory = %#v source=%q, want preserved previous reading", vmByID[101].Memory, vmByID[101].MemorySource)
	}
	if vmByID[102].MemorySource != "previous-snapshot" || vmByID[102].Memory.Used != 5<<30 {
		t.Fatalf("vm102 memory = %#v source=%q, want chained preserved reading", vmByID[102].Memory, vmByID[102].MemorySource)
	}
	if vmByID[103].MemorySource != "status-mem" || vmByID[103].Memory.Used != 2<<30 {
		t.Fatalf("vm103 memory = %#v source=%q, want unaffected current reading", vmByID[103].Memory, vmByID[103].MemorySource)
	}
	snapshotKey := makeGuestSnapshotKey("pve1", "qemu", "node1", 100)
	mon.diagMu.RLock()
	snapshot, ok := mon.guestSnapshots[snapshotKey]
	stabilizedSnapshot := mon.guestSnapshots[makeGuestSnapshotKey("pve1", "qemu", "node1", 102)]
	mon.diagMu.RUnlock()
	if !ok {
		t.Fatal("expected guest snapshot for vm100")
	}
	if snapshot.MemorySource != "previous-snapshot" || snapshot.Memory.Used != 3<<30 {
		t.Fatalf("snapshot memory = %#v source=%q, want preserved previous reading", snapshot.Memory, snapshot.MemorySource)
	}
	if !snapshotHasNote(stabilizedSnapshot.Notes, "preserved-previous-memory-after-repeated-low-trust-pattern") &&
		!snapshotHasNote(stabilizedSnapshot.Notes, "preserved-previous-memory-for-healthy-guest-low-trust-full-usage") {
		t.Fatalf("vm102 snapshot notes = %#v, want preservation note", stabilizedSnapshot.Notes)
	}
}
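
// TestPollVMsAndContainersEfficientTreatsAvailableGuestAgentAsHealthyForMemoryCarryForward
// expects a guest whose agent is enabled but which reports 100% usage to keep
// its previously carried-forward memory reading.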
func TestPollVMsAndContainersEfficientTreatsAvailableGuestAgentAsHealthyForMemoryCarryForward(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())
	const total = uint64(8 << 30)
	client := &repeatedLowTrustMemoryClusterClient{
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: total, Mem: total, MaxCPU: 4},
		},
		vmStatuses: map[int]*proxmox.VMStatus{
			100: {Status: "running", MaxMem: total, Mem: total, Agent: proxmox.VMAgentField{Value: 1}},
		},
	}
	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 4)
	now := time.Now()
	mon.state.UpdateVMs([]models.VM{
		{
			ID:           makeGuestID("pve1", "node1", 100),
			VMID:         100,
			Name:         "vm100",
			Node:         "node1",
			Instance:     "pve1",
			Type:         "qemu",
			Status:       "running",
			MemorySource: "previous-snapshot",
			Memory:       models.Memory{Total: int64(total), Used: 3 << 30, Free: 5 << 30, Usage: safePercentage(float64(3<<30), float64(total))},
			LastSeen:     now,
		},
	})
	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}
	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if vm.MemorySource != "previous-snapshot" || vm.Memory.Used != 3<<30 {
		t.Fatalf("vm memory = %#v source=%q, want preserved previous reading", vm.Memory, vm.MemorySource)
	}
}