package monitoring

import (
	"context"
	"testing"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
)

// The cluster-client stubs below embed stubPVEClient and override only the
// calls each scenario needs: slow or empty guest-agent fsinfo, transient
// status failures, low-trust memory readings, and Windows drive layouts.

// slowGuestAgentClusterClient delays guest-agent fsinfo responses by fsDelay.
type slowGuestAgentClusterClient struct {
	stubPVEClient
	resources []proxmox.ClusterResource
	fsDelay   time.Duration
}

// emptyFSInfoClusterClient reports a responsive agent but no filesystems.
type emptyFSInfoClusterClient struct {
	stubPVEClient
	resources []proxmox.ClusterResource
}

// repeatedLowTrustMemoryClusterClient serves canned per-VMID status responses.
type repeatedLowTrustMemoryClusterClient struct {
	stubPVEClient
	resources  []proxmox.ClusterResource
	vmStatuses map[int]*proxmox.VMStatus
}

// rotatingGuestAgentClusterClient delays fsinfo responses so only some guests
// resolve within a single poll.
type rotatingGuestAgentClusterClient struct {
	stubPVEClient
	resources []proxmox.ClusterResource
	fsDelay   time.Duration
}

// transientStatusFailureClusterClient fails GetVMStatus while the other
// guest-agent calls keep succeeding.
type transientStatusFailureClusterClient struct {
	stubPVEClient
	resources []proxmox.ClusterResource
}

// healthyGuestLowTrustMemoryClusterClient reports full memory usage and a
// failing meminfo fallback for an otherwise healthy guest.
type healthyGuestLowTrustMemoryClusterClient struct {
	stubPVEClient
	resources []proxmox.ClusterResource
}

// windowsDriveClusterClient reports NTFS filesystems, including a
// "System Reserved" partition.
type windowsDriveClusterClient struct {
	stubPVEClient
	resources []proxmox.ClusterResource
}

func (c *slowGuestAgentClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *slowGuestAgentClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	return &proxmox.VMStatus{
		MaxMem: 8 * 1024,
		Mem:    4 * 1024,
		Agent:  proxmox.VMAgentField{Value: 1},
	}, nil
}

func (c *slowGuestAgentClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
	// Simulate a slow guest agent while still honouring context cancellation.
	select {
	case <-time.After(c.fsDelay):
	case <-ctx.Done():
		return nil, ctx.Err()
	}
	return []proxmox.VMFileSystem{{
		Mountpoint: "/",
		Type:       "ext4",
		TotalBytes: 100 * 1024 * 1024 * 1024,
		UsedBytes:  40 * 1024 * 1024 * 1024,
		Disk:       "/dev/vda",
	}}, nil
}

func (c *emptyFSInfoClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *emptyFSInfoClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	return &proxmox.VMStatus{
		MaxMem: 8 * 1024,
		Mem:    4 * 1024,
		Agent:  proxmox.VMAgentField{Value: 1},
	}, nil
}

func (c *emptyFSInfoClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
	return []proxmox.VMFileSystem{}, nil
}

func (c *repeatedLowTrustMemoryClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *repeatedLowTrustMemoryClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	if status, ok := c.vmStatuses[vmid]; ok {
		return status, nil
	}
	return nil, nil
}

func (c *rotatingGuestAgentClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *rotatingGuestAgentClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	return &proxmox.VMStatus{
		MaxMem: 8 * 1024,
		Mem:    4 * 1024,
		Agent:  proxmox.VMAgentField{Value: 1},
	}, nil
}

func (c *rotatingGuestAgentClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
	// Simulate a slow guest agent while still honouring context cancellation.
	select {
	case <-time.After(c.fsDelay):
	case <-ctx.Done():
		return nil, ctx.Err()
	}
	return []proxmox.VMFileSystem{{
		Mountpoint: "/",
		Type:       "ext4",
		TotalBytes: 100 * 1024 * 1024 * 1024,
		UsedBytes:  40 * 1024 * 1024 * 1024,
		Disk:       "/dev/vda",
	}}, nil
}

func (c *rotatingGuestAgentClusterClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error) {
	return nil, nil
}

func (c *rotatingGuestAgentClusterClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	return nil, nil
}

func (c *rotatingGuestAgentClusterClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
	return "", nil
}

func (c *transientStatusFailureClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *transientStatusFailureClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	// The status endpoint always fails; the other guest-agent calls still succeed.
	return nil, context.DeadlineExceeded
}

func (c *transientStatusFailureClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
	return []proxmox.VMFileSystem{{
		Mountpoint: "/",
		Type:       "ext4",
		TotalBytes: 100 * 1024 * 1024 * 1024,
		UsedBytes:  40 * 1024 * 1024 * 1024,
		Disk:       "/dev/vda",
	}}, nil
}

func (c *transientStatusFailureClusterClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error) {
	return []proxmox.VMNetworkInterface{
		{
			Name:         "Ethernet0",
			HardwareAddr: "00:11:22:33:44:55",
			IPAddresses: []proxmox.VMIpAddress{
				{Address: "192.168.1.50", Prefix: 24},
			},
		},
	}, nil
}

func (c *transientStatusFailureClusterClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	return map[string]interface{}{
		"pretty-name": "Ubuntu 24.04",
		"version":     "24.04",
	}, nil
}

func (c *transientStatusFailureClusterClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
	return "8.2.0", nil
}

func (c *transientStatusFailureClusterClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
	return 5 * 1024, nil
}

func (c *healthyGuestLowTrustMemoryClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *healthyGuestLowTrustMemoryClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	const total = uint64(8 << 30)
	return &proxmox.VMStatus{
		Status: "running",
		Agent:  proxmox.VMAgentField{Value: 1},
		MaxMem: total,
		Mem:    total,
	}, nil
}

func (c *healthyGuestLowTrustMemoryClusterClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error) {
	return []proxmox.VMNetworkInterface{
		{
			Name:         "Ethernet0",
			HardwareAddr: "00:11:22:33:44:55",
			IPAddresses: []proxmox.VMIpAddress{
				{Address: "192.168.1.50", Prefix: 24},
			},
		},
	}, nil
}

func (c *healthyGuestLowTrustMemoryClusterClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	return map[string]interface{}{
		"name":           "Ubuntu",
		"version-id":     "24.04",
		"pretty-name":    "Ubuntu 24.04",
		"version":        "24.04",
		"kernel-release": "6.8.0",
	}, nil
}

func (c *healthyGuestLowTrustMemoryClusterClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
	return "8.2.0", nil
}

func (c *healthyGuestLowTrustMemoryClusterClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
	// The meminfo fallback is unavailable for this guest.
	return 0, context.DeadlineExceeded
}

func (c *windowsDriveClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return c.resources, nil
}

func (c *windowsDriveClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	return &proxmox.VMStatus{
		Status: "running",
		MaxMem: 8 * 1024,
		Mem:    4 * 1024,
		Agent:  proxmox.VMAgentField{Value: 1},
	}, nil
}

func (c *windowsDriveClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
	return []proxmox.VMFileSystem{
		{
			Mountpoint: "C:",
			Type:       "NTFS",
			TotalBytes: 100 * 1024 * 1024 * 1024,
			UsedBytes:  57 * 1024 * 1024 * 1024,
			Disk:       "C:",
		},
		{
			Mountpoint: "System Reserved",
			Type:       "NTFS",
			TotalBytes: 500 * 1024 * 1024,
			UsedBytes:  150 * 1024 * 1024,
			Disk:       "system-reserved",
		},
	}, nil
}

// The configured fsinfo budget must cover the per-call timeout across retries.
func TestGuestAgentFSInfoBudgetHonorsConfiguredTimeouts(t *testing.T) {
	t.Parallel()

	m := &Monitor{
		guestAgentFSInfoTimeout: 15 * time.Second,
		guestAgentRetries:       1,
	}

	budget := m.guestAgentFSInfoBudget()
	if budget < 30*time.Second {
		t.Fatalf("guestAgentFSInfoBudget() = %s, want at least 30s", budget)
	}
}

// Rotation must reorder the returned slice without mutating its input.
func TestRotateIndexedClusterResources(t *testing.T) {
	t.Parallel()

	original := []indexedClusterResource{
		{order: 0, resource: proxmox.ClusterResource{VMID: 100}},
		{order: 1, resource: proxmox.ClusterResource{VMID: 101}},
		{order: 2, resource: proxmox.ClusterResource{VMID: 102}},
	}

	rotated := rotateIndexedClusterResources(original, 1)
	if got := []int{rotated[0].resource.VMID, rotated[1].resource.VMID, rotated[2].resource.VMID}; got[0] != 101 || got[1] != 102 || got[2] != 100 {
		t.Fatalf("rotateIndexedClusterResources(..., 1) VMIDs = %v, want [101 102 100]", got)
	}
	if original[0].resource.VMID != 100 || original[1].resource.VMID != 101 || original[2].resource.VMID != 102 {
		t.Fatal("rotateIndexedClusterResources should not mutate the original slice")
	}
}

// Every running VM should get guest-agent disk data even under a tight poll deadline.
func TestPollVMsAndContainersEfficientCompletesDiskQueriesWithinPollBudget(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	client := &slowGuestAgentClusterClient{
		fsDelay: 60 * time.Millisecond,
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 101, Name: "vm101", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 102, Name: "vm102", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 103, Name: "vm103", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
		},
	}

	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentFSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentNetworkTimeout = 250 * time.Millisecond
	mon.guestAgentOSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentVersionTimeout = 250 * time.Millisecond
	mon.guestAgentRetries = 0
	mon.guestAgentWorkSlots = make(chan struct{}, 4)

	ctx, cancel := context.WithTimeout(context.Background(), 180*time.Millisecond)
	defer cancel()

	if ok := mon.pollVMsAndContainersEfficient(ctx, "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}

	state := mon.state.GetSnapshot()
	if len(state.VMs) != 4 {
		t.Fatalf("expected 4 VMs, got %d", len(state.VMs))
	}
	for _, vm := range state.VMs {
		if vm.Disk.Total <= 0 || vm.Disk.Usage <= 0 {
			t.Fatalf("expected guest-agent disk data for %s, got total=%d usage=%.2f", vm.Name, vm.Disk.Total, vm.Disk.Usage)
		}
	}
}

// With a single work slot, a different VM should win a real disk reading on each poll.
func TestPollVMsAndContainersEfficientRotatesGuestAgentPriorityAcrossPolls(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	client := &rotatingGuestAgentClusterClient{
		fsDelay: 60 * time.Millisecond,
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 101, Name: "vm101", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 102, Name: "vm102", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
		},
	}

	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 1)
	mon.guestAgentFSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentNetworkTimeout = 250 * time.Millisecond
	mon.guestAgentOSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentVersionTimeout = 250 * time.Millisecond
	mon.guestAgentRetries = 0

	checkResolved := func(expectedVMID int) {
		state := mon.state.GetSnapshot()
		if len(state.VMs) != 3 {
			t.Fatalf("expected 3 VMs, got %d", len(state.VMs))
		}
		vmByID := make(map[int]models.VM, len(state.VMs))
		for _, vm := range state.VMs {
			vmByID[vm.VMID] = vm
		}
		if vmByID[expectedVMID].Disk.Usage <= 0 {
			t.Fatalf("expected VM %d to get a real disk reading, got usage=%.2f reason=%q", expectedVMID, vmByID[expectedVMID].Disk.Usage, vmByID[expectedVMID].DiskStatusReason)
		}
	}

	for _, expectedVMID := range []int{100, 101, 102} {
		ctx, cancel := context.WithTimeout(context.Background(), 75*time.Millisecond)
		if ok := mon.pollVMsAndContainersEfficient(ctx, "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
			cancel()
			t.Fatal("pollVMsAndContainersEfficient() returned false")
		}
		cancel()
		checkResolved(expectedVMID)
	}
}

// Cached IPs, interfaces, OS info, and agent version survive a failed status call.
func TestPollVMsAndContainersEfficientPreservesCachedGuestMetadataWhenStatusUnavailable(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	client := &transientStatusFailureClusterClient{
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
		},
	}

	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = map[string]guestMetadataCacheEntry{
		guestMetadataCacheKey("pve1", "node1", 100): {
			ipAddresses: []string{"192.168.1.50"},
			networkInterfaces: []models.GuestNetworkInterface{
				{Name: "Ethernet0", MAC: "00:11:22:33:44:55", Addresses: []string{"192.168.1.50"}},
			},
			osName:       "Windows",
			osVersion:    "Server 2022",
			agentVersion: "8.2.0",
			fetchedAt:    time.Now(),
		},
	}
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentFSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentNetworkTimeout = 250 * time.Millisecond
	mon.guestAgentOSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentVersionTimeout = 250 * time.Millisecond
	mon.guestAgentRetries = 0
	mon.guestAgentWorkSlots = make(chan struct{}, 1)

	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}

	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if len(vm.IPAddresses) != 1 || vm.IPAddresses[0] != "192.168.1.50" {
		t.Fatalf("expected cached IPs to be preserved, got %#v", vm.IPAddresses)
	}
	if len(vm.NetworkInterfaces) != 1 || vm.NetworkInterfaces[0].Name != "Ethernet0" {
		t.Fatalf("expected cached interfaces to be preserved, got %#v", vm.NetworkInterfaces)
	}
	if vm.OSName != "Windows" || vm.OSVersion != "Server 2022" {
		t.Fatalf("expected cached OS info to be preserved, got %q %q", vm.OSName, vm.OSVersion)
	}
	if vm.AgentVersion != "8.2.0" {
		t.Fatalf("expected cached agent version to be preserved, got %q", vm.AgentVersion)
	}
}

// Guest-agent disk, network, and memory queries still run after a transient status failure.
func TestPollVMsAndContainersEfficientContinuesGuestAgentQueriesAfterTransientStatusFailure(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	client := &transientStatusFailureClusterClient{
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: 8 * 1024, Mem: 8 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
		},
	}

	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentFSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentNetworkTimeout = 250 * time.Millisecond
	mon.guestAgentOSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentVersionTimeout = 250 * time.Millisecond
	mon.guestAgentRetries = 0
	mon.guestAgentWorkSlots = make(chan struct{}, 1)

	mon.state.UpdateVMsForInstance("pve1", []models.VM{
		{
			ID:           makeGuestID("pve1", "node1", 100),
			VMID:         100,
			Name:         "vm100",
			Node:         "node1",
			Instance:     "pve1",
			Type:         "qemu",
			Status:       "running",
			AgentVersion: "8.1.0",
			NetworkInterfaces: []models.GuestNetworkInterface{
				{Name: "Ethernet0", MAC: "00:11:22:33:44:55", Addresses: []string{"192.168.1.50"}},
			},
			LastSeen: time.Now(),
		},
	})

	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}

	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if vm.MemorySource != "guest-agent-meminfo" {
		t.Fatalf("expected guest-agent memory fallback after status failure, got %q", vm.MemorySource)
	}
	if vm.Disk.Usage != 40 {
		t.Fatalf("expected live guest-agent disk usage after status failure, got %.2f", vm.Disk.Usage)
	}
	if vm.DiskStatusReason != "" {
		t.Fatalf("expected empty disk status reason, got %q", vm.DiskStatusReason)
	}
	if len(vm.Disks) != 1 || vm.Disks[0].Device != "/dev/vda" {
		t.Fatalf("expected live guest-agent disk inventory, got %#v", vm.Disks)
	}
	if len(vm.NetworkInterfaces) != 1 || vm.NetworkInterfaces[0].Name != "Ethernet0" {
		t.Fatalf("expected refreshed network interfaces, got %#v", vm.NetworkInterfaces)
	}
	if vm.AgentVersion != "8.2.0" {
		t.Fatalf("expected refreshed agent version, got %q", vm.AgentVersion)
	}
}

// A healthy guest that repeatedly reports implausible full memory usage keeps its previous trusted reading.
func TestPollVMsAndContainersEfficientKeepsPreviousMemoryForHealthyGuestAfterRepeatedLowTrustFullUsage(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	const total = uint64(8 << 30)
	const trustedUsed = uint64(3 << 30)

	client := &healthyGuestLowTrustMemoryClusterClient{
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: total, Mem: total, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
		},
	}

	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentFSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentNetworkTimeout = 250 * time.Millisecond
	mon.guestAgentOSInfoTimeout = 250 * time.Millisecond
	mon.guestAgentVersionTimeout = 250 * time.Millisecond
	mon.guestAgentRetries = 0
	mon.guestAgentWorkSlots = make(chan struct{}, 1)

	mon.state.UpdateVMsForInstance("pve1", []models.VM{
		{
			ID:           makeGuestID("pve1", "node1", 100),
			VMID:         100,
			Name:         "vm100",
			Node:         "node1",
			Instance:     "pve1",
			Type:         "qemu",
			Status:       "running",
			MemorySource: "guest-agent-meminfo",
			Memory: models.Memory{
				Total: int64(total),
				Used:  int64(trustedUsed),
				Free:  int64(total - trustedUsed),
				Usage: safePercentage(float64(trustedUsed), float64(total)),
			},
			LastSeen: time.Now(),
		},
	})

	for i := 0; i < 2; i++ {
		if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
			t.Fatalf("pollVMsAndContainersEfficient() returned false on pass %d", i+1)
		}
	}

	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if vm.MemorySource != "previous-snapshot" {
		t.Fatalf("memory source = %q, want previous-snapshot", vm.MemorySource)
	}
	if vm.Memory.Used != int64(trustedUsed) {
		t.Fatalf("memory used = %d, want preserved %d", vm.Memory.Used, trustedUsed)
	}
	if len(vm.NetworkInterfaces) != 1 || vm.NetworkInterfaces[0].Name != "Ethernet0" {
		t.Fatalf("expected guest agent network metadata to confirm healthy guest, got %#v", vm.NetworkInterfaces)
	}
}

// Previous per-filesystem disk data is carried forward when fsinfo comes back empty.
func TestPollVMsAndContainersEfficientCarriesForwardPreviousIndividualDisks(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	client := &emptyFSInfoClusterClient{
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: 8 * 1024, Mem: 4 * 1024, MaxDisk: 100 * 1024 * 1024 * 1024, MaxCPU: 4},
		},
	}

	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 2)

	prevVM := models.VM{
		ID:       makeGuestID("pve1", "node1", 100),
		VMID:     100,
		Name:     "vm100",
		Node:     "node1",
		Instance: "pve1",
		Type:     "qemu",
		Status:   "running",
		Disk: models.Disk{
			Total: 100 * 1024 * 1024 * 1024,
			Used:  40 * 1024 * 1024 * 1024,
			Free:  60 * 1024 * 1024 * 1024,
			Usage: 40,
		},
		Disks: []models.Disk{
			{
				Total:      100 * 1024 * 1024 * 1024,
				Used:       40 * 1024 * 1024 * 1024,
				Free:       60 * 1024 * 1024 * 1024,
				Usage:      40,
				Mountpoint: "/",
				Type:       "ext4",
				Device:     "/dev/vda",
			},
		},
	}
	mon.state.UpdateVMs([]models.VM{prevVM})

	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}

	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if len(vm.Disks) != 1 {
		t.Fatalf("expected previous individual disks to be preserved, got %#v", vm.Disks)
	}
	if vm.Disks[0].Mountpoint != "/" || vm.Disks[0].Device != "/dev/vda" {
		t.Fatalf("unexpected carried-forward disk data: %#v", vm.Disks[0])
	}
	if vm.Disk.Usage != 40 {
		t.Fatalf("expected aggregate disk usage to be carried forward, got %.2f", vm.Disk.Usage)
	}
	if vm.DiskStatusReason != "prev-no-filesystems" {
		t.Fatalf("expected carried-forward disk status reason, got %q", vm.DiskStatusReason)
	}
}

// Disk usage stays unknown (and unrecorded) until guest-agent filesystem data arrives.
func TestPollVMsAndContainersEfficientMarksDiskUnknownUntilGuestAgentFilesystemDataArrives(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	client := &emptyFSInfoClusterClient{
		resources: []proxmox.ClusterResource{
			{
				Type:    "qemu",
				Node:    "node1",
				VMID:    100,
				Name:    "vm100",
				Status:  "running",
				MaxMem:  8 * 1024,
				Mem:     4 * 1024,
				Disk:    57 * 1024 * 1024 * 1024,
				MaxDisk: 100 * 1024 * 1024 * 1024,
				MaxCPU:  4,
			},
		},
	}

	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 2)

	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}

	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if vm.Disk.Usage != -1 {
		t.Fatalf("expected aggregate disk usage to remain unknown, got %.2f", vm.Disk.Usage)
	}
	if vm.DiskStatusReason != "no-filesystems" {
		t.Fatalf("expected disk status reason %q, got %q", "no-filesystems", vm.DiskStatusReason)
	}

	guestMetrics := mon.metricsHistory.GetGuestMetrics(vm.ID, "disk", time.Hour)
	if len(guestMetrics) != 0 {
		t.Fatalf("expected no disk metric samples while disk usage is unknown, got %#v", guestMetrics)
	}
}

// Disk data from a linked host agent fills in when the guest agent reports no filesystems.
func TestPollVMsAndContainersEfficientUsesLinkedHostAgentDiskFallback(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	client := &emptyFSInfoClusterClient{
		resources: []proxmox.ClusterResource{
			{
				Type:    "qemu",
				Node:    "node1",
				VMID:    100,
				Name:    "vm100",
				Status:  "running",
				MaxMem:  8 * 1024,
				Mem:     4 * 1024,
				MaxDisk: 100 * 1024 * 1024 * 1024,
				MaxCPU:  4,
			},
		},
	}

	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 2)

	mon.state.UpsertHost(models.Host{
		ID:         "host-100",
		Hostname:   "vm100-agent",
		Status:     "online",
		LinkedVMID: makeGuestID("pve1", "node1", 100),
		Disks: []models.Disk{
			{
				Total:      100 * 1024 * 1024 * 1024,
				Used:       57 * 1024 * 1024 * 1024,
				Free:       43 * 1024 * 1024 * 1024,
				Usage:      57,
				Mountpoint: "C:",
				Type:       "NTFS",
				Device:     "C:",
			},
		},
	})

	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}

	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if vm.Disk.Usage != 57 {
		t.Fatalf("expected linked host-agent disk usage, got %.2f", vm.Disk.Usage)
	}
	if vm.DiskStatusReason != "" {
		t.Fatalf("expected cleared disk status reason, got %q", vm.DiskStatusReason)
	}
	if len(vm.Disks) != 1 || vm.Disks[0].Mountpoint != "C:" {
		t.Fatalf("expected linked host-agent disk inventory, got %#v", vm.Disks)
	}
}

// Only the usable C: drive is kept; the "System Reserved" partition is dropped.
func TestPollVMsAndContainersEfficientKeepsNormalizedWindowsDriveRoots(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	client := &windowsDriveClusterClient{
		resources: []proxmox.ClusterResource{
			{
				Type:    "qemu",
				Node:    "node1",
				VMID:    100,
				Name:    "win100",
				Status:  "running",
				MaxMem:  8 * 1024,
				Mem:     4 * 1024,
				Disk:    0,
				MaxDisk: 100 * 1024 * 1024 * 1024,
				MaxCPU:  4,
			},
		},
	}

	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 2)

	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}

	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if vm.DiskStatusReason != "" {
		t.Fatalf("expected empty disk status reason, got %q", vm.DiskStatusReason)
	}
	if len(vm.Disks) != 1 {
		t.Fatalf("expected 1 usable Windows disk, got %#v", vm.Disks)
	}
	if vm.Disks[0].Mountpoint != "C:" {
		t.Fatalf("expected normalized Windows drive root to be preserved, got %q", vm.Disks[0].Mountpoint)
	}
	if vm.Disk.Usage <= 0 {
		t.Fatalf("expected Windows guest disk usage to be populated, got %.2f", vm.Disk.Usage)
	}
}

// Suspicious repeated low-trust full-memory readings fall back to the previous
// snapshot, while an unaffected guest keeps its current reading.
func TestPollVMsAndContainersEfficientStabilizesSuspiciousRepeatedLowTrustMemory(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	const total = uint64(8 << 30)

	client := &repeatedLowTrustMemoryClusterClient{
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: total, Mem: total, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 101, Name: "vm101", Status: "running", MaxMem: total, Mem: total, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 102, Name: "vm102", Status: "running", MaxMem: total, Mem: total, MaxCPU: 4},
			{Type: "qemu", Node: "node1", VMID: 103, Name: "vm103", Status: "running", MaxMem: total, Mem: 2 << 30, MaxCPU: 4},
		},
		vmStatuses: map[int]*proxmox.VMStatus{
			100: {Status: "running", MaxMem: total, Mem: total, Balloon: 2 << 30, Agent: proxmox.VMAgentField{Value: 1}},
			101: {Status: "running", MaxMem: total, Mem: total, Agent: proxmox.VMAgentField{Value: 1}},
			102: {Status: "running", MaxMem: total, Mem: total, Agent: proxmox.VMAgentField{Value: 1}},
			103: {Status: "running", MaxMem: total, Mem: 2 << 30, Agent: proxmox.VMAgentField{Value: 0}},
		},
	}

	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 4)

	now := time.Now()
	mon.state.UpdateVMs([]models.VM{
		{
			ID:           makeGuestID("pve1", "node1", 100),
			VMID:         100,
			Name:         "vm100",
			Node:         "node1",
			Instance:     "pve1",
			Type:         "qemu",
			Status:       "running",
			MemorySource: "rrd-memavailable",
			Memory:       models.Memory{Total: int64(total), Used: 3 << 30, Free: 5 << 30, Usage: safePercentage(float64(3<<30), float64(total))},
			LastSeen:     now,
		},
		{
			ID:           makeGuestID("pve1", "node1", 101),
			VMID:         101,
			Name:         "vm101",
			Node:         "node1",
			Instance:     "pve1",
			Type:         "qemu",
			Status:       "running",
			MemorySource: "guest-agent-meminfo",
			Memory:       models.Memory{Total: int64(total), Used: 4 << 30, Free: 4 << 30, Usage: 50},
			LastSeen:     now,
		},
		{
			ID:           makeGuestID("pve1", "node1", 102),
			VMID:         102,
			Name:         "vm102",
			Node:         "node1",
			Instance:     "pve1",
			Type:         "qemu",
			Status:       "running",
			MemorySource: "previous-snapshot",
			Memory:       models.Memory{Total: int64(total), Used: 5 << 30, Free: 3 << 30, Usage: 62.5},
			LastSeen:     now,
		},
	})

	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}

	state := mon.state.GetSnapshot()
	if len(state.VMs) != 4 {
		t.Fatalf("expected 4 VMs, got %d", len(state.VMs))
	}
	vmByID := make(map[int]models.VM, len(state.VMs))
	for _, vm := range state.VMs {
		vmByID[vm.VMID] = vm
	}
	if vmByID[100].MemorySource != "previous-snapshot" || vmByID[100].Memory.Used != 3<<30 {
		t.Fatalf("vm100 memory = %#v source=%q, want preserved previous reading", vmByID[100].Memory, vmByID[100].MemorySource)
	}
	if vmByID[100].Memory.Balloon != 2<<30 {
		t.Fatalf("vm100 balloon = %d, want current balloon", vmByID[100].Memory.Balloon)
	}
	if vmByID[101].MemorySource != "previous-snapshot" || vmByID[101].Memory.Used != 4<<30 {
		t.Fatalf("vm101 memory = %#v source=%q, want preserved previous reading", vmByID[101].Memory, vmByID[101].MemorySource)
	}
	if vmByID[102].MemorySource != "previous-snapshot" || vmByID[102].Memory.Used != 5<<30 {
		t.Fatalf("vm102 memory = %#v source=%q, want chained preserved reading", vmByID[102].Memory, vmByID[102].MemorySource)
	}
	if vmByID[103].MemorySource != "status-mem" || vmByID[103].Memory.Used != 2<<30 {
		t.Fatalf("vm103 memory = %#v source=%q, want unaffected current reading", vmByID[103].Memory, vmByID[103].MemorySource)
	}

	snapshotKey := makeGuestSnapshotKey("pve1", "qemu", "node1", 100)
	mon.diagMu.RLock()
	snapshot, ok := mon.guestSnapshots[snapshotKey]
	stabilizedSnapshot := mon.guestSnapshots[makeGuestSnapshotKey("pve1", "qemu", "node1", 102)]
	mon.diagMu.RUnlock()
	if !ok {
		t.Fatal("expected guest snapshot for vm100")
	}
	if snapshot.MemorySource != "previous-snapshot" || snapshot.Memory.Used != 3<<30 {
		t.Fatalf("snapshot memory = %#v source=%q, want preserved previous reading", snapshot.Memory, snapshot.MemorySource)
	}
	if !snapshotHasNote(stabilizedSnapshot.Notes, "preserved-previous-memory-after-repeated-low-trust-pattern") && !snapshotHasNote(stabilizedSnapshot.Notes, "preserved-previous-memory-for-healthy-guest-low-trust-full-usage") {
		t.Fatalf("vm102 snapshot notes = %#v, want preservation note", stabilizedSnapshot.Notes)
	}
}

// A guest with a responsive agent is treated as healthy, so the previous memory reading keeps carrying forward.
func TestPollVMsAndContainersEfficientTreatsAvailableGuestAgentAsHealthyForMemoryCarryForward(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	const total = uint64(8 << 30)

	client := &repeatedLowTrustMemoryClusterClient{
		resources: []proxmox.ClusterResource{
			{Type: "qemu", Node: "node1", VMID: 100, Name: "vm100", Status: "running", MaxMem: total, Mem: total, MaxCPU: 4},
		},
		vmStatuses: map[int]*proxmox.VMStatus{
			100: {Status: "running", MaxMem: total, Mem: total, Agent: proxmox.VMAgentField{Value: 1}},
		},
	}

	mon := newTestPVEMonitor("pve1")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()
	mon.rateTracker = NewRateTracker()
	mon.guestMetadataCache = make(map[string]guestMetadataCacheEntry)
	mon.guestMetadataLimiter = make(map[string]time.Time)
	mon.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	mon.vmAgentMemCache = make(map[string]agentMemCacheEntry)
	mon.guestAgentWorkSlots = make(chan struct{}, 4)

	now := time.Now()
	mon.state.UpdateVMs([]models.VM{
		{
			ID:           makeGuestID("pve1", "node1", 100),
			VMID:         100,
			Name:         "vm100",
			Node:         "node1",
			Instance:     "pve1",
			Type:         "qemu",
			Status:       "running",
			MemorySource: "previous-snapshot",
			Memory:       models.Memory{Total: int64(total), Used: 3 << 30, Free: 5 << 30, Usage: safePercentage(float64(3<<30), float64(total))},
			LastSeen:     now,
		},
	})

	if ok := mon.pollVMsAndContainersEfficient(context.Background(), "pve1", "", false, client, map[string]string{"node1": "online"}); !ok {
		t.Fatal("pollVMsAndContainersEfficient() returned false")
	}

	state := mon.state.GetSnapshot()
	if len(state.VMs) != 1 {
		t.Fatalf("expected 1 VM, got %d", len(state.VMs))
	}
	vm := state.VMs[0]
	if vm.MemorySource != "previous-snapshot" || vm.Memory.Used != 3<<30 {
		t.Fatalf("vm memory = %#v source=%q, want preserved previous reading", vm.Memory, vm.MemorySource)
	}
}