// Pulse/internal/monitoring/monitor_polling_test.go

package monitoring
import (
"context"
"errors"
"strings"
"sync/atomic"
"testing"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/mock"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/internal/storagehealth"
"github.com/rcourtman/pulse-go-rewrite/internal/truenas"
"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
"github.com/rcourtman/pulse-go-rewrite/internal/vmware"
"github.com/rcourtman/pulse-go-rewrite/pkg/metrics"
"github.com/rcourtman/pulse-go-rewrite/pkg/pbs"
"github.com/rcourtman/pulse-go-rewrite/pkg/pmg"
)
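// testPollProvider is a configurable PollProvider stub used to exercise the
// scheduler without real PVE/PBS/PMG clients. Each field backs one interface
// method, so tests set only what they need.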
type testPollProvider struct {
providerType InstanceType
instances []string
describeInstances []PollProviderInstanceInfo
connectionStatus map[string]bool
connectionKey string
interval time.Duration
buildPollTask func(instanceName string) (PollTask, error)
}
func (p testPollProvider) Type() InstanceType { return p.providerType }
func (p testPollProvider) ListInstances(_ *Monitor) []string {
out := make([]string, len(p.instances))
copy(out, p.instances)
return out
}
func (p testPollProvider) DescribeInstances(_ *Monitor) []PollProviderInstanceInfo {
out := make([]PollProviderInstanceInfo, len(p.describeInstances))
for i := range p.describeInstances {
out[i] = PollProviderInstanceInfo{
Name: p.describeInstances[i].Name,
DisplayName: p.describeInstances[i].DisplayName,
Connection: p.describeInstances[i].Connection,
Metadata: cloneProviderMetadata(p.describeInstances[i].Metadata),
}
}
return out
}
func (p testPollProvider) ConnectionStatuses(_ *Monitor) map[string]bool {
if len(p.connectionStatus) == 0 {
return nil
}
out := make(map[string]bool, len(p.connectionStatus))
for key, healthy := range p.connectionStatus {
out[key] = healthy
}
return out
}
func (p testPollProvider) ConnectionHealthKey(_ *Monitor, instanceName string) string {
if strings.TrimSpace(p.connectionKey) != "" {
return strings.TrimSpace(p.connectionKey)
}
return ""
}
func (p testPollProvider) BaseInterval(_ *Monitor) time.Duration { return p.interval }
func (p testPollProvider) BuildPollTask(_ *Monitor, instanceName string) (PollTask, error) {
if p.buildPollTask == nil {
return PollTask{
InstanceName: instanceName,
InstanceType: string(p.providerType),
}, nil
}
return p.buildPollTask(instanceName)
}
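// testSupplementalPollProvider layers supplemental records, resource changes,
// and owned-source reporting on top of testPollProvider so tests can drive
// the unified resource ingest path. lastRequestedOrg captures the org ID of
// the most recent lookup.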
type testSupplementalPollProvider struct {
testPollProvider
source unifiedresources.DataSource
ownedSources []unifiedresources.DataSource
recordsByOrg map[string][]unifiedresources.IngestRecord
changesByOrg map[string][]unifiedresources.ResourceChange
lastRequestedOrg string
}
func (p *testSupplementalPollProvider) SupplementalSource() unifiedresources.DataSource {
return p.source
}
func (p *testSupplementalPollProvider) SupplementalRecords(_ *Monitor, orgID string) []unifiedresources.IngestRecord {
p.lastRequestedOrg = orgID
records := p.recordsByOrg[orgID]
out := make([]unifiedresources.IngestRecord, len(records))
copy(out, records)
return out
}
func (p *testSupplementalPollProvider) SupplementalChanges(_ *Monitor, orgID string) []unifiedresources.ResourceChange {
p.lastRequestedOrg = orgID
return cloneTestResourceChanges(p.changesByOrg[orgID])
}
func (p *testSupplementalPollProvider) SnapshotOwnedSources(_ *Monitor) []unifiedresources.DataSource {
out := make([]unifiedresources.DataSource, len(p.ownedSources))
copy(out, p.ownedSources)
return out
}
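// testMonitorSupplementalProvider is a standalone
// MonitorSupplementalRecordsProvider stub for tests that attach supplemental
// providers directly to the Monitor rather than via a poll provider.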
type testMonitorSupplementalProvider struct {
recordsByOrg map[string][]unifiedresources.IngestRecord
changesByOrg map[string][]unifiedresources.ResourceChange
ownedSources []unifiedresources.DataSource
lastRequestedOrg string
}
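// ptrFloat64 and ptrInt64 return pointers to literal values so the metric
// fixtures below stay readable.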
func ptrFloat64(value float64) *float64 {
return &value
}
func ptrInt64(value int64) *int64 {
return &value
}
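// cloneTestResourceChanges deep-copies resource changes, including the
// OccurredAt pointer, RelatedResources slice, and Metadata map, so stored
// copies cannot alias the caller's data.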
func cloneTestResourceChanges(in []unifiedresources.ResourceChange) []unifiedresources.ResourceChange {
if in == nil {
return nil
}
out := make([]unifiedresources.ResourceChange, len(in))
for i := range in {
out[i] = in[i]
if in[i].OccurredAt != nil {
occurredAt := in[i].OccurredAt.UTC()
out[i].OccurredAt = &occurredAt
}
if in[i].RelatedResources != nil {
out[i].RelatedResources = append([]string(nil), in[i].RelatedResources...)
}
if in[i].Metadata != nil {
out[i].Metadata = make(map[string]any, len(in[i].Metadata))
for key, value := range in[i].Metadata {
out[i].Metadata[key] = value
}
}
}
return out
}
func (p *testMonitorSupplementalProvider) SupplementalRecords(_ *Monitor, orgID string) []unifiedresources.IngestRecord {
p.lastRequestedOrg = orgID
records := p.recordsByOrg[orgID]
out := make([]unifiedresources.IngestRecord, len(records))
copy(out, records)
return out
}
func (p *testMonitorSupplementalProvider) SupplementalChanges(_ *Monitor, orgID string) []unifiedresources.ResourceChange {
p.lastRequestedOrg = orgID
return cloneTestResourceChanges(p.changesByOrg[orgID])
}
func (p *testMonitorSupplementalProvider) SnapshotOwnedSources() []unifiedresources.DataSource {
out := make([]unifiedresources.DataSource, len(p.ownedSources))
copy(out, p.ownedSources)
return out
}
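// testSupplementalResourceStore records snapshot, supplemental-record, and
// change ingest calls so tests can assert exactly what the Monitor pushed
// into the unified resource store.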
type testSupplementalResourceStore struct {
snapshotCalls int
lastSnapshot models.StateSnapshot
recordsBySource map[unifiedresources.DataSource][]unifiedresources.IngestRecord
recordedChanges []unifiedresources.ResourceChange
}
func (s *testSupplementalResourceStore) ShouldSkipAPIPolling(string) bool { return false }
func (s *testSupplementalResourceStore) GetPollingRecommendations() map[string]float64 { return nil }
func (s *testSupplementalResourceStore) GetAll() []unifiedresources.Resource { return nil }
func (s *testSupplementalResourceStore) PopulateFromSnapshot(snapshot models.StateSnapshot) {
s.lastSnapshot = snapshot
s.snapshotCalls++
}
func (s *testSupplementalResourceStore) PopulateSupplementalRecords(source unifiedresources.DataSource, records []unifiedresources.IngestRecord) {
if s.recordsBySource == nil {
s.recordsBySource = make(map[unifiedresources.DataSource][]unifiedresources.IngestRecord)
}
cloned := make([]unifiedresources.IngestRecord, len(records))
copy(cloned, records)
s.recordsBySource[source] = append(s.recordsBySource[source], cloned...)
}
func (s *testSupplementalResourceStore) RecordChange(change unifiedresources.ResourceChange) error {
s.recordedChanges = append(s.recordedChanges, cloneTestResourceChanges([]unifiedresources.ResourceChange{change})...)
return nil
}
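// testAtomicResourceStore also implements the combined
// snapshot-plus-supplemental ingest path, letting tests distinguish the
// atomic call from separate PopulateFromSnapshot calls.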
type testAtomicResourceStore struct {
snapshotCalls int
atomicCalls int
lastSnapshot models.StateSnapshot
lastRecordsBySrc map[unifiedresources.DataSource][]unifiedresources.IngestRecord
}
func (s *testAtomicResourceStore) ShouldSkipAPIPolling(string) bool { return false }
func (s *testAtomicResourceStore) GetPollingRecommendations() map[string]float64 { return nil }
func (s *testAtomicResourceStore) GetAll() []unifiedresources.Resource { return nil }
func (s *testAtomicResourceStore) PopulateFromSnapshot(snapshot models.StateSnapshot) {
s.lastSnapshot = snapshot
s.snapshotCalls++
}
func (s *testAtomicResourceStore) PopulateSnapshotAndSupplemental(snapshot models.StateSnapshot, recordsBySource map[unifiedresources.DataSource][]unifiedresources.IngestRecord) {
s.lastSnapshot = snapshot
s.atomicCalls++
s.lastRecordsBySrc = make(map[unifiedresources.DataSource][]unifiedresources.IngestRecord, len(recordsBySource))
for source, records := range recordsBySource {
cloned := make([]unifiedresources.IngestRecord, len(records))
copy(cloned, records)
s.lastRecordsBySrc[source] = cloned
}
}
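// TestBuildScheduledTasksUsesConfiguredIntervals verifies that the scheduler
// seeds one task per configured client and honors each product's configured
// polling interval.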
func TestBuildScheduledTasksUsesConfiguredIntervals(t *testing.T) {
now := time.Now()
cfg := &config.Config{
PVEPollingInterval: 2 * time.Minute,
PBSPollingInterval: 45 * time.Second,
PMGPollingInterval: 90 * time.Second,
AdaptivePollingBaseInterval: 10 * time.Second,
}
monitor := &Monitor{
config: cfg,
pveClients: map[string]PVEClientInterface{"pve-1": nil},
pbsClients: map[string]*pbs.Client{"pbs-1": nil},
pmgClients: map[string]*pmg.Client{"pmg-1": nil},
}
tasks := monitor.buildScheduledTasks(now)
if len(tasks) != 3 {
t.Fatalf("expected 3 tasks, got %d", len(tasks))
}
got := map[InstanceType]time.Duration{}
for _, task := range tasks {
if !task.NextRun.Equal(now) {
t.Fatalf("expected NextRun to equal provided time, got %v", task.NextRun)
}
got[task.InstanceType] = task.Interval
}
if got[InstanceTypePVE] != cfg.PVEPollingInterval {
t.Fatalf("expected PVE interval %v, got %v", cfg.PVEPollingInterval, got[InstanceTypePVE])
}
if got[InstanceTypePBS] != cfg.PBSPollingInterval {
t.Fatalf("expected PBS interval %v, got %v", cfg.PBSPollingInterval, got[InstanceTypePBS])
}
if got[InstanceTypePMG] != cfg.PMGPollingInterval {
t.Fatalf("expected PMG interval %v, got %v", cfg.PMGPollingInterval, got[InstanceTypePMG])
}
}
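// TestRescheduleTaskUsesInstanceIntervalWhenSchedulerDisabled verifies that a
// task queued with no interval falls back to the configured PVE interval and
// is requeued roughly one interval in the future.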
func TestRescheduleTaskUsesInstanceIntervalWhenSchedulerDisabled(t *testing.T) {
cfg := &config.Config{
PVEPollingInterval: 75 * time.Second,
AdaptivePollingBaseInterval: 10 * time.Second,
}
monitor := &Monitor{
config: cfg,
taskQueue: NewTaskQueue(),
}
task := ScheduledTask{
InstanceName: "pve-1",
InstanceType: InstanceTypePVE,
Interval: 0,
NextRun: time.Now(),
}
monitor.rescheduleTask(task)
monitor.taskQueue.mu.Lock()
entry, ok := monitor.taskQueue.entries[schedulerKey(task.InstanceType, task.InstanceName)]
monitor.taskQueue.mu.Unlock()
if !ok {
t.Fatalf("expected task to be rescheduled in queue")
}
if entry.task.Interval != cfg.PVEPollingInterval {
t.Fatalf("expected interval %v, got %v", cfg.PVEPollingInterval, entry.task.Interval)
}
remaining := time.Until(entry.task.NextRun)
if remaining < cfg.PVEPollingInterval-2*time.Second || remaining > cfg.PVEPollingInterval+time.Second {
t.Fatalf("expected NextRun about %v from now, got %v", cfg.PVEPollingInterval, remaining)
}
}
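// TestUpdateResourceStoreSyncsUnifiedIncidentAlerts verifies that supplemental
// TrueNAS incidents ingested through updateResourceStore surface as active
// alerts in both the alert manager and the state snapshot.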
func TestUpdateResourceStoreSyncsUnifiedIncidentAlerts(t *testing.T) {
alertManager := alerts.NewManagerWithDataDir(t.TempDir())
defer alertManager.Stop()
store := unifiedresources.NewMonitorAdapter(nil)
monitor := &Monitor{
state: models.NewState(),
resourceStore: store,
alertManager: alertManager,
orgID: "default",
supplementalProviders: map[unifiedresources.DataSource]MonitorSupplementalRecordsProvider{
unifiedresources.SourceTrueNAS: &testMonitorSupplementalProvider{
recordsByOrg: map[string][]unifiedresources.IngestRecord{
"default": {{
SourceID: "pool:tank",
Resource: unifiedresources.Resource{
ID: "storage:tank",
Type: unifiedresources.ResourceTypeStorage,
Name: "tank",
ParentName: "truenas-main",
Sources: []unifiedresources.DataSource{unifiedresources.SourceTrueNAS},
Storage: &unifiedresources.StorageMeta{
Platform: "truenas",
Topology: "pool",
Protection: "zfs",
IsZFS: true,
},
Incidents: []unifiedresources.ResourceIncident{{
Provider: "truenas",
NativeID: "alert-1",
Code: "truenas_volume_status",
Severity: storagehealth.RiskWarning,
Summary: "Pool tank is DEGRADED",
}},
},
}},
},
},
},
}
monitor.updateResourceStore(models.StateSnapshot{})
active := alertManager.GetActiveAlerts()
if len(active) != 1 {
t.Fatalf("expected 1 active alert, got %d", len(active))
}
if active[0].Type != "zfs-pool-state" {
t.Fatalf("alert type = %q, want zfs-pool-state", active[0].Type)
}
snapshot := monitor.state.GetSnapshot()
if len(snapshot.ActiveAlerts) != 1 {
t.Fatalf("expected state snapshot to contain 1 active alert, got %d", len(snapshot.ActiveAlerts))
}
}
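// TestUpdateResourceStoreSyncsCanonicalStorageMetrics verifies that canonical
// storage usage from supplemental TrueNAS pools and agent-reported Unraid
// arrays lands in both the in-memory history and the persistent metrics
// store.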
func TestUpdateResourceStoreSyncsCanonicalStorageMetrics(t *testing.T) {
t.Run("supplemental truenas storage", func(t *testing.T) {
cfg := metrics.DefaultConfig(t.TempDir())
store, err := metrics.NewStore(cfg)
if err != nil {
t.Fatalf("metrics.NewStore() error = %v", err)
}
defer func() { _ = store.Close() }()
resourceStore := unifiedresources.NewMonitorAdapter(nil)
used := int64(620)
total := int64(1000)
monitor := &Monitor{
resourceStore: resourceStore,
metricsHistory: NewMetricsHistory(1024, 24*time.Hour),
metricsStore: store,
orgID: "default",
supplementalProviders: map[unifiedresources.DataSource]MonitorSupplementalRecordsProvider{
unifiedresources.SourceTrueNAS: &testMonitorSupplementalProvider{
recordsByOrg: map[string][]unifiedresources.IngestRecord{
"default": {{
SourceID: "pool:tank",
Resource: unifiedresources.Resource{
ID: "storage:tank",
Type: unifiedresources.ResourceTypeStorage,
Name: "tank",
Status: unifiedresources.StatusOnline,
LastSeen: time.Now().UTC(),
UpdatedAt: time.Now().UTC(),
Sources: []unifiedresources.DataSource{unifiedresources.SourceTrueNAS},
Metrics: &unifiedresources.ResourceMetrics{
Disk: &unifiedresources.MetricValue{Used: &used, Total: &total, Percent: 62},
},
Storage: &unifiedresources.StorageMeta{
Platform: "truenas",
Topology: "pool",
Protection: "zfs",
},
},
}},
},
},
},
}
monitor.updateResourceStore(models.StateSnapshot{})
memory := monitor.GetStorageMetrics("pool:tank", time.Hour)
if got := len(memory["usage"]); got == 0 {
t.Fatalf("expected in-memory usage history for pool:tank")
}
storeBacked := monitor.GetStorageMetricsForChart("pool:tank", 7*24*time.Hour)
if got := len(storeBacked["usage"]); got == 0 {
t.Fatalf("expected persisted usage history for pool:tank")
}
})
t.Run("agent unraid storage", func(t *testing.T) {
cfg := metrics.DefaultConfig(t.TempDir())
store, err := metrics.NewStore(cfg)
if err != nil {
t.Fatalf("metrics.NewStore() error = %v", err)
}
defer func() { _ = store.Close() }()
resourceStore := unifiedresources.NewMonitorAdapter(nil)
snapshot := models.StateSnapshot{
Hosts: []models.Host{
{
ID: "host-tower",
Hostname: "tower",
Status: "online",
LastSeen: time.Now().UTC(),
MachineID: "machine-tower",
Disks: []models.Disk{
{Mountpoint: "/mnt/user", Total: 1000, Used: 400, Free: 600, Usage: 40},
},
Unraid: &models.HostUnraidStorage{
ArrayStarted: true,
ArrayState: "STARTED",
NumProtected: 1,
},
},
},
}
monitor := &Monitor{
resourceStore: resourceStore,
metricsHistory: NewMetricsHistory(1024, 24*time.Hour),
metricsStore: store,
}
monitor.updateResourceStore(snapshot)
var storageResourceID string
for _, resource := range resourceStore.GetAll() {
if resource.Type == unifiedresources.ResourceTypeStorage && resource.Storage != nil && resource.Storage.Platform == "unraid" {
storageResourceID = resource.ID
break
}
}
if storageResourceID == "" {
t.Fatal("expected unraid storage resource in unified store")
}
target := resourceStore.MetricsTargetForResource(storageResourceID)
if target == nil || target.ResourceType != "storage" || target.ResourceID != "host-tower/storage:unraid-array" {
t.Fatalf("unexpected unraid storage metrics target %+v", target)
}
memory := monitor.GetStorageMetrics(target.ResourceID, time.Hour)
if got := len(memory["usage"]); got == 0 {
t.Fatalf("expected in-memory usage history for %s", target.ResourceID)
}
storeBacked := monitor.GetStorageMetricsForChart(target.ResourceID, 7*24*time.Hour)
if got := len(storeBacked["usage"]); got == 0 {
t.Fatalf("expected persisted usage history for %s", target.ResourceID)
}
})
}
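// TestBuildBroadcastFrontendStateIncludesUnifiedIncidentAlerts verifies that
// incident-backed alerts reach the broadcast frontend state as
// resource-incident alerts.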
func TestBuildBroadcastFrontendStateIncludesUnifiedIncidentAlerts(t *testing.T) {
alertManager := alerts.NewManagerWithDataDir(t.TempDir())
defer alertManager.Stop()
store := unifiedresources.NewMonitorAdapter(nil)
monitor := &Monitor{
state: models.NewState(),
resourceStore: store,
alertManager: alertManager,
orgID: "default",
supplementalProviders: map[unifiedresources.DataSource]MonitorSupplementalRecordsProvider{
unifiedresources.SourceTrueNAS: &testMonitorSupplementalProvider{
recordsByOrg: map[string][]unifiedresources.IngestRecord{
"default": {{
SourceID: "system:truenas-main",
Resource: unifiedresources.Resource{
ID: "agent:truenas-main",
Type: unifiedresources.ResourceTypeAgent,
Name: "truenas-main",
Sources: []unifiedresources.DataSource{unifiedresources.SourceTrueNAS},
TrueNAS: &unifiedresources.TrueNASData{Hostname: "truenas-main"},
Incidents: []unifiedresources.ResourceIncident{{
Provider: "truenas",
NativeID: "alert-2",
Code: "truenas_volume_status",
Severity: storagehealth.RiskCritical,
Summary: "Pool tank is FAULTED",
}},
},
}},
},
},
},
}
frontend := monitor.buildBroadcastFrontendStateFromSnapshot(models.StateSnapshot{})
if len(frontend.ActiveAlerts) != 1 {
t.Fatalf("expected 1 active alert in frontend state, got %d", len(frontend.ActiveAlerts))
}
if frontend.ActiveAlerts[0].Type != "resource-incident" {
t.Fatalf("alert type = %q, want resource-incident", frontend.ActiveAlerts[0].Type)
}
}
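// The tests below drive the unified metrics sync paths (app containers,
// agents, VMs, storage, physical disks) from provider fixtures, asserting
// both in-memory and store-backed history as well as the mock-mode guards
// that skip mock-owned provider data.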
func TestSyncUnifiedAppContainerMetricsRecordsTrueNASHistory(t *testing.T) {
previous := truenas.IsFeatureEnabled()
truenas.SetFeatureEnabled(true)
t.Cleanup(func() {
truenas.SetFeatureEnabled(previous)
})
cfg := metrics.DefaultConfig(t.TempDir())
store, err := metrics.NewStore(cfg)
if err != nil {
t.Fatalf("metrics.NewStore() error = %v", err)
}
defer func() { _ = store.Close() }()
resourceStore := unifiedresources.NewMonitorAdapter(nil)
records := truenas.NewProvider(truenas.DefaultFixtures()).Records()
resourceStore.PopulateSnapshotAndSupplemental(models.StateSnapshot{}, map[unifiedresources.DataSource][]unifiedresources.IngestRecord{
unifiedresources.SourceTrueNAS: records,
})
monitor := &Monitor{
resourceStore: resourceStore,
metricsHistory: NewMetricsHistory(1024, 24*time.Hour),
metricsStore: store,
}
monitor.syncUnifiedAppContainerMetrics(resourceStore)
var appResourceID string
for _, resource := range resourceStore.GetAll() {
if resource.Type == unifiedresources.ResourceTypeAppContainer && resource.Name == "Nextcloud" {
appResourceID = resource.ID
break
}
}
if appResourceID == "" {
t.Fatal("expected Nextcloud app-container resource in unified store")
}
target := resourceStore.MetricsTargetForResource(appResourceID)
if target == nil || target.ResourceType != "app-container" || target.ResourceID != "nextcloud" {
t.Fatalf("unexpected app-container metrics target %+v", target)
}
inMemory := monitor.GetGuestMetrics("docker:nextcloud", time.Hour)
if got := len(inMemory["cpu"]); got == 0 {
t.Fatalf("expected in-memory cpu history for nextcloud")
}
if got := len(inMemory["netin"]); got == 0 {
t.Fatalf("expected in-memory network history for nextcloud")
}
storeBacked := monitor.GetGuestMetricsForChart("docker:nextcloud", "dockerContainer", "nextcloud", 7*24*time.Hour)
if got := len(storeBacked["cpu"]); got == 0 {
t.Fatalf("expected persisted cpu history for nextcloud")
}
}
func TestSyncUnifiedAppContainerMetricsSkipsMockOwnedTrueNASHistoryWhenMockEnabled(t *testing.T) {
previousFeature := truenas.IsFeatureEnabled()
truenas.SetFeatureEnabled(true)
t.Cleanup(func() {
truenas.SetFeatureEnabled(previousFeature)
})
previousMock := mock.IsMockEnabled()
mock.SetEnabled(true)
t.Cleanup(func() {
mock.SetEnabled(previousMock)
})
cfg := metrics.DefaultConfig(t.TempDir())
store, err := metrics.NewStore(cfg)
if err != nil {
t.Fatalf("metrics.NewStore() error = %v", err)
}
defer func() { _ = store.Close() }()
resourceStore := unifiedresources.NewMonitorAdapter(nil)
resourceStore.PopulateSnapshotAndSupplemental(models.StateSnapshot{}, map[unifiedresources.DataSource][]unifiedresources.IngestRecord{
unifiedresources.SourceTrueNAS: truenas.NewProvider(truenas.DefaultFixtures()).Records(),
})
monitor := &Monitor{
resourceStore: resourceStore,
metricsHistory: NewMetricsHistory(1024, 24*time.Hour),
metricsStore: store,
}
monitor.syncUnifiedAppContainerMetrics(resourceStore)
if got := len(monitor.GetGuestMetrics("docker:nextcloud", time.Hour)["cpu"]); got != 0 {
t.Fatalf("expected mock-owned TrueNAS app history to be skipped, got %d cpu points", got)
}
}
func TestSyncUnifiedAgentMetricsRecordsTrueNASHostHistory(t *testing.T) {
previous := truenas.IsFeatureEnabled()
truenas.SetFeatureEnabled(true)
t.Cleanup(func() {
truenas.SetFeatureEnabled(previous)
})
cfg := metrics.DefaultConfig(t.TempDir())
store, err := metrics.NewStore(cfg)
if err != nil {
t.Fatalf("metrics.NewStore() error = %v", err)
}
defer func() { _ = store.Close() }()
resourceStore := unifiedresources.NewMonitorAdapter(nil)
records := truenas.NewProvider(truenas.DefaultFixtures()).Records()
resourceStore.PopulateSnapshotAndSupplemental(models.StateSnapshot{}, map[unifiedresources.DataSource][]unifiedresources.IngestRecord{
unifiedresources.SourceTrueNAS: records,
})
monitor := &Monitor{
resourceStore: resourceStore,
metricsHistory: NewMetricsHistory(1024, 24*time.Hour),
metricsStore: store,
}
monitor.syncUnifiedAgentMetrics(resourceStore)
var systemResourceID string
for _, resource := range resourceStore.GetAll() {
if resource.Type == unifiedresources.ResourceTypeAgent && resource.Name == "truenas-main" {
systemResourceID = resource.ID
break
}
}
if systemResourceID == "" {
t.Fatal("expected TrueNAS system resource in unified store")
}
target := resourceStore.MetricsTargetForResource(systemResourceID)
if target == nil || target.ResourceType != "agent" || target.ResourceID != "truenas-main" {
t.Fatalf("unexpected agent metrics target %+v", target)
}
inMemory := monitor.GetGuestMetrics("agent:truenas-main", time.Hour)
if got := len(inMemory["cpu"]); got == 0 {
t.Fatalf("expected in-memory cpu history for truenas-main")
}
if got := len(inMemory["netin"]); got == 0 {
t.Fatalf("expected in-memory network history for truenas-main")
}
storeBacked := monitor.GetGuestMetricsForChart("agent:truenas-main", "agent", "truenas-main", 7*24*time.Hour)
if got := len(storeBacked["cpu"]); got == 0 {
t.Fatalf("expected persisted cpu history for truenas-main")
}
}
func TestSyncUnifiedAgentMetricsSkipsMockOwnedProviderHistoryWhenMockEnabled(t *testing.T) {
previousTrueNAS := truenas.IsFeatureEnabled()
truenas.SetFeatureEnabled(true)
t.Cleanup(func() {
truenas.SetFeatureEnabled(previousTrueNAS)
})
previousMock := mock.IsMockEnabled()
mock.SetEnabled(true)
t.Cleanup(func() {
mock.SetEnabled(previousMock)
})
cfg := metrics.DefaultConfig(t.TempDir())
store, err := metrics.NewStore(cfg)
if err != nil {
t.Fatalf("metrics.NewStore() error = %v", err)
}
defer func() { _ = store.Close() }()
resourceStore := unifiedresources.NewMonitorAdapter(nil)
resourceStore.PopulateSnapshotAndSupplemental(models.StateSnapshot{}, map[unifiedresources.DataSource][]unifiedresources.IngestRecord{
unifiedresources.SourceTrueNAS: truenas.NewProvider(truenas.DefaultFixtures()).Records(),
})
monitor := &Monitor{
resourceStore: resourceStore,
metricsHistory: NewMetricsHistory(1024, 24*time.Hour),
metricsStore: store,
}
monitor.syncUnifiedAgentMetrics(resourceStore)
if got := len(monitor.GetGuestMetrics("agent:truenas-main", time.Hour)["cpu"]); got != 0 {
t.Fatalf("expected mock-owned TrueNAS host history to be skipped, got %d cpu points", got)
}
}
func TestSyncUnifiedAgentMetricsRecordsVMwareHostHistory(t *testing.T) {
previous := vmware.IsFeatureEnabled()
vmware.SetFeatureEnabled(true)
t.Cleanup(func() {
vmware.SetFeatureEnabled(previous)
})
cfg := metrics.DefaultConfig(t.TempDir())
store, err := metrics.NewStore(cfg)
if err != nil {
t.Fatalf("metrics.NewStore() error = %v", err)
}
defer func() { _ = store.Close() }()
resourceStore := unifiedresources.NewMonitorAdapter(nil)
records := vmware.NewProvider(vmware.InventorySnapshot{
ConnectionID: "vc-1",
ConnectionName: "Lab VC",
VCenterHost: "vc.lab.local",
CollectedAt: time.Date(2026, time.March, 30, 18, 15, 0, 0, time.UTC),
Hosts: []vmware.InventoryHost{{
Host: "host-101",
Name: "esxi-01.lab.local",
ConnectionState: "CONNECTED",
PowerState: "POWERED_ON",
HostUUID: "uuid-host-1",
Metrics: &vmware.InventoryMetrics{
CPUPercent: ptrFloat64(21.4),
MemoryPercent: ptrFloat64(63.2),
MemoryUsedBytes: ptrInt64(27144105984),
MemoryTotalBytes: ptrInt64(42949672960),
NetInBytesPerSecond: ptrFloat64(1024),
NetOutBytesPerSecond: ptrFloat64(2048),
DiskReadBytesPerSecond: ptrFloat64(4096),
DiskWriteBytesPerSecond: ptrFloat64(8192),
},
}},
}).Records()
resourceStore.PopulateSnapshotAndSupplemental(models.StateSnapshot{}, map[unifiedresources.DataSource][]unifiedresources.IngestRecord{
unifiedresources.SourceVMware: records,
})
monitor := &Monitor{
resourceStore: resourceStore,
metricsHistory: NewMetricsHistory(1024, 24*time.Hour),
metricsStore: store,
}
monitor.syncUnifiedAgentMetrics(resourceStore)
var systemResourceID string
for _, resource := range resourceStore.GetAll() {
if resource.Type == unifiedresources.ResourceTypeAgent && resource.Name == "esxi-01.lab.local" {
systemResourceID = resource.ID
break
}
}
if systemResourceID == "" {
t.Fatal("expected VMware host resource in unified store")
}
target := resourceStore.MetricsTargetForResource(systemResourceID)
if target == nil || target.ResourceType != "agent" || target.ResourceID != "vc-1:host:host-101" {
t.Fatalf("unexpected agent metrics target %+v", target)
}
inMemory := monitor.GetGuestMetrics("agent:vc-1:host:host-101", time.Hour)
if got := len(inMemory["cpu"]); got == 0 {
t.Fatalf("expected in-memory cpu history for VMware host")
}
if got := len(inMemory["netin"]); got == 0 {
t.Fatalf("expected in-memory network history for VMware host")
}
storeBacked := monitor.GetGuestMetricsForChart("agent:vc-1:host:host-101", "agent", "vc-1:host:host-101", 7*24*time.Hour)
if got := len(storeBacked["cpu"]); got == 0 {
t.Fatalf("expected persisted cpu history for VMware host")
}
}
func TestSyncUnifiedVMMetricsRecordsVMwareVMHistory(t *testing.T) {
previous := vmware.IsFeatureEnabled()
vmware.SetFeatureEnabled(true)
t.Cleanup(func() {
vmware.SetFeatureEnabled(previous)
})
cfg := metrics.DefaultConfig(t.TempDir())
store, err := metrics.NewStore(cfg)
if err != nil {
t.Fatalf("metrics.NewStore() error = %v", err)
}
defer func() { _ = store.Close() }()
resourceStore := unifiedresources.NewMonitorAdapter(nil)
records := vmware.NewProvider(vmware.InventorySnapshot{
ConnectionID: "vc-1",
ConnectionName: "Lab VC",
VCenterHost: "vc.lab.local",
CollectedAt: time.Date(2026, time.March, 30, 18, 15, 0, 0, time.UTC),
VMs: []vmware.InventoryVM{{
VM: "vm-201",
Name: "app-01",
PowerState: "POWERED_ON",
CPUCount: 4,
MemorySizeMiB: 8192,
Metrics: &vmware.InventoryMetrics{
CPUPercent: ptrFloat64(38.1),
MemoryPercent: ptrFloat64(57.5),
MemoryUsedBytes: ptrInt64(5033164800),
MemoryTotalBytes: ptrInt64(8589934592),
NetInBytesPerSecond: ptrFloat64(512),
NetOutBytesPerSecond: ptrFloat64(768),
DiskReadBytesPerSecond: ptrFloat64(1536),
DiskWriteBytesPerSecond: ptrFloat64(2048),
},
}},
}).Records()
resourceStore.PopulateSnapshotAndSupplemental(models.StateSnapshot{}, map[unifiedresources.DataSource][]unifiedresources.IngestRecord{
unifiedresources.SourceVMware: records,
})
monitor := &Monitor{
resourceStore: resourceStore,
metricsHistory: NewMetricsHistory(1024, 24*time.Hour),
metricsStore: store,
}
monitor.syncUnifiedVMMetrics(resourceStore)
var vmResourceID string
for _, resource := range resourceStore.GetAll() {
if resource.Type == unifiedresources.ResourceTypeVM && resource.Name == "app-01" {
vmResourceID = resource.ID
break
}
}
if vmResourceID == "" {
t.Fatal("expected VMware VM resource in unified store")
}
target := resourceStore.MetricsTargetForResource(vmResourceID)
if target == nil || target.ResourceType != "vm" || target.ResourceID != "vc-1:vm:vm-201" {
t.Fatalf("unexpected vm metrics target %+v", target)
}
inMemory := monitor.GetGuestMetrics("vc-1:vm:vm-201", time.Hour)
if got := len(inMemory["cpu"]); got == 0 {
t.Fatalf("expected in-memory cpu history for VMware VM")
}
if got := len(inMemory["netin"]); got == 0 {
t.Fatalf("expected in-memory network history for VMware VM")
}
storeBacked := monitor.GetGuestMetricsForChart("vc-1:vm:vm-201", "vm", "vc-1:vm:vm-201", 7*24*time.Hour)
if got := len(storeBacked["cpu"]); got == 0 {
t.Fatalf("expected persisted cpu history for VMware VM")
}
}
func TestSyncUnifiedPhysicalDiskMetricsRecordsTrueNASDiskHistory(t *testing.T) {
previous := truenas.IsFeatureEnabled()
truenas.SetFeatureEnabled(true)
t.Cleanup(func() {
truenas.SetFeatureEnabled(previous)
})
cfg := metrics.DefaultConfig(t.TempDir())
store, err := metrics.NewStore(cfg)
if err != nil {
t.Fatalf("metrics.NewStore() error = %v", err)
}
defer func() { _ = store.Close() }()
resourceStore := unifiedresources.NewMonitorAdapter(nil)
records := truenas.NewProvider(truenas.DefaultFixtures()).Records()
resourceStore.PopulateSnapshotAndSupplemental(models.StateSnapshot{}, map[unifiedresources.DataSource][]unifiedresources.IngestRecord{
unifiedresources.SourceTrueNAS: records,
})
monitor := &Monitor{
resourceStore: resourceStore,
metricsStore: store,
}
monitor.syncUnifiedPhysicalDiskMetrics(resourceStore)
var diskResourceID string
for _, resource := range resourceStore.GetAll() {
if resource.Type != unifiedresources.ResourceTypePhysicalDisk || resource.PhysicalDisk == nil {
continue
}
if strings.TrimSpace(resource.PhysicalDisk.DevPath) == "/dev/sdc" {
diskResourceID = resource.ID
break
}
}
if diskResourceID == "" {
t.Fatal("expected TrueNAS sdc disk resource in unified store")
}
target := resourceStore.MetricsTargetForResource(diskResourceID)
if target == nil || target.ResourceType != "disk" || target.ResourceID != "WD-WX12A3456" {
t.Fatalf("unexpected physical-disk metrics target %+v", target)
}
charts := monitor.GetPhysicalDiskTemperatureCharts(7 * 24 * time.Hour)
entry, ok := charts[target.ResourceID]
if !ok {
t.Fatalf("expected persisted disk chart for %s, got %#v", target.ResourceID, charts)
}
if entry.Node != "truenas-main" {
t.Fatalf("chart node = %q, want truenas-main", entry.Node)
}
if len(entry.Temperature) == 0 {
t.Fatalf("expected temperature history for %s", target.ResourceID)
}
last := entry.Temperature[len(entry.Temperature)-1]
if last.Value != 63 {
t.Fatalf("expected last disk temperature 63, got %.2f", last.Value)
}
}
func TestSyncUnifiedStorageAndDiskMetricsSkipMockOwnedTrueNASHistoryWhenMockEnabled(t *testing.T) {
previousFeature := truenas.IsFeatureEnabled()
truenas.SetFeatureEnabled(true)
t.Cleanup(func() {
truenas.SetFeatureEnabled(previousFeature)
})
previousMock := mock.IsMockEnabled()
mock.SetEnabled(true)
t.Cleanup(func() {
mock.SetEnabled(previousMock)
})
cfg := metrics.DefaultConfig(t.TempDir())
store, err := metrics.NewStore(cfg)
if err != nil {
t.Fatalf("metrics.NewStore() error = %v", err)
}
defer func() { _ = store.Close() }()
resourceStore := unifiedresources.NewMonitorAdapter(nil)
resourceStore.PopulateSnapshotAndSupplemental(models.StateSnapshot{}, map[unifiedresources.DataSource][]unifiedresources.IngestRecord{
unifiedresources.SourceTrueNAS: truenas.NewProvider(truenas.DefaultFixtures()).Records(),
})
monitor := &Monitor{
resourceStore: resourceStore,
metricsHistory: NewMetricsHistory(1024, 24*time.Hour),
metricsStore: store,
}
monitor.syncUnifiedStorageMetrics(resourceStore)
monitor.syncUnifiedPhysicalDiskMetrics(resourceStore)
if got := len(monitor.GetStorageMetrics("pool:tank", time.Hour)["usage"]); got != 0 {
t.Fatalf("expected mock-owned TrueNAS storage history to be skipped, got %d usage points", got)
}
points, err := store.Query("disk", "WD-WX12A3456", "smart_temp", time.Now().Add(-time.Hour), time.Now(), 0)
if err != nil {
t.Fatalf("store.Query() error = %v", err)
}
if len(points) != 0 {
t.Fatalf("expected mock-owned TrueNAS disk history to be skipped, got %d points", len(points))
}
}
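// The next four tests pin the precedence rule for alert payloads: a live
// alert manager always wins over snapshot state, and snapshot alerts are used
// only when no alert manager is attached.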
func TestBuildBroadcastFrontendStatePrefersLiveAlertManagerOverSnapshotAlerts(t *testing.T) {
alertManager := alerts.NewManagerWithDataDir(t.TempDir())
defer alertManager.Stop()
alertManager.SyncUnifiedResourceIncidents([]unifiedresources.Resource{{
ID: "agent:truenas-main",
Type: unifiedresources.ResourceTypeAgent,
Name: "truenas-main",
Sources: []unifiedresources.DataSource{unifiedresources.SourceTrueNAS},
TrueNAS: &unifiedresources.TrueNASData{Hostname: "truenas-main"},
Incidents: []unifiedresources.ResourceIncident{{
Provider: "truenas",
NativeID: "alert-live-1",
Code: "truenas_volume_status",
Severity: storagehealth.RiskCritical,
Summary: "Pool tank is FAULTED",
}},
}})
monitor := &Monitor{
state: models.NewState(),
alertManager: alertManager,
}
staleSnapshot := models.StateSnapshot{
ActiveAlerts: []models.Alert{{
ID: "snapshot-stale-1",
Type: "offline",
ResourceID: "stale-resource",
ResourceName: "stale-resource",
Message: "stale snapshot alert",
}},
}
frontend := monitor.buildBroadcastFrontendStateFromSnapshot(staleSnapshot)
if len(frontend.ActiveAlerts) != 1 {
t.Fatalf("expected 1 live active alert in frontend state, got %d", len(frontend.ActiveAlerts))
}
if got := frontend.ActiveAlerts[0].ID; got == "snapshot-stale-1" || got == "" {
t.Fatalf("expected live alert manager id, got %q", got)
}
if got := frontend.ActiveAlerts[0].Type; got != "resource-incident" {
t.Fatalf("alert type = %q, want resource-incident", got)
}
if got := frontend.ActiveAlerts[0].Message; got == "stale snapshot alert" {
t.Fatalf("expected live alert manager payload, got stale snapshot message %q", got)
}
}
func TestActiveAlertsSnapshotPrefersLiveAlertManagerOverStateSnapshot(t *testing.T) {
alertManager := alerts.NewManagerWithDataDir(t.TempDir())
defer alertManager.Stop()
alertManager.SyncUnifiedResourceIncidents([]unifiedresources.Resource{{
ID: "agent:truenas-live",
Type: unifiedresources.ResourceTypeAgent,
Name: "truenas-live",
Sources: []unifiedresources.DataSource{unifiedresources.SourceTrueNAS},
TrueNAS: &unifiedresources.TrueNASData{Hostname: "truenas-live"},
Incidents: []unifiedresources.ResourceIncident{{
Provider: "truenas",
NativeID: "live-active-1",
Code: "truenas_volume_status",
Severity: storagehealth.RiskCritical,
Summary: "Pool tank is FAULTED",
}},
}})
state := models.NewState()
state.UpdateActiveAlerts([]models.Alert{{
ID: "snapshot-active-1",
Type: "offline",
ResourceID: "snapshot-resource",
ResourceName: "snapshot-resource",
Message: "stale snapshot active alert",
}})
monitor := &Monitor{
state: state,
alertManager: alertManager,
}
active := monitor.ActiveAlertsSnapshot()
if len(active) != 1 {
t.Fatalf("expected 1 live active alert, got %d", len(active))
}
if got := active[0].ID; got == "snapshot-active-1" || got == "" {
t.Fatalf("expected live alert manager alert id, got %q", got)
}
if got := active[0].Type; got != "resource-incident" {
t.Fatalf("alert type = %q, want resource-incident", got)
}
}
func TestRecentlyResolvedSnapshotPrefersLiveAlertManagerOverStateSnapshot(t *testing.T) {
alertManager := alerts.NewManagerWithDataDir(t.TempDir())
defer alertManager.Stop()
resource := unifiedresources.Resource{
ID: "agent:truenas-live",
Type: unifiedresources.ResourceTypeAgent,
Name: "truenas-live",
Sources: []unifiedresources.DataSource{unifiedresources.SourceTrueNAS},
TrueNAS: &unifiedresources.TrueNASData{Hostname: "truenas-live"},
Incidents: []unifiedresources.ResourceIncident{{
Provider: "truenas",
NativeID: "live-resolved-1",
Code: "truenas_volume_status",
Severity: storagehealth.RiskCritical,
Summary: "Pool tank is FAULTED",
}},
}
alertManager.SyncUnifiedResourceIncidents([]unifiedresources.Resource{resource})
alertManager.SyncUnifiedResourceIncidents(nil)
state := models.NewState()
state.UpdateRecentlyResolved([]models.ResolvedAlert{{
Alert: models.Alert{
ID: "snapshot-resolved-1",
Type: "offline",
ResourceID: "snapshot-resource",
ResourceName: "snapshot-resource",
Message: "stale snapshot resolved alert",
},
ResolvedTime: time.Now().UTC(),
}})
monitor := &Monitor{
state: state,
alertManager: alertManager,
}
resolved := monitor.RecentlyResolvedSnapshot()
if len(resolved) == 0 {
t.Fatal("expected recently resolved alert from live alert manager")
}
if got := resolved[0].ID; got == "snapshot-resolved-1" || got == "" {
t.Fatalf("expected live resolved alert id, got %q", got)
}
if got := resolved[0].Type; got != "resource-incident" {
t.Fatalf("resolved alert type = %q, want resource-incident", got)
}
}
func TestBuildBroadcastFrontendStatePreservesSnapshotAlertsWithoutAlertManager(t *testing.T) {
monitor := &Monitor{
state: models.NewState(),
}
snapshot := models.StateSnapshot{
ActiveAlerts: []models.Alert{{
ID: "snapshot-only-1",
Type: "offline",
ResourceID: "snapshot-resource",
ResourceName: "snapshot-resource",
Message: "snapshot-only alert",
}},
}
frontend := monitor.buildBroadcastFrontendStateFromSnapshot(snapshot)
if len(frontend.ActiveAlerts) != 1 {
t.Fatalf("expected snapshot alert to be preserved, got %#v", frontend.ActiveAlerts)
}
if got := frontend.ActiveAlerts[0].ID; got != "snapshot-only-1" {
t.Fatalf("alert id = %q, want snapshot-only-1", got)
}
}
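// The TestRecordTaskResult_* cases cover per-instance bookkeeping: success
// resets the failure counters, failures increment them and stamp
// FirstFailureAt, and nil internal maps must not panic.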
func TestRecordTaskResult_Success(t *testing.T) {
m := &Monitor{
pollStatusMap: make(map[string]*pollStatus),
failureCounts: make(map[string]int),
lastOutcome: make(map[string]taskOutcome),
circuitBreakers: make(map[string]*circuitBreaker),
}
// Record a success
m.recordTaskResult(InstanceTypePVE, "test-instance", nil)
key := schedulerKey(InstanceTypePVE, "test-instance")
// Verify failure count is reset
if m.failureCounts[key] != 0 {
t.Errorf("expected failureCounts[%s] = 0, got %d", key, m.failureCounts[key])
}
// Verify last outcome is success
outcome, ok := m.lastOutcome[key]
if !ok {
t.Fatalf("expected lastOutcome[%s] to exist", key)
}
if !outcome.success {
t.Error("expected outcome.success = true")
}
// Verify poll status
status, ok := m.pollStatusMap[key]
if !ok {
t.Fatalf("expected pollStatusMap[%s] to exist", key)
}
if status.ConsecutiveFailures != 0 {
t.Errorf("expected ConsecutiveFailures = 0, got %d", status.ConsecutiveFailures)
}
}
func TestRecordTaskResult_Failure(t *testing.T) {
m := &Monitor{
pollStatusMap: make(map[string]*pollStatus),
failureCounts: make(map[string]int),
lastOutcome: make(map[string]taskOutcome),
circuitBreakers: make(map[string]*circuitBreaker),
}
testErr := errors.New("connection refused")
// Record a failure
m.recordTaskResult(InstanceTypePVE, "test-instance", testErr)
key := schedulerKey(InstanceTypePVE, "test-instance")
// Verify failure count is incremented
if m.failureCounts[key] != 1 {
t.Errorf("expected failureCounts[%s] = 1, got %d", key, m.failureCounts[key])
}
// Verify last outcome is failure
outcome, ok := m.lastOutcome[key]
if !ok {
t.Fatalf("expected lastOutcome[%s] to exist", key)
}
if outcome.success {
t.Error("expected outcome.success = false")
}
if outcome.err != testErr {
t.Errorf("expected outcome.err = %v, got %v", testErr, outcome.err)
}
// Verify poll status
status, ok := m.pollStatusMap[key]
if !ok {
t.Fatalf("expected pollStatusMap[%s] to exist", key)
}
if status.ConsecutiveFailures != 1 {
t.Errorf("expected ConsecutiveFailures = 1, got %d", status.ConsecutiveFailures)
}
if status.LastErrorMessage != "connection refused" {
t.Errorf("expected LastErrorMessage = 'connection refused', got %q", status.LastErrorMessage)
}
}
func TestRecordTaskResult_ConsecutiveFailures(t *testing.T) {
m := &Monitor{
pollStatusMap: make(map[string]*pollStatus),
failureCounts: make(map[string]int),
lastOutcome: make(map[string]taskOutcome),
circuitBreakers: make(map[string]*circuitBreaker),
}
testErr := errors.New("timeout")
// Record multiple failures
m.recordTaskResult(InstanceTypePBS, "pbs-server", testErr)
m.recordTaskResult(InstanceTypePBS, "pbs-server", testErr)
m.recordTaskResult(InstanceTypePBS, "pbs-server", testErr)
key := schedulerKey(InstanceTypePBS, "pbs-server")
// Verify consecutive failures count
status := m.pollStatusMap[key]
if status.ConsecutiveFailures != 3 {
t.Errorf("expected ConsecutiveFailures = 3, got %d", status.ConsecutiveFailures)
}
// FirstFailureAt should be set on first failure and not change
if status.FirstFailureAt.IsZero() {
t.Error("expected FirstFailureAt to be set")
}
}
func TestRecordTaskResult_SuccessResetsFailures(t *testing.T) {
m := &Monitor{
pollStatusMap: make(map[string]*pollStatus),
failureCounts: make(map[string]int),
lastOutcome: make(map[string]taskOutcome),
circuitBreakers: make(map[string]*circuitBreaker),
}
testErr := errors.New("error")
key := schedulerKey(InstanceTypePMG, "pmg-server")
// Record some failures first
m.recordTaskResult(InstanceTypePMG, "pmg-server", testErr)
m.recordTaskResult(InstanceTypePMG, "pmg-server", testErr)
if m.pollStatusMap[key].ConsecutiveFailures != 2 {
t.Fatalf("expected 2 failures before reset")
}
// Now record a success
m.recordTaskResult(InstanceTypePMG, "pmg-server", nil)
// Verify everything is reset
if m.failureCounts[key] != 0 {
t.Errorf("expected failureCounts to be reset to 0, got %d", m.failureCounts[key])
}
if m.pollStatusMap[key].ConsecutiveFailures != 0 {
t.Errorf("expected ConsecutiveFailures to be reset to 0, got %d", m.pollStatusMap[key].ConsecutiveFailures)
}
if !m.pollStatusMap[key].FirstFailureAt.IsZero() {
t.Error("expected FirstFailureAt to be reset to zero")
}
}
func TestRecordTaskResult_NilMaps(t *testing.T) {
// Monitor with nil internal maps - should not panic
m := &Monitor{
pollStatusMap: make(map[string]*pollStatus),
failureCounts: nil, // intentionally nil
lastOutcome: nil, // intentionally nil
circuitBreakers: make(map[string]*circuitBreaker),
}
// Should not panic
m.recordTaskResult(InstanceTypePVE, "test", nil)
m.recordTaskResult(InstanceTypePVE, "test", errors.New("error"))
// pollStatusMap should still be updated
key := schedulerKey(InstanceTypePVE, "test")
if _, ok := m.pollStatusMap[key]; !ok {
t.Error("expected pollStatusMap to be updated even with nil failureCounts/lastOutcome")
}
}
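// The TestDescribeInstancesForScheduler_* cases assert the scheduler
// descriptor list: empty client maps yield nil, names come back sorted, each
// client maps to its InstanceType, and nil scheduler/staleness trackers leave
// the timestamps zero.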
func TestDescribeInstancesForScheduler_NoClients(t *testing.T) {
m := &Monitor{
pveClients: make(map[string]PVEClientInterface),
pbsClients: make(map[string]*pbs.Client),
pmgClients: make(map[string]*pmg.Client),
}
descriptors := m.describeInstancesForScheduler()
if descriptors != nil {
t.Errorf("expected nil for empty clients, got %v", descriptors)
}
}
func TestDescribeInstancesForScheduler_PVEOnly(t *testing.T) {
m := &Monitor{
pveClients: map[string]PVEClientInterface{"pve-1": nil, "pve-2": nil},
pbsClients: make(map[string]*pbs.Client),
pmgClients: make(map[string]*pmg.Client),
}
descriptors := m.describeInstancesForScheduler()
if len(descriptors) != 2 {
t.Fatalf("expected 2 descriptors, got %d", len(descriptors))
}
// Should be sorted
if descriptors[0].Name != "pve-1" || descriptors[1].Name != "pve-2" {
t.Errorf("expected sorted order [pve-1, pve-2], got [%s, %s]", descriptors[0].Name, descriptors[1].Name)
}
for _, desc := range descriptors {
if desc.Type != InstanceTypePVE {
t.Errorf("expected type PVE, got %v", desc.Type)
}
}
}
func TestDescribeInstancesForScheduler_PBSOnly(t *testing.T) {
m := &Monitor{
pveClients: make(map[string]PVEClientInterface),
pbsClients: map[string]*pbs.Client{"pbs-backup": nil},
pmgClients: make(map[string]*pmg.Client),
}
descriptors := m.describeInstancesForScheduler()
if len(descriptors) != 1 {
t.Fatalf("expected 1 descriptor, got %d", len(descriptors))
}
if descriptors[0].Name != "pbs-backup" {
t.Errorf("expected name 'pbs-backup', got %q", descriptors[0].Name)
}
if descriptors[0].Type != InstanceTypePBS {
t.Errorf("expected type PBS, got %v", descriptors[0].Type)
}
}
func TestDescribeInstancesForScheduler_PMGOnly(t *testing.T) {
m := &Monitor{
pveClients: make(map[string]PVEClientInterface),
pbsClients: make(map[string]*pbs.Client),
pmgClients: map[string]*pmg.Client{"pmg-mail": nil},
}
descriptors := m.describeInstancesForScheduler()
if len(descriptors) != 1 {
t.Fatalf("expected 1 descriptor, got %d", len(descriptors))
}
if descriptors[0].Name != "pmg-mail" {
t.Errorf("expected name 'pmg-mail', got %q", descriptors[0].Name)
}
if descriptors[0].Type != InstanceTypePMG {
t.Errorf("expected type PMG, got %v", descriptors[0].Type)
}
}
func TestDescribeInstancesForScheduler_AllTypes(t *testing.T) {
m := &Monitor{
pveClients: map[string]PVEClientInterface{"pve-1": nil},
pbsClients: map[string]*pbs.Client{"pbs-1": nil},
pmgClients: map[string]*pmg.Client{"pmg-1": nil},
}
descriptors := m.describeInstancesForScheduler()
if len(descriptors) != 3 {
t.Fatalf("expected 3 descriptors, got %d", len(descriptors))
}
// Check we have one of each type
types := make(map[InstanceType]bool)
for _, desc := range descriptors {
types[desc.Type] = true
}
if !types[InstanceTypePVE] || !types[InstanceTypePBS] || !types[InstanceTypePMG] {
t.Error("expected one descriptor of each type")
}
}
func TestDescribeInstancesForScheduler_NilSchedulerAndTracker(t *testing.T) {
m := &Monitor{
pveClients: map[string]PVEClientInterface{"pve-1": nil},
pbsClients: make(map[string]*pbs.Client),
pmgClients: make(map[string]*pmg.Client),
scheduler: nil, // explicitly nil
stalenessTracker: nil, // explicitly nil
}
// Should not panic with nil scheduler and stalenessTracker
descriptors := m.describeInstancesForScheduler()
if len(descriptors) != 1 {
t.Fatalf("expected 1 descriptor, got %d", len(descriptors))
}
// LastScheduled and LastSuccess should be zero values
if !descriptors[0].LastScheduled.IsZero() {
t.Error("expected LastScheduled to be zero with nil scheduler")
}
if !descriptors[0].LastSuccess.IsZero() {
t.Error("expected LastSuccess to be zero with nil stalenessTracker")
}
}
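// The TestRescheduleTask_* cases exercise the requeue policy: successes and
// missing outcomes use the configured interval, transient failures back off,
// and non-transient or retry-exhausted failures land in the dead letter
// queue.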
func TestRescheduleTask_SuccessfulOutcome(t *testing.T) {
cfg := &config.Config{
PVEPollingInterval: 30 * time.Second,
AdaptivePollingBaseInterval: 10 * time.Second,
}
m := &Monitor{
config: cfg,
taskQueue: NewTaskQueue(),
lastOutcome: make(map[string]taskOutcome),
failureCounts: make(map[string]int),
}
task := ScheduledTask{
InstanceName: "pve-1",
InstanceType: InstanceTypePVE,
Interval: 30 * time.Second,
NextRun: time.Now(),
}
key := schedulerKey(task.InstanceType, task.InstanceName)
// Record a successful outcome
m.lastOutcome[key] = taskOutcome{success: true}
m.rescheduleTask(task)
// Task should be rescheduled at regular interval (no backoff)
m.taskQueue.mu.Lock()
entry, ok := m.taskQueue.entries[key]
m.taskQueue.mu.Unlock()
if !ok {
t.Fatal("expected task to be rescheduled")
}
// Should use base interval since scheduler is nil
if entry.task.Interval != cfg.PVEPollingInterval {
t.Errorf("expected interval %v, got %v", cfg.PVEPollingInterval, entry.task.Interval)
}
}
func TestRescheduleTask_TransientFailureWithBackoff(t *testing.T) {
cfg := &config.Config{
PVEPollingInterval: 30 * time.Second,
AdaptivePollingBaseInterval: 10 * time.Second,
}
m := &Monitor{
config: cfg,
taskQueue: NewTaskQueue(),
lastOutcome: make(map[string]taskOutcome),
failureCounts: make(map[string]int),
maxRetryAttempts: 5,
backoffCfg: backoffConfig{
Initial: 5 * time.Second,
Multiplier: 2,
Jitter: 0, // no jitter for predictable testing
Max: 5 * time.Minute,
},
}
// A nil rng falls back to the default random source for backoff jitter.
m.rng = nil
task := ScheduledTask{
InstanceName: "pve-1",
InstanceType: InstanceTypePVE,
Interval: 30 * time.Second,
NextRun: time.Now(),
}
key := schedulerKey(task.InstanceType, task.InstanceName)
// Record a transient failure (1st attempt, below maxRetryAttempts)
m.failureCounts[key] = 1
m.lastOutcome[key] = taskOutcome{
success: false,
transient: true,
err: errors.New("connection timeout"),
}
m.rescheduleTask(task)
// Task should be rescheduled with backoff delay
m.taskQueue.mu.Lock()
entry, ok := m.taskQueue.entries[key]
m.taskQueue.mu.Unlock()
if !ok {
t.Fatal("expected task to be rescheduled with backoff")
}
// With backoff, interval should be modified
if entry.task.Interval <= 0 {
t.Errorf("expected positive backoff interval, got %v", entry.task.Interval)
}
}
func TestRescheduleTask_NonTransientFailureGoesToDeadLetter(t *testing.T) {
cfg := &config.Config{
PVEPollingInterval: 30 * time.Second,
}
deadLetterQ := NewTaskQueue()
m := &Monitor{
config: cfg,
taskQueue: NewTaskQueue(),
deadLetterQueue: deadLetterQ,
lastOutcome: make(map[string]taskOutcome),
failureCounts: make(map[string]int),
maxRetryAttempts: 5,
}
task := ScheduledTask{
InstanceName: "pve-1",
InstanceType: InstanceTypePVE,
Interval: 30 * time.Second,
NextRun: time.Now(),
}
key := schedulerKey(task.InstanceType, task.InstanceName)
// Record a non-transient failure (permanent error)
m.failureCounts[key] = 1
m.lastOutcome[key] = taskOutcome{
success: false,
transient: false, // non-transient
err: errors.New("authentication failed"),
}
m.rescheduleTask(task)
// Task should NOT be in the main queue
m.taskQueue.mu.Lock()
_, inMainQueue := m.taskQueue.entries[key]
m.taskQueue.mu.Unlock()
if inMainQueue {
t.Error("expected task to NOT be in main queue after non-transient failure")
}
// Task should be in dead letter queue
deadLetterQ.mu.Lock()
dlqSize := len(deadLetterQ.entries)
deadLetterQ.mu.Unlock()
if dlqSize != 1 {
t.Errorf("expected 1 task in dead letter queue, got %d", dlqSize)
}
}
func TestRescheduleTask_ExceededRetryAttemptsGoesToDeadLetter(t *testing.T) {
cfg := &config.Config{
PVEPollingInterval: 30 * time.Second,
}
deadLetterQ := NewTaskQueue()
m := &Monitor{
config: cfg,
taskQueue: NewTaskQueue(),
deadLetterQueue: deadLetterQ,
lastOutcome: make(map[string]taskOutcome),
failureCounts: make(map[string]int),
maxRetryAttempts: 3,
}
task := ScheduledTask{
InstanceName: "pve-1",
InstanceType: InstanceTypePVE,
Interval: 30 * time.Second,
NextRun: time.Now(),
}
key := schedulerKey(task.InstanceType, task.InstanceName)
// Exceed max retry attempts (failureCount >= maxRetryAttempts)
m.failureCounts[key] = 3
m.lastOutcome[key] = taskOutcome{
success: false,
transient: true, // transient, but exceeded retries
err: errors.New("connection timeout"),
}
m.rescheduleTask(task)
// Task should be in dead letter queue
deadLetterQ.mu.Lock()
dlqSize := len(deadLetterQ.entries)
deadLetterQ.mu.Unlock()
if dlqSize != 1 {
t.Errorf("expected 1 task in dead letter queue after exceeding retries, got %d", dlqSize)
}
}
func TestRescheduleTask_NoOutcomeUsesDefaultInterval(t *testing.T) {
cfg := &config.Config{
PVEPollingInterval: 45 * time.Second,
AdaptivePollingBaseInterval: 10 * time.Second,
}
m := &Monitor{
config: cfg,
taskQueue: NewTaskQueue(),
lastOutcome: make(map[string]taskOutcome),
failureCounts: make(map[string]int),
}
task := ScheduledTask{
InstanceName: "pve-1",
InstanceType: InstanceTypePVE,
Interval: 0, // no interval set
NextRun: time.Now(),
}
key := schedulerKey(task.InstanceType, task.InstanceName)
// No outcome recorded - hasOutcome will be false
m.rescheduleTask(task)
m.taskQueue.mu.Lock()
entry, ok := m.taskQueue.entries[key]
m.taskQueue.mu.Unlock()
if !ok {
t.Fatal("expected task to be rescheduled")
}
// Should use config PVE polling interval
if entry.task.Interval != cfg.PVEPollingInterval {
t.Errorf("expected interval %v, got %v", cfg.PVEPollingInterval, entry.task.Interval)
}
}
func TestRescheduleTask_PBSInstance(t *testing.T) {
cfg := &config.Config{
PBSPollingInterval: 60 * time.Second,
AdaptivePollingBaseInterval: 10 * time.Second,
}
m := &Monitor{
config: cfg,
taskQueue: NewTaskQueue(),
lastOutcome: make(map[string]taskOutcome),
failureCounts: make(map[string]int),
}
task := ScheduledTask{
InstanceName: "pbs-1",
InstanceType: InstanceTypePBS,
Interval: 0,
NextRun: time.Now(),
}
key := schedulerKey(task.InstanceType, task.InstanceName)
m.rescheduleTask(task)
m.taskQueue.mu.Lock()
entry, ok := m.taskQueue.entries[key]
m.taskQueue.mu.Unlock()
if !ok {
t.Fatal("expected PBS task to be rescheduled")
}
if entry.task.Interval != cfg.PBSPollingInterval {
t.Errorf("expected PBS interval %v, got %v", cfg.PBSPollingInterval, entry.task.Interval)
}
}
func TestRescheduleTask_PMGInstance(t *testing.T) {
cfg := &config.Config{
PMGPollingInterval: 90 * time.Second,
AdaptivePollingBaseInterval: 10 * time.Second,
}
m := &Monitor{
config: cfg,
taskQueue: NewTaskQueue(),
lastOutcome: make(map[string]taskOutcome),
failureCounts: make(map[string]int),
}
task := ScheduledTask{
InstanceName: "pmg-1",
InstanceType: InstanceTypePMG,
Interval: 0,
NextRun: time.Now(),
}
key := schedulerKey(task.InstanceType, task.InstanceName)
m.rescheduleTask(task)
m.taskQueue.mu.Lock()
entry, ok := m.taskQueue.entries[key]
m.taskQueue.mu.Unlock()
if !ok {
t.Fatal("expected PMG task to be rescheduled")
}
if entry.task.Interval != cfg.PMGPollingInterval {
t.Errorf("expected PMG interval %v, got %v", cfg.PMGPollingInterval, entry.task.Interval)
}
}
func TestRescheduleTask_AdaptivePollingMaxIntervalLimit(t *testing.T) {
cfg := &config.Config{
PVEPollingInterval: 30 * time.Second,
AdaptivePollingEnabled: true,
AdaptivePollingMaxInterval: 10 * time.Second, // <= 15s enables capping
AdaptivePollingBaseInterval: 5 * time.Second,
}
m := &Monitor{
config: cfg,
taskQueue: NewTaskQueue(),
lastOutcome: make(map[string]taskOutcome),
failureCounts: make(map[string]int),
maxRetryAttempts: 5,
backoffCfg: backoffConfig{
Initial: 10 * time.Second, // would normally backoff to 10s+
Multiplier: 2,
Jitter: 0,
Max: 5 * time.Minute,
},
}
task := ScheduledTask{
InstanceName: "pve-1",
InstanceType: InstanceTypePVE,
Interval: 30 * time.Second,
NextRun: time.Now(),
}
key := schedulerKey(task.InstanceType, task.InstanceName)
// Simulate transient failure to trigger backoff
m.failureCounts[key] = 1
m.lastOutcome[key] = taskOutcome{
success: false,
transient: true,
err: errors.New("timeout"),
}
m.rescheduleTask(task)
m.taskQueue.mu.Lock()
entry, ok := m.taskQueue.entries[key]
m.taskQueue.mu.Unlock()
if !ok {
t.Fatal("expected task to be rescheduled")
}
// With AdaptivePollingMaxInterval <= 15s, backoff delay should be capped at 4s
maxDelay := 4 * time.Second
if entry.task.Interval > maxDelay {
t.Errorf("expected backoff interval to be capped at %v, got %v", maxDelay, entry.task.Interval)
}
}
func TestRescheduleTask_UsesExistingIntervalWhenSet(t *testing.T) {
cfg := &config.Config{
PVEPollingInterval: 30 * time.Second,
AdaptivePollingBaseInterval: 10 * time.Second,
}
m := &Monitor{
config: cfg,
taskQueue: NewTaskQueue(),
lastOutcome: make(map[string]taskOutcome),
failureCounts: make(map[string]int),
}
customInterval := 45 * time.Second
task := ScheduledTask{
InstanceName: "pve-1",
InstanceType: InstanceTypePVE,
Interval: customInterval, // custom interval already set
NextRun: time.Now(),
}
key := schedulerKey(task.InstanceType, task.InstanceName)
m.rescheduleTask(task)
m.taskQueue.mu.Lock()
entry, ok := m.taskQueue.entries[key]
m.taskQueue.mu.Unlock()
if !ok {
t.Fatal("expected task to be rescheduled")
}
// Should use the existing interval when it's already set
if entry.task.Interval != customInterval {
t.Errorf("expected existing interval %v to be preserved, got %v", customInterval, entry.task.Interval)
}
}
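// TestCustomPollProviderIntegration registers a custom provider end to end:
// RegisterPollProvider should schedule one task carrying the provider's type,
// name, and interval, and executeScheduledTask should invoke its PollTask.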
func TestCustomPollProviderIntegration(t *testing.T) {
const customType InstanceType = "xcp"
const customName = "xcp-cluster-1"
customInterval := 42 * time.Second
var executed atomic.Bool
monitor := &Monitor{
taskQueue: NewTaskQueue(),
}
if err := monitor.RegisterPollProvider(testPollProvider{
providerType: customType,
instances: []string{customName},
interval: customInterval,
buildPollTask: func(instanceName string) (PollTask, error) {
return PollTask{
InstanceName: instanceName,
InstanceType: string(customType),
Run: func(context.Context) {
executed.Store(true)
},
}, nil
},
}); err != nil {
t.Fatalf("RegisterPollProvider failed: %v", err)
}
monitor.SetExecutor(nil) // nil selects the default executor
tasks := monitor.buildScheduledTasks(time.Now())
if len(tasks) != 1 {
t.Fatalf("expected 1 scheduled task for custom provider, got %d", len(tasks))
}
task := tasks[0]
if task.InstanceType != customType {
t.Fatalf("expected custom instance type %q, got %q", customType, task.InstanceType)
}
if task.InstanceName != customName {
t.Fatalf("expected custom instance name %q, got %q", customName, task.InstanceName)
}
if task.Interval != customInterval {
t.Fatalf("expected custom interval %v, got %v", customInterval, task.Interval)
}
monitor.executeScheduledTask(context.Background(), task)
if !executed.Load() {
t.Fatal("expected custom provider poll task callback to execute")
}
}
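// The TestUpdateResourceStore_* cases below confirm that registered
// supplemental poll providers feed records and changes into the resource
// store, and that provider-owned snapshot sources are suppressed before
// snapshot ingest.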
func TestUpdateResourceStore_IngestsSupplementalRecords(t *testing.T) {
store := &testSupplementalResourceStore{}
provider := &testSupplementalPollProvider{
testPollProvider: testPollProvider{
providerType: InstanceType("xcp"),
instances: []string{"xcp-cluster-1"},
interval: 30 * time.Second,
},
source: unifiedresources.DataSource("xcp"),
recordsByOrg: map[string][]unifiedresources.IngestRecord{
"default": {
{
SourceID: "xcp-host-1",
Resource: unifiedresources.Resource{
Type: unifiedresources.ResourceTypeAgent,
Name: "xcp-host-1",
Status: unifiedresources.StatusOnline,
LastSeen: time.Now().UTC(),
},
Identity: unifiedresources.ResourceIdentity{Hostnames: []string{"xcp-host-1"}},
},
},
},
}
monitor := &Monitor{
resourceStore: store,
}
if err := monitor.RegisterPollProvider(provider); err != nil {
t.Fatalf("RegisterPollProvider failed: %v", err)
}
monitor.updateResourceStore(models.StateSnapshot{})
if store.snapshotCalls != 1 {
t.Fatalf("expected PopulateFromSnapshot to be called once, got %d", store.snapshotCalls)
}
if provider.lastRequestedOrg != "default" {
t.Fatalf("expected default org lookup, got %q", provider.lastRequestedOrg)
}
records := store.recordsBySource[unifiedresources.DataSource("xcp")]
if len(records) != 1 {
t.Fatalf("expected 1 supplemental record ingested, got %d", len(records))
}
if records[0].SourceID != "xcp-host-1" {
t.Fatalf("expected supplemental source ID xcp-host-1, got %q", records[0].SourceID)
}
}
func TestUpdateResourceStore_RecordsSupplementalChanges(t *testing.T) {
store := &testSupplementalResourceStore{}
occurredAt := time.Date(2026, 3, 30, 18, 30, 0, 0, time.UTC)
provider := &testSupplementalPollProvider{
testPollProvider: testPollProvider{
providerType: InstanceType("vmware"),
instances: []string{"vc-1"},
interval: 30 * time.Second,
},
source: unifiedresources.SourceVMware,
changesByOrg: map[string][]unifiedresources.ResourceChange{
"default": {
{
ID: "activity-1",
ResourceID: "vc-1:vm:vm-201",
ObservedAt: occurredAt,
OccurredAt: &occurredAt,
Kind: unifiedresources.ChangeActivity,
SourceType: unifiedresources.SourcePlatformEvent,
SourceAdapter: unifiedresources.AdapterVMware,
Confidence: unifiedresources.ConfidenceHigh,
Reason: "Create snapshot (success)",
Metadata: map[string]any{
unifiedresources.MetadataActivityType: "vmware_task",
},
},
},
},
}
monitor := &Monitor{
resourceStore: store,
}
if err := monitor.RegisterPollProvider(provider); err != nil {
t.Fatalf("RegisterPollProvider failed: %v", err)
}
monitor.updateResourceStore(models.StateSnapshot{})
if len(store.recordedChanges) != 1 {
t.Fatalf("expected 1 supplemental change recorded, got %d", len(store.recordedChanges))
}
recorded := store.recordedChanges[0]
if recorded.Kind != unifiedresources.ChangeActivity {
t.Fatalf("recorded change kind = %q, want %q", recorded.Kind, unifiedresources.ChangeActivity)
}
if recorded.SourceAdapter != unifiedresources.AdapterVMware {
t.Fatalf("recorded change source adapter = %q, want %q", recorded.SourceAdapter, unifiedresources.AdapterVMware)
}
if recorded.OccurredAt == nil || !recorded.OccurredAt.Equal(occurredAt) {
t.Fatalf("recorded change occurred_at = %v, want %v", recorded.OccurredAt, occurredAt)
}
if got := recorded.Metadata[unifiedresources.MetadataActivityType]; got != "vmware_task" {
t.Fatalf("recorded activity_type = %#v, want vmware_task", got)
}
}
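
// TestUpdateResourceStore_SuppressesProviderOwnedSnapshotSources verifies
// that when a provider owns a snapshot source (Proxmox here), the owned
// compute and storage slices are cleared before snapshot ingest, non-owned
// slices such as Hosts pass through, and the provider's supplemental records
// are still ingested.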
func TestUpdateResourceStore_SuppressesProviderOwnedSnapshotSources(t *testing.T) {
store := &testSupplementalResourceStore{}
provider := &testSupplementalPollProvider{
testPollProvider: testPollProvider{
providerType: InstanceType("xcp"),
instances: []string{"xcp-cluster-1"},
interval: 30 * time.Second,
},
source: unifiedresources.SourceProxmox,
ownedSources: []unifiedresources.DataSource{unifiedresources.SourceProxmox},
recordsByOrg: map[string][]unifiedresources.IngestRecord{
"default": {
{
SourceID: "xcp-host-1",
Resource: unifiedresources.Resource{
Type: unifiedresources.ResourceTypeAgent,
Name: "xcp-host-1",
Status: unifiedresources.StatusOnline,
LastSeen: time.Now().UTC(),
},
Identity: unifiedresources.ResourceIdentity{Hostnames: []string{"xcp-host-1"}},
},
},
},
}
monitor := &Monitor{
resourceStore: store,
}
if err := monitor.RegisterPollProvider(provider); err != nil {
t.Fatalf("RegisterPollProvider failed: %v", err)
}
monitor.updateResourceStore(models.StateSnapshot{
Nodes: []models.Node{{}},
VMs: []models.VM{{}},
Containers: []models.Container{{}},
Storage: []models.Storage{{}},
PhysicalDisks: []models.PhysicalDisk{{}},
CephClusters: []models.CephCluster{{}},
Hosts: []models.Host{{}},
})
if store.snapshotCalls != 1 {
t.Fatalf("expected PopulateFromSnapshot to be called once, got %d", store.snapshotCalls)
}
	if len(store.lastSnapshot.Nodes) != 0 || len(store.lastSnapshot.VMs) != 0 || len(store.lastSnapshot.Containers) != 0 {
		t.Fatal("expected proxmox compute slices to be suppressed before snapshot ingest")
	}
	if len(store.lastSnapshot.Storage) != 0 || len(store.lastSnapshot.PhysicalDisks) != 0 || len(store.lastSnapshot.CephClusters) != 0 {
		t.Fatal("expected proxmox storage slices to be suppressed before snapshot ingest")
	}
	if len(store.lastSnapshot.Hosts) != 1 {
		t.Fatal("expected agent slice to remain in snapshot ingest")
	}
records := store.recordsBySource[unifiedresources.SourceProxmox]
if len(records) != 1 {
t.Fatalf("expected 1 provider-owned supplemental record, got %d", len(records))
}
}
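
// TestUpdateResourceStore_IngestsRegisteredSupplementalProvider verifies that
// a provider registered directly via SetSupplementalRecordsProvider, rather
// than through a poll provider, is queried for the default org and has its
// records ingested under its source.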
func TestUpdateResourceStore_IngestsRegisteredSupplementalProvider(t *testing.T) {
store := &testSupplementalResourceStore{}
provider := &testMonitorSupplementalProvider{
recordsByOrg: map[string][]unifiedresources.IngestRecord{
"default": {
{
SourceID: "tn-host-1",
Resource: unifiedresources.Resource{
Type: unifiedresources.ResourceTypeAgent,
Name: "tn-host-1",
Status: unifiedresources.StatusOnline,
LastSeen: time.Now().UTC(),
},
Identity: unifiedresources.ResourceIdentity{Hostnames: []string{"tn-host-1"}},
},
},
},
}
monitor := &Monitor{
resourceStore: store,
}
monitor.SetSupplementalRecordsProvider(unifiedresources.SourceTrueNAS, provider)
store.snapshotCalls = 0
store.recordsBySource = nil
monitor.updateResourceStore(models.StateSnapshot{})
if store.snapshotCalls != 1 {
t.Fatalf("expected PopulateFromSnapshot to be called once, got %d", store.snapshotCalls)
}
if provider.lastRequestedOrg != "default" {
t.Fatalf("expected default org lookup, got %q", provider.lastRequestedOrg)
}
records := store.recordsBySource[unifiedresources.SourceTrueNAS]
if len(records) != 1 {
t.Fatalf("expected 1 supplemental record from direct provider, got %d", len(records))
}
if records[0].SourceID != "tn-host-1" {
t.Fatalf("expected source ID tn-host-1, got %q", records[0].SourceID)
}
}
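
// TestUpdateResourceStore_UsesAtomicStoreReplacementWhenAvailable verifies
// that when the store supports atomic replacement, the snapshot and the
// supplemental records are handed over in a single atomic call and the
// legacy PopulateFromSnapshot path is skipped.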
func TestUpdateResourceStore_UsesAtomicStoreReplacementWhenAvailable(t *testing.T) {
store := &testAtomicResourceStore{}
provider := &testMonitorSupplementalProvider{
recordsByOrg: map[string][]unifiedresources.IngestRecord{
"default": {
{
SourceID: "tn-host-1",
Resource: unifiedresources.Resource{
Type: unifiedresources.ResourceTypeAgent,
Name: "tn-host-1",
Status: unifiedresources.StatusOnline,
LastSeen: time.Now().UTC(),
},
Identity: unifiedresources.ResourceIdentity{Hostnames: []string{"tn-host-1"}},
},
},
},
}
monitor := &Monitor{
resourceStore: store,
}
monitor.SetSupplementalRecordsProvider(unifiedresources.SourceTrueNAS, provider)
store.snapshotCalls = 0
store.atomicCalls = 0
store.lastSnapshot = models.StateSnapshot{}
store.lastRecordsBySrc = nil
snapshot := models.StateSnapshot{
Hosts: []models.Host{{ID: "host-1", Hostname: "minipc", Status: "online"}},
}
monitor.updateResourceStore(snapshot)
if store.atomicCalls != 1 {
t.Fatalf("expected atomic populate to be called once, got %d", store.atomicCalls)
}
if store.snapshotCalls != 0 {
t.Fatalf("expected legacy PopulateFromSnapshot to be skipped, got %d calls", store.snapshotCalls)
}
if len(store.lastSnapshot.Hosts) != 1 || store.lastSnapshot.Hosts[0].Hostname != "minipc" {
t.Fatalf("expected snapshot to be passed atomically, got %#v", store.lastSnapshot)
}
records := store.lastRecordsBySrc[unifiedresources.SourceTrueNAS]
if len(records) != 1 {
t.Fatalf("expected 1 atomic supplemental record, got %d", len(records))
}
if records[0].SourceID != "tn-host-1" {
t.Fatalf("expected source ID tn-host-1, got %q", records[0].SourceID)
}
}
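
// TestUpdateResourceStore_SuppressesSnapshotForRegisteredSupplementalOwnership
// verifies that source ownership declared by a directly registered
// supplemental provider also suppresses the owned snapshot slices while
// leaving non-owned slices intact.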
func TestUpdateResourceStore_SuppressesSnapshotForRegisteredSupplementalOwnership(t *testing.T) {
store := &testSupplementalResourceStore{}
provider := &testMonitorSupplementalProvider{
ownedSources: []unifiedresources.DataSource{unifiedresources.SourceProxmox},
recordsByOrg: map[string][]unifiedresources.IngestRecord{
"default": {
{
SourceID: "tn-host-1",
Resource: unifiedresources.Resource{
Type: unifiedresources.ResourceTypeAgent,
Name: "tn-host-1",
Status: unifiedresources.StatusOnline,
LastSeen: time.Now().UTC(),
},
Identity: unifiedresources.ResourceIdentity{Hostnames: []string{"tn-host-1"}},
},
},
},
}
monitor := &Monitor{
resourceStore: store,
}
monitor.SetSupplementalRecordsProvider(unifiedresources.SourceTrueNAS, provider)
store.snapshotCalls = 0
store.recordsBySource = nil
store.lastSnapshot = models.StateSnapshot{}
monitor.updateResourceStore(models.StateSnapshot{
Nodes: []models.Node{{}},
VMs: []models.VM{{}},
Containers: []models.Container{{}},
Hosts: []models.Host{{}},
})
	if len(store.lastSnapshot.Nodes) != 0 || len(store.lastSnapshot.VMs) != 0 || len(store.lastSnapshot.Containers) != 0 {
		t.Fatal("expected proxmox slices to be suppressed for direct provider ownership")
	}
	if len(store.lastSnapshot.Hosts) != 1 {
		t.Fatal("expected non-owned host slice to remain")
	}
records := store.recordsBySource[unifiedresources.SourceTrueNAS]
if len(records) != 1 {
t.Fatalf("expected 1 supplemental record from direct provider, got %d", len(records))
}
}
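
// TestSchedulerHealth_UsesProviderInstanceDescriptions verifies that
// SchedulerHealth reports the display name and connection string supplied by
// the provider's DescribeInstances.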
func TestSchedulerHealth_UsesProviderInstanceDescriptions(t *testing.T) {
const customType InstanceType = "xcp"
monitor := &Monitor{
config: &config.Config{},
instanceInfoCache: make(map[string]*instanceInfo),
}
if err := monitor.RegisterPollProvider(testPollProvider{
providerType: customType,
instances: []string{"xcp-a"},
describeInstances: []PollProviderInstanceInfo{
{
Name: "xcp-a",
DisplayName: "XCP Cluster A",
Connection: "https://xcp-a.example",
},
},
interval: 30 * time.Second,
}); err != nil {
t.Fatalf("RegisterPollProvider failed: %v", err)
}
resp := monitor.SchedulerHealth()
key := schedulerKey(customType, "xcp-a")
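	// Scan the reported instances for the custom provider entry; reaching the
	// end of the loop means SchedulerHealth omitted it.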
for _, inst := range resp.Instances {
if inst.Key != key {
continue
}
if inst.DisplayName != "XCP Cluster A" {
t.Fatalf("expected display name %q, got %q", "XCP Cluster A", inst.DisplayName)
}
if inst.Connection != "https://xcp-a.example" {
t.Fatalf("expected connection %q, got %q", "https://xcp-a.example", inst.Connection)
}
return
}
t.Fatalf("expected instance %q in scheduler health", key)
}
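
// TestGetConnectionStatuses_CustomProviderStatuses verifies that statuses
// reported by a provider's ConnectionStatuses are surfaced directly.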
func TestGetConnectionStatuses_CustomProviderStatuses(t *testing.T) {
const customType InstanceType = "xcp"
monitor := &Monitor{
state: models.NewState(),
}
if err := monitor.RegisterPollProvider(testPollProvider{
providerType: customType,
instances: []string{"xcp-a"},
connectionStatus: map[string]bool{
"xcp-xcp-a": true,
},
}); err != nil {
t.Fatalf("RegisterPollProvider failed: %v", err)
}
statuses := monitor.GetConnectionStatuses()
if connected, ok := statuses["xcp-xcp-a"]; !ok || !connected {
t.Fatalf("expected xcp-xcp-a to be connected, got exists=%v value=%v", ok, connected)
}
}
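
// TestGetConnectionStatuses_CustomProviderFallbackToState verifies that when
// a provider reports no connection statuses, GetConnectionStatuses falls back
// to the health recorded in shared state under the "<type>-<instance>" key.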
func TestGetConnectionStatuses_CustomProviderFallbackToState(t *testing.T) {
const customType InstanceType = "xcp"
monitor := &Monitor{
state: models.NewState(),
}
monitor.state.SetConnectionHealth("xcp-xcp-a", true)
if err := monitor.RegisterPollProvider(pollProviderAdapter{
instanceType: customType,
listInstances: func(*Monitor) []string {
return []string{"xcp-a"}
},
baseInterval: func(*Monitor) time.Duration { return 30 * time.Second },
buildPollTask: func(*Monitor, string) (PollTask, error) {
return PollTask{}, nil
},
}); err != nil {
t.Fatalf("RegisterPollProvider failed: %v", err)
}
statuses := monitor.GetConnectionStatuses()
if connected, ok := statuses["xcp-xcp-a"]; !ok || !connected {
t.Fatalf("expected xcp-xcp-a to be connected via fallback, got exists=%v value=%v", ok, connected)
}
}
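
// TestGetConnectionStatuses_BuiltInPMGSupport verifies that every configured
// PMG instance appears in the connection statuses: pmg-1, with a client and
// recorded health, reports connected, while pmg-2 reports disconnected.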
func TestGetConnectionStatuses_BuiltInPMGSupport(t *testing.T) {
monitor := &Monitor{
config: &config.Config{
PMGInstances: []config.PMGInstance{
{Name: "pmg-1"},
{Name: "pmg-2"},
},
},
state: models.NewState(),
pmgClients: map[string]*pmg.Client{"pmg-1": {}},
}
monitor.state.SetConnectionHealth("pmg-pmg-1", true)
statuses := monitor.GetConnectionStatuses()
if connected, ok := statuses["pmg-pmg-1"]; !ok || !connected {
t.Fatalf("expected pmg-pmg-1 connected, got exists=%v value=%v", ok, connected)
}
if connected, ok := statuses["pmg-pmg-2"]; !ok || connected {
t.Fatalf("expected pmg-pmg-2 disconnected, got exists=%v value=%v", ok, connected)
}
}
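
// TestSetProviderConnectionHealth_UsesProviderConnectionKey verifies that
// setProviderConnectionHealth writes under the provider's ConnectionHealthKey
// override instead of the default "<type>-<instance>" fallback key.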
func TestSetProviderConnectionHealth_UsesProviderConnectionKey(t *testing.T) {
const customType InstanceType = "xcp"
const instanceName = "xcp-a"
const providerKey = "provider/xcp-a"
monitor := &Monitor{
state: models.NewState(),
}
if err := monitor.RegisterPollProvider(testPollProvider{
providerType: customType,
instances: []string{instanceName},
connectionKey: providerKey,
}); err != nil {
t.Fatalf("RegisterPollProvider failed: %v", err)
}
monitor.setProviderConnectionHealth(customType, instanceName, true)
if !monitor.state.ConnectionHealth[providerKey] {
t.Fatalf("expected provider key %q to be marked healthy", providerKey)
}
if _, exists := monitor.state.ConnectionHealth["xcp-"+instanceName]; exists {
t.Fatalf("did not expect fallback key %q when provider key override is set", "xcp-"+instanceName)
}
}