Pulse/internal/alerts/alerts_test.go
rcourtman dfbe2eb873
Some checks are pending
Build and Test / Secret Scan (push) Waiting to run
Build and Test / Frontend & Backend (push) Waiting to run
Core E2E Tests / Playwright Core E2E (push) Waiting to run
Suppress noisy recovery notifications
2026-04-13 14:40:12 +01:00

17744 lines
449 KiB
Go

package alerts
import (
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"reflect"
"strings"
"sync"
"testing"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/internal/utils"
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
)
// testEnvMu protects concurrent access to PULSE_DATA_DIR during parallel tests.
// newTestManager locks it and only unlocks it from a t.Cleanup callback, so
// tests using newTestManager are effectively serialized. This is required
// because the Manager calls GetDataDir() repeatedly (not just at creation time).
var testEnvMu sync.Mutex
// newTestManager creates a Manager with an isolated temp directory for testing.
//
// PULSE_DATA_DIR is process-wide state, so it is set with os.Setenv while
// holding testEnvMu. t.Setenv cannot be used here because callers may invoke
// t.Parallel() before calling this function.
//
// IMPORTANT: The mutex is held until the test's cleanup phase because the
// Manager calls GetDataDir() not just at creation time, but also during
// operations like SaveActiveAlerts() and LoadActiveAlerts(). This effectively
// serializes tests that use newTestManager, but ensures correct isolation.
func newTestManager(t *testing.T) *Manager {
	t.Helper()

	// Request the temp dir first: its removal is registered as a cleanup, and
	// cleanups run last-in-first-out, so the directory is deleted only after
	// every cleanup registered below has finished.
	tmpDir := t.TempDir()

	testEnvMu.Lock()
	oldVal, hadOld := os.LookupEnv("PULSE_DATA_DIR")

	// Register the env restore / unlock immediately after taking the mutex so
	// it runs on every exit path, including a failed Setenv below. Because of
	// LIFO ordering it still runs after the manager shutdown cleanup.
	t.Cleanup(func() {
		defer testEnvMu.Unlock()

		var err error
		if hadOld {
			err = os.Setenv("PULSE_DATA_DIR", oldVal)
		} else {
			err = os.Unsetenv("PULSE_DATA_DIR")
		}
		if err != nil {
			t.Errorf("failed to restore PULSE_DATA_DIR: %v", err)
		}
	})

	if err := os.Setenv("PULSE_DATA_DIR", tmpDir); err != nil {
		t.Fatalf("failed to set PULSE_DATA_DIR: %v", err)
	}

	m := NewManager()

	// Stop the manager's background goroutines (but not the full manager Stop,
	// which includes a 100ms sleep) to prevent writes to the temp directory
	// after the test completes.
	t.Cleanup(func() {
		// Stop the history manager to halt background save routines.
		m.historyManager.Stop()

		// Close escalation channel to stop that goroutine too, unless it has
		// already been closed.
		select {
		case <-m.escalationStop:
		default:
			close(m.escalationStop)
		}

		// Close cleanup channel, unless it has already been closed.
		select {
		case <-m.cleanupStop:
		default:
			close(m.cleanupStop)
		}

		// Brief pause to let goroutines finish any in-flight operations.
		// Under heavy parallel test load (full suite), 10ms is not enough.
		time.Sleep(50 * time.Millisecond)
	})

	return m
}
// TestAcknowledgePersistsThroughCheckMetric verifies that acknowledging an
// active alert sets its Acknowledged flag, that GetActiveAlerts reports the
// acknowledgement, and that the flag survives a later checkMetric call for the
// same resource and metric.
func TestAcknowledgePersistsThroughCheckMetric(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	// Set config fields directly to bypass UpdateConfig's default value enforcement
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.config.SuppressionWindow = 0
	m.config.MinimumDelta = 0
	m.mu.Unlock()

	const alertID = "res1-usage"

	// alertState reads the alert under the manager's read lock, the same way
	// the other tests in this file inspect activeAlerts. A missing or nil entry
	// is reported as exists=false so the test fails with a message instead of
	// panicking on a nil dereference.
	alertState := func() (exists, acknowledged bool) {
		m.mu.RLock()
		defer m.mu.RUnlock()
		alert, ok := m.activeAlerts[alertID]
		if !ok || alert == nil {
			return false, false
		}
		return true, alert.Acknowledged
	}

	threshold := &HysteresisThreshold{Trigger: 80, Clear: 70}
	m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "usage", 90, threshold, nil)
	if exists, _ := alertState(); !exists {
		t.Fatalf("expected alert to be created")
	}

	if err := m.AcknowledgeAlert(alertID, "tester"); err != nil {
		t.Fatalf("ack failed: %v", err)
	}
	if _, acked := alertState(); !acked {
		t.Fatalf("acknowledged flag not set")
	}

	alerts := m.GetActiveAlerts()
	if len(alerts) != 1 || !alerts[0].Acknowledged {
		t.Fatalf("GetActiveAlerts lost acknowledgement")
	}

	// 85 is still above the clear level (70), so the alert is expected to stay
	// active and keep its acknowledgement.
	m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "usage", 85, threshold, nil)
	exists, acked := alertState()
	if !exists {
		t.Fatalf("expected alert to remain active after update")
	}
	if !acked {
		t.Fatalf("acknowledged flag lost after update")
	}
}
// TestCheckMetricClearsAlertWhenThresholdDisabled verifies that an active
// alert is removed when checkMetric is later called for the same metric with a
// disabled threshold (Trigger=0) or with a nil threshold.
func TestCheckMetricClearsAlertWhenThresholdDisabled(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.MinimumDelta = 0
	m.config.SuppressionWindow = 0
	m.config.TimeThresholds = map[string]int{}
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	const alertID = "res1-memory"

	// isActive reports, under the read lock, whether the memory alert exists.
	isActive := func() bool {
		m.mu.RLock()
		defer m.mu.RUnlock()
		_, ok := m.activeAlerts[alertID]
		return ok
	}

	// First, create an active alert with an enabled threshold.
	enabled := &HysteresisThreshold{Trigger: 80, Clear: 70}
	m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, enabled, nil)
	if !isActive() {
		t.Fatalf("expected alert to be created")
	}

	// A disabled threshold (Trigger=0) should clear the alert.
	disabled := &HysteresisThreshold{Trigger: 0, Clear: 0}
	m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, disabled, nil)
	if isActive() {
		t.Errorf("expected alert to be cleared when threshold is disabled (Trigger=0)")
	}

	// Re-create the alert so the nil-threshold path can be exercised as well.
	m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, enabled, nil)
	if !isActive() {
		t.Fatalf("expected alert to be re-created")
	}

	// A nil threshold should also clear the alert.
	m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, nil, nil)
	if isActive() {
		t.Errorf("expected alert to be cleared when threshold is nil")
	}
}
// TestGetActiveAlertsKeepsInstanceScopedNodeDisplayNames verifies that two
// instances sharing the node name "pve" keep separate display names: renaming
// the node on cluster-a must not change the name reported for cluster-b.
func TestGetActiveAlertsKeepsInstanceScopedNodeDisplayNames(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.MinimumDelta = 0
	m.config.SuppressionWindow = 0
	m.config.TimeThresholds = map[string]int{}
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	cpuThreshold := &HysteresisThreshold{Trigger: 80, Clear: 70}

	// Register both display names, then raise one CPU alert per instance.
	m.UpdateNodeDisplayName("cluster-a", "pve", "Alpha")
	m.UpdateNodeDisplayName("cluster-b", "pve", "Beta")
	m.checkMetric("guest-a", "vm-a", "pve", "cluster-a", "guest", "cpu", 90, cpuThreshold, nil)
	m.checkMetric("guest-b", "vm-b", "pve", "cluster-b", "guest", "cpu", 91, cpuThreshold, nil)

	// Rename only cluster-a's node and re-evaluate both guests.
	m.UpdateNodeDisplayName("cluster-a", "pve", "Alpha Updated")
	m.checkMetric("guest-a", "vm-a", "pve", "cluster-a", "guest", "cpu", 92, cpuThreshold, nil)
	m.checkMetric("guest-b", "vm-b", "pve", "cluster-b", "guest", "cpu", 93, cpuThreshold, nil)

	active := m.GetActiveAlerts()
	byID := make(map[string]Alert, len(active))
	for i := range active {
		byID[active[i].ID] = active[i]
	}

	nameA := byID["guest-a-cpu"].NodeDisplayName
	if nameA != "Alpha Updated" {
		t.Fatalf("guest-a NodeDisplayName = %q, want %q", nameA, "Alpha Updated")
	}
	nameB := byID["guest-b-cpu"].NodeDisplayName
	if nameB != "Beta" {
		t.Fatalf("guest-b NodeDisplayName = %q, want %q", nameB, "Beta")
	}
}
// TestCheckGuestSkipsAlertsWhenMetricDisabled verifies the effect of a
// per-guest override whose CPU trigger is negative: the CPU alert raised under
// the defaults is cleared, the resolved callback fires for it, no new alert is
// dispatched on re-evaluation, and no pending entry is left behind.
func TestCheckGuestSkipsAlertsWhenMetricDisabled(t *testing.T) {
	m := newTestManager(t)

	vmID := "instance-node-101"
	instanceName := "instance"
	cpuAlertID := fmt.Sprintf("%s-cpu", vmID)

	// activateNow writes m.config directly after each UpdateConfig call so the
	// time thresholds are zero and the activation state is active.
	activateNow := func() {
		m.mu.Lock()
		defer m.mu.Unlock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.ActivationState = ActivationActive
	}

	// Start with default configuration to allow CPU alerts.
	initialConfig := AlertConfig{
		Enabled: true,
		GuestDefaults: ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
		},
		TimeThreshold:  0,
		TimeThresholds: map[string]int{},
		NodeDefaults: ThresholdConfig{
			CPU:    &HysteresisThreshold{Trigger: 80, Clear: 75},
			Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
			Disk:   &HysteresisThreshold{Trigger: 90, Clear: 85},
		},
		StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
		Overrides:      make(map[string]ThresholdConfig),
	}
	m.UpdateConfig(initialConfig)
	activateNow()

	// Callbacks record what they received and signal a one-slot channel
	// without blocking.
	var (
		dispatched []*Alert
		resolved   []string
	)
	alertFired := make(chan struct{}, 1)
	resolvedFired := make(chan struct{}, 1)

	m.SetAlertCallback(func(alert *Alert) {
		dispatched = append(dispatched, alert)
		select {
		case alertFired <- struct{}{}:
		default:
		}
	})
	m.SetResolvedCallback(func(alertID string) {
		resolved = append(resolved, alertID)
		select {
		case resolvedFired <- struct{}{}:
		default:
		}
	})

	vm := models.VM{
		ID:       vmID,
		Name:     "test-vm",
		Node:     "node",
		Instance: instanceName,
		Status:   "running",
		CPU:      1.0, // 100% once multiplied by 100 inside CheckGuest
		Memory: models.Memory{
			Usage: 65,
		},
		Disk: models.Disk{
			Usage: 40,
		},
	}

	// Initial check should trigger an alert with default thresholds.
	m.CheckGuest(vm, instanceName)
	select {
	case <-alertFired:
	case <-time.After(100 * time.Millisecond):
		t.Fatalf("did not receive initial alert dispatch")
	}
	if len(dispatched) != 1 {
		t.Fatalf("expected 1 alert before disabling metric, got %d", len(dispatched))
	}

	// Apply override disabling CPU alerts for this VM.
	disabledConfig := initialConfig
	disabledConfig.Overrides = map[string]ThresholdConfig{
		vmID: {
			CPU: &HysteresisThreshold{Trigger: -1, Clear: 0},
		},
	}
	disabledConfig.TimeThreshold = 0
	disabledConfig.TimeThresholds = map[string]int{}
	m.UpdateConfig(disabledConfig)
	activateNow()

	// Reset the capture state so only post-disable notifications are seen.
	dispatched = dispatched[:0]
	alertFired = make(chan struct{}, 1)

	// Re-run evaluation with high CPU; no alert should be dispatched.
	m.CheckGuest(vm, instanceName)
	select {
	case <-alertFired:
		t.Fatalf("expected no alerts after disabling CPU metric, but callback fired")
	case <-time.After(100 * time.Millisecond):
		// No callback fired as expected.
	}

	// Active alerts should be cleared by the config update.
	m.mu.RLock()
	remaining := len(m.activeAlerts)
	m.mu.RUnlock()
	if remaining != 0 {
		t.Fatalf("expected active alerts to be cleared after disabling metric, got %d", remaining)
	}

	select {
	case <-resolvedFired:
	case <-time.After(100 * time.Millisecond):
		t.Fatalf("expected resolved callback to fire after disabling metric")
	}
	if len(resolved) != 1 || resolved[0] != cpuAlertID {
		t.Fatalf("expected resolved callback for %s-cpu, got %v", vmID, resolved)
	}

	m.mu.RLock()
	_, stillPending := m.pendingAlerts[cpuAlertID]
	m.mu.RUnlock()
	if stillPending {
		t.Fatalf("expected pending alert entry to be cleared after disabling metric")
	}
}
// TestPulseNoAlertsSuppressesGuestAlerts verifies that a guest tagged
// "pulse-no-alerts" produces neither an alert callback nor an active alert,
// even with CPU, memory and disk usage all set high.
func TestPulseNoAlertsSuppressesGuestAlerts(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.ActivationState = ActivationActive
	m.config.TimeThresholds = map[string]int{}
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	callbackCount := 0
	m.SetAlertCallback(func(*Alert) {
		callbackCount++
	})

	guest := models.VM{
		ID:       "inst/qemu/101",
		Name:     "test-vm",
		Node:     "node1",
		Instance: "inst",
		Status:   "running",
		CPU:      1.0,
		Memory:   models.Memory{Usage: 95},
		Disk:     models.Disk{Usage: 95},
		Tags:     []string{"pulse-no-alerts"},
	}
	m.CheckGuest(guest, "inst")

	if callbackCount != 0 {
		t.Fatalf("expected no alert dispatch, got %d", callbackCount)
	}
	if n := len(m.GetActiveAlerts()); n != 0 {
		t.Fatalf("expected no active alerts, got %d", n)
	}
}
// TestPulseMonitorOnlySkipsDispatchButRetainsAlert verifies that a guest tagged
// "pulse-monitor-only" still gets an active alert, marked with
// Metadata["monitorOnly"] == true, but that the alert callback is not invoked.
func TestPulseMonitorOnlySkipsDispatchButRetainsAlert(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.ActivationState = ActivationActive
	m.config.TimeThresholds = map[string]int{}
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	callbackCount := 0
	m.SetAlertCallback(func(*Alert) {
		callbackCount++
	})

	guest := models.VM{
		ID:       "inst/qemu/102",
		Name:     "monitor-vm",
		Node:     "node1",
		Instance: "inst",
		Status:   "running",
		CPU:      1.0,
		Memory:   models.Memory{Usage: 90},
		Disk:     models.Disk{Usage: 50},
		Tags:     []string{"pulse-monitor-only"},
	}
	m.CheckGuest(guest, "inst")

	if callbackCount != 0 {
		t.Fatalf("expected monitor-only alert to skip dispatch, got %d callbacks", callbackCount)
	}

	active := m.GetActiveAlerts()
	if len(active) == 0 {
		t.Fatalf("expected monitor-only alert to remain active")
	}

	first := &active[0]
	if first.Metadata == nil || first.Metadata["monitorOnly"] != true {
		t.Fatalf("expected alert metadata to mark monitorOnly, got %+v", first.Metadata)
	}
}
// TestPulseRelaxedThresholdsIncreaseCpuTrigger verifies that a guest tagged
// "pulse-relaxed" raises no alert at CPU 0.9 but does raise one at CPU 1.0.
func TestPulseRelaxedThresholdsIncreaseCpuTrigger(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.TimeThresholds = map[string]int{}
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	guest := models.VM{
		ID:       "inst/qemu/103",
		Name:     "relaxed-vm",
		Node:     "node1",
		Instance: "inst",
		Status:   "running",
		CPU:      0.9, // 90%
		Memory:   models.Memory{Usage: 60},
		Disk:     models.Disk{Usage: 40},
		Tags:     []string{"pulse-relaxed"},
	}

	// At 0.9 CPU the relaxed thresholds must not fire.
	m.CheckGuest(guest, "inst")
	if n := len(m.GetActiveAlerts()); n != 0 {
		t.Fatalf("expected no alerts at 90%% CPU with relaxed thresholds, got %d", n)
	}

	// Raising CPU to 1.0 must produce an alert.
	guest.CPU = 1.0
	m.CheckGuest(guest, "inst")
	if n := len(m.GetActiveAlerts()); n == 0 {
		t.Fatalf("expected alert once CPU exceeds relaxed threshold")
	}
}
// TestClearAlertMarksResolutionAndReturnsStatus verifies that ClearAlert
// returns true and removes an active alert, that the alert then appears first
// in GetRecentlyResolved, and that a second ClearAlert for the same ID returns
// false.
func TestClearAlertMarksResolutionAndReturnsStatus(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.TimeThresholds = map[string]int{}
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	guest := models.VM{
		ID:       "inst/qemu/104",
		Name:     "clear-vm",
		Node:     "node1",
		Instance: "inst",
		Status:   "running",
		CPU:      1.0,
		Memory:   models.Memory{Usage: 80},
		Disk:     models.Disk{Usage: 80},
	}
	m.CheckGuest(guest, "inst")

	active := m.GetActiveAlerts()
	if len(active) == 0 {
		t.Fatalf("expected alert to be active before clearing")
	}
	alertID := active[0].ID

	// First clear succeeds and empties the active set.
	if !m.ClearAlert(alertID) {
		t.Fatalf("expected manual clear to succeed")
	}
	if left := len(m.GetActiveAlerts()); left != 0 {
		t.Fatalf("expected no active alerts after clear, found %d", left)
	}

	// The cleared alert must be tracked as recently resolved.
	recent := m.GetRecentlyResolved()
	if len(recent) == 0 || recent[0].Alert.ID != alertID {
		t.Fatalf("expected alert %s to be tracked as recently resolved", alertID)
	}

	// Clearing again must report that nothing was there to clear.
	if m.ClearAlert(alertID) {
		t.Fatalf("expected second clear to report missing alert")
	}
}
// TestHandleDockerHostRemovedClearsAlertsAndTracking seeds host and container
// alerts plus the docker tracking maps for one host, calls
// HandleDockerHostRemoved, and verifies every seeded entry is gone.
func TestHandleDockerHostRemovedClearsAlertsAndTracking(t *testing.T) {
	m := newTestManager(t)

	host := models.DockerHost{ID: "host1", DisplayName: "Host One", Hostname: "host-one"}
	containerResourceID := "docker:host1/container1"
	containerAlertID := "docker-container-state-" + containerResourceID
	hostAlertID := "docker-host-offline-host1"

	// Seed the state the removal handler is expected to clean up.
	m.mu.Lock()
	m.activeAlerts[hostAlertID] = &Alert{ID: hostAlertID, ResourceID: "docker:host1"}
	m.activeAlerts[containerAlertID] = &Alert{ID: containerAlertID, ResourceID: containerResourceID}
	m.dockerOfflineCount[host.ID] = 2
	m.dockerStateConfirm[containerResourceID] = 1
	m.dockerRestartTracking[containerResourceID] = &dockerRestartRecord{}
	m.dockerLastExitCode[containerResourceID] = 137
	m.mu.Unlock()

	m.HandleDockerHostRemoved(host)

	// Snapshot every lookup under the read lock, then assert in the original
	// order; the first leftover entry fails the test.
	m.mu.RLock()
	_, containerAlertLeft := m.activeAlerts[containerAlertID]
	_, hostAlertLeft := m.activeAlerts[hostAlertID]
	_, offlineCountLeft := m.dockerOfflineCount[host.ID]
	_, stateConfirmLeft := m.dockerStateConfirm[containerResourceID]
	_, restartTrackingLeft := m.dockerRestartTracking[containerResourceID]
	_, exitCodeLeft := m.dockerLastExitCode[containerResourceID]
	m.mu.RUnlock()

	switch {
	case containerAlertLeft:
		t.Fatalf("expected container alerts to be cleared")
	case hostAlertLeft:
		t.Fatalf("expected host offline alert to be cleared")
	case offlineCountLeft:
		t.Fatalf("expected offline tracking to be cleared")
	case stateConfirmLeft:
		t.Fatalf("expected state confirmation to be cleared")
	case restartTrackingLeft:
		t.Fatalf("expected restart tracking to be cleared")
	case exitCodeLeft:
		t.Fatalf("expected last exit code tracking to be cleared")
	}
}
// TestCheckHostGeneratesMetricAlerts verifies that CheckHost creates CPU,
// memory and disk alerts for a host whose CPU, memory and root-disk usage are
// all above 90.
func TestCheckHostGeneratesMetricAlerts(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.TimeThresholds = map[string]int{}
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	host := models.Host{
		ID:          "host-1",
		DisplayName: "Test Host",
		Hostname:    "host-1.example",
		Platform:    "linux",
		OSName:      "ubuntu",
		CPUUsage:    95,
		CPUCount:    8,
		Memory: models.Memory{
			Usage: 92,
			Total: 16384,
			Used:  15000,
			Free:  1384,
		},
		Disks: []models.Disk{
			{
				Mountpoint: "/",
				Usage:      93,
				Total:      100,
				Used:       93,
				Free:       7,
			},
		},
		Status:          "online",
		IntervalSeconds: 30,
		LastSeen:        time.Now(),
		Tags:            []string{"prod"},
	}
	m.CheckHost(host)

	// Derive the expected alert IDs from the same helpers the manager uses.
	hostKey := hostResourceID(host.ID)
	cpuAlertID := fmt.Sprintf("%s-cpu", hostKey)
	memAlertID := fmt.Sprintf("%s-memory", hostKey)
	diskKey, _ := hostDiskResourceID(host, host.Disks[0])
	diskAlertID := fmt.Sprintf("%s-disk", diskKey)

	m.mu.RLock()
	_, cpuActive := m.activeAlerts[cpuAlertID]
	_, memActive := m.activeAlerts[memAlertID]
	_, diskActive := m.activeAlerts[diskAlertID]
	m.mu.RUnlock()

	switch {
	case !cpuActive:
		t.Fatalf("expected CPU alert %q to be active", cpuAlertID)
	case !memActive:
		t.Fatalf("expected memory alert %q to be active", memAlertID)
	case !diskActive:
		t.Fatalf("expected disk alert %q to be active", diskAlertID)
	}
}
// TestHandleHostOfflineRequiresConfirmations verifies that HandleHostOffline
// only raises the host-offline alert on the third consecutive call, counting
// confirmations on the first two, and that HandleHostOnline clears both the
// alert and the confirmation counter.
//
// Manager state is always copied out under the read lock and the lock is
// released before asserting, so a failing t.Fatalf never exits the test with
// m.mu still held.
func TestHandleHostOfflineRequiresConfirmations(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	host := models.Host{ID: "host-2", DisplayName: "Second Host", Hostname: "host-two"}
	alertID := fmt.Sprintf("host-offline-%s", host.ID)
	resourceKey := hostResourceID(host.ID)

	// First detection: no alert yet, one confirmation recorded.
	m.HandleHostOffline(host)
	m.mu.RLock()
	_, alertActive := m.activeAlerts[alertID]
	count, tracked := m.offlineConfirmations[resourceKey]
	m.mu.RUnlock()
	if alertActive {
		t.Fatalf("expected no alert after first offline detection")
	}
	if count != 1 {
		t.Fatalf("expected confirmation count to be 1, got %d", count)
	}

	// Second detection: still no alert, two confirmations recorded.
	m.HandleHostOffline(host)
	m.mu.RLock()
	_, alertActive = m.activeAlerts[alertID]
	count = m.offlineConfirmations[resourceKey]
	m.mu.RUnlock()
	if alertActive {
		t.Fatalf("expected no alert after second offline detection")
	}
	if count != 2 {
		t.Fatalf("expected confirmation count to be 2, got %d", count)
	}

	// Third detection: the alert must now exist.
	m.HandleHostOffline(host)
	m.mu.RLock()
	_, alertActive = m.activeAlerts[alertID]
	m.mu.RUnlock()
	if !alertActive {
		t.Fatalf("expected alert %q after third offline detection", alertID)
	}

	// Coming back online clears the alert and the confirmation tracking.
	m.HandleHostOnline(host)
	m.mu.RLock()
	_, alertActive = m.activeAlerts[alertID]
	_, tracked = m.offlineConfirmations[resourceKey]
	m.mu.RUnlock()
	if alertActive {
		t.Fatalf("expected offline alert %q to be cleared after host online", alertID)
	}
	if tracked {
		t.Fatalf("expected offline confirmations to be cleared when host online")
	}
}
// TestCheckHostDisabledOverrideClearsAlerts verifies that after an override
// with Disabled=true is installed for a host, the next CheckHost leaves no
// active alerts even though the host's metrics are unchanged.
func TestCheckHostDisabledOverrideClearsAlerts(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	// zeroTimeThresholds writes m.config directly so alerts are not delayed.
	zeroTimeThresholds := func() {
		m.mu.Lock()
		defer m.mu.Unlock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
	}
	// activeCount returns the number of active alerts, read under the lock.
	activeCount := func() int {
		m.mu.RLock()
		defer m.mu.RUnlock()
		return len(m.activeAlerts)
	}

	zeroTimeThresholds()

	host := models.Host{
		ID:          "host-3",
		DisplayName: "Override Host",
		Hostname:    "override.example",
		CPUUsage:    90,
		Memory: models.Memory{
			Usage: 91,
			Total: 16000,
			Used:  14560,
			Free:  1440,
		},
		Disks: []models.Disk{
			{Mountpoint: "/data", Usage: 92, Total: 200, Used: 184, Free: 16},
		},
		Status:          "online",
		IntervalSeconds: 30,
		LastSeen:        time.Now(),
	}

	m.CheckHost(host)
	if activeCount() == 0 {
		t.Fatalf("expected active alerts prior to disabling host overrides")
	}

	// Disable alerting for this host through an override.
	cfg := m.GetConfig()
	cfg.Overrides = map[string]ThresholdConfig{
		host.ID: {
			Disabled: true,
		},
	}
	m.UpdateConfig(cfg)
	zeroTimeThresholds()

	m.CheckHost(host)
	if n := activeCount(); n != 0 {
		t.Fatalf("expected all host alerts to be cleared after disabling override, got %d", n)
	}
}
// TestCheckSnapshotsForInstanceCreatesAndClearsAlerts verifies that a 15-day-old
// snapshot produces a critical snapshot-age alert (CriticalDays is 14) named
// after the guest, and that the alert is removed once the snapshot is no longer
// reported.
func TestCheckSnapshotsForInstanceCreatesAndClearsAlerts(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.UpdateConfig(AlertConfig{
		Enabled:        true,
		StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
		SnapshotDefaults: SnapshotAlertConfig{
			Enabled:         true,
			WarningDays:     7,
			CriticalDays:    14,
			WarningSizeGiB:  0,
			CriticalSizeGiB: 0,
		},
		Overrides: make(map[string]ThresholdConfig),
	})

	m.mu.Lock()
	m.config.TimeThresholds = map[string]int{}
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	const alertID = "snapshot-age-inst-node-100-weekly"

	// lookup fetches the snapshot alert under the read lock.
	lookup := func() (*Alert, bool) {
		m.mu.RLock()
		defer m.mu.RUnlock()
		found, ok := m.activeAlerts[alertID]
		return found, ok
	}

	now := time.Now()
	snapshots := []models.GuestSnapshot{
		{
			ID:        "inst-node-100-weekly",
			Name:      "weekly",
			Node:      "node",
			Instance:  "inst",
			Type:      "qemu",
			VMID:      100,
			Time:      now.Add(-15 * 24 * time.Hour),
			SizeBytes: 60 << 30,
		},
	}
	guestNames := map[string]string{
		"inst:node:100": "app-server",
	}

	m.CheckSnapshotsForInstance("inst", snapshots, guestNames)

	alert, present := lookup()
	if !present {
		t.Fatalf("expected snapshot age alert to be created")
	}
	if alert.Level != AlertLevelCritical {
		t.Fatalf("expected critical level for old snapshot, got %s", alert.Level)
	}
	if alert.ResourceName != "app-server snapshot 'weekly'" {
		t.Fatalf("unexpected resource name: %s", alert.ResourceName)
	}

	// With no snapshots reported, the alert must be cleared.
	m.CheckSnapshotsForInstance("inst", nil, guestNames)
	if _, present = lookup(); present {
		t.Fatalf("expected snapshot alert to be cleared when snapshot missing")
	}
}
// TestCheckSnapshotsRespectsOverrides verifies that a 10-day-old snapshot
// raises a warning under the defaults (WarningDays is 7) and that a per-guest
// override with Snapshot.Enabled=false removes that alert.
func TestCheckSnapshotsRespectsOverrides(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.UpdateConfig(AlertConfig{
		Enabled: true,
		SnapshotDefaults: SnapshotAlertConfig{
			Enabled:      true,
			WarningDays:  7,
			CriticalDays: 14,
		},
	})

	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	const alertID = "snapshot-age-inst:node:100:weekly"

	// lookup fetches the snapshot alert under the read lock.
	lookup := func() (*Alert, bool) {
		m.mu.RLock()
		defer m.mu.RUnlock()
		found, ok := m.activeAlerts[alertID]
		return found, ok
	}

	now := time.Now()
	snapshots := []models.GuestSnapshot{
		{
			ID:       "inst:node:100:weekly",
			Name:     "weekly",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     100,
			Time:     now.Add(-10 * 24 * time.Hour), // Triggers Warning (10 > 7)
		},
	}
	resourceKey := "inst:node:100"
	guestNames := map[string]string{
		resourceKey: "app-server",
	}

	// 1. Verify warning alert is created
	m.CheckSnapshotsForInstance("inst", snapshots, guestNames)
	alert, present := lookup()
	if !present {
		t.Fatalf("expected snapshot warning alert")
	}
	if alert.Level != AlertLevelWarning {
		t.Fatalf("expected warning alert, got %s", alert.Level)
	}

	// 2. Disable via override
	overridden := m.GetConfig()
	overridden.Overrides = map[string]ThresholdConfig{
		resourceKey: {
			Snapshot: &SnapshotAlertConfig{Enabled: false},
		},
	}
	m.UpdateConfig(overridden)

	m.CheckSnapshotsForInstance("inst", snapshots, guestNames)
	if _, present = lookup(); present {
		t.Fatalf("expected snapshot alert to be suppressed by override")
	}
}
// TestCheckSnapshotsForInstanceTriggersOnSnapshotSize verifies that, with the
// age thresholds set to zero, a recent 120 GiB snapshot raises a critical alert
// against the 100 GiB size threshold and records size details in the metadata.
func TestCheckSnapshotsForInstanceTriggersOnSnapshotSize(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.UpdateConfig(AlertConfig{
		Enabled:        true,
		StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
		SnapshotDefaults: SnapshotAlertConfig{
			Enabled:         true,
			WarningDays:     0,
			CriticalDays:    0,
			WarningSizeGiB:  50,
			CriticalSizeGiB: 100,
		},
		Overrides: make(map[string]ThresholdConfig),
	})

	m.mu.Lock()
	m.config.TimeThresholds = map[string]int{}
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	now := time.Now()
	snapshots := []models.GuestSnapshot{
		{
			ID:        "inst-node-200-sizey",
			Name:      "pre-maintenance",
			Node:      "node",
			Instance:  "inst",
			Type:      "qemu",
			VMID:      200,
			Time:      now.Add(-2 * time.Hour),
			SizeBytes: int64(120) << 30,
		},
	}
	guestNames := map[string]string{
		"inst:node:200": "db-server",
	}

	m.CheckSnapshotsForInstance("inst", snapshots, guestNames)

	m.mu.RLock()
	alert, present := m.activeAlerts["snapshot-age-inst-node-200-sizey"]
	m.mu.RUnlock()
	if !present {
		t.Fatalf("expected snapshot size alert to be created")
	}

	// Level, value and threshold must reflect the size rule.
	if alert.Level != AlertLevelCritical {
		t.Fatalf("expected critical level for large snapshot, got %s", alert.Level)
	}
	if alert.Value < 119.5 || alert.Value > 120.5 {
		t.Fatalf("expected alert value near 120 GiB, got %.2f", alert.Value)
	}
	if alert.Threshold != 100 {
		t.Fatalf("expected threshold 100 GiB, got %.2f", alert.Threshold)
	}

	// Metadata must name size as the primary metric and carry the raw size.
	if alert.Metadata == nil {
		t.Fatalf("expected metadata for snapshot alert")
	}
	primary, isString := alert.Metadata["primaryMetric"].(string)
	if !isString || primary != "size" {
		t.Fatalf("expected primary metric size, got %#v", alert.Metadata["primaryMetric"])
	}
	rawSize, isInt64 := alert.Metadata["snapshotSizeBytes"].(int64)
	if !isInt64 || rawSize == 0 {
		t.Fatalf("expected snapshotSizeBytes in metadata")
	}

	triggered, isSlice := alert.Metadata["triggeredMetrics"].([]string)
	if !isSlice {
		t.Fatalf("expected triggeredMetrics slice, got %#v", alert.Metadata["triggeredMetrics"])
	}
	sizeRecorded := false
	for i := 0; i < len(triggered) && !sizeRecorded; i++ {
		sizeRecorded = triggered[i] == "size"
	}
	if !sizeRecorded {
		t.Fatalf("expected size metric recorded in metadata")
	}
}
// TestCheckSnapshotsForInstanceIncludesAgeAndSizeReasons verifies that a
// snapshot breaching both the age and the size critical thresholds yields one
// critical alert whose message mentions both reasons, whose metadata lists at
// least two triggered metrics, and whose primary metric is "age".
func TestCheckSnapshotsForInstanceIncludesAgeAndSizeReasons(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.UpdateConfig(AlertConfig{
		Enabled:        true,
		StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
		SnapshotDefaults: SnapshotAlertConfig{
			Enabled:         true,
			WarningDays:     5,
			CriticalDays:    10,
			WarningSizeGiB:  40,
			CriticalSizeGiB: 80,
		},
		Overrides: make(map[string]ThresholdConfig),
	})

	m.mu.Lock()
	m.config.TimeThresholds = map[string]int{}
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	now := time.Now()
	snapshots := []models.GuestSnapshot{
		{
			ID:        "inst-node-300-combined",
			Name:      "long-running",
			Node:      "node",
			Instance:  "inst",
			Type:      "qemu",
			VMID:      300,
			Time:      now.Add(-15 * 24 * time.Hour),
			SizeBytes: int64(90) << 30,
		},
	}
	guestNames := map[string]string{
		"inst:node:300": "app-server",
	}

	m.CheckSnapshotsForInstance("inst", snapshots, guestNames)

	m.mu.RLock()
	alert, present := m.activeAlerts["snapshot-age-inst-node-300-combined"]
	m.mu.RUnlock()
	if !present {
		t.Fatalf("expected combined snapshot alert to be created")
	}
	if alert.Level != AlertLevelCritical {
		t.Fatalf("expected critical level, got %s", alert.Level)
	}

	// The message must mention both the age ("days old") and the size ("gib").
	mentionsAge := strings.Contains(alert.Message, "days old")
	mentionsSize := strings.Contains(strings.ToLower(alert.Message), "gib")
	if !(mentionsAge && mentionsSize) {
		t.Fatalf("expected alert message to reference age and size, got %q", alert.Message)
	}

	if alert.Metadata == nil {
		t.Fatalf("expected metadata for combined alert")
	}
	triggered, isSlice := alert.Metadata["triggeredMetrics"].([]string)
	if !isSlice {
		t.Fatalf("expected triggeredMetrics slice, got %#v", alert.Metadata["triggeredMetrics"])
	}
	if len(triggered) < 2 {
		t.Fatalf("expected both age and size metrics recorded, got %v", triggered)
	}
	primary, isString := alert.Metadata["primaryMetric"].(string)
	if !isString || primary != "age" {
		t.Fatalf("expected primary metric age, got %#v", alert.Metadata["primaryMetric"])
	}
}
// TestCheckBackupsCreatesAndClearsAlerts verifies that a 15-day-old storage
// backup raises a critical backup-age alert for its guest (CriticalDays is 14)
// and that the alert is removed once the backup time is updated to now.
func TestCheckBackupsCreatesAndClearsAlerts(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  7,
		CriticalDays: 14,
	}
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.mu.Unlock()

	now := time.Now()
	storageBackups := []models.StorageBackup{
		{
			ID:       "inst-node-100-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     100,
			Time:     now.Add(-15 * 24 * time.Hour),
		},
	}

	key := BuildGuestKey("inst", "node", 100)
	guest := GuestLookup{
		ResourceID: "qemu/100",
		Name:       "app-server",
		Instance:   "inst",
		Node:       "node",
		Type:       "qemu",
		VMID:       100,
	}
	guestsByKey := map[string]GuestLookup{key: guest}
	guestsByVMID := map[string][]GuestLookup{"100": {guest}}

	alertID := "backup-age-" + sanitizeAlertKey(key)

	// lookup fetches the backup-age alert under the read lock.
	lookup := func() (*Alert, bool) {
		m.mu.RLock()
		defer m.mu.RUnlock()
		found, ok := m.activeAlerts[alertID]
		return found, ok
	}

	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
	alert, present := lookup()
	if !present {
		t.Fatalf("expected backup age alert to be created")
	}
	if alert.Level != AlertLevelCritical {
		t.Fatalf("expected critical backup alert, got %s", alert.Level)
	}

	// Recent backup clears alert
	storageBackups[0].Time = now
	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
	if _, present = lookup(); present {
		t.Fatalf("expected backup-age alert to clear after fresh backup")
	}
}
// TestCheckBackupsRespectsOverrides walks one guest through four
// configurations and checks the backup-age alert after each: defaults (warning
// raised), backup alerts disabled by override, override thresholds above the
// backup's age, and the guest disabled entirely.
func TestCheckBackupsRespectsOverrides(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  7,
		CriticalDays: 14,
	}
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	now := time.Now()
	storageBackups := []models.StorageBackup{
		{
			ID:       "inst-node-100-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     100,
			Time:     now.Add(-10 * 24 * time.Hour), // Triggers Warning (10 > 7)
		},
	}

	key := BuildGuestKey("inst", "node", 100)
	resourceID := "inst:node:100"
	guest := GuestLookup{
		ResourceID: resourceID,
		Name:       "app-server",
		Instance:   "inst",
		Node:       "node",
		Type:       "qemu",
		VMID:       100,
	}
	guestsByKey := map[string]GuestLookup{key: guest}
	guestsByVMID := map[string][]GuestLookup{"100": {guest}}

	alertID := "backup-age-" + sanitizeAlertKey(key)

	// evaluate re-runs the backup check with the current fixtures.
	evaluate := func() {
		m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
	}
	// lookup fetches the backup-age alert under the read lock.
	lookup := func() (*Alert, bool) {
		m.mu.RLock()
		defer m.mu.RUnlock()
		found, ok := m.activeAlerts[alertID]
		return found, ok
	}

	// 1. Verify warning alert is created with defaults
	evaluate()
	alert, present := lookup()
	if !present {
		t.Fatalf("expected backup warning alert")
	}
	if alert.Level != AlertLevelWarning {
		t.Fatalf("expected warning alert, got %s", alert.Level)
	}

	// 2. Apply override to disable backup alerts for this guest
	cfg := m.GetConfig()
	cfg.Overrides = map[string]ThresholdConfig{
		resourceID: {
			Backup: &BackupAlertConfig{Enabled: false},
		},
	}
	m.UpdateConfig(cfg)
	evaluate()
	if _, present = lookup(); present {
		t.Fatalf("expected backup alert to be cleared/suppressed by override")
	}

	// 3. Apply override to change thresholds
	cfg.Overrides[resourceID] = ThresholdConfig{
		Backup: &BackupAlertConfig{
			Enabled:      true,
			WarningDays:  15, // 10 < 15, so no alert
			CriticalDays: 20,
		},
	}
	m.UpdateConfig(cfg)
	evaluate()
	if _, present = lookup(); present {
		t.Fatalf("expected no backup alert with increased thresholds in override")
	}

	// 4. Test global guest disable
	cfg.Overrides[resourceID] = ThresholdConfig{
		Disabled: true,
	}
	m.UpdateConfig(cfg)
	storageBackups[0].Time = now.Add(-30 * 24 * time.Hour) // Way past defaults
	evaluate()
	if _, present = lookup(); present {
		t.Fatalf("expected no backup alert for globally disabled guest")
	}
}
// TestCheckBackupsHandlesPbsOnlyGuests verifies that a PBS backup whose VMID
// matches no guest in the live inventory produces a "backup-orphaned-" alert
// at warning level.
func TestCheckBackupsHandlesPbsOnlyGuests(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  3,
		CriticalDays: 5,
	}
	m.mu.Unlock()

	now := time.Now()
	pbsBackups := []models.PBSBackup{
		{
			ID:         "pbs-backup-999-0",
			Instance:   "pbs-main",
			Datastore:  "backup-store",
			BackupType: "qemu",
			VMID:       "999",
			BackupTime: now.Add(-6 * 24 * time.Hour),
		},
	}

	// Include a live sentinel guest so hasLiveInventory is true and orphan detection runs.
	sentinelKey := BuildGuestKey("inst", "snode", 9999)
	guestsByKey := map[string]GuestLookup{
		sentinelKey: {ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999},
	}
	guestsByVMID := map[string][]GuestLookup{
		"9999": {guestsByKey[sentinelKey]},
	}

	m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID, nil)

	// Scan inside a closure with a deferred unlock: t.Fatalf exits through
	// runtime.Goexit, which still runs deferred calls, so a failed level check
	// cannot leave m.mu read-locked.
	found := false
	func() {
		m.mu.RLock()
		defer m.mu.RUnlock()
		for id, alert := range m.activeAlerts {
			if !strings.HasPrefix(id, "backup-orphaned-") {
				continue
			}
			found = true
			if alert.Level != AlertLevelWarning {
				t.Fatalf("expected PBS orphaned backup alert to be warning, got %s", alert.Level)
			}
			// Only the first orphaned alert encountered is inspected.
			return
		}
	}()

	if !found {
		t.Fatalf("expected PBS orphaned backup alert to be created")
	}
}
// TestCheckBackupsDisambiguatesWithNamespace verifies that when multiple guests
// have the same VMID on different instances, the PBS backup's namespace is used
// to match the backup to the correct guest.
//
// This addresses issue #1095 where users have multiple PVE instances with
// overlapping VMIDs and separate PBS instances backing them up.
func TestCheckBackupsDisambiguatesWithNamespace(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  3,
		CriticalDays: 5,
	}
	m.mu.Unlock()

	now := time.Now()

	// Two guests with the same VMID (100) but on different instances
	pveGuest := GuestLookup{
		ResourceID: "qemu/100",
		Name:       "webserver-pve",
		Instance:   "pve",
		Node:       "node1",
		Type:       "qemu",
		VMID:       100,
	}
	natGuest := GuestLookup{
		ResourceID: "qemu/100",
		Name:       "webserver-nat",
		Instance:   "pve-nat",
		Node:       "node2",
		Type:       "qemu",
		VMID:       100,
	}
	guestsByKey := map[string]GuestLookup{
		"pve-node1-100":     pveGuest,
		"pve-nat-node2-100": natGuest,
	}
	// Both guests have VMID "100"
	guestsByVMID := map[string][]GuestLookup{
		"100": {pveGuest, natGuest},
	}

	// PBS backup with namespace "nat" should match the "pve-nat" instance
	pbsBackups := []models.PBSBackup{
		{
			ID:         "pbs-backup-100-nat",
			Instance:   "pbs-main",
			Datastore:  "backup-store",
			Namespace:  "nat", // This namespace should match "pve-nat"
			BackupType: "qemu",
			VMID:       "100",
			BackupTime: now.Add(-6 * 24 * time.Hour), // Critical
		},
	}

	m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()

	// Should find an alert keyed to the pve-nat instance (node2), not pve (node1)
	const expectedKey = "backup-age-pve-nat-node2-100"
	alert, present := m.activeAlerts[expectedKey]
	if !present {
		// List what keys we do have for debugging
		keys := make([]string, 0, len(m.activeAlerts))
		for existing := range m.activeAlerts {
			keys = append(keys, existing)
		}
		t.Fatalf("expected alert with key %q not found; found keys: %v", expectedKey, keys)
	}

	if name := alert.ResourceName; name != "webserver-nat backup" {
		t.Errorf("expected ResourceName 'webserver-nat backup', got %q", name)
	}
	if inst := alert.Instance; inst != "pve-nat" {
		t.Errorf("expected Instance 'pve-nat', got %q", inst)
	}
}
// TestCheckBackupsVMIDCollisionNonMatchingNamespace checks the ambiguous case
// where two guests share a VMID and the PBS backup's namespace matches neither
// of them: the alert must be raised under the generic PBS key instead of being
// attributed to one of the guests.
func TestCheckBackupsVMIDCollisionNonMatchingNamespace(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  3,
		CriticalDays: 5,
	}
	m.mu.Unlock()

	first := GuestLookup{
		ResourceID: "qemu/100",
		Name:       "vm-pve1",
		Instance:   "pve1",
		Node:       "node1",
		Type:       "qemu",
		VMID:       100,
	}
	second := GuestLookup{
		ResourceID: "qemu/100",
		Name:       "vm-pve2",
		Instance:   "pve2",
		Node:       "node2",
		Type:       "qemu",
		VMID:       100,
	}
	guestsByKey := map[string]GuestLookup{
		"pve1-node1-100": first,
		"pve2-node2-100": second,
	}
	guestsByVMID := map[string][]GuestLookup{
		"100": {first, second},
	}

	// Namespace "staging" corresponds to neither pve1 nor pve2.
	pbsBackups := []models.PBSBackup{
		{
			ID:         "pbs-100",
			Instance:   "pbs-main",
			Datastore:  "backup-store",
			Namespace:  "staging",
			BackupType: "qemu",
			VMID:       "100",
			BackupTime: time.Now().Add(-6 * 24 * time.Hour),
		},
	}
	m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()

	// Neither guest-specific key may be present.
	for _, guestAlertID := range []string{"backup-age-pve1-node1-100", "backup-age-pve2-node2-100"} {
		if _, ok := m.activeAlerts[guestAlertID]; ok {
			t.Errorf("should not attribute ambiguous backup to a specific guest, but found key %q", guestAlertID)
		}
	}

	// The generic PBS key must be present.
	const genericID = "backup-age-pbs-pbs-main-qemu-100"
	if _, ok := m.activeAlerts[genericID]; ok {
		return
	}
	present := make([]string, 0, len(m.activeAlerts))
	for id := range m.activeAlerts {
		present = append(present, id)
	}
	t.Errorf("expected generic PBS alert key %q, found keys: %v", genericID, present)
}
// TestCheckBackupsVMIDCollisionNoNamespace checks the ambiguous case where two
// guests share a VMID and the PBS backup carries no namespace at all: the
// alert must use the generic PBS key rather than either guest's key.
func TestCheckBackupsVMIDCollisionNoNamespace(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	defaults := BackupAlertConfig{
		Enabled:      true,
		WarningDays:  3,
		CriticalDays: 5,
	}
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = defaults
	m.mu.Unlock()

	const (
		keyPVE1 = "pve1-node1-100"
		keyPVE2 = "pve2-node2-100"
	)
	guestsByKey := map[string]GuestLookup{
		keyPVE1: {
			ResourceID: "qemu/100",
			Name:       "vm-pve1",
			Instance:   "pve1",
			Node:       "node1",
			Type:       "qemu",
			VMID:       100,
		},
		keyPVE2: {
			ResourceID: "qemu/100",
			Name:       "vm-pve2",
			Instance:   "pve2",
			Node:       "node2",
			Type:       "qemu",
			VMID:       100,
		},
	}
	guestsByVMID := map[string][]GuestLookup{
		"100": {guestsByKey[keyPVE1], guestsByKey[keyPVE2]},
	}

	// The backup deliberately has an empty namespace.
	unscoped := models.PBSBackup{
		ID:         "pbs-100",
		Instance:   "pbs-main",
		Datastore:  "backup-store",
		Namespace:  "",
		BackupType: "qemu",
		VMID:       "100",
		BackupTime: time.Now().Add(-6 * 24 * time.Hour),
	}
	m.CheckBackups(nil, []models.PBSBackup{unscoped}, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()

	// No guest-specific alert key is allowed.
	for id := range m.activeAlerts {
		switch id {
		case "backup-age-pve1-node1-100", "backup-age-pve2-node2-100":
			t.Errorf("should not attribute ambiguous backup to a specific guest, but found key %q", id)
		}
	}

	// The generic PBS alert key is required.
	const wantID = "backup-age-pbs-pbs-main-qemu-100"
	if _, ok := m.activeAlerts[wantID]; !ok {
		var seen []string
		for id := range m.activeAlerts {
			seen = append(seen, id)
		}
		t.Errorf("expected generic PBS alert key %q, found keys: %v", wantID, seen)
	}
}
// TestCheckBackupsPbsTypeMismatchCreatesOrphanedAlert feeds a PBS backup of
// type "vm" for VMID 101 while the only live guest with that VMID is an "lxc".
// It expects a backup-orphaned alert under the generic PBS key and no
// backup-age alert under that key.
func TestCheckBackupsPbsTypeMismatchCreatesOrphanedAlert(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  3,
		CriticalDays: 5,
	}
	m.mu.Unlock()

	vmBackup := models.PBSBackup{
		ID:         "pbs-vm-101",
		Instance:   "pbs-main",
		Datastore:  "backup-store",
		BackupType: "vm",
		VMID:       "101",
		BackupTime: time.Now().Add(-30 * 24 * time.Hour),
	}

	// The live guest has the same VMID but is a container, not a VM.
	container := GuestLookup{
		ResourceID: "lxc/101",
		Name:       "ct-101",
		Instance:   "pve1",
		Node:       "node1",
		Type:       "lxc",
		VMID:       101,
	}
	guestsByKey := map[string]GuestLookup{
		BuildGuestKey("pve1", "node1", 101): container,
	}
	guestsByVMID := map[string][]GuestLookup{
		"101": {container},
	}

	m.CheckBackups(nil, []models.PBSBackup{vmBackup}, nil, guestsByKey, guestsByVMID, nil)

	pbsKey := sanitizeAlertKey("pbs:pbs-main:vm:101")
	wantOrphanID := "backup-orphaned-" + pbsKey
	unwantedAgeID := "backup-age-" + pbsKey

	m.mu.RLock()
	defer m.mu.RUnlock()

	orphan, ok := m.activeAlerts[wantOrphanID]
	if !ok {
		var present []string
		for id := range m.activeAlerts {
			present = append(present, id)
		}
		t.Fatalf("expected orphaned alert %q, found keys: %v", wantOrphanID, present)
	}
	if orphan.Type != "backup-orphaned" {
		t.Fatalf("expected backup-orphaned alert, got %s", orphan.Type)
	}
	if _, ageExists := m.activeAlerts[unwantedAgeID]; ageExists {
		t.Fatalf("expected no backup-age alert for mismatched live guest type")
	}
}
// TestCheckBackupsStorageTypeMismatchCreatesOrphanedAlert feeds a storage
// backup of type "qemu" for a guest key whose live guest is an "lxc". It
// expects a backup-orphaned alert for that guest key and no backup-age alert.
func TestCheckBackupsStorageTypeMismatchCreatesOrphanedAlert(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  3,
		CriticalDays: 5,
	}
	m.mu.Unlock()

	qemuBackup := models.StorageBackup{
		ID:       "pve1-node1-101-backup",
		Storage:  "local",
		Node:     "node1",
		Instance: "pve1",
		Type:     "qemu",
		VMID:     101,
		Time:     time.Now().Add(-30 * 24 * time.Hour),
	}

	// Same instance/node/VMID as the backup, but the live guest is a container.
	guestKey := BuildGuestKey("pve1", "node1", 101)
	container := GuestLookup{
		ResourceID: "lxc/101",
		Name:       "ct-101",
		Instance:   "pve1",
		Node:       "node1",
		Type:       "lxc",
		VMID:       101,
	}
	guestsByKey := map[string]GuestLookup{guestKey: container}
	guestsByVMID := map[string][]GuestLookup{"101": {container}}

	m.CheckBackups([]models.StorageBackup{qemuBackup}, nil, nil, guestsByKey, guestsByVMID, nil)

	safeKey := sanitizeAlertKey(guestKey)
	wantOrphanID := "backup-orphaned-" + safeKey
	unwantedAgeID := "backup-age-" + safeKey

	m.mu.RLock()
	defer m.mu.RUnlock()

	orphan, ok := m.activeAlerts[wantOrphanID]
	if !ok {
		var present []string
		for id := range m.activeAlerts {
			present = append(present, id)
		}
		t.Fatalf("expected orphaned alert %q, found keys: %v", wantOrphanID, present)
	}
	if orphan.Type != "backup-orphaned" {
		t.Fatalf("expected backup-orphaned alert, got %s", orphan.Type)
	}
	if _, ageExists := m.activeAlerts[unwantedAgeID]; ageExists {
		t.Fatalf("expected no backup-age alert for mismatched live guest type")
	}
}
// TestCheckBackupsHandlesPmgBackups verifies that a PMG backup older than
// CriticalDays raises a critical backup-age alert even when the guest lookup
// maps are empty.
func TestCheckBackupsHandlesPmgBackups(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  5,
		CriticalDays: 7,
	}
	m.mu.Unlock()

	now := time.Now()
	// Eight days old: past the 7-day critical threshold configured above.
	pmgBackups := []models.PMGBackup{
		{
			ID:         "pmg-backup-mail-01",
			Instance:   "mail",
			Node:       "mail-gateway",
			Filename:   "pmg-backup_2024-01-01.tgz",
			BackupTime: now.Add(-8 * 24 * time.Hour),
			Size:       123456,
		},
	}
	m.CheckBackups(nil, nil, pmgBackups, map[string]GuestLookup{}, map[string][]GuestLookup{}, nil)

	// Use a deferred unlock: t.Fatalf leaves the test via runtime.Goexit, so a
	// manual RUnlock placed after the loop would be skipped on failure and the
	// read lock would still be held while cleanup runs.
	m.mu.RLock()
	defer m.mu.RUnlock()

	found := false
	for id, alert := range m.activeAlerts {
		if !strings.HasPrefix(id, "backup-age-") {
			continue
		}
		found = true
		if alert.Level != AlertLevelCritical {
			t.Fatalf("expected PMG backup alert to be critical")
		}
		break
	}
	if !found {
		t.Fatalf("expected PMG backup alert to be created")
	}
}
// TestCheckBackupsSkipsOrphanedWhenDisabled verifies that with AlertOrphaned
// set to false, a backup whose guest is absent from the inventory produces
// neither a backup-age nor a backup-orphaned alert.
func TestCheckBackupsSkipsOrphanedWhenDisabled(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	orphanAlertsOff := false
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   3,
		CriticalDays:  5,
		AlertOrphaned: &orphanAlertsOff,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	// Backup for VMID 200, which is not part of the guest inventory below.
	backups := []models.StorageBackup{
		{
			ID:       "inst-node-200-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     200,
			Time:     time.Now().Add(-6 * 24 * time.Hour),
		},
	}

	// A live sentinel guest keeps the inventory populated so that the orphan
	// path is actually exercised rather than skipped for lack of inventory.
	sentinel := GuestLookup{ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999}
	guestsByKey := map[string]GuestLookup{
		BuildGuestKey("inst", "snode", 9999): sentinel,
	}
	guestsByVMID := map[string][]GuestLookup{
		"9999": {sentinel},
	}

	m.CheckBackups(backups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()
	for id := range m.activeAlerts {
		switch {
		case strings.HasPrefix(id, "backup-age-"):
			t.Fatalf("expected orphaned backup to be skipped, found alert %s", id)
		case strings.HasPrefix(id, "backup-orphaned-"):
			t.Fatalf("expected orphaned backup alert to be suppressed when alertOrphaned=false, found alert %s", id)
		}
	}
}
// TestCheckBackupsCreatesOrphanedAlert verifies that a recent backup for a
// VMID missing from the live inventory raises a warning-level backup-orphaned
// alert with orphaned/vmid metadata, and that no backup-age alert is raised
// because the backup is younger than both age thresholds.
func TestCheckBackupsCreatesOrphanedAlert(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	orphanAlertsOn := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &orphanAlertsOn,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	// One day old: well under the 7/14-day age thresholds.
	backups := []models.StorageBackup{
		{
			ID:       "inst-node-200-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     200,
			Time:     time.Now().Add(-1 * 24 * time.Hour),
		},
	}

	// The sentinel guest makes the inventory non-empty; VMID 200 is still
	// absent from it and therefore orphaned.
	sentinel := GuestLookup{ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999}
	guestsByKey := map[string]GuestLookup{
		BuildGuestKey("inst", "snode", 9999): sentinel,
	}
	guestsByVMID := map[string][]GuestLookup{
		"9999": {sentinel},
	}

	m.CheckBackups(backups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()

	// Gather every orphaned alert ID first, then validate each of them.
	var orphanIDs []string
	for id := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-orphaned-") {
			orphanIDs = append(orphanIDs, id)
		}
	}
	if len(orphanIDs) == 0 {
		t.Fatalf("expected a backup-orphaned alert to be created for orphaned VMID 200")
	}
	for _, id := range orphanIDs {
		orphan := m.activeAlerts[id]
		if orphan.Type != "backup-orphaned" {
			t.Fatalf("expected alert type backup-orphaned, got %s", orphan.Type)
		}
		if orphan.Level != AlertLevelWarning {
			t.Fatalf("expected alert level warning, got %s", orphan.Level)
		}
		if orphan.Metadata == nil || orphan.Metadata["orphaned"] != true {
			t.Fatalf("expected metadata orphaned=true")
		}
		if orphan.Metadata["vmid"] != "200" {
			t.Fatalf("expected metadata vmid=200, got %v", orphan.Metadata["vmid"])
		}
	}

	// The backup is below the age thresholds, so no backup-age alert may exist.
	for id := range m.activeAlerts {
		if !strings.HasPrefix(id, "backup-age-") {
			continue
		}
		t.Fatalf("expected no backup-age alert for orphan below age threshold, found %s", id)
	}
}
// TestCheckBackupsOrphanedAlertClearsWhenGuestReappears runs two CheckBackups
// cycles: in the first, guest 300 is missing from the inventory and an
// orphaned alert is expected; in the second the guest is back and the
// orphaned alert must be gone.
func TestCheckBackupsOrphanedAlertClearsWhenGuestReappears(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	orphanAlertsOn := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &orphanAlertsOn,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	backups := []models.StorageBackup{
		{
			ID:       "inst-node-300-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     300,
			Time:     time.Now().Add(-1 * 24 * time.Hour),
		},
	}

	// Cycle 1: only a sentinel guest on the same instance is live, so the
	// backup for VMID 300 has no matching guest.
	sentinel := GuestLookup{ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999}
	sentinelByKey := map[string]GuestLookup{
		BuildGuestKey("inst", "snode", 9999): sentinel,
	}
	sentinelByVMID := map[string][]GuestLookup{
		"9999": {sentinel},
	}
	m.CheckBackups(backups, nil, nil, sentinelByKey, sentinelByVMID, nil)

	raised := false
	m.mu.RLock()
	for id := range m.activeAlerts {
		raised = raised || strings.HasPrefix(id, "backup-orphaned-")
	}
	m.mu.RUnlock()
	if !raised {
		t.Fatalf("expected orphaned alert after first cycle")
	}

	// Cycle 2: guest 300 is present in the inventory again.
	restored := GuestLookup{ResourceID: "qemu/300", Name: "restored-vm", Instance: "inst", Node: "node", Type: "qemu", VMID: 300}
	guestsByKey := map[string]GuestLookup{
		BuildGuestKey("inst", "node", 300): restored,
	}
	guestsByVMID := map[string][]GuestLookup{
		"300": {restored},
	}
	m.CheckBackups(backups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()
	for id := range m.activeAlerts {
		if !strings.HasPrefix(id, "backup-orphaned-") {
			continue
		}
		t.Fatalf("expected orphaned alert to be cleared after guest reappears, found %s", id)
	}
}
// TestCheckBackupsOrphanedIgnoresVMIDs verifies that the IgnoreVMIDs pattern
// "20*" suppresses the orphaned alert for VMID 200 while the orphaned alert
// for VMID 300 is still raised.
func TestCheckBackupsOrphanedIgnoresVMIDs(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	orphanAlertsOn := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &orphanAlertsOn,
		IgnoreVMIDs:   []string{"20*"},
	}
	m.mu.Unlock()

	dayOld := time.Now().Add(-1 * 24 * time.Hour)
	backups := []models.StorageBackup{
		{
			ID:       "inst-node-200-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     200,
			Time:     dayOld,
		},
		{
			ID:       "inst-node-300-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     300,
			Time:     dayOld,
		},
	}

	// Only the sentinel guest is live; both backed-up VMIDs are absent from
	// the inventory, but 200 matches the ignore pattern.
	sentinel := GuestLookup{ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999}
	guestsByKey := map[string]GuestLookup{
		BuildGuestKey("inst", "snode", 9999): sentinel,
	}
	guestsByVMID := map[string][]GuestLookup{
		"9999": {sentinel},
	}

	m.CheckBackups(backups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()

	// Single pass over the orphaned alerts: any ID mentioning 200 is a
	// failure; an ID mentioning 300 satisfies the positive expectation.
	saw300 := false
	for id := range m.activeAlerts {
		if !strings.HasPrefix(id, "backup-orphaned-") {
			continue
		}
		if strings.Contains(id, "200") {
			t.Fatalf("expected orphaned alert for VMID 200 to be suppressed by ignoreVMIDs, found %s", id)
		}
		if strings.Contains(id, "300") {
			saw300 = true
		}
	}
	if !saw300 {
		t.Fatalf("expected orphaned alert for VMID 300 (not in ignore list)")
	}
}
// TestCheckBackupsOrphanedWithZeroAgeThresholds verifies that orphaned alerts
// still fire when WarningDays and CriticalDays are both zero, and that no
// backup-age alert is produced in that configuration.
func TestCheckBackupsOrphanedWithZeroAgeThresholds(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	orphanAlertsOn := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   0,
		CriticalDays:  0,
		AlertOrphaned: &orphanAlertsOn,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	backups := []models.StorageBackup{
		{
			ID:       "inst-node-400-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     400,
			Time:     time.Now().Add(-1 * 24 * time.Hour),
		},
	}

	// Sentinel guest keeps the inventory populated; VMID 400 is not in it.
	sentinel := GuestLookup{ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999}
	guestsByKey := map[string]GuestLookup{
		BuildGuestKey("inst", "snode", 9999): sentinel,
	}
	guestsByVMID := map[string][]GuestLookup{
		"9999": {sentinel},
	}

	m.CheckBackups(backups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()

	// Classify the active alerts in one pass, then assert in the same order
	// as the expectations: orphaned alert present first, no age alert second.
	orphanSeen := false
	ageSeen := false
	ageID := ""
	for id := range m.activeAlerts {
		switch {
		case strings.HasPrefix(id, "backup-orphaned-"):
			orphanSeen = true
		case strings.HasPrefix(id, "backup-age-"):
			if !ageSeen {
				ageSeen = true
				ageID = id
			}
		}
	}
	if !orphanSeen {
		t.Fatalf("expected backup-orphaned alert even with zero age thresholds")
	}
	if ageSeen {
		t.Fatalf("expected no backup-age alert with zero thresholds, found %s", ageID)
	}
}
// TestCheckBackupsOrphanedWithPersistedMetadata covers the case where
// guestsByVMID contains a metadata-only entry (empty ResourceID) for a deleted
// guest. Such an entry must not suppress the orphaned alert; only entries with
// a non-empty ResourceID count as live guests.
func TestCheckBackupsOrphanedWithPersistedMetadata(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	orphanAlertsOn := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &orphanAlertsOn,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	backups := []models.StorageBackup{
		{
			ID:       "inst-node-500-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     500,
			Time:     time.Now().Add(-1 * 24 * time.Hour),
		},
	}

	// Live sentinel guest on the same instance, plus a metadata-only record
	// for deleted VMID 500 (note the missing ResourceID).
	sentinel := GuestLookup{ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999}
	deletedMeta := GuestLookup{Name: "deleted-vm", Instance: "inst", Node: "node", Type: "qemu", VMID: 500}
	guestsByKey := map[string]GuestLookup{
		BuildGuestKey("inst", "snode", 9999): sentinel,
	}
	guestsByVMID := map[string][]GuestLookup{
		"500":  {deletedMeta},
		"9999": {sentinel},
	}

	m.CheckBackups(backups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()

	orphanSeen := false
	for id := range m.activeAlerts {
		orphanSeen = orphanSeen || strings.HasPrefix(id, "backup-orphaned-")
	}
	if !orphanSeen {
		t.Fatalf("expected backup-orphaned alert even when guestsByVMID has metadata-only entry (no ResourceID)")
	}
}
// TestCheckBackupsOrphanedSkippedWhenNoLiveInventory verifies that no
// orphaned alert is raised when the guest maps are completely empty. This
// guards against false positives while inventory is unavailable (startup
// race, auth failure, inventory outage).
func TestCheckBackupsOrphanedSkippedWhenNoLiveInventory(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	orphanAlertsOn := true
	defaults := BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &orphanAlertsOn,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = defaults
	m.mu.Unlock()

	backup := models.StorageBackup{
		ID:       "inst-node-600-backup",
		Storage:  "local",
		Node:     "node",
		Instance: "inst",
		Type:     "qemu",
		VMID:     600,
		Time:     time.Now().Add(-1 * 24 * time.Hour),
	}

	// Empty (non-nil) guest maps: there is no live inventory at all.
	noGuestsByKey := map[string]GuestLookup{}
	noGuestsByVMID := map[string][]GuestLookup{}
	m.CheckBackups([]models.StorageBackup{backup}, nil, nil, noGuestsByKey, noGuestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()
	for id := range m.activeAlerts {
		if !strings.HasPrefix(id, "backup-orphaned-") {
			continue
		}
		t.Fatalf("expected no orphaned alerts when guest inventory is empty (startup race guard), found %s", id)
	}
}
// TestCheckBackupsOrphanedPreservedWhenNoLiveInventory verifies that an
// orphaned alert raised while inventory was available survives a later cycle
// in which the inventory is empty: with no inventory the alert cannot be
// confirmed as resolved, so it must not be cleared.
func TestCheckBackupsOrphanedPreservedWhenNoLiveInventory(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	orphanAlertsOn := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &orphanAlertsOn,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	backups := []models.StorageBackup{
		{
			ID:       "inst-node-700-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     700,
			Time:     time.Now().Add(-1 * 24 * time.Hour),
		},
	}

	// orphanActive reports whether any backup-orphaned alert is currently
	// active, reading the alert map under the manager's read lock.
	orphanActive := func() bool {
		m.mu.RLock()
		defer m.mu.RUnlock()
		active := false
		for id := range m.activeAlerts {
			if strings.HasPrefix(id, "backup-orphaned-") {
				active = true
			}
		}
		return active
	}

	// Cycle 1: inventory holds a live sentinel guest, VMID 700 is missing.
	sentinel := GuestLookup{ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999}
	guestsByKey := map[string]GuestLookup{
		BuildGuestKey("inst", "snode", 9999): sentinel,
	}
	guestsByVMID := map[string][]GuestLookup{
		"9999": {sentinel},
	}
	m.CheckBackups(backups, nil, nil, guestsByKey, guestsByVMID, nil)
	if !orphanActive() {
		t.Fatalf("expected orphan alert after first cycle with live inventory")
	}

	// Cycle 2: the inventory vanishes (empty maps).
	m.CheckBackups(backups, nil, nil, map[string]GuestLookup{}, map[string][]GuestLookup{}, nil)
	if !orphanActive() {
		t.Fatalf("expected orphan alert to be preserved when inventory is unavailable, but it was cleared")
	}
}
// TestCheckBackupsOrphanedCrossInstanceVMID verifies that a storage backup
// from instance A for a deleted guest (VMID 600) is still reported as
// orphaned when a different instance B has a live guest with the same VMID.
func TestCheckBackupsOrphanedCrossInstanceVMID(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	orphanAlertsOn := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &orphanAlertsOn,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	// The backup belongs to instance A.
	backupA := models.StorageBackup{
		ID:       "instA-nodeA-600-backup",
		Storage:  "local",
		Node:     "nodeA",
		Instance: "instA",
		Type:     "qemu",
		VMID:     600,
		Time:     time.Now().Add(-1 * 24 * time.Hour),
	}

	// Instance B owns a live guest with VMID 600; instance A only has a
	// sentinel guest so that its inventory counts as populated.
	liveOnB := GuestLookup{ResourceID: "qemu/600", Name: "vm-instB", Instance: "instB", Node: "nodeB", Type: "qemu", VMID: 600}
	sentinelOnA := GuestLookup{ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "instA", Node: "nodeA", Type: "qemu", VMID: 9999}
	guestsByKey := map[string]GuestLookup{
		BuildGuestKey("instB", "nodeB", 600):  liveOnB,
		BuildGuestKey("instA", "nodeA", 9999): sentinelOnA,
	}
	guestsByVMID := map[string][]GuestLookup{
		"600":  {liveOnB},
		"9999": {sentinelOnA},
	}

	m.CheckBackups([]models.StorageBackup{backupA}, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()

	orphanSeen := false
	for id := range m.activeAlerts {
		orphanSeen = orphanSeen || strings.HasPrefix(id, "backup-orphaned-")
	}
	if !orphanSeen {
		t.Fatalf("expected backup-orphaned alert for instA even though instB has a live guest with the same VMID")
	}
}
// TestCheckBackupsSkipsPVEOrphanDetectionUntilTemplateInventoryReady verifies
// that no orphaned alert is raised when CheckBackups receives an empty
// (non-nil) map as its final argument, even though the instance already has a
// live guest. The test models a startup window in which template inventory
// has not been populated yet.
func TestCheckBackupsSkipsPVEOrphanDetectionUntilTemplateInventoryReady(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	orphanAlertsOn := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &orphanAlertsOn,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	backups := []models.StorageBackup{
		{
			ID:       "instA-node2-700-backup",
			Storage:  "local",
			Node:     "node2",
			Instance: "instA",
			Type:     "qemu",
			VMID:     700,
			Time:     time.Now().Add(-1 * 24 * time.Hour),
		},
	}

	// The instance has live guest data (the sentinel), but the readiness map
	// passed below is empty, so the backup must not be classified as orphaned.
	sentinel := GuestLookup{
		ResourceID: "qemu/9999",
		Name:       "sentinel-vm",
		Instance:   "instA",
		Node:       "node3",
		Type:       "qemu",
		VMID:       9999,
	}
	guestsByKey := map[string]GuestLookup{
		BuildGuestKey("instA", "node3", 9999): sentinel,
	}
	guestsByVMID := map[string][]GuestLookup{
		"9999": {sentinel},
	}
	notReady := map[string]bool{}

	m.CheckBackups(backups, nil, nil, guestsByKey, guestsByVMID, notReady)

	m.mu.RLock()
	defer m.mu.RUnlock()
	for id := range m.activeAlerts {
		if !strings.HasPrefix(id, "backup-orphaned-") {
			continue
		}
		t.Fatalf("expected no orphaned alert before template inventory is ready, found %s", id)
	}
}
// TestCheckBackupsIgnoresVMIDs verifies that the IgnoreVMIDs pattern "10*"
// suppresses the backup-age alert for VMID 101 while an equally stale backup
// for VMID 200 still raises one.
func TestCheckBackupsIgnoresVMIDs(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	orphanAlertsOn := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   1,
		CriticalDays:  2,
		AlertOrphaned: &orphanAlertsOn,
		IgnoreVMIDs:   []string{"10*"},
	}
	m.mu.Unlock()

	// Both backups are three days old, beyond the 2-day critical threshold.
	staleAt := time.Now().Add(-3 * 24 * time.Hour)
	backups := []models.StorageBackup{
		{
			ID:       "inst-node-101-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     101,
			Time:     staleAt,
		},
		{
			ID:       "inst-node-200-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     200,
			Time:     staleAt,
		},
	}

	// Both guests are live, so only the ignore list distinguishes them.
	ignoredKey := BuildGuestKey("inst", "node", 101)
	allowedKey := BuildGuestKey("inst", "node", 200)
	ignoredGuest := GuestLookup{ResourceID: "qemu/101", Name: "ignored-vm", Instance: "inst", Node: "node", Type: "qemu", VMID: 101}
	allowedGuest := GuestLookup{ResourceID: "qemu/200", Name: "allowed-vm", Instance: "inst", Node: "node", Type: "qemu", VMID: 200}
	guestsByKey := map[string]GuestLookup{
		ignoredKey: ignoredGuest,
		allowedKey: allowedGuest,
	}
	guestsByVMID := map[string][]GuestLookup{
		"101": {ignoredGuest},
		"200": {allowedGuest},
	}

	m.CheckBackups(backups, nil, nil, guestsByKey, guestsByVMID, nil)

	ignoredAlertID := "backup-age-" + sanitizeAlertKey(ignoredKey)
	allowedAlertID := "backup-age-" + sanitizeAlertKey(allowedKey)

	m.mu.RLock()
	_, ignoredRaised := m.activeAlerts[ignoredAlertID]
	_, allowedRaised := m.activeAlerts[allowedAlertID]
	m.mu.RUnlock()

	if ignoredRaised {
		t.Fatalf("expected backup alert for ignored VMID to be suppressed")
	}
	if !allowedRaised {
		t.Fatalf("expected backup alert for non-ignored VMID")
	}
}
// TestCheckDockerHostIgnoresContainersByPrefix verifies that an exited
// container whose name matches DockerIgnoredContainerPrefixes neither raises a
// container-state alert nor gets an entry in the state-confirmation tracker.
func TestCheckDockerHostIgnoresContainersByPrefix(t *testing.T) {
	m := newTestManager(t)

	m.mu.Lock()
	m.config.DockerIgnoredContainerPrefixes = []string{"runner-"}
	m.mu.Unlock()

	container := models.DockerContainer{
		ID:     "1234567890ab",
		Name:   "runner-auto-1",
		State:  "exited",
		Status: "Exited (0) 3 seconds ago",
	}
	host := models.DockerHost{
		ID:          "host-ephemeral",
		Hostname:    "ci-host",
		DisplayName: "CI Host",
		Containers:  []models.DockerContainer{container},
	}
	resourceID := dockerResourceID(host.ID, container.ID)
	alertID := fmt.Sprintf("docker-container-state-%s", resourceID)

	// Run twice to satisfy the confirmation threshold when not ignored
	m.CheckDockerHost(host)
	m.CheckDockerHost(host)

	// Read manager state under the read lock, as the other tests in this
	// file do, so the test stays clean under the race detector. The deferred
	// unlock also covers the t.Fatalf exits below.
	m.mu.RLock()
	defer m.mu.RUnlock()
	if _, exists := m.activeAlerts[alertID]; exists {
		t.Fatalf("expected no state alert for ignored container")
	}
	if _, exists := m.dockerStateConfirm[resourceID]; exists {
		t.Fatalf("expected no state confirmation tracking for ignored container")
	}
}
// TestDockerServiceReplicaAlerts verifies that a replicated service running 2
// of 4 desired tasks raises a critical docker-service-health alert carrying
// missingTasks=2 metadata, and that the alert clears once all replicas run.
func TestDockerServiceReplicaAlerts(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.RLock()
	cfg := m.config
	m.mu.RUnlock()
	cfg.Enabled = true
	m.UpdateConfig(cfg)

	host := models.DockerHost{
		ID:          "host-1",
		DisplayName: "Prod Swarm",
		Hostname:    "swarm-prod",
		Services: []models.DockerService{
			{
				ID:           "svc-1",
				Name:         "web",
				DesiredTasks: 4,
				RunningTasks: 2,
				Mode:         "replicated",
			},
		},
	}
	m.CheckDockerHost(host)

	resourceID := dockerServiceResourceID(host.ID, "svc-1", "web")
	alertID := fmt.Sprintf("docker-service-health-%s", resourceID)

	// Inspect the alert under the read lock. The closure scopes the lock so
	// it is released (even on t.Fatalf) before CheckDockerHost runs again.
	func() {
		m.mu.RLock()
		defer m.mu.RUnlock()

		alert, exists := m.activeAlerts[alertID]
		if !exists {
			t.Fatalf("expected service alert %s to be raised", alertID)
		}
		if alert.Level != AlertLevelCritical {
			t.Fatalf("expected critical severity, got %s", alert.Level)
		}
		if missing, ok := alert.Metadata["missingTasks"].(int); !ok || missing != 2 {
			t.Fatalf("expected missingTasks metadata to be 2, got %v", alert.Metadata["missingTasks"])
		}
	}()

	// Resolve by restoring replicas
	host.Services[0].RunningTasks = 4
	m.CheckDockerHost(host)

	m.mu.RLock()
	defer m.mu.RUnlock()
	if _, exists := m.activeAlerts[alertID]; exists {
		t.Fatalf("expected service alert %s to be cleared when replicas restored", alertID)
	}
}
// TestDockerServiceAlertDoesNotRenotifyWhenUnchanged verifies that the alert
// callback fires once for a degraded service and is not invoked again when
// the same degraded state is reported on the next check.
func TestDockerServiceAlertDoesNotRenotifyWhenUnchanged(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	cfg := m.GetConfig()
	cfg.Enabled = true
	cfg.ActivationState = ActivationActive
	cfg.Schedule.MaxAlertsHour = 100
	m.UpdateConfig(cfg)

	// Buffered so the callback never blocks on the test goroutine.
	notified := make(chan string, 4)
	m.SetAlertCallback(func(a *Alert) {
		notified <- a.ID
	})

	degraded := models.DockerService{
		ID:           "svc-1",
		Name:         "web",
		DesiredTasks: 4,
		RunningTasks: 2,
		Mode:         "replicated",
	}
	host := models.DockerHost{
		ID:          "host-1",
		DisplayName: "Prod Swarm",
		Hostname:    "swarm-prod",
		Services:    []models.DockerService{degraded},
	}

	// First check: the initial notification must arrive within a second.
	m.CheckDockerHost(host)
	select {
	case <-notified:
	case <-time.After(time.Second):
		t.Fatal("expected initial docker service alert notification")
	}

	// Second check with identical state: the alert may be refreshed, but no
	// further notification is allowed within the observation window.
	m.CheckDockerHost(host)
	select {
	case repeatID := <-notified:
		t.Fatalf("expected no second notification for unchanged service alert, got %s", repeatID)
	case <-time.After(250 * time.Millisecond):
	}
}
// TestDockerServiceAlertPreservesLastNotifiedWhenUnchanged verifies that
// re-checking a service in the same degraded state keeps the alert's
// LastNotified timestamp instead of resetting it.
func TestDockerServiceAlertPreservesLastNotifiedWhenUnchanged(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	cfg := m.GetConfig()
	cfg.Enabled = true
	cfg.ActivationState = ActivationActive
	cfg.Schedule.MaxAlertsHour = 100
	m.UpdateConfig(cfg)

	host := models.DockerHost{
		ID:          "host-1",
		DisplayName: "Prod Swarm",
		Hostname:    "swarm-prod",
		Services: []models.DockerService{
			{
				ID:           "svc-1",
				Name:         "web",
				DesiredTasks: 4,
				RunningTasks: 2,
				Mode:         "replicated",
			},
		},
	}
	m.CheckDockerHost(host)

	resourceID := dockerServiceResourceID(host.ID, "svc-1", "web")
	alertID := fmt.Sprintf("docker-service-health-%s", resourceID)
	notifiedAt := time.Now().Add(-2 * time.Minute).UTC()

	// Seed LastNotified under the write lock: the alert is shared manager
	// state. The closure releases the lock (also on t.Fatalf) before the
	// next CheckDockerHost call.
	func() {
		m.mu.Lock()
		defer m.mu.Unlock()

		alert, exists := m.activeAlerts[alertID]
		if !exists {
			t.Fatalf("expected service alert %s to be raised", alertID)
		}
		alert.LastNotified = &notifiedAt
	}()

	// Same degraded state should keep LastNotified while refreshing state.
	m.CheckDockerHost(host)

	m.mu.RLock()
	defer m.mu.RUnlock()
	updated, exists := m.activeAlerts[alertID]
	if !exists {
		t.Fatalf("expected service alert %s to remain active", alertID)
	}
	if updated.LastNotified == nil {
		t.Fatal("expected LastNotified to be preserved, got nil")
	}
	if !updated.LastNotified.Equal(notifiedAt) {
		t.Fatalf("expected LastNotified %s, got %s", notifiedAt, updated.LastNotified)
	}
}
// TestDockerServiceAlertRenotifiesOnEscalationToCritical verifies that a
// service alert first notifies at warning level and notifies again when the
// replica gap grows enough to escalate the alert to critical.
func TestDockerServiceAlertRenotifiesOnEscalationToCritical(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	cfg := m.GetConfig()
	cfg.Enabled = true
	cfg.ActivationState = ActivationActive
	cfg.Schedule.MaxAlertsHour = 100
	cfg.DockerDefaults.ServiceWarnGapPct = 10
	cfg.DockerDefaults.ServiceCritGapPct = 50
	m.UpdateConfig(cfg)

	// Each notification reports the level of the alert it was sent for.
	levels := make(chan AlertLevel, 4)
	m.SetAlertCallback(func(a *Alert) {
		levels <- a.Level
	})

	// 3 of 4 tasks running: 25% missing, between the 10% warning and 50%
	// critical gaps configured above.
	web := models.DockerService{
		ID:           "svc-1",
		Name:         "web",
		DesiredTasks: 4,
		RunningTasks: 3,
		Mode:         "replicated",
	}
	host := models.DockerHost{
		ID:          "host-1",
		DisplayName: "Prod Swarm",
		Hostname:    "swarm-prod",
		Services:    []models.DockerService{web},
	}

	m.CheckDockerHost(host)
	select {
	case got := <-levels:
		if got != AlertLevelWarning {
			t.Fatalf("expected warning notification first, got %s", got)
		}
	case <-time.After(time.Second):
		t.Fatal("expected initial warning notification")
	}

	// 1 of 4 tasks running: 75% missing, past the critical gap. The
	// escalation must trigger a second notification.
	host.Services[0].RunningTasks = 1
	m.CheckDockerHost(host)
	select {
	case got := <-levels:
		if got != AlertLevelCritical {
			t.Fatalf("expected critical escalation notification, got %s", got)
		}
	case <-time.After(time.Second):
		t.Fatal("expected escalation notification")
	}
}
// TestDockerServiceUpdateStateAlert verifies that a service whose update
// status reports "rollback_failed" raises a critical docker-service-health
// alert carrying the update state in its metadata, even though every desired
// task is running.
func TestDockerServiceUpdateStateAlert(t *testing.T) {
	m := newTestManager(t)
	cfg := m.GetConfig()
	cfg.Enabled = true
	m.UpdateConfig(cfg)

	now := time.Now()
	host := models.DockerHost{
		ID:          "host-update",
		DisplayName: "Swarm",
		Hostname:    "swarm.local",
		Services: []models.DockerService{
			{
				ID:           "svc-update",
				Name:         "api",
				DesiredTasks: 1,
				RunningTasks: 1,
				UpdateStatus: &models.DockerServiceUpdate{
					State:       "rollback_failed",
					Message:     "Rollback failed",
					CompletedAt: &now,
				},
			},
		},
	}
	m.CheckDockerHost(host)

	resourceID := dockerServiceResourceID(host.ID, "svc-update", "api")
	alertID := fmt.Sprintf("docker-service-health-%s", resourceID)

	// Read manager state under the read lock, as the other tests in this file
	// do when inspecting activeAlerts. The deferred unlock also runs when
	// t.Fatalf aborts the test.
	m.mu.RLock()
	defer m.mu.RUnlock()

	alert, exists := m.activeAlerts[alertID]
	if !exists {
		t.Fatalf("expected docker service alert %s to be raised", alertID)
	}
	if alert.Level != AlertLevelCritical {
		t.Fatalf("expected critical severity for rollback failure, got %s", alert.Level)
	}
	if state, ok := alert.Metadata["updateState"].(string); !ok || state != "rollback_failed" {
		t.Fatalf("expected updateState metadata to be rollback_failed, got %v", alert.Metadata["updateState"])
	}
}
// TestDockerContainerStateUsesDockerDefaults verifies that an exited container
// raises a docker-container-state alert whose severity comes from
// DockerDefaults.StatePoweredOffSeverity.
func TestDockerContainerStateUsesDockerDefaults(t *testing.T) {
	m := newTestManager(t)
	cfg := m.GetConfig()
	cfg.DockerDefaults.StatePoweredOffSeverity = AlertLevelCritical
	m.UpdateConfig(cfg)

	container := models.DockerContainer{
		ID:     "container-1",
		Name:   "web",
		State:  "exited",
		Status: "Exited (1) seconds ago",
	}
	host := models.DockerHost{
		ID:          "host-1",
		DisplayName: "Docker Host",
		Hostname:    "docker.local",
		Containers:  []models.DockerContainer{container},
	}

	// The host is evaluated twice before the alert is asserted.
	m.CheckDockerHost(host)
	m.CheckDockerHost(host)

	resourceID := dockerResourceID(host.ID, container.ID)
	alertID := fmt.Sprintf("docker-container-state-%s", resourceID)

	// Read manager state under the read lock, as the other tests in this file
	// do when inspecting activeAlerts.
	m.mu.RLock()
	defer m.mu.RUnlock()

	alert, exists := m.activeAlerts[alertID]
	if !exists {
		t.Fatalf("expected docker container state alert %s to be raised", alertID)
	}
	if alert.Level != AlertLevelCritical {
		t.Fatalf("expected critical severity from docker defaults, got %s", alert.Level)
	}
}
// TestDockerContainerStateRespectsDisableDefault verifies that no
// docker-container-state alert is raised for an exited container when
// DockerDefaults.StateDisableConnectivity is set.
func TestDockerContainerStateRespectsDisableDefault(t *testing.T) {
	m := newTestManager(t)
	cfg := m.GetConfig()
	cfg.DockerDefaults.StateDisableConnectivity = true
	m.UpdateConfig(cfg)

	container := models.DockerContainer{
		ID:     "container-2",
		Name:   "batch",
		State:  "exited",
		Status: "Exited (0) seconds ago",
	}
	host := models.DockerHost{
		ID:          "host-2",
		DisplayName: "Docker Host",
		Hostname:    "docker.example",
		Containers:  []models.DockerContainer{container},
	}

	// The host is evaluated twice before asserting that nothing was raised.
	m.CheckDockerHost(host)
	m.CheckDockerHost(host)

	resourceID := dockerResourceID(host.ID, container.ID)
	alertID := fmt.Sprintf("docker-container-state-%s", resourceID)

	// Read manager state under the read lock, as the other tests in this file
	// do when inspecting activeAlerts.
	m.mu.RLock()
	defer m.mu.RUnlock()

	if _, exists := m.activeAlerts[alertID]; exists {
		t.Fatalf("did not expect docker container state alert when defaults disable connectivity")
	}
}
// TestDockerContainerMemoryLimitHysteresis verifies that a container using 96%
// of its memory limit raises a docker-container-memory-limit alert, and that
// the alert clears once usage drops to 80% of the limit.
func TestDockerContainerMemoryLimitHysteresis(t *testing.T) {
	m := newTestManager(t)

	hostID := "host-mem"
	containerID := "container-mem"
	resourceID := dockerResourceID(hostID, containerID)
	alertID := fmt.Sprintf("docker-container-memory-limit-%s", resourceID)

	// alertActive reports whether the memory-limit alert is currently active.
	// The lookup is done under the manager's read lock, and the lock is
	// released before returning so later CheckDockerHost calls are not blocked.
	alertActive := func() bool {
		m.mu.RLock()
		defer m.mu.RUnlock()
		_, exists := m.activeAlerts[alertID]
		return exists
	}

	hostHigh := models.DockerHost{
		ID:          hostID,
		DisplayName: "Docker Host",
		Hostname:    "docker.mem",
		Containers: []models.DockerContainer{
			{
				ID:          containerID,
				Name:        "memory-hog",
				State:       "running",
				Status:      "Up 10 minutes",
				MemoryUsage: 96 * 1024 * 1024,
				MemoryLimit: 100 * 1024 * 1024,
			},
		},
	}
	m.CheckDockerHost(hostHigh)
	if !alertActive() {
		t.Fatalf("expected memory limit alert to be raised")
	}

	hostLow := models.DockerHost{
		ID:          hostID,
		DisplayName: "Docker Host",
		Hostname:    "docker.mem",
		Containers: []models.DockerContainer{
			{
				ID:          containerID,
				Name:        "memory-hog",
				State:       "running",
				Status:      "Up 12 minutes",
				MemoryUsage: 80 * 1024 * 1024,
				MemoryLimit: 100 * 1024 * 1024,
			},
		},
	}
	m.CheckDockerHost(hostLow)
	if alertActive() {
		t.Fatalf("expected memory limit alert to clear after usage dropped below hysteresis threshold")
	}
}
// TestDockerContainerDiskUsageAlert verifies that a container whose writable
// layer is 8 GiB of a 10 GiB root filesystem raises a warning disk alert with
// populated metadata (trigger 75, clear 65), and that the alert clears after
// the writable layer drops to 4 GiB.
func TestDockerContainerDiskUsageAlert(t *testing.T) {
	m := newTestManager(t)
	cfg := m.GetConfig()
	cfg.Enabled = true
	// Zero out the time thresholds the test touches before the first check.
	cfg.TimeThreshold = 0
	if cfg.TimeThresholds == nil {
		cfg.TimeThresholds = make(map[string]int)
	}
	cfg.TimeThresholds["docker"] = 0
	cfg.TimeThresholds["guest"] = 0
	cfg.DockerDefaults.Disk = HysteresisThreshold{Trigger: 75, Clear: 65}
	m.UpdateConfig(cfg)

	const gib = 1024 * 1024 * 1024
	host := models.DockerHost{
		ID:          "host-disk",
		DisplayName: "Docker Host",
		Hostname:    "docker.disk",
		Containers: []models.DockerContainer{
			{
				ID:                  "container-disk",
				Name:                "disk-hog",
				State:               "running",
				Status:              "Up 5 minutes",
				WritableLayerBytes:  int64(8 * gib),
				RootFilesystemBytes: int64(10 * gib),
			},
		},
	}
	m.CheckDockerHost(host)

	resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
	alertID := fmt.Sprintf("%s-%s", resourceID, "disk")

	// Inspect the raised alert under the read lock. The closure scopes the
	// lock so it is released (also on t.Fatalf) before the next
	// CheckDockerHost call.
	func() {
		m.mu.RLock()
		defer m.mu.RUnlock()

		alert, exists := m.activeAlerts[alertID]
		if !exists {
			t.Fatalf("expected docker container disk alert %s to be raised", alertID)
		}
		if alert.Level != AlertLevelWarning {
			t.Fatalf("expected warning severity for disk usage alert, got %s", alert.Level)
		}
		if alert.Metadata == nil {
			t.Fatalf("expected disk alert metadata to be populated")
		}
		if percent, ok := alert.Metadata["diskPercent"].(float64); !ok || percent < 79.5 || percent > 80.5 {
			t.Fatalf("expected diskPercent metadata to be ~80%%, got %v", alert.Metadata["diskPercent"])
		}
		if used, ok := alert.Metadata["writableLayerBytes"].(int64); !ok || used != int64(8*gib) {
			t.Fatalf("expected writableLayerBytes metadata to be %d, got %v", int64(8*gib), alert.Metadata["writableLayerBytes"])
		}
	}()

	// Drop usage below the clear threshold and ensure the alert resolves.
	host.Containers[0].WritableLayerBytes = int64(4 * gib)
	m.CheckDockerHost(host)

	m.mu.RLock()
	defer m.mu.RUnlock()
	if _, stillActive := m.activeAlerts[alertID]; stillActive {
		t.Fatalf("expected docker container disk alert %s to clear after usage dropped", alertID)
	}
}
// TestUpdateConfigClampsDockerServiceCriticalGap verifies that UpdateConfig
// raises a Docker service critical gap configured below the warning gap up to
// the warning gap, leaving the warning gap itself untouched.
// t.Parallel() is not enabled for this test.
func TestUpdateConfigClampsDockerServiceCriticalGap(t *testing.T) {
	m := newTestManager(t)

	// Critical gap (20) is configured below the warning gap (35). Every other
	// section keeps its zero value; Overrides is a non-nil empty map.
	m.UpdateConfig(AlertConfig{
		Enabled: true,
		DockerDefaults: DockerThresholdConfig{
			ServiceWarnGapPct: 35,
			ServiceCritGapPct: 20,
		},
		Overrides: make(map[string]ThresholdConfig),
	})

	m.mu.RLock()
	defer m.mu.RUnlock()

	if warn := m.config.DockerDefaults.ServiceWarnGapPct; warn != 35 {
		t.Fatalf("expected warning gap to remain 35, got %d", warn)
	}
	if crit := m.config.DockerDefaults.ServiceCritGapPct; crit != 35 {
		t.Fatalf("expected critical gap to be clamped to 35, got %d", crit)
	}
}
// TestDockerServiceAlertUsesClampedCriticalGap configures a critical gap (5)
// below the warning gap (20) and verifies that a service running 7 of 10
// desired tasks raises a critical docker-service-health alert whose
// percentMissing metadata is approximately 30.
func TestDockerServiceAlertUsesClampedCriticalGap(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	cfg := AlertConfig{
		Enabled:        true,
		GuestDefaults:  ThresholdConfig{},
		NodeDefaults:   ThresholdConfig{},
		HostDefaults:   ThresholdConfig{},
		StorageDefault: HysteresisThreshold{},
		DockerDefaults: DockerThresholdConfig{
			ServiceWarnGapPct: 20,
			ServiceCritGapPct: 5,
		},
		PMGDefaults:      PMGThresholdConfig{},
		SnapshotDefaults: SnapshotAlertConfig{},
		BackupDefaults:   BackupAlertConfig{},
		Overrides:        make(map[string]ThresholdConfig),
		Schedule:         ScheduleConfig{},
	}
	m.UpdateConfig(cfg)

	host := models.DockerHost{
		ID:          "docker-host-1",
		DisplayName: "Docker Host",
		Hostname:    "docker-host.local",
		Services: []models.DockerService{
			{
				ID:           "svc-123",
				Name:         "api",
				DesiredTasks: 10,
				RunningTasks: 7,
			},
		},
	}
	m.CheckDockerHost(host)

	resourceID := dockerServiceResourceID(host.ID, "svc-123", "api")
	alertID := fmt.Sprintf("docker-service-health-%s", resourceID)

	// Read manager state under the read lock, as the other tests in this file
	// do when inspecting activeAlerts.
	m.mu.RLock()
	defer m.mu.RUnlock()

	alert, exists := m.activeAlerts[alertID]
	if !exists {
		t.Fatalf("expected docker service alert %s to be raised", alertID)
	}
	if alert.Level != AlertLevelCritical {
		t.Fatalf("expected critical severity when replicas 7/10, got %s", alert.Level)
	}
	if pct, ok := alert.Metadata["percentMissing"].(float64); !ok || math.Abs(pct-30.0) > 0.01 {
		t.Fatalf("expected percentMissing metadata ~30, got %v", alert.Metadata["percentMissing"])
	}
}
// TestNormalizeHostDefaultsPreservesZeroTrigger covers how UpdateConfig
// normalizes Host Agent thresholds (GitHub issue #864): empty defaults are
// filled in, a Trigger of 0 is kept as "disabled" and forces Clear to 0, and a
// missing Clear is derived from the Trigger.
// Subtests run sequentially; t.Parallel() is not enabled.
func TestNormalizeHostDefaultsPreservesZeroTrigger(t *testing.T) {
	t.Run("nil HostDefaults get factory defaults", func(t *testing.T) {
		m := newTestManager(t)
		m.UpdateConfig(AlertConfig{
			Enabled:      true,
			HostDefaults: ThresholdConfig{}, // Empty - needs defaults
		})

		m.mu.RLock()
		defer m.mu.RUnlock()
		got := m.config.HostDefaults

		if got.CPU == nil {
			t.Fatal("CPU defaults should be set")
		}
		if got.CPU.Trigger != 80 {
			t.Errorf("CPU trigger = %v, want 80", got.CPU.Trigger)
		}
		if got.Memory == nil {
			t.Fatal("Memory defaults should be set")
		}
		if got.Memory.Trigger != 85 {
			t.Errorf("Memory trigger = %v, want 85", got.Memory.Trigger)
		}
		if got.Disk == nil {
			t.Fatal("Disk defaults should be set")
		}
		if got.Disk.Trigger != 90 {
			t.Errorf("Disk trigger = %v, want 90", got.Disk.Trigger)
		}
	})

	t.Run("Trigger=0 preserved to disable alerting", func(t *testing.T) {
		m := newTestManager(t)

		// Memory is set to 0 to disable memory alerting for host agents.
		m.UpdateConfig(AlertConfig{
			Enabled: true,
			HostDefaults: ThresholdConfig{
				CPU:    &HysteresisThreshold{Trigger: 80, Clear: 75},
				Memory: &HysteresisThreshold{Trigger: 0, Clear: 0}, // Disabled
				Disk:   &HysteresisThreshold{Trigger: 90, Clear: 85},
			},
		})

		m.mu.RLock()
		defer m.mu.RUnlock()
		got := m.config.HostDefaults

		// The memory threshold must stay at 0 rather than fall back to a default.
		if got.Memory == nil {
			t.Fatal("Memory defaults should be preserved (not nil)")
		}
		if got.Memory.Trigger != 0 {
			t.Errorf("Memory trigger = %v, want 0 (disabled)", got.Memory.Trigger)
		}
		if got.Memory.Clear != 0 {
			t.Errorf("Memory clear = %v, want 0 (disabled)", got.Memory.Clear)
		}

		// CPU and Disk keep the values that were supplied.
		if got.CPU.Trigger != 80 {
			t.Errorf("CPU trigger = %v, want 80", got.CPU.Trigger)
		}
		if got.Disk.Trigger != 90 {
			t.Errorf("Disk trigger = %v, want 90", got.Disk.Trigger)
		}
	})

	t.Run("Trigger=0 sets Clear=0 automatically", func(t *testing.T) {
		m := newTestManager(t)

		// CPU and Disk pair a zero Trigger with a non-zero Clear; the Clear
		// value is expected to be normalized to 0.
		m.UpdateConfig(AlertConfig{
			Enabled: true,
			HostDefaults: ThresholdConfig{
				CPU:    &HysteresisThreshold{Trigger: 0, Clear: 50}, // Clear should become 0
				Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
				Disk:   &HysteresisThreshold{Trigger: 0, Clear: 75}, // Clear should become 0
			},
		})

		m.mu.RLock()
		defer m.mu.RUnlock()
		got := m.config.HostDefaults

		if got.CPU.Clear != 0 {
			t.Errorf("CPU clear = %v, want 0 when trigger is 0", got.CPU.Clear)
		}
		if got.Disk.Clear != 0 {
			t.Errorf("Disk clear = %v, want 0 when trigger is 0", got.Disk.Clear)
		}
	})

	t.Run("missing Clear computed from Trigger", func(t *testing.T) {
		m := newTestManager(t)
		m.UpdateConfig(AlertConfig{
			Enabled: true,
			HostDefaults: ThresholdConfig{
				CPU:    &HysteresisThreshold{Trigger: 90, Clear: 0}, // Clear should be computed
				Memory: &HysteresisThreshold{Trigger: 95, Clear: 0}, // Clear should be computed
				Disk:   &HysteresisThreshold{Trigger: 92, Clear: 0}, // Clear should be computed
			},
		})

		m.mu.RLock()
		defer m.mu.RUnlock()
		got := m.config.HostDefaults

		// Each expected Clear value is Trigger - 5.
		if got.CPU.Clear != 85 {
			t.Errorf("CPU clear = %v, want 85", got.CPU.Clear)
		}
		if got.Memory.Clear != 90 {
			t.Errorf("Memory clear = %v, want 90", got.Memory.Clear)
		}
		if got.Disk.Clear != 87 {
			t.Errorf("Disk clear = %v, want 87", got.Disk.Clear)
		}
	})
}
// TestNormalizeStorageDefaultsPreservesZeroTrigger covers how UpdateConfig
// normalizes StorageDefault: a negative Trigger is replaced by 85/80, a Trigger
// of 0 is kept to disable storage alerting, and a missing Clear is derived
// from the Trigger.
// Subtests run sequentially; t.Parallel() is not enabled.
func TestNormalizeStorageDefaultsPreservesZeroTrigger(t *testing.T) {
	t.Run("negative trigger gets factory defaults", func(t *testing.T) {
		m := newTestManager(t)
		m.UpdateConfig(AlertConfig{
			Enabled:        true,
			StorageDefault: HysteresisThreshold{Trigger: -1, Clear: 0},
		})

		m.mu.RLock()
		defer m.mu.RUnlock()
		storage := m.config.StorageDefault

		if storage.Trigger != 85 {
			t.Errorf("StorageDefault trigger = %v, want 85", storage.Trigger)
		}
		if storage.Clear != 80 {
			t.Errorf("StorageDefault clear = %v, want 80", storage.Clear)
		}
	})

	t.Run("Trigger=0 preserved to disable storage alerting", func(t *testing.T) {
		m := newTestManager(t)
		m.UpdateConfig(AlertConfig{
			Enabled:        true,
			StorageDefault: HysteresisThreshold{Trigger: 0, Clear: 0},
		})

		m.mu.RLock()
		defer m.mu.RUnlock()
		storage := m.config.StorageDefault

		if storage.Trigger != 0 {
			t.Errorf("StorageDefault trigger = %v, want 0 (disabled)", storage.Trigger)
		}
		if storage.Clear != 0 {
			t.Errorf("StorageDefault clear = %v, want 0 (disabled)", storage.Clear)
		}
	})

	t.Run("missing Clear computed from Trigger", func(t *testing.T) {
		m := newTestManager(t)
		m.UpdateConfig(AlertConfig{
			Enabled:        true,
			StorageDefault: HysteresisThreshold{Trigger: 90, Clear: 0},
		})

		m.mu.RLock()
		defer m.mu.RUnlock()
		storage := m.config.StorageDefault

		if storage.Trigger != 90 {
			t.Errorf("StorageDefault trigger = %v, want 90", storage.Trigger)
		}
		if storage.Clear != 85 {
			t.Errorf("StorageDefault clear = %v, want 85 (trigger - 5)", storage.Clear)
		}
	})
}
// TestNormalizeNodeDefaultsTemperaturePreservesZeroTrigger covers how
// UpdateConfig normalizes NodeDefaults.Temperature: a nil threshold becomes
// 80/75, a Trigger of 0 is kept to disable temperature alerting, and a missing
// Clear is derived from the Trigger.
// Subtests run sequentially; t.Parallel() is not enabled.
func TestNormalizeNodeDefaultsTemperaturePreservesZeroTrigger(t *testing.T) {
	t.Run("nil Temperature gets factory defaults", func(t *testing.T) {
		m := newTestManager(t)
		m.UpdateConfig(AlertConfig{
			Enabled:      true,
			NodeDefaults: ThresholdConfig{}, // Empty - Temperature needs defaults
		})

		m.mu.RLock()
		defer m.mu.RUnlock()
		temp := m.config.NodeDefaults.Temperature

		if temp == nil {
			t.Fatal("Temperature defaults should be set")
		}
		if temp.Trigger != 80 {
			t.Errorf("Temperature trigger = %v, want 80", temp.Trigger)
		}
		if temp.Clear != 75 {
			t.Errorf("Temperature clear = %v, want 75", temp.Clear)
		}
	})

	t.Run("Trigger=0 preserved to disable temperature alerting", func(t *testing.T) {
		m := newTestManager(t)
		m.UpdateConfig(AlertConfig{
			Enabled: true,
			NodeDefaults: ThresholdConfig{
				Temperature: &HysteresisThreshold{Trigger: 0, Clear: 0},
			},
		})

		m.mu.RLock()
		defer m.mu.RUnlock()
		temp := m.config.NodeDefaults.Temperature

		if temp == nil {
			t.Fatal("Temperature should be preserved (not nil)")
		}
		if temp.Trigger != 0 {
			t.Errorf("Temperature trigger = %v, want 0 (disabled)", temp.Trigger)
		}
		if temp.Clear != 0 {
			t.Errorf("Temperature clear = %v, want 0 (disabled)", temp.Clear)
		}
	})

	t.Run("missing Clear computed from Trigger", func(t *testing.T) {
		m := newTestManager(t)
		m.UpdateConfig(AlertConfig{
			Enabled: true,
			NodeDefaults: ThresholdConfig{
				Temperature: &HysteresisThreshold{Trigger: 85, Clear: 0},
			},
		})

		m.mu.RLock()
		defer m.mu.RUnlock()
		temp := m.config.NodeDefaults.Temperature

		if temp.Trigger != 85 {
			t.Errorf("Temperature trigger = %v, want 85", temp.Trigger)
		}
		if temp.Clear != 80 {
			t.Errorf("Temperature clear = %v, want 80 (trigger - 5)", temp.Clear)
		}
	})
}
// TestNormalizeDockerThresholdPreservesZeroTrigger covers how UpdateConfig
// normalizes Docker container thresholds: a Trigger of 0 is kept to disable
// alerting, while negative Triggers are replaced with defaults.
// Subtests run sequentially; t.Parallel() is not enabled.
func TestNormalizeDockerThresholdPreservesZeroTrigger(t *testing.T) {
	t.Run("Trigger=0 disables Docker CPU alerting", func(t *testing.T) {
		m := newTestManager(t)
		m.UpdateConfig(AlertConfig{
			Enabled: true,
			DockerDefaults: DockerThresholdConfig{
				CPU:    HysteresisThreshold{Trigger: 0, Clear: 0},
				Memory: HysteresisThreshold{Trigger: 85, Clear: 80},
				Disk:   HysteresisThreshold{Trigger: 85, Clear: 80},
			},
		})

		m.mu.RLock()
		defer m.mu.RUnlock()
		docker := m.config.DockerDefaults

		if docker.CPU.Trigger != 0 {
			t.Errorf("Docker CPU trigger = %v, want 0 (disabled)", docker.CPU.Trigger)
		}
		if docker.Memory.Trigger != 85 {
			t.Errorf("Docker Memory trigger = %v, want 85", docker.Memory.Trigger)
		}
	})

	t.Run("negative trigger replaced with defaults", func(t *testing.T) {
		m := newTestManager(t)
		m.UpdateConfig(AlertConfig{
			Enabled: true,
			DockerDefaults: DockerThresholdConfig{
				CPU:    HysteresisThreshold{Trigger: -5, Clear: 0},
				Memory: HysteresisThreshold{Trigger: -10, Clear: 0},
				Disk:   HysteresisThreshold{Trigger: -1, Clear: 0},
			},
		})

		m.mu.RLock()
		defer m.mu.RUnlock()
		docker := m.config.DockerDefaults

		if docker.CPU.Trigger != 80 {
			t.Errorf("Docker CPU trigger = %v, want 80 (default)", docker.CPU.Trigger)
		}
		if docker.Memory.Trigger != 85 {
			t.Errorf("Docker Memory trigger = %v, want 85 (default)", docker.Memory.Trigger)
		}
		if docker.Disk.Trigger != 85 {
			t.Errorf("Docker Disk trigger = %v, want 85 (default)", docker.Disk.Trigger)
		}
	})
}
// TestNormalizeDockerIgnoredPrefixes is a table test for
// NormalizeDockerIgnoredPrefixes: blank entries are dropped, entries are
// trimmed, and case-insensitive duplicates keep the first occurrence.
// Subtests run sequentially; t.Parallel() is not enabled.
func TestNormalizeDockerIgnoredPrefixes(t *testing.T) {
	cases := []struct {
		name     string
		input    []string
		expected []string
	}{
		{name: "nil input"},
		{
			name:  "blank entries removed",
			input: []string{"", "   ", "\t"},
		},
		{
			name:     "trims and deduplicates preserving first occurrence casing",
			input:    []string{" Foo ", "foo", "Bar", " bar ", "Baz"},
			expected: []string{"Foo", "Bar", "Baz"},
		},
		{
			name:     "already normalized list remains unchanged",
			input:    []string{"alpha", "beta"},
			expected: []string{"alpha", "beta"},
		},
	}

	for _, c := range cases {
		want, in := c.expected, c.input
		t.Run(c.name, func(t *testing.T) {
			if got := NormalizeDockerIgnoredPrefixes(in); !reflect.DeepEqual(got, want) {
				t.Fatalf("expected %v, got %v", want, got)
			}
		})
	}
}
// TestCheckDockerHostIgnoredPrefixClearsExistingAlerts seeds state, health and
// restart-loop alerts plus per-container tracking entries for a container
// whose name matches an ignored prefix, then verifies that CheckDockerHost
// removes all of them.
func TestCheckDockerHostIgnoredPrefixClearsExistingAlerts(t *testing.T) {
	m := newTestManager(t)

	container := models.DockerContainer{
		ID:     "abc123456789",
		Name:   "runner-job-1",
		State:  "exited",
		Status: "Exited (1) 10 seconds ago",
	}
	host := models.DockerHost{
		ID:          "docker-host",
		DisplayName: "Docker Host",
		Hostname:    "docker-host.local",
		Containers:  []models.DockerContainer{container},
	}

	resourceID := dockerResourceID(host.ID, container.ID)

	// Alert IDs paired with the failure message reported when the alert
	// survives; the order matches the order in which they are asserted.
	seeded := []struct {
		id      string
		failure string
	}{
		{
			id:      fmt.Sprintf("docker-container-state-%s", resourceID),
			failure: "expected state alert cleared for ignored container",
		},
		{
			id:      fmt.Sprintf("docker-container-health-%s", resourceID),
			failure: "expected health alert cleared for ignored container",
		},
		{
			id:      fmt.Sprintf("docker-container-restart-loop-%s", resourceID),
			failure: "expected restart alert cleared for ignored container",
		},
	}

	// Pre-populate the manager as if alerts and tracking already existed.
	m.mu.Lock()
	m.config.Enabled = true
	m.config.DockerIgnoredContainerPrefixes = []string{"runner-"}
	for _, s := range seeded {
		m.activeAlerts[s.id] = &Alert{ID: s.id, ResourceID: resourceID}
	}
	m.dockerStateConfirm[resourceID] = 2
	m.dockerRestartTracking[resourceID] = &dockerRestartRecord{}
	m.dockerLastExitCode[resourceID] = 137
	m.mu.Unlock()

	m.CheckDockerHost(host)

	m.mu.RLock()
	defer m.mu.RUnlock()

	for _, s := range seeded {
		if _, exists := m.activeAlerts[s.id]; exists {
			t.Fatal(s.failure)
		}
	}
	if _, tracked := m.dockerStateConfirm[resourceID]; tracked {
		t.Fatal("expected state confirmation tracking cleared")
	}
	if _, tracked := m.dockerRestartTracking[resourceID]; tracked {
		t.Fatal("expected restart tracking cleared")
	}
	if _, tracked := m.dockerLastExitCode[resourceID]; tracked {
		t.Fatal("expected last exit code cleared")
	}
}
// TestUpdateConfigNormalizesDockerIgnoredPrefixes verifies that UpdateConfig
// stores DockerIgnoredContainerPrefixes in normalized form: nil stays nil, and
// entries are trimmed and de-duplicated.
// Subtests run sequentially; t.Parallel() is not enabled.
func TestUpdateConfigNormalizesDockerIgnoredPrefixes(t *testing.T) {
	t.Run("nil input remains nil", func(t *testing.T) {
		m := newTestManager(t)
		m.UpdateConfig(AlertConfig{})

		m.mu.RLock()
		defer m.mu.RUnlock()

		if got := m.config.DockerIgnoredContainerPrefixes; got != nil {
			t.Fatalf("expected nil prefixes, got %v", got)
		}
	})

	t.Run("duplicates trimmed and deduplicated", func(t *testing.T) {
		m := newTestManager(t)
		m.UpdateConfig(AlertConfig{
			DockerIgnoredContainerPrefixes: []string{" Foo ", "foo", "Bar"},
		})

		m.mu.RLock()
		defer m.mu.RUnlock()

		want := []string{"Foo", "Bar"}
		if got := m.config.DockerIgnoredContainerPrefixes; !reflect.DeepEqual(got, want) {
			t.Fatalf("expected normalized prefixes %v, got %v", want, got)
		}
	})
}
// TestMatchesDockerIgnoredPrefix is a table test for
// matchesDockerIgnoredPrefix, covering matches on container name and ID,
// trimming, case-insensitivity, and empty prefixes or identifiers.
// Subtests run sequentially; t.Parallel() is not enabled.
func TestMatchesDockerIgnoredPrefix(t *testing.T) {
	runner := []string{"runner-"}

	cases := []struct {
		name     string
		ctrName  string
		ctrID    string
		prefixes []string
		want     bool
	}{
		{name: "empty prefixes", ctrName: "runner-123", ctrID: "abc", prefixes: nil, want: false},
		{name: "match with name", ctrName: "runner-123", ctrID: "abc", prefixes: runner, want: true},
		{name: "match with id", ctrName: "app", ctrID: "abc123", prefixes: []string{"abc"}, want: true},
		{name: "trimmed comparison", ctrName: "runner-job", ctrID: "abc", prefixes: []string{"  runner-  "}, want: true},
		{name: "case insensitive", ctrName: "Runner-Job", ctrID: "abc", prefixes: runner, want: true},
		{name: "no match", ctrName: "service", ctrID: "xyz", prefixes: runner, want: false},
		{name: "skips empty prefix in list", ctrName: "runner-job", ctrID: "abc", prefixes: []string{"", "runner-"}, want: true},
		{name: "all empty prefixes returns false", ctrName: "runner-job", ctrID: "abc", prefixes: []string{"", "  ", ""}, want: false},
		{name: "empty name matches id", ctrName: "", ctrID: "runner-123", prefixes: runner, want: true},
		{name: "empty id matches name", ctrName: "runner-job", ctrID: "", prefixes: runner, want: true},
		{name: "both empty no match", ctrName: "", ctrID: "", prefixes: runner, want: false},
	}

	for _, c := range cases {
		c := c
		t.Run(c.name, func(t *testing.T) {
			got := matchesDockerIgnoredPrefix(c.ctrName, c.ctrID, c.prefixes)
			if got == c.want {
				return
			}
			t.Fatalf("matchesDockerIgnoredPrefix(%q, %q, %v) = %v, want %v", c.ctrName, c.ctrID, c.prefixes, got, c.want)
		})
	}
}
// TestDockerInstanceName is a table test for dockerInstanceName: the display
// name is used when present, then the hostname, and a bare "Docker" when both
// are empty.
// Subtests run sequentially; t.Parallel() is not enabled.
func TestDockerInstanceName(t *testing.T) {
	cases := []struct {
		name string
		host models.DockerHost
		want string
	}{
		{
			name: "uses display name",
			host: models.DockerHost{DisplayName: "Prod Host"},
			want: "Docker:Prod Host",
		},
		{
			name: "falls back to hostname",
			host: models.DockerHost{Hostname: "docker.local"},
			want: "Docker:docker.local",
		},
		{
			name: "defaults when empty",
			host: models.DockerHost{},
			want: "Docker",
		},
	}

	for _, c := range cases {
		c := c
		t.Run(c.name, func(t *testing.T) {
			got := dockerInstanceName(c.host)
			if got == c.want {
				return
			}
			t.Fatalf("dockerInstanceName(%+v) = %q, want %q", c.host, got, c.want)
		})
	}
}
// TestDockerContainerDisplayName is a table test for
// dockerContainerDisplayName: names are trimmed and lose a leading slash, and
// a container without a name falls back to the first 12 characters of its ID.
// Subtests run sequentially; t.Parallel() is not enabled.
func TestDockerContainerDisplayName(t *testing.T) {
	cases := []struct {
		name      string
		container models.DockerContainer
		want      string
	}{
		{
			name:      "trims whitespace",
			container: models.DockerContainer{Name: "  app  "},
			want:      "app",
		},
		{
			name:      "strips leading slash",
			container: models.DockerContainer{Name: "/runner"},
			want:      "runner",
		},
		{
			name:      "falls back to id truncated",
			container: models.DockerContainer{ID: "0123456789abcdef"},
			want:      "0123456789ab",
		},
	}

	for _, c := range cases {
		c := c
		t.Run(c.name, func(t *testing.T) {
			got := dockerContainerDisplayName(c.container)
			if got == c.want {
				return
			}
			t.Fatalf("dockerContainerDisplayName(%+v) = %q, want %q", c.container, got, c.want)
		})
	}
}
// TestDockerResourceID is a table test for dockerResourceID, covering every
// combination of present and missing host and container IDs.
// Subtests run sequentially; t.Parallel() is not enabled.
func TestDockerResourceID(t *testing.T) {
	cases := []struct {
		name   string
		host   string
		ctr    string
		wantID string
	}{
		{name: "both ids present", host: "host1", ctr: "abc", wantID: "docker:host1/abc"},
		{name: "missing host id", host: "", ctr: "abc", wantID: "docker:container/abc"},
		{name: "missing container id", host: "host1", ctr: "", wantID: "docker:host1"},
		{name: "both missing", host: "", ctr: "", wantID: "docker:unknown"},
	}

	for _, c := range cases {
		c := c
		t.Run(c.name, func(t *testing.T) {
			got := dockerResourceID(c.host, c.ctr)
			if got == c.wantID {
				return
			}
			t.Fatalf("dockerResourceID(%q, %q) = %q, want %q", c.host, c.ctr, got, c.wantID)
		})
	}
}
// TestHasKnownFirmwareBug is a table test for hasKnownFirmwareBug: Samsung 980
// and 990 model strings are flagged regardless of case or an "SSD" infix,
// while other Samsung models, other vendors and an empty model are not.
// Subtests run sequentially; t.Parallel() is not enabled.
func TestHasKnownFirmwareBug(t *testing.T) {
	cases := []struct {
		name     string
		model    string
		affected bool
	}{
		// Flagged models.
		{name: "Samsung 980 with SSD prefix", model: "Samsung SSD 980 1TB", affected: true},
		{name: "Samsung 980 without SSD prefix", model: "Samsung 980 PRO 2TB", affected: true},
		{name: "Samsung 990 with SSD prefix", model: "Samsung SSD 990 PRO 2TB", affected: true},
		{name: "Samsung 990 without SSD prefix", model: "Samsung 990 EVO 1TB", affected: true},
		{name: "Samsung 980 lowercase", model: "samsung ssd 980 1tb", affected: true},
		{name: "Samsung 990 mixed case", model: "SAMSUNG 990 PRO", affected: true},
		// Models that must not be flagged.
		{name: "Samsung 970 (not affected)", model: "Samsung SSD 970 EVO Plus", affected: false},
		{name: "Samsung 870 (not affected)", model: "Samsung 870 QVO", affected: false},
		{name: "Other manufacturer", model: "WD Blue SN570", affected: false},
		{name: "Empty model", model: "", affected: false},
	}

	for _, c := range cases {
		c := c
		t.Run(c.name, func(t *testing.T) {
			got := hasKnownFirmwareBug(c.model)
			if got == c.affected {
				return
			}
			t.Fatalf("hasKnownFirmwareBug(%q) = %v, want %v", c.model, got, c.affected)
		})
	}
}
// TestCheckDiskHealthSkipsSamsung980FalseAlerts verifies that a Samsung 980
// reporting FAILED health does not raise a disk-health alert, while a low
// wearout value on the same drive still raises a disk-wearout alert.
func TestCheckDiskHealthSkipsSamsung980FalseAlerts(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	// alertActive looks up an alert ID under the manager's read lock.
	alertActive := func(id string) bool {
		m.mu.RLock()
		defer m.mu.RUnlock()
		_, found := m.activeAlerts[id]
		return found
	}

	// Samsung 980 reporting FAILED health (firmware bug) but actually healthy
	disk := proxmox.Disk{
		DevPath: "/dev/nvme0n1",
		Model:   "Samsung SSD 980 1TB",
		Serial:  "S649NF0R123456",
		Type:    "nvme",
		Health:  "FAILED", // False report due to firmware bug
		Wearout: 99,       // Drive is actually healthy with 99% life remaining
		Size:    1000204886016,
	}

	// The bogus health status must not produce an alert.
	m.CheckDiskHealth("test-instance", "pve-node1", disk)
	if alertActive("disk-health-test-instance-pve-node1-/dev/nvme0n1") {
		t.Fatalf("expected no health alert for Samsung 980 with known firmware bug")
	}

	// Wearout alerts must keep working for these drives.
	disk.Wearout = 5 // Low wearout should still trigger alert
	m.CheckDiskHealth("test-instance", "pve-node1", disk)
	if !alertActive("disk-wearout-test-instance-pve-node1-/dev/nvme0n1") {
		t.Fatalf("expected wearout alert to still work for Samsung 980")
	}
}
// TestCheckDiskHealthClearsExistingSamsung980Alerts verifies that a
// pre-existing disk-health alert for a Samsung 990 reporting FAILED health is
// removed the next time CheckDiskHealth runs for that disk.
func TestCheckDiskHealthClearsExistingSamsung980Alerts(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	const alertID = "disk-health-test-instance-pve-node1-/dev/nvme0n1"

	// Seed an alert by hand, simulating one raised before the fix.
	stale := &Alert{
		ID:           alertID,
		Type:         "disk-health",
		Level:        AlertLevelCritical,
		ResourceID:   "pve-node1-/dev/nvme0n1",
		ResourceName: "Samsung SSD 990 PRO 2TB (/dev/nvme0n1)",
		Node:         "pve-node1",
		Instance:     "test-instance",
		Message:      "Disk health check failed: FAILED",
	}
	m.mu.Lock()
	m.activeAlerts[alertID] = stale
	m.mu.Unlock()

	// Checking the disk should clear the existing false alert.
	m.CheckDiskHealth("test-instance", "pve-node1", proxmox.Disk{
		DevPath: "/dev/nvme0n1",
		Model:   "Samsung SSD 990 PRO 2TB",
		Serial:  "S6Z0NF0R654321",
		Type:    "nvme",
		Health:  "FAILED",
		Wearout: 98,
		Size:    2000398934016,
	})

	m.mu.RLock()
	defer m.mu.RUnlock()
	if _, remains := m.activeAlerts[alertID]; remains {
		t.Fatalf("expected existing Samsung 990 health alert to be cleared")
	}
}
// TestCheckDiskHealthHealthyDiskNoAlert verifies that a disk reporting either
// "PASSED" or "OK" health never raises a disk-health alert.
func TestCheckDiskHealthHealthyDiskNoAlert(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	const healthAlertID = "disk-health-test-instance-pve-node1-/dev/sda"

	// healthAlertActive looks up the health alert under the manager's read lock.
	healthAlertActive := func() bool {
		m.mu.RLock()
		defer m.mu.RUnlock()
		_, found := m.activeAlerts[healthAlertID]
		return found
	}

	// Non-Samsung disk with PASSED health should not create alert
	disk := proxmox.Disk{
		DevPath: "/dev/sda",
		Model:   "Western Digital WD40EFZX",
		Serial:  "WD-WCC4E0123456",
		Type:    "hdd",
		Health:  "PASSED",
		Wearout: 0, // N/A for HDD
		Size:    4000787030016,
	}
	m.CheckDiskHealth("test-instance", "pve-node1", disk)
	if healthAlertActive() {
		t.Fatalf("expected no health alert for healthy disk with PASSED status")
	}

	// The "OK" status is treated the same way.
	disk.Health = "OK"
	m.CheckDiskHealth("test-instance", "pve-node1", disk)
	if healthAlertActive() {
		t.Fatalf("expected no health alert for healthy disk with OK status")
	}
}
// TestCheckDiskHealthFailedDiskCreatesAlert verifies that a non-Samsung disk
// reporting FAILED health raises a critical disk-health alert tagged with the
// node and instance it was checked under.
func TestCheckDiskHealthFailedDiskCreatesAlert(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	// Non-Samsung disk with FAILED health should create alert
	m.CheckDiskHealth("test-instance", "pve-node1", proxmox.Disk{
		DevPath: "/dev/sdb",
		Model:   "Seagate ST2000DM008",
		Serial:  "ZA123456",
		Type:    "hdd",
		Health:  "FAILED",
		Wearout: 0,
		Size:    2000398934016,
	})

	m.mu.RLock()
	defer m.mu.RUnlock()

	raised, ok := m.activeAlerts["disk-health-test-instance-pve-node1-/dev/sdb"]
	if !ok {
		t.Fatalf("expected health alert to be created for failed disk")
	}

	// The remaining checks are non-fatal so that every mismatch is reported.
	if got := raised.Level; got != AlertLevelCritical {
		t.Errorf("expected critical alert level, got %s", got)
	}
	if got := raised.Type; got != "disk-health" {
		t.Errorf("expected type disk-health, got %s", got)
	}
	if got := raised.Node; got != "pve-node1" {
		t.Errorf("expected node pve-node1, got %s", got)
	}
	if got := raised.Instance; got != "test-instance" {
		t.Errorf("expected instance test-instance, got %s", got)
	}
}
// TestCheckDiskHealthRecoveryAlertCleared verifies that a disk-health alert
// raised for a FAILED disk is removed once the disk reports PASSED again.
func TestCheckDiskHealthRecoveryAlertCleared(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	const healthAlertID = "disk-health-test-instance-pve-node1-/dev/sdc"

	// healthAlertActive looks up the health alert under the manager's read lock.
	healthAlertActive := func() bool {
		m.mu.RLock()
		defer m.mu.RUnlock()
		_, found := m.activeAlerts[healthAlertID]
		return found
	}

	disk := proxmox.Disk{
		DevPath: "/dev/sdc",
		Model:   "Intel SSDSC2BB480G4",
		Serial:  "BTWL123456789",
		Type:    "ssd",
		Health:  "FAILED",
		Wearout: 50,
		Size:    480103981056,
	}

	// The first check, with FAILED health, raises the alert.
	m.CheckDiskHealth("test-instance", "pve-node1", disk)
	if !healthAlertActive() {
		t.Fatalf("expected health alert to be created")
	}

	// Disk health recovers
	disk.Health = "PASSED"
	m.CheckDiskHealth("test-instance", "pve-node1", disk)
	if healthAlertActive() {
		t.Fatalf("expected health alert to be cleared after recovery")
	}
}
// TestCheckDiskHealthLowWearoutCreatesAlert verifies that a healthy disk with
// only 5% life remaining raises a warning disk-wearout alert whose value is 5
// and whose threshold is 10.
func TestCheckDiskHealthLowWearoutCreatesAlert(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	// SSD with low wearout (less than 10% life remaining)
	m.CheckDiskHealth("test-instance", "pve-node1", proxmox.Disk{
		DevPath: "/dev/nvme1n1",
		Model:   "Crucial CT1000MX500",
		Serial:  "12345678ABCD",
		Type:    "nvme",
		Health:  "PASSED",
		Wearout: 5, // Only 5% life remaining
		Size:    1000204886016,
	})

	m.mu.RLock()
	defer m.mu.RUnlock()

	raised, ok := m.activeAlerts["disk-wearout-test-instance-pve-node1-/dev/nvme1n1"]
	if !ok {
		t.Fatalf("expected wearout alert to be created for disk with low life remaining")
	}

	// The remaining checks are non-fatal so that every mismatch is reported.
	if got := raised.Level; got != AlertLevelWarning {
		t.Errorf("expected warning alert level, got %s", got)
	}
	if got := raised.Type; got != "disk-wearout" {
		t.Errorf("expected type disk-wearout, got %s", got)
	}
	if got := raised.Value; got != 5 {
		t.Errorf("expected value 5, got %f", got)
	}
	if got := raised.Threshold; got != 10.0 {
		t.Errorf("expected threshold 10.0, got %f", got)
	}
}
// TestCheckDiskHealthWearoutAlertUpdatesOnSubsequentChecks confirms that an
// existing wearout alert is refreshed rather than duplicated: after a second
// check with a lower wearout figure, the alert's LastSeen must have advanced
// and its Value must reflect the new reading.
func TestCheckDiskHealthWearoutAlertUpdatesOnSubsequentChecks(t *testing.T) {
	const wearoutAlertID = "disk-wearout-test-instance-pve-node1-/dev/nvme2n1"

	m := newTestManager(t)
	m.ClearActiveAlerts()

	nvme := proxmox.Disk{
		DevPath: "/dev/nvme2n1",
		Model:   "Kingston SA2000M8",
		Serial:  "50026B768A123456",
		Type:    "nvme",
		Health:  "PASSED",
		Wearout: 8,
		Size:    500107862016,
	}

	// Initial reading raises the alert; remember when it was last seen.
	m.CheckDiskHealth("test-instance", "pve-node1", nvme)

	m.mu.RLock()
	created, found := m.activeAlerts[wearoutAlertID]
	if !found {
		m.mu.RUnlock()
		t.Fatalf("expected wearout alert to be created")
	}
	seenAtCreation := created.LastSeen
	m.mu.RUnlock()

	// Give the clock a chance to move so the LastSeen comparison is meaningful.
	time.Sleep(10 * time.Millisecond)

	// Second reading: the drive has worn further.
	nvme.Wearout = 6
	m.CheckDiskHealth("test-instance", "pve-node1", nvme)

	m.mu.RLock()
	defer m.mu.RUnlock()

	refreshed, stillActive := m.activeAlerts[wearoutAlertID]
	if !stillActive {
		t.Fatalf("expected wearout alert to still exist")
	}
	if advanced := refreshed.LastSeen.After(seenAtCreation); !advanced {
		t.Errorf("expected LastSeen to be updated, got %v (original: %v)", refreshed.LastSeen, seenAtCreation)
	}
	if val := refreshed.Value; val != 6 {
		t.Errorf("expected value to be updated to 6, got %f", val)
	}
}
// TestCheckDiskHealthWearoutRecoveryAlertCleared verifies that a wearout alert
// raised at 5% remaining life is dropped once the same device path reports
// 95% (for example after a drive swap or a corrected misread).
func TestCheckDiskHealthWearoutRecoveryAlertCleared(t *testing.T) {
	const wearoutAlertID = "disk-wearout-test-instance-pve-node1-/dev/sdd"

	m := newTestManager(t)
	m.ClearActiveAlerts()

	// alertActive looks the wearout alert up under the manager's read lock.
	alertActive := func() bool {
		m.mu.RLock()
		defer m.mu.RUnlock()
		_, present := m.activeAlerts[wearoutAlertID]
		return present
	}

	ssd := proxmox.Disk{
		DevPath: "/dev/sdd",
		Model:   "ADATA SU800",
		Serial:  "2J012345678",
		Type:    "ssd",
		Health:  "PASSED",
		Wearout: 5,
		Size:    256060514304,
	}

	// Nearly worn-out drive: alert expected.
	m.CheckDiskHealth("test-instance", "pve-node1", ssd)
	if !alertActive() {
		t.Fatalf("expected wearout alert to be created")
	}

	// Plenty of life reported again: alert must be gone.
	ssd.Wearout = 95
	m.CheckDiskHealth("test-instance", "pve-node1", ssd)
	if alertActive() {
		t.Fatalf("expected wearout alert to be cleared after recovery")
	}
}
// TestCheckDiskHealthEmptyOrUnknownHealthNoAlert checks that indeterminate
// SMART statuses never raise a disk-health alert. The same disk is checked
// three times in sequence with an empty status, "UNKNOWN", and lowercase
// "unknown"; none of them may leave an active health alert behind.
func TestCheckDiskHealthEmptyOrUnknownHealthNoAlert(t *testing.T) {
	const healthAlertID = "disk-health-test-instance-pve-node1-/dev/sde"

	m := newTestManager(t)
	m.ClearActiveAlerts()

	usbDisk := proxmox.Disk{
		DevPath: "/dev/sde",
		Model:   "Generic USB Storage",
		Serial:  "USB123456",
		Type:    "hdd",
		Wearout: 0,
		Size:    128043712512,
	}

	// Each step reuses the same manager, so the order matters.
	steps := []struct {
		health  string
		failure string
	}{
		// Empty status, e.g. SMART not supported by the device.
		{"", "expected no health alert for disk with empty health status"},
		{"UNKNOWN", "expected no health alert for disk with UNKNOWN health status"},
		// The lowercase spelling is expected to be treated like the uppercase one.
		{"unknown", "expected no health alert for disk with lowercase unknown health status"},
	}

	for _, step := range steps {
		usbDisk.Health = step.health
		m.CheckDiskHealth("test-instance", "pve-node1", usbDisk)

		m.mu.RLock()
		_, raised := m.activeAlerts[healthAlertID]
		m.mu.RUnlock()

		if raised {
			t.Fatal(step.failure)
		}
	}
}
// TestDisableAllStorageClearsExistingAlerts walks through turning on
// DisableAllStorage while a storage usage alert is active. It expects:
//   - an alert to be dispatched for 90% usage against an 80% trigger,
//   - no further dispatch once DisableAllStorage is set,
//   - the active alert set to be empty afterwards,
//   - the resolved callback to fire exactly once with "<storageID>-usage",
//   - no pending entry to remain for that alert ID.
func TestDisableAllStorageClearsExistingAlerts(t *testing.T) {
	const storageID = "local-lvm"

	m := newTestManager(t)

	// forceImmediate re-applies the overrides this test needs after every
	// UpdateConfig call: no time-threshold delay and an active activation state.
	forceImmediate := func() {
		m.mu.Lock()
		defer m.mu.Unlock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.ActivationState = ActivationActive
	}

	// Baseline configuration with storage alerting allowed.
	initialConfig := AlertConfig{
		Enabled:           true,
		DisableAllStorage: false,
		StorageDefault:    HysteresisThreshold{Trigger: 80, Clear: 75},
		TimeThreshold:     0,
		TimeThresholds:    map[string]int{},
		NodeDefaults: ThresholdConfig{
			CPU:    &HysteresisThreshold{Trigger: 80, Clear: 75},
			Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
			Disk:   &HysteresisThreshold{Trigger: 90, Clear: 85},
		},
		GuestDefaults: ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
		},
		Overrides: make(map[string]ThresholdConfig),
	}
	m.UpdateConfig(initialConfig)
	forceImmediate()

	// Callback bookkeeping. The signal channels have capacity one and are
	// written with non-blocking sends so a callback can never stall.
	var (
		fired       []*Alert
		firedCh     = make(chan struct{}, 1)
		resolvedIDs []string
		resolvedCh  = make(chan struct{}, 1)
	)
	m.SetAlertCallback(func(alert *Alert) {
		fired = append(fired, alert)
		select {
		case firedCh <- struct{}{}:
		default:
		}
	})
	m.SetResolvedCallback(func(alertID string) {
		resolvedIDs = append(resolvedIDs, alertID)
		select {
		case resolvedCh <- struct{}{}:
		default:
		}
	})

	overfull := models.Storage{
		ID:     storageID,
		Name:   "local-lvm",
		Usage:  90.0,
		Status: "available",
	}

	// Phase 1: storage alerting enabled, so the check must dispatch an alert.
	m.CheckStorage(overfull)
	select {
	case <-firedCh:
	case <-time.After(100 * time.Millisecond):
		t.Fatalf("did not receive initial alert dispatch")
	}
	if n := len(fired); n != 1 {
		t.Fatalf("expected 1 alert before disabling storage, got %d", n)
	}

	// Phase 2: same configuration with all storage alerting switched off.
	disabledConfig := initialConfig
	disabledConfig.DisableAllStorage = true
	m.UpdateConfig(disabledConfig)
	forceImmediate()

	// Start from a clean slate so only post-disable dispatches are observed.
	fired = fired[:0]
	firedCh = make(chan struct{}, 1)

	// The storage is still over the trigger, but nothing may be dispatched.
	m.CheckStorage(overfull)
	select {
	case <-firedCh:
		t.Fatalf("expected no alerts after disabling all storage, but callback fired")
	case <-time.After(100 * time.Millisecond):
		// Silence is the expected outcome.
	}

	// The previously active alert is expected to have been dropped (the
	// original author attributes this to reevaluateActiveAlertsLocked).
	m.mu.RLock()
	remaining := len(m.activeAlerts)
	m.mu.RUnlock()
	if remaining != 0 {
		t.Fatalf("expected active alerts to be cleared after disabling all storage, got %d", remaining)
	}

	// Dropping the alert must have been announced through the resolved callback.
	select {
	case <-resolvedCh:
	case <-time.After(100 * time.Millisecond):
		t.Fatalf("expected resolved callback to fire after disabling all storage")
	}
	expectedAlertID := fmt.Sprintf("%s-usage", storageID)
	if len(resolvedIDs) != 1 || resolvedIDs[0] != expectedAlertID {
		t.Fatalf("expected resolved callback for %s, got %v", expectedAlertID, resolvedIDs)
	}

	// No pending (not-yet-fired) entry may linger for the alert either.
	m.mu.RLock()
	_, stillPending := m.pendingAlerts[expectedAlertID]
	m.mu.RUnlock()
	if stillPending {
		t.Fatalf("expected pending alert entry to be cleared after disabling all storage")
	}
}
// TestUpdateConfigPreservesZeroDockerThresholds ensures that a Docker memory
// threshold explicitly set to Trigger 0 / Clear 0 (which the failure messages
// describe as "disabled") is stored unchanged by UpdateConfig instead of being
// replaced with some non-zero value.
//
// This is a top-level test, not a helper, so it must not call t.Helper():
// doing so would make failure reports skip this function's file and line.
func TestUpdateConfigPreservesZeroDockerThresholds(t *testing.T) {
	m := newTestManager(t)

	// Round-trip the current configuration with the memory threshold zeroed.
	config := m.GetConfig()
	config.DockerDefaults.Memory = HysteresisThreshold{Trigger: 0, Clear: 0}
	m.UpdateConfig(config)

	// Inspect the stored configuration under the read lock.
	m.mu.RLock()
	defer m.mu.RUnlock()

	if m.config.DockerDefaults.Memory.Trigger != 0 {
		t.Fatalf("expected docker memory trigger to remain 0 when disabled, got %.1f", m.config.DockerDefaults.Memory.Trigger)
	}
	if m.config.DockerDefaults.Memory.Clear != 0 {
		t.Fatalf("expected docker memory clear to remain 0 when disabled, got %.1f", m.config.DockerDefaults.Memory.Clear)
	}
}
// TestReevaluateClearsDockerContainerAlertWhenOverrideDisabled seeds an active
// memory alert for a Docker container directly into the manager, then applies
// a configuration whose override for that resource has Disabled set. It
// expects the resolved callback to report the alert ID within 200ms and the
// alert to be gone from the active set.
func TestReevaluateClearsDockerContainerAlertWhenOverrideDisabled(t *testing.T) {
	m := newTestManager(t)

	resourceID := "docker:host-1/container-1"
	alertID := resourceID + "-memory"

	// Non-blocking send: the channel holds one ID, and a callback that fires
	// more than once must not block whichever goroutine invokes it.
	resolved := make(chan string, 1)
	m.SetResolvedCallback(func(id string) {
		select {
		case resolved <- id:
		default:
		}
	})

	// Plant the alert as if it had been raised earlier.
	m.mu.Lock()
	m.activeAlerts[alertID] = &Alert{
		ID:           alertID,
		Type:         "memory",
		ResourceID:   resourceID,
		ResourceName: "qbittorrent",
		Instance:     "Docker",
		Metadata: map[string]interface{}{
			"resourceType": "Docker Container",
		},
		Threshold: 80,
		Value:     90,
	}
	m.mu.Unlock()

	// Disable alerting for exactly this container via an override.
	config := m.GetConfig()
	config.Overrides = map[string]ThresholdConfig{
		resourceID: {
			Disabled: true,
		},
	}
	config.ActivationState = ActivationActive
	m.UpdateConfig(config)

	select {
	case got := <-resolved:
		if got != alertID {
			t.Fatalf("resolved callback fired for unexpected alert %s", got)
		}
	case <-time.After(200 * time.Millisecond):
		t.Fatalf("expected alert to be resolved when docker container override is disabled")
	}

	m.mu.RLock()
	_, exists := m.activeAlerts[alertID]
	m.mu.RUnlock()
	if exists {
		t.Fatalf("expected docker container alert to be cleared when override is disabled")
	}
}
// TestReevaluateClearsDockerContainerAlertWhenIgnoredPrefixAdded seeds an
// active CPU alert for a Docker container named "qbittorrentvpn", then applies
// a configuration that lists "qbit" in DockerIgnoredContainerPrefixes. It
// expects the resolved callback to report the alert ID within 200ms and the
// alert to be gone from the active set.
func TestReevaluateClearsDockerContainerAlertWhenIgnoredPrefixAdded(t *testing.T) {
	m := newTestManager(t)

	resourceID := "docker:host-2/container-abc123"
	alertID := resourceID + "-cpu"

	// Non-blocking send: the channel holds one ID, and a callback that fires
	// more than once must not block whichever goroutine invokes it.
	resolved := make(chan string, 1)
	m.SetResolvedCallback(func(id string) {
		select {
		case resolved <- id:
		default:
		}
	})

	// Plant the alert as if it had been raised earlier; the metadata carries
	// the container name and ID alongside the resource type.
	m.mu.Lock()
	m.activeAlerts[alertID] = &Alert{
		ID:           alertID,
		Type:         "cpu",
		ResourceID:   resourceID,
		ResourceName: "qbittorrentvpn",
		Instance:     "Docker",
		Metadata: map[string]interface{}{
			"resourceType":  "Docker Container",
			"containerId":   "abc123",
			"containerName": "qbittorrentvpn",
		},
		Threshold: 80,
		Value:     95,
	}
	m.mu.Unlock()

	// Ignore every container whose name starts with "qbit".
	config := m.GetConfig()
	config.DockerIgnoredContainerPrefixes = []string{"qbit"}
	config.ActivationState = ActivationActive
	m.UpdateConfig(config)

	select {
	case got := <-resolved:
		if got != alertID {
			t.Fatalf("resolved callback fired for unexpected alert %s", got)
		}
	case <-time.After(200 * time.Millisecond):
		t.Fatalf("expected alert to be resolved after adding ignored prefix")
	}

	m.mu.RLock()
	_, exists := m.activeAlerts[alertID]
	m.mu.RUnlock()
	if exists {
		t.Fatalf("expected docker container alert to be cleared when ignored prefix is configured")
	}
}
// TestBuildGuestKey pins the "<instance>:<node>:<vmid>" key format produced by
// BuildGuestKey, including the expectations that a blank or whitespace-only
// instance falls back to the node name and that surrounding whitespace on the
// instance is trimmed. Subtests run sequentially.
func TestBuildGuestKey(t *testing.T) {
	cases := []struct {
		name     string
		instance string
		node     string
		vmid     int
		want     string
	}{
		{"different instance and node", "cluster-1", "pve-node", 100, "cluster-1:pve-node:100"},
		{"same instance and node", "pve-node", "pve-node", 200, "pve-node:pve-node:200"},
		{"empty instance uses node", "", "pve-node", 300, "pve-node:pve-node:300"},
		{"whitespace instance uses node", " ", "pve-node", 400, "pve-node:pve-node:400"},
		{"instance with whitespace trimmed", " cluster-1 ", "pve-node", 500, "cluster-1:pve-node:500"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := BuildGuestKey(tc.instance, tc.node, tc.vmid); got != tc.want {
				t.Errorf("BuildGuestKey(%q, %q, %d) = %q, want %q", tc.instance, tc.node, tc.vmid, got, tc.want)
			}
		})
	}
}
// TestCheckFlapping drives checkFlappingLocked with a pre-seeded state-change
// history and checks its verdict. The call under test is expected to count as
// one additional state change, so a case with N seeded entries is evaluated
// against the threshold as N+1. Where a new flapping detection is expected,
// the test also requires flappingActive and suppressedUntil to be populated
// for the alert.
func TestCheckFlapping(t *testing.T) {
	cases := []struct {
		name            string
		enabled         bool
		threshold       int
		windowSeconds   int
		cooldownMinutes int
		seeded          int  // state changes recorded before the call under test
		wantFlapping    bool // expected return value
		wantNewlyMarked bool // expect flapping state and suppression to be recorded
	}{
		{
			name:          "disabled returns false",
			enabled:       false,
			threshold:     5,
			windowSeconds: 300,
			seeded:        10, // far beyond the threshold, but detection is off
			wantFlapping:  false,
		},
		{
			name:          "below threshold returns false",
			enabled:       true,
			threshold:     5,
			windowSeconds: 300,
			seeded:        2, // 2 seeded + 1 call = 3 < 5
			wantFlapping:  false,
		},
		{
			name:            "at threshold triggers new flapping",
			enabled:         true,
			threshold:       5,
			windowSeconds:   300,
			cooldownMinutes: 15,
			seeded:          4, // 4 seeded + 1 call = 5 == threshold
			wantFlapping:    true,
			wantNewlyMarked: true,
		},
		{
			name:            "above threshold triggers flapping",
			enabled:         true,
			threshold:       5,
			windowSeconds:   300,
			cooldownMinutes: 15,
			seeded:          6, // 6 seeded + 1 call = 7 > 5
			wantFlapping:    true,
			wantNewlyMarked: true,
		},
		{
			name:          "single state change below threshold",
			enabled:       true,
			threshold:     5,
			windowSeconds: 300,
			seeded:        0, // only the call itself = 1 < 5
			wantFlapping:  false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			m := newTestManager(t)
			alertID := "test-alert-" + tc.name

			// Apply the flapping settings and seed history, one entry per
			// second going back from now so all fall inside the window.
			m.mu.Lock()
			m.config.FlappingEnabled = tc.enabled
			m.config.FlappingThreshold = tc.threshold
			m.config.FlappingWindowSeconds = tc.windowSeconds
			m.config.FlappingCooldownMinutes = tc.cooldownMinutes
			now := time.Now()
			for age := 0; age < tc.seeded; age++ {
				stamp := now.Add(-time.Duration(age) * time.Second)
				m.flappingHistory[alertID] = append(m.flappingHistory[alertID], stamp)
			}
			m.mu.Unlock()

			// The function under test requires the write lock to be held.
			m.mu.Lock()
			got := m.checkFlappingLocked(alertID)
			m.mu.Unlock()

			if got != tc.wantFlapping {
				t.Errorf("checkFlappingLocked() = %v, want %v", got, tc.wantFlapping)
			}

			// Snapshot the bookkeeping that a fresh detection should leave behind.
			m.mu.RLock()
			marked := m.flappingActive[alertID]
			_, suppressed := m.suppressedUntil[alertID]
			m.mu.RUnlock()

			if !tc.wantNewlyMarked {
				return
			}
			if !marked {
				t.Errorf("expected flappingActive[%s] to be true", alertID)
			}
			if !suppressed {
				t.Errorf("expected suppressedUntil[%s] to be set", alertID)
			}
		})
	}
}
// TestCheckFlappingAlreadyFlapping covers an alert that is already marked as
// flapping with a suppression deadline in place. A further check that reaches
// the threshold again must still report flapping, but must leave the existing
// suppressedUntil timestamp untouched.
func TestCheckFlappingAlreadyFlapping(t *testing.T) {
	const alertID = "already-flapping-alert"

	m := newTestManager(t)

	m.mu.Lock()
	m.config.FlappingEnabled = true
	m.config.FlappingThreshold = 3
	m.config.FlappingWindowSeconds = 300
	m.config.FlappingCooldownMinutes = 15

	// Existing flapping state with ten minutes of suppression left.
	m.flappingActive[alertID] = true
	originalDeadline := time.Now().Add(10 * time.Minute)
	m.suppressedUntil[alertID] = originalDeadline

	// Two recent changes; the call under test brings the count to the threshold.
	seedBase := time.Now()
	m.flappingHistory[alertID] = []time.Time{
		seedBase.Add(-10 * time.Second),
		seedBase.Add(-5 * time.Second),
	}
	m.mu.Unlock()

	m.mu.Lock()
	flapping := m.checkFlappingLocked(alertID)
	m.mu.Unlock()

	if !flapping {
		t.Errorf("checkFlappingLocked() = false, want true for already flapping alert")
	}

	// The suppression deadline must be exactly what was set above.
	m.mu.RLock()
	deadlineNow := m.suppressedUntil[alertID]
	m.mu.RUnlock()

	if unchanged := deadlineNow.Equal(originalDeadline); !unchanged {
		t.Errorf("suppressedUntil was updated from %v to %v; should remain unchanged for already-flapping alert",
			originalDeadline, deadlineNow)
	}
}
// TestCheckFlappingWindowExpiry seeds four state changes that are all older
// than the 60-second flapping window and expects checkFlappingLocked to
// discard them: the result must be false and the history must contain a
// single entry afterwards (the one added by the call itself).
func TestCheckFlappingWindowExpiry(t *testing.T) {
	const alertID = "window-expiry-alert"

	m := newTestManager(t)

	m.mu.Lock()
	m.config.FlappingEnabled = true
	m.config.FlappingThreshold = 3
	m.config.FlappingWindowSeconds = 60 // one-minute window

	// Entries 5, 4, 3 and 2 minutes old: every one lies outside the window.
	seedBase := time.Now()
	var stale []time.Time
	for _, minutesAgo := range []int{5, 4, 3, 2} {
		stale = append(stale, seedBase.Add(-time.Duration(minutesAgo)*time.Minute))
	}
	m.flappingHistory[alertID] = stale
	m.mu.Unlock()

	// Run the check and read the resulting history length in one critical section.
	m.mu.Lock()
	flapping := m.checkFlappingLocked(alertID)
	kept := len(m.flappingHistory[alertID])
	m.mu.Unlock()

	if flapping {
		t.Errorf("checkFlappingLocked() = true, want false (old entries should be pruned)")
	}
	if kept != 1 {
		t.Errorf("history length = %d, want 1 (old entries should be pruned)", kept)
	}
}
// TestGetGlobalMetricTimeThreshold tabulates the expected results of
// getGlobalMetricTimeThreshold, which is exercised here purely through the
// "all" entry of MetricTimeThresholds. The cases cover missing or empty
// configuration, blank metric names, case and whitespace tolerance, and the
// "default", "_default" and "*" fallback keys.
func TestGetGlobalMetricTimeThreshold(t *testing.T) {
	// thresholds is shorthand for the nested resource-type -> metric -> delay map.
	type thresholds = map[string]map[string]int

	cases := []struct {
		name       string
		configured thresholds
		metricType string
		wantDelay  int
		wantFound  bool
	}{
		// Nothing usable configured.
		{"empty MetricTimeThresholds returns false", nil, "cpu", 0, false},
		{"no all key returns false", thresholds{"specific": {"cpu": 60}}, "cpu", 0, false},
		{"empty all map returns false", thresholds{"all": {}}, "cpu", 0, false},

		// Blank metric names.
		{"empty metricType returns false", thresholds{"all": {"cpu": 60}}, "", 0, false},
		{"whitespace metricType returns false", thresholds{"all": {"cpu": 60}}, " ", 0, false},

		// Direct matches, tolerant of case and surrounding whitespace.
		{"direct metric match", thresholds{"all": {"cpu": 120, "memory": 90}}, "cpu", 120, true},
		{"metric match case insensitive", thresholds{"all": {"cpu": 120}}, "CPU", 120, true},
		{"metric match with whitespace", thresholds{"all": {"cpu": 120}}, " cpu ", 120, true},

		// Fallback keys for metrics without their own entry.
		{"default fallback", thresholds{"all": {"default": 30}}, "unknown", 30, true},
		{"_default fallback", thresholds{"all": {"_default": 45}}, "unknown", 45, true},
		{"wildcard fallback", thresholds{"all": {"*": 15}}, "unknown", 15, true},

		// Precedence and the no-match case.
		{"direct match takes precedence over default", thresholds{"all": {"cpu": 120, "default": 30}}, "cpu", 120, true},
		{"no match and no fallback returns false", thresholds{"all": {"cpu": 120, "memory": 90}}, "disk", 0, false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			m := newTestManager(t)

			m.mu.Lock()
			m.config.MetricTimeThresholds = tc.configured
			m.mu.Unlock()

			// The lookup is performed with the read lock held.
			m.mu.RLock()
			delay, found := m.getGlobalMetricTimeThreshold(tc.metricType)
			m.mu.RUnlock()

			if delay != tc.wantDelay {
				t.Errorf("getGlobalMetricTimeThreshold() delay = %d, want %d", delay, tc.wantDelay)
			}
			if found != tc.wantFound {
				t.Errorf("getGlobalMetricTimeThreshold() found = %v, want %v", found, tc.wantFound)
			}
		})
	}
}
// TestGetBaseTimeThreshold tabulates the expected results of
// getBaseTimeThreshold for combinations of the per-resource-type
// TimeThresholds map and the global TimeThreshold. Per the expectations below,
// "vm" and "container" resolve through the "guest" key, and both the "all"
// entry and the global value are returned with found=false.
func TestGetBaseTimeThreshold(t *testing.T) {
	cases := []struct {
		name         string
		perType      map[string]int
		global       int // value of the global TimeThreshold fallback
		resourceType string
		wantDelay    int
		wantFound    bool
	}{
		{"nil TimeThresholds returns global TimeThreshold", nil, 60, "guest", 60, false},
		{"direct resource type match", map[string]int{"guest": 120, "node": 90}, 60, "guest", 120, true},
		{"canonical key match for vm", map[string]int{"guest": 120}, 60, "vm", 120, true},
		{"canonical key match for container", map[string]int{"guest": 120}, 60, "container", 120, true},
		// The "all" entry supplies the delay but is reported as not found.
		{"all fallback when no specific match", map[string]int{"all": 45}, 60, "storage", 45, false},
		{"specific match takes precedence over all", map[string]int{"storage": 30, "all": 45}, 60, "storage", 30, true},
		{"no match and no all returns global threshold", map[string]int{"guest": 120}, 60, "storage", 60, false},
		{"empty TimeThresholds returns global threshold", map[string]int{}, 60, "guest", 60, false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			m := newTestManager(t)

			m.mu.Lock()
			m.config.TimeThresholds = tc.perType
			m.config.TimeThreshold = tc.global
			m.mu.Unlock()

			// The lookup is performed with the read lock held.
			m.mu.RLock()
			delay, found := m.getBaseTimeThreshold(tc.resourceType)
			m.mu.RUnlock()

			if delay != tc.wantDelay {
				t.Errorf("getBaseTimeThreshold() delay = %d, want %d", delay, tc.wantDelay)
			}
			if found != tc.wantFound {
				t.Errorf("getBaseTimeThreshold() found = %v, want %v", found, tc.wantFound)
			}
		})
	}
}
// TestGetMetricTimeThreshold tabulates the expected results of
// getMetricTimeThreshold for a (resourceType, metricType) pair. The cases
// cover missing configuration, blank metric names, "vm"/"container" resolving
// through the "guest" key, the "default", "_default" and "*" fallback keys
// inside a resource type, and case-insensitive metric names.
func TestGetMetricTimeThreshold(t *testing.T) {
	// thresholds is shorthand for the nested resource-type -> metric -> delay map.
	type thresholds = map[string]map[string]int

	cases := []struct {
		name         string
		configured   thresholds
		resourceType string
		metricType   string
		wantDelay    int
		wantFound    bool
	}{
		// Nothing usable configured or asked for.
		{"empty MetricTimeThresholds returns false", nil, "guest", "cpu", 0, false},
		{"empty metricType returns false", thresholds{"guest": {"cpu": 60}}, "guest", "", 0, false},
		{"whitespace metricType returns false", thresholds{"guest": {"cpu": 60}}, "guest", " ", 0, false},

		// Direct and canonical-key matches.
		{"direct match on resourceType and metricType", thresholds{"guest": {"cpu": 120, "memory": 90}}, "guest", "cpu", 120, true},
		{"canonical key match vm to guest", thresholds{"guest": {"cpu": 120}}, "vm", "cpu", 120, true},
		{"canonical key match container to guest", thresholds{"guest": {"memory": 90}}, "container", "memory", 90, true},

		// Fallback keys within the resource type.
		{"default fallback within resourceType", thresholds{"guest": {"default": 30}}, "guest", "unknown", 30, true},
		{"_default fallback within resourceType", thresholds{"guest": {"_default": 45}}, "guest", "unknown", 45, true},
		{"wildcard fallback within resourceType", thresholds{"guest": {"*": 15}}, "guest", "unknown", 15, true},

		// Precedence, misses, and name normalisation.
		{"direct match takes precedence over default", thresholds{"guest": {"cpu": 120, "default": 30}}, "guest", "cpu", 120, true},
		{"no match for resourceType returns false", thresholds{"node": {"cpu": 60}}, "guest", "cpu", 0, false},
		{"empty perType map skipped", thresholds{"guest": {}}, "guest", "cpu", 0, false},
		{"metricType case insensitive", thresholds{"guest": {"cpu": 120}}, "guest", "CPU", 120, true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			m := newTestManager(t)

			m.mu.Lock()
			m.config.MetricTimeThresholds = tc.configured
			m.mu.Unlock()

			// The lookup is performed with the read lock held.
			m.mu.RLock()
			delay, found := m.getMetricTimeThreshold(tc.resourceType, tc.metricType)
			m.mu.RUnlock()

			if delay != tc.wantDelay {
				t.Errorf("getMetricTimeThreshold() delay = %d, want %d", delay, tc.wantDelay)
			}
			if found != tc.wantFound {
				t.Errorf("getMetricTimeThreshold() found = %v, want %v", found, tc.wantFound)
			}
		})
	}
}
func TestCheckRateLimit(t *testing.T) {
// t.Parallel()
t.Run("no rate limit when MaxAlertsHour is zero", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Schedule.MaxAlertsHour = 0
m.mu.Unlock()
m.mu.Lock()
result := m.checkRateLimit("test-alert")
m.mu.Unlock()
if !result {
t.Errorf("checkRateLimit() = false, want true when MaxAlertsHour is 0")
}
})
t.Run("no rate limit when MaxAlertsHour is negative", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Schedule.MaxAlertsHour = -1
m.mu.Unlock()
m.mu.Lock()
result := m.checkRateLimit("test-alert")
m.mu.Unlock()
if !result {
t.Errorf("checkRateLimit() = false, want true when MaxAlertsHour is negative")
}
})
t.Run("allows alerts under rate limit", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Schedule.MaxAlertsHour = 5
m.mu.Unlock()
// First 5 alerts should be allowed
for i := 0; i < 5; i++ {
m.mu.Lock()
result := m.checkRateLimit("test-alert")
m.mu.Unlock()
if !result {
t.Errorf("checkRateLimit() call %d = false, want true (under limit)", i+1)
}
}
})
t.Run("blocks alerts at rate limit", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Schedule.MaxAlertsHour = 3
m.mu.Unlock()
// Use up the rate limit
for i := 0; i < 3; i++ {
m.mu.Lock()
_ = m.checkRateLimit("test-alert")
m.mu.Unlock()
}
// Fourth alert should be blocked
m.mu.Lock()
result := m.checkRateLimit("test-alert")
m.mu.Unlock()
if result {
t.Errorf("checkRateLimit() = true, want false (at rate limit)")
}
})
t.Run("different alert IDs have separate limits", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Schedule.MaxAlertsHour = 2
m.mu.Unlock()
// Use up limit for alert-1
for i := 0; i < 2; i++ {
m.mu.Lock()
_ = m.checkRateLimit("alert-1")
m.mu.Unlock()
}
// alert-2 should still be allowed
m.mu.Lock()
result := m.checkRateLimit("alert-2")
m.mu.Unlock()
if !result {
t.Errorf("checkRateLimit(alert-2) = false, want true (separate limit)")
}
})
t.Run("old entries are cleaned up", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Schedule.MaxAlertsHour = 2
// Pre-populate with old entries (more than 1 hour ago)
oldTime := time.Now().Add(-2 * time.Hour)
m.alertRateLimit["test-alert"] = []time.Time{oldTime, oldTime}
m.mu.Unlock()
// Should be allowed because old entries are cleaned up
m.mu.Lock()
result := m.checkRateLimit("test-alert")
m.mu.Unlock()
if !result {
t.Errorf("checkRateLimit() = false, want true (old entries should be cleaned)")
}
})
t.Run("mixed old and recent entries", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Schedule.MaxAlertsHour = 2
// Pre-populate with 1 old and 1 recent entry
oldTime := time.Now().Add(-2 * time.Hour)
recentTime := time.Now().Add(-30 * time.Minute)
m.alertRateLimit["test-alert"] = []time.Time{oldTime, recentTime}
m.mu.Unlock()
// First call should be allowed (1 recent + 1 new = 2)
m.mu.Lock()
result1 := m.checkRateLimit("test-alert")
m.mu.Unlock()
if !result1 {
t.Errorf("checkRateLimit() call 1 = false, want true")
}
// Second call should be blocked (2 recent + 1 new would exceed 2)
m.mu.Lock()
result2 := m.checkRateLimit("test-alert")
m.mu.Unlock()
if result2 {
t.Errorf("checkRateLimit() call 2 = true, want false (at limit)")
}
})
}
func TestApplyRelaxedGuestThresholds(t *testing.T) {
// t.Parallel()
t.Run("nil thresholds get defaults", func(t *testing.T) {
// t.Parallel()
cfg := ThresholdConfig{
CPU: nil,
Memory: nil,
Disk: nil,
}
result := applyRelaxedGuestThresholds(cfg)
if result.CPU == nil {
t.Fatal("expected CPU threshold to be set")
}
if result.CPU.Trigger != 95 {
t.Errorf("CPU.Trigger = %v, want 95", result.CPU.Trigger)
}
if result.CPU.Clear != 90 {
t.Errorf("CPU.Clear = %v, want 90", result.CPU.Clear)
}
if result.Memory == nil {
t.Fatal("expected Memory threshold to be set")
}
if result.Memory.Trigger != 92 {
t.Errorf("Memory.Trigger = %v, want 92", result.Memory.Trigger)
}
if result.Disk == nil {
t.Fatal("expected Disk threshold to be set")
}
if result.Disk.Trigger != 95 {
t.Errorf("Disk.Trigger = %v, want 95", result.Disk.Trigger)
}
})
t.Run("low thresholds raised to minimum", func(t *testing.T) {
// t.Parallel()
cfg := ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 50, Clear: 45},
Memory: &HysteresisThreshold{Trigger: 60, Clear: 55},
Disk: &HysteresisThreshold{Trigger: 70, Clear: 65},
}
result := applyRelaxedGuestThresholds(cfg)
if result.CPU.Trigger != 95 {
t.Errorf("CPU.Trigger = %v, want 95 (raised to minimum)", result.CPU.Trigger)
}
if result.Memory.Trigger != 92 {
t.Errorf("Memory.Trigger = %v, want 92 (raised to minimum)", result.Memory.Trigger)
}
if result.Disk.Trigger != 95 {
t.Errorf("Disk.Trigger = %v, want 95 (raised to minimum)", result.Disk.Trigger)
}
})
t.Run("high thresholds unchanged", func(t *testing.T) {
// t.Parallel()
cfg := ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 98, Clear: 93},
Memory: &HysteresisThreshold{Trigger: 95, Clear: 90},
Disk: &HysteresisThreshold{Trigger: 99, Clear: 94},
}
result := applyRelaxedGuestThresholds(cfg)
if result.CPU.Trigger != 98 {
t.Errorf("CPU.Trigger = %v, want 98 (unchanged)", result.CPU.Trigger)
}
if result.Memory.Trigger != 95 {
t.Errorf("Memory.Trigger = %v, want 95 (unchanged)", result.Memory.Trigger)
}
if result.Disk.Trigger != 99 {
t.Errorf("Disk.Trigger = %v, want 99 (unchanged)", result.Disk.Trigger)
}
})
t.Run("clear adjusted when too close to trigger", func(t *testing.T) {
// t.Parallel()
cfg := ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 95, Clear: 96}, // Clear >= Trigger
}
result := applyRelaxedGuestThresholds(cfg)
if result.CPU.Clear >= result.CPU.Trigger {
t.Errorf("CPU.Clear = %v should be less than Trigger = %v", result.CPU.Clear, result.CPU.Trigger)
}
if result.CPU.Clear != 90 {
t.Errorf("CPU.Clear = %v, want 90 (Trigger - 5)", result.CPU.Clear)
}
})
t.Run("clear clamped at zero when it would go negative", func(t *testing.T) {
// t.Parallel()
// Create a threshold where Trigger is above min but Clear would go negative
// The adjust function sets Clear = Trigger - 5 if Clear >= Trigger
// Then clamps to 0 if Clear < 0
// Since all triggers get raised to 95/92/95, the negative clamp path
// won't be hit in normal use. Test the logic directly with a config
// that has Trigger exactly at minimum and Clear at minimum
cfg := ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 95, Clear: 3},
}
result := applyRelaxedGuestThresholds(cfg)
// Clear at 3 is valid (less than Trigger 95), should stay at 3
if result.CPU.Trigger != 95 {
t.Errorf("CPU.Trigger = %v, want 95", result.CPU.Trigger)
}
if result.CPU.Clear != 3 {
t.Errorf("CPU.Clear = %v, want 3 (unchanged since < Trigger)", result.CPU.Clear)
}
})
t.Run("original config unchanged", func(t *testing.T) {
// t.Parallel()
original := ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 50, Clear: 45},
}
_ = applyRelaxedGuestThresholds(original)
// Original should be unchanged
if original.CPU.Trigger != 50 {
t.Errorf("original CPU.Trigger = %v, want 50 (should be unchanged)", original.CPU.Trigger)
}
})
}
func TestShouldNotifyAfterCooldown(t *testing.T) {
// t.Parallel()
t.Run("cooldown disabled allows notification", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Schedule.Cooldown = 0
m.mu.Unlock()
alert := &Alert{
ID: "test-alert",
LastNotified: nil,
}
if !m.shouldNotifyAfterCooldown(alert) {
t.Error("expected true when cooldown is 0")
}
})
t.Run("negative cooldown allows notification", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Schedule.Cooldown = -5
m.mu.Unlock()
now := time.Now()
alert := &Alert{
ID: "test-alert",
LastNotified: &now,
}
if !m.shouldNotifyAfterCooldown(alert) {
t.Error("expected true when cooldown is negative")
}
})
t.Run("first notification allowed when never notified", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Schedule.Cooldown = 30 // 30 minutes
m.mu.Unlock()
alert := &Alert{
ID: "test-alert",
LastNotified: nil,
}
if !m.shouldNotifyAfterCooldown(alert) {
t.Error("expected true when alert has never been notified")
}
})
t.Run("notification blocked during cooldown period", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Schedule.Cooldown = 30 // 30 minutes
m.mu.Unlock()
lastNotified := time.Now().Add(-10 * time.Minute) // Notified 10 minutes ago
alert := &Alert{
ID: "test-alert",
LastNotified: &lastNotified,
}
if m.shouldNotifyAfterCooldown(alert) {
t.Error("expected false when still in cooldown period")
}
})
t.Run("notification allowed after cooldown expires", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Schedule.Cooldown = 30 // 30 minutes
m.mu.Unlock()
lastNotified := time.Now().Add(-45 * time.Minute) // Notified 45 minutes ago
alert := &Alert{
ID: "test-alert",
LastNotified: &lastNotified,
}
if !m.shouldNotifyAfterCooldown(alert) {
t.Error("expected true after cooldown period expires")
}
})
t.Run("notification allowed at exact cooldown boundary", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Schedule.Cooldown = 30 // 30 minutes
m.mu.Unlock()
lastNotified := time.Now().Add(-30 * time.Minute) // Exactly 30 minutes ago
alert := &Alert{
ID: "test-alert",
LastNotified: &lastNotified,
}
if !m.shouldNotifyAfterCooldown(alert) {
t.Error("expected true at exact cooldown boundary (>=)")
}
})
}
// TestDockerServiceDisplayName is a table-driven check of
// dockerServiceDisplayName: each case supplies a service's Name and ID and
// the display string expected for that combination.
func TestDockerServiceDisplayName(t *testing.T) {
	// t.Parallel()
	type displayCase struct {
		label string
		svc   models.DockerService
		want  string
	}

	cases := []displayCase{
		{
			label: "returns name when present",
			svc:   models.DockerService{Name: "my-service", ID: "abc123456789xyz"},
			want:  "my-service",
		},
		{
			label: "returns trimmed name",
			svc:   models.DockerService{Name: " my-service ", ID: "abc123456789xyz"},
			want:  "my-service",
		},
		{
			label: "returns truncated ID when name is empty",
			svc:   models.DockerService{Name: "", ID: "abc123456789xyz"},
			want:  "abc123456789",
		},
		{
			label: "returns full short ID when less than 12 chars",
			svc:   models.DockerService{Name: "", ID: "abc123"},
			want:  "abc123",
		},
		{
			label: "returns trimmed ID",
			svc:   models.DockerService{Name: "", ID: " abc123456789xyz "},
			want:  "abc123456789",
		},
		{
			label: "returns 'service' when both name and ID empty",
			svc:   models.DockerService{Name: "", ID: ""},
			want:  "service",
		},
		{
			label: "returns 'service' when both whitespace only",
			svc:   models.DockerService{Name: " ", ID: " "},
			want:  "service",
		},
		{
			label: "prefers name over ID",
			svc:   models.DockerService{Name: "preferred", ID: "not-this-id"},
			want:  "preferred",
		},
	}

	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			// t.Parallel()
			if got := dockerServiceDisplayName(tc.svc); got != tc.want {
				t.Errorf("dockerServiceDisplayName(%+v) = %q, want %q", tc.svc, got, tc.want)
			}
		})
	}
}
// TestDockerServiceResourceID is a table-driven check of
// dockerServiceResourceID. Each row lists, in order: subtest name, host ID,
// service ID, service name, and the resource ID expected for those inputs.
func TestDockerServiceResourceID(t *testing.T) {
	// t.Parallel()
	cases := []struct {
		name, hostID, serviceID, serviceName, want string
	}{
		// Explicit service ID, with and without a usable host ID.
		{"with host and service ID", "host-1", "svc-123", "my-service", "docker:host-1/service/svc-123"},
		{"without host ID uses service prefix only", "", "svc-123", "my-service", "docker-service:svc-123"},
		{"whitespace host ID treated as empty", " ", "svc-123", "my-service", "docker-service:svc-123"},

		// Service ID absent: the identifier is derived from the service name.
		{"derives ID from service name when ID empty", "host-1", "", "My Service", "docker:host-1/service/my-service"},
		{"special chars in name replaced with dash", "host-1", "", "my/service:v1.0", "docker:host-1/service/my-service-v1-0"},
		{"backslash and colon replaced", "host-1", "", "path\\to:service", "docker:host-1/service/path-to-service"},
		{"preserves alphanumeric and underscore", "host-1", "", "my_service_123", "docker:host-1/service/my_service_123"},
		{"preserves hyphens", "host-1", "", "my-service-name", "docker:host-1/service/my-service-name"},
		{"trims leading/trailing dashes and underscores", "host-1", "", "---my-service___", "docker:host-1/service/my-service"},
		{"truncates long derived ID to 32 chars", "host-1", "", "this-is-a-very-long-service-name-that-exceeds-the-limit", "docker:host-1/service/this-is-a-very-long-service-name"},

		// Nothing usable in either field: the expected ID uses "service".
		{"uses 'service' when name is all special chars", "host-1", "", "!!!@@@###", "docker:host-1/service/service"},
		{"uses 'service' when both ID and name empty", "host-1", "", "", "docker:host-1/service/service"},
		{"uses 'service' when both ID and name whitespace", "host-1", " ", " ", "docker:host-1/service/service"},

		{"no host and derived name", "", "", "webserver", "docker-service:webserver"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// t.Parallel()
			got := dockerServiceResourceID(tc.hostID, tc.serviceID, tc.serviceName)
			if got == tc.want {
				return
			}
			t.Errorf("dockerServiceResourceID(%q, %q, %q) = %q, want %q",
				tc.hostID, tc.serviceID, tc.serviceName, got, tc.want)
		})
	}
}
// TestClearStorageOfflineAlert covers Manager.clearStorageOfflineAlert:
// an existing offline alert survives the first call and is removed (with a
// resolved callback) on the second; a call with no alert fires no callback;
// and a stray offline confirmation counter is removed even without an alert.
func TestClearStorageOfflineAlert(t *testing.T) {
	// t.Parallel()
	t.Run("clears existing offline alert", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)
		st := models.Storage{ID: "storage-1", Name: "local-lvm", Node: "pve1"}
		offlineID := fmt.Sprintf("storage-offline-%s", st.ID)

		// Seed an active offline alert together with its confirmation counter.
		mgr.mu.Lock()
		mgr.activeAlerts[offlineID] = &Alert{
			ID:        offlineID,
			Type:      "storage-offline",
			Level:     "critical",
			StartTime: time.Now().Add(-10 * time.Minute),
		}
		mgr.offlineConfirmations[st.ID] = 3
		mgr.mu.Unlock()

		// Buffered so the callback never blocks on the send.
		resolved := make(chan string, 1)
		mgr.SetResolvedCallback(func(id string) {
			resolved <- id
		})

		// First call: recovery is not yet confirmed, nothing may change.
		mgr.clearStorageOfflineAlert(st)

		mgr.mu.RLock()
		_, stillActive := mgr.activeAlerts[offlineID]
		mgr.mu.RUnlock()
		if !stillActive {
			t.Fatal("expected alert to remain active until recovery is confirmed")
		}
		select {
		case <-resolved:
			t.Fatal("expected no resolved callback before recovery is confirmed")
		default:
		}

		// Second call: the alert and its tracking state must disappear.
		mgr.clearStorageOfflineAlert(st)

		mgr.mu.RLock()
		_, hasAlert := mgr.activeAlerts[offlineID]
		_, hasConfirm := mgr.offlineConfirmations[st.ID]
		mgr.mu.RUnlock()
		if hasAlert {
			t.Error("expected alert to be cleared")
		}
		if hasConfirm {
			t.Error("expected offline confirmation to be cleared")
		}

		select {
		case got := <-resolved:
			if got != offlineID {
				t.Errorf("expected resolved callback with %q, got %q", offlineID, got)
			}
		case <-time.After(2 * time.Second):
			t.Error("expected resolved callback to be called")
		}
	})

	t.Run("noop when no alert exists", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)
		st := models.Storage{ID: "storage-2", Name: "local-zfs", Node: "pve1"}

		var fired bool
		mgr.SetResolvedCallback(func(id string) {
			fired = true
		})

		mgr.clearStorageOfflineAlert(st)

		if fired {
			t.Error("expected no callback when no alert exists")
		}
	})

	t.Run("clears offline confirmation even when no alert", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)
		st := models.Storage{ID: "storage-3", Name: "ceph-pool", Node: "pve2"}

		// Only the confirmation counter exists; there is no active alert.
		mgr.mu.Lock()
		mgr.offlineConfirmations[st.ID] = 2
		mgr.mu.Unlock()

		mgr.clearStorageOfflineAlert(st)

		mgr.mu.RLock()
		_, hasConfirm := mgr.offlineConfirmations[st.ID]
		mgr.mu.RUnlock()
		if hasConfirm {
			t.Error("expected offline confirmation to be cleared")
		}
	})
}
func TestClearHostMetricAlerts(t *testing.T) {
// t.Parallel()
t.Run("clears specified metrics", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
hostID := "my-host"
resourceID := fmt.Sprintf("host:%s", hostID)
// Create alerts for cpu and memory
m.mu.Lock()
m.activeAlerts[fmt.Sprintf("%s-cpu", resourceID)] = &Alert{ID: fmt.Sprintf("%s-cpu", resourceID)}
m.activeAlerts[fmt.Sprintf("%s-memory", resourceID)] = &Alert{ID: fmt.Sprintf("%s-memory", resourceID)}
m.activeAlerts[fmt.Sprintf("%s-disk", resourceID)] = &Alert{ID: fmt.Sprintf("%s-disk", resourceID)}
m.mu.Unlock()
m.clearHostMetricAlerts(hostID, "cpu", "disk")
m.mu.RLock()
_, cpuExists := m.activeAlerts[fmt.Sprintf("%s-cpu", resourceID)]
_, memExists := m.activeAlerts[fmt.Sprintf("%s-memory", resourceID)]
_, diskExists := m.activeAlerts[fmt.Sprintf("%s-disk", resourceID)]
m.mu.RUnlock()
if cpuExists {
t.Error("expected cpu alert to be cleared")
}
if !memExists {
t.Error("expected memory alert to remain (not specified)")
}
if diskExists {
t.Error("expected disk alert to be cleared")
}
})
t.Run("defaults to cpu and memory when no metrics specified", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
hostID := "default-host"
resourceID := fmt.Sprintf("host:%s", hostID)
// Create alerts
m.mu.Lock()
m.activeAlerts[fmt.Sprintf("%s-cpu", resourceID)] = &Alert{ID: fmt.Sprintf("%s-cpu", resourceID)}
m.activeAlerts[fmt.Sprintf("%s-memory", resourceID)] = &Alert{ID: fmt.Sprintf("%s-memory", resourceID)}
m.activeAlerts[fmt.Sprintf("%s-disk", resourceID)] = &Alert{ID: fmt.Sprintf("%s-disk", resourceID)}
m.mu.Unlock()
m.clearHostMetricAlerts(hostID) // No metrics specified
m.mu.RLock()
_, cpuExists := m.activeAlerts[fmt.Sprintf("%s-cpu", resourceID)]
_, memExists := m.activeAlerts[fmt.Sprintf("%s-memory", resourceID)]
_, diskExists := m.activeAlerts[fmt.Sprintf("%s-disk", resourceID)]
m.mu.RUnlock()
if cpuExists {
t.Error("expected cpu alert to be cleared (default)")
}
if memExists {
t.Error("expected memory alert to be cleared (default)")
}
if !diskExists {
t.Error("expected disk alert to remain (not in defaults)")
}
})
t.Run("empty hostID is noop", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Create an alert that should not be touched
m.mu.Lock()
m.activeAlerts["host:unknown-cpu"] = &Alert{ID: "host:unknown-cpu"}
m.mu.Unlock()
m.clearHostMetricAlerts("", "cpu")
m.mu.RLock()
_, exists := m.activeAlerts["host:unknown-cpu"]
m.mu.RUnlock()
if !exists {
t.Error("expected alert to remain when hostID is empty")
}
})
}
func TestClearHostDiskAlerts(t *testing.T) {
// t.Parallel()
t.Run("clears all disk alerts for host", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
hostID := "disk-host"
resourceID := fmt.Sprintf("host:%s", hostID)
// Create disk alerts with the expected ResourceID format
m.mu.Lock()
m.activeAlerts["disk1-alert"] = &Alert{
ID: "disk1-alert",
ResourceID: fmt.Sprintf("%s/disk:sda", resourceID),
}
m.activeAlerts["disk2-alert"] = &Alert{
ID: "disk2-alert",
ResourceID: fmt.Sprintf("%s/disk:sdb", resourceID),
}
m.activeAlerts["cpu-alert"] = &Alert{
ID: "cpu-alert",
ResourceID: fmt.Sprintf("%s-cpu", resourceID),
}
m.mu.Unlock()
m.clearHostDiskAlerts(hostID)
m.mu.RLock()
_, disk1Exists := m.activeAlerts["disk1-alert"]
_, disk2Exists := m.activeAlerts["disk2-alert"]
_, cpuExists := m.activeAlerts["cpu-alert"]
m.mu.RUnlock()
if disk1Exists {
t.Error("expected disk1 alert to be cleared")
}
if disk2Exists {
t.Error("expected disk2 alert to be cleared")
}
if !cpuExists {
t.Error("expected cpu alert to remain (not a disk alert)")
}
})
t.Run("empty hostID is noop", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Create an alert
m.mu.Lock()
m.activeAlerts["disk-alert"] = &Alert{
ID: "disk-alert",
ResourceID: "host:unknown/disk:sda",
}
m.mu.Unlock()
m.clearHostDiskAlerts("")
m.mu.RLock()
_, exists := m.activeAlerts["disk-alert"]
m.mu.RUnlock()
if !exists {
t.Error("expected alert to remain when hostID is empty")
}
})
t.Run("skips nil alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
hostID := "nil-test"
resourceID := fmt.Sprintf("host:%s", hostID)
m.mu.Lock()
m.activeAlerts["nil-alert"] = nil
m.activeAlerts["real-alert"] = &Alert{
ID: "real-alert",
ResourceID: fmt.Sprintf("%s/disk:sda", resourceID),
}
m.mu.Unlock()
// Should not panic
m.clearHostDiskAlerts(hostID)
m.mu.RLock()
_, realExists := m.activeAlerts["real-alert"]
m.mu.RUnlock()
if realExists {
t.Error("expected real alert to be cleared")
}
})
t.Run("noop when no matching alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["other-alert"] = &Alert{
ID: "other-alert",
ResourceID: "host:other-host/disk:sda",
}
m.mu.Unlock()
m.clearHostDiskAlerts("my-host")
m.mu.RLock()
_, exists := m.activeAlerts["other-alert"]
m.mu.RUnlock()
if !exists {
t.Error("expected other host's alert to remain")
}
})
}
func TestCleanupHostDiskAlerts(t *testing.T) {
// t.Parallel()
t.Run("clears alerts not in seen set", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
host := models.Host{ID: "host-1"}
resourceID := fmt.Sprintf("host:%s", host.ID)
// Create disk alerts
m.mu.Lock()
m.activeAlerts["disk-sda"] = &Alert{
ID: "disk-sda",
ResourceID: fmt.Sprintf("%s/disk:sda", resourceID),
}
m.activeAlerts["disk-sdb"] = &Alert{
ID: "disk-sdb",
ResourceID: fmt.Sprintf("%s/disk:sdb", resourceID),
}
m.activeAlerts["disk-sdc"] = &Alert{
ID: "disk-sdc",
ResourceID: fmt.Sprintf("%s/disk:sdc", resourceID),
}
m.mu.Unlock()
// Only sda and sdb are in the seen set
seen := map[string]struct{}{
fmt.Sprintf("%s/disk:sda", resourceID): {},
fmt.Sprintf("%s/disk:sdb", resourceID): {},
}
m.cleanupHostDiskAlerts(host, seen)
m.mu.RLock()
_, sdaExists := m.activeAlerts["disk-sda"]
_, sdbExists := m.activeAlerts["disk-sdb"]
_, sdcExists := m.activeAlerts["disk-sdc"]
m.mu.RUnlock()
if !sdaExists {
t.Error("expected sda alert to remain (in seen set)")
}
if !sdbExists {
t.Error("expected sdb alert to remain (in seen set)")
}
if sdcExists {
t.Error("expected sdc alert to be cleared (not in seen set)")
}
})
t.Run("empty host ID is noop", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["disk-alert"] = &Alert{
ID: "disk-alert",
ResourceID: "host:unknown/disk:sda",
}
m.mu.Unlock()
host := models.Host{ID: ""}
m.cleanupHostDiskAlerts(host, nil)
m.mu.RLock()
_, exists := m.activeAlerts["disk-alert"]
m.mu.RUnlock()
if !exists {
t.Error("expected alert to remain when host ID is empty")
}
})
t.Run("skips nil alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
host := models.Host{ID: "host-2"}
resourceID := fmt.Sprintf("host:%s", host.ID)
m.mu.Lock()
m.activeAlerts["nil-alert"] = nil
m.activeAlerts["real-alert"] = &Alert{
ID: "real-alert",
ResourceID: fmt.Sprintf("%s/disk:sda", resourceID),
}
m.mu.Unlock()
seen := map[string]struct{}{} // Empty seen set
// Should not panic
m.cleanupHostDiskAlerts(host, seen)
m.mu.RLock()
_, realExists := m.activeAlerts["real-alert"]
m.mu.RUnlock()
if realExists {
t.Error("expected real alert to be cleared (not in seen set)")
}
})
t.Run("skips non-matching prefix", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
host := models.Host{ID: "host-3"}
resourceID := fmt.Sprintf("host:%s", host.ID)
m.mu.Lock()
m.activeAlerts["cpu-alert"] = &Alert{
ID: "cpu-alert",
ResourceID: fmt.Sprintf("%s-cpu", resourceID), // Not a disk alert
}
m.activeAlerts["disk-alert"] = &Alert{
ID: "disk-alert",
ResourceID: fmt.Sprintf("%s/disk:sda", resourceID),
}
m.mu.Unlock()
seen := map[string]struct{}{} // Empty seen set
m.cleanupHostDiskAlerts(host, seen)
m.mu.RLock()
_, cpuExists := m.activeAlerts["cpu-alert"]
_, diskExists := m.activeAlerts["disk-alert"]
m.mu.RUnlock()
if !cpuExists {
t.Error("expected cpu alert to remain (not a disk alert)")
}
if diskExists {
t.Error("expected disk alert to be cleared")
}
})
}
// TestHandleDockerHostRemovedEmptyID verifies that
// Manager.HandleDockerHostRemoved leaves another host's offline alert and
// offline counter in place when the removed host has an empty ID.
func TestHandleDockerHostRemovedEmptyID(t *testing.T) {
	// t.Parallel()
	mgr := newTestManager(t)

	const otherAlertID = "docker-host-offline-host1"

	// State belonging to a different host; the call below must not touch it.
	mgr.mu.Lock()
	mgr.activeAlerts[otherAlertID] = &Alert{ID: otherAlertID}
	mgr.dockerOfflineCount["host1"] = 3
	mgr.mu.Unlock()

	mgr.HandleDockerHostRemoved(models.DockerHost{ID: ""})

	mgr.mu.RLock()
	_, hasAlert := mgr.activeAlerts[otherAlertID]
	_, hasCount := mgr.dockerOfflineCount["host1"]
	mgr.mu.RUnlock()

	if !hasAlert {
		t.Error("expected alert to remain when host ID is empty")
	}
	if !hasCount {
		t.Error("expected offline count to remain when host ID is empty")
	}
}
// TestHandleDockerHostOnline covers Manager.HandleDockerHostOnline: the
// host's offline alert and offline counter are removed, the counter is
// removed even when no alert exists, and an empty host ID changes nothing.
func TestHandleDockerHostOnline(t *testing.T) {
	// t.Parallel()
	t.Run("clears offline alert and tracking", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)
		host := models.DockerHost{ID: "docker-host-1", DisplayName: "My Host"}
		offlineID := fmt.Sprintf("docker-host-offline-%s", host.ID)

		// Seed both the offline alert and the offline counter.
		mgr.mu.Lock()
		mgr.activeAlerts[offlineID] = &Alert{
			ID:         offlineID,
			ResourceID: fmt.Sprintf("docker:%s", host.ID),
		}
		mgr.dockerOfflineCount[host.ID] = 5
		mgr.mu.Unlock()

		mgr.HandleDockerHostOnline(host)

		mgr.mu.RLock()
		_, hasAlert := mgr.activeAlerts[offlineID]
		_, hasCount := mgr.dockerOfflineCount[host.ID]
		mgr.mu.RUnlock()

		if hasAlert {
			t.Error("expected offline alert to be cleared")
		}
		if hasCount {
			t.Error("expected offline count to be cleared")
		}
	})

	t.Run("noop when no offline alert exists", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)
		host := models.DockerHost{ID: "docker-host-2"}

		// Counter only; no alert was ever raised for this host.
		mgr.mu.Lock()
		mgr.dockerOfflineCount[host.ID] = 2
		mgr.mu.Unlock()

		mgr.HandleDockerHostOnline(host)

		mgr.mu.RLock()
		_, hasCount := mgr.dockerOfflineCount[host.ID]
		mgr.mu.RUnlock()

		if hasCount {
			t.Error("expected offline count to be cleared even without alert")
		}
	})

	t.Run("empty host ID is noop", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)

		const otherAlertID = "docker-host-offline-other"

		// State for an unrelated host that must survive the call.
		mgr.mu.Lock()
		mgr.activeAlerts[otherAlertID] = &Alert{ID: otherAlertID}
		mgr.dockerOfflineCount["other"] = 3
		mgr.mu.Unlock()

		mgr.HandleDockerHostOnline(models.DockerHost{ID: ""})

		mgr.mu.RLock()
		_, hasAlert := mgr.activeAlerts[otherAlertID]
		_, hasCount := mgr.dockerOfflineCount["other"]
		mgr.mu.RUnlock()

		if !hasAlert {
			t.Error("expected other alert to remain when host ID is empty")
		}
		if !hasCount {
			t.Error("expected other count to remain when host ID is empty")
		}
	})
}
// TestCleanupDockerContainerAlerts covers Manager.cleanupDockerContainerAlerts:
// container alerts and state-confirmation counters missing from the seen set
// are removed, entries in the seen set stay, and alerts of other hosts are
// left alone.
func TestCleanupDockerContainerAlerts(t *testing.T) {
	// t.Parallel()
	t.Run("clears alerts not in seen set", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)
		host := models.DockerHost{ID: "docker-host-1"}
		prefix := fmt.Sprintf("docker:%s/", host.ID)
		res1, res2, res3 := prefix+"container1", prefix+"container2", prefix+"container3"

		// Three containers, each with an alert and a state counter.
		mgr.mu.Lock()
		mgr.activeAlerts["container1-alert"] = &Alert{ID: "container1-alert", ResourceID: res1}
		mgr.activeAlerts["container2-alert"] = &Alert{ID: "container2-alert", ResourceID: res2}
		mgr.activeAlerts["container3-alert"] = &Alert{ID: "container3-alert", ResourceID: res3}
		mgr.dockerStateConfirm[res1] = 2
		mgr.dockerStateConfirm[res2] = 1
		mgr.dockerStateConfirm[res3] = 3
		mgr.mu.Unlock()

		// container3 is deliberately left out of the seen set.
		seen := map[string]struct{}{
			res1: {},
			res2: {},
		}
		mgr.cleanupDockerContainerAlerts(host, seen)

		mgr.mu.RLock()
		_, alert1 := mgr.activeAlerts["container1-alert"]
		_, alert2 := mgr.activeAlerts["container2-alert"]
		_, alert3 := mgr.activeAlerts["container3-alert"]
		_, state1 := mgr.dockerStateConfirm[res1]
		_, state2 := mgr.dockerStateConfirm[res2]
		_, state3 := mgr.dockerStateConfirm[res3]
		mgr.mu.RUnlock()

		if !alert1 {
			t.Error("expected container1 alert to remain (in seen set)")
		}
		if !alert2 {
			t.Error("expected container2 alert to remain (in seen set)")
		}
		if alert3 {
			t.Error("expected container3 alert to be cleared (not in seen set)")
		}
		if !state1 {
			t.Error("expected container1 state confirm to remain (in seen set)")
		}
		if !state2 {
			t.Error("expected container2 state confirm to remain (in seen set)")
		}
		if state3 {
			t.Error("expected container3 state confirm to be cleared (not in seen set)")
		}
	})

	t.Run("skips alerts from other hosts", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)
		host := models.DockerHost{ID: "host-a"}

		// This alert's resource ID names host-b, not host-a.
		mgr.mu.Lock()
		mgr.activeAlerts["other-host-alert"] = &Alert{
			ID:         "other-host-alert",
			ResourceID: "docker:host-b/container1",
		}
		mgr.mu.Unlock()

		mgr.cleanupDockerContainerAlerts(host, map[string]struct{}{})

		mgr.mu.RLock()
		_, kept := mgr.activeAlerts["other-host-alert"]
		mgr.mu.RUnlock()

		if !kept {
			t.Error("expected other host's alert to remain")
		}
	})

	t.Run("handles empty seen set", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)
		host := models.DockerHost{ID: "host-c"}
		prefix := fmt.Sprintf("docker:%s/", host.ID)
		res := prefix + "container1"

		mgr.mu.Lock()
		mgr.activeAlerts["to-clear"] = &Alert{ID: "to-clear", ResourceID: res}
		mgr.dockerStateConfirm[res] = 1
		mgr.mu.Unlock()

		mgr.cleanupDockerContainerAlerts(host, map[string]struct{}{})

		mgr.mu.RLock()
		_, hasAlert := mgr.activeAlerts["to-clear"]
		_, hasState := mgr.dockerStateConfirm[res]
		mgr.mu.RUnlock()

		if hasAlert {
			t.Error("expected alert to be cleared with empty seen set")
		}
		if hasState {
			t.Error("expected state confirm to be cleared with empty seen set")
		}
	})
}
// TestSafeCallEscalateCallback covers Manager.safeCallEscalateCallback: the
// registered callback receives the alert and level, a missing callback is a
// no-op, a panicking callback does not take down the caller, and the
// callback is handed a copy of the alert rather than the caller's pointer.
func TestSafeCallEscalateCallback(t *testing.T) {
	// t.Parallel()
	t.Run("calls callback with alert and level", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		var receivedAlert *Alert
		var receivedLevel int
		// Closing done publishes the writes above to the test goroutine.
		done := make(chan struct{})
		m.SetEscalateCallback(func(alert *Alert, level int) {
			receivedAlert = alert
			receivedLevel = level
			close(done)
		})

		alert := &Alert{
			ID:           "test-alert",
			Type:         "test",
			ResourceName: "resource-1",
		}
		m.safeCallEscalateCallback(alert, 2)

		select {
		case <-done:
			if receivedAlert == nil {
				t.Fatal("expected alert to be received")
			}
			if receivedAlert.ID != "test-alert" {
				t.Errorf("expected alert ID 'test-alert', got %q", receivedAlert.ID)
			}
			if receivedLevel != 2 {
				t.Errorf("expected level 2, got %d", receivedLevel)
			}
		case <-time.After(1 * time.Second):
			t.Fatal("callback not called within timeout")
		}
	})

	t.Run("noop when callback is nil", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// No callback is registered; the call must simply not panic.
		alert := &Alert{ID: "test-alert"}
		m.safeCallEscalateCallback(alert, 1)
	})

	t.Run("recovers from panic in callback", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		done := make(chan struct{})
		m.SetEscalateCallback(func(alert *Alert, level int) {
			// Signal completion even though the callback panics.
			defer close(done)
			panic("test panic")
		})

		alert := &Alert{ID: "panic-test"}
		// The panic must not propagate to the caller.
		m.safeCallEscalateCallback(alert, 1)

		select {
		case <-done:
			// Callback ran (and panicked, but recovered)
		case <-time.After(1 * time.Second):
			t.Fatal("callback not called within timeout")
		}
	})

	t.Run("clones alert to prevent modification", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		var receivedAlert *Alert
		done := make(chan struct{})
		m.SetEscalateCallback(func(alert *Alert, level int) {
			receivedAlert = alert
			close(done)
		})

		original := &Alert{
			ID:           "original-alert",
			ResourceName: "original-resource",
		}
		m.safeCallEscalateCallback(original, 1)

		select {
		case <-done:
			if receivedAlert == nil {
				t.Fatal("expected alert to be received")
			}
			// A clone must be a distinct object, not the caller's pointer.
			if receivedAlert == original {
				t.Error("expected callback to receive a clone, got the original pointer")
			}

			// Mutate the original after the callback has run; a true clone
			// is unaffected by this write.
			original.ResourceName = "modified"

			if receivedAlert.ID != "original-alert" {
				t.Errorf("expected cloned alert ID")
			}
			if receivedAlert.ResourceName != "original-resource" {
				t.Errorf("expected cloned ResourceName 'original-resource', got %q", receivedAlert.ResourceName)
			}
		case <-time.After(1 * time.Second):
			t.Fatal("callback not called within timeout")
		}
	})
}
// TestSafeCallResolvedCallback covers Manager.safeCallResolvedCallback with
// its second argument set to false (synchronous) and true (asynchronous):
// the callback receives the alert ID, a missing callback is a no-op, and a
// panicking callback does not take down the caller in either mode.
func TestSafeCallResolvedCallback(t *testing.T) {
	// t.Parallel()
	t.Run("calls callback with alert ID synchronously", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)

		var got string
		mgr.SetResolvedCallback(func(alertID string) {
			got = alertID
		})

		mgr.safeCallResolvedCallback("test-alert-123", false)

		// The test reads the result immediately, without waiting.
		if got != "test-alert-123" {
			t.Errorf("expected alert ID 'test-alert-123', got %q", got)
		}
	})

	t.Run("calls callback asynchronously", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)

		var got string
		finished := make(chan struct{})
		mgr.SetResolvedCallback(func(alertID string) {
			got = alertID
			close(finished)
		})

		mgr.safeCallResolvedCallback("async-alert", true)

		select {
		case <-time.After(1 * time.Second):
			t.Fatal("async callback not called within timeout")
		case <-finished:
			if got != "async-alert" {
				t.Errorf("expected alert ID 'async-alert', got %q", got)
			}
		}
	})

	t.Run("noop when callback is nil", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)

		// No callback is registered; neither mode may panic.
		for _, async := range []bool{false, true} {
			mgr.safeCallResolvedCallback("test-alert", async)
		}
	})

	t.Run("recovers from panic in sync callback", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)

		mgr.SetResolvedCallback(func(alertID string) {
			panic("test panic")
		})

		// The panic must not propagate to the caller.
		mgr.safeCallResolvedCallback("panic-test", false)
	})

	t.Run("recovers from panic in async callback", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)

		finished := make(chan struct{})
		mgr.SetResolvedCallback(func(alertID string) {
			// Signal completion even though the callback panics.
			defer close(finished)
			panic("async panic")
		})

		mgr.safeCallResolvedCallback("async-panic", true)

		select {
		case <-time.After(1 * time.Second):
			t.Fatal("async callback not called within timeout")
		case <-finished:
			// Callback ran (and panicked, but recovered)
		}
	})
}
// TestHandleHostOnline covers Manager.HandleHostOnline: the host's offline
// alert and offline confirmation counter are removed, the counter is removed
// even when no alert exists, and an empty host ID changes nothing.
func TestHandleHostOnline(t *testing.T) {
	// t.Parallel()
	t.Run("clears offline alert and confirmation tracking", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)
		host := models.Host{ID: "host-1", Hostname: "my-host"}
		offlineID := fmt.Sprintf("host-offline-%s", host.ID)
		key := fmt.Sprintf("host:%s", host.ID)

		// Seed both the offline alert and its confirmation counter.
		mgr.mu.Lock()
		mgr.activeAlerts[offlineID] = &Alert{ID: offlineID, ResourceID: key}
		mgr.offlineConfirmations[key] = 5
		mgr.mu.Unlock()

		mgr.HandleHostOnline(host)

		mgr.mu.RLock()
		_, hasAlert := mgr.activeAlerts[offlineID]
		_, hasConfirm := mgr.offlineConfirmations[key]
		mgr.mu.RUnlock()

		if hasAlert {
			t.Error("expected offline alert to be cleared")
		}
		if hasConfirm {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("clears confirmation even without alert", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)
		host := models.Host{ID: "host-2"}
		key := fmt.Sprintf("host:%s", host.ID)

		// Counter only; no alert was ever raised for this host.
		mgr.mu.Lock()
		mgr.offlineConfirmations[key] = 2
		mgr.mu.Unlock()

		mgr.HandleHostOnline(host)

		mgr.mu.RLock()
		_, hasConfirm := mgr.offlineConfirmations[key]
		mgr.mu.RUnlock()

		if hasConfirm {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("empty host ID is noop", func(t *testing.T) {
		// t.Parallel()
		mgr := newTestManager(t)

		// State for an unrelated host that must survive the call.
		mgr.mu.Lock()
		mgr.activeAlerts["host-offline-other"] = &Alert{ID: "host-offline-other"}
		mgr.offlineConfirmations["host:other"] = 3
		mgr.mu.Unlock()

		mgr.HandleHostOnline(models.Host{ID: ""})

		mgr.mu.RLock()
		_, hasAlert := mgr.activeAlerts["host-offline-other"]
		_, hasConfirm := mgr.offlineConfirmations["host:other"]
		mgr.mu.RUnlock()

		if !hasAlert {
			t.Error("expected other alert to remain when host ID is empty")
		}
		if !hasConfirm {
			t.Error("expected other confirmation to remain when host ID is empty")
		}
	})
}
// TestAcknowledgeAlertNotFound verifies that Manager.AcknowledgeAlert returns
// an error mentioning "alert not found" for an ID with no active alert.
func TestAcknowledgeAlertNotFound(t *testing.T) {
	// t.Parallel()
	mgr := newTestManager(t)

	ackErr := mgr.AcknowledgeAlert("nonexistent-alert", "user1")
	switch {
	case ackErr == nil:
		t.Fatal("expected error when acknowledging nonexistent alert")
	case !strings.Contains(ackErr.Error(), "alert not found"):
		t.Errorf("expected 'alert not found' error, got: %v", ackErr)
	}
}
// TestUnacknowledgeAlertNotFound verifies that Manager.UnacknowledgeAlert
// returns an error mentioning "alert not found" for an ID with no active
// alert.
func TestUnacknowledgeAlertNotFound(t *testing.T) {
	// t.Parallel()
	mgr := newTestManager(t)

	unackErr := mgr.UnacknowledgeAlert("nonexistent-alert")
	switch {
	case unackErr == nil:
		t.Fatal("expected error when unacknowledging nonexistent alert")
	case !strings.Contains(unackErr.Error(), "alert not found"):
		t.Errorf("expected 'alert not found' error, got: %v", unackErr)
	}
}
// TestUnacknowledgeAlertSuccess seeds an acknowledged alert together with its
// ackState record, calls Manager.UnacknowledgeAlert, and verifies that the
// alert's acknowledgement fields are reset and the ackState record is gone.
//
// All direct access to the manager's maps happens under m.mu, matching the
// other tests in this file, so the test does not race with any goroutine
// owned by the manager.
func TestUnacknowledgeAlertSuccess(t *testing.T) {
	// t.Parallel()
	m := newTestManager(t)

	// Seed an alert that is already acknowledged by user1.
	alertID := "test-alert-123"
	now := time.Now()
	m.mu.Lock()
	m.activeAlerts[alertID] = &Alert{
		ID:           alertID,
		Acknowledged: true,
		AckTime:      &now,
		AckUser:      "user1",
	}
	m.ackState[alertID] = ackRecord{acknowledged: true, user: "user1", time: now}
	m.mu.Unlock()

	// The lock is released before this call; the tests in this file never
	// hold m.mu across a Manager method call.
	if err := m.UnacknowledgeAlert(alertID); err != nil {
		t.Fatalf("unexpected error unacknowledging alert: %v", err)
	}

	m.mu.RLock()
	defer m.mu.RUnlock()

	// Guard the lookup so a missing entry fails the test instead of
	// panicking on a nil dereference.
	alert := m.activeAlerts[alertID]
	if alert == nil {
		t.Fatal("expected alert to remain active after unacknowledging")
	}

	// The acknowledgement fields on the alert must be reset.
	if alert.Acknowledged {
		t.Error("expected Acknowledged to be false")
	}
	if alert.AckTime != nil {
		t.Error("expected AckTime to be nil")
	}
	if alert.AckUser != "" {
		t.Errorf("expected AckUser to be empty, got: %s", alert.AckUser)
	}

	// The separate acknowledgement record must be deleted as well.
	if _, exists := m.ackState[alertID]; exists {
		t.Error("expected ackState entry to be deleted")
	}
}
// TestClearActiveAlertsEmptyMaps verifies that Manager.ClearActiveAlerts can
// be called on a freshly created manager with no active or pending alerts
// and that the active alert map is still empty afterwards.
func TestClearActiveAlertsEmptyMaps(t *testing.T) {
	// t.Parallel()
	mgr := newTestManager(t)

	// Precondition: a new manager starts with nothing active or pending.
	if n := len(mgr.activeAlerts); n != 0 {
		t.Fatalf("expected activeAlerts to be empty, got %d", n)
	}
	if n := len(mgr.pendingAlerts); n != 0 {
		t.Fatalf("expected pendingAlerts to be empty, got %d", n)
	}

	// Must complete without panicking on the empty state.
	mgr.ClearActiveAlerts()

	if n := len(mgr.activeAlerts); n != 0 {
		t.Errorf("expected activeAlerts to remain empty, got %d", n)
	}
}
// TestClearActiveAlertsWithExistingAlerts fills every tracking map the test
// can reach on the manager, calls Manager.ClearActiveAlerts, and verifies
// that each of those maps ends up empty.
func TestClearActiveAlertsWithExistingAlerts(t *testing.T) {
	// t.Parallel()
	mgr := newTestManager(t)

	// Put at least one entry into each map guarded by mgr.mu.
	mgr.mu.Lock()
	mgr.activeAlerts["test-alert-1"] = &Alert{ID: "test-alert-1", Type: "cpu-usage"}
	mgr.activeAlerts["test-alert-2"] = &Alert{ID: "test-alert-2", Type: "memory-usage"}
	mgr.pendingAlerts["pending-1"] = time.Now()
	mgr.recentAlerts["recent-1"] = &Alert{ID: "recent-1", Type: "disk-usage"}
	mgr.suppressedUntil["suppressed-1"] = time.Now().Add(time.Hour)
	mgr.alertRateLimit["rate-1"] = []time.Time{time.Now()}
	mgr.nodeOfflineCount["node-1"] = 3
	mgr.offlineConfirmations["node-1"] = 2
	mgr.dockerOfflineCount["docker-1"] = 1
	mgr.dockerStateConfirm["docker-1"] = 1
	mgr.ackState["test-alert-1"] = ackRecord{acknowledged: true, user: "testuser", time: time.Now()}
	mgr.mu.Unlock()

	// recentlyResolved is guarded by its own mutex.
	mgr.resolvedMutex.Lock()
	mgr.recentlyResolved["resolved-1"] = &ResolvedAlert{
		Alert:        &Alert{ID: "resolved-1"},
		ResolvedTime: time.Now(),
	}
	mgr.resolvedMutex.Unlock()

	mgr.ClearActiveAlerts()

	// Give goroutine time to run SaveActiveAlerts
	time.Sleep(50 * time.Millisecond)

	// Snapshot every map size under the read lock, then report afterwards.
	mgr.mu.RLock()
	sizes := []struct {
		format string
		size   int
	}{
		{"expected activeAlerts to be empty, got %d", len(mgr.activeAlerts)},
		{"expected pendingAlerts to be empty, got %d", len(mgr.pendingAlerts)},
		{"expected recentAlerts to be empty, got %d", len(mgr.recentAlerts)},
		{"expected suppressedUntil to be empty, got %d", len(mgr.suppressedUntil)},
		{"expected alertRateLimit to be empty, got %d", len(mgr.alertRateLimit)},
		{"expected nodeOfflineCount to be empty, got %d", len(mgr.nodeOfflineCount)},
		{"expected offlineConfirmations to be empty, got %d", len(mgr.offlineConfirmations)},
		{"expected dockerOfflineCount to be empty, got %d", len(mgr.dockerOfflineCount)},
		{"expected dockerStateConfirm to be empty, got %d", len(mgr.dockerStateConfirm)},
		{"expected ackState to be empty, got %d", len(mgr.ackState)},
	}
	mgr.mu.RUnlock()

	for _, entry := range sizes {
		if entry.size != 0 {
			t.Errorf(entry.format, entry.size)
		}
	}

	mgr.resolvedMutex.RLock()
	resolvedCount := len(mgr.recentlyResolved)
	mgr.resolvedMutex.RUnlock()
	if resolvedCount != 0 {
		t.Errorf("expected recentlyResolved to be empty, got %d", resolvedCount)
	}
}
func TestClearBackupAlertsLocked(t *testing.T) {
// t.Parallel()
t.Run("clears backup-age and backup-orphaned alerts only", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add a backup-age alert
m.activeAlerts["backup-alert-1"] = &Alert{
ID: "backup-alert-1",
Type: "backup-age",
}
// Add a non-backup alert
m.activeAlerts["cpu-alert-1"] = &Alert{
ID: "cpu-alert-1",
Type: "cpu",
}
// Add another backup-age alert
m.activeAlerts["backup-alert-2"] = &Alert{
ID: "backup-alert-2",
Type: "backup-age",
}
// Add a backup-orphaned alert
m.activeAlerts["backup-orphaned-1"] = &Alert{
ID: "backup-orphaned-1",
Type: "backup-orphaned",
}
if len(m.activeAlerts) != 4 {
t.Fatalf("expected 4 alerts, got %d", len(m.activeAlerts))
}
m.mu.Lock()
m.clearBackupAlertsLocked()
m.mu.Unlock()
// Should have removed backup-age and backup-orphaned alerts, keeping cpu alert
if len(m.activeAlerts) != 1 {
t.Errorf("expected 1 alert remaining, got %d", len(m.activeAlerts))
}
if _, exists := m.activeAlerts["cpu-alert-1"]; !exists {
t.Error("expected cpu-alert-1 to remain")
}
if _, exists := m.activeAlerts["backup-alert-1"]; exists {
t.Error("expected backup-alert-1 to be cleared")
}
if _, exists := m.activeAlerts["backup-alert-2"]; exists {
t.Error("expected backup-alert-2 to be cleared")
}
if _, exists := m.activeAlerts["backup-orphaned-1"]; exists {
t.Error("expected backup-orphaned-1 to be cleared")
}
})
t.Run("handles nil alert in map", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add a nil alert entry
m.activeAlerts["nil-alert"] = nil
// Add a valid backup-age alert
m.activeAlerts["backup-alert"] = &Alert{
ID: "backup-alert",
Type: "backup-age",
}
m.mu.Lock()
m.clearBackupAlertsLocked()
m.mu.Unlock()
// Should have skipped nil and removed backup-age
if len(m.activeAlerts) != 1 {
t.Errorf("expected 1 alert remaining, got %d", len(m.activeAlerts))
}
// Nil entry should remain
if _, exists := m.activeAlerts["nil-alert"]; !exists {
t.Error("expected nil-alert entry to remain (nil check should skip it)")
}
})
t.Run("empty alerts map is no-op", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.clearBackupAlertsLocked()
m.mu.Unlock()
if len(m.activeAlerts) != 0 {
t.Errorf("expected 0 alerts, got %d", len(m.activeAlerts))
}
})
}
// TestClearBackupAlerts checks the clearBackupAlerts wrapper: the test takes no
// lock itself, seeds one backup-age alert and one CPU alert, and expects only
// the CPU alert to be left afterwards.
func TestClearBackupAlerts(t *testing.T) {
	// t.Parallel() remains disabled for this test.
	mgr := newTestManager(t)

	seeds := []struct{ id, kind string }{
		{"backup-alert", "backup-age"},
		{"cpu-alert", "cpu"},
	}
	for _, s := range seeds {
		mgr.activeAlerts[s.id] = &Alert{ID: s.id, Type: s.kind}
	}

	mgr.clearBackupAlerts()

	left := len(mgr.activeAlerts)
	_, cpuKept := mgr.activeAlerts["cpu-alert"]

	if left != 1 {
		t.Errorf("expected 1 alert remaining, got %d", left)
	}
	if !cpuKept {
		t.Error("expected cpu-alert to remain")
	}
}
func TestClearSnapshotAlertsForInstanceLocked(t *testing.T) {
// t.Parallel()
t.Run("clears snapshot alerts for specific instance", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add snapshot alerts for different instances
m.activeAlerts["snap-inst1"] = &Alert{
ID: "snap-inst1",
Type: "snapshot-age",
Instance: "instance1",
}
m.activeAlerts["snap-inst2"] = &Alert{
ID: "snap-inst2",
Type: "snapshot-age",
Instance: "instance2",
}
// Add a non-snapshot alert
m.activeAlerts["cpu-alert"] = &Alert{
ID: "cpu-alert",
Type: "cpu",
}
m.mu.Lock()
m.clearSnapshotAlertsForInstanceLocked("instance1")
m.mu.Unlock()
// Should keep instance2 snapshot and cpu alert
if len(m.activeAlerts) != 2 {
t.Errorf("expected 2 alerts remaining, got %d", len(m.activeAlerts))
}
if _, exists := m.activeAlerts["snap-inst1"]; exists {
t.Error("expected snap-inst1 to be cleared")
}
if _, exists := m.activeAlerts["snap-inst2"]; !exists {
t.Error("expected snap-inst2 to remain")
}
})
t.Run("clears all snapshot alerts when instance is empty", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add snapshot alerts for different instances
m.activeAlerts["snap-inst1"] = &Alert{
ID: "snap-inst1",
Type: "snapshot-age",
Instance: "instance1",
}
m.activeAlerts["snap-inst2"] = &Alert{
ID: "snap-inst2",
Type: "snapshot-age",
Instance: "instance2",
}
// Add a non-snapshot alert
m.activeAlerts["cpu-alert"] = &Alert{
ID: "cpu-alert",
Type: "cpu",
}
m.mu.Lock()
m.clearSnapshotAlertsForInstanceLocked("")
m.mu.Unlock()
// Should keep only cpu alert
if len(m.activeAlerts) != 1 {
t.Errorf("expected 1 alert remaining, got %d", len(m.activeAlerts))
}
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
t.Error("expected cpu-alert to remain")
}
})
t.Run("handles nil alert in map", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add nil entry and valid snapshot alert
m.activeAlerts["nil-alert"] = nil
m.activeAlerts["snap-alert"] = &Alert{
ID: "snap-alert",
Type: "snapshot-age",
Instance: "inst1",
}
m.mu.Lock()
m.clearSnapshotAlertsForInstanceLocked("inst1")
m.mu.Unlock()
// Nil entry should remain, snapshot should be cleared
if len(m.activeAlerts) != 1 {
t.Errorf("expected 1 alert remaining, got %d", len(m.activeAlerts))
}
if _, exists := m.activeAlerts["nil-alert"]; !exists {
t.Error("expected nil-alert entry to remain")
}
})
}
// TestClearSnapshotAlertsForInstance checks the clearSnapshotAlertsForInstance
// wrapper: the test takes no lock itself, seeds a single snapshot-age alert for
// "instance1", and expects the active map to be empty after clearing that
// instance.
func TestClearSnapshotAlertsForInstance(t *testing.T) {
	// t.Parallel() remains disabled for this test.
	mgr := newTestManager(t)

	const alertID = "snap-alert"
	mgr.activeAlerts[alertID] = &Alert{ID: alertID, Type: "snapshot-age", Instance: "instance1"}

	mgr.clearSnapshotAlertsForInstance("instance1")

	if left := len(mgr.activeAlerts); left != 0 {
		t.Errorf("expected 0 alerts remaining, got %d", left)
	}
}
func TestApplyGlobalOfflineSettingsLocked(t *testing.T) {
// t.Parallel()
t.Run("DisableAllNodesOffline clears node offline alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add node offline alerts
m.activeAlerts["node-offline-node1"] = &Alert{ID: "node-offline-node1", Type: "offline"}
m.activeAlerts["node-offline-node2"] = &Alert{ID: "node-offline-node2", Type: "offline"}
// Add non-node alert
m.activeAlerts["cpu-alert"] = &Alert{ID: "cpu-alert", Type: "cpu"}
// Add to nodeOfflineCount
m.nodeOfflineCount["node1"] = 3
m.nodeOfflineCount["node2"] = 2
m.config.DisableAllNodesOffline = true
m.mu.Lock()
m.applyGlobalOfflineSettingsLocked()
m.mu.Unlock()
// Node alerts should be cleared
if _, exists := m.activeAlerts["node-offline-node1"]; exists {
t.Error("expected node-offline-node1 to be cleared")
}
if _, exists := m.activeAlerts["node-offline-node2"]; exists {
t.Error("expected node-offline-node2 to be cleared")
}
// Non-node alert should remain
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
t.Error("expected cpu-alert to remain")
}
// nodeOfflineCount should be reset
if len(m.nodeOfflineCount) != 0 {
t.Errorf("expected nodeOfflineCount to be empty, got %d entries", len(m.nodeOfflineCount))
}
})
t.Run("DisableAllPBSOffline clears PBS offline alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add PBS offline alerts
m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1", ResourceID: "pbs1", Type: "offline"}
// Add non-PBS alert
m.activeAlerts["cpu-alert"] = &Alert{ID: "cpu-alert", Type: "cpu"}
// Add to offlineConfirmations
m.offlineConfirmations["pbs1"] = 3
m.config.DisableAllPBSOffline = true
m.mu.Lock()
m.applyGlobalOfflineSettingsLocked()
m.mu.Unlock()
// PBS alert should be cleared
if _, exists := m.activeAlerts["pbs-offline-pbs1"]; exists {
t.Error("expected pbs-offline-pbs1 to be cleared")
}
// Non-PBS alert should remain
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
t.Error("expected cpu-alert to remain")
}
// offlineConfirmations for PBS should be removed
if _, exists := m.offlineConfirmations["pbs1"]; exists {
t.Error("expected offlineConfirmations for pbs1 to be removed")
}
})
t.Run("DisableAllGuestsOffline clears guest powered off alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add guest powered off alerts
m.activeAlerts["guest-powered-off-vm1"] = &Alert{ID: "guest-powered-off-vm1", ResourceID: "vm1", Type: "powered-off"}
// Add non-guest alert
m.activeAlerts["cpu-alert"] = &Alert{ID: "cpu-alert", Type: "cpu"}
// Add to offlineConfirmations
m.offlineConfirmations["vm1"] = 2
m.config.DisableAllGuestsOffline = true
m.mu.Lock()
m.applyGlobalOfflineSettingsLocked()
m.mu.Unlock()
// Guest alert should be cleared
if _, exists := m.activeAlerts["guest-powered-off-vm1"]; exists {
t.Error("expected guest-powered-off-vm1 to be cleared")
}
// Non-guest alert should remain
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
t.Error("expected cpu-alert to remain")
}
// offlineConfirmations for guest should be removed
if _, exists := m.offlineConfirmations["vm1"]; exists {
t.Error("expected offlineConfirmations for vm1 to be removed")
}
})
t.Run("DisableAllDockerHostsOffline clears docker host alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add docker host offline alerts
m.activeAlerts["docker-host-offline-host1"] = &Alert{ID: "docker-host-offline-host1", Type: "offline"}
// Add non-docker host alert
m.activeAlerts["cpu-alert"] = &Alert{ID: "cpu-alert", Type: "cpu"}
// Add to dockerOfflineCount
m.dockerOfflineCount["host1"] = 3
m.config.DisableAllDockerHostsOffline = true
m.mu.Lock()
m.applyGlobalOfflineSettingsLocked()
m.mu.Unlock()
// Docker host alert should be cleared
if _, exists := m.activeAlerts["docker-host-offline-host1"]; exists {
t.Error("expected docker-host-offline-host1 to be cleared")
}
// Non-docker host alert should remain
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
t.Error("expected cpu-alert to remain")
}
// dockerOfflineCount should be reset
if len(m.dockerOfflineCount) != 0 {
t.Errorf("expected dockerOfflineCount to be empty, got %d entries", len(m.dockerOfflineCount))
}
})
t.Run("DisableAllDockerContainers clears docker container alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add docker container alerts
m.activeAlerts["docker-container-unhealthy-c1"] = &Alert{ID: "docker-container-unhealthy-c1", Type: "unhealthy"}
m.activeAlerts["docker-container-exited-c2"] = &Alert{ID: "docker-container-exited-c2", Type: "exited"}
// Add non-container alert
m.activeAlerts["cpu-alert"] = &Alert{ID: "cpu-alert", Type: "cpu"}
// Add tracking state
m.dockerStateConfirm["c1"] = 2
m.dockerRestartTracking["c1"] = &dockerRestartRecord{count: 5}
m.dockerLastExitCode["c1"] = 137
m.config.DisableAllDockerContainers = true
m.mu.Lock()
m.applyGlobalOfflineSettingsLocked()
m.mu.Unlock()
// Docker container alerts should be cleared
if _, exists := m.activeAlerts["docker-container-unhealthy-c1"]; exists {
t.Error("expected docker-container-unhealthy-c1 to be cleared")
}
if _, exists := m.activeAlerts["docker-container-exited-c2"]; exists {
t.Error("expected docker-container-exited-c2 to be cleared")
}
// Non-container alert should remain
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
t.Error("expected cpu-alert to remain")
}
// Tracking state should be reset
if len(m.dockerStateConfirm) != 0 {
t.Errorf("expected dockerStateConfirm to be empty, got %d entries", len(m.dockerStateConfirm))
}
if len(m.dockerRestartTracking) != 0 {
t.Errorf("expected dockerRestartTracking to be empty, got %d entries", len(m.dockerRestartTracking))
}
if len(m.dockerLastExitCode) != 0 {
t.Errorf("expected dockerLastExitCode to be empty, got %d entries", len(m.dockerLastExitCode))
}
})
t.Run("DisableAllDockerServices clears docker service alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add docker service alerts
m.activeAlerts["docker-service-unhealthy-svc1"] = &Alert{ID: "docker-service-unhealthy-svc1", Type: "unhealthy"}
// Add non-service alert
m.activeAlerts["cpu-alert"] = &Alert{ID: "cpu-alert", Type: "cpu"}
m.config.DisableAllDockerServices = true
m.mu.Lock()
m.applyGlobalOfflineSettingsLocked()
m.mu.Unlock()
// Docker service alert should be cleared
if _, exists := m.activeAlerts["docker-service-unhealthy-svc1"]; exists {
t.Error("expected docker-service-unhealthy-svc1 to be cleared")
}
// Non-service alert should remain
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
t.Error("expected cpu-alert to remain")
}
})
t.Run("no settings enabled does nothing", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add various alerts
m.activeAlerts["node-offline-node1"] = &Alert{ID: "node-offline-node1", Type: "offline"}
m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1", Type: "offline"}
m.activeAlerts["docker-container-unhealthy-c1"] = &Alert{ID: "docker-container-unhealthy-c1", Type: "unhealthy"}
// All disable settings are false by default
m.mu.Lock()
m.applyGlobalOfflineSettingsLocked()
m.mu.Unlock()
// All alerts should remain
if len(m.activeAlerts) != 3 {
t.Errorf("expected 3 alerts to remain, got %d", len(m.activeAlerts))
}
})
}
func TestHandleHostOffline(t *testing.T) {
// t.Parallel()
t.Run("empty host ID returns early", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.config.Enabled = true
host := models.Host{ID: "", Hostname: "test-host"}
m.HandleHostOffline(host)
// No alert should be created
if len(m.activeAlerts) != 0 {
t.Errorf("expected 0 alerts, got %d", len(m.activeAlerts))
}
})
t.Run("alerts disabled returns early", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.config.Enabled = false
host := models.Host{ID: "host1", Hostname: "test-host"}
m.HandleHostOffline(host)
// No alert should be created
if len(m.activeAlerts) != 0 {
t.Errorf("expected 0 alerts, got %d", len(m.activeAlerts))
}
})
t.Run("DisableAllHostsOffline clears alert and returns", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.config.Enabled = true
m.config.DisableAllHostsOffline = true
// Pre-create an alert and confirmation
alertID := "host-offline-host1"
m.activeAlerts[alertID] = &Alert{ID: alertID, Type: "host-offline"}
m.offlineConfirmations["host:host1"] = 5
host := models.Host{ID: "host1", Hostname: "test-host"}
m.HandleHostOffline(host)
// Alert should be cleared and confirmations removed
if _, exists := m.activeAlerts[alertID]; exists {
t.Error("expected alert to be cleared")
}
if _, exists := m.offlineConfirmations["host:host1"]; exists {
t.Error("expected offlineConfirmations to be cleared")
}
})
t.Run("override DisableConnectivity clears alert and returns", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.config.Enabled = true
m.config.Overrides = map[string]ThresholdConfig{
"host1": {DisableConnectivity: true},
}
// Pre-create an alert and confirmation
alertID := "host-offline-host1"
m.activeAlerts[alertID] = &Alert{ID: alertID, Type: "host-offline"}
m.offlineConfirmations["host:host1"] = 5
host := models.Host{ID: "host1", Hostname: "test-host"}
m.HandleHostOffline(host)
// Alert should be cleared and confirmations removed
if _, exists := m.activeAlerts[alertID]; exists {
t.Error("expected alert to be cleared")
}
if _, exists := m.offlineConfirmations["host:host1"]; exists {
t.Error("expected offlineConfirmations to be cleared")
}
})
t.Run("override Disabled clears alert and returns", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.config.Enabled = true
m.config.Overrides = map[string]ThresholdConfig{
"host1": {Disabled: true},
}
host := models.Host{ID: "host1", Hostname: "test-host"}
m.HandleHostOffline(host)
// No alert should be created
if len(m.activeAlerts) != 0 {
t.Errorf("expected 0 alerts, got %d", len(m.activeAlerts))
}
})
t.Run("existing alert updates LastSeen", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.config.Enabled = true
alertID := "host-offline-host1"
oldTime := time.Now().Add(-1 * time.Hour)
m.activeAlerts[alertID] = &Alert{ID: alertID, Type: "host-offline", LastSeen: oldTime}
host := models.Host{ID: "host1", Hostname: "test-host"}
m.HandleHostOffline(host)
// LastSeen should be updated
alert := m.activeAlerts[alertID]
if alert.LastSeen.Before(time.Now().Add(-1 * time.Minute)) {
t.Errorf("expected LastSeen to be updated to recent time, got %v", alert.LastSeen)
}
})
t.Run("insufficient confirmations waits", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.config.Enabled = true
host := models.Host{ID: "host1", Hostname: "test-host"}
// First two calls should not create alert
m.HandleHostOffline(host)
if len(m.activeAlerts) != 0 {
t.Errorf("expected 0 alerts after 1st call, got %d", len(m.activeAlerts))
}
if m.offlineConfirmations["host:host1"] != 1 {
t.Errorf("expected 1 confirmation, got %d", m.offlineConfirmations["host:host1"])
}
m.HandleHostOffline(host)
if len(m.activeAlerts) != 0 {
t.Errorf("expected 0 alerts after 2nd call, got %d", len(m.activeAlerts))
}
if m.offlineConfirmations["host:host1"] != 2 {
t.Errorf("expected 2 confirmations, got %d", m.offlineConfirmations["host:host1"])
}
})
t.Run("sufficient confirmations creates alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.config.Enabled = true
host := models.Host{
ID: "host1",
Hostname: "test-host",
DisplayName: "Test Host",
Platform: "linux",
OSName: "Ubuntu",
OSVersion: "22.04",
}
// Make 3 calls to reach required confirmations
m.HandleHostOffline(host)
m.HandleHostOffline(host)
m.HandleHostOffline(host)
// Alert should now be created
alertID := "host-offline-host1"
alert, exists := m.activeAlerts[alertID]
if !exists {
t.Fatal("expected alert to be created after 3 confirmations")
}
if alert.Type != "host-offline" {
t.Errorf("expected type 'host-offline', got '%s'", alert.Type)
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected level Critical, got '%s'", alert.Level)
}
if alert.ResourceName == "" {
t.Error("expected ResourceName to be set")
}
})
}
func TestReevaluateActiveAlertsLocked(t *testing.T) {
// t.Parallel()
t.Run("empty alerts map is no-op", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.reevaluateActiveAlertsLocked()
m.mu.Unlock()
if len(m.activeAlerts) != 0 {
t.Errorf("expected 0 alerts, got %d", len(m.activeAlerts))
}
})
t.Run("alert with insufficient ID parts is skipped", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Alert ID without dash separator
m.activeAlerts["singlepart"] = &Alert{ID: "singlepart", Type: "cpu", Value: 90}
m.mu.Lock()
m.reevaluateActiveAlertsLocked()
m.mu.Unlock()
// Alert should remain (skipped due to ID format)
if _, exists := m.activeAlerts["singlepart"]; !exists {
t.Error("expected singlepart alert to remain")
}
})
t.Run("DisableAllPMG resolves PMG queue alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add PMG queue alert
m.activeAlerts["pmg-queue-cpu"] = &Alert{
ID: "pmg-queue-cpu",
Type: "queue-depth",
}
m.config.DisableAllPMG = true
m.mu.Lock()
m.reevaluateActiveAlertsLocked()
m.mu.Unlock()
// PMG alert should be resolved
if _, exists := m.activeAlerts["pmg-queue-cpu"]; exists {
t.Error("expected PMG alert to be resolved")
}
})
t.Run("DisableAllHosts resolves Host alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add host alert with resourceType metadata
m.activeAlerts["host-1-cpu"] = &Alert{
ID: "host-1-cpu",
Type: "cpu",
Value: 90,
Metadata: map[string]interface{}{
"resourceType": "Host",
},
}
m.config.DisableAllHosts = true
m.mu.Lock()
m.reevaluateActiveAlertsLocked()
m.mu.Unlock()
// Host alert should be resolved
if _, exists := m.activeAlerts["host-1-cpu"]; exists {
t.Error("expected Host alert to be resolved")
}
})
t.Run("Docker host offline alerts are skipped", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add docker host offline alert
m.activeAlerts["docker-host-1-offline"] = &Alert{
ID: "docker-host-1-offline",
Type: "docker-host-offline",
}
m.mu.Lock()
m.reevaluateActiveAlertsLocked()
m.mu.Unlock()
// Docker host offline alert should remain (skipped)
if _, exists := m.activeAlerts["docker-host-1-offline"]; !exists {
t.Error("expected docker-host-offline alert to remain")
}
})
t.Run("DisableAllDockerHosts resolves dockerhost alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add dockerhost metric alert
m.activeAlerts["dockerhost-1-cpu"] = &Alert{
ID: "dockerhost-1-cpu",
Type: "cpu",
Value: 90,
Metadata: map[string]interface{}{
"resourceType": "dockerhost",
},
}
m.config.DisableAllDockerHosts = true
m.mu.Lock()
m.reevaluateActiveAlertsLocked()
m.mu.Unlock()
// Dockerhost alert should be resolved
if _, exists := m.activeAlerts["dockerhost-1-cpu"]; exists {
t.Error("expected dockerhost alert to be resolved")
}
})
t.Run("DisableAllNodes resolves Node alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add node alert with Instance = "Node"
m.activeAlerts["node1-cpu"] = &Alert{
ID: "node1-cpu",
Type: "cpu",
Value: 90,
Instance: "Node",
}
m.config.DisableAllNodes = true
m.mu.Lock()
m.reevaluateActiveAlertsLocked()
m.mu.Unlock()
// Node alert should be resolved
if _, exists := m.activeAlerts["node1-cpu"]; exists {
t.Error("expected Node alert to be resolved")
}
})
t.Run("DisableAllStorage resolves Storage alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add storage alert with Instance = "Storage"
m.activeAlerts["storage1-usage"] = &Alert{
ID: "storage1-usage",
Type: "usage",
Value: 90,
Instance: "Storage",
}
m.config.DisableAllStorage = true
m.mu.Lock()
m.reevaluateActiveAlertsLocked()
m.mu.Unlock()
// Storage alert should be resolved
if _, exists := m.activeAlerts["storage1-usage"]; exists {
t.Error("expected Storage alert to be resolved")
}
})
t.Run("DisableAllPBS resolves PBS alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add PBS alert with Instance = "PBS"
m.activeAlerts["pbs1-cpu"] = &Alert{
ID: "pbs1-cpu",
Type: "cpu",
Value: 90,
Instance: "PBS",
}
m.config.DisableAllPBS = true
m.mu.Lock()
m.reevaluateActiveAlertsLocked()
m.mu.Unlock()
// PBS alert should be resolved
if _, exists := m.activeAlerts["pbs1-cpu"]; exists {
t.Error("expected PBS alert to be resolved")
}
})
t.Run("DisableAllGuests resolves Guest alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add guest alert with Instance set to something other than "Node"/"Storage"/"PBS"
// Note: If both Instance and Node are empty, it matches the node branch
m.activeAlerts["guest1-cpu"] = &Alert{
ID: "guest1-cpu",
Type: "cpu",
Value: 90,
Instance: "qemu/100", // Guest instance
Node: "pve1", // Different from Instance, so doesn't match node branch
}
m.config.DisableAllGuests = true
m.mu.Lock()
m.reevaluateActiveAlertsLocked()
m.mu.Unlock()
// Guest alert should be resolved
if _, exists := m.activeAlerts["guest1-cpu"]; exists {
t.Error("expected Guest alert to be resolved")
}
})
t.Run("alert with disabled override is resolved", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add guest alert with override
m.activeAlerts["guest1-cpu"] = &Alert{
ID: "guest1-cpu",
Type: "cpu",
Value: 90,
Instance: "qemu/100",
Node: "pve1",
}
m.config.Overrides = map[string]ThresholdConfig{
"guest1": {Disabled: true},
}
m.mu.Lock()
m.reevaluateActiveAlertsLocked()
m.mu.Unlock()
// Alert should be resolved due to disabled override
if _, exists := m.activeAlerts["guest1-cpu"]; exists {
t.Error("expected alert with disabled override to be resolved")
}
})
t.Run("alert below clear threshold is resolved", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add guest alert below new clear threshold
m.activeAlerts["guest1-cpu"] = &Alert{
ID: "guest1-cpu",
Type: "cpu",
Value: 70, // Below clear threshold
Threshold: 80,
Instance: "qemu/100",
Node: "pve1",
}
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 75}
m.mu.Lock()
m.reevaluateActiveAlertsLocked()
m.mu.Unlock()
// Alert should be resolved (value 70 < clear 75)
if _, exists := m.activeAlerts["guest1-cpu"]; exists {
t.Error("expected alert below clear threshold to be resolved")
}
})
t.Run("alert between clear and trigger is resolved on config change", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Add guest alert between clear and new higher trigger
m.activeAlerts["guest1-cpu"] = &Alert{
ID: "guest1-cpu",
Type: "cpu",
Value: 85, // Between clear (75) and new trigger (90)
Threshold: 80,
Instance: "qemu/100",
Node: "pve1",
}
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 90, Clear: 75}
m.mu.Lock()
m.reevaluateActiveAlertsLocked()
m.mu.Unlock()
// Alert should be resolved (value 85 < trigger 90)
if _, exists := m.activeAlerts["guest1-cpu"]; exists {
t.Error("expected alert between thresholds to be resolved")
}
})
}
func TestHandleHostRemoved(t *testing.T) {
// t.Parallel()
t.Run("empty host ID is no-op", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["host-offline-host1"] = &Alert{ID: "host-offline-host1"}
m.mu.Unlock()
// Empty ID host
m.HandleHostRemoved(models.Host{ID: ""})
// Alert should still exist
m.mu.RLock()
_, exists := m.activeAlerts["host-offline-host1"]
m.mu.RUnlock()
if !exists {
t.Error("expected alert to remain when empty host ID passed")
}
})
t.Run("clears host offline alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
m.activeAlerts["host-offline-host1"] = &Alert{
ID: "host-offline-host1",
ResourceID: "host:host1",
}
m.offlineConfirmations["host:host1"] = 5
m.mu.Unlock()
m.HandleHostRemoved(models.Host{ID: "host1", Hostname: "testhost"})
m.mu.RLock()
_, alertExists := m.activeAlerts["host-offline-host1"]
_, confirmExists := m.offlineConfirmations["host:host1"]
m.mu.RUnlock()
if alertExists {
t.Error("expected host offline alert to be cleared")
}
if confirmExists {
t.Error("expected offline confirmations to be cleared")
}
})
t.Run("clears host metric alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
// Add CPU and memory alerts for host
m.activeAlerts["host:host1-cpu"] = &Alert{
ID: "host:host1-cpu",
ResourceID: "host:host1",
}
m.activeAlerts["host:host1-memory"] = &Alert{
ID: "host:host1-memory",
ResourceID: "host:host1",
}
m.mu.Unlock()
m.HandleHostRemoved(models.Host{ID: "host1", Hostname: "testhost"})
m.mu.RLock()
_, cpuExists := m.activeAlerts["host:host1-cpu"]
_, memExists := m.activeAlerts["host:host1-memory"]
m.mu.RUnlock()
if cpuExists {
t.Error("expected host CPU alert to be cleared")
}
if memExists {
t.Error("expected host memory alert to be cleared")
}
})
t.Run("clears host disk alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
// Add disk alerts for host
m.activeAlerts["host:host1/disk:sda-usage"] = &Alert{
ID: "host:host1/disk:sda-usage",
ResourceID: "host:host1/disk:sda",
}
m.activeAlerts["host:host1/disk:sdb-usage"] = &Alert{
ID: "host:host1/disk:sdb-usage",
ResourceID: "host:host1/disk:sdb",
}
m.mu.Unlock()
m.HandleHostRemoved(models.Host{ID: "host1", Hostname: "testhost"})
m.mu.RLock()
_, sda := m.activeAlerts["host:host1/disk:sda-usage"]
_, sdb := m.activeAlerts["host:host1/disk:sdb-usage"]
m.mu.RUnlock()
if sda {
t.Error("expected host disk sda alert to be cleared")
}
if sdb {
t.Error("expected host disk sdb alert to be cleared")
}
})
t.Run("clears all alert types together", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
// Add multiple alert types
m.activeAlerts["host-offline-host1"] = &Alert{ID: "host-offline-host1", ResourceID: "host:host1"}
m.activeAlerts["host:host1-cpu"] = &Alert{ID: "host:host1-cpu", ResourceID: "host:host1"}
m.activeAlerts["host:host1-memory"] = &Alert{ID: "host:host1-memory", ResourceID: "host:host1"}
m.activeAlerts["host:host1/disk:sda-usage"] = &Alert{ID: "host:host1/disk:sda-usage", ResourceID: "host:host1/disk:sda"}
m.offlineConfirmations["host:host1"] = 3
m.mu.Unlock()
m.HandleHostRemoved(models.Host{ID: "host1", Hostname: "testhost"})
m.mu.RLock()
alertCount := 0
for id := range m.activeAlerts {
if strings.Contains(id, "host1") {
alertCount++
}
}
_, confirmExists := m.offlineConfirmations["host:host1"]
m.mu.RUnlock()
if alertCount > 0 {
t.Errorf("expected all host1 alerts to be cleared, got %d remaining", alertCount)
}
if confirmExists {
t.Error("expected offline confirmations to be cleared")
}
})
}
func TestReevaluateGuestAlert(t *testing.T) {
// t.Parallel()
t.Run("no active alerts is no-op", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 70}
m.mu.Unlock()
// No alerts exist - should not panic
m.ReevaluateGuestAlert(nil, "guest1")
m.mu.RLock()
count := len(m.activeAlerts)
m.mu.RUnlock()
if count != 0 {
t.Errorf("expected 0 alerts, got %d", count)
}
})
t.Run("clears alert when threshold disabled (nil)", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
m.activeAlerts["guest1-cpu"] = &Alert{
ID: "guest1-cpu",
Type: "cpu",
Value: 90,
}
m.config.GuestDefaults.CPU = nil // Disabled
m.mu.Unlock()
m.ReevaluateGuestAlert(nil, "guest1")
m.mu.RLock()
_, exists := m.activeAlerts["guest1-cpu"]
m.mu.RUnlock()
if exists {
t.Error("expected alert to be cleared when threshold is nil")
}
})
t.Run("clears alert when trigger is zero", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
m.activeAlerts["guest1-memory"] = &Alert{
ID: "guest1-memory",
Type: "memory",
Value: 85,
}
m.config.GuestDefaults.Memory = &HysteresisThreshold{Trigger: 0, Clear: 0}
m.mu.Unlock()
m.ReevaluateGuestAlert(nil, "guest1")
m.mu.RLock()
_, exists := m.activeAlerts["guest1-memory"]
m.mu.RUnlock()
if exists {
t.Error("expected alert to be cleared when trigger is 0")
}
})
t.Run("clears alert when value below clear threshold", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
m.activeAlerts["guest1-cpu"] = &Alert{
ID: "guest1-cpu",
Type: "cpu",
Value: 65, // Below clear threshold of 70
}
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 70}
m.mu.Unlock()
m.ReevaluateGuestAlert(nil, "guest1")
m.mu.RLock()
_, exists := m.activeAlerts["guest1-cpu"]
m.mu.RUnlock()
if exists {
t.Error("expected alert to be cleared when value below clear threshold")
}
})
t.Run("clears alert when value below trigger threshold", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
m.activeAlerts["guest1-disk"] = &Alert{
ID: "guest1-disk",
Type: "disk",
Value: 75, // Below trigger of 80
}
m.config.GuestDefaults.Disk = &HysteresisThreshold{Trigger: 80, Clear: 70}
m.mu.Unlock()
m.ReevaluateGuestAlert(nil, "guest1")
m.mu.RLock()
_, exists := m.activeAlerts["guest1-disk"]
m.mu.RUnlock()
if exists {
t.Error("expected alert to be cleared when value below trigger")
}
})
t.Run("keeps alert when value above both thresholds", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
m.activeAlerts["guest1-cpu"] = &Alert{
ID: "guest1-cpu",
Type: "cpu",
Value: 90, // Above both trigger (80) and clear (70)
}
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 70}
m.mu.Unlock()
m.ReevaluateGuestAlert(nil, "guest1")
m.mu.RLock()
_, exists := m.activeAlerts["guest1-cpu"]
m.mu.RUnlock()
if !exists {
t.Error("expected alert to remain when value above thresholds")
}
})
t.Run("processes all metric types", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
// Add alerts for all metric types with values below threshold
metrics := []string{"cpu", "memory", "disk", "diskRead", "diskWrite", "networkIn", "networkOut"}
for _, metric := range metrics {
m.activeAlerts[fmt.Sprintf("guest1-%s", metric)] = &Alert{
ID: fmt.Sprintf("guest1-%s", metric),
Type: metric,
Value: 50, // Below threshold
}
}
threshold := &HysteresisThreshold{Trigger: 80, Clear: 70}
m.config.GuestDefaults.CPU = threshold
m.config.GuestDefaults.Memory = threshold
m.config.GuestDefaults.Disk = threshold
m.config.GuestDefaults.DiskRead = threshold
m.config.GuestDefaults.DiskWrite = threshold
m.config.GuestDefaults.NetworkIn = threshold
m.config.GuestDefaults.NetworkOut = threshold
m.mu.Unlock()
m.ReevaluateGuestAlert(nil, "guest1")
m.mu.RLock()
remaining := len(m.activeAlerts)
m.mu.RUnlock()
if remaining != 0 {
t.Errorf("expected all alerts to be cleared, got %d remaining", remaining)
}
})
t.Run("clears pending alert when threshold disabled", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
m.activeAlerts["guest1-cpu"] = &Alert{
ID: "guest1-cpu",
Type: "cpu",
Value: 90,
}
m.pendingAlerts["guest1-cpu"] = time.Now() // pendingAlerts is map[string]time.Time
m.config.GuestDefaults.CPU = nil // Disabled
m.mu.Unlock()
m.ReevaluateGuestAlert(nil, "guest1")
m.mu.RLock()
_, alertExists := m.activeAlerts["guest1-cpu"]
_, pendingExists := m.pendingAlerts["guest1-cpu"]
m.mu.RUnlock()
if alertExists {
t.Error("expected active alert to be cleared")
}
if pendingExists {
t.Error("expected pending alert to be cleared")
}
})
t.Run("uses clear equals trigger when clear is zero", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
m.activeAlerts["guest1-cpu"] = &Alert{
ID: "guest1-cpu",
Type: "cpu",
Value: 75, // Below trigger of 80
}
// Clear is 0, so it should use trigger (80) as clear threshold
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 0}
m.mu.Unlock()
m.ReevaluateGuestAlert(nil, "guest1")
m.mu.RLock()
_, exists := m.activeAlerts["guest1-cpu"]
m.mu.RUnlock()
if exists {
t.Error("expected alert to be cleared when value below trigger (used as clear)")
}
})
t.Run("ignores alerts for different guests", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = true
m.activeAlerts["guest1-cpu"] = &Alert{
ID: "guest1-cpu",
Type: "cpu",
Value: 50, // Below threshold
}
m.activeAlerts["guest2-cpu"] = &Alert{
ID: "guest2-cpu",
Type: "cpu",
Value: 50, // Below threshold
}
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 70}
m.mu.Unlock()
// Only reevaluate guest1
m.ReevaluateGuestAlert(nil, "guest1")
m.mu.RLock()
_, guest1Exists := m.activeAlerts["guest1-cpu"]
_, guest2Exists := m.activeAlerts["guest2-cpu"]
m.mu.RUnlock()
if guest1Exists {
t.Error("expected guest1 alert to be cleared")
}
if !guest2Exists {
t.Error("expected guest2 alert to remain (not reevaluated)")
}
})
}
// TestHandleDockerHostOffline covers Manager.HandleDockerHostOffline: the
// early-return guards, the two switches that disable connectivity alerts, the
// LastSeen refresh on an already-active alert, the three-call confirmation
// window, and the fields populated on the alert that is finally raised.
//
// Subtests run sequentially; each one builds its own manager.
func TestHandleDockerHostOffline(t *testing.T) {
	const offlineAlertID = "docker-host-offline-docker1"

	t.Run("empty host ID is no-op", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.Enabled = true
		before := len(mgr.activeAlerts)
		mgr.mu.Unlock()

		mgr.HandleDockerHostOffline(models.DockerHost{ID: ""})

		mgr.mu.RLock()
		after := len(mgr.activeAlerts)
		mgr.mu.RUnlock()

		if after != before {
			t.Error("expected no change when empty host ID passed")
		}
	})

	t.Run("disabled alerts is no-op", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.Enabled = false
		mgr.mu.Unlock()

		mgr.HandleDockerHostOffline(models.DockerHost{ID: "docker1", DisplayName: "Docker Host 1"})

		mgr.mu.RLock()
		_, raised := mgr.activeAlerts[offlineAlertID]
		mgr.mu.RUnlock()

		if raised {
			t.Error("expected no alert when alerts are disabled")
		}
	})

	t.Run("DisableAllDockerHostsOffline clears tracking and alert", func(t *testing.T) {
		mgr := newTestManager(t)

		// Seed both a confirmation counter and an active alert so the test
		// can observe each of them being removed.
		mgr.mu.Lock()
		mgr.config.Enabled = true
		mgr.config.DisableAllDockerHostsOffline = true
		mgr.dockerOfflineCount["docker1"] = 5
		mgr.activeAlerts[offlineAlertID] = &Alert{ID: offlineAlertID}
		mgr.mu.Unlock()

		mgr.HandleDockerHostOffline(models.DockerHost{ID: "docker1", DisplayName: "Docker Host 1"})

		mgr.mu.RLock()
		_, alertPresent := mgr.activeAlerts[offlineAlertID]
		_, countPresent := mgr.dockerOfflineCount["docker1"]
		mgr.mu.RUnlock()

		if alertPresent {
			t.Error("expected alert to be cleared")
		}
		if countPresent {
			t.Error("expected offline count to be cleared")
		}
	})

	t.Run("override DisableConnectivity clears tracking and alert", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.Enabled = true
		mgr.config.Overrides = map[string]ThresholdConfig{
			"docker1": {DisableConnectivity: true},
		}
		mgr.dockerOfflineCount["docker1"] = 3
		mgr.activeAlerts[offlineAlertID] = &Alert{ID: offlineAlertID}
		mgr.mu.Unlock()

		mgr.HandleDockerHostOffline(models.DockerHost{ID: "docker1", DisplayName: "Docker Host 1"})

		mgr.mu.RLock()
		_, alertPresent := mgr.activeAlerts[offlineAlertID]
		_, countPresent := mgr.dockerOfflineCount["docker1"]
		mgr.mu.RUnlock()

		if alertPresent {
			t.Error("expected alert to be cleared with override")
		}
		if countPresent {
			t.Error("expected offline count to be cleared with override")
		}
	})

	t.Run("existing alert updates LastSeen", func(t *testing.T) {
		mgr := newTestManager(t)
		anHourAgo := time.Now().Add(-1 * time.Hour)

		mgr.mu.Lock()
		mgr.config.Enabled = true
		mgr.activeAlerts[offlineAlertID] = &Alert{
			ID:       offlineAlertID,
			LastSeen: anHourAgo,
		}
		mgr.mu.Unlock()

		mgr.HandleDockerHostOffline(models.DockerHost{ID: "docker1", DisplayName: "Docker Host 1"})

		mgr.mu.RLock()
		current := mgr.activeAlerts[offlineAlertID]
		mgr.mu.RUnlock()

		if current == nil {
			t.Fatal("expected alert to exist")
		}
		if !current.LastSeen.After(anHourAgo) {
			t.Error("expected LastSeen to be updated")
		}
	})

	t.Run("requires 3 confirmations before alert", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.Enabled = true
		mgr.mu.Unlock()

		offlineHost := models.DockerHost{ID: "docker1", DisplayName: "Docker Host 1", Hostname: "docker-server"}

		// Confirmation 1: counter advances, nothing raised yet.
		mgr.HandleDockerHostOffline(offlineHost)

		mgr.mu.RLock()
		firstCount := mgr.dockerOfflineCount["docker1"]
		firstAlert := mgr.activeAlerts[offlineAlertID]
		mgr.mu.RUnlock()

		if firstCount != 1 {
			t.Errorf("expected count 1, got %d", firstCount)
		}
		if firstAlert != nil {
			t.Error("expected no alert after 1 confirmation")
		}

		// Confirmation 2: still below the threshold.
		mgr.HandleDockerHostOffline(offlineHost)

		mgr.mu.RLock()
		secondCount := mgr.dockerOfflineCount["docker1"]
		secondAlert := mgr.activeAlerts[offlineAlertID]
		mgr.mu.RUnlock()

		if secondCount != 2 {
			t.Errorf("expected count 2, got %d", secondCount)
		}
		if secondAlert != nil {
			t.Error("expected no alert after 2 confirmations")
		}

		// Confirmation 3: the alert must now exist.
		mgr.HandleDockerHostOffline(offlineHost)

		mgr.mu.RLock()
		thirdCount := mgr.dockerOfflineCount["docker1"]
		thirdAlert := mgr.activeAlerts[offlineAlertID]
		mgr.mu.RUnlock()

		if thirdCount != 3 {
			t.Errorf("expected count 3, got %d", thirdCount)
		}
		if thirdAlert == nil {
			t.Fatal("expected alert after 3 confirmations")
		}
		if thirdAlert.Type != "docker-host-offline" {
			t.Errorf("expected type docker-host-offline, got %s", thirdAlert.Type)
		}
		if thirdAlert.Level != AlertLevelCritical {
			t.Errorf("expected critical level, got %s", thirdAlert.Level)
		}
	})

	t.Run("alert has correct metadata", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.Enabled = true
		// Two confirmations already recorded, so the next call is the third.
		mgr.dockerOfflineCount["docker1"] = 2
		mgr.mu.Unlock()

		mgr.HandleDockerHostOffline(models.DockerHost{
			ID:          "docker1",
			DisplayName: "My Docker Host",
			Hostname:    "docker-server.local",
			AgentID:     "agent-123",
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts[offlineAlertID]
		mgr.mu.RUnlock()

		if raised == nil {
			t.Fatal("expected alert to be created")
		}

		if raised.ResourceID != "docker:docker1" {
			t.Errorf("expected resourceID docker:docker1, got %s", raised.ResourceID)
		}
		if raised.ResourceName != "My Docker Host" {
			t.Errorf("expected resourceName 'My Docker Host', got %s", raised.ResourceName)
		}
		if raised.Node != "docker-server.local" {
			t.Errorf("expected node docker-server.local, got %s", raised.Node)
		}

		if got := raised.Metadata["resourceType"]; got != "DockerHost" {
			t.Errorf("expected metadata resourceType DockerHost, got %v", got)
		}
		if got := raised.Metadata["hostId"]; got != "docker1" {
			t.Errorf("expected metadata hostId docker1, got %v", got)
		}
		if got := raised.Metadata["agentId"]; got != "agent-123" {
			t.Errorf("expected metadata agentId agent-123, got %v", got)
		}
	})
}
// TestSetMetricHooks verifies that SetMetricHooks installs each of the four
// package-level metric hooks and that passing nil clears every one of them.
//
// It must NOT run in parallel: it mutates package-level hook variables. The
// previous values are captured up front and restored when the test returns.
func TestSetMetricHooks(t *testing.T) {
	oldFired := recordAlertFired
	oldResolved := recordAlertResolved
	oldSuppressed := recordAlertSuppressed
	oldAcknowledged := recordAlertAcknowledged
	defer func() {
		recordAlertFired = oldFired
		recordAlertResolved = oldResolved
		recordAlertSuppressed = oldSuppressed
		recordAlertAcknowledged = oldAcknowledged
	}()

	t.Run("sets all hooks", func(t *testing.T) {
		var firedCalled, resolvedCalled, suppressedCalled, acknowledgedCalled bool

		SetMetricHooks(
			func(a *Alert) { firedCalled = true },
			func(a *Alert) { resolvedCalled = true },
			func(s string) { suppressedCalled = true },
			func() { acknowledgedCalled = true },
		)

		// Invoke each installed hook. The nil guards keep a missing hook from
		// panicking; a hook that was not installed is reported by the
		// "expected ... to be called" assertions below instead.
		if recordAlertFired != nil {
			recordAlertFired(&Alert{})
		}
		if recordAlertResolved != nil {
			recordAlertResolved(&Alert{})
		}
		if recordAlertSuppressed != nil {
			recordAlertSuppressed("test")
		}
		if recordAlertAcknowledged != nil {
			recordAlertAcknowledged()
		}

		if !firedCalled {
			t.Error("expected fired hook to be called")
		}
		if !resolvedCalled {
			t.Error("expected resolved hook to be called")
		}
		if !suppressedCalled {
			t.Error("expected suppressed hook to be called")
		}
		if !acknowledgedCalled {
			t.Error("expected acknowledged hook to be called")
		}
	})

	t.Run("nil hooks are safe", func(t *testing.T) {
		// Must not panic, and must leave all four hooks unset.
		SetMetricHooks(nil, nil, nil, nil)

		if recordAlertFired != nil {
			t.Error("expected fired hook to be nil")
		}
		if recordAlertResolved != nil {
			t.Error("expected resolved hook to be nil")
		}
		if recordAlertSuppressed != nil {
			t.Error("expected suppressed hook to be nil")
		}
		if recordAlertAcknowledged != nil {
			t.Error("expected acknowledged hook to be nil")
		}
	})
}
// TestNotifyExistingAlert checks Manager.NotifyExistingAlert for an unknown
// alert ID (must simply return) and for an active alert (the registered alert
// callback must fire).
func TestNotifyExistingAlert(t *testing.T) {
	t.Run("non-existent alert is no-op", func(t *testing.T) {
		mgr := newTestManager(t)

		// Passing is defined as "did not panic".
		mgr.NotifyExistingAlert("non-existent-alert")
	})

	t.Run("existing alert dispatches notification", func(t *testing.T) {
		mgr := newTestManager(t)

		// Buffered so the callback can signal without a waiting receiver.
		notified := make(chan bool, 1)
		mgr.SetAlertCallback(func(a *Alert) {
			notified <- true
		})

		mgr.mu.Lock()
		mgr.config.Enabled = true
		// Dispatch only happens while the manager is in the active state.
		mgr.config.ActivationState = ActivationActive
		mgr.activeAlerts["test-alert"] = &Alert{
			ID:    "test-alert",
			Type:  "test",
			Level: AlertLevelWarning,
		}
		mgr.mu.Unlock()

		mgr.NotifyExistingAlert("test-alert")

		// The callback is awaited with a timeout rather than assumed to have
		// already run when NotifyExistingAlert returns.
		select {
		case <-notified:
		case <-time.After(time.Second):
			t.Error("expected alert callback to be called (timeout)")
		}
	})
}
// TestGetResolvedAlert checks Manager.GetResolvedAlert: it must return nil for
// an unknown ID, for a nil map entry, and for an entry whose Alert is nil, and
// must return the stored ID and resolution time for a populated entry.
func TestGetResolvedAlert(t *testing.T) {
	t.Run("returns nil for non-existent alert", func(t *testing.T) {
		m := newTestManager(t)

		result := m.GetResolvedAlert("non-existent")
		if result != nil {
			t.Error("expected nil for non-existent alert")
		}
	})

	t.Run("returns nil for nil resolved entry", func(t *testing.T) {
		m := newTestManager(t)

		m.resolvedMutex.Lock()
		m.recentlyResolved["test"] = nil
		m.resolvedMutex.Unlock()

		result := m.GetResolvedAlert("test")
		if result != nil {
			t.Error("expected nil for nil resolved entry")
		}
	})

	t.Run("returns nil when Alert is nil", func(t *testing.T) {
		m := newTestManager(t)

		m.resolvedMutex.Lock()
		m.recentlyResolved["test"] = &ResolvedAlert{Alert: nil}
		m.resolvedMutex.Unlock()

		result := m.GetResolvedAlert("test")
		if result != nil {
			t.Error("expected nil when Alert is nil")
		}
	})

	t.Run("returns cloned alert", func(t *testing.T) {
		m := newTestManager(t)
		resolvedTime := time.Now()

		m.resolvedMutex.Lock()
		m.recentlyResolved["test"] = &ResolvedAlert{
			Alert: &Alert{
				ID:           "test",
				Type:         "cpu",
				Level:        AlertLevelWarning,
				ResourceID:   "res1",
				ResourceName: "Resource 1",
			},
			ResolvedTime: resolvedTime,
		}
		m.resolvedMutex.Unlock()

		result := m.GetResolvedAlert("test")
		if result == nil {
			t.Fatal("expected non-nil result")
		}
		if result.Alert.ID != "test" {
			t.Errorf("expected ID test, got %s", result.Alert.ID)
		}
		// Compare instants with Equal rather than !=: the == operator on
		// time.Time also compares the location pointer and the monotonic
		// clock reading, neither of which matters here.
		if !result.ResolvedTime.Equal(resolvedTime) {
			t.Error("expected resolved time to match")
		}
	})
}
// TestGetAlertHistory checks that Manager.GetAlertHistory surfaces entries
// added through the history manager and never returns more than the requested
// number of entries.
func TestGetAlertHistory(t *testing.T) {
	t.Run("returns history from history manager", func(t *testing.T) {
		mgr := newTestManager(t)

		// Seed two entries directly on the history manager.
		mgr.historyManager.AddAlert(Alert{ID: "alert1", Type: "cpu"})
		mgr.historyManager.AddAlert(Alert{ID: "alert2", Type: "memory"})

		if got := len(mgr.GetAlertHistory(10)); got < 2 {
			t.Errorf("expected at least 2 history entries, got %d", got)
		}
	})

	t.Run("respects limit", func(t *testing.T) {
		mgr := newTestManager(t)

		// Seed more entries than the limit requested below.
		for idx := 0; idx < 5; idx++ {
			mgr.historyManager.AddAlert(Alert{ID: fmt.Sprintf("alert%d", idx), Type: "test"})
		}

		if got := len(mgr.GetAlertHistory(2)); got > 2 {
			t.Errorf("expected max 2 entries, got %d", got)
		}
	})
}
// TestGetAlertHistorySince checks Manager.GetAlertHistorySince with the zero
// time (entries are expected back) and with a cutoff one hour in the future
// (nothing is expected back).
func TestGetAlertHistorySince(t *testing.T) {
	t.Run("zero time returns all history", func(t *testing.T) {
		mgr := newTestManager(t)
		mgr.historyManager.AddAlert(Alert{ID: "alert1", Type: "cpu"})

		entries := mgr.GetAlertHistorySince(time.Time{}, 10)
		if len(entries) == 0 {
			t.Error("expected history entries for zero time")
		}
	})

	t.Run("filters by time", func(t *testing.T) {
		mgr := newTestManager(t)

		// One alert that starts now ...
		mgr.historyManager.AddAlert(Alert{ID: "alert1", Type: "cpu", StartTime: time.Now()})

		// ... queried with a cutoff an hour ahead, so it must be filtered out.
		cutoff := time.Now().Add(time.Hour)
		entries := mgr.GetAlertHistorySince(cutoff, 10)

		if n := len(entries); n != 0 {
			t.Errorf("expected 0 entries for future time, got %d", n)
		}
	})
}
// TestClearAlertHistory checks that Manager.ClearAlertHistory succeeds and
// leaves GetAlertHistory returning no entries.
func TestClearAlertHistory(t *testing.T) {
	t.Run("clears all history", func(t *testing.T) {
		mgr := newTestManager(t)

		// Seed history so there is something to clear.
		mgr.historyManager.AddAlert(Alert{ID: "alert1", Type: "cpu"})
		mgr.historyManager.AddAlert(Alert{ID: "alert2", Type: "memory"})

		if err := mgr.ClearAlertHistory(); err != nil {
			t.Fatalf("unexpected error: %v", err)
		}

		if left := len(mgr.GetAlertHistory(10)); left != 0 {
			t.Errorf("expected 0 entries after clear, got %d", left)
		}
	})
}
// TestClearNodeOfflineAlert covers Manager.clearNodeOfflineAlert: a call with
// nothing tracked, a call that only has an offline counter to reset, and the
// full recovery path where an active alert stays in place for the first two
// calls and is resolved on the third.
func TestClearNodeOfflineAlert(t *testing.T) {
	const offlineAlertID = "node-offline-node1"

	t.Run("no alert and no count is no-op", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.clearNodeOfflineAlert(models.Node{ID: "node1", Name: "Node 1"})

		mgr.mu.RLock()
		active := len(mgr.activeAlerts)
		mgr.mu.RUnlock()

		if active != 0 {
			t.Errorf("expected 0 alerts, got %d", active)
		}
	})

	t.Run("resets offline count when node comes online", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.nodeOfflineCount["node1"] = 5
		mgr.mu.Unlock()

		mgr.clearNodeOfflineAlert(models.Node{ID: "node1", Name: "Node 1"})

		mgr.mu.RLock()
		_, stillCounted := mgr.nodeOfflineCount["node1"]
		mgr.mu.RUnlock()

		if stillCounted {
			t.Error("expected offline count to be cleared")
		}
	})

	t.Run("clears existing alert and adds to resolved", func(t *testing.T) {
		mgr := newTestManager(t)

		// Buffered so the callback never blocks on the test goroutine.
		resolvedSignal := make(chan struct{}, 1)
		mgr.SetResolvedCallback(func(_ string) {
			resolvedSignal <- struct{}{}
		})

		mgr.mu.Lock()
		mgr.nodeOfflineCount["node1"] = 3
		mgr.activeAlerts[offlineAlertID] = &Alert{
			ID:        offlineAlertID,
			Type:      "offline",
			StartTime: time.Now().Add(-10 * time.Minute),
		}
		mgr.mu.Unlock()

		// alertActive reports whether the offline alert is still in the
		// active set.
		alertActive := func() bool {
			mgr.mu.RLock()
			defer mgr.mu.RUnlock()
			_, ok := mgr.activeAlerts[offlineAlertID]
			return ok
		}

		onlineNode := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}

		// First recovery observation: alert stays, no callback yet.
		mgr.clearNodeOfflineAlert(onlineNode)
		if !alertActive() {
			t.Fatal("expected alert to remain active until recovery is confirmed")
		}
		select {
		case <-resolvedSignal:
			t.Fatal("expected no resolved callback before recovery is confirmed")
		default:
		}

		// Second recovery observation: alert still stays.
		mgr.clearNodeOfflineAlert(onlineNode)
		if !alertActive() {
			t.Fatal("expected alert to remain active until final recovery confirmation")
		}

		// Third recovery observation: alert and counter must both be gone.
		mgr.clearNodeOfflineAlert(onlineNode)

		mgr.mu.RLock()
		_, alertPresent := mgr.activeAlerts[offlineAlertID]
		_, countPresent := mgr.nodeOfflineCount["node1"]
		mgr.mu.RUnlock()

		if alertPresent {
			t.Error("expected alert to be cleared")
		}
		if countPresent {
			t.Error("expected offline count to be cleared")
		}

		// The cleared alert must have been recorded as recently resolved.
		mgr.resolvedMutex.RLock()
		resolvedEntry := mgr.recentlyResolved[offlineAlertID]
		mgr.resolvedMutex.RUnlock()

		if resolvedEntry == nil {
			t.Error("expected alert to be added to recently resolved")
		}

		select {
		case <-resolvedSignal:
		case <-time.After(2 * time.Second):
			t.Error("expected resolved callback to be called")
		}
	})
}
// TestClearOfflineAlertNoDeadlock is a regression test for a deadlock introduced
// by commit 07b4765b. The resolved callback (handleAlertResolved) calls
// ShouldSuppressResolvedNotification which acquires m.mu.RLock(). If the
// clear*OfflineAlert functions call the callback synchronously while holding
// m.mu.Lock(), Go's non-reentrant RWMutex deadlocks.
//
// Each case seeds one active alert (plus, where the case sets it, a recovery
// confirmation count) and then invokes the matching clear function. The clear
// function runs on its own goroutine so that a synchronous deadlock inside it
// is caught by the 3-second deadline instead of hanging the test goroutine
// until the global `go test` timeout.
func TestClearOfflineAlertNoDeadlock(t *testing.T) {
	type testCase struct {
		name    string
		setupFn func(m *Manager) // seeds manager state for the case
		clearFn func(m *Manager) // invokes the clear function under test
	}

	cases := []testCase{
		{
			name: "clearNodeOfflineAlert",
			setupFn: func(m *Manager) {
				m.mu.Lock()
				m.activeAlerts["node-offline-node1"] = &Alert{
					ID:        "node-offline-node1",
					Type:      "offline",
					StartTime: time.Now().Add(-5 * time.Minute),
				}
				m.offlineRecoveryConfirmations["node-offline-node1"] = 2
				m.mu.Unlock()
			},
			clearFn: func(m *Manager) {
				m.clearNodeOfflineAlert(models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"})
			},
		},
		{
			name: "clearPBSOfflineAlert",
			setupFn: func(m *Manager) {
				m.mu.Lock()
				m.activeAlerts["pbs-offline-pbs1"] = &Alert{
					ID:        "pbs-offline-pbs1",
					Type:      "offline",
					StartTime: time.Now().Add(-5 * time.Minute),
				}
				m.offlineRecoveryConfirmations["pbs-offline-pbs1"] = 2
				m.mu.Unlock()
			},
			clearFn: func(m *Manager) {
				m.clearPBSOfflineAlert(models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "host1"})
			},
		},
		{
			name: "clearPMGOfflineAlert",
			setupFn: func(m *Manager) {
				m.mu.Lock()
				m.activeAlerts["pmg-offline-pmg1"] = &Alert{
					ID:        "pmg-offline-pmg1",
					Type:      "offline",
					StartTime: time.Now().Add(-5 * time.Minute),
				}
				m.offlineRecoveryConfirmations["pmg-offline-pmg1"] = 2
				m.mu.Unlock()
			},
			clearFn: func(m *Manager) {
				m.clearPMGOfflineAlert(models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "host1"})
			},
		},
		{
			name: "clearStorageOfflineAlert",
			setupFn: func(m *Manager) {
				m.mu.Lock()
				m.activeAlerts["storage-offline-stor1"] = &Alert{
					ID:        "storage-offline-stor1",
					Type:      "offline",
					StartTime: time.Now().Add(-5 * time.Minute),
				}
				m.offlineRecoveryConfirmations["storage-offline-stor1"] = 1
				m.mu.Unlock()
			},
			clearFn: func(m *Manager) {
				m.clearStorageOfflineAlert(models.Storage{ID: "stor1", Name: "Storage 1", Node: "node1"})
			},
		},
		{
			name: "clearGuestPoweredOffAlert",
			setupFn: func(m *Manager) {
				m.mu.Lock()
				m.activeAlerts["guest-powered-off-vm100"] = &Alert{
					ID:        "guest-powered-off-vm100",
					Type:      "powered-off",
					StartTime: time.Now().Add(-5 * time.Minute),
				}
				m.mu.Unlock()
			},
			clearFn: func(m *Manager) {
				m.clearGuestPoweredOffAlert("vm100", "TestVM")
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			m := newTestManager(t)

			// Simulate what handleAlertResolved does in production:
			// it calls ShouldSuppressResolvedNotification which acquires m.mu.RLock().
			// Before the fix, this deadlocked because the caller held m.mu.Lock().
			done := make(chan struct{})
			m.SetResolvedCallback(func(alertID string) {
				_ = m.ShouldSuppressResolvedNotification(&Alert{ID: alertID})
				close(done)
			})

			tc.setupFn(m)

			// Run the clear function off the test goroutine. If the callback
			// were invoked synchronously under m.mu.Lock(), clearFn itself
			// would never return; calling it inline would then block before
			// the timeout below could ever fire.
			cleared := make(chan struct{})
			go func() {
				defer close(cleared)
				tc.clearFn(m)
			}()

			// One shared deadline bounds both the clear call returning and the
			// resolved callback completing.
			deadline := time.After(3 * time.Second)

			select {
			case <-cleared:
				// Clear function returned.
			case <-deadline:
				t.Fatal("deadlock: clear function did not return within 3 seconds")
			}

			select {
			case <-done:
				// Callback completed without deadlock
			case <-deadline:
				t.Fatal("deadlock: resolved callback did not complete within 3 seconds")
			}
		})
	}
}
// TestShouldSuppressResolvedNotification checks three inputs to
// Manager.ShouldSuppressResolvedNotification on a freshly created manager: an
// alert with no LastNotified (suppressed), an acknowledged alert (suppressed),
// and a notified, unacknowledged warning (allowed).
func TestShouldSuppressResolvedNotification(t *testing.T) {
	t.Run("suppresses recovery when firing notification was never sent", func(t *testing.T) {
		mgr := newTestManager(t)

		// LastNotified is deliberately left unset.
		neverNotified := &Alert{
			ID:   "node1-temperature",
			Type: "temperature",
		}

		if suppressed := mgr.ShouldSuppressResolvedNotification(neverNotified); !suppressed {
			t.Fatal("expected resolved notification suppression when LastNotified is nil")
		}
	})

	t.Run("suppresses recovery for acknowledged alerts", func(t *testing.T) {
		mgr := newTestManager(t)

		lastSent := time.Now().Add(-5 * time.Minute)
		acked := &Alert{
			ID:           "node1-temperature",
			Type:         "temperature",
			Acknowledged: true,
			LastNotified: &lastSent,
		}

		if suppressed := mgr.ShouldSuppressResolvedNotification(acked); !suppressed {
			t.Fatal("expected resolved notification suppression for acknowledged alert")
		}
	})

	t.Run("allows recovery for notified unacknowledged alerts outside quiet hours", func(t *testing.T) {
		mgr := newTestManager(t)

		lastSent := time.Now().Add(-5 * time.Minute)
		notified := &Alert{
			ID:           "node1-temperature",
			Type:         "temperature",
			Level:        AlertLevelWarning,
			LastNotified: &lastSent,
		}

		if suppressed := mgr.ShouldSuppressResolvedNotification(notified); suppressed {
			t.Fatal("expected resolved notification to be allowed")
		}
	})
}
// TestClearPBSOfflineAlert covers Manager.clearPBSOfflineAlert: a call with
// nothing tracked, a call that only has an offline confirmation counter to
// reset, and the full recovery path where an active alert stays in place for
// the first two calls and is resolved on the third.
func TestClearPBSOfflineAlert(t *testing.T) {
	const offlineAlertID = "pbs-offline-pbs1"

	t.Run("no alert and no count is no-op", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.clearPBSOfflineAlert(models.PBSInstance{ID: "pbs1", Name: "PBS 1"})

		mgr.mu.RLock()
		active := len(mgr.activeAlerts)
		mgr.mu.RUnlock()

		if active != 0 {
			t.Errorf("expected 0 alerts, got %d", active)
		}
	})

	t.Run("resets offline confirmation count", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.offlineConfirmations["pbs1"] = 5
		mgr.mu.Unlock()

		mgr.clearPBSOfflineAlert(models.PBSInstance{ID: "pbs1", Name: "PBS 1"})

		mgr.mu.RLock()
		_, stillCounted := mgr.offlineConfirmations["pbs1"]
		mgr.mu.RUnlock()

		if stillCounted {
			t.Error("expected offline confirmation count to be cleared")
		}
	})

	t.Run("clears existing alert and adds to resolved", func(t *testing.T) {
		mgr := newTestManager(t)

		// Buffered so the callback never blocks on the test goroutine.
		resolvedSignal := make(chan struct{}, 1)
		mgr.SetResolvedCallback(func(_ string) {
			resolvedSignal <- struct{}{}
		})

		mgr.mu.Lock()
		mgr.offlineConfirmations["pbs1"] = 3
		mgr.activeAlerts[offlineAlertID] = &Alert{
			ID:        offlineAlertID,
			Type:      "offline",
			StartTime: time.Now().Add(-5 * time.Minute),
		}
		mgr.mu.Unlock()

		// alertActive reports whether the offline alert is still in the
		// active set.
		alertActive := func() bool {
			mgr.mu.RLock()
			defer mgr.mu.RUnlock()
			_, ok := mgr.activeAlerts[offlineAlertID]
			return ok
		}

		onlinePBS := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "pbs.local"}

		// First recovery observation: alert stays, no callback yet.
		mgr.clearPBSOfflineAlert(onlinePBS)
		if !alertActive() {
			t.Fatal("expected alert to remain active until recovery is confirmed")
		}
		select {
		case <-resolvedSignal:
			t.Fatal("expected no resolved callback before recovery is confirmed")
		default:
		}

		// Second recovery observation: alert still stays.
		mgr.clearPBSOfflineAlert(onlinePBS)
		if !alertActive() {
			t.Fatal("expected alert to remain active until final recovery confirmation")
		}

		// Third recovery observation: alert and counter must both be gone.
		mgr.clearPBSOfflineAlert(onlinePBS)

		mgr.mu.RLock()
		_, alertPresent := mgr.activeAlerts[offlineAlertID]
		_, countPresent := mgr.offlineConfirmations["pbs1"]
		mgr.mu.RUnlock()

		if alertPresent {
			t.Error("expected alert to be cleared")
		}
		if countPresent {
			t.Error("expected offline confirmation count to be cleared")
		}

		// The cleared alert must have been recorded as recently resolved.
		mgr.resolvedMutex.RLock()
		resolvedEntry := mgr.recentlyResolved[offlineAlertID]
		mgr.resolvedMutex.RUnlock()

		if resolvedEntry == nil {
			t.Error("expected alert to be added to recently resolved")
		}

		select {
		case <-resolvedSignal:
		case <-time.After(2 * time.Second):
			t.Error("expected resolved callback to be called")
		}
	})
}
// TestClearPMGOfflineAlert covers Manager.clearPMGOfflineAlert: a call with
// nothing tracked, a call that only has an offline confirmation counter to
// reset, and the full recovery path where an active alert stays in place for
// the first two calls and is resolved on the third.
func TestClearPMGOfflineAlert(t *testing.T) {
	const offlineAlertID = "pmg-offline-pmg1"

	t.Run("no alert and no count is no-op", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.clearPMGOfflineAlert(models.PMGInstance{ID: "pmg1", Name: "PMG 1"})

		mgr.mu.RLock()
		active := len(mgr.activeAlerts)
		mgr.mu.RUnlock()

		if active != 0 {
			t.Errorf("expected 0 alerts, got %d", active)
		}
	})

	t.Run("resets offline confirmation count", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.offlineConfirmations["pmg1"] = 5
		mgr.mu.Unlock()

		mgr.clearPMGOfflineAlert(models.PMGInstance{ID: "pmg1", Name: "PMG 1"})

		mgr.mu.RLock()
		_, stillCounted := mgr.offlineConfirmations["pmg1"]
		mgr.mu.RUnlock()

		if stillCounted {
			t.Error("expected offline confirmation count to be cleared")
		}
	})

	t.Run("clears existing alert and adds to resolved", func(t *testing.T) {
		mgr := newTestManager(t)

		// Buffered so the callback never blocks on the test goroutine.
		resolvedSignal := make(chan struct{}, 1)
		mgr.SetResolvedCallback(func(_ string) {
			resolvedSignal <- struct{}{}
		})

		mgr.mu.Lock()
		mgr.offlineConfirmations["pmg1"] = 3
		mgr.activeAlerts[offlineAlertID] = &Alert{
			ID:        offlineAlertID,
			Type:      "offline",
			StartTime: time.Now().Add(-5 * time.Minute),
		}
		mgr.mu.Unlock()

		// alertActive reports whether the offline alert is still in the
		// active set.
		alertActive := func() bool {
			mgr.mu.RLock()
			defer mgr.mu.RUnlock()
			_, ok := mgr.activeAlerts[offlineAlertID]
			return ok
		}

		onlinePMG := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "pmg.local"}

		// First recovery observation: alert stays, no callback yet.
		mgr.clearPMGOfflineAlert(onlinePMG)
		if !alertActive() {
			t.Fatal("expected alert to remain active until recovery is confirmed")
		}
		select {
		case <-resolvedSignal:
			t.Fatal("expected no resolved callback before recovery is confirmed")
		default:
		}

		// Second recovery observation: alert still stays.
		mgr.clearPMGOfflineAlert(onlinePMG)
		if !alertActive() {
			t.Fatal("expected alert to remain active until final recovery confirmation")
		}

		// Third recovery observation: alert and counter must both be gone.
		mgr.clearPMGOfflineAlert(onlinePMG)

		mgr.mu.RLock()
		_, alertPresent := mgr.activeAlerts[offlineAlertID]
		_, countPresent := mgr.offlineConfirmations["pmg1"]
		mgr.mu.RUnlock()

		if alertPresent {
			t.Error("expected alert to be cleared")
		}
		if countPresent {
			t.Error("expected offline confirmation count to be cleared")
		}

		// The cleared alert must have been recorded as recently resolved.
		mgr.resolvedMutex.RLock()
		resolvedEntry := mgr.recentlyResolved[offlineAlertID]
		mgr.resolvedMutex.RUnlock()

		if resolvedEntry == nil {
			t.Error("expected alert to be added to recently resolved")
		}

		select {
		case <-resolvedSignal:
		case <-time.After(2 * time.Second):
			t.Error("expected resolved callback to be called")
		}
	})
}
// TestCheckNodeOffline covers Manager.checkNodeOffline: the per-node
// DisableConnectivity override, the LastSeen refresh on an existing alert, the
// confirmation counter below its threshold, alert creation on the third
// confirmation, and the resulting history entry.
func TestCheckNodeOffline(t *testing.T) {
	const offlineAlertID = "node-offline-node1"

	t.Run("override DisableConnectivity clears alert and returns", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.Overrides = map[string]ThresholdConfig{
			"node1": {DisableConnectivity: true},
		}
		mgr.activeAlerts[offlineAlertID] = &Alert{ID: offlineAlertID}
		mgr.nodeOfflineCount["node1"] = 5
		mgr.mu.Unlock()

		mgr.checkNodeOffline(models.Node{ID: "node1", Name: "Node 1"})

		mgr.mu.RLock()
		_, alertPresent := mgr.activeAlerts[offlineAlertID]
		_, countPresent := mgr.nodeOfflineCount["node1"]
		mgr.mu.RUnlock()

		if alertPresent {
			t.Error("expected alert to be cleared when connectivity disabled")
		}
		if countPresent {
			t.Error("expected offline count to be cleared")
		}
	})

	t.Run("existing alert updates LastSeen", func(t *testing.T) {
		mgr := newTestManager(t)
		anHourAgo := time.Now().Add(-1 * time.Hour)

		mgr.mu.Lock()
		mgr.activeAlerts[offlineAlertID] = &Alert{
			ID:        offlineAlertID,
			StartTime: anHourAgo,
			LastSeen:  anHourAgo,
		}
		mgr.mu.Unlock()

		mgr.checkNodeOffline(models.Node{ID: "node1", Name: "Node 1"})

		mgr.mu.RLock()
		current := mgr.activeAlerts[offlineAlertID]
		mgr.mu.RUnlock()

		if current == nil {
			t.Fatal("expected alert to exist")
		}
		if !current.LastSeen.After(anHourAgo) {
			t.Error("expected LastSeen to be updated")
		}
		// Only LastSeen may move; the original start of the outage is kept.
		if !current.StartTime.Equal(anHourAgo) {
			t.Error("expected StartTime to remain unchanged")
		}
	})

	t.Run("insufficient confirmations waits", func(t *testing.T) {
		mgr := newTestManager(t)
		offlineNode := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}

		// Confirmation 1.
		mgr.checkNodeOffline(offlineNode)

		mgr.mu.RLock()
		firstCount := mgr.nodeOfflineCount["node1"]
		firstAlert := mgr.activeAlerts[offlineAlertID]
		mgr.mu.RUnlock()

		if firstCount != 1 {
			t.Errorf("expected count 1, got %d", firstCount)
		}
		if firstAlert != nil {
			t.Error("expected no alert after 1 confirmation")
		}

		// Confirmation 2.
		mgr.checkNodeOffline(offlineNode)

		mgr.mu.RLock()
		secondCount := mgr.nodeOfflineCount["node1"]
		secondAlert := mgr.activeAlerts[offlineAlertID]
		mgr.mu.RUnlock()

		if secondCount != 2 {
			t.Errorf("expected count 2, got %d", secondCount)
		}
		if secondAlert != nil {
			t.Error("expected no alert after 2 confirmations")
		}
	})

	t.Run("creates alert after 3 confirmations", func(t *testing.T) {
		mgr := newTestManager(t)

		// Two confirmations already recorded, so the next call is the third.
		mgr.mu.Lock()
		mgr.nodeOfflineCount["node1"] = 2
		mgr.mu.Unlock()

		mgr.checkNodeOffline(models.Node{
			ID:               "node1",
			Name:             "Node 1",
			Instance:         "pve1",
			Status:           "offline",
			ConnectionHealth: "disconnected",
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts[offlineAlertID]
		confirmations := mgr.nodeOfflineCount["node1"]
		mgr.mu.RUnlock()

		if confirmations != 3 {
			t.Errorf("expected count 3, got %d", confirmations)
		}
		if raised == nil {
			t.Fatal("expected alert after 3 confirmations")
		}
		if raised.Type != "connectivity" {
			t.Errorf("expected type connectivity, got %s", raised.Type)
		}
		if raised.Level != AlertLevelCritical {
			t.Errorf("expected critical level, got %s", raised.Level)
		}
		if raised.ResourceID != "node1" {
			t.Errorf("expected resourceID node1, got %s", raised.ResourceID)
		}
	})

	t.Run("alert added to history", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.nodeOfflineCount["node1"] = 2
		mgr.mu.Unlock()

		mgr.checkNodeOffline(models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"})

		// Scan the returned history for the alert that was just raised.
		recorded := false
		for _, entry := range mgr.GetAlertHistory(10) {
			if entry.ID == offlineAlertID {
				recorded = true
				break
			}
		}

		if !recorded {
			t.Error("expected alert to be added to history")
		}
	})
}
// TestCheckPBSOffline covers Manager.checkPBSOffline: the Disabled and
// DisableConnectivity overrides, the confirmation counter below its threshold,
// alert creation on the third confirmation, and the LastSeen refresh on an
// existing alert.
func TestCheckPBSOffline(t *testing.T) {
	const offlineAlertID = "pbs-offline-pbs1"

	t.Run("override Disabled clears alert and returns", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.Overrides = map[string]ThresholdConfig{
			"pbs1": {Disabled: true},
		}
		mgr.activeAlerts[offlineAlertID] = &Alert{ID: offlineAlertID}
		mgr.mu.Unlock()

		mgr.checkPBSOffline(models.PBSInstance{ID: "pbs1", Name: "PBS 1"})

		mgr.mu.RLock()
		_, alertPresent := mgr.activeAlerts[offlineAlertID]
		mgr.mu.RUnlock()

		if alertPresent {
			t.Error("expected alert to be cleared when disabled")
		}
	})

	t.Run("override DisableConnectivity clears alert and returns", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.Overrides = map[string]ThresholdConfig{
			"pbs1": {DisableConnectivity: true},
		}
		mgr.activeAlerts[offlineAlertID] = &Alert{ID: offlineAlertID}
		mgr.mu.Unlock()

		mgr.checkPBSOffline(models.PBSInstance{ID: "pbs1", Name: "PBS 1"})

		mgr.mu.RLock()
		_, alertPresent := mgr.activeAlerts[offlineAlertID]
		mgr.mu.RUnlock()

		if alertPresent {
			t.Error("expected alert to be cleared when connectivity disabled")
		}
	})

	t.Run("insufficient confirmations waits", func(t *testing.T) {
		mgr := newTestManager(t)
		offlinePBS := models.PBSInstance{ID: "pbs1", Name: "PBS 1"}

		// Two observations only: one short of raising the alert.
		mgr.checkPBSOffline(offlinePBS)
		mgr.checkPBSOffline(offlinePBS)

		mgr.mu.RLock()
		confirmations := mgr.offlineConfirmations["pbs1"]
		_, alertPresent := mgr.activeAlerts[offlineAlertID]
		mgr.mu.RUnlock()

		if confirmations != 2 {
			t.Errorf("expected count 2, got %d", confirmations)
		}
		if alertPresent {
			t.Error("expected no alert after 2 confirmations")
		}
	})

	t.Run("creates alert after 3 confirmations", func(t *testing.T) {
		mgr := newTestManager(t)

		// Two confirmations already recorded, so the next call is the third.
		mgr.mu.Lock()
		mgr.offlineConfirmations["pbs1"] = 2
		mgr.mu.Unlock()

		mgr.checkPBSOffline(models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "pbs.local"})

		mgr.mu.RLock()
		raised := mgr.activeAlerts[offlineAlertID]
		confirmations := mgr.offlineConfirmations["pbs1"]
		mgr.mu.RUnlock()

		if confirmations != 3 {
			t.Errorf("expected count 3, got %d", confirmations)
		}
		if raised == nil {
			t.Fatal("expected alert after 3 confirmations")
		}
		if raised.Type != "offline" {
			t.Errorf("expected type offline, got %s", raised.Type)
		}
		if raised.Level != AlertLevelCritical {
			t.Errorf("expected critical level, got %s", raised.Level)
		}
	})

	t.Run("existing alert updates LastSeen", func(t *testing.T) {
		mgr := newTestManager(t)
		anHourAgo := time.Now().Add(-1 * time.Hour)

		mgr.mu.Lock()
		mgr.offlineConfirmations["pbs1"] = 3
		mgr.activeAlerts[offlineAlertID] = &Alert{
			ID:       offlineAlertID,
			LastSeen: anHourAgo,
		}
		mgr.mu.Unlock()

		mgr.checkPBSOffline(models.PBSInstance{ID: "pbs1", Name: "PBS 1"})

		mgr.mu.RLock()
		current := mgr.activeAlerts[offlineAlertID]
		mgr.mu.RUnlock()

		if current == nil {
			t.Fatal("expected alert to exist")
		}
		if !current.LastSeen.After(anHourAgo) {
			t.Error("expected LastSeen to be updated")
		}
	})
}
func TestCheckPMGOffline(t *testing.T) {
// t.Parallel()
t.Run("override Disabled clears alert and returns", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Overrides = map[string]ThresholdConfig{
"pmg1": {Disabled: true},
}
m.activeAlerts["pmg-offline-pmg1"] = &Alert{ID: "pmg-offline-pmg1"}
m.mu.Unlock()
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
m.checkPMGOffline(pmg)
m.mu.RLock()
_, alertExists := m.activeAlerts["pmg-offline-pmg1"]
m.mu.RUnlock()
if alertExists {
t.Error("expected alert to be cleared when disabled")
}
})
t.Run("override DisableConnectivity clears alert and returns", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Overrides = map[string]ThresholdConfig{
"pmg1": {DisableConnectivity: true},
}
m.activeAlerts["pmg-offline-pmg1"] = &Alert{ID: "pmg-offline-pmg1"}
m.mu.Unlock()
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
m.checkPMGOffline(pmg)
m.mu.RLock()
_, alertExists := m.activeAlerts["pmg-offline-pmg1"]
m.mu.RUnlock()
if alertExists {
t.Error("expected alert to be cleared when connectivity disabled")
}
})
t.Run("insufficient confirmations waits", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
// First two calls - not enough confirmations
m.checkPMGOffline(pmg)
m.checkPMGOffline(pmg)
m.mu.RLock()
count := m.offlineConfirmations["pmg1"]
_, alertExists := m.activeAlerts["pmg-offline-pmg1"]
m.mu.RUnlock()
if count != 2 {
t.Errorf("expected count 2, got %d", count)
}
if alertExists {
t.Error("expected no alert after 2 confirmations")
}
})
t.Run("creates alert after 3 confirmations", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.offlineConfirmations["pmg1"] = 2
m.mu.Unlock()
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "pmg.local"}
m.checkPMGOffline(pmg)
m.mu.RLock()
alert := m.activeAlerts["pmg-offline-pmg1"]
count := m.offlineConfirmations["pmg1"]
m.mu.RUnlock()
if count != 3 {
t.Errorf("expected count 3, got %d", count)
}
if alert == nil {
t.Fatal("expected alert after 3 confirmations")
}
if alert.Type != "offline" {
t.Errorf("expected type offline, got %s", alert.Type)
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected critical level, got %s", alert.Level)
}
})
t.Run("existing alert updates LastSeen", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-1 * time.Hour)
m.mu.Lock()
m.offlineConfirmations["pmg1"] = 3
m.activeAlerts["pmg-offline-pmg1"] = &Alert{
ID: "pmg-offline-pmg1",
LastSeen: oldTime,
}
m.mu.Unlock()
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
m.checkPMGOffline(pmg)
m.mu.RLock()
alert := m.activeAlerts["pmg-offline-pmg1"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected alert to exist")
}
if !alert.LastSeen.After(oldTime) {
t.Error("expected LastSeen to be updated")
}
})
}
// TestCalculateTrimmedBaseline is a table-driven check of
// calculateTrimmedBaseline. Each case supplies a sample set and pins both the
// returned baseline and the trustworthy flag.
func TestCalculateTrimmedBaseline(t *testing.T) {
	// t.Parallel() is left disabled for this test and its subtests.

	// repeat returns count copies of value.
	repeat := func(value float64, count int) []float64 {
		out := make([]float64, count)
		for i := range out {
			out[i] = value
		}
		return out
	}
	// countUp returns 1, 2, ..., count.
	countUp := func(count int) []float64 {
		out := make([]float64, count)
		for i := range out {
			out[i] = float64(i + 1)
		}
		return out
	}

	// 24 samples: four 100s followed by twenty 10s. The median is 10; per the
	// trimming the other cases describe (2 dropped from each end) two 100s
	// would survive, putting the trimmed mean far above the median.
	skewedHigh := append(repeat(100.0, 4), repeat(10.0, 20)...)

	// 24 samples: 1, 2, twenty 50s, 100, 200.
	// NOTE(review): assuming two samples are trimmed from each end, every
	// surviving value is 50, so trimmed mean and median are both 50 here; the
	// data does not actually make the trimmed mean smaller than the median as
	// the subtest name suggests.
	bracketed := append([]float64{1, 2}, repeat(50.0, 20)...)
	bracketed = append(bracketed, 100, 200)

	cases := []struct {
		name        string
		samples     []float64
		wantTrusted bool
		want        float64
		trustMsg    string // reported when the trustworthy flag is wrong
		baselineFmt string // format reported when the baseline is wrong
	}{
		{
			name:        "less than 12 samples returns untrustworthy",
			samples:     countUp(11),
			wantTrusted: false,
			want:        0,
			trustMsg:    "expected untrustworthy with less than 12 samples",
			baselineFmt: "expected baseline 0, got %f",
		},
		{
			name:        "empty samples returns untrustworthy",
			samples:     []float64{},
			wantTrusted: false,
			want:        0,
			trustMsg:    "expected untrustworthy with empty samples",
			baselineFmt: "expected baseline 0, got %f",
		},
		{
			// Mean of 1..12 is 78/12 = 6.5.
			name:        "12-23 samples uses simple mean",
			samples:     countUp(12),
			wantTrusted: true,
			want:        6.5,
			trustMsg:    "expected trustworthy with 12 samples",
			baselineFmt: "expected baseline 6.5, got %f",
		},
		{
			// Identical values: any mean or median equals the value itself.
			name:        "24+ samples uses trimmed mean",
			samples:     repeat(10.0, 24),
			wantTrusted: true,
			want:        10.0,
			trustMsg:    "expected trustworthy with 24 samples",
			baselineFmt: "expected baseline 10.0, got %f",
		},
		{
			name:        "24+ samples falls back to median when diff > 40%",
			samples:     skewedHigh,
			wantTrusted: true,
			want:        10.0,
			trustMsg:    "expected trustworthy",
			baselineFmt: "expected baseline 10.0 (median fallback), got %f",
		},
		{
			// Median of 1..24 is (12+13)/2 = 12.5; mean of 3..22 is 250/20 = 12.5.
			name:        "24+ samples uses trimmed mean when diff <= 40%",
			samples:     countUp(24),
			wantTrusted: true,
			want:        12.5,
			trustMsg:    "expected trustworthy",
			baselineFmt: "expected baseline 12.5, got %f",
		},
		{
			// Median of 1..25 is the 13th element; mean of 3..23 is 273/21 = 13.
			name:        "odd length array uses middle element for median",
			samples:     countUp(25),
			wantTrusted: true,
			want:        13.0,
			trustMsg:    "expected trustworthy",
			baselineFmt: "expected baseline 13.0, got %f",
		},
		{
			name:        "trimmed mean less than median triggers diff calculation",
			samples:     bracketed,
			wantTrusted: true,
			want:        50.0,
			trustMsg:    "expected trustworthy",
			baselineFmt: "expected baseline 50.0, got %f",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, trusted := calculateTrimmedBaseline(tc.samples)
			if trusted != tc.wantTrusted {
				t.Error(tc.trustMsg)
			}
			if got != tc.want {
				t.Errorf(tc.baselineFmt, got)
			}
		})
	}
}
func TestCreateOrUpdateNodeAlert(t *testing.T) {
// t.Parallel()
t.Run("creates new alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
m.createOrUpdateNodeAlert(
"pmg1-node-queue",
pmg,
"mail-node1",
"pmg-node-queue",
AlertLevelWarning,
100,
50,
"Queue depth high",
)
m.mu.RLock()
alert := m.activeAlerts["pmg1-node-queue"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected alert to be created")
}
if alert.Type != "pmg-node-queue" {
t.Errorf("expected type pmg-node-queue, got %s", alert.Type)
}
if alert.Level != AlertLevelWarning {
t.Errorf("expected warning level, got %s", alert.Level)
}
if alert.Value != 100 {
t.Errorf("expected value 100, got %f", alert.Value)
}
if alert.Threshold != 50 {
t.Errorf("expected threshold 50, got %f", alert.Threshold)
}
if alert.Node != "mail-node1" {
t.Errorf("expected node mail-node1, got %s", alert.Node)
}
})
t.Run("updates existing alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-1 * time.Hour)
m.mu.Lock()
m.activeAlerts["pmg1-node-queue"] = &Alert{
ID: "pmg1-node-queue",
Value: 50,
Threshold: 40,
Level: AlertLevelWarning,
Message: "Old message",
LastSeen: oldTime,
}
m.mu.Unlock()
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
m.createOrUpdateNodeAlert(
"pmg1-node-queue",
pmg,
"mail-node1",
"pmg-node-queue",
AlertLevelCritical,
200,
100,
"New message",
)
m.mu.RLock()
alert := m.activeAlerts["pmg1-node-queue"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected alert to exist")
}
if alert.Value != 200 {
t.Errorf("expected value 200, got %f", alert.Value)
}
if alert.Threshold != 100 {
t.Errorf("expected threshold 100, got %f", alert.Threshold)
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected critical level, got %s", alert.Level)
}
if alert.Message != "New message" {
t.Errorf("expected 'New message', got %s", alert.Message)
}
if !alert.LastSeen.After(oldTime) {
t.Error("expected LastSeen to be updated")
}
})
}
func TestCheckPMGQueueDepths(t *testing.T) {
// t.Parallel()
t.Run("no thresholds configured does not create alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 1000, Deferred: 500, Hold: 300}},
},
}
// No thresholds configured (all 0)
defaults := PMGThresholdConfig{}
m.checkPMGQueueDepths(pmg, defaults)
m.mu.RLock()
totalAlerts := len(m.activeAlerts)
m.mu.RUnlock()
if totalAlerts != 0 {
t.Errorf("expected no alerts when no thresholds configured, got %d", totalAlerts)
}
})
t.Run("total queue warning alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Host: "pmg-server",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 300}},
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Total: 250}},
},
}
defaults := PMGThresholdConfig{
QueueTotalWarning: 500,
QueueTotalCritical: 1000,
}
m.checkPMGQueueDepths(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-queue-total"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected warning alert to be created")
}
if alert.Level != AlertLevelWarning {
t.Errorf("expected warning level, got %s", alert.Level)
}
if alert.Value != 550 {
t.Errorf("expected value 550, got %f", alert.Value)
}
})
t.Run("total queue critical alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 600}},
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Total: 500}},
},
}
defaults := PMGThresholdConfig{
QueueTotalWarning: 500,
QueueTotalCritical: 1000,
}
m.checkPMGQueueDepths(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-queue-total"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected critical alert to be created")
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected critical level, got %s", alert.Level)
}
if alert.Value != 1100 {
t.Errorf("expected value 1100, got %f", alert.Value)
}
})
t.Run("deferred queue warning alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Deferred: 150}},
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Deferred: 100}},
},
}
defaults := PMGThresholdConfig{
DeferredQueueWarn: 200,
DeferredQueueCritical: 500,
}
m.checkPMGQueueDepths(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-queue-deferred"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected deferred alert to be created")
}
if alert.Level != AlertLevelWarning {
t.Errorf("expected warning level, got %s", alert.Level)
}
if alert.Value != 250 {
t.Errorf("expected value 250, got %f", alert.Value)
}
})
t.Run("deferred queue critical alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Deferred: 300}},
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Deferred: 250}},
},
}
defaults := PMGThresholdConfig{
DeferredQueueWarn: 200,
DeferredQueueCritical: 500,
}
m.checkPMGQueueDepths(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-queue-deferred"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected critical alert to be created")
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected critical level, got %s", alert.Level)
}
})
t.Run("hold queue warning alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Hold: 75}},
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Hold: 50}},
},
}
defaults := PMGThresholdConfig{
HoldQueueWarn: 100,
HoldQueueCritical: 300,
}
m.checkPMGQueueDepths(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-queue-hold"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected hold alert to be created")
}
if alert.Level != AlertLevelWarning {
t.Errorf("expected warning level, got %s", alert.Level)
}
if alert.Value != 125 {
t.Errorf("expected value 125, got %f", alert.Value)
}
})
t.Run("hold queue critical alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Hold: 200}},
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Hold: 150}},
},
}
defaults := PMGThresholdConfig{
HoldQueueWarn: 100,
HoldQueueCritical: 300,
}
m.checkPMGQueueDepths(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-queue-hold"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected critical alert to be created")
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected critical level, got %s", alert.Level)
}
})
t.Run("updates existing alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-1 * time.Hour)
m.mu.Lock()
m.activeAlerts["pmg1-queue-total"] = &Alert{
ID: "pmg1-queue-total",
Value: 400,
Level: AlertLevelWarning,
LastSeen: oldTime,
}
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 1200}},
},
}
defaults := PMGThresholdConfig{
QueueTotalWarning: 500,
QueueTotalCritical: 1000,
}
m.checkPMGQueueDepths(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-queue-total"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected alert to exist")
}
if alert.Value != 1200 {
t.Errorf("expected value 1200, got %f", alert.Value)
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected critical level, got %s", alert.Level)
}
if !alert.LastSeen.After(oldTime) {
t.Error("expected LastSeen to be updated")
}
})
t.Run("below threshold clears alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["pmg1-queue-total"] = &Alert{ID: "pmg1-queue-total"}
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 100}},
},
}
defaults := PMGThresholdConfig{
QueueTotalWarning: 500,
QueueTotalCritical: 1000,
}
m.checkPMGQueueDepths(pmg, defaults)
m.mu.RLock()
_, exists := m.activeAlerts["pmg1-queue-total"]
m.mu.RUnlock()
if exists {
t.Error("expected alert to be cleared when below threshold")
}
})
t.Run("nil QueueStatus is handled", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: nil},
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Total: 100}},
},
}
defaults := PMGThresholdConfig{
QueueTotalWarning: 500,
}
// Should not panic
m.checkPMGQueueDepths(pmg, defaults)
m.mu.RLock()
_, exists := m.activeAlerts["pmg1-queue-total"]
m.mu.RUnlock()
if exists {
t.Error("expected no alert with total below threshold")
}
})
}
func TestCheckPMGOldestMessage(t *testing.T) {
// t.Parallel()
t.Run("no thresholds configured returns early", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 7200}}, // 2 hours
},
}
defaults := PMGThresholdConfig{} // No thresholds
m.checkPMGOldestMessage(pmg, defaults)
m.mu.RLock()
_, exists := m.activeAlerts["pmg1-oldest-message"]
m.mu.RUnlock()
if exists {
t.Error("expected no alert when no thresholds configured")
}
})
t.Run("no messages clears existing alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["pmg1-oldest-message"] = &Alert{ID: "pmg1-oldest-message"}
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 0}},
},
}
defaults := PMGThresholdConfig{
OldestMessageWarnMins: 30,
OldestMessageCritMins: 60,
}
m.checkPMGOldestMessage(pmg, defaults)
m.mu.RLock()
_, exists := m.activeAlerts["pmg1-oldest-message"]
m.mu.RUnlock()
if exists {
t.Error("expected alert to be cleared when no messages in queue")
}
})
t.Run("warning alert when message age exceeds warning threshold", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Host: "pmg-server",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 2400}}, // 40 minutes
},
}
defaults := PMGThresholdConfig{
OldestMessageWarnMins: 30,
OldestMessageCritMins: 60,
}
m.checkPMGOldestMessage(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-oldest-message"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected warning alert to be created")
}
if alert.Level != AlertLevelWarning {
t.Errorf("expected warning level, got %s", alert.Level)
}
if alert.Value != 40 {
t.Errorf("expected value 40 minutes, got %f", alert.Value)
}
if alert.Threshold != 30 {
t.Errorf("expected threshold 30, got %f", alert.Threshold)
}
})
t.Run("critical alert when message age exceeds critical threshold", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 4200}}, // 70 minutes
},
}
defaults := PMGThresholdConfig{
OldestMessageWarnMins: 30,
OldestMessageCritMins: 60,
}
m.checkPMGOldestMessage(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-oldest-message"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected critical alert to be created")
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected critical level, got %s", alert.Level)
}
if alert.Threshold != 60 {
t.Errorf("expected threshold 60, got %f", alert.Threshold)
}
})
t.Run("below threshold clears alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["pmg1-oldest-message"] = &Alert{ID: "pmg1-oldest-message"}
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 900}}, // 15 minutes
},
}
defaults := PMGThresholdConfig{
OldestMessageWarnMins: 30,
OldestMessageCritMins: 60,
}
m.checkPMGOldestMessage(pmg, defaults)
m.mu.RLock()
_, exists := m.activeAlerts["pmg1-oldest-message"]
m.mu.RUnlock()
if exists {
t.Error("expected alert to be cleared when below threshold")
}
})
t.Run("finds oldest across multiple nodes", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 1200}}, // 20 minutes
{Name: "node2", QueueStatus: &models.PMGQueueStatus{OldestAge: 3000}}, // 50 minutes (oldest)
{Name: "node3", QueueStatus: &models.PMGQueueStatus{OldestAge: 600}}, // 10 minutes
},
}
defaults := PMGThresholdConfig{
OldestMessageWarnMins: 30,
OldestMessageCritMins: 60,
}
m.checkPMGOldestMessage(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-oldest-message"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected alert to be created")
}
if alert.Value != 50 {
t.Errorf("expected value 50 (oldest across nodes), got %f", alert.Value)
}
})
t.Run("updates existing alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-1 * time.Hour)
m.mu.Lock()
m.activeAlerts["pmg1-oldest-message"] = &Alert{
ID: "pmg1-oldest-message",
Value: 40,
Level: AlertLevelWarning,
LastSeen: oldTime,
}
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 4800}}, // 80 minutes
},
}
defaults := PMGThresholdConfig{
OldestMessageWarnMins: 30,
OldestMessageCritMins: 60,
}
m.checkPMGOldestMessage(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-oldest-message"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected alert to exist")
}
if alert.Value != 80 {
t.Errorf("expected value 80, got %f", alert.Value)
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected critical level, got %s", alert.Level)
}
if !alert.LastSeen.After(oldTime) {
t.Error("expected LastSeen to be updated")
}
})
t.Run("nil QueueStatus is handled", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: nil},
{Name: "node2", QueueStatus: &models.PMGQueueStatus{OldestAge: 2400}}, // 40 minutes
},
}
defaults := PMGThresholdConfig{
OldestMessageWarnMins: 30,
}
// Should not panic and should use the valid node's data
m.checkPMGOldestMessage(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-oldest-message"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected alert to be created from valid node data")
}
if alert.Value != 40 {
t.Errorf("expected value 40, got %f", alert.Value)
}
})
}
// TestCheckStorageOffline exercises Manager.checkStorageOffline for the
// storage "local-lvm": the first poll only bumps the confirmation counter,
// the second poll raises a warning-level "offline" alert, an already-active
// alert gets its LastSeen refreshed, and a Disabled override both clears an
// existing alert and prevents a new one.
func TestCheckStorageOffline(t *testing.T) {
	// t.Parallel() is left disabled for this test and its subtests.
	t.Run("first poll increments confirmation but does not create alert", func(t *testing.T) {
		m := newTestManager(t)
		storage := models.Storage{
			ID:   "local-lvm",
			Name: "Local LVM",
			Node: "pve-node1",
		}

		m.checkStorageOffline(storage)

		m.mu.RLock()
		confirmCount := m.offlineConfirmations["local-lvm"]
		_, alertExists := m.activeAlerts["storage-offline-local-lvm"]
		m.mu.RUnlock()

		if confirmCount != 1 {
			t.Errorf("expected confirmation count 1, got %d", confirmCount)
		}
		if alertExists {
			t.Error("expected no alert on first poll")
		}
	})

	t.Run("second poll creates alert after confirmation", func(t *testing.T) {
		m := newTestManager(t)
		storage := models.Storage{
			ID:       "local-lvm",
			Name:     "Local LVM",
			Node:     "pve-node1",
			Instance: "pve-instance",
		}

		// First poll - confirmation
		m.checkStorageOffline(storage)
		// Second poll - should create alert
		m.checkStorageOffline(storage)

		m.mu.RLock()
		alert := m.activeAlerts["storage-offline-local-lvm"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected alert to be created after second poll")
		}
		if alert.Type != "offline" {
			t.Errorf("expected type 'offline', got %s", alert.Type)
		}
		if alert.Level != AlertLevelWarning {
			t.Errorf("expected warning level, got %s", alert.Level)
		}
		if alert.ResourceID != "local-lvm" {
			t.Errorf("expected resource ID 'local-lvm', got %s", alert.ResourceID)
		}
		if alert.Node != "pve-node1" {
			t.Errorf("expected node 'pve-node1', got %s", alert.Node)
		}
	})

	t.Run("existing alert updates LastSeen", func(t *testing.T) {
		m := newTestManager(t)
		oldTime := time.Now().Add(-1 * time.Hour)

		m.mu.Lock()
		m.offlineConfirmations["local-lvm"] = 5 // Already confirmed
		m.activeAlerts["storage-offline-local-lvm"] = &Alert{
			ID:       "storage-offline-local-lvm",
			LastSeen: oldTime,
		}
		m.mu.Unlock()

		storage := models.Storage{
			ID:   "local-lvm",
			Name: "Local LVM",
			Node: "pve-node1",
		}
		m.checkStorageOffline(storage)

		m.mu.RLock()
		alert := m.activeAlerts["storage-offline-local-lvm"]
		m.mu.RUnlock()

		// Guard before dereferencing: if the alert were removed, report a
		// normal test failure instead of a nil-pointer panic.
		if alert == nil {
			t.Fatal("expected alert to exist")
		}
		if !alert.LastSeen.After(oldTime) {
			t.Error("expected LastSeen to be updated")
		}
	})

	t.Run("disabled storage clears existing alert", func(t *testing.T) {
		m := newTestManager(t)

		// Pre-create an alert
		m.mu.Lock()
		m.activeAlerts["storage-offline-local-lvm"] = &Alert{ID: "storage-offline-local-lvm"}
		m.config.Overrides = map[string]ThresholdConfig{
			"local-lvm": {Disabled: true},
		}
		m.mu.Unlock()

		storage := models.Storage{
			ID:   "local-lvm",
			Name: "Local LVM",
			Node: "pve-node1",
		}
		m.checkStorageOffline(storage)

		m.mu.RLock()
		_, exists := m.activeAlerts["storage-offline-local-lvm"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected alert to be cleared when storage is disabled")
		}
	})

	t.Run("disabled storage does not create alert", func(t *testing.T) {
		m := newTestManager(t)

		m.mu.Lock()
		m.config.Overrides = map[string]ThresholdConfig{
			"local-lvm": {Disabled: true},
		}
		m.mu.Unlock()

		storage := models.Storage{
			ID:   "local-lvm",
			Name: "Local LVM",
			Node: "pve-node1",
		}

		// Multiple polls should not create alert
		m.checkStorageOffline(storage)
		m.checkStorageOffline(storage)
		m.checkStorageOffline(storage)

		m.mu.RLock()
		_, exists := m.activeAlerts["storage-offline-local-lvm"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected no alert when storage is disabled")
		}
	})
}
// TestCheckGuestPoweredOff exercises Manager.checkGuestPoweredOff: the first
// poll only bumps the confirmation counter, the second poll raises a
// "powered-off" alert, an already-active alert gets its LastSeen refreshed,
// the monitorOnly flag is recorded in alert metadata, Disabled and
// DisableConnectivity overrides clear existing alerts, severity comes from a
// per-guest override or the guest defaults, and the guest type and name
// appear in the alert message.
func TestCheckGuestPoweredOff(t *testing.T) {
	// t.Parallel() is left disabled for this test and its subtests.
	t.Run("first poll increments confirmation but does not create alert", func(t *testing.T) {
		m := newTestManager(t)

		m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)

		m.mu.RLock()
		confirmCount := m.offlineConfirmations["vm100"]
		_, alertExists := m.activeAlerts["guest-powered-off-vm100"]
		m.mu.RUnlock()

		if confirmCount != 1 {
			t.Errorf("expected confirmation count 1, got %d", confirmCount)
		}
		if alertExists {
			t.Error("expected no alert on first poll")
		}
	})

	t.Run("second poll creates alert after confirmation", func(t *testing.T) {
		m := newTestManager(t)

		// First poll - confirmation
		m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)
		// Second poll - should create alert
		m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)

		m.mu.RLock()
		alert := m.activeAlerts["guest-powered-off-vm100"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected alert to be created after second poll")
		}
		if alert.Type != "powered-off" {
			t.Errorf("expected type 'powered-off', got %s", alert.Type)
		}
		if alert.Level != AlertLevelWarning {
			t.Errorf("expected warning level (default severity), got %s", alert.Level)
		}
		if alert.ResourceID != "vm100" {
			t.Errorf("expected resource ID 'vm100', got %s", alert.ResourceID)
		}
	})

	t.Run("existing alert updates LastSeen and level", func(t *testing.T) {
		m := newTestManager(t)
		oldTime := time.Now().Add(-1 * time.Hour)

		m.mu.Lock()
		m.activeAlerts["guest-powered-off-vm100"] = &Alert{
			ID:       "guest-powered-off-vm100",
			LastSeen: oldTime,
			Level:    AlertLevelWarning,
		}
		m.mu.Unlock()

		m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)

		m.mu.RLock()
		alert := m.activeAlerts["guest-powered-off-vm100"]
		m.mu.RUnlock()

		// Guard before dereferencing: if the alert were removed, report a
		// normal test failure instead of a nil-pointer panic.
		if alert == nil {
			t.Fatal("expected alert to exist")
		}
		// NOTE(review): the subtest name mentions the level, but only
		// LastSeen is asserted here — confirm whether a level check is wanted.
		if !alert.LastSeen.After(oldTime) {
			t.Error("expected LastSeen to be updated")
		}
	})

	t.Run("monitorOnly flag is set in metadata", func(t *testing.T) {
		m := newTestManager(t)

		// First poll
		m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", true)
		// Second poll - creates alert
		m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", true)

		m.mu.RLock()
		alert := m.activeAlerts["guest-powered-off-vm100"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected alert to be created")
		}
		if alert.Metadata == nil {
			t.Fatal("expected metadata to be set")
		}
		if monitorOnly, ok := alert.Metadata["monitorOnly"].(bool); !ok || !monitorOnly {
			t.Error("expected monitorOnly to be true")
		}
	})

	t.Run("disabled guest clears existing alert", func(t *testing.T) {
		m := newTestManager(t)

		// Pre-create an alert and confirmation count
		m.mu.Lock()
		m.activeAlerts["guest-powered-off-vm100"] = &Alert{ID: "guest-powered-off-vm100"}
		m.offlineConfirmations["vm100"] = 5
		m.config.Overrides = map[string]ThresholdConfig{
			"vm100": {Disabled: true},
		}
		m.mu.Unlock()

		m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)

		m.mu.RLock()
		_, alertExists := m.activeAlerts["guest-powered-off-vm100"]
		_, confirmExists := m.offlineConfirmations["vm100"]
		m.mu.RUnlock()

		if alertExists {
			t.Error("expected alert to be cleared when guest is disabled")
		}
		if confirmExists {
			t.Error("expected confirmation count to be cleared")
		}
	})

	t.Run("disableConnectivity clears existing alert", func(t *testing.T) {
		m := newTestManager(t)

		// Pre-create an alert
		m.mu.Lock()
		m.activeAlerts["guest-powered-off-vm100"] = &Alert{ID: "guest-powered-off-vm100"}
		m.config.Overrides = map[string]ThresholdConfig{
			"vm100": {DisableConnectivity: true},
		}
		m.mu.Unlock()

		m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)

		m.mu.RLock()
		_, exists := m.activeAlerts["guest-powered-off-vm100"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected alert to be cleared when connectivity is disabled")
		}
	})

	t.Run("uses override severity when configured", func(t *testing.T) {
		m := newTestManager(t)

		m.mu.Lock()
		m.config.Overrides = map[string]ThresholdConfig{
			"vm100": {PoweredOffSeverity: AlertLevelCritical},
		}
		m.mu.Unlock()

		// First poll
		m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)
		// Second poll
		m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)

		m.mu.RLock()
		alert := m.activeAlerts["guest-powered-off-vm100"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected alert to be created")
		}
		if alert.Level != AlertLevelCritical {
			t.Errorf("expected critical level from override, got %s", alert.Level)
		}
	})

	t.Run("uses default severity when no override", func(t *testing.T) {
		m := newTestManager(t)

		m.mu.Lock()
		m.config.GuestDefaults.PoweredOffSeverity = AlertLevelCritical
		m.mu.Unlock()

		// First poll
		m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)
		// Second poll
		m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)

		m.mu.RLock()
		alert := m.activeAlerts["guest-powered-off-vm100"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected alert to be created")
		}
		if alert.Level != AlertLevelCritical {
			t.Errorf("expected critical level from defaults, got %s", alert.Level)
		}
	})

	t.Run("container type in message", func(t *testing.T) {
		m := newTestManager(t)

		// First poll
		m.checkGuestPoweredOff("ct200", "TestContainer", "pve-node1", "pve-instance", "Container", false)
		// Second poll
		m.checkGuestPoweredOff("ct200", "TestContainer", "pve-node1", "pve-instance", "Container", false)

		m.mu.RLock()
		alert := m.activeAlerts["guest-powered-off-ct200"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected alert to be created")
		}
		if !strings.Contains(alert.Message, "Container") {
			t.Errorf("expected message to contain 'Container', got %s", alert.Message)
		}
		if !strings.Contains(alert.Message, "TestContainer") {
			t.Errorf("expected message to contain 'TestContainer', got %s", alert.Message)
		}
	})
}
func TestCleanup(t *testing.T) {
// t.Parallel()
t.Run("auto-acknowledges old alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-3 * time.Hour)
m.mu.Lock()
m.config.AutoAcknowledgeAfterHours = 2
m.activeAlerts["old-alert"] = &Alert{
ID: "old-alert",
StartTime: oldTime,
Acknowledged: false,
}
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
alert := m.activeAlerts["old-alert"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected alert to exist")
}
if !alert.Acknowledged {
t.Error("expected alert to be auto-acknowledged")
}
if alert.AckUser != "system-auto" {
t.Errorf("expected AckUser 'system-auto', got %s", alert.AckUser)
}
})
t.Run("removes old acknowledged alerts by TTL", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldAckTime := time.Now().Add(-10 * 24 * time.Hour) // 10 days ago
m.mu.Lock()
m.config.MaxAcknowledgedAgeDays = 7
m.activeAlerts["ack-alert"] = &Alert{
ID: "ack-alert",
Acknowledged: true,
AckTime: &oldAckTime,
}
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
_, exists := m.activeAlerts["ack-alert"]
m.mu.RUnlock()
if exists {
t.Error("expected acknowledged alert to be removed by TTL")
}
})
t.Run("removes old unacknowledged alerts by TTL", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-40 * 24 * time.Hour) // 40 days ago
m.mu.Lock()
m.config.MaxAlertAgeDays = 30
m.config.AutoAcknowledgeAfterHours = 0 // Disable auto-acknowledge to test TTL
m.activeAlerts["old-unack-alert"] = &Alert{
ID: "old-unack-alert",
StartTime: oldTime,
Acknowledged: false,
}
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
_, exists := m.activeAlerts["old-unack-alert"]
m.mu.RUnlock()
if exists {
t.Error("expected old unacknowledged alert to be removed by TTL")
}
})
t.Run("removes acknowledged alerts by maxAge fallback", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldAckTime := time.Now().Add(-2 * time.Hour)
m.mu.Lock()
m.activeAlerts["ack-fallback"] = &Alert{
ID: "ack-fallback",
Acknowledged: true,
AckTime: &oldAckTime,
}
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
_, exists := m.activeAlerts["ack-fallback"]
m.mu.RUnlock()
if exists {
t.Error("expected acknowledged alert to be removed by maxAge fallback")
}
})
t.Run("keeps acknowledged alerts that are still active", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldAckTime := time.Now().Add(-2 * time.Hour)
recentSeen := time.Now().Add(-5 * time.Minute)
m.mu.Lock()
m.activeAlerts["ack-active"] = &Alert{
ID: "ack-active",
Acknowledged: true,
AckTime: &oldAckTime,
LastSeen: recentSeen,
StartTime: recentSeen,
}
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
_, exists := m.activeAlerts["ack-active"]
m.mu.RUnlock()
if !exists {
t.Error("expected acknowledged active alert to remain")
}
})
t.Run("cleans up old recent alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-10 * time.Minute)
m.mu.Lock()
m.recentAlerts["recent-old"] = &Alert{
ID: "recent-old",
StartTime: oldTime,
}
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
_, exists := m.recentAlerts["recent-old"]
m.mu.RUnlock()
if exists {
t.Error("expected old recent alert to be cleaned up")
}
})
t.Run("cleans up expired suppressions", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.suppressedUntil["suppressed-alert"] = time.Now().Add(-1 * time.Hour)
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
_, exists := m.suppressedUntil["suppressed-alert"]
m.mu.RUnlock()
if exists {
t.Error("expected expired suppression to be cleaned up")
}
})
t.Run("cleans up old rate limit entries", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.alertRateLimit["rate-limited"] = []time.Time{
time.Now().Add(-2 * time.Hour), // Old, should be removed
time.Now().Add(-30 * time.Minute), // Recent, should remain
}
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
times := m.alertRateLimit["rate-limited"]
m.mu.RUnlock()
if len(times) != 1 {
t.Errorf("expected 1 recent time, got %d", len(times))
}
})
t.Run("removes empty rate limit entries", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.alertRateLimit["all-old"] = []time.Time{
time.Now().Add(-2 * time.Hour),
}
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
_, exists := m.alertRateLimit["all-old"]
m.mu.RUnlock()
if exists {
t.Error("expected empty rate limit entry to be removed")
}
})
t.Run("cleans up old recently resolved alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.resolvedMutex.Lock()
m.recentlyResolved["old-resolved"] = &ResolvedAlert{
Alert: &Alert{ID: "old-resolved"},
ResolvedTime: time.Now().Add(-10 * time.Minute),
}
m.resolvedMutex.Unlock()
m.Cleanup(1 * time.Hour)
m.resolvedMutex.Lock()
_, exists := m.recentlyResolved["old-resolved"]
m.resolvedMutex.Unlock()
if exists {
t.Error("expected old recently resolved alert to be cleaned up")
}
})
t.Run("cleans up stale pending alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.pendingAlerts["stale-pending"] = time.Now().Add(-15 * time.Minute)
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
_, exists := m.pendingAlerts["stale-pending"]
m.mu.RUnlock()
if exists {
t.Error("expected stale pending alert to be cleaned up")
}
})
t.Run("cleans up flapping history for inactive alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.flappingHistory["inactive-alert"] = []time.Time{
time.Now().Add(-30 * time.Minute),
}
m.flappingActive["inactive-alert"] = true
// No active alert, no suppression
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
_, historyExists := m.flappingHistory["inactive-alert"]
_, activeExists := m.flappingActive["inactive-alert"]
m.mu.RUnlock()
if historyExists {
t.Error("expected flapping history to be cleaned up")
}
if activeExists {
t.Error("expected flapping active flag to be cleaned up")
}
})
t.Run("cleans up stale Docker restart tracking", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.dockerRestartTracking["stale-container"] = &dockerRestartRecord{
lastChecked: time.Now().Add(-25 * time.Hour),
}
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
_, exists := m.dockerRestartTracking["stale-container"]
m.mu.RUnlock()
if exists {
t.Error("expected stale Docker restart tracking to be cleaned up")
}
})
t.Run("cleans up stale PMG anomaly trackers", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.pmgAnomalyTrackers["stale-pmg"] = &pmgAnomalyTracker{
LastSampleTime: time.Now().Add(-25 * time.Hour),
}
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
_, exists := m.pmgAnomalyTrackers["stale-pmg"]
m.mu.RUnlock()
if exists {
t.Error("expected stale PMG anomaly tracker to be cleaned up")
}
})
t.Run("cleans up empty PMG quarantine history", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.pmgQuarantineHistory["empty-pmg"] = []pmgQuarantineSnapshot{}
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
_, exists := m.pmgQuarantineHistory["empty-pmg"]
m.mu.RUnlock()
if exists {
t.Error("expected empty PMG quarantine history to be cleaned up")
}
})
t.Run("cleans up stale PMG quarantine history", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.pmgQuarantineHistory["stale-pmg"] = []pmgQuarantineSnapshot{
{Timestamp: time.Now().Add(-8 * 24 * time.Hour)},
}
m.mu.Unlock()
m.Cleanup(1 * time.Hour)
m.mu.RLock()
_, exists := m.pmgQuarantineHistory["stale-pmg"]
m.mu.RUnlock()
if exists {
t.Error("expected stale PMG quarantine history to be cleaned up")
}
})
}
// TestConvertLegacyThreshold covers Manager.convertLegacyThreshold. Nil, zero,
// and negative inputs are expected to yield nil; a positive input is expected
// to become a trigger/clear pair whose clear level sits one hysteresis margin
// below the trigger.
//
// Subtests are intentionally serial: newTestManager holds testEnvMu for the
// lifetime of each subtest.
func TestConvertLegacyThreshold(t *testing.T) {
	t.Run("nil input returns nil", func(t *testing.T) {
		m := newTestManager(t)
		if got := m.convertLegacyThreshold(nil); got != nil {
			t.Error("expected nil result for nil input")
		}
	})

	t.Run("zero value returns nil", func(t *testing.T) {
		m := newTestManager(t)
		legacy := 0.0
		if got := m.convertLegacyThreshold(&legacy); got != nil {
			t.Error("expected nil result for zero value")
		}
	})

	t.Run("negative value returns nil", func(t *testing.T) {
		m := newTestManager(t)
		legacy := -5.0
		if got := m.convertLegacyThreshold(&legacy); got != nil {
			t.Error("expected nil result for negative value")
		}
	})

	t.Run("positive value with default margin", func(t *testing.T) {
		m := newTestManager(t)
		legacy := 80.0

		got := m.convertLegacyThreshold(&legacy)
		if got == nil {
			t.Fatal("expected non-nil result")
		}
		if got.Trigger != 80.0 {
			t.Errorf("expected trigger 80.0, got %f", got.Trigger)
		}
		// 80 minus the default margin of 5.
		if got.Clear != 75.0 {
			t.Errorf("expected clear 75.0, got %f", got.Clear)
		}
	})

	t.Run("positive value with custom margin", func(t *testing.T) {
		m := newTestManager(t)
		func() {
			m.mu.Lock()
			defer m.mu.Unlock()
			m.config.HysteresisMargin = 10.0
		}()
		legacy := 80.0

		got := m.convertLegacyThreshold(&legacy)
		if got == nil {
			t.Fatal("expected non-nil result")
		}
		if got.Trigger != 80.0 {
			t.Errorf("expected trigger 80.0, got %f", got.Trigger)
		}
		// 80 minus the custom margin of 10 configured above.
		if got.Clear != 70.0 {
			t.Errorf("expected clear 70.0, got %f", got.Clear)
		}
	})
}
func TestCheckEscalations(t *testing.T) {
// t.Parallel()
t.Run("does nothing when escalation is disabled", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-2 * time.Hour)
m.mu.Lock()
m.config.ActivationState = ActivationActive
m.config.Schedule.Escalation.Enabled = false
m.config.Schedule.Escalation.Levels = []EscalationLevel{
{After: 30, Notify: "email"},
}
m.activeAlerts["test-alert"] = &Alert{
ID: "test-alert",
StartTime: oldTime,
LastEscalation: 0,
}
m.mu.Unlock()
m.checkEscalations()
m.mu.RLock()
alert := m.activeAlerts["test-alert"]
m.mu.RUnlock()
if alert.LastEscalation != 0 {
t.Errorf("expected no escalation when disabled, got %d", alert.LastEscalation)
}
})
t.Run("does nothing when alerts are globally disabled", func(t *testing.T) {
m := newTestManager(t)
oldTime := time.Now().Add(-2 * time.Hour)
m.mu.Lock()
m.config.Enabled = false
m.config.ActivationState = ActivationActive
m.config.Schedule.Escalation.Enabled = true
m.config.Schedule.Escalation.Levels = []EscalationLevel{
{After: 30, Notify: "email"},
}
m.activeAlerts["global-disabled-alert"] = &Alert{
ID: "global-disabled-alert",
StartTime: oldTime,
LastEscalation: 0,
}
m.mu.Unlock()
m.checkEscalations()
m.mu.RLock()
alert := m.activeAlerts["global-disabled-alert"]
m.mu.RUnlock()
if alert.LastEscalation != 0 {
t.Errorf("expected no escalation when alerts are globally disabled, got %d", alert.LastEscalation)
}
})
t.Run("does nothing when activation state is pending", func(t *testing.T) {
m := newTestManager(t)
oldTime := time.Now().Add(-2 * time.Hour)
m.mu.Lock()
m.config.Enabled = true
m.config.ActivationState = ActivationPending
m.config.Schedule.Escalation.Enabled = true
m.config.Schedule.Escalation.Levels = []EscalationLevel{
{After: 30, Notify: "email"},
}
m.activeAlerts["pending-alert"] = &Alert{
ID: "pending-alert",
StartTime: oldTime,
LastEscalation: 0,
}
m.mu.Unlock()
m.checkEscalations()
m.mu.RLock()
alert := m.activeAlerts["pending-alert"]
m.mu.RUnlock()
if alert.LastEscalation != 0 {
t.Errorf("expected no escalation when activation is pending, got %d", alert.LastEscalation)
}
})
t.Run("does nothing when activation state is snoozed", func(t *testing.T) {
m := newTestManager(t)
oldTime := time.Now().Add(-2 * time.Hour)
m.mu.Lock()
m.config.Enabled = true
m.config.ActivationState = ActivationSnoozed
m.config.Schedule.Escalation.Enabled = true
m.config.Schedule.Escalation.Levels = []EscalationLevel{
{After: 30, Notify: "email"},
}
m.activeAlerts["snoozed-alert"] = &Alert{
ID: "snoozed-alert",
StartTime: oldTime,
LastEscalation: 0,
}
m.mu.Unlock()
m.checkEscalations()
m.mu.RLock()
alert := m.activeAlerts["snoozed-alert"]
m.mu.RUnlock()
if alert.LastEscalation != 0 {
t.Errorf("expected no escalation when activation is snoozed, got %d", alert.LastEscalation)
}
})
t.Run("skips acknowledged alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-2 * time.Hour)
m.mu.Lock()
m.config.ActivationState = ActivationActive
m.config.Schedule.Escalation.Enabled = true
m.config.Schedule.Escalation.Levels = []EscalationLevel{
{After: 30, Notify: "email"},
}
m.activeAlerts["ack-alert"] = &Alert{
ID: "ack-alert",
StartTime: oldTime,
LastEscalation: 0,
Acknowledged: true,
}
m.mu.Unlock()
m.checkEscalations()
m.mu.RLock()
alert := m.activeAlerts["ack-alert"]
m.mu.RUnlock()
if alert.LastEscalation != 0 {
t.Error("expected no escalation for acknowledged alert")
}
})
t.Run("escalates alert after threshold time", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-45 * time.Minute) // 45 minutes ago
m.mu.Lock()
m.config.ActivationState = ActivationActive
m.config.Schedule.Escalation.Enabled = true
m.config.Schedule.Escalation.Levels = []EscalationLevel{
{After: 30, Notify: "email"}, // 30 minutes
{After: 60, Notify: "webhook"}, // 60 minutes
}
m.activeAlerts["escalate-alert"] = &Alert{
ID: "escalate-alert",
StartTime: oldTime,
LastEscalation: 0,
}
m.mu.Unlock()
m.checkEscalations()
m.mu.RLock()
alert := m.activeAlerts["escalate-alert"]
m.mu.RUnlock()
if alert.LastEscalation != 1 {
t.Errorf("expected escalation to level 1, got %d", alert.LastEscalation)
}
if len(alert.EscalationTimes) != 1 {
t.Errorf("expected 1 escalation time, got %d", len(alert.EscalationTimes))
}
})
t.Run("escalates to multiple levels", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-90 * time.Minute) // 90 minutes ago
m.mu.Lock()
m.config.ActivationState = ActivationActive
m.config.Schedule.Escalation.Enabled = true
m.config.Schedule.Escalation.Levels = []EscalationLevel{
{After: 30, Notify: "email"}, // 30 minutes
{After: 60, Notify: "webhook"}, // 60 minutes
}
m.activeAlerts["multi-escalate"] = &Alert{
ID: "multi-escalate",
StartTime: oldTime,
LastEscalation: 0,
}
m.mu.Unlock()
m.checkEscalations()
m.mu.RLock()
alert := m.activeAlerts["multi-escalate"]
m.mu.RUnlock()
if alert.LastEscalation != 2 {
t.Errorf("expected escalation to level 2, got %d", alert.LastEscalation)
}
if len(alert.EscalationTimes) != 2 {
t.Errorf("expected 2 escalation times, got %d", len(alert.EscalationTimes))
}
})
t.Run("does not re-escalate already escalated level", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-45 * time.Minute)
m.mu.Lock()
m.config.ActivationState = ActivationActive
m.config.Schedule.Escalation.Enabled = true
m.config.Schedule.Escalation.Levels = []EscalationLevel{
{After: 30, Notify: "email"},
}
m.activeAlerts["already-escalated"] = &Alert{
ID: "already-escalated",
StartTime: oldTime,
LastEscalation: 1,
EscalationTimes: []time.Time{time.Now().Add(-10 * time.Minute)},
}
m.mu.Unlock()
m.checkEscalations()
m.mu.RLock()
alert := m.activeAlerts["already-escalated"]
m.mu.RUnlock()
if alert.LastEscalation != 1 {
t.Errorf("expected escalation to remain at 1, got %d", alert.LastEscalation)
}
if len(alert.EscalationTimes) != 1 {
t.Errorf("expected 1 escalation time (unchanged), got %d", len(alert.EscalationTimes))
}
})
t.Run("does not escalate before threshold time", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
recentTime := time.Now().Add(-10 * time.Minute) // Only 10 minutes ago
m.mu.Lock()
m.config.ActivationState = ActivationActive
m.config.Schedule.Escalation.Enabled = true
m.config.Schedule.Escalation.Levels = []EscalationLevel{
{After: 30, Notify: "email"}, // 30 minutes threshold
}
m.activeAlerts["recent-alert"] = &Alert{
ID: "recent-alert",
StartTime: recentTime,
LastEscalation: 0,
}
m.mu.Unlock()
m.checkEscalations()
m.mu.RLock()
alert := m.activeAlerts["recent-alert"]
m.mu.RUnlock()
if alert.LastEscalation != 0 {
t.Error("expected no escalation for recent alert")
}
})
}
func TestCleanupAlertsForNodes(t *testing.T) {
// t.Parallel()
t.Run("removes alerts for non-existent nodes", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["alert-old-node"] = &Alert{
ID: "alert-old-node",
Node: "old-node",
}
m.activeAlerts["alert-valid-node"] = &Alert{
ID: "alert-valid-node",
Node: "valid-node",
}
m.mu.Unlock()
existingNodes := map[string]bool{
"valid-node": true,
}
m.CleanupAlertsForNodes(existingNodes)
// Give async save goroutine time to complete
time.Sleep(50 * time.Millisecond)
m.mu.RLock()
_, oldExists := m.activeAlerts["alert-old-node"]
_, validExists := m.activeAlerts["alert-valid-node"]
m.mu.RUnlock()
if oldExists {
t.Error("expected alert for old node to be removed")
}
if !validExists {
t.Error("expected alert for valid node to remain")
}
})
t.Run("skips Docker alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["docker-container-state"] = &Alert{
ID: "docker-container-state",
ResourceID: "docker:host1:container1",
Node: "non-existent-node",
}
m.activeAlerts["alert-with-docker-resource"] = &Alert{
ID: "alert-with-docker-resource",
ResourceID: "docker:host2:container2",
Node: "non-existent-node",
}
m.mu.Unlock()
existingNodes := map[string]bool{}
m.CleanupAlertsForNodes(existingNodes)
m.mu.RLock()
_, dockerExists := m.activeAlerts["docker-container-state"]
_, dockerResourceExists := m.activeAlerts["alert-with-docker-resource"]
m.mu.RUnlock()
if !dockerExists {
t.Error("expected docker alert to be preserved")
}
if !dockerResourceExists {
t.Error("expected alert with docker resource to be preserved")
}
})
t.Run("skips PBS alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["pbs-offline-test"] = &Alert{
ID: "pbs-offline-test",
Node: "non-existent-node",
}
m.activeAlerts["pbs-backup-alert"] = &Alert{
ID: "pbs-backup-alert",
Type: "pbs-offline",
Node: "non-existent-node",
}
m.mu.Unlock()
existingNodes := map[string]bool{}
m.CleanupAlertsForNodes(existingNodes)
m.mu.RLock()
_, pbsExists := m.activeAlerts["pbs-offline-test"]
_, pbsTypeExists := m.activeAlerts["pbs-backup-alert"]
m.mu.RUnlock()
if !pbsExists {
t.Error("expected pbs-prefixed alert to be preserved")
}
if !pbsTypeExists {
t.Error("expected pbs-offline type alert to be preserved")
}
})
t.Run("removes alerts with empty node", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["empty-node-alert"] = &Alert{
ID: "empty-node-alert",
Node: "",
}
m.mu.Unlock()
existingNodes := map[string]bool{
"valid-node": true,
}
m.CleanupAlertsForNodes(existingNodes)
time.Sleep(50 * time.Millisecond)
m.mu.RLock()
_, exists := m.activeAlerts["empty-node-alert"]
m.mu.RUnlock()
if exists {
t.Error("expected alert with empty node to be removed")
}
})
t.Run("handles nil alert in map", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["nil-alert"] = nil
m.activeAlerts["valid-alert"] = &Alert{
ID: "valid-alert",
Node: "valid-node",
}
m.mu.Unlock()
existingNodes := map[string]bool{
"valid-node": true,
}
// Should not panic
m.CleanupAlertsForNodes(existingNodes)
m.mu.RLock()
_, validExists := m.activeAlerts["valid-alert"]
m.mu.RUnlock()
if !validExists {
t.Error("expected valid alert to remain")
}
})
t.Run("no cleanup needed logs correctly", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["valid-alert"] = &Alert{
ID: "valid-alert",
Node: "valid-node",
}
m.mu.Unlock()
existingNodes := map[string]bool{
"valid-node": true,
}
// Should not panic and should not remove any alerts
m.CleanupAlertsForNodes(existingNodes)
m.mu.RLock()
count := len(m.activeAlerts)
m.mu.RUnlock()
if count != 1 {
t.Errorf("expected 1 alert, got %d", count)
}
})
}
func TestCheckZFSPoolHealth(t *testing.T) {
// t.Parallel()
t.Run("nil ZFSPool returns early", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
storage := models.Storage{
ID: "local-zfs",
Name: "Local ZFS",
Node: "pve-node1",
ZFSPool: nil,
}
// Should not panic
m.checkZFSPoolHealth(storage)
m.mu.RLock()
count := len(m.activeAlerts)
m.mu.RUnlock()
if count != 0 {
t.Errorf("expected no alerts for nil pool, got %d", count)
}
})
t.Run("ONLINE pool does not create state alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
storage := models.Storage{
ID: "local-zfs",
Name: "Local ZFS",
Node: "pve-node1",
ZFSPool: &models.ZFSPool{
Name: "rpool",
State: "ONLINE",
},
}
m.checkZFSPoolHealth(storage)
m.mu.RLock()
_, exists := m.activeAlerts["zfs-pool-state-local-zfs"]
m.mu.RUnlock()
if exists {
t.Error("expected no state alert for ONLINE pool")
}
})
t.Run("DEGRADED pool creates warning alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
storage := models.Storage{
ID: "local-zfs",
Name: "Local ZFS",
Node: "pve-node1",
Instance: "pve-instance",
ZFSPool: &models.ZFSPool{
Name: "rpool",
State: "DEGRADED",
},
}
m.checkZFSPoolHealth(storage)
m.mu.RLock()
alert := m.activeAlerts["zfs-pool-state-local-zfs"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected state alert for DEGRADED pool")
}
if alert.Level != AlertLevelWarning {
t.Errorf("expected warning level, got %s", alert.Level)
}
if alert.Type != "zfs-pool-state" {
t.Errorf("expected type 'zfs-pool-state', got %s", alert.Type)
}
})
t.Run("FAULTED pool creates critical alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
storage := models.Storage{
ID: "local-zfs",
Name: "Local ZFS",
Node: "pve-node1",
ZFSPool: &models.ZFSPool{
Name: "rpool",
State: "FAULTED",
},
}
m.checkZFSPoolHealth(storage)
m.mu.RLock()
alert := m.activeAlerts["zfs-pool-state-local-zfs"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected state alert for FAULTED pool")
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected critical level, got %s", alert.Level)
}
})
t.Run("UNAVAIL pool creates critical alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
storage := models.Storage{
ID: "local-zfs",
Name: "Local ZFS",
Node: "pve-node1",
ZFSPool: &models.ZFSPool{
Name: "rpool",
State: "UNAVAIL",
},
}
m.checkZFSPoolHealth(storage)
m.mu.RLock()
alert := m.activeAlerts["zfs-pool-state-local-zfs"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected state alert for UNAVAIL pool")
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected critical level, got %s", alert.Level)
}
})
t.Run("pool coming back ONLINE clears state alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Pre-create a state alert
m.mu.Lock()
m.activeAlerts["zfs-pool-state-local-zfs"] = &Alert{
ID: "zfs-pool-state-local-zfs",
Level: AlertLevelWarning,
}
m.mu.Unlock()
storage := models.Storage{
ID: "local-zfs",
Name: "Local ZFS",
Node: "pve-node1",
ZFSPool: &models.ZFSPool{
Name: "rpool",
State: "ONLINE",
},
}
m.checkZFSPoolHealth(storage)
m.mu.RLock()
_, exists := m.activeAlerts["zfs-pool-state-local-zfs"]
m.mu.RUnlock()
if exists {
t.Error("expected state alert to be cleared when pool is ONLINE")
}
})
t.Run("pool with errors creates error alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
storage := models.Storage{
ID: "local-zfs",
Name: "Local ZFS",
Node: "pve-node1",
ZFSPool: &models.ZFSPool{
Name: "rpool",
State: "ONLINE",
ReadErrors: 5,
WriteErrors: 2,
ChecksumErrors: 1,
},
}
m.checkZFSPoolHealth(storage)
m.mu.RLock()
alert := m.activeAlerts["zfs-pool-errors-local-zfs"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected errors alert for pool with errors")
}
if alert.Type != "zfs-pool-errors" {
t.Errorf("expected type 'zfs-pool-errors', got %s", alert.Type)
}
if alert.Value != 8 { // 5 + 2 + 1
t.Errorf("expected value 8, got %f", alert.Value)
}
})
t.Run("pool error count increase updates alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-1 * time.Hour)
m.mu.Lock()
m.activeAlerts["zfs-pool-errors-local-zfs"] = &Alert{
ID: "zfs-pool-errors-local-zfs",
Value: 5,
StartTime: oldTime,
}
m.mu.Unlock()
storage := models.Storage{
ID: "local-zfs",
Name: "Local ZFS",
Node: "pve-node1",
ZFSPool: &models.ZFSPool{
Name: "rpool",
State: "ONLINE",
ReadErrors: 10,
WriteErrors: 0,
ChecksumErrors: 0,
},
}
m.checkZFSPoolHealth(storage)
m.mu.RLock()
alert := m.activeAlerts["zfs-pool-errors-local-zfs"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected errors alert to exist")
}
if alert.Value != 10 {
t.Errorf("expected value 10, got %f", alert.Value)
}
// Start time should be preserved
if !alert.StartTime.Equal(oldTime) {
t.Error("expected StartTime to be preserved on update")
}
})
t.Run("pool with no errors clears error alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["zfs-pool-errors-local-zfs"] = &Alert{
ID: "zfs-pool-errors-local-zfs",
}
m.mu.Unlock()
storage := models.Storage{
ID: "local-zfs",
Name: "Local ZFS",
Node: "pve-node1",
ZFSPool: &models.ZFSPool{
Name: "rpool",
State: "ONLINE",
ReadErrors: 0,
WriteErrors: 0,
ChecksumErrors: 0,
},
}
m.checkZFSPoolHealth(storage)
m.mu.RLock()
_, exists := m.activeAlerts["zfs-pool-errors-local-zfs"]
m.mu.RUnlock()
if exists {
t.Error("expected errors alert to be cleared when no errors")
}
})
t.Run("device with errors creates device alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
storage := models.Storage{
ID: "local-zfs",
Name: "Local ZFS",
Node: "pve-node1",
ZFSPool: &models.ZFSPool{
Name: "rpool",
State: "ONLINE",
Devices: []models.ZFSDevice{
{Name: "sda", State: "ONLINE", ReadErrors: 3, WriteErrors: 0, ChecksumErrors: 0},
},
},
}
m.checkZFSPoolHealth(storage)
m.mu.RLock()
alert := m.activeAlerts["zfs-device-local-zfs-sda"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected device alert for device with errors")
}
if alert.Type != "zfs-device" {
t.Errorf("expected type 'zfs-device', got %s", alert.Type)
}
})
t.Run("device in FAULTED state creates critical alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
storage := models.Storage{
ID: "local-zfs",
Name: "Local ZFS",
Node: "pve-node1",
ZFSPool: &models.ZFSPool{
Name: "rpool",
State: "DEGRADED",
Devices: []models.ZFSDevice{
{Name: "sda", State: "FAULTED"},
},
},
}
m.checkZFSPoolHealth(storage)
m.mu.RLock()
alert := m.activeAlerts["zfs-device-local-zfs-sda"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected device alert for FAULTED device")
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected critical level for FAULTED device, got %s", alert.Level)
}
})
t.Run("healthy device clears device alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["zfs-device-local-zfs-sda"] = &Alert{
ID: "zfs-device-local-zfs-sda",
}
m.mu.Unlock()
storage := models.Storage{
ID: "local-zfs",
Name: "Local ZFS",
Node: "pve-node1",
ZFSPool: &models.ZFSPool{
Name: "rpool",
State: "ONLINE",
Devices: []models.ZFSDevice{
{Name: "sda", State: "ONLINE", ReadErrors: 0, WriteErrors: 0, ChecksumErrors: 0},
},
},
}
m.checkZFSPoolHealth(storage)
m.mu.RLock()
_, exists := m.activeAlerts["zfs-device-local-zfs-sda"]
m.mu.RUnlock()
if exists {
t.Error("expected device alert to be cleared for healthy device")
}
})
t.Run("SPARE device in normal state does not create alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
storage := models.Storage{
ID: "local-zfs",
Name: "Local ZFS",
Node: "pve-node1",
ZFSPool: &models.ZFSPool{
Name: "rpool",
State: "ONLINE",
Devices: []models.ZFSDevice{
{Name: "sdb", State: "SPARE", ReadErrors: 0, WriteErrors: 0, ChecksumErrors: 0},
},
},
}
m.checkZFSPoolHealth(storage)
m.mu.RLock()
_, exists := m.activeAlerts["zfs-device-local-zfs-sdb"]
m.mu.RUnlock()
if exists {
t.Error("expected no alert for SPARE device without errors")
}
})
}
func TestCheckPMGNodeQueues(t *testing.T) {
// t.Parallel()
t.Run("empty nodes returns early", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{},
}
defaults := PMGThresholdConfig{
QueueTotalWarning: 100,
}
// Should not panic
m.checkPMGNodeQueues(pmg, defaults)
m.mu.RLock()
count := len(m.activeAlerts)
m.mu.RUnlock()
if count != 0 {
t.Errorf("expected no alerts for empty nodes, got %d", count)
}
})
t.Run("nil QueueStatus is skipped", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: nil},
},
}
defaults := PMGThresholdConfig{
QueueTotalWarning: 100,
}
m.checkPMGNodeQueues(pmg, defaults)
m.mu.RLock()
count := len(m.activeAlerts)
m.mu.RUnlock()
if count != 0 {
t.Errorf("expected no alerts for nil QueueStatus, got %d", count)
}
})
t.Run("total queue warning alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 80}},
},
}
defaults := PMGThresholdConfig{
QueueTotalWarning: 100, // 60% scaled = 60
QueueTotalCritical: 200, // 80% scaled = 160
}
m.checkPMGNodeQueues(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-node1-queue-total"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected total queue warning alert")
}
if alert.Level != AlertLevelWarning {
t.Errorf("expected warning level, got %s", alert.Level)
}
if alert.Value != 80 {
t.Errorf("expected value 80, got %f", alert.Value)
}
})
t.Run("total queue critical alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 200}},
},
}
defaults := PMGThresholdConfig{
QueueTotalWarning: 100, // 60% scaled = 60
QueueTotalCritical: 200, // 80% scaled = 160
}
m.checkPMGNodeQueues(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-node1-queue-total"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected total queue critical alert")
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected critical level, got %s", alert.Level)
}
})
t.Run("deferred queue warning alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Deferred: 40}},
},
}
defaults := PMGThresholdConfig{
DeferredQueueWarn: 50, // 60% scaled = 30
DeferredQueueCritical: 100,
}
m.checkPMGNodeQueues(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-node1-queue-deferred"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected deferred queue warning alert")
}
if alert.Level != AlertLevelWarning {
t.Errorf("expected warning level, got %s", alert.Level)
}
})
t.Run("hold queue warning alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Hold: 25}},
},
}
defaults := PMGThresholdConfig{
HoldQueueWarn: 30, // 60% scaled = 18
HoldQueueCritical: 60,
}
m.checkPMGNodeQueues(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-node1-queue-hold"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected hold queue warning alert")
}
if alert.Level != AlertLevelWarning {
t.Errorf("expected warning level, got %s", alert.Level)
}
})
t.Run("oldest message age warning alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 2400}}, // 40 minutes
},
}
defaults := PMGThresholdConfig{
OldestMessageWarnMins: 50, // 60% scaled = 30 minutes
OldestMessageCritMins: 90,
}
m.checkPMGNodeQueues(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-node1-oldest-message"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected oldest message warning alert")
}
if alert.Level != AlertLevelWarning {
t.Errorf("expected warning level, got %s", alert.Level)
}
if alert.Value != 40 { // 2400 seconds / 60 = 40 minutes
t.Errorf("expected value 40, got %f", alert.Value)
}
})
t.Run("below threshold clears alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Pre-create an alert
m.mu.Lock()
m.activeAlerts["pmg1-node1-queue-total"] = &Alert{ID: "pmg1-node1-queue-total"}
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 10}},
},
}
defaults := PMGThresholdConfig{
QueueTotalWarning: 100, // 60% scaled = 60
QueueTotalCritical: 200,
}
m.checkPMGNodeQueues(pmg, defaults)
m.mu.RLock()
_, exists := m.activeAlerts["pmg1-node1-queue-total"]
m.mu.RUnlock()
if exists {
t.Error("expected alert to be cleared when below threshold")
}
})
t.Run("outlier detection adds note to message", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 10}},
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Total: 10}},
{Name: "node3", QueueStatus: &models.PMGQueueStatus{Total: 100}}, // outlier
},
}
defaults := PMGThresholdConfig{
QueueTotalWarning: 100, // 60% scaled = 60
QueueTotalCritical: 200,
}
m.checkPMGNodeQueues(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-node3-queue-total"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected alert for outlier node")
}
if !strings.Contains(alert.Message, "outlier") {
t.Errorf("expected message to contain 'outlier', got %s", alert.Message)
}
})
t.Run("no thresholds configured does not create alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 1000, Deferred: 500, Hold: 300}},
},
}
defaults := PMGThresholdConfig{} // All zero
m.checkPMGNodeQueues(pmg, defaults)
m.mu.RLock()
count := len(m.activeAlerts)
m.mu.RUnlock()
if count != 0 {
t.Errorf("expected no alerts when no thresholds configured, got %d", count)
}
})
t.Run("updates existing alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
oldTime := time.Now().Add(-1 * time.Hour)
m.mu.Lock()
m.activeAlerts["pmg1-node1-queue-total"] = &Alert{
ID: "pmg1-node1-queue-total",
Value: 60,
Level: AlertLevelWarning,
LastSeen: oldTime,
StartTime: oldTime,
}
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "PMG 1",
Nodes: []models.PMGNodeStatus{
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 200}},
},
}
defaults := PMGThresholdConfig{
QueueTotalWarning: 100, // 60% scaled = 60
QueueTotalCritical: 200, // 80% scaled = 160
}
m.checkPMGNodeQueues(pmg, defaults)
m.mu.RLock()
alert := m.activeAlerts["pmg1-node1-queue-total"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected alert to exist")
}
if alert.Value != 200 {
t.Errorf("expected value 200, got %f", alert.Value)
}
if alert.Level != AlertLevelCritical {
t.Errorf("expected critical level, got %s", alert.Level)
}
if !alert.LastSeen.After(oldTime) {
t.Error("expected LastSeen to be updated")
}
})
}
func TestDockerContainerHealthAlert(t *testing.T) {
t.Run("healthy container - no alert", func(t *testing.T) {
m := newTestManager(t)
host := models.DockerHost{
ID: "host-health-1",
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: "container-1",
Name: "healthy-app",
State: "running",
Status: "Up 10 minutes",
Health: "healthy",
},
},
}
m.CheckDockerHost(host)
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected no health alert for healthy container")
}
})
t.Run("container with empty health - no alert", func(t *testing.T) {
m := newTestManager(t)
host := models.DockerHost{
ID: "host-health-2",
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: "container-2",
Name: "no-health-check",
State: "running",
Status: "Up 10 minutes",
Health: "",
},
},
}
m.CheckDockerHost(host)
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected no health alert for container with empty health")
}
})
t.Run("container with none health - no alert", func(t *testing.T) {
m := newTestManager(t)
host := models.DockerHost{
ID: "host-health-3",
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: "container-3",
Name: "no-health-check",
State: "running",
Status: "Up 10 minutes",
Health: "none",
},
},
}
m.CheckDockerHost(host)
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected no health alert for container with none health")
}
})
t.Run("container starting - no alert", func(t *testing.T) {
m := newTestManager(t)
host := models.DockerHost{
ID: "host-health-4",
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: "container-4",
Name: "starting-app",
State: "running",
Status: "Up 5 seconds",
Health: "starting",
},
},
}
m.CheckDockerHost(host)
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected no health alert for starting container")
}
})
t.Run("unhealthy container - critical alert", func(t *testing.T) {
m := newTestManager(t)
host := models.DockerHost{
ID: "host-health-5",
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: "container-5",
Name: "unhealthy-app",
State: "running",
Status: "Up 10 minutes (unhealthy)",
Health: "unhealthy",
},
},
}
m.CheckDockerHost(host)
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
alert, exists := m.activeAlerts[alertID]
if !exists {
t.Fatal("expected health alert for unhealthy container")
}
if alert.Level != AlertLevelCritical {
t.Fatalf("expected critical alert for unhealthy container, got %s", alert.Level)
}
if alert.Type != "docker-container-health" {
t.Fatalf("expected alert type docker-container-health, got %s", alert.Type)
}
})
t.Run("container with other health status - warning alert", func(t *testing.T) {
m := newTestManager(t)
host := models.DockerHost{
ID: "host-health-6",
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: "container-6",
Name: "degraded-app",
State: "running",
Status: "Up 10 minutes",
Health: "degraded",
},
},
}
m.CheckDockerHost(host)
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
alert, exists := m.activeAlerts[alertID]
if !exists {
t.Fatal("expected health alert for degraded container")
}
if alert.Level != AlertLevelWarning {
t.Fatalf("expected warning alert for non-unhealthy bad status, got %s", alert.Level)
}
})
t.Run("alert cleared when container becomes healthy", func(t *testing.T) {
m := newTestManager(t)
hostID := "host-health-7"
containerID := "container-7"
// First check with unhealthy container
hostUnhealthy := models.DockerHost{
ID: hostID,
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: containerID,
Name: "recovering-app",
State: "running",
Status: "Up 10 minutes (unhealthy)",
Health: "unhealthy",
},
},
}
m.CheckDockerHost(hostUnhealthy)
resourceID := dockerResourceID(hostID, containerID)
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
if _, exists := m.activeAlerts[alertID]; !exists {
t.Fatal("expected health alert to be raised")
}
// Now container becomes healthy
hostHealthy := models.DockerHost{
ID: hostID,
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: containerID,
Name: "recovering-app",
State: "running",
Status: "Up 15 minutes",
Health: "healthy",
},
},
}
m.CheckDockerHost(hostHealthy)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected health alert to be cleared when container became healthy")
}
})
}
func TestDockerContainerOOMKillAlert(t *testing.T) {
t.Run("running container - no alert", func(t *testing.T) {
m := newTestManager(t)
host := models.DockerHost{
ID: "host-oom-1",
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: "container-1",
Name: "running-app",
State: "running",
Status: "Up 10 minutes",
ExitCode: 0,
},
},
}
m.CheckDockerHost(host)
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected no OOM alert for running container")
}
})
t.Run("exited container with non-137 exit code - no alert", func(t *testing.T) {
m := newTestManager(t)
host := models.DockerHost{
ID: "host-oom-2",
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: "container-2",
Name: "normal-exit-app",
State: "exited",
Status: "Exited (1) 5 minutes ago",
ExitCode: 1,
},
},
}
m.CheckDockerHost(host)
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected no OOM alert for container with exit code 1")
}
})
t.Run("exited container with exit code 137 - critical OOM alert", func(t *testing.T) {
m := newTestManager(t)
host := models.DockerHost{
ID: "host-oom-3",
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: "container-3",
Name: "oom-killed-app",
State: "exited",
Status: "Exited (137) 1 minute ago",
ExitCode: 137,
MemoryUsage: 512 * 1024 * 1024,
MemoryLimit: 512 * 1024 * 1024,
},
},
}
m.CheckDockerHost(host)
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
alert, exists := m.activeAlerts[alertID]
if !exists {
t.Fatal("expected OOM alert for container with exit code 137")
}
if alert.Level != AlertLevelCritical {
t.Fatalf("expected critical OOM alert, got %s", alert.Level)
}
if alert.Type != "docker-container-oom-kill" {
t.Fatalf("expected alert type docker-container-oom-kill, got %s", alert.Type)
}
})
t.Run("dead container with exit code 137 - critical OOM alert", func(t *testing.T) {
m := newTestManager(t)
host := models.DockerHost{
ID: "host-oom-dead",
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: "container-dead",
Name: "dead-oom-app",
State: "dead",
Status: "Dead",
ExitCode: 137,
},
},
}
m.CheckDockerHost(host)
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
alert, exists := m.activeAlerts[alertID]
if !exists {
t.Fatal("expected OOM alert for dead container with exit code 137")
}
if alert.Level != AlertLevelCritical {
t.Fatalf("expected critical OOM alert, got %s", alert.Level)
}
})
t.Run("repeated 137 exit code - no new alert", func(t *testing.T) {
m := newTestManager(t)
hostID := "host-oom-4"
containerID := "container-4"
host := models.DockerHost{
ID: hostID,
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: containerID,
Name: "oom-killed-app",
State: "exited",
Status: "Exited (137) 1 minute ago",
ExitCode: 137,
},
},
}
// First check - should create alert
m.CheckDockerHost(host)
resourceID := dockerResourceID(hostID, containerID)
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
alert1, exists := m.activeAlerts[alertID]
if !exists {
t.Fatal("expected OOM alert on first check")
}
startTime := alert1.StartTime
// Second check with same exit code - should not create new alert
m.CheckDockerHost(host)
alert2, exists := m.activeAlerts[alertID]
if !exists {
t.Fatal("expected OOM alert to still exist on second check")
}
if alert2.StartTime != startTime {
t.Fatal("expected alert start time to be preserved (not a new alert)")
}
})
t.Run("container recovers - alert cleared", func(t *testing.T) {
m := newTestManager(t)
hostID := "host-oom-5"
containerID := "container-5"
// First check with OOM killed container
hostOOM := models.DockerHost{
ID: hostID,
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: containerID,
Name: "recovering-app",
State: "exited",
Status: "Exited (137) 1 minute ago",
ExitCode: 137,
},
},
}
m.CheckDockerHost(hostOOM)
resourceID := dockerResourceID(hostID, containerID)
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
if _, exists := m.activeAlerts[alertID]; !exists {
t.Fatal("expected OOM alert to be raised")
}
// Container is restarted and running again
hostRunning := models.DockerHost{
ID: hostID,
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: containerID,
Name: "recovering-app",
State: "running",
Status: "Up 30 seconds",
ExitCode: 0,
},
},
}
m.CheckDockerHost(hostRunning)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected OOM alert to be cleared when container started running")
}
})
t.Run("container exits with different code - alert cleared", func(t *testing.T) {
m := newTestManager(t)
hostID := "host-oom-6"
containerID := "container-6"
// First check with OOM killed container
hostOOM := models.DockerHost{
ID: hostID,
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: containerID,
Name: "multi-exit-app",
State: "exited",
Status: "Exited (137) 1 minute ago",
ExitCode: 137,
},
},
}
m.CheckDockerHost(hostOOM)
resourceID := dockerResourceID(hostID, containerID)
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
if _, exists := m.activeAlerts[alertID]; !exists {
t.Fatal("expected OOM alert to be raised")
}
// Container exits with different exit code (normal error)
hostNormalExit := models.DockerHost{
ID: hostID,
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: containerID,
Name: "multi-exit-app",
State: "exited",
Status: "Exited (1) 30 seconds ago",
ExitCode: 1,
},
},
}
m.CheckDockerHost(hostNormalExit)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected OOM alert to be cleared when container exited with different code")
}
})
}
func TestDockerContainerRestartLoopAlert(t *testing.T) {
t.Run("first check - no alert", func(t *testing.T) {
m := newTestManager(t)
host := models.DockerHost{
ID: "host-restart-1",
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: "container-1",
Name: "first-check-app",
State: "running",
Status: "Up 10 minutes",
RestartCount: 5, // Even with high restart count, first check just initializes
},
},
}
m.CheckDockerHost(host)
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected no restart loop alert on first check (just initializes tracking)")
}
// Verify tracking was initialized
m.mu.Lock()
record, exists := m.dockerRestartTracking[resourceID]
m.mu.Unlock()
if !exists {
t.Fatal("expected tracking record to be initialized")
}
if record.lastCount != 5 {
t.Fatalf("expected lastCount=5, got %d", record.lastCount)
}
})
t.Run("stable restart count - no alert", func(t *testing.T) {
m := newTestManager(t)
hostID := "host-restart-2"
containerID := "container-2"
host := models.DockerHost{
ID: hostID,
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: containerID,
Name: "stable-app",
State: "running",
Status: "Up 10 minutes",
RestartCount: 2,
},
},
}
// First check - initializes tracking
m.CheckDockerHost(host)
// Second check - same restart count
m.CheckDockerHost(host)
// Third check - still same restart count
m.CheckDockerHost(host)
resourceID := dockerResourceID(hostID, containerID)
alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected no restart loop alert for stable container")
}
})
t.Run("restarts under threshold - no alert", func(t *testing.T) {
m := newTestManager(t)
// Configure threshold to 3 (default)
m.config.DockerDefaults.RestartCount = 3
m.config.DockerDefaults.RestartWindow = 300
hostID := "host-restart-3"
containerID := "container-3"
// First check - initializes with RestartCount=0
host := models.DockerHost{
ID: hostID,
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: containerID,
Name: "under-threshold-app",
State: "running",
Status: "Up 10 minutes",
RestartCount: 0,
},
},
}
m.CheckDockerHost(host)
// Container restarts twice (under threshold of 3)
host.Containers[0].RestartCount = 2
m.CheckDockerHost(host)
// One more restart (now at 3, threshold is >3 so still no alert)
host.Containers[0].RestartCount = 3
m.CheckDockerHost(host)
resourceID := dockerResourceID(hostID, containerID)
alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected no restart loop alert when restarts <= threshold")
}
// Verify we tracked 3 restarts
m.mu.Lock()
record := m.dockerRestartTracking[resourceID]
recentCount := len(record.times)
m.mu.Unlock()
if recentCount != 3 {
t.Fatalf("expected 3 tracked restarts, got %d", recentCount)
}
})
t.Run("hits restart loop threshold - alert raised", func(t *testing.T) {
m := newTestManager(t)
// Configure threshold to 3 (alert when >3)
m.config.DockerDefaults.RestartCount = 3
m.config.DockerDefaults.RestartWindow = 300
hostID := "host-restart-4"
containerID := "container-4"
// First check - initializes with RestartCount=0
host := models.DockerHost{
ID: hostID,
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: containerID,
Name: "restart-loop-app",
State: "running",
Status: "Up 1 minute",
RestartCount: 0,
},
},
}
m.CheckDockerHost(host)
// Container restarts 4 times (exceeds threshold of 3)
host.Containers[0].RestartCount = 4
m.CheckDockerHost(host)
resourceID := dockerResourceID(hostID, containerID)
alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
alert, exists := m.activeAlerts[alertID]
if !exists {
t.Fatal("expected restart loop alert when restarts > threshold")
}
if alert.Level != AlertLevelCritical {
t.Fatalf("expected critical alert, got %s", alert.Level)
}
if alert.Type != "docker-container-restart-loop" {
t.Fatalf("expected alert type docker-container-restart-loop, got %s", alert.Type)
}
// Verify metadata
if alert.Metadata["restartCount"] != 4 {
t.Fatalf("expected restartCount=4 in metadata, got %v", alert.Metadata["restartCount"])
}
if alert.Metadata["recentRestarts"] != 4 {
t.Fatalf("expected recentRestarts=4 in metadata, got %v", alert.Metadata["recentRestarts"])
}
})
t.Run("restart loop recovery - alert cleared", func(t *testing.T) {
m := newTestManager(t)
// Configure short window for testing
m.config.DockerDefaults.RestartCount = 3
m.config.DockerDefaults.RestartWindow = 1 // 1 second window for testing
hostID := "host-restart-5"
containerID := "container-5"
resourceID := dockerResourceID(hostID, containerID)
// Manually set up a restart loop state
m.mu.Lock()
now := time.Now()
m.dockerRestartTracking[resourceID] = &dockerRestartRecord{
count: 5,
lastCount: 5,
times: []time.Time{now, now, now, now}, // 4 recent restarts
lastChecked: now,
}
m.mu.Unlock()
// Create initial alert
host := models.DockerHost{
ID: hostID,
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: containerID,
Name: "recovering-app",
State: "running",
Status: "Up 1 minute",
RestartCount: 5,
},
},
}
m.CheckDockerHost(host)
alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
if _, exists := m.activeAlerts[alertID]; !exists {
t.Fatal("expected restart loop alert to be raised initially")
}
// Wait for time window to pass
time.Sleep(1100 * time.Millisecond)
// Check again with same restart count - old restarts should be cleaned up
m.CheckDockerHost(host)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected restart loop alert to be cleared after window passes")
}
})
t.Run("incremental restarts trigger alert", func(t *testing.T) {
m := newTestManager(t)
m.config.DockerDefaults.RestartCount = 2
m.config.DockerDefaults.RestartWindow = 300
hostID := "host-restart-6"
containerID := "container-6"
host := models.DockerHost{
ID: hostID,
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: containerID,
Name: "incremental-restart-app",
State: "running",
Status: "Up 1 minute",
RestartCount: 0,
},
},
}
// First check - initializes
m.CheckDockerHost(host)
resourceID := dockerResourceID(hostID, containerID)
alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
// Restart 1
host.Containers[0].RestartCount = 1
m.CheckDockerHost(host)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected no alert after 1 restart")
}
// Restart 2
host.Containers[0].RestartCount = 2
m.CheckDockerHost(host)
if _, exists := m.activeAlerts[alertID]; exists {
t.Fatal("expected no alert after 2 restarts (threshold is >2)")
}
// Restart 3 - exceeds threshold
host.Containers[0].RestartCount = 3
m.CheckDockerHost(host)
if _, exists := m.activeAlerts[alertID]; !exists {
t.Fatal("expected alert after 3 restarts (>2 threshold)")
}
})
t.Run("alert preserves start time on updates", func(t *testing.T) {
m := newTestManager(t)
m.config.DockerDefaults.RestartCount = 2
m.config.DockerDefaults.RestartWindow = 300
hostID := "host-restart-7"
containerID := "container-7"
host := models.DockerHost{
ID: hostID,
DisplayName: "Docker Host",
Hostname: "docker.local",
Containers: []models.DockerContainer{
{
ID: containerID,
Name: "preserve-time-app",
State: "running",
Status: "Up 1 minute",
RestartCount: 0,
},
},
}
// Initialize and trigger alert
m.CheckDockerHost(host)
host.Containers[0].RestartCount = 5
m.CheckDockerHost(host)
resourceID := dockerResourceID(hostID, containerID)
alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
alert1, exists := m.activeAlerts[alertID]
if !exists {
t.Fatal("expected alert to be raised")
}
startTime1 := alert1.StartTime
// More restarts - alert should update but preserve start time
time.Sleep(10 * time.Millisecond)
host.Containers[0].RestartCount = 7
m.CheckDockerHost(host)
alert2, exists := m.activeAlerts[alertID]
if !exists {
t.Fatal("expected alert to still exist")
}
if !alert2.StartTime.Equal(startTime1) {
t.Fatalf("expected start time to be preserved, got %v vs %v", alert2.StartTime, startTime1)
}
})
}
// TestApplyThresholdOverride exercises Manager.applyThresholdOverride, which
// merges an override ThresholdConfig onto a base one. The subtests pin the
// following behavior:
//   - an empty override leaves the base thresholds and Disabled flag as-is;
//   - Disabled and DisableConnectivity set on the override carry through;
//   - a metric threshold on the override replaces the base value, while
//     metrics absent from the override keep the base value;
//   - *Legacy values become hysteresis thresholds (for CPU, Clear is asserted
//     to be Trigger minus config.HysteresisMargin);
//   - a modern threshold wins over a legacy value for the same metric;
//   - Note is trimmed, and an empty or whitespace-only Note yields nil;
//   - a threshold whose Clear is 0 gets Clear = Trigger - 5.
//
// Every threshold pointer is nil-checked before it is dereferenced so that a
// regression fails the subtest instead of panicking.
func TestApplyThresholdOverride(t *testing.T) {
	t.Run("empty override returns base unchanged", func(t *testing.T) {
		m := newTestManager(t)
		base := ThresholdConfig{
			CPU:    &HysteresisThreshold{Trigger: 80, Clear: 75},
			Memory: &HysteresisThreshold{Trigger: 90, Clear: 85},
		}

		result := m.applyThresholdOverride(base, ThresholdConfig{})

		if result.CPU == nil || result.CPU.Trigger != 80 || result.CPU.Clear != 75 {
			t.Errorf("expected CPU to match base, got %+v", result.CPU)
		}
		if result.Memory == nil || result.Memory.Trigger != 90 || result.Memory.Clear != 85 {
			t.Errorf("expected Memory to match base, got %+v", result.Memory)
		}
		if result.Disabled {
			t.Error("expected Disabled to remain false")
		}
	})

	t.Run("Disabled flag override", func(t *testing.T) {
		m := newTestManager(t)

		result := m.applyThresholdOverride(
			ThresholdConfig{Disabled: false},
			ThresholdConfig{Disabled: true},
		)

		if !result.Disabled {
			t.Error("expected Disabled to be true after override")
		}
	})

	t.Run("DisableConnectivity override", func(t *testing.T) {
		m := newTestManager(t)

		result := m.applyThresholdOverride(
			ThresholdConfig{DisableConnectivity: false},
			ThresholdConfig{DisableConnectivity: true},
		)

		if !result.DisableConnectivity {
			t.Error("expected DisableConnectivity to be true after override")
		}
	})

	t.Run("CPU threshold override", func(t *testing.T) {
		m := newTestManager(t)
		base := ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
		}
		override := ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 95, Clear: 90},
		}

		result := m.applyThresholdOverride(base, override)

		if result.CPU == nil {
			t.Fatal("expected CPU to be set")
		}
		if result.CPU.Trigger != 95 || result.CPU.Clear != 90 {
			t.Errorf("expected CPU override values, got Trigger=%v Clear=%v", result.CPU.Trigger, result.CPU.Clear)
		}
	})

	t.Run("legacy CPU threshold conversion", func(t *testing.T) {
		m := newTestManager(t)
		m.config.HysteresisMargin = 5.0
		legacyVal := 85.0
		override := ThresholdConfig{
			CPULegacy: &legacyVal,
		}

		result := m.applyThresholdOverride(ThresholdConfig{}, override)

		if result.CPU == nil {
			t.Fatal("expected CPU to be converted from legacy")
		}
		if result.CPU.Trigger != 85.0 {
			t.Errorf("expected Trigger=85, got %v", result.CPU.Trigger)
		}
		if result.CPU.Clear != 80.0 {
			t.Errorf("expected Clear=80 (85-5 margin), got %v", result.CPU.Clear)
		}
	})

	t.Run("modern CPU takes precedence over legacy", func(t *testing.T) {
		m := newTestManager(t)
		legacyVal := 70.0
		override := ThresholdConfig{
			CPU:       &HysteresisThreshold{Trigger: 95, Clear: 90},
			CPULegacy: &legacyVal,
		}

		result := m.applyThresholdOverride(ThresholdConfig{}, override)

		if result.CPU == nil {
			t.Fatal("expected CPU to be set")
		}
		if result.CPU.Trigger != 95 {
			t.Errorf("expected modern CPU to take precedence, got Trigger=%v", result.CPU.Trigger)
		}
	})

	t.Run("multiple metrics override", func(t *testing.T) {
		m := newTestManager(t)
		base := ThresholdConfig{
			CPU:    &HysteresisThreshold{Trigger: 80, Clear: 75},
			Memory: &HysteresisThreshold{Trigger: 80, Clear: 75},
			Disk:   &HysteresisThreshold{Trigger: 80, Clear: 75},
		}
		override := ThresholdConfig{
			CPU:        &HysteresisThreshold{Trigger: 90, Clear: 85},
			Memory:     &HysteresisThreshold{Trigger: 95, Clear: 90},
			NetworkIn:  &HysteresisThreshold{Trigger: 100, Clear: 95},
			NetworkOut: &HysteresisThreshold{Trigger: 200, Clear: 190},
		}

		result := m.applyThresholdOverride(base, override)

		if result.CPU == nil || result.CPU.Trigger != 90 {
			t.Errorf("expected CPU override, got %+v", result.CPU)
		}
		if result.Memory == nil || result.Memory.Trigger != 95 {
			t.Errorf("expected Memory override, got %+v", result.Memory)
		}
		// Disk should remain unchanged (not in override)
		if result.Disk == nil || result.Disk.Trigger != 80 {
			t.Errorf("expected Disk unchanged, got %+v", result.Disk)
		}
		if result.NetworkIn == nil || result.NetworkIn.Trigger != 100 {
			t.Errorf("expected NetworkIn to be added, got %+v", result.NetworkIn)
		}
		if result.NetworkOut == nil || result.NetworkOut.Trigger != 200 {
			t.Errorf("expected NetworkOut to be added, got %+v", result.NetworkOut)
		}
	})

	t.Run("Note override", func(t *testing.T) {
		m := newTestManager(t)
		note := "test note"

		result := m.applyThresholdOverride(ThresholdConfig{}, ThresholdConfig{Note: &note})

		if result.Note == nil || *result.Note != "test note" {
			t.Errorf("expected Note to be set, got %v", result.Note)
		}
	})

	t.Run("Note cleared when empty string", func(t *testing.T) {
		m := newTestManager(t)
		existingNote := "existing note"
		emptyNote := ""
		base := ThresholdConfig{Note: &existingNote}
		override := ThresholdConfig{Note: &emptyNote}

		result := m.applyThresholdOverride(base, override)

		if result.Note != nil {
			t.Errorf("expected Note to be nil when empty string override, got %v", *result.Note)
		}
	})

	t.Run("Note trimmed of whitespace", func(t *testing.T) {
		m := newTestManager(t)
		note := " trimmed note "

		result := m.applyThresholdOverride(ThresholdConfig{}, ThresholdConfig{Note: &note})

		if result.Note == nil || *result.Note != "trimmed note" {
			t.Errorf("expected Note to be trimmed, got %v", result.Note)
		}
	})

	t.Run("whitespace-only Note becomes nil", func(t *testing.T) {
		m := newTestManager(t)
		existingNote := "existing"
		whitespaceNote := " "
		base := ThresholdConfig{Note: &existingNote}
		override := ThresholdConfig{Note: &whitespaceNote}

		result := m.applyThresholdOverride(base, override)

		if result.Note != nil {
			t.Errorf("expected whitespace-only Note to become nil, got %v", *result.Note)
		}
	})

	t.Run("all metric types with legacy conversion", func(t *testing.T) {
		m := newTestManager(t)
		m.config.HysteresisMargin = 5.0
		val80 := 80.0
		val90 := 90.0
		val100 := 100.0
		val200 := 200.0
		override := ThresholdConfig{
			MemoryLegacy:     &val80,
			DiskLegacy:       &val90,
			DiskReadLegacy:   &val100,
			DiskWriteLegacy:  &val100,
			NetworkInLegacy:  &val200,
			NetworkOutLegacy: &val200,
		}

		result := m.applyThresholdOverride(ThresholdConfig{}, override)

		// Only the converted Trigger is asserted for these metrics.
		if result.Memory == nil || result.Memory.Trigger != 80 {
			t.Errorf("expected Memory converted, got %+v", result.Memory)
		}
		if result.Disk == nil || result.Disk.Trigger != 90 {
			t.Errorf("expected Disk converted, got %+v", result.Disk)
		}
		if result.DiskRead == nil || result.DiskRead.Trigger != 100 {
			t.Errorf("expected DiskRead converted, got %+v", result.DiskRead)
		}
		if result.DiskWrite == nil || result.DiskWrite.Trigger != 100 {
			t.Errorf("expected DiskWrite converted, got %+v", result.DiskWrite)
		}
		if result.NetworkIn == nil || result.NetworkIn.Trigger != 200 {
			t.Errorf("expected NetworkIn converted, got %+v", result.NetworkIn)
		}
		if result.NetworkOut == nil || result.NetworkOut.Trigger != 200 {
			t.Errorf("expected NetworkOut converted, got %+v", result.NetworkOut)
		}
	})

	t.Run("Temperature and Usage override", func(t *testing.T) {
		m := newTestManager(t)
		override := ThresholdConfig{
			Temperature: &HysteresisThreshold{Trigger: 85, Clear: 80},
			Usage:       &HysteresisThreshold{Trigger: 90, Clear: 85},
		}

		result := m.applyThresholdOverride(ThresholdConfig{}, override)

		if result.Temperature == nil || result.Temperature.Trigger != 85 {
			t.Errorf("expected Temperature override, got %+v", result.Temperature)
		}
		if result.Usage == nil || result.Usage.Trigger != 90 {
			t.Errorf("expected Usage override, got %+v", result.Usage)
		}
	})

	t.Run("ensureHysteresisThreshold fills missing Clear", func(t *testing.T) {
		m := newTestManager(t)
		override := ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80, Clear: 0}, // Clear not set
		}

		result := m.applyThresholdOverride(ThresholdConfig{}, override)

		if result.CPU == nil {
			t.Fatal("expected CPU to be set")
		}
		// ensureHysteresisThreshold sets Clear to Trigger - 5 when Clear <= 0
		if result.CPU.Clear != 75 {
			t.Errorf("expected Clear to be 75 (80-5 default), got %v", result.CPU.Clear)
		}
	})
}
func TestSuppressGuestAlerts(t *testing.T) {
t.Run("no alerts for guest returns false", func(t *testing.T) {
m := newTestManager(t)
result := m.suppressGuestAlerts("vm100")
if result {
t.Error("expected false when no alerts exist for guest")
}
})
t.Run("active alert with exact ResourceID match clears and returns true", func(t *testing.T) {
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["vm100-cpu"] = &Alert{
ID: "vm100-cpu",
ResourceID: "vm100",
Type: "cpu",
}
m.mu.Unlock()
result := m.suppressGuestAlerts("vm100")
if !result {
t.Error("expected true when active alert was cleared")
}
m.mu.RLock()
if _, exists := m.activeAlerts["vm100-cpu"]; exists {
t.Error("expected alert to be cleared from activeAlerts")
}
m.mu.RUnlock()
})
t.Run("active alert with prefix match clears", func(t *testing.T) {
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["vm100/disk1-disk"] = &Alert{
ID: "vm100/disk1-disk",
ResourceID: "vm100/disk1",
Type: "disk",
}
m.mu.Unlock()
result := m.suppressGuestAlerts("vm100")
if !result {
t.Error("expected true when active alert was cleared")
}
m.mu.RLock()
if _, exists := m.activeAlerts["vm100/disk1-disk"]; exists {
t.Error("expected alert with prefix match to be cleared")
}
m.mu.RUnlock()
})
t.Run("clears from all auxiliary maps", func(t *testing.T) {
m := newTestManager(t)
now := time.Now()
m.mu.Lock()
m.activeAlerts["vm100-cpu"] = &Alert{
ID: "vm100-cpu",
ResourceID: "vm100",
Type: "cpu",
}
m.pendingAlerts["vm100-memory"] = now
m.recentAlerts["vm100-disk"] = &Alert{ID: "vm100-disk", ResourceID: "vm100"}
m.suppressedUntil["vm100-network"] = now.Add(time.Hour)
m.alertRateLimit["vm100-io"] = []time.Time{now}
m.offlineConfirmations["vm100"] = 1
m.mu.Unlock()
result := m.suppressGuestAlerts("vm100")
if !result {
t.Error("expected true when active alert was cleared")
}
m.mu.RLock()
defer m.mu.RUnlock()
if _, exists := m.activeAlerts["vm100-cpu"]; exists {
t.Error("expected activeAlerts to be cleared")
}
if _, exists := m.pendingAlerts["vm100-memory"]; exists {
t.Error("expected pendingAlerts to be cleared")
}
if _, exists := m.recentAlerts["vm100-disk"]; exists {
t.Error("expected recentAlerts to be cleared")
}
if _, exists := m.suppressedUntil["vm100-network"]; exists {
t.Error("expected suppressedUntil to be cleared")
}
if _, exists := m.alertRateLimit["vm100-io"]; exists {
t.Error("expected alertRateLimit to be cleared")
}
if _, exists := m.offlineConfirmations["vm100"]; exists {
t.Error("expected offlineConfirmations to be cleared")
}
})
t.Run("multiple alerts cleared", func(t *testing.T) {
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["vm100-cpu"] = &Alert{
ID: "vm100-cpu",
ResourceID: "vm100",
Type: "cpu",
}
m.activeAlerts["vm100-memory"] = &Alert{
ID: "vm100-memory",
ResourceID: "vm100",
Type: "memory",
}
m.activeAlerts["vm100/disk0-disk"] = &Alert{
ID: "vm100/disk0-disk",
ResourceID: "vm100/disk0",
Type: "disk",
}
// Also add an alert for a different guest that should NOT be cleared
m.activeAlerts["vm200-cpu"] = &Alert{
ID: "vm200-cpu",
ResourceID: "vm200",
Type: "cpu",
}
m.mu.Unlock()
result := m.suppressGuestAlerts("vm100")
if !result {
t.Error("expected true when alerts were cleared")
}
m.mu.RLock()
defer m.mu.RUnlock()
if _, exists := m.activeAlerts["vm100-cpu"]; exists {
t.Error("expected vm100-cpu to be cleared")
}
if _, exists := m.activeAlerts["vm100-memory"]; exists {
t.Error("expected vm100-memory to be cleared")
}
if _, exists := m.activeAlerts["vm100/disk0-disk"]; exists {
t.Error("expected vm100/disk0-disk to be cleared")
}
if _, exists := m.activeAlerts["vm200-cpu"]; !exists {
t.Error("expected vm200-cpu to NOT be cleared")
}
})
t.Run("clears auxiliary maps even without active alerts", func(t *testing.T) {
m := newTestManager(t)
now := time.Now()
m.mu.Lock()
// No active alerts, but has entries in auxiliary maps
m.pendingAlerts["vm100-memory"] = now
m.recentAlerts["vm100-disk"] = &Alert{ID: "vm100-disk", ResourceID: "vm100"}
m.suppressedUntil["vm100-network"] = now.Add(time.Hour)
m.alertRateLimit["vm100-io"] = []time.Time{now}
m.offlineConfirmations["vm100"] = 1
m.mu.Unlock()
result := m.suppressGuestAlerts("vm100")
// Returns false because no active alerts were cleared
if result {
t.Error("expected false when no active alerts were cleared")
}
m.mu.RLock()
defer m.mu.RUnlock()
// But auxiliary maps should still be cleared
if _, exists := m.pendingAlerts["vm100-memory"]; exists {
t.Error("expected pendingAlerts to be cleared")
}
if _, exists := m.recentAlerts["vm100-disk"]; exists {
t.Error("expected recentAlerts to be cleared")
}
if _, exists := m.suppressedUntil["vm100-network"]; exists {
t.Error("expected suppressedUntil to be cleared")
}
if _, exists := m.alertRateLimit["vm100-io"]; exists {
t.Error("expected alertRateLimit to be cleared")
}
if _, exists := m.offlineConfirmations["vm100"]; exists {
t.Error("expected offlineConfirmations to be cleared")
}
})
}
func TestGuestHasMonitorOnlyAlerts(t *testing.T) {
t.Run("no alerts returns false", func(t *testing.T) {
m := newTestManager(t)
result := m.guestHasMonitorOnlyAlerts("vm100")
if result {
t.Error("expected false when no alerts exist")
}
})
t.Run("has non-monitor-only alert returns false", func(t *testing.T) {
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["vm100-cpu"] = &Alert{
ID: "vm100-cpu",
ResourceID: "vm100",
Type: "cpu",
Metadata: nil, // No metadata means not monitor-only
}
m.mu.Unlock()
result := m.guestHasMonitorOnlyAlerts("vm100")
if result {
t.Error("expected false when alert is not monitor-only")
}
})
t.Run("has monitor-only alert with bool metadata returns true", func(t *testing.T) {
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["vm100-cpu"] = &Alert{
ID: "vm100-cpu",
ResourceID: "vm100",
Type: "cpu",
Metadata: map[string]interface{}{
"monitorOnly": true,
},
}
m.mu.Unlock()
result := m.guestHasMonitorOnlyAlerts("vm100")
if !result {
t.Error("expected true when monitor-only alert exists")
}
})
t.Run("has monitor-only alert with string metadata returns true", func(t *testing.T) {
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["vm100-cpu"] = &Alert{
ID: "vm100-cpu",
ResourceID: "vm100",
Type: "cpu",
Metadata: map[string]interface{}{
"monitorOnly": "true",
},
}
m.mu.Unlock()
result := m.guestHasMonitorOnlyAlerts("vm100")
if !result {
t.Error("expected true when monitor-only alert exists (string metadata)")
}
})
t.Run("alert for different guest not matched", func(t *testing.T) {
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["vm200-cpu"] = &Alert{
ID: "vm200-cpu",
ResourceID: "vm200",
Type: "cpu",
Metadata: map[string]interface{}{
"monitorOnly": true,
},
}
m.mu.Unlock()
result := m.guestHasMonitorOnlyAlerts("vm100")
if result {
t.Error("expected false when monitor-only alert is for different guest")
}
})
t.Run("monitorOnly false returns false", func(t *testing.T) {
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["vm100-cpu"] = &Alert{
ID: "vm100-cpu",
ResourceID: "vm100",
Type: "cpu",
Metadata: map[string]interface{}{
"monitorOnly": false,
},
}
m.mu.Unlock()
result := m.guestHasMonitorOnlyAlerts("vm100")
if result {
t.Error("expected false when monitorOnly is explicitly false")
}
})
}
func TestCheckNode(t *testing.T) {
// t.Parallel()
t.Run("returns early when alerts disabled", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = false
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
CPU: 0.95, // Would trigger alert if enabled
Status: "online",
}
m.CheckNode(node)
m.mu.RLock()
alertCount := len(m.activeAlerts)
m.mu.RUnlock()
if alertCount != 0 {
t.Errorf("expected no alerts when disabled, got %d", alertCount)
}
})
t.Run("DisableAllNodes clears existing alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Pre-create alerts that should be cleared
m.mu.Lock()
m.activeAlerts["node1-cpu"] = &Alert{ID: "node1-cpu", ResourceID: "node1", Type: "cpu"}
m.activeAlerts["node1-memory"] = &Alert{ID: "node1-memory", ResourceID: "node1", Type: "memory"}
m.activeAlerts["node1-disk"] = &Alert{ID: "node1-disk", ResourceID: "node1", Type: "disk"}
m.activeAlerts["node1-temperature"] = &Alert{ID: "node1-temperature", ResourceID: "node1", Type: "temperature"}
m.activeAlerts["node-offline-node1"] = &Alert{ID: "node-offline-node1", ResourceID: "node1", Type: "connectivity"}
m.nodeOfflineCount["node1"] = 5
m.config.DisableAllNodes = true
m.mu.Unlock()
node := models.Node{ID: "node1", Name: "Node 1", Status: "online"}
m.CheckNode(node)
m.mu.RLock()
_, cpuExists := m.activeAlerts["node1-cpu"]
_, memExists := m.activeAlerts["node1-memory"]
_, diskExists := m.activeAlerts["node1-disk"]
_, tempExists := m.activeAlerts["node1-temperature"]
_, offlineExists := m.activeAlerts["node-offline-node1"]
_, countExists := m.nodeOfflineCount["node1"]
m.mu.RUnlock()
if cpuExists {
t.Error("expected cpu alert to be cleared")
}
if memExists {
t.Error("expected memory alert to be cleared")
}
if diskExists {
t.Error("expected disk alert to be cleared")
}
if tempExists {
t.Error("expected temperature alert to be cleared")
}
if offlineExists {
t.Error("expected offline alert to be cleared")
}
if countExists {
t.Error("expected offline count to be cleared")
}
})
t.Run("DisableNodesOffline clears tracking and offline alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Pre-create offline alert and tracking
m.mu.Lock()
m.activeAlerts["node-offline-node1"] = &Alert{ID: "node-offline-node1", ResourceID: "node1", Type: "connectivity"}
m.nodeOfflineCount["node1"] = 3
m.config.DisableAllNodesOffline = true
m.mu.Unlock()
node := models.Node{ID: "node1", Name: "Node 1", Status: "offline"}
m.CheckNode(node)
m.mu.RLock()
_, alertExists := m.activeAlerts["node-offline-node1"]
_, countExists := m.nodeOfflineCount["node1"]
m.mu.RUnlock()
if alertExists {
t.Error("expected offline alert to be cleared")
}
if countExists {
t.Error("expected offline count to be cleared")
}
})
t.Run("offline node triggers offline check", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Pre-set count to trigger alert on this call
m.mu.Lock()
m.nodeOfflineCount["node1"] = 2
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Instance: "pve1",
Status: "offline",
}
m.CheckNode(node)
m.mu.RLock()
alert := m.activeAlerts["node-offline-node1"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected offline alert to be created")
}
if alert.Type != "connectivity" {
t.Errorf("expected type connectivity, got %s", alert.Type)
}
})
t.Run("node with connection error triggers offline check", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.nodeOfflineCount["node1"] = 2
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Instance: "pve1",
Status: "online",
ConnectionHealth: "error",
}
m.CheckNode(node)
m.mu.RLock()
alert := m.activeAlerts["node-offline-node1"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected offline alert for connection error")
}
})
t.Run("node with connection failed triggers offline check", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.nodeOfflineCount["node1"] = 2
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Instance: "pve1",
Status: "online",
ConnectionHealth: "failed",
}
m.CheckNode(node)
m.mu.RLock()
alert := m.activeAlerts["node-offline-node1"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected offline alert for connection failed")
}
})
t.Run("online node clears offline alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Pre-create offline alert
m.mu.Lock()
m.activeAlerts["node-offline-node1"] = &Alert{
ID: "node-offline-node1",
ResourceID: "node1",
Type: "connectivity",
}
m.nodeOfflineCount["node1"] = 5
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Instance: "pve1",
Status: "online",
ConnectionHealth: "connected",
}
m.CheckNode(node)
m.mu.RLock()
_, alertStillActive := m.activeAlerts["node-offline-node1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected offline alert to remain until recovery is confirmed")
}
m.CheckNode(node)
m.mu.RLock()
_, alertStillActive = m.activeAlerts["node-offline-node1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected offline alert to remain until final recovery confirmation")
}
m.CheckNode(node)
m.mu.RLock()
_, alertExists := m.activeAlerts["node-offline-node1"]
_, countExists := m.nodeOfflineCount["node1"]
m.mu.RUnlock()
if alertExists {
t.Error("expected offline alert to be cleared")
}
if countExists {
t.Error("expected offline count to be cleared")
}
})
t.Run("online node triggers metric checks", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Set thresholds that will trigger and disable time threshold
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.NodeDefaults = ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Instance: "pve1",
Status: "online",
CPU: 0.95, // 95% - above trigger
}
m.CheckNode(node)
m.mu.RLock()
alert := m.activeAlerts["node1-cpu"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected cpu alert to be created")
}
if alert.Type != "cpu" {
t.Errorf("expected type cpu, got %s", alert.Type)
}
})
t.Run("offline node skips metric checks", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.NodeDefaults = ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Status: "offline",
CPU: 0.95, // Would trigger if checked
}
m.CheckNode(node)
m.mu.RLock()
_, cpuExists := m.activeAlerts["node1-cpu"]
m.mu.RUnlock()
if cpuExists {
t.Error("expected no cpu alert for offline node")
}
})
t.Run("applies override thresholds", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.NodeDefaults = ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.config.Overrides = map[string]ThresholdConfig{
"node1": {
CPU: &HysteresisThreshold{Trigger: 99.0, Clear: 90.0}, // Higher threshold
},
}
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Instance: "pve1",
Status: "online",
CPU: 0.95, // 95% - below override trigger of 99%
}
m.CheckNode(node)
m.mu.RLock()
_, cpuExists := m.activeAlerts["node1-cpu"]
m.mu.RUnlock()
if cpuExists {
t.Error("expected no alert due to higher override threshold")
}
})
t.Run("checks temperature with package temp", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.NodeDefaults = ThresholdConfig{
Temperature: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Instance: "pve1",
Status: "online",
Temperature: &models.Temperature{
Available: true,
CPUPackage: 90.0, // Above trigger
CPUMax: 85.0,
},
}
m.CheckNode(node)
m.mu.RLock()
alert := m.activeAlerts["node1-temperature"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected temperature alert")
}
})
t.Run("checks temperature with max temp fallback", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.NodeDefaults = ThresholdConfig{
Temperature: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Instance: "pve1",
Status: "online",
Temperature: &models.Temperature{
Available: true,
CPUPackage: 0, // Zero - will use max
CPUMax: 90.0, // Above trigger
},
}
m.CheckNode(node)
m.mu.RLock()
alert := m.activeAlerts["node1-temperature"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected temperature alert using max temp fallback")
}
})
t.Run("skips temperature when not available", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.NodeDefaults = ThresholdConfig{
Temperature: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Instance: "pve1",
Status: "online",
Temperature: &models.Temperature{
Available: false, // Not available
CPUPackage: 90.0,
},
}
m.CheckNode(node)
m.mu.RLock()
_, tempExists := m.activeAlerts["node1-temperature"]
m.mu.RUnlock()
if tempExists {
t.Error("expected no temperature alert when not available")
}
})
t.Run("skips temperature when nil", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.NodeDefaults = ThresholdConfig{
Temperature: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Instance: "pve1",
Status: "online",
Temperature: nil, // Nil temperature
}
m.CheckNode(node)
m.mu.RLock()
_, tempExists := m.activeAlerts["node1-temperature"]
m.mu.RUnlock()
if tempExists {
t.Error("expected no temperature alert when temp is nil")
}
})
t.Run("skips temperature when threshold nil", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// No temperature threshold set
m.mu.Lock()
m.config.NodeDefaults = ThresholdConfig{
Temperature: nil,
}
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Instance: "pve1",
Status: "online",
Temperature: &models.Temperature{
Available: true,
CPUPackage: 90.0,
},
}
m.CheckNode(node)
m.mu.RLock()
_, tempExists := m.activeAlerts["node1-temperature"]
m.mu.RUnlock()
if tempExists {
t.Error("expected no temperature alert when threshold nil")
}
})
t.Run("checks memory metric", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.NodeDefaults = ThresholdConfig{
Memory: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Instance: "pve1",
Status: "online",
Memory: models.Memory{
Usage: 95.0, // Above trigger
},
}
m.CheckNode(node)
m.mu.RLock()
alert := m.activeAlerts["node1-memory"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected memory alert")
}
})
t.Run("checks disk metric", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.NodeDefaults = ThresholdConfig{
Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
node := models.Node{
ID: "node1",
Name: "Node 1",
Instance: "pve1",
Status: "online",
Disk: models.Disk{
Usage: 95.0, // Above trigger
},
}
m.CheckNode(node)
m.mu.RLock()
alert := m.activeAlerts["node1-disk"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected disk alert")
}
})
}
func TestCheckGuest(t *testing.T) {
// t.Parallel()
t.Run("returns early when alerts disabled", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = false
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
CPU: 0.95,
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
alertCount := len(m.activeAlerts)
m.mu.RUnlock()
if alertCount != 0 {
t.Errorf("expected no alerts when disabled, got %d", alertCount)
}
})
t.Run("returns early when all guests disabled", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.DisableAllGuests = true
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
CPU: 0.95,
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
alertCount := len(m.activeAlerts)
m.mu.RUnlock()
if alertCount != 0 {
t.Errorf("expected no alerts when all guests disabled, got %d", alertCount)
}
})
t.Run("handles VM type correctly", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
CPU: 0.95, // 95%
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
alert := m.activeAlerts["vm100-cpu"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected cpu alert for VM")
}
})
t.Run("handles Container type correctly", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
ct := models.Container{
ID: "ct101",
Name: "TestCT",
Node: "node1",
Status: "running",
CPU: 0.95, // 95%
}
m.CheckGuest(ct, "pve1")
m.mu.RLock()
alert := m.activeAlerts["ct101-cpu"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected cpu alert for Container")
}
})
t.Run("returns for unsupported guest type", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Pass a string which is unsupported
m.CheckGuest("invalid", "pve1")
m.mu.RLock()
alertCount := len(m.activeAlerts)
m.mu.RUnlock()
if alertCount != 0 {
t.Errorf("expected no alerts for unsupported type, got %d", alertCount)
}
})
t.Run("suppresses alerts with pulse-no-alerts tag", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Pre-create an alert
m.mu.Lock()
m.activeAlerts["vm100-cpu"] = &Alert{
ID: "vm100-cpu",
ResourceID: "vm100",
Type: "cpu",
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
CPU: 0.95,
Tags: []string{"pulse-no-alerts"},
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
_, exists := m.activeAlerts["vm100-cpu"]
m.mu.RUnlock()
if exists {
t.Error("expected alert to be suppressed with pulse-no-alerts tag")
}
})
t.Run("stopped guest triggers powered-off check", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
// Pre-set confirmation count to trigger alert
m.mu.Lock()
m.offlineConfirmations["vm100"] = 2
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "stopped",
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
alert := m.activeAlerts["guest-powered-off-vm100"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected powered-off alert for stopped guest")
}
})
t.Run("stopped guest with DisableAllGuestsOffline clears tracking", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.DisableAllGuestsOffline = true
m.offlineConfirmations["vm100"] = 5
m.activeAlerts["guest-powered-off-vm100"] = &Alert{
ID: "guest-powered-off-vm100",
ResourceID: "vm100",
Type: "powered-off",
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "stopped",
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
_, alertExists := m.activeAlerts["guest-powered-off-vm100"]
_, countExists := m.offlineConfirmations["vm100"]
m.mu.RUnlock()
if alertExists {
t.Error("expected powered-off alert to be cleared")
}
if countExists {
t.Error("expected offline count to be cleared")
}
})
t.Run("paused guest clears powered-off alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["guest-powered-off-vm100"] = &Alert{
ID: "guest-powered-off-vm100",
ResourceID: "vm100",
Type: "powered-off",
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "paused",
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
_, exists := m.activeAlerts["guest-powered-off-vm100"]
m.mu.RUnlock()
if exists {
t.Error("expected powered-off alert to be cleared for paused guest")
}
})
t.Run("non-running guest clears metric alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["vm100-cpu"] = &Alert{
ID: "vm100-cpu",
ResourceID: "vm100",
Type: "cpu",
}
m.activeAlerts["vm100-memory"] = &Alert{
ID: "vm100-memory",
ResourceID: "vm100",
Type: "memory",
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "stopped",
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
_, cpuExists := m.activeAlerts["vm100-cpu"]
_, memExists := m.activeAlerts["vm100-memory"]
m.mu.RUnlock()
if cpuExists {
t.Error("expected cpu alert to be cleared for non-running guest")
}
if memExists {
t.Error("expected memory alert to be cleared for non-running guest")
}
})
t.Run("running guest clears powered-off alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["guest-powered-off-vm100"] = &Alert{
ID: "guest-powered-off-vm100",
ResourceID: "vm100",
Type: "powered-off",
}
m.offlineConfirmations["vm100"] = 5
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
_, exists := m.activeAlerts["guest-powered-off-vm100"]
m.mu.RUnlock()
if exists {
t.Error("expected powered-off alert to be cleared for running guest")
}
})
t.Run("disabled thresholds clear existing alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["vm100-cpu"] = &Alert{
ID: "vm100-cpu",
ResourceID: "vm100",
Type: "cpu",
}
m.config.Overrides = map[string]ThresholdConfig{
"vm100": {Disabled: true},
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
_, exists := m.activeAlerts["vm100-cpu"]
m.mu.RUnlock()
if exists {
t.Error("expected alert to be cleared when guest has alerts disabled")
}
})
t.Run("checks memory metric", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
Memory: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
Memory: models.Memory{Usage: 95.0},
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
alert := m.activeAlerts["vm100-memory"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected memory alert")
}
})
t.Run("checks disk metric", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
Disk: models.Disk{Usage: 95.0},
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
alert := m.activeAlerts["vm100-disk"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected disk alert")
}
})
t.Run("checks individual disks", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
Disks: []models.Disk{
{Mountpoint: "/", Usage: 95.0, Total: 100},
{Mountpoint: "/data", Usage: 50.0, Total: 100},
},
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
// Check that alert for high disk was created
var foundDiskAlert bool
for alertID := range m.activeAlerts {
if strings.Contains(alertID, "vm100-disk-") {
foundDiskAlert = true
break
}
}
m.mu.RUnlock()
if !foundDiskAlert {
t.Fatal("expected individual disk alert")
}
})
t.Run("skips disk with zero total", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
Disks: []models.Disk{
{Mountpoint: "/", Usage: 95.0, Total: 0}, // Zero total - should skip
},
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
var foundDiskAlert bool
for alertID := range m.activeAlerts {
if strings.Contains(alertID, "vm100-disk-") {
foundDiskAlert = true
break
}
}
m.mu.RUnlock()
if foundDiskAlert {
t.Error("expected no disk alert for disk with zero total")
}
})
t.Run("skips disk with negative usage", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
Disks: []models.Disk{
{Mountpoint: "/", Usage: -1.0, Total: 100}, // Negative usage - should skip
},
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
var foundDiskAlert bool
for alertID := range m.activeAlerts {
if strings.Contains(alertID, "vm100-disk-") {
foundDiskAlert = true
break
}
}
m.mu.RUnlock()
if foundDiskAlert {
t.Error("expected no disk alert for disk with negative usage")
}
})
t.Run("checks diskRead metric", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
DiskRead: &HysteresisThreshold{Trigger: 100.0, Clear: 80.0}, // MB/s
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
DiskRead: 200 * 1024 * 1024, // 200 MB/s in bytes
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
alert := m.activeAlerts["vm100-diskRead"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected diskRead alert")
}
})
t.Run("checks diskWrite metric", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
DiskWrite: &HysteresisThreshold{Trigger: 100.0, Clear: 80.0}, // MB/s
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
DiskWrite: 200 * 1024 * 1024, // 200 MB/s in bytes
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
alert := m.activeAlerts["vm100-diskWrite"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected diskWrite alert")
}
})
t.Run("checks networkIn metric", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
NetworkIn: &HysteresisThreshold{Trigger: 100.0, Clear: 80.0}, // MB/s
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
NetworkIn: 200 * 1024 * 1024, // 200 MB/s in bytes
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
alert := m.activeAlerts["vm100-networkIn"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected networkIn alert")
}
})
t.Run("checks networkOut metric", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
NetworkOut: &HysteresisThreshold{Trigger: 100.0, Clear: 80.0}, // MB/s
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
NetworkOut: 200 * 1024 * 1024, // 200 MB/s in bytes
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
alert := m.activeAlerts["vm100-networkOut"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected networkOut alert")
}
})
t.Run("applies relaxed thresholds with pulse-relaxed tag", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
// CPU at 90% - would trigger normally but relaxed threshold is 95%
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
CPU: 0.90, // 90%
Tags: []string{"pulse-relaxed"},
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
_, exists := m.activeAlerts["vm100-cpu"]
m.mu.RUnlock()
if exists {
t.Error("expected no alert due to relaxed thresholds")
}
})
t.Run("disk uses device as label fallback", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
Disks: []models.Disk{
{Device: "sda1", Usage: 95.0, Total: 100}, // No mountpoint, has device
},
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
var foundDiskAlert bool
for alertID := range m.activeAlerts {
if strings.Contains(alertID, "vm100-disk-") {
foundDiskAlert = true
break
}
}
m.mu.RUnlock()
if !foundDiskAlert {
t.Fatal("expected disk alert using device as label")
}
})
t.Run("disk uses index as label when no mountpoint or device", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.GuestDefaults = ThresholdConfig{
Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
}
m.mu.Unlock()
vm := models.VM{
ID: "vm100",
Name: "TestVM",
Node: "node1",
Status: "running",
Disks: []models.Disk{
{Usage: 95.0, Total: 100}, // No mountpoint or device
},
}
m.CheckGuest(vm, "pve1")
m.mu.RLock()
var foundDiskAlert bool
for alertID := range m.activeAlerts {
if strings.Contains(alertID, "vm100-disk-") {
foundDiskAlert = true
break
}
}
m.mu.RUnlock()
if !foundDiskAlert {
t.Fatal("expected disk alert using index as label")
}
})
}
// TestCheckHostComprehensive drives Manager.CheckHost through its guard
// clauses, the paths that clear stale alerts, RAID array handling, threshold
// overrides (explicit and inherited from linked resources), per-disk checks,
// offline recovery and alert metadata. Every subtest builds its own manager
// via newTestManager and inspects manager state under the manager's lock.
func TestCheckHostComprehensive(t *testing.T) {
	t.Run("returns early for empty host ID", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.TimeThreshold = 0
		mgr.config.TimeThresholds = map[string]int{}
		mgr.config.HostDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		mgr.mu.Unlock()

		// CPU is above the trigger, but the host carries no ID.
		mgr.CheckHost(models.Host{ID: "", CPUUsage: 95.0})

		mgr.mu.RLock()
		active := len(mgr.activeAlerts)
		mgr.mu.RUnlock()
		if active != 0 {
			t.Errorf("expected no alerts for empty host ID, got %d", active)
		}
	})

	t.Run("returns early when alerts disabled", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.Enabled = false
		mgr.mu.Unlock()

		mgr.CheckHost(models.Host{ID: "host1", CPUUsage: 95.0})

		mgr.mu.RLock()
		active := len(mgr.activeAlerts)
		mgr.mu.RUnlock()
		if active != 0 {
			t.Errorf("expected no alerts when disabled, got %d", active)
		}
	})

	t.Run("DisableAllHosts clears existing alerts", func(t *testing.T) {
		const (
			cpuID = "host:host1-cpu"
			memID = "host:host1-memory"
		)
		mgr := newTestManager(t)

		// Seed two active alerts, then switch host alerting off globally.
		mgr.mu.Lock()
		mgr.activeAlerts[cpuID] = &Alert{ID: cpuID, ResourceID: "host:host1", Type: "cpu"}
		mgr.activeAlerts[memID] = &Alert{ID: memID, ResourceID: "host:host1", Type: "memory"}
		mgr.config.DisableAllHosts = true
		mgr.mu.Unlock()

		mgr.CheckHost(models.Host{ID: "host1", CPUUsage: 95.0})

		mgr.mu.RLock()
		_, cpuLeft := mgr.activeAlerts[cpuID]
		_, memLeft := mgr.activeAlerts[memID]
		mgr.mu.RUnlock()
		if cpuLeft {
			t.Error("expected cpu alert to be cleared")
		}
		if memLeft {
			t.Error("expected memory alert to be cleared")
		}
	})

	t.Run("override with Disabled clears alerts", func(t *testing.T) {
		const cpuID = "host:host1-cpu"
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.activeAlerts[cpuID] = &Alert{ID: cpuID, ResourceID: "host:host1", Type: "cpu"}
		mgr.config.Overrides = map[string]ThresholdConfig{
			"host1": {Disabled: true},
		}
		mgr.mu.Unlock()

		mgr.CheckHost(models.Host{ID: "host1", CPUUsage: 95.0})

		mgr.mu.RLock()
		_, left := mgr.activeAlerts[cpuID]
		mgr.mu.RUnlock()
		if left {
			t.Error("expected alert to be cleared when host has alerts disabled")
		}
	})

	t.Run("clears CPU alerts when threshold nil", func(t *testing.T) {
		const cpuID = "host:host1-cpu"
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.activeAlerts[cpuID] = &Alert{ID: cpuID, ResourceID: "host:host1", Type: "cpu"}
		// Host defaults deliberately carry no CPU threshold.
		mgr.config.HostDefaults = ThresholdConfig{CPU: nil}
		mgr.mu.Unlock()

		mgr.CheckHost(models.Host{ID: "host1", CPUUsage: 95.0})

		mgr.mu.RLock()
		_, left := mgr.activeAlerts[cpuID]
		mgr.mu.RUnlock()
		if left {
			t.Error("expected CPU alert to be cleared when threshold is nil")
		}
	})

	t.Run("clears memory alerts when threshold nil", func(t *testing.T) {
		const memID = "host:host1-memory"
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.activeAlerts[memID] = &Alert{ID: memID, ResourceID: "host:host1", Type: "memory"}
		// Host defaults deliberately carry no memory threshold.
		mgr.config.HostDefaults = ThresholdConfig{Memory: nil}
		mgr.mu.Unlock()

		mgr.CheckHost(models.Host{ID: "host1", Memory: models.Memory{Usage: 95.0}})

		mgr.mu.RLock()
		_, left := mgr.activeAlerts[memID]
		mgr.mu.RUnlock()
		if left {
			t.Error("expected memory alert to be cleared when threshold is nil")
		}
	})

	t.Run("clears disk alerts when threshold nil", func(t *testing.T) {
		mgr := newTestManager(t)

		// Disk alert IDs are "{resourceID}-disk", with the resource ID being
		// "host:{hostID}/disk:{mountpoint}".
		const (
			diskResource = "host:host1/disk:/"
			diskAlertID  = "host:host1/disk:/-disk"
		)

		mgr.mu.Lock()
		mgr.activeAlerts[diskAlertID] = &Alert{ID: diskAlertID, ResourceID: diskResource, Type: "disk"}
		// Host defaults deliberately carry no disk threshold.
		mgr.config.HostDefaults = ThresholdConfig{Disk: nil}
		mgr.mu.Unlock()

		mgr.CheckHost(models.Host{
			ID:    "host1",
			Disks: []models.Disk{{Mountpoint: "/", Usage: 95.0, Total: 100}},
		})

		mgr.mu.RLock()
		_, left := mgr.activeAlerts[diskAlertID]
		mgr.mu.RUnlock()
		if left {
			t.Error("expected disk alert to be cleared when threshold is nil")
		}
	})

	t.Run("RAID degraded creates critical alert", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.CheckHost(models.Host{
			ID:       "host1",
			Hostname: "testhost",
			// md2 is used because md0/md1 are skipped for Synology compatibility.
			RAID: []models.HostRAIDArray{{
				Device:        "/dev/md2",
				Level:         "raid1",
				State:         "degraded",
				TotalDevices:  2,
				ActiveDevices: 1,
				FailedDevices: 1,
			}},
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["host-host1-raid-md2"]
		mgr.mu.RUnlock()
		switch {
		case raised == nil:
			t.Fatal("expected RAID degraded alert")
		case raised.Level != AlertLevelCritical:
			t.Errorf("expected critical level, got %s", raised.Level)
		}
	})

	t.Run("RAID rebuilding creates warning alert", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.CheckHost(models.Host{
			ID:       "host1",
			Hostname: "testhost",
			// md2 is used because md0/md1 are skipped for Synology compatibility.
			RAID: []models.HostRAIDArray{{
				Device:         "/dev/md2",
				Level:          "raid1",
				State:          "recovering",
				TotalDevices:   2,
				ActiveDevices:  2,
				FailedDevices:  0,
				RebuildPercent: 50.0,
			}},
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["host-host1-raid-md2"]
		mgr.mu.RUnlock()
		switch {
		case raised == nil:
			t.Fatal("expected RAID rebuilding alert")
		case raised.Level != AlertLevelWarning:
			t.Errorf("expected warning level, got %s", raised.Level)
		}
	})

	t.Run("RAID healthy clears alert", func(t *testing.T) {
		const raidID = "host-host1-raid-md2"
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.activeAlerts[raidID] = &Alert{ID: raidID, Type: "raid", Level: AlertLevelCritical}
		mgr.mu.Unlock()

		mgr.CheckHost(models.Host{
			ID:       "host1",
			Hostname: "testhost",
			// md2 is used because md0/md1 are skipped for Synology compatibility.
			RAID: []models.HostRAIDArray{{
				Device:        "/dev/md2",
				Level:         "raid1",
				State:         "active",
				TotalDevices:  2,
				ActiveDevices: 2,
				FailedDevices: 0,
			}},
		})

		mgr.mu.RLock()
		_, left := mgr.activeAlerts[raidID]
		mgr.mu.RUnlock()
		if left {
			t.Error("expected RAID alert to be cleared for healthy array")
		}
	})

	t.Run("RAID with failed devices triggers degraded", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.CheckHost(models.Host{
			ID:       "host1",
			Hostname: "testhost",
			// The array reports "active" while still listing a failed member;
			// the failed device count is what should raise the alert.
			// md2 is used because md0/md1 are skipped for Synology compatibility.
			RAID: []models.HostRAIDArray{{
				Device:        "/dev/md2",
				Level:         "raid1",
				State:         "active",
				TotalDevices:  2,
				ActiveDevices: 1,
				FailedDevices: 1,
			}},
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["host-host1-raid-md2"]
		mgr.mu.RUnlock()
		switch {
		case raised == nil:
			t.Fatal("expected RAID alert for failed devices")
		case raised.Level != AlertLevelCritical:
			t.Errorf("expected critical level for failed devices, got %s", raised.Level)
		}
	})

	t.Run("RAID resync triggers rebuilding alert", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.CheckHost(models.Host{
			ID:       "host1",
			Hostname: "testhost",
			// md2 is used because md0/md1 are skipped for Synology compatibility.
			RAID: []models.HostRAIDArray{{
				Device:        "/dev/md2",
				Level:         "raid1",
				State:         "resync",
				TotalDevices:  2,
				ActiveDevices: 2,
				FailedDevices: 0,
			}},
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["host-host1-raid-md2"]
		mgr.mu.RUnlock()
		switch {
		case raised == nil:
			t.Fatal("expected RAID rebuilding alert for resync")
		case raised.Level != AlertLevelWarning:
			t.Errorf("expected warning level for resync, got %s", raised.Level)
		}
	})

	t.Run("existing RAID alert not duplicated", func(t *testing.T) {
		const raidID = "host-host1-raid-md2"
		mgr := newTestManager(t)
		startedAt := time.Now().Add(-time.Hour)

		mgr.mu.Lock()
		mgr.activeAlerts[raidID] = &Alert{
			ID:        raidID,
			Type:      "raid",
			Level:     AlertLevelCritical,
			StartTime: startedAt,
		}
		mgr.mu.Unlock()

		mgr.CheckHost(models.Host{
			ID:       "host1",
			Hostname: "testhost",
			// md2 is used because md0/md1 are skipped for Synology compatibility.
			RAID: []models.HostRAIDArray{{
				Device:        "/dev/md2",
				Level:         "raid1",
				State:         "degraded",
				TotalDevices:  2,
				ActiveDevices: 1,
				FailedDevices: 1,
			}},
		})

		mgr.mu.RLock()
		kept := mgr.activeAlerts[raidID]
		mgr.mu.RUnlock()
		if kept == nil {
			t.Fatal("expected RAID alert to still exist")
		}
		// Re-checking a still-degraded array must keep the original start time.
		if !kept.StartTime.Equal(startedAt) {
			t.Error("expected alert start time to be preserved")
		}
	})

	t.Run("applies override thresholds", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.TimeThreshold = 0
		mgr.config.TimeThresholds = map[string]int{}
		mgr.config.HostDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		// The per-host override raises the trigger above the reported usage.
		mgr.config.Overrides = map[string]ThresholdConfig{
			"host1": {CPU: &HysteresisThreshold{Trigger: 99.0, Clear: 95.0}},
		}
		mgr.mu.Unlock()

		// 95% is over the default trigger but under the override trigger.
		mgr.CheckHost(models.Host{ID: "host1", Hostname: "testhost", CPUUsage: 95.0})

		mgr.mu.RLock()
		_, raised := mgr.activeAlerts["host:host1-cpu"]
		mgr.mu.RUnlock()
		if raised {
			t.Error("expected no alert due to higher override threshold")
		}
	})

	t.Run("checks multiple disks", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.TimeThreshold = 0
		mgr.config.TimeThresholds = map[string]int{}
		mgr.config.HostDefaults = ThresholdConfig{
			Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		mgr.mu.Unlock()

		// Only "/" is above the trigger; "/data" sits below it.
		mgr.CheckHost(models.Host{
			ID:       "host1",
			Hostname: "testhost",
			Disks: []models.Disk{
				{Mountpoint: "/", Usage: 95.0, Total: 100},
				{Mountpoint: "/data", Usage: 50.0, Total: 100},
			},
		})

		// Disk alert IDs look like "host:{hostID}/disk:{label}-disk".
		diskAlerts := 0
		mgr.mu.RLock()
		for id := range mgr.activeAlerts {
			if strings.Contains(id, "host:host1/disk:") {
				diskAlerts++
			}
		}
		mgr.mu.RUnlock()
		if diskAlerts != 1 {
			t.Errorf("expected 1 disk alert, got %d", diskAlerts)
		}
	})

	t.Run("clears offline alert when host comes online", func(t *testing.T) {
		const offlineID = "host-offline-host1"
		mgr := newTestManager(t)
		confirmKey := hostResourceID("host1")

		mgr.mu.Lock()
		mgr.activeAlerts[offlineID] = &Alert{ID: offlineID, Type: "connectivity"}
		mgr.offlineConfirmations[confirmKey] = 5
		mgr.mu.Unlock()

		mgr.CheckHost(models.Host{ID: "host1", Hostname: "testhost"})

		mgr.mu.RLock()
		_, alertLeft := mgr.activeAlerts[offlineID]
		_, countLeft := mgr.offlineConfirmations[confirmKey]
		mgr.mu.RUnlock()
		if alertLeft {
			t.Error("expected offline alert to be cleared")
		}
		if countLeft {
			t.Error("expected offline count to be cleared")
		}
	})

	t.Run("includes tags in metadata", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.TimeThreshold = 0
		mgr.config.TimeThresholds = map[string]int{}
		mgr.config.HostDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		mgr.mu.Unlock()

		mgr.CheckHost(models.Host{
			ID:       "host1",
			Hostname: "testhost",
			CPUUsage: 95.0,
			Tags:     []string{"production", "critical"},
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["host:host1-cpu"]
		mgr.mu.RUnlock()
		if raised == nil {
			t.Fatal("expected CPU alert")
		}
		if raised.Metadata == nil {
			t.Fatal("expected metadata in alert")
		}
		if tags, ok := raised.Metadata["tags"].([]string); !ok || len(tags) != 2 {
			t.Error("expected tags in metadata")
		}
	})

	t.Run("qualifies linked host agent alert resource names", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.TimeThreshold = 0
		mgr.config.TimeThresholds = map[string]int{}
		mgr.config.HostDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		mgr.mu.Unlock()

		mgr.CheckHost(models.Host{
			ID:          "host1",
			DisplayName: "Hamster",
			Hostname:    "hamster.local",
			LinkedVMID:  "Main:node3:101",
			CPUUsage:    97.5,
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["host:host1-cpu"]
		mgr.mu.RUnlock()
		if raised == nil {
			t.Fatal("expected CPU alert")
		}
		if got := raised.ResourceName; got != "Hamster (Host Agent)" {
			t.Fatalf("expected qualified host resource name, got %q", got)
		}
	})

	t.Run("inherits linked node overrides for host agent metrics", func(t *testing.T) {
		mgr := newTestManager(t)
		mgr.ClearActiveAlerts()

		mgr.mu.Lock()
		mgr.config.TimeThreshold = 0
		mgr.config.TimeThresholds = map[string]int{}
		mgr.config.HostDefaults = ThresholdConfig{
			Memory: &HysteresisThreshold{Trigger: 85.0, Clear: 80.0},
		}
		// The override is keyed by the linked node ID, not by the host ID.
		mgr.config.Overrides = map[string]ThresholdConfig{
			"ProxmoxCluster-proxmoxn3": {
				Memory: &HysteresisThreshold{Trigger: 97.0, Clear: 92.0},
			},
		}
		mgr.mu.Unlock()

		// 90.6% exceeds the host default but not the linked node override.
		mgr.CheckHost(models.Host{
			ID:           "host-proxmoxn3",
			DisplayName:  "proxmoxn3",
			Hostname:     "proxmoxn3",
			LinkedNodeID: "ProxmoxCluster-proxmoxn3",
			Memory:       models.Memory{Usage: 90.6, Total: 1024, Used: 928, Free: 96},
			Status:       "online",
			LastSeen:     time.Now(),
		})

		mgr.mu.RLock()
		_, raised := mgr.activeAlerts["host:host-proxmoxn3-memory"]
		mgr.mu.RUnlock()
		if raised {
			t.Fatal("expected linked node override to suppress host-agent memory alert")
		}
	})

	t.Run("inherits linked guest overrides for host agent metrics", func(t *testing.T) {
		mgr := newTestManager(t)
		mgr.ClearActiveAlerts()

		mgr.mu.Lock()
		mgr.config.TimeThreshold = 0
		mgr.config.TimeThresholds = map[string]int{}
		mgr.config.HostDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0},
		}
		// The override is keyed by the linked guest ID, not by the host ID.
		mgr.config.Overrides = map[string]ThresholdConfig{
			"Main:node3:101": {CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0}},
		}
		mgr.mu.Unlock()

		// 97.5% exceeds the host default but not the linked guest override.
		mgr.CheckHost(models.Host{
			ID:          "host-hamster",
			DisplayName: "Hamster",
			Hostname:    "hamster.local",
			LinkedVMID:  "Main:node3:101",
			CPUUsage:    97.5,
			Status:      "online",
			LastSeen:    time.Now(),
		})

		mgr.mu.RLock()
		_, raised := mgr.activeAlerts["host:host-hamster-cpu"]
		mgr.mu.RUnlock()
		if raised {
			t.Fatal("expected linked guest override to suppress host-agent cpu alert")
		}
	})

	t.Run("prefers explicit host overrides over linked resource overrides", func(t *testing.T) {
		mgr := newTestManager(t)
		mgr.ClearActiveAlerts()

		mgr.mu.Lock()
		mgr.config.TimeThreshold = 0
		mgr.config.TimeThresholds = map[string]int{}
		mgr.config.HostDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0},
		}
		// Both the linked guest and the host itself carry an override; only the
		// host's own (lower) trigger is exceeded by the reported usage.
		mgr.config.Overrides = map[string]ThresholdConfig{
			"Main:node3:101": {CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0}},
			"host-hamster":   {CPU: &HysteresisThreshold{Trigger: 90.0, Clear: 85.0}},
		}
		mgr.mu.Unlock()

		mgr.CheckHost(models.Host{
			ID:          "host-hamster",
			DisplayName: "Hamster",
			Hostname:    "hamster.local",
			LinkedVMID:  "Main:node3:101",
			CPUUsage:    97.5,
			Status:      "online",
			LastSeen:    time.Now(),
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["host:host-hamster-cpu"]
		mgr.mu.RUnlock()
		if raised == nil {
			t.Fatal("expected explicit host override to take precedence and trigger alert")
		}
	})
}
// TestCheckPBSComprehensive drives Manager.CheckPBS through its disabled and
// override paths, CPU/memory threshold checks, offline and connection-health
// detection, multi-poll recovery confirmation, and notification behaviour
// around a transient healthy poll. Every subtest builds its own manager via
// newTestManager and inspects manager state under the manager's lock.
func TestCheckPBSComprehensive(t *testing.T) {
	t.Run("returns early when alerts disabled", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.Enabled = false
		mgr.mu.Unlock()

		mgr.CheckPBS(models.PBSInstance{ID: "pbs1", Name: "testpbs", CPU: 95.0})

		mgr.mu.RLock()
		active := len(mgr.activeAlerts)
		mgr.mu.RUnlock()
		if active != 0 {
			t.Errorf("expected no alerts when disabled, got %d", active)
		}
	})

	t.Run("DisableAllPBS clears existing alerts", func(t *testing.T) {
		mgr := newTestManager(t)
		seeded := map[string]string{
			"pbs1-cpu":         "cpu",
			"pbs1-memory":      "memory",
			"pbs-offline-pbs1": "connectivity",
		}

		mgr.mu.Lock()
		for id, kind := range seeded {
			mgr.activeAlerts[id] = &Alert{ID: id, Type: kind}
		}
		mgr.offlineConfirmations["pbs1"] = 3
		mgr.config.DisableAllPBS = true
		mgr.mu.Unlock()

		mgr.CheckPBS(models.PBSInstance{ID: "pbs1", Name: "testpbs"})

		mgr.mu.RLock()
		_, cpuLeft := mgr.activeAlerts["pbs1-cpu"]
		_, memLeft := mgr.activeAlerts["pbs1-memory"]
		_, offlineLeft := mgr.activeAlerts["pbs-offline-pbs1"]
		_, confirmLeft := mgr.offlineConfirmations["pbs1"]
		mgr.mu.RUnlock()
		if cpuLeft {
			t.Error("expected CPU alert to be cleared")
		}
		if memLeft {
			t.Error("expected memory alert to be cleared")
		}
		if offlineLeft {
			t.Error("expected offline alert to be cleared")
		}
		if confirmLeft {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("override with Disabled clears alerts", func(t *testing.T) {
		mgr := newTestManager(t)
		seeded := map[string]string{
			"pbs1-cpu":         "cpu",
			"pbs1-memory":      "memory",
			"pbs-offline-pbs1": "connectivity",
		}

		mgr.mu.Lock()
		for id, kind := range seeded {
			mgr.activeAlerts[id] = &Alert{ID: id, Type: kind}
		}
		mgr.offlineConfirmations["pbs1"] = 3
		mgr.config.Overrides = map[string]ThresholdConfig{
			"pbs1": {Disabled: true},
		}
		mgr.mu.Unlock()

		mgr.CheckPBS(models.PBSInstance{ID: "pbs1", Name: "testpbs"})

		mgr.mu.RLock()
		_, cpuLeft := mgr.activeAlerts["pbs1-cpu"]
		_, memLeft := mgr.activeAlerts["pbs1-memory"]
		_, offlineLeft := mgr.activeAlerts["pbs-offline-pbs1"]
		_, confirmLeft := mgr.offlineConfirmations["pbs1"]
		mgr.mu.RUnlock()
		if cpuLeft {
			t.Error("expected CPU alert to be cleared")
		}
		if memLeft {
			t.Error("expected memory alert to be cleared")
		}
		if offlineLeft {
			t.Error("expected offline alert to be cleared")
		}
		if confirmLeft {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("DisableAllPBSOffline clears offline alert", func(t *testing.T) {
		const offlineID = "pbs-offline-pbs1"
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.activeAlerts[offlineID] = &Alert{ID: offlineID, Type: "connectivity"}
		mgr.offlineConfirmations["pbs1"] = 3
		mgr.config.DisableAllPBSOffline = true
		mgr.mu.Unlock()

		mgr.CheckPBS(models.PBSInstance{ID: "pbs1", Name: "testpbs", Status: "offline"})

		mgr.mu.RLock()
		_, offlineLeft := mgr.activeAlerts[offlineID]
		_, confirmLeft := mgr.offlineConfirmations["pbs1"]
		mgr.mu.RUnlock()
		if offlineLeft {
			t.Error("expected offline alert to be cleared when DisableAllPBSOffline is true")
		}
		if confirmLeft {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("checks CPU threshold when online", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.TimeThreshold = 0
		mgr.config.TimeThresholds = map[string]int{}
		mgr.config.NodeDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		mgr.mu.Unlock()

		mgr.CheckPBS(models.PBSInstance{
			ID:     "pbs1",
			Name:   "testpbs",
			Host:   "pbshost",
			Status: "online",
			CPU:    95.0,
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["pbs1-cpu"]
		mgr.mu.RUnlock()
		if raised == nil {
			t.Fatal("expected CPU alert")
		}
	})

	t.Run("checks memory threshold when online", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.TimeThreshold = 0
		mgr.config.TimeThresholds = map[string]int{}
		mgr.config.NodeDefaults = ThresholdConfig{
			Memory: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		mgr.mu.Unlock()

		mgr.CheckPBS(models.PBSInstance{
			ID:     "pbs1",
			Name:   "testpbs",
			Host:   "pbshost",
			Status: "online",
			Memory: 95.0,
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["pbs1-memory"]
		mgr.mu.RUnlock()
		if raised == nil {
			t.Fatal("expected memory alert")
		}
	})

	t.Run("skips metrics when PBS is offline", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.TimeThreshold = 0
		mgr.config.TimeThresholds = map[string]int{}
		mgr.config.NodeDefaults = ThresholdConfig{
			CPU:    &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
			Memory: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		mgr.mu.Unlock()

		// Both metrics exceed their triggers, but the instance is offline.
		mgr.CheckPBS(models.PBSInstance{
			ID:     "pbs1",
			Name:   "testpbs",
			Status: "offline",
			CPU:    95.0,
			Memory: 95.0,
		})

		mgr.mu.RLock()
		_, cpuRaised := mgr.activeAlerts["pbs1-cpu"]
		_, memRaised := mgr.activeAlerts["pbs1-memory"]
		mgr.mu.RUnlock()
		if cpuRaised {
			t.Error("expected no CPU alert when offline")
		}
		if memRaised {
			t.Error("expected no memory alert when offline")
		}
	})

	t.Run("applies override thresholds", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.TimeThreshold = 0
		mgr.config.TimeThresholds = map[string]int{}
		mgr.config.NodeDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		// The per-instance override raises the trigger above the reported usage.
		mgr.config.Overrides = map[string]ThresholdConfig{
			"pbs1": {CPU: &HysteresisThreshold{Trigger: 99.0, Clear: 95.0}},
		}
		mgr.mu.Unlock()

		// 95% is over the default trigger but under the override trigger.
		mgr.CheckPBS(models.PBSInstance{ID: "pbs1", Name: "testpbs", Status: "online", CPU: 95.0})

		mgr.mu.RLock()
		_, raised := mgr.activeAlerts["pbs1-cpu"]
		mgr.mu.RUnlock()
		if raised {
			t.Error("expected no alert due to higher override threshold")
		}
	})

	t.Run("checks offline status", func(t *testing.T) {
		mgr := newTestManager(t)

		// Seed the confirmation counter so this single poll is enough to
		// get past the waiting period.
		mgr.mu.Lock()
		mgr.offlineConfirmations["pbs1"] = 2
		mgr.mu.Unlock()

		mgr.CheckPBS(models.PBSInstance{ID: "pbs1", Name: "testpbs", Status: "offline"})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["pbs-offline-pbs1"]
		mgr.mu.RUnlock()
		switch {
		case raised == nil:
			t.Fatal("expected offline alert")
		case raised.Type != "offline":
			t.Errorf("expected offline type, got %s", raised.Type)
		}
	})

	t.Run("checks connection health error", func(t *testing.T) {
		mgr := newTestManager(t)

		// Seed the confirmation counter so this single poll is enough to
		// get past the waiting period.
		mgr.mu.Lock()
		mgr.offlineConfirmations["pbs1"] = 2
		mgr.mu.Unlock()

		mgr.CheckPBS(models.PBSInstance{
			ID:               "pbs1",
			Name:             "testpbs",
			Status:           "online",
			ConnectionHealth: "error",
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["pbs-offline-pbs1"]
		mgr.mu.RUnlock()
		if raised == nil {
			t.Fatal("expected offline alert for connection health error")
		}
	})

	t.Run("checks connection health unhealthy", func(t *testing.T) {
		mgr := newTestManager(t)

		// Seed the confirmation counter so this single poll is enough to
		// get past the waiting period.
		mgr.mu.Lock()
		mgr.offlineConfirmations["pbs1"] = 2
		mgr.mu.Unlock()

		mgr.CheckPBS(models.PBSInstance{
			ID:               "pbs1",
			Name:             "testpbs",
			Status:           "online",
			ConnectionHealth: "unhealthy",
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["pbs-offline-pbs1"]
		mgr.mu.RUnlock()
		if raised == nil {
			t.Fatal("expected offline alert for connection health unhealthy")
		}
	})

	t.Run("clears offline alert when back online", func(t *testing.T) {
		const offlineID = "pbs-offline-pbs1"
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.activeAlerts[offlineID] = &Alert{ID: offlineID, Type: "connectivity"}
		mgr.offlineConfirmations["pbs1"] = 5
		mgr.mu.Unlock()

		healthy := models.PBSInstance{
			ID:               "pbs1",
			Name:             "testpbs",
			Status:           "online",
			ConnectionHealth: "healthy",
		}
		// offlineActive reports whether the offline alert is still registered.
		offlineActive := func() bool {
			mgr.mu.RLock()
			defer mgr.mu.RUnlock()
			_, ok := mgr.activeAlerts[offlineID]
			return ok
		}

		// The first two healthy polls must leave the alert in place.
		mgr.CheckPBS(healthy)
		if !offlineActive() {
			t.Fatal("expected offline alert to remain until recovery is confirmed")
		}
		mgr.CheckPBS(healthy)
		if !offlineActive() {
			t.Fatal("expected offline alert to remain until final recovery confirmation")
		}

		// The third healthy poll is expected to clear alert and counter.
		mgr.CheckPBS(healthy)
		mgr.mu.RLock()
		_, offlineLeft := mgr.activeAlerts[offlineID]
		_, confirmLeft := mgr.offlineConfirmations["pbs1"]
		mgr.mu.RUnlock()
		if offlineLeft {
			t.Error("expected offline alert to be cleared when back online")
		}
		if confirmLeft {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("transient healthy poll does not re-arm offline alert notifications", func(t *testing.T) {
		const offlineID = "pbs-offline-pbs1"
		mgr := newTestManager(t)
		firedCh := make(chan string, 2)
		resolvedCh := make(chan string, 1)

		mgr.mu.Lock()
		mgr.config.ActivationState = ActivationActive
		mgr.offlineConfirmations["pbs1"] = 2
		mgr.mu.Unlock()

		// Record every alert and resolution notification the manager emits.
		mgr.SetAlertCallback(func(alert *Alert) {
			if alert == nil {
				return
			}
			firedCh <- alert.ID
		})
		mgr.SetResolvedCallback(func(alertID string) {
			resolvedCh <- alertID
		})

		down := models.PBSInstance{ID: "pbs1", Name: "testpbs", Status: "offline"}
		up := models.PBSInstance{
			ID:               "pbs1",
			Name:             "testpbs",
			Status:           "online",
			ConnectionHealth: "healthy",
		}

		// Step 1: going offline must notify exactly once.
		mgr.CheckPBS(down)
		select {
		case id := <-firedCh:
			if id != offlineID {
				t.Fatalf("expected initial PBS offline notification, got %q", id)
			}
		case <-time.After(2 * time.Second):
			t.Fatal("expected initial PBS offline notification")
		}

		// Step 2: one healthy poll only starts recovery tracking.
		mgr.CheckPBS(up)
		mgr.mu.RLock()
		_, active := mgr.activeAlerts[offlineID]
		recoveries := mgr.offlineRecoveryConfirmations[offlineID]
		mgr.mu.RUnlock()
		if !active {
			t.Fatal("expected transient healthy poll to keep the offline alert active")
		}
		if recoveries != 1 {
			t.Fatalf("expected recovery confirmation count 1 after transient healthy poll, got %d", recoveries)
		}
		select {
		case id := <-resolvedCh:
			t.Fatalf("expected no recovery notification from a single healthy poll, got %q", id)
		default:
		}

		// Step 3: dropping offline again resets tracking without re-notifying.
		mgr.CheckPBS(down)
		mgr.mu.RLock()
		_, active = mgr.activeAlerts[offlineID]
		_, tracked := mgr.offlineRecoveryConfirmations[offlineID]
		mgr.mu.RUnlock()
		if !active {
			t.Fatal("expected offline alert to remain active after connectivity drops again")
		}
		if tracked {
			t.Fatal("expected transient recovery tracking to reset once PBS is offline again")
		}
		select {
		case id := <-firedCh:
			t.Fatalf("expected no duplicate offline notification while alert stays active, got %q", id)
		case <-time.After(200 * time.Millisecond):
		}
	})
}
// TestCheckPMGComprehensive drives Manager.CheckPMG through its disabled and
// override paths, offline and connection-health detection, multi-poll
// recovery confirmation, and the absence of queue alerts while the instance
// is offline. Every subtest builds its own manager via newTestManager and
// inspects manager state under the manager's lock.
func TestCheckPMGComprehensive(t *testing.T) {
	t.Run("returns early when alerts disabled", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.config.Enabled = false
		mgr.mu.Unlock()

		mgr.CheckPMG(models.PMGInstance{ID: "pmg1", Name: "testpmg"})

		mgr.mu.RLock()
		active := len(mgr.activeAlerts)
		mgr.mu.RUnlock()
		if active != 0 {
			t.Errorf("expected no alerts when disabled, got %d", active)
		}
	})

	t.Run("DisableAllPMG clears existing alerts", func(t *testing.T) {
		mgr := newTestManager(t)
		seeded := map[string]string{
			"pmg1-queue-total":    "queue-total",
			"pmg1-queue-deferred": "queue-deferred",
			"pmg1-queue-hold":     "queue-hold",
			"pmg1-oldest-message": "oldest-message",
			"pmg-offline-pmg1":    "connectivity",
		}

		mgr.mu.Lock()
		for id, kind := range seeded {
			mgr.activeAlerts[id] = &Alert{ID: id, Type: kind}
		}
		mgr.offlineConfirmations["pmg1"] = 3
		mgr.config.DisableAllPMG = true
		mgr.mu.Unlock()

		mgr.CheckPMG(models.PMGInstance{ID: "pmg1", Name: "testpmg"})

		mgr.mu.RLock()
		_, totalLeft := mgr.activeAlerts["pmg1-queue-total"]
		_, deferredLeft := mgr.activeAlerts["pmg1-queue-deferred"]
		_, holdLeft := mgr.activeAlerts["pmg1-queue-hold"]
		_, oldestLeft := mgr.activeAlerts["pmg1-oldest-message"]
		_, offlineLeft := mgr.activeAlerts["pmg-offline-pmg1"]
		_, confirmLeft := mgr.offlineConfirmations["pmg1"]
		mgr.mu.RUnlock()
		if totalLeft {
			t.Error("expected queue-total alert to be cleared")
		}
		if deferredLeft {
			t.Error("expected queue-deferred alert to be cleared")
		}
		if holdLeft {
			t.Error("expected queue-hold alert to be cleared")
		}
		if oldestLeft {
			t.Error("expected oldest-message alert to be cleared")
		}
		if offlineLeft {
			t.Error("expected offline alert to be cleared")
		}
		if confirmLeft {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("override with Disabled clears alerts", func(t *testing.T) {
		mgr := newTestManager(t)
		seeded := map[string]string{
			"pmg1-queue-total":    "queue-total",
			"pmg1-oldest-message": "oldest-message",
			"pmg-offline-pmg1":    "connectivity",
		}

		mgr.mu.Lock()
		for id, kind := range seeded {
			mgr.activeAlerts[id] = &Alert{ID: id, Type: kind}
		}
		mgr.offlineConfirmations["pmg1"] = 3
		mgr.config.Overrides = map[string]ThresholdConfig{
			"pmg1": {Disabled: true},
		}
		mgr.mu.Unlock()

		mgr.CheckPMG(models.PMGInstance{ID: "pmg1", Name: "testpmg"})

		mgr.mu.RLock()
		_, queueLeft := mgr.activeAlerts["pmg1-queue-total"]
		_, oldestLeft := mgr.activeAlerts["pmg1-oldest-message"]
		_, offlineLeft := mgr.activeAlerts["pmg-offline-pmg1"]
		_, confirmLeft := mgr.offlineConfirmations["pmg1"]
		mgr.mu.RUnlock()
		if queueLeft {
			t.Error("expected queue alert to be cleared")
		}
		if oldestLeft {
			t.Error("expected oldest-message alert to be cleared")
		}
		if offlineLeft {
			t.Error("expected offline alert to be cleared")
		}
		if confirmLeft {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("DisableAllPMGOffline clears offline alert", func(t *testing.T) {
		const offlineID = "pmg-offline-pmg1"
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.activeAlerts[offlineID] = &Alert{ID: offlineID, Type: "connectivity"}
		mgr.offlineConfirmations["pmg1"] = 3
		mgr.config.DisableAllPMGOffline = true
		mgr.mu.Unlock()

		mgr.CheckPMG(models.PMGInstance{ID: "pmg1", Name: "testpmg", Status: "offline"})

		mgr.mu.RLock()
		_, offlineLeft := mgr.activeAlerts[offlineID]
		_, confirmLeft := mgr.offlineConfirmations["pmg1"]
		mgr.mu.RUnlock()
		if offlineLeft {
			t.Error("expected offline alert to be cleared when DisableAllPMGOffline is true")
		}
		if confirmLeft {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("checks offline status", func(t *testing.T) {
		mgr := newTestManager(t)

		// Three confirmations are required; seeding two lets this single
		// poll get past the waiting period.
		mgr.mu.Lock()
		mgr.offlineConfirmations["pmg1"] = 2
		mgr.mu.Unlock()

		mgr.CheckPMG(models.PMGInstance{ID: "pmg1", Name: "testpmg", Status: "offline"})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["pmg-offline-pmg1"]
		mgr.mu.RUnlock()
		switch {
		case raised == nil:
			t.Fatal("expected offline alert")
		case raised.Type != "offline":
			t.Errorf("expected offline type, got %s", raised.Type)
		}
	})

	t.Run("checks connection health error", func(t *testing.T) {
		mgr := newTestManager(t)

		// Seed the confirmation counter so this single poll is enough to
		// get past the waiting period.
		mgr.mu.Lock()
		mgr.offlineConfirmations["pmg1"] = 2
		mgr.mu.Unlock()

		mgr.CheckPMG(models.PMGInstance{
			ID:               "pmg1",
			Name:             "testpmg",
			Status:           "online",
			ConnectionHealth: "error",
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["pmg-offline-pmg1"]
		mgr.mu.RUnlock()
		if raised == nil {
			t.Fatal("expected offline alert for connection health error")
		}
	})

	t.Run("checks connection health unhealthy", func(t *testing.T) {
		mgr := newTestManager(t)

		// Seed the confirmation counter so this single poll is enough to
		// get past the waiting period.
		mgr.mu.Lock()
		mgr.offlineConfirmations["pmg1"] = 2
		mgr.mu.Unlock()

		mgr.CheckPMG(models.PMGInstance{
			ID:               "pmg1",
			Name:             "testpmg",
			Status:           "online",
			ConnectionHealth: "unhealthy",
		})

		mgr.mu.RLock()
		raised := mgr.activeAlerts["pmg-offline-pmg1"]
		mgr.mu.RUnlock()
		if raised == nil {
			t.Fatal("expected offline alert for connection health unhealthy")
		}
	})

	t.Run("clears offline alert when back online", func(t *testing.T) {
		const offlineID = "pmg-offline-pmg1"
		mgr := newTestManager(t)

		mgr.mu.Lock()
		mgr.activeAlerts[offlineID] = &Alert{ID: offlineID, Type: "connectivity"}
		mgr.offlineConfirmations["pmg1"] = 5
		mgr.mu.Unlock()

		healthy := models.PMGInstance{
			ID:               "pmg1",
			Name:             "testpmg",
			Status:           "online",
			ConnectionHealth: "healthy",
		}
		// offlineActive reports whether the offline alert is still registered.
		offlineActive := func() bool {
			mgr.mu.RLock()
			defer mgr.mu.RUnlock()
			_, ok := mgr.activeAlerts[offlineID]
			return ok
		}

		// The first two healthy polls must leave the alert in place.
		mgr.CheckPMG(healthy)
		if !offlineActive() {
			t.Fatal("expected offline alert to remain until recovery is confirmed")
		}
		mgr.CheckPMG(healthy)
		if !offlineActive() {
			t.Fatal("expected offline alert to remain until final recovery confirmation")
		}

		// The third healthy poll is expected to clear alert and counter.
		mgr.CheckPMG(healthy)
		mgr.mu.RLock()
		_, offlineLeft := mgr.activeAlerts[offlineID]
		_, confirmLeft := mgr.offlineConfirmations["pmg1"]
		mgr.mu.RUnlock()
		if offlineLeft {
			t.Error("expected offline alert to be cleared when back online")
		}
		if confirmLeft {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("skips metrics when PMG is offline", func(t *testing.T) {
		mgr := newTestManager(t)

		mgr.CheckPMG(models.PMGInstance{ID: "pmg1", Name: "testpmg", Status: "offline"})

		// Count any queue or oldest-message alert raised for this instance.
		queueAlerts := 0
		mgr.mu.RLock()
		for id := range mgr.activeAlerts {
			if strings.Contains(id, "pmg1-queue") || strings.Contains(id, "pmg1-oldest") {
				queueAlerts++
			}
		}
		mgr.mu.RUnlock()
		if queueAlerts != 0 {
			t.Error("expected no queue alerts when offline")
		}
	})
}
func TestCheckStorageComprehensive(t *testing.T) {
// t.Parallel()
t.Run("returns early when alerts disabled", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.Enabled = false
m.mu.Unlock()
storage := models.Storage{
ID: "storage1",
Name: "teststorage",
Status: "active",
Usage: 95.0,
}
m.CheckStorage(storage)
m.mu.RLock()
alertCount := len(m.activeAlerts)
m.mu.RUnlock()
if alertCount != 0 {
t.Errorf("expected no alerts when disabled, got %d", alertCount)
}
})
t.Run("DisableAllStorage clears existing alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["storage1-usage"] = &Alert{ID: "storage1-usage", Type: "usage"}
m.activeAlerts["storage-offline-storage1"] = &Alert{ID: "storage-offline-storage1", Type: "connectivity"}
m.config.DisableAllStorage = true
m.mu.Unlock()
storage := models.Storage{
ID: "storage1",
Name: "teststorage",
Status: "active",
}
m.CheckStorage(storage)
m.mu.RLock()
_, usageExists := m.activeAlerts["storage1-usage"]
_, offlineExists := m.activeAlerts["storage-offline-storage1"]
m.mu.RUnlock()
if usageExists {
t.Error("expected usage alert to be cleared")
}
if offlineExists {
t.Error("expected offline alert to be cleared")
}
})
t.Run("override with Disabled clears alerts", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["storage1-usage"] = &Alert{ID: "storage1-usage", Type: "usage"}
m.activeAlerts["storage-offline-storage1"] = &Alert{ID: "storage-offline-storage1", Type: "connectivity"}
m.config.Overrides = map[string]ThresholdConfig{
"storage1": {Disabled: true},
}
m.mu.Unlock()
storage := models.Storage{
ID: "storage1",
Name: "teststorage",
Status: "active",
}
m.CheckStorage(storage)
m.mu.RLock()
_, usageExists := m.activeAlerts["storage1-usage"]
_, offlineExists := m.activeAlerts["storage-offline-storage1"]
m.mu.RUnlock()
if usageExists {
t.Error("expected usage alert to be cleared")
}
if offlineExists {
t.Error("expected offline alert to be cleared")
}
})
t.Run("checks usage threshold", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.StorageDefault = HysteresisThreshold{Trigger: 80.0, Clear: 70.0}
m.mu.Unlock()
storage := models.Storage{
ID: "storage1",
Name: "teststorage",
Node: "node1",
Status: "active",
Usage: 95.0,
}
m.CheckStorage(storage)
m.mu.RLock()
alert := m.activeAlerts["storage1-usage"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected usage alert")
}
})
t.Run("applies override threshold", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.StorageDefault = HysteresisThreshold{Trigger: 80.0, Clear: 70.0}
overrideThreshold := HysteresisThreshold{Trigger: 99.0, Clear: 95.0}
m.config.Overrides = map[string]ThresholdConfig{
"storage1": {Usage: &overrideThreshold},
}
m.mu.Unlock()
storage := models.Storage{
ID: "storage1",
Name: "teststorage",
Status: "active",
Usage: 95.0, // Below override threshold
}
m.CheckStorage(storage)
m.mu.RLock()
_, exists := m.activeAlerts["storage1-usage"]
m.mu.RUnlock()
if exists {
t.Error("expected no alert due to higher override threshold")
}
})
t.Run("applies legacy shared storage override threshold", func(t *testing.T) {
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.StorageDefault = HysteresisThreshold{Trigger: 80.0, Clear: 70.0}
overrideThreshold := HysteresisThreshold{Trigger: 10.0, Clear: 5.0}
m.config.Overrides = map[string]ThresholdConfig{
"Main-pve1-ceph-pool": {Usage: &overrideThreshold},
}
m.mu.Unlock()
storage := models.Storage{
ID: "Main-cluster-ceph-pool",
Name: "ceph-pool",
Node: "cluster",
Instance: "Main",
Status: "available",
Usage: 20.0,
Shared: true,
Nodes: []string{"pve1", "pve2"},
NodeIDs: []string{"Main-pve1", "Main-pve2"},
}
m.CheckStorage(storage)
m.mu.RLock()
alert := m.activeAlerts["Main-cluster-ceph-pool-usage"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected usage alert when legacy shared-storage override matches canonical storage ID")
}
})
t.Run("skips usage check when offline", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.StorageDefault = HysteresisThreshold{Trigger: 80.0, Clear: 70.0}
m.mu.Unlock()
storage := models.Storage{
ID: "storage1",
Name: "teststorage",
Status: "offline",
Usage: 95.0,
}
m.CheckStorage(storage)
m.mu.RLock()
_, exists := m.activeAlerts["storage1-usage"]
m.mu.RUnlock()
if exists {
t.Error("expected no usage alert when offline")
}
})
t.Run("skips usage check when unavailable", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.StorageDefault = HysteresisThreshold{Trigger: 80.0, Clear: 70.0}
m.mu.Unlock()
storage := models.Storage{
ID: "storage1",
Name: "teststorage",
Status: "unavailable",
Usage: 95.0,
}
m.CheckStorage(storage)
m.mu.RLock()
_, exists := m.activeAlerts["storage1-usage"]
m.mu.RUnlock()
if exists {
t.Error("expected no usage alert when unavailable")
}
})
t.Run("checks offline status", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
// Pre-populate confirmation count (requires 2)
m.offlineConfirmations["storage1"] = 1
m.mu.Unlock()
storage := models.Storage{
ID: "storage1",
Name: "teststorage",
Status: "offline",
}
m.CheckStorage(storage)
m.mu.RLock()
alert := m.activeAlerts["storage-offline-storage1"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected offline alert")
}
})
t.Run("checks unavailable status", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
// Pre-populate confirmation count (requires 2)
m.offlineConfirmations["storage1"] = 1
m.mu.Unlock()
storage := models.Storage{
ID: "storage1",
Name: "teststorage",
Status: "unavailable",
}
m.CheckStorage(storage)
m.mu.RLock()
alert := m.activeAlerts["storage-offline-storage1"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected offline alert for unavailable status")
}
})
t.Run("clears offline alert when back online", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["storage-offline-storage1"] = &Alert{ID: "storage-offline-storage1", Type: "connectivity"}
m.offlineConfirmations["storage1"] = 5
m.mu.Unlock()
storage := models.Storage{
ID: "storage1",
Name: "teststorage",
Status: "active",
}
m.CheckStorage(storage)
m.mu.RLock()
_, offlineStillActive := m.activeAlerts["storage-offline-storage1"]
m.mu.RUnlock()
if !offlineStillActive {
t.Fatal("expected offline alert to remain until recovery is confirmed")
}
m.CheckStorage(storage)
m.mu.RLock()
_, offlineExists := m.activeAlerts["storage-offline-storage1"]
_, confirmExists := m.offlineConfirmations["storage1"]
m.mu.RUnlock()
if offlineExists {
t.Error("expected offline alert to be cleared when back online")
}
if confirmExists {
t.Error("expected offline confirmation to be cleared")
}
})
t.Run("skips usage check when usage is zero", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.StorageDefault = HysteresisThreshold{Trigger: 80.0, Clear: 70.0}
m.mu.Unlock()
storage := models.Storage{
ID: "storage1",
Name: "teststorage",
Status: "active",
Usage: 0, // No usage data
}
m.CheckStorage(storage)
m.mu.RLock()
_, exists := m.activeAlerts["storage1-usage"]
m.mu.RUnlock()
if exists {
t.Error("expected no usage alert when usage is zero")
}
})
}
func TestSyncStorageAlertsForInstance(t *testing.T) {
t.Run("clears missing storage alerts while preserving other alert types", func(t *testing.T) {
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["inst1-node1-old-usage"] = &Alert{
ID: "inst1-node1-old-usage",
Type: "usage",
ResourceID: "inst1-node1-old",
Instance: "inst1",
Metadata: map[string]interface{}{
"resourceType": "Storage",
},
}
m.activeAlerts["storage-offline-inst1-node1-old"] = &Alert{
ID: "storage-offline-inst1-node1-old",
Type: "connectivity",
ResourceID: "inst1-node1-old",
Instance: "inst1",
}
m.activeAlerts["zfs-device-inst1-node1-old-sda"] = &Alert{
ID: "zfs-device-inst1-node1-old-sda",
Type: "zfs-device",
ResourceID: "inst1-node1-old",
Instance: "inst1",
}
m.activeAlerts["inst1:vm:100-cpu"] = &Alert{
ID: "inst1:vm:100-cpu",
Type: "cpu",
ResourceID: "inst1:vm:100",
Instance: "inst1",
Metadata: map[string]interface{}{
"resourceType": "VM",
},
}
m.mu.Unlock()
m.SyncStorageAlertsForInstance("inst1", []models.Storage{
{
ID: "inst1-node1-new",
Name: "new-storage",
Instance: "inst1",
},
})
m.mu.RLock()
defer m.mu.RUnlock()
if _, exists := m.activeAlerts["inst1-node1-old-usage"]; exists {
t.Fatal("expected stale storage usage alert to be cleared")
}
if _, exists := m.activeAlerts["storage-offline-inst1-node1-old"]; exists {
t.Fatal("expected stale storage offline alert to be cleared")
}
if _, exists := m.activeAlerts["zfs-device-inst1-node1-old-sda"]; exists {
t.Fatal("expected stale zfs device alert to be cleared")
}
if _, exists := m.activeAlerts["inst1:vm:100-cpu"]; !exists {
t.Fatal("expected non-storage alert to be preserved")
}
})
t.Run("preserves current storage and zfs device alerts", func(t *testing.T) {
m := newTestManager(t)
m.mu.Lock()
m.activeAlerts["inst1-node1-rpool-usage"] = &Alert{
ID: "inst1-node1-rpool-usage",
Type: "usage",
ResourceID: "inst1-node1-rpool",
Instance: "inst1",
Metadata: map[string]interface{}{
"resourceType": "Storage",
},
}
m.activeAlerts["storage-offline-inst1-node1-rpool"] = &Alert{
ID: "storage-offline-inst1-node1-rpool",
Type: "connectivity",
ResourceID: "inst1-node1-rpool",
Instance: "inst1",
}
m.activeAlerts["zfs-device-inst1-node1-rpool-sda"] = &Alert{
ID: "zfs-device-inst1-node1-rpool-sda",
Type: "zfs-device",
ResourceID: "inst1-node1-rpool",
Instance: "inst1",
}
m.mu.Unlock()
m.SyncStorageAlertsForInstance("inst1", []models.Storage{
{
ID: "inst1-node1-rpool",
Name: "rpool",
Instance: "inst1",
ZFSPool: &models.ZFSPool{
Name: "rpool",
Devices: []models.ZFSDevice{
{Name: "sda", State: "ONLINE"},
},
},
},
})
m.mu.RLock()
defer m.mu.RUnlock()
if _, exists := m.activeAlerts["inst1-node1-rpool-usage"]; !exists {
t.Fatal("expected current storage usage alert to remain active")
}
if _, exists := m.activeAlerts["storage-offline-inst1-node1-rpool"]; !exists {
t.Fatal("expected current storage offline alert to remain active")
}
if _, exists := m.activeAlerts["zfs-device-inst1-node1-rpool-sda"]; !exists {
t.Fatal("expected current zfs device alert to remain active")
}
})
}
func TestDispatchAlert(t *testing.T) {
// t.Parallel()
t.Run("returns false when onAlert is nil", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
alert := &Alert{
ID: "test-alert",
Type: "cpu",
}
result := m.dispatchAlert(alert, false)
if result {
t.Error("expected false when onAlert callback is nil")
}
})
t.Run("returns false when alert is nil", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
called := false
m.SetAlertCallback(func(a *Alert) {
called = true
})
result := m.dispatchAlert(nil, false)
if result {
t.Error("expected false when alert is nil")
}
if called {
t.Error("callback should not be called for nil alert")
}
})
t.Run("returns false when activation state is pending", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
called := false
m.SetAlertCallback(func(a *Alert) {
called = true
})
m.mu.Lock()
m.config.ActivationState = ActivationPending
m.mu.Unlock()
alert := &Alert{
ID: "test-alert",
Type: "cpu",
}
result := m.dispatchAlert(alert, false)
if result {
t.Error("expected false when activation is pending")
}
if called {
t.Error("callback should not be called when pending")
}
})
t.Run("returns false when activation state is snoozed", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
called := false
m.SetAlertCallback(func(a *Alert) {
called = true
})
m.mu.Lock()
m.config.ActivationState = ActivationSnoozed
m.mu.Unlock()
alert := &Alert{
ID: "test-alert",
Type: "cpu",
}
result := m.dispatchAlert(alert, false)
if result {
t.Error("expected false when activation is snoozed")
}
if called {
t.Error("callback should not be called when snoozed")
}
})
t.Run("returns false for monitor-only alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
called := false
m.SetAlertCallback(func(a *Alert) {
called = true
})
m.mu.Lock()
m.config.ActivationState = ActivationActive
m.mu.Unlock()
alert := &Alert{
ID: "test-alert",
Type: "cpu",
Metadata: map[string]interface{}{"monitorOnly": true},
}
result := m.dispatchAlert(alert, false)
if result {
t.Error("expected false for monitor-only alert")
}
if called {
t.Error("callback should not be called for monitor-only alert")
}
})
t.Run("dispatches synchronously when async is false", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
var receivedAlert *Alert
m.SetAlertCallback(func(a *Alert) {
receivedAlert = a
})
m.mu.Lock()
m.config.ActivationState = ActivationActive
m.mu.Unlock()
alert := &Alert{
ID: "test-alert",
Type: "cpu",
ResourceName: "testvm",
}
result := m.dispatchAlert(alert, false)
if !result {
t.Error("expected true for successful dispatch")
}
if receivedAlert == nil {
t.Fatal("callback should have been called")
}
if receivedAlert.ID != alert.ID {
t.Error("alert ID should match")
}
})
t.Run("dispatches asynchronously when async is true", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
var receivedAlert *Alert
done := make(chan struct{})
m.SetAlertCallback(func(a *Alert) {
receivedAlert = a
close(done)
})
m.mu.Lock()
m.config.ActivationState = ActivationActive
m.mu.Unlock()
alert := &Alert{
ID: "test-alert",
Type: "cpu",
ResourceName: "testvm",
}
result := m.dispatchAlert(alert, true)
if !result {
t.Error("expected true for successful dispatch")
}
// Wait for async callback
select {
case <-done:
// Success
case <-time.After(time.Second):
t.Fatal("async callback not called within timeout")
}
if receivedAlert == nil {
t.Fatal("callback should have been called")
}
if receivedAlert.ID != alert.ID {
t.Error("alert ID should match")
}
})
t.Run("clones alert before dispatch", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
var receivedAlert *Alert
m.SetAlertCallback(func(a *Alert) {
receivedAlert = a
})
m.mu.Lock()
m.config.ActivationState = ActivationActive
m.mu.Unlock()
alert := &Alert{
ID: "test-alert",
Type: "cpu",
ResourceName: "testvm",
}
m.dispatchAlert(alert, false)
if receivedAlert == alert {
t.Error("alert should be cloned, not passed directly")
}
})
}
// TestPreserveAlertState checks Manager.preserveAlertState: it must tolerate
// a nil alert, copy start time, acknowledgement and escalation fields from an
// already-active alert with the same ID, fall back to ackState when no active
// alert exists, and otherwise leave the incoming alert untouched.
func TestPreserveAlertState(t *testing.T) {
	t.Run("nil updated alert is handled", func(t *testing.T) {
		mgr := newTestManager(t)
		// Passing nil must simply not panic; there is nothing to assert.
		mgr.preserveAlertState("test-id", nil)
	})

	t.Run("preserves state from existing alert", func(t *testing.T) {
		const alertID = "test-alert"
		mgr := newTestManager(t)

		ackedAt := time.Now().Add(-30 * time.Minute)
		prior := &Alert{
			ID:              alertID,
			Type:            "cpu",
			StartTime:       time.Now().Add(-1 * time.Hour),
			Acknowledged:    true,
			AckUser:         "testuser",
			AckTime:         &ackedAt,
			LastEscalation:  2,
			EscalationTimes: []time.Time{time.Now().Add(-25 * time.Minute)},
		}
		mgr.mu.Lock()
		mgr.activeAlerts[alertID] = prior
		mgr.mu.Unlock()

		// The incoming alert deliberately carries a newer start time.
		incoming := &Alert{ID: alertID, Type: "cpu", StartTime: time.Now()}
		mgr.preserveAlertState(alertID, incoming)

		if !incoming.StartTime.Equal(prior.StartTime) {
			t.Error("StartTime should be preserved from existing alert")
		}
		if !incoming.Acknowledged {
			t.Error("Acknowledged should be preserved")
		}
		if user := incoming.AckUser; user != "testuser" {
			t.Errorf("AckUser should be preserved, got %s", user)
		}
		if at := incoming.AckTime; at == nil || !at.Equal(ackedAt) {
			t.Error("AckTime should be preserved")
		}
		if incoming.LastEscalation != 2 {
			t.Error("LastEscalation should be preserved")
		}
		if len(incoming.EscalationTimes) != 1 {
			t.Error("EscalationTimes should be preserved")
		}
	})

	t.Run("falls back to ackState for new alert", func(t *testing.T) {
		const alertID = "test-alert"
		mgr := newTestManager(t)

		ackedAt := time.Now().Add(-15 * time.Minute)
		mgr.mu.Lock()
		mgr.ackState[alertID] = ackRecord{
			acknowledged: true,
			user:         "fallbackuser",
			time:         ackedAt,
		}
		mgr.mu.Unlock()

		// No active alert exists for this ID, only the ack record above.
		incoming := &Alert{ID: alertID, Type: "cpu", StartTime: time.Now()}
		mgr.preserveAlertState(alertID, incoming)

		if !incoming.Acknowledged {
			t.Error("Acknowledged should be set from ackState")
		}
		if user := incoming.AckUser; user != "fallbackuser" {
			t.Errorf("AckUser should be from ackState, got %s", user)
		}
		if at := incoming.AckTime; at == nil || !at.Equal(ackedAt) {
			t.Error("AckTime should be from ackState")
		}
	})

	t.Run("no state to preserve for new alert", func(t *testing.T) {
		mgr := newTestManager(t)

		began := time.Now()
		incoming := &Alert{ID: "new-alert", Type: "cpu", StartTime: began}
		mgr.preserveAlertState("new-alert", incoming)

		if !incoming.StartTime.Equal(began) {
			t.Error("StartTime should remain unchanged for new alert")
		}
		if incoming.Acknowledged {
			t.Error("Acknowledged should remain false for new alert")
		}
	})
}
func TestCheckPMGQuarantineBacklog(t *testing.T) {
t.Run("nil quarantine clears alerts", func(t *testing.T) {
m := newTestManager(t)
// Create an existing quarantine alert
m.mu.Lock()
m.activeAlerts["pmg1-quarantine-spam"] = &Alert{
ID: "pmg1-quarantine-spam",
Type: "quarantine-spam",
}
m.activeAlerts["pmg1-quarantine-virus"] = &Alert{
ID: "pmg1-quarantine-virus",
Type: "quarantine-virus",
}
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "pmg-server",
Host: "pmg.example.com",
Quarantine: nil,
}
m.checkPMGQuarantineBacklog(pmg, PMGThresholdConfig{})
m.mu.RLock()
_, spamExists := m.activeAlerts["pmg1-quarantine-spam"]
_, virusExists := m.activeAlerts["pmg1-quarantine-virus"]
m.mu.RUnlock()
if spamExists {
t.Error("spam alert should be cleared when quarantine is nil")
}
if virusExists {
t.Error("virus alert should be cleared when quarantine is nil")
}
})
t.Run("warning threshold triggers alert", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.pmgQuarantineHistory = make(map[string][]pmgQuarantineSnapshot)
m.config.ActivationState = ActivationActive
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "pmg-server",
Host: "pmg.example.com",
Quarantine: &models.PMGQuarantineTotals{
Spam: 2500, // Above warning threshold
Virus: 100,
},
}
thresholds := PMGThresholdConfig{
QuarantineSpamWarn: 2000,
QuarantineSpamCritical: 5000,
QuarantineVirusWarn: 2000,
QuarantineVirusCritical: 5000,
}
m.checkPMGQuarantineBacklog(pmg, thresholds)
m.mu.RLock()
alert, exists := m.activeAlerts["pmg1-quarantine-spam"]
m.mu.RUnlock()
if !exists {
t.Fatal("spam quarantine warning alert should be created")
}
if alert.Level != AlertLevelWarning {
t.Errorf("alert level should be warning, got %s", alert.Level)
}
})
t.Run("critical threshold triggers alert", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.pmgQuarantineHistory = make(map[string][]pmgQuarantineSnapshot)
m.config.ActivationState = ActivationActive
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "pmg-server",
Host: "pmg.example.com",
Quarantine: &models.PMGQuarantineTotals{
Spam: 6000, // Above critical threshold
Virus: 100,
},
}
thresholds := PMGThresholdConfig{
QuarantineSpamWarn: 2000,
QuarantineSpamCritical: 5000,
QuarantineVirusWarn: 2000,
QuarantineVirusCritical: 5000,
}
m.checkPMGQuarantineBacklog(pmg, thresholds)
m.mu.RLock()
alert, exists := m.activeAlerts["pmg1-quarantine-spam"]
m.mu.RUnlock()
if !exists {
t.Fatal("spam quarantine critical alert should be created")
}
if alert.Level != AlertLevelCritical {
t.Errorf("alert level should be critical, got %s", alert.Level)
}
})
t.Run("below threshold clears alert", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.pmgQuarantineHistory = make(map[string][]pmgQuarantineSnapshot)
m.activeAlerts["pmg1-quarantine-spam"] = &Alert{
ID: "pmg1-quarantine-spam",
Type: "quarantine-spam",
Level: AlertLevelWarning,
}
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "pmg-server",
Host: "pmg.example.com",
Quarantine: &models.PMGQuarantineTotals{
Spam: 500, // Below warning threshold
Virus: 100,
},
}
thresholds := PMGThresholdConfig{
QuarantineSpamWarn: 2000,
QuarantineSpamCritical: 5000,
}
m.checkPMGQuarantineBacklog(pmg, thresholds)
m.mu.RLock()
_, exists := m.activeAlerts["pmg1-quarantine-spam"]
m.mu.RUnlock()
if exists {
t.Error("spam quarantine alert should be cleared when below threshold")
}
})
t.Run("growth rate triggers warning alert", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.config.ActivationState = ActivationActive
// Set up history from ~2 hours ago
m.pmgQuarantineHistory = map[string][]pmgQuarantineSnapshot{
"pmg1": {
{
Spam: 1000,
Virus: 100,
Timestamp: time.Now().Add(-2 * time.Hour),
},
},
}
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "pmg-server",
Host: "pmg.example.com",
Quarantine: &models.PMGQuarantineTotals{
Spam: 1500, // 50% growth (500 messages)
Virus: 100,
},
}
thresholds := PMGThresholdConfig{
QuarantineSpamWarn: 10000, // High absolute threshold (won't trigger)
QuarantineSpamCritical: 20000,
QuarantineGrowthWarnPct: 25, // 25% growth warning
QuarantineGrowthWarnMin: 250, // Minimum 250 messages
QuarantineGrowthCritPct: 50, // 50% growth critical
QuarantineGrowthCritMin: 500, // Minimum 500 messages
}
m.checkPMGQuarantineBacklog(pmg, thresholds)
m.mu.RLock()
alert, exists := m.activeAlerts["pmg1-quarantine-spam"]
m.mu.RUnlock()
if !exists {
t.Fatal("spam quarantine growth alert should be created")
}
if alert.Level != AlertLevelCritical {
t.Errorf("alert level should be critical due to 50%% growth + 500 messages, got %s", alert.Level)
}
})
t.Run("updates existing alert", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.pmgQuarantineHistory = make(map[string][]pmgQuarantineSnapshot)
m.config.ActivationState = ActivationActive
m.activeAlerts["pmg1-quarantine-spam"] = &Alert{
ID: "pmg1-quarantine-spam",
Type: "quarantine-spam",
Level: AlertLevelWarning,
Value: 2500,
Threshold: 2000,
LastSeen: time.Now().Add(-5 * time.Minute),
}
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "pmg-server",
Host: "pmg.example.com",
Quarantine: &models.PMGQuarantineTotals{
Spam: 3000, // Higher spam count
Virus: 100,
},
}
thresholds := PMGThresholdConfig{
QuarantineSpamWarn: 2000,
QuarantineSpamCritical: 5000,
}
m.checkPMGQuarantineBacklog(pmg, thresholds)
m.mu.RLock()
alert, exists := m.activeAlerts["pmg1-quarantine-spam"]
m.mu.RUnlock()
if !exists {
t.Fatal("spam quarantine alert should still exist")
}
if alert.Value != 3000 {
t.Errorf("alert value should be updated to 3000, got %.0f", alert.Value)
}
})
t.Run("virus quarantine alert", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.pmgQuarantineHistory = make(map[string][]pmgQuarantineSnapshot)
m.config.ActivationState = ActivationActive
m.mu.Unlock()
pmg := models.PMGInstance{
ID: "pmg1",
Name: "pmg-server",
Host: "pmg.example.com",
Quarantine: &models.PMGQuarantineTotals{
Spam: 100,
Virus: 3000, // Above virus warning threshold
},
}
thresholds := PMGThresholdConfig{
QuarantineSpamWarn: 2000,
QuarantineSpamCritical: 5000,
QuarantineVirusWarn: 2000,
QuarantineVirusCritical: 5000,
}
m.checkPMGQuarantineBacklog(pmg, thresholds)
m.mu.RLock()
alert, exists := m.activeAlerts["pmg1-quarantine-virus"]
m.mu.RUnlock()
if !exists {
t.Fatal("virus quarantine warning alert should be created")
}
if alert.Level != AlertLevelWarning {
t.Errorf("alert level should be warning, got %s", alert.Level)
}
})
}
func TestLoadActiveAlerts(t *testing.T) {
t.Run("no file returns nil error", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
err := m.LoadActiveAlerts()
if err != nil {
t.Errorf("expected no error when file doesn't exist, got %v", err)
}
})
t.Run("loads alerts from valid file", func(t *testing.T) {
m := newTestManager(t)
// Create an alert and save it
startTime := time.Now().Add(-30 * time.Minute)
alert := &Alert{
ID: "test-load-alert",
Type: "cpu",
Level: AlertLevelWarning,
ResourceID: "test-resource",
ResourceName: "test-vm",
Node: "node1",
Instance: "pve1",
Message: "Test alert",
Value: 85.0,
Threshold: 80.0,
StartTime: startTime,
LastSeen: time.Now(),
}
m.mu.Lock()
m.activeAlerts[alert.ID] = alert
m.mu.Unlock()
// Save to disk
_ = m.SaveActiveAlerts()
// Clear in-memory map only (don't use ClearActiveAlerts which triggers async save)
m.mu.Lock()
m.activeAlerts = make(map[string]*Alert)
m.mu.Unlock()
err := m.LoadActiveAlerts()
if err != nil {
t.Fatalf("failed to load alerts: %v", err)
}
m.mu.RLock()
loaded, exists := m.activeAlerts["test-load-alert"]
m.mu.RUnlock()
if !exists {
t.Fatal("alert should be loaded from file")
}
if loaded.Type != "cpu" {
t.Errorf("loaded alert type should be cpu, got %s", loaded.Type)
}
if loaded.Value != 85.0 {
t.Errorf("loaded alert value should be 85.0, got %.1f", loaded.Value)
}
})
t.Run("skips old alerts", func(t *testing.T) {
m := newTestManager(t)
// Create an old alert (>24 hours)
startTime := time.Now().Add(-25 * time.Hour)
alert := &Alert{
ID: "old-alert",
Type: "cpu",
Level: AlertLevelWarning,
ResourceID: "test-resource",
ResourceName: "test-vm",
StartTime: startTime,
LastSeen: startTime,
}
m.mu.Lock()
m.activeAlerts[alert.ID] = alert
m.mu.Unlock()
// Save to disk
_ = m.SaveActiveAlerts()
// Clear in-memory map only (don't use ClearActiveAlerts which triggers async save)
m.mu.Lock()
m.activeAlerts = make(map[string]*Alert)
m.mu.Unlock()
err := m.LoadActiveAlerts()
if err != nil {
t.Fatalf("failed to load alerts: %v", err)
}
m.mu.RLock()
_, exists := m.activeAlerts["old-alert"]
m.mu.RUnlock()
if exists {
t.Error("old alert (>24h) should be skipped during load")
}
})
t.Run("skips old acknowledged alerts", func(t *testing.T) {
m := newTestManager(t)
// Create an alert acknowledged >1 hour ago
startTime := time.Now().Add(-30 * time.Minute)
ackTime := time.Now().Add(-2 * time.Hour)
alert := &Alert{
ID: "old-ack-alert",
Type: "cpu",
Level: AlertLevelWarning,
ResourceID: "test-resource",
ResourceName: "test-vm",
StartTime: startTime,
LastSeen: time.Now(),
Acknowledged: true,
AckTime: &ackTime,
AckUser: "testuser",
}
m.mu.Lock()
m.activeAlerts[alert.ID] = alert
m.mu.Unlock()
// Save to disk
_ = m.SaveActiveAlerts()
// Clear in-memory map only (don't use ClearActiveAlerts which triggers async save)
m.mu.Lock()
m.activeAlerts = make(map[string]*Alert)
m.mu.Unlock()
err := m.LoadActiveAlerts()
if err != nil {
t.Fatalf("failed to load alerts: %v", err)
}
m.mu.RLock()
_, exists := m.activeAlerts["old-ack-alert"]
ackRecord, ackExists := m.ackState["old-ack-alert"]
m.mu.RUnlock()
if exists {
t.Error("old acknowledged alert (>1h) should be skipped from activeAlerts")
}
// But ackState should be preserved so the alert doesn't retrigger if it reappears
if !ackExists {
t.Error("ackState should be preserved for old acknowledged alerts to prevent retriggering")
}
if ackExists && !ackRecord.acknowledged {
t.Error("ackState.acknowledged should be true")
}
if ackExists && ackRecord.user != "testuser" {
t.Errorf("ackState.user should be 'testuser', got %q", ackRecord.user)
}
})
t.Run("restores acknowledgment state", func(t *testing.T) {
m := newTestManager(t)
// Create an acknowledged alert
startTime := time.Now().Add(-10 * time.Minute)
ackTime := time.Now().Add(-5 * time.Minute)
alert := &Alert{
ID: "ack-alert",
Type: "cpu",
Level: AlertLevelWarning,
ResourceID: "test-resource",
ResourceName: "test-vm",
StartTime: startTime,
LastSeen: time.Now(),
Acknowledged: true,
AckTime: &ackTime,
AckUser: "testuser",
}
m.mu.Lock()
m.activeAlerts[alert.ID] = alert
m.mu.Unlock()
// Save to disk
_ = m.SaveActiveAlerts()
// Clear in-memory maps only (don't use ClearActiveAlerts which triggers async save)
m.mu.Lock()
m.activeAlerts = make(map[string]*Alert)
m.ackState = make(map[string]ackRecord)
m.mu.Unlock()
err := m.LoadActiveAlerts()
if err != nil {
t.Fatalf("failed to load alerts: %v", err)
}
m.mu.RLock()
loaded, exists := m.activeAlerts["ack-alert"]
ackRecord, hasAckRecord := m.ackState["ack-alert"]
m.mu.RUnlock()
if !exists {
t.Fatal("acknowledged alert should be loaded")
}
if !loaded.Acknowledged {
t.Error("loaded alert should be acknowledged")
}
if loaded.AckUser != "testuser" {
t.Errorf("loaded alert AckUser should be testuser, got %s", loaded.AckUser)
}
if !hasAckRecord {
t.Error("ackState should be restored for acknowledged alert")
}
if !ackRecord.acknowledged {
t.Error("ackState should show acknowledged=true")
}
})
t.Run("invalid JSON returns error", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
// Write invalid JSON to the alerts file
alertsDir := filepath.Join(utils.GetDataDir(), "alerts")
if err := os.MkdirAll(alertsDir, 0755); err != nil {
t.Fatalf("failed to create alerts dir: %v", err)
}
alertsFile := filepath.Join(alertsDir, "active-alerts.json")
if err := os.WriteFile(alertsFile, []byte("invalid json"), 0644); err != nil {
t.Fatalf("failed to write invalid json: %v", err)
}
err := m.LoadActiveAlerts()
if err == nil {
t.Error("expected error for invalid JSON")
}
})
t.Run("skips duplicate alerts", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
// Write JSON with duplicate alert IDs
alertsDir := filepath.Join(utils.GetDataDir(), "alerts")
if err := os.MkdirAll(alertsDir, 0755); err != nil {
t.Fatalf("failed to create alerts dir: %v", err)
}
startTime := time.Now().Add(-10 * time.Minute)
alerts := []Alert{
{ID: "dup-alert", Type: "cpu", StartTime: startTime, LastSeen: time.Now()},
{ID: "dup-alert", Type: "memory", StartTime: startTime, LastSeen: time.Now()},
}
data, _ := json.Marshal(alerts)
alertsFile := filepath.Join(alertsDir, "active-alerts.json")
if err := os.WriteFile(alertsFile, data, 0644); err != nil {
t.Fatalf("failed to write alerts json: %v", err)
}
err := m.LoadActiveAlerts()
if err != nil {
t.Fatalf("failed to load alerts: %v", err)
}
m.mu.RLock()
alert, exists := m.activeAlerts["dup-alert"]
m.mu.RUnlock()
if !exists {
t.Fatal("alert should exist after load")
}
// First one wins
if alert.Type != "cpu" {
t.Errorf("first alert should win, got type %s", alert.Type)
}
})
t.Run("migrates legacy guest alert resource IDs to canonical format", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
alertsDir := filepath.Join(utils.GetDataDir(), "alerts")
if err := os.MkdirAll(alertsDir, 0755); err != nil {
t.Fatalf("failed to create alerts dir: %v", err)
}
startTime := time.Now().Add(-10 * time.Minute)
legacyResourceID := "pve1-100"
canonicalResourceID := BuildGuestKey("pve1", "node1", 100)
alerts := []Alert{
{
ID: legacyResourceID + "-cpu",
Type: "cpu",
Level: AlertLevelWarning,
ResourceID: legacyResourceID,
ResourceName: "test-vm",
Node: "node1",
Instance: "pve1",
StartTime: startTime,
LastSeen: time.Now(),
},
}
data, _ := json.Marshal(alerts)
alertsFile := filepath.Join(alertsDir, "active-alerts.json")
if err := os.WriteFile(alertsFile, data, 0644); err != nil {
t.Fatalf("failed to write alerts json: %v", err)
}
err := m.LoadActiveAlerts()
if err != nil {
t.Fatalf("failed to load alerts: %v", err)
}
m.mu.RLock()
alert, exists := m.activeAlerts[canonicalResourceID+"-cpu"]
_, oldExists := m.activeAlerts[legacyResourceID+"-cpu"]
m.mu.RUnlock()
if !exists {
t.Fatal("expected canonical guest alert to be loaded")
}
if oldExists {
t.Fatal("expected legacy guest alert ID to be replaced")
}
if alert.ResourceID != canonicalResourceID {
t.Fatalf("expected resource ID %q, got %q", canonicalResourceID, alert.ResourceID)
}
if alert.ID != canonicalResourceID+"-cpu" {
t.Fatalf("expected alert ID %q, got %q", canonicalResourceID+"-cpu", alert.ID)
}
})
}
// TestNamespaceMatchesInstance drives namespaceMatchesInstance through a table
// of namespace/instance pairs and verifies the boolean it reports for each one.
// Every table entry runs as its own subtest, named after the entry's label.
func TestNamespaceMatchesInstance(t *testing.T) {
	// matchCase describes one namespace/instance pair and the expected verdict.
	type matchCase struct {
		label     string
		namespace string
		instance  string
		want      bool
	}

	cases := []matchCase{
		// Identical names.
		{label: "exact match", namespace: "pve", instance: "pve", want: true},
		{label: "exact match with numbers", namespace: "pve1", instance: "pve1", want: true},

		// The namespace forms the tail of the instance name.
		{label: "namespace suffix of instance", namespace: "nat", instance: "pve-nat", want: true},
		{label: "namespace suffix of instance no dash", namespace: "nat", instance: "pvenat", want: true},

		// The instance forms (or fails to form) the tail of the namespace.
		// "pve" is a prefix, not a suffix, of "pvebackups".
		{label: "instance suffix of namespace", namespace: "pvebackups", instance: "pve", want: false},
		// "pve" is the tail of "backupspve".
		{label: "instance suffix of namespace 2", namespace: "backupspve", instance: "pve", want: true},

		// Names differing only in letter case are expected to match.
		{label: "case insensitive exact", namespace: "PVE", instance: "pve", want: true},
		{label: "case insensitive suffix", namespace: "NAT", instance: "pve-nat", want: true},

		// Dashes and underscores are expected not to affect the comparison.
		{label: "special chars in namespace", namespace: "pve_nat", instance: "pvenat", want: true},
		{label: "special chars in instance", namespace: "pvenat", instance: "pve-nat", want: true},
		{label: "both have special chars", namespace: "pve-1", instance: "pve_1", want: true},

		// Substring relationships that are not suffix relationships must not match.
		// "production" sits in the middle of "my-production-server", not at its end.
		{label: "no match substring not suffix", namespace: "production", instance: "my-production-server", want: false},
		// "pve" is the start, not the end, of "pve-nat".
		{label: "no match pve not suffix of pvenat", namespace: "pve", instance: "pve-nat", want: false},

		// Unrelated names.
		{label: "no match", namespace: "production", instance: "staging", want: false},
		{label: "no match different names", namespace: "pve1", instance: "pve2", want: false},
		{label: "no match partial mismatch", namespace: "abc", instance: "xyz", want: false},

		// An empty string on either side never matches.
		{label: "empty namespace", namespace: "", instance: "pve", want: false},
		{label: "empty instance", namespace: "pve", instance: "", want: false},
		{label: "both empty", namespace: "", instance: "", want: false},

		// Real-world scenarios from issue #1095.
		{label: "pve namespace with pve instance", namespace: "pve", instance: "pve", want: true},
		{label: "nat namespace with pve-nat instance", namespace: "nat", instance: "pve-nat", want: true},
		{label: "pve1 namespace with pve1 instance", namespace: "pve1", instance: "pve1", want: true},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.label, func(t *testing.T) {
			got := namespaceMatchesInstance(tc.namespace, tc.instance)
			if got == tc.want {
				return
			}
			t.Errorf("namespaceMatchesInstance(%q, %q) = %v, want %v",
				tc.namespace, tc.instance, got, tc.want)
		})
	}
}