package alerts

import (
	"encoding/json"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"reflect"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/internal/utils"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
)

// testEnvMu protects concurrent access to PULSE_DATA_DIR during parallel tests.
// Tests using newTestManager are effectively serialized because the Manager
// calls GetDataDir() repeatedly (not just at creation time).
var testEnvMu sync.Mutex

// newTestManager creates a Manager with an isolated temp directory for testing.
// It uses os.Setenv with a mutex to safely handle parallel tests that call
// t.Parallel() before invoking this function (t.Setenv cannot be used after
// t.Parallel).
//
// IMPORTANT: The mutex is held for the entire duration of the test because the
// Manager calls GetDataDir() not just at creation time, but also during operations
// like SaveActiveAlerts() and LoadActiveAlerts(). This effectively serializes
// tests that use newTestManager, but ensures correct isolation.
func newTestManager(t *testing.T) *Manager {
	t.Helper()

	tmpDir := t.TempDir()

	testEnvMu.Lock()
	oldVal, hadOld := os.LookupEnv("PULSE_DATA_DIR")
	os.Setenv("PULSE_DATA_DIR", tmpDir)

	m := NewManager()

	// Restore the env var and release the mutex when the test completes.
	// We also stop the history manager's background goroutines (but not the
	// full manager Stop, which includes a 100ms sleep) to prevent writes to
	// the temp directory after the test completes.
	t.Cleanup(func() {
		// Stop the history manager to halt background save routines.
		m.historyManager.Stop()
		// Close the escalation channel to stop that goroutine too.
		select {
		case <-m.escalationStop:
			// Already closed
		default:
			close(m.escalationStop)
		}
		// Close the cleanup channel.
		select {
		case <-m.cleanupStop:
			// Already closed
		default:
			close(m.cleanupStop)
		}
		// Brief pause to let goroutines finish any in-flight operations.
		// Under heavy parallel test load (full suite), 10ms is not enough.
		time.Sleep(50 * time.Millisecond)

		if hadOld {
			os.Setenv("PULSE_DATA_DIR", oldVal)
		} else {
			os.Unsetenv("PULSE_DATA_DIR")
		}
		testEnvMu.Unlock()
	})

	return m
}

func TestAcknowledgePersistsThroughCheckMetric(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()
	// Set config fields directly to bypass UpdateConfig's default value enforcement
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.config.SuppressionWindow = 0
	m.config.MinimumDelta = 0
	m.mu.Unlock()

	threshold := &HysteresisThreshold{Trigger: 80, Clear: 70}
	m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "usage", 90, threshold, nil)
	if _, exists := m.activeAlerts["res1-usage"]; !exists {
		t.Fatalf("expected alert to be created")
	}

	if err := m.AcknowledgeAlert("res1-usage", "tester"); err != nil {
		t.Fatalf("ack failed: %v", err)
	}

	if !m.activeAlerts["res1-usage"].Acknowledged {
		t.Fatalf("acknowledged flag not set")
	}

	alerts := m.GetActiveAlerts()
	if len(alerts) != 1 || !alerts[0].Acknowledged {
		t.Fatalf("GetActiveAlerts lost acknowledgement")
	}

	m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "usage", 85, threshold, nil)
	if !m.activeAlerts["res1-usage"].Acknowledged {
		t.Fatalf("acknowledged flag lost after update")
	}
}

func TestCheckMetricClearsAlertWhenThresholdDisabled(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.config.SuppressionWindow = 0
	m.config.MinimumDelta = 0
	m.mu.Unlock()

	// First, create an active alert with an enabled threshold
	threshold := &HysteresisThreshold{Trigger: 80, Clear: 70}
	m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, threshold, nil)

	m.mu.RLock()
	_, exists := m.activeAlerts["res1-memory"]
	m.mu.RUnlock()
	if !exists {
		t.Fatalf("expected alert to be created")
	}

	// Now call checkMetric with a disabled threshold (Trigger=0) — should clear the alert
	disabledThreshold := &HysteresisThreshold{Trigger: 0, Clear: 0}
	m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, disabledThreshold, nil)

	m.mu.RLock()
	_, stillExists := m.activeAlerts["res1-memory"]
	m.mu.RUnlock()
	if stillExists {
		t.Errorf("expected alert to be cleared when threshold is disabled (Trigger=0)")
	}

	// Also test with nil threshold
	// Re-create the alert
	m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, threshold, nil)
	m.mu.RLock()
	_, exists = m.activeAlerts["res1-memory"]
	m.mu.RUnlock()
	if !exists {
		t.Fatalf("expected alert to be re-created")
	}

	// Call with nil threshold — should also clear
	m.checkMetric("res1", "Resource", "node1", "inst1", "guest", "memory", 90, nil, nil)

	m.mu.RLock()
	_, stillExists = m.activeAlerts["res1-memory"]
	m.mu.RUnlock()
	if stillExists {
		t.Errorf("expected alert to be cleared when threshold is nil")
	}
}

func TestGetActiveAlertsKeepsInstanceScopedNodeDisplayNames(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.config.SuppressionWindow = 0
	m.config.MinimumDelta = 0
	m.mu.Unlock()

	threshold := &HysteresisThreshold{Trigger: 80, Clear: 70}

	m.UpdateNodeDisplayName("cluster-a", "pve", "Alpha")
	m.UpdateNodeDisplayName("cluster-b", "pve", "Beta")

	m.checkMetric("guest-a", "vm-a", "pve", "cluster-a", "guest", "cpu", 90, threshold, nil)
	m.checkMetric("guest-b", "vm-b", "pve", "cluster-b", "guest", "cpu", 91, threshold, nil)

	m.UpdateNodeDisplayName("cluster-a", "pve", "Alpha Updated")
	m.checkMetric("guest-a", "vm-a", "pve", "cluster-a", "guest", "cpu", 92, threshold, nil)
	m.checkMetric("guest-b", "vm-b", "pve", "cluster-b", "guest", "cpu", 93, threshold, nil)

	gotByID := make(map[string]Alert)
	for _, alert := range m.GetActiveAlerts() {
		gotByID[alert.ID] = alert
	}

	if got := gotByID["guest-a-cpu"].NodeDisplayName; got != "Alpha Updated" {
		t.Fatalf("guest-a NodeDisplayName = %q, want %q", got, "Alpha Updated")
	}
	if got := gotByID["guest-b-cpu"].NodeDisplayName; got != "Beta" {
		t.Fatalf("guest-b NodeDisplayName = %q, want %q", got, "Beta")
	}
}

func TestCheckGuestSkipsAlertsWhenMetricDisabled(t *testing.T) {
	m := newTestManager(t)

	vmID := "instance-node-101"
	instanceName := "instance"

	// Start with default configuration to allow CPU alerts.
	initialConfig := AlertConfig{
		Enabled: true,
		GuestDefaults: ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
		},
		TimeThreshold:  0,
		TimeThresholds: map[string]int{},
		NodeDefaults: ThresholdConfig{
			CPU:    &HysteresisThreshold{Trigger: 80, Clear: 75},
			Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
			Disk:   &HysteresisThreshold{Trigger: 90, Clear: 85},
		},
		StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
		Overrides:      make(map[string]ThresholdConfig),
	}
	m.UpdateConfig(initialConfig)
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.config.ActivationState = ActivationActive
	m.mu.Unlock()

	var dispatched []*Alert
	done := make(chan struct{}, 1)
	var resolved []string
	resolvedDone := make(chan struct{}, 1)
	m.SetAlertCallback(func(alert *Alert) {
		dispatched = append(dispatched, alert)
		select {
		case done <- struct{}{}:
		default:
		}
	})
	m.SetResolvedCallback(func(alertID string) {
		resolved = append(resolved, alertID)
		select {
		case resolvedDone <- struct{}{}:
		default:
		}
	})

	vm := models.VM{
		ID:       vmID,
		Name:     "test-vm",
		Node:     "node",
		Instance: instanceName,
		Status:   "running",
		CPU:      1.0, // 100% once multiplied by 100 inside CheckGuest
		Memory: models.Memory{
			Usage: 65,
		},
		Disk: models.Disk{
			Usage: 40,
		},
	}

	// Initial check should trigger an alert with default thresholds.
	m.CheckGuest(vm, instanceName)
	select {
	case <-done:
	case <-time.After(100 * time.Millisecond):
		t.Fatalf("did not receive initial alert dispatch")
	}
	if len(dispatched) != 1 {
		t.Fatalf("expected 1 alert before disabling metric, got %d", len(dispatched))
	}

	// Apply override disabling CPU alerts for this VM.
	disabledConfig := initialConfig
	disabledConfig.Overrides = map[string]ThresholdConfig{
		vmID: {
			CPU: &HysteresisThreshold{Trigger: -1, Clear: 0},
		},
	}
	disabledConfig.TimeThreshold = 0
	disabledConfig.TimeThresholds = map[string]int{}
	m.UpdateConfig(disabledConfig)
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.config.ActivationState = ActivationActive
	m.mu.Unlock()

	// Clear dispatched slice to capture only post-disable notifications.
	dispatched = dispatched[:0]
	done = make(chan struct{}, 1)

	// Re-run evaluation with high CPU; no alert should be dispatched.
	m.CheckGuest(vm, instanceName)
	select {
	case <-done:
		t.Fatalf("expected no alerts after disabling CPU metric, but callback fired")
	case <-time.After(100 * time.Millisecond):
		// No callback fired, as expected.
	}

	// Active alerts should be cleared by the config update.
	m.mu.RLock()
	activeCount := len(m.activeAlerts)
	m.mu.RUnlock()
	if activeCount != 0 {
		t.Fatalf("expected active alerts to be cleared after disabling metric, got %d", activeCount)
	}

	select {
	case <-resolvedDone:
	case <-time.After(100 * time.Millisecond):
		t.Fatalf("expected resolved callback to fire after disabling metric")
	}
	if len(resolved) != 1 || resolved[0] != fmt.Sprintf("%s-cpu", vmID) {
		t.Fatalf("expected resolved callback for %s-cpu, got %v", vmID, resolved)
	}

	m.mu.RLock()
	_, isPending := m.pendingAlerts[fmt.Sprintf("%s-cpu", vmID)]
	m.mu.RUnlock()
	if isPending {
		t.Fatalf("expected pending alert entry to be cleared after disabling metric")
	}
}

func TestPulseNoAlertsSuppressesGuestAlerts(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.config.ActivationState = ActivationActive
	m.mu.Unlock()

	var dispatched int
	m.SetAlertCallback(func(alert *Alert) {
		dispatched++
	})

	vm := models.VM{
		ID:       "inst/qemu/101",
		Name:     "test-vm",
		Node:     "node1",
		Instance: "inst",
		Status:   "running",
		CPU:      1.0,
		Memory: models.Memory{
			Usage: 95,
		},
		Disk: models.Disk{
			Usage: 95,
		},
		Tags: []string{"pulse-no-alerts"},
	}

	m.CheckGuest(vm, "inst")

	if dispatched != 0 {
		t.Fatalf("expected no alert dispatch, got %d", dispatched)
	}

	if alerts := m.GetActiveAlerts(); len(alerts) != 0 {
		t.Fatalf("expected no active alerts, got %d", len(alerts))
	}
}

func TestPulseMonitorOnlySkipsDispatchButRetainsAlert(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.config.ActivationState = ActivationActive
	m.mu.Unlock()

	var dispatched int
	m.SetAlertCallback(func(alert *Alert) {
		dispatched++
	})

	vm := models.VM{
		ID:       "inst/qemu/102",
		Name:     "monitor-vm",
		Node:     "node1",
		Instance: "inst",
		Status:   "running",
		CPU:      1.0,
		Memory:   models.Memory{Usage: 90},
		Disk:     models.Disk{Usage: 50},
		Tags:     []string{"pulse-monitor-only"},
	}

	m.CheckGuest(vm, "inst")

	if dispatched != 0 {
		t.Fatalf("expected monitor-only alert to skip dispatch, got %d callbacks", dispatched)
	}

	alerts := m.GetActiveAlerts()
	if len(alerts) == 0 {
		t.Fatalf("expected monitor-only alert to remain active")
	}

	if alerts[0].Metadata == nil || alerts[0].Metadata["monitorOnly"] != true {
		t.Fatalf("expected alert metadata to mark monitorOnly, got %+v", alerts[0].Metadata)
	}
}

func TestPulseRelaxedThresholdsIncreaseCpuTrigger(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.mu.Unlock()

	vm := models.VM{
		ID:       "inst/qemu/103",
		Name:     "relaxed-vm",
		Node:     "node1",
		Instance: "inst",
		Status:   "running",
		CPU:      0.9, // 90%
		Memory:   models.Memory{Usage: 60},
		Disk:     models.Disk{Usage: 40},
		Tags:     []string{"pulse-relaxed"},
	}

	m.CheckGuest(vm, "inst")

	if alerts := m.GetActiveAlerts(); len(alerts) != 0 {
		t.Fatalf("expected no alerts at 90%% CPU with relaxed thresholds, got %d", len(alerts))
	}

	vm.CPU = 1.0
	m.CheckGuest(vm, "inst")

	if alerts := m.GetActiveAlerts(); len(alerts) == 0 {
		t.Fatalf("expected alert once CPU exceeds relaxed threshold")
	}
}

func TestClearAlertMarksResolutionAndReturnsStatus(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.mu.Unlock()

	vm := models.VM{
		ID:       "inst/qemu/104",
		Name:     "clear-vm",
		Node:     "node1",
		Instance: "inst",
		Status:   "running",
		CPU:      1.0,
		Memory:   models.Memory{Usage: 80},
		Disk:     models.Disk{Usage: 80},
	}

	m.CheckGuest(vm, "inst")
	alerts := m.GetActiveAlerts()
	if len(alerts) == 0 {
		t.Fatalf("expected alert to be active before clearing")
	}

	alertID := alerts[0].ID
	if ok := m.ClearAlert(alertID); !ok {
		t.Fatalf("expected manual clear to succeed")
	}

	if remaining := m.GetActiveAlerts(); len(remaining) != 0 {
		t.Fatalf("expected no active alerts after clear, found %d", len(remaining))
	}

	resolved := m.GetRecentlyResolved()
	if len(resolved) == 0 || resolved[0].Alert.ID != alertID {
		t.Fatalf("expected alert %s to be tracked as recently resolved", alertID)
	}

	if ok := m.ClearAlert(alertID); ok {
		t.Fatalf("expected second clear to report missing alert")
	}
}

func TestHandleDockerHostRemovedClearsAlertsAndTracking(t *testing.T) {
	m := newTestManager(t)
	host := models.DockerHost{ID: "host1", DisplayName: "Host One", Hostname: "host-one"}
	containerResourceID := "docker:host1/container1"
	containerAlertID := "docker-container-state-" + containerResourceID
	hostAlertID := "docker-host-offline-host1"

	m.mu.Lock()
	m.activeAlerts[hostAlertID] = &Alert{ID: hostAlertID, ResourceID: "docker:host1"}
	m.activeAlerts[containerAlertID] = &Alert{ID: containerAlertID, ResourceID: containerResourceID}
	m.dockerOfflineCount[host.ID] = 2
	m.dockerStateConfirm[containerResourceID] = 1
	m.dockerRestartTracking[containerResourceID] = &dockerRestartRecord{}
	m.dockerLastExitCode[containerResourceID] = 137
	m.mu.Unlock()

	m.HandleDockerHostRemoved(host)

	m.mu.RLock()
	defer m.mu.RUnlock()

	if _, exists := m.activeAlerts[containerAlertID]; exists {
		t.Fatalf("expected container alerts to be cleared")
	}
	if _, exists := m.activeAlerts[hostAlertID]; exists {
		t.Fatalf("expected host offline alert to be cleared")
	}
	if _, exists := m.dockerOfflineCount[host.ID]; exists {
		t.Fatalf("expected offline tracking to be cleared")
	}
	if _, exists := m.dockerStateConfirm[containerResourceID]; exists {
		t.Fatalf("expected state confirmation to be cleared")
	}
	if _, exists := m.dockerRestartTracking[containerResourceID]; exists {
		t.Fatalf("expected restart tracking to be cleared")
	}
	if _, exists := m.dockerLastExitCode[containerResourceID]; exists {
		t.Fatalf("expected last exit code tracking to be cleared")
	}
}

func TestCheckHostGeneratesMetricAlerts(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.mu.Unlock()

	host := models.Host{
		ID:          "host-1",
		DisplayName: "Test Host",
		Hostname:    "host-1.example",
		Platform:    "linux",
		OSName:      "ubuntu",
		CPUUsage:    95,
		CPUCount:    8,
		Memory: models.Memory{
			Usage: 92,
			Total: 16384,
			Used:  15000,
			Free:  1384,
		},
		Disks: []models.Disk{
			{
				Mountpoint: "/",
				Usage:      93,
				Total:      100,
				Used:       93,
				Free:       7,
			},
		},
		Status:          "online",
		IntervalSeconds: 30,
		LastSeen:        time.Now(),
		Tags:            []string{"prod"},
	}

	m.CheckHost(host)

	m.mu.RLock()
	defer m.mu.RUnlock()

	cpuAlertID := fmt.Sprintf("%s-cpu", hostResourceID(host.ID))
	if _, exists := m.activeAlerts[cpuAlertID]; !exists {
		t.Fatalf("expected CPU alert %q to be active", cpuAlertID)
	}

	memAlertID := fmt.Sprintf("%s-memory", hostResourceID(host.ID))
	if _, exists := m.activeAlerts[memAlertID]; !exists {
		t.Fatalf("expected memory alert %q to be active", memAlertID)
	}

	diskResourceID, _ := hostDiskResourceID(host, host.Disks[0])
	diskAlertID := fmt.Sprintf("%s-disk", diskResourceID)
	if _, exists := m.activeAlerts[diskAlertID]; !exists {
		t.Fatalf("expected disk alert %q to be active", diskAlertID)
	}
}

func TestHandleHostOfflineRequiresConfirmations(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()
	host := models.Host{ID: "host-2", DisplayName: "Second Host", Hostname: "host-two"}
	alertID := fmt.Sprintf("host-offline-%s", host.ID)
	resourceKey := hostResourceID(host.ID)

	m.HandleHostOffline(host)
	m.mu.RLock()
	if _, exists := m.activeAlerts[alertID]; exists {
		t.Fatalf("expected no alert after first offline detection")
	}
	if count := m.offlineConfirmations[resourceKey]; count != 1 {
		t.Fatalf("expected confirmation count to be 1, got %d", count)
	}
	m.mu.RUnlock()

	m.HandleHostOffline(host)
	m.mu.RLock()
	if _, exists := m.activeAlerts[alertID]; exists {
		t.Fatalf("expected no alert after second offline detection")
	}
	if count := m.offlineConfirmations[resourceKey]; count != 2 {
		t.Fatalf("expected confirmation count to be 2, got %d", count)
	}
	m.mu.RUnlock()

	m.HandleHostOffline(host)
	m.mu.RLock()
	if _, exists := m.activeAlerts[alertID]; !exists {
		t.Fatalf("expected alert %q after third offline detection", alertID)
	}
	m.mu.RUnlock()

	m.HandleHostOnline(host)
	m.mu.RLock()
	if _, exists := m.activeAlerts[alertID]; exists {
		t.Fatalf("expected offline alert %q to be cleared after host online", alertID)
	}
	if _, exists := m.offlineConfirmations[resourceKey]; exists {
		t.Fatalf("expected offline confirmations to be cleared when host online")
	}
	m.mu.RUnlock()
}

func TestCheckHostDisabledOverrideClearsAlerts(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.mu.Unlock()

	host := models.Host{
		ID:          "host-3",
		DisplayName: "Override Host",
		Hostname:    "override.example",
		CPUUsage:    90,
		Memory: models.Memory{
			Usage: 91,
			Total: 16000,
			Used:  14560,
			Free:  1440,
		},
		Disks: []models.Disk{
			{Mountpoint: "/data", Usage: 92, Total: 200, Used: 184, Free: 16},
		},
		Status:          "online",
		IntervalSeconds: 30,
		LastSeen:        time.Now(),
	}

	m.CheckHost(host)

	m.mu.RLock()
	if len(m.activeAlerts) == 0 {
		m.mu.RUnlock()
		t.Fatalf("expected active alerts prior to disabling host overrides")
	}
	m.mu.RUnlock()

	cfg := m.GetConfig()
	cfg.Overrides = map[string]ThresholdConfig{
		host.ID: {
			Disabled: true,
		},
	}
	m.UpdateConfig(cfg)
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.mu.Unlock()

	m.CheckHost(host)

	m.mu.RLock()
	defer m.mu.RUnlock()
	if len(m.activeAlerts) != 0 {
		t.Fatalf("expected all host alerts to be cleared after disabling override, got %d", len(m.activeAlerts))
	}
}

func TestCheckSnapshotsForInstanceCreatesAndClearsAlerts(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	cfg := AlertConfig{
		Enabled:        true,
		StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
		SnapshotDefaults: SnapshotAlertConfig{
			Enabled:         true,
			WarningDays:     7,
			CriticalDays:    14,
			WarningSizeGiB:  0,
			CriticalSizeGiB: 0,
		},
		Overrides: make(map[string]ThresholdConfig),
	}
	m.UpdateConfig(cfg)
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.mu.Unlock()

	now := time.Now()
	snapshots := []models.GuestSnapshot{
		{
			ID:        "inst-node-100-weekly",
			Name:      "weekly",
			Node:      "node",
			Instance:  "inst",
			Type:      "qemu",
			VMID:      100,
			Time:      now.Add(-15 * 24 * time.Hour),
			SizeBytes: 60 << 30,
		},
	}
	guestNames := map[string]string{
		"inst:node:100": "app-server",
	}

	m.CheckSnapshotsForInstance("inst", snapshots, guestNames)

	m.mu.RLock()
	alert, exists := m.activeAlerts["snapshot-age-inst-node-100-weekly"]
	m.mu.RUnlock()
	if !exists {
		t.Fatalf("expected snapshot age alert to be created")
	}
	if alert.Level != AlertLevelCritical {
		t.Fatalf("expected critical level for old snapshot, got %s", alert.Level)
	}
	if alert.ResourceName != "app-server snapshot 'weekly'" {
		t.Fatalf("unexpected resource name: %s", alert.ResourceName)
	}

	m.CheckSnapshotsForInstance("inst", nil, guestNames)

	m.mu.RLock()
	_, exists = m.activeAlerts["snapshot-age-inst-node-100-weekly"]
	m.mu.RUnlock()
	if exists {
		t.Fatalf("expected snapshot alert to be cleared when snapshot missing")
	}
}

func TestCheckSnapshotsRespectsOverrides(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	cfg := AlertConfig{
		Enabled: true,
		SnapshotDefaults: SnapshotAlertConfig{
			Enabled:      true,
			WarningDays:  7,
			CriticalDays: 14,
		},
	}
	m.UpdateConfig(cfg)
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	now := time.Now()
	snapshots := []models.GuestSnapshot{
		{
			ID:       "inst:node:100:weekly",
			Name:     "weekly",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     100,
			Time:     now.Add(-10 * 24 * time.Hour), // Triggers Warning (10 > 7)
		},
	}
	resourceKey := "inst:node:100"
	guestNames := map[string]string{
		resourceKey: "app-server",
	}

	// 1. Verify warning alert is created
	m.CheckSnapshotsForInstance("inst", snapshots, guestNames)
	m.mu.RLock()
	alert, exists := m.activeAlerts["snapshot-age-inst:node:100:weekly"]
	m.mu.RUnlock()
	if !exists {
		t.Fatalf("expected snapshot warning alert")
	}
	if alert.Level != AlertLevelWarning {
		t.Fatalf("expected warning alert, got %s", alert.Level)
	}

	// 2. Disable via override
	cfg = m.GetConfig()
	cfg.Overrides = map[string]ThresholdConfig{
		"inst:node:100": {
			Snapshot: &SnapshotAlertConfig{Enabled: false},
		},
	}
	m.UpdateConfig(cfg)
	m.CheckSnapshotsForInstance("inst", snapshots, guestNames)
	m.mu.RLock()
	_, exists = m.activeAlerts["snapshot-age-inst:node:100:weekly"]
	m.mu.RUnlock()
	if exists {
		t.Fatalf("expected snapshot alert to be suppressed by override")
	}
}

func TestCheckSnapshotsForInstanceTriggersOnSnapshotSize(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	cfg := AlertConfig{
		Enabled:        true,
		StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
		SnapshotDefaults: SnapshotAlertConfig{
			Enabled:         true,
			WarningDays:     0,
			CriticalDays:    0,
			WarningSizeGiB:  50,
			CriticalSizeGiB: 100,
		},
		Overrides: make(map[string]ThresholdConfig),
	}
	m.UpdateConfig(cfg)
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.mu.Unlock()

	now := time.Now()
	snapshots := []models.GuestSnapshot{
		{
			ID:        "inst-node-200-sizey",
			Name:      "pre-maintenance",
			Node:      "node",
			Instance:  "inst",
			Type:      "qemu",
			VMID:      200,
			Time:      now.Add(-2 * time.Hour),
			SizeBytes: int64(120) << 30,
		},
	}
	guestNames := map[string]string{
		"inst:node:200": "db-server",
	}

	m.CheckSnapshotsForInstance("inst", snapshots, guestNames)

	m.mu.RLock()
	alert, exists := m.activeAlerts["snapshot-age-inst-node-200-sizey"]
	m.mu.RUnlock()
	if !exists {
		t.Fatalf("expected snapshot size alert to be created")
	}
	if alert.Level != AlertLevelCritical {
		t.Fatalf("expected critical level for large snapshot, got %s", alert.Level)
	}
	if alert.Value < 119.5 || alert.Value > 120.5 {
		t.Fatalf("expected alert value near 120 GiB, got %.2f", alert.Value)
	}
	if alert.Threshold != 100 {
		t.Fatalf("expected threshold 100 GiB, got %.2f", alert.Threshold)
	}
	if alert.Metadata == nil {
		t.Fatalf("expected metadata for snapshot alert")
	}
	if metric, ok := alert.Metadata["primaryMetric"].(string); !ok || metric != "size" {
		t.Fatalf("expected primary metric size, got %#v", alert.Metadata["primaryMetric"])
	}
	if sizeBytes, ok := alert.Metadata["snapshotSizeBytes"].(int64); !ok || sizeBytes == 0 {
		t.Fatalf("expected snapshotSizeBytes in metadata")
	}
	metrics, ok := alert.Metadata["triggeredMetrics"].([]string)
	if !ok {
		t.Fatalf("expected triggeredMetrics slice, got %#v", alert.Metadata["triggeredMetrics"])
	}
	foundSize := false
	for _, metric := range metrics {
		if metric == "size" {
			foundSize = true
			break
		}
	}
	if !foundSize {
		t.Fatalf("expected size metric recorded in metadata")
	}
}

func TestCheckSnapshotsForInstanceIncludesAgeAndSizeReasons(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	cfg := AlertConfig{
		Enabled:        true,
		StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
		SnapshotDefaults: SnapshotAlertConfig{
			Enabled:         true,
			WarningDays:     5,
			CriticalDays:    10,
			WarningSizeGiB:  40,
			CriticalSizeGiB: 80,
		},
		Overrides: make(map[string]ThresholdConfig),
	}
	m.UpdateConfig(cfg)
	m.mu.Lock()
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.mu.Unlock()

	now := time.Now()
	snapshots := []models.GuestSnapshot{
		{
			ID:        "inst-node-300-combined",
			Name:      "long-running",
			Node:      "node",
			Instance:  "inst",
			Type:      "qemu",
			VMID:      300,
			Time:      now.Add(-15 * 24 * time.Hour),
			SizeBytes: int64(90) << 30,
		},
	}
	guestNames := map[string]string{
		"inst:node:300": "app-server",
	}

	m.CheckSnapshotsForInstance("inst", snapshots, guestNames)

	m.mu.RLock()
	alert, exists := m.activeAlerts["snapshot-age-inst-node-300-combined"]
	m.mu.RUnlock()
	if !exists {
		t.Fatalf("expected combined snapshot alert to be created")
	}
	if alert.Level != AlertLevelCritical {
		t.Fatalf("expected critical level, got %s", alert.Level)
	}
	if !strings.Contains(alert.Message, "days old") || !strings.Contains(strings.ToLower(alert.Message), "gib") {
		t.Fatalf("expected alert message to reference age and size, got %q", alert.Message)
	}
	if alert.Metadata == nil {
		t.Fatalf("expected metadata for combined alert")
	}
	metrics, ok := alert.Metadata["triggeredMetrics"].([]string)
	if !ok {
		t.Fatalf("expected triggeredMetrics slice, got %#v", alert.Metadata["triggeredMetrics"])
	}
	if len(metrics) < 2 {
		t.Fatalf("expected both age and size metrics recorded, got %v", metrics)
	}
	if metric, ok := alert.Metadata["primaryMetric"].(string); !ok || metric != "age" {
		t.Fatalf("expected primary metric age, got %#v", alert.Metadata["primaryMetric"])
	}
}

func TestCheckBackupsCreatesAndClearsAlerts(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  7,
		CriticalDays: 14,
	}
	m.config.TimeThreshold = 0
	m.config.TimeThresholds = map[string]int{}
	m.mu.Unlock()

	now := time.Now()
	storageBackups := []models.StorageBackup{
		{
			ID:       "inst-node-100-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     100,
			Time:     now.Add(-15 * 24 * time.Hour),
		},
	}

	key := BuildGuestKey("inst", "node", 100)
	guestsByKey := map[string]GuestLookup{
		key: {
			ResourceID: "qemu/100",
			Name:       "app-server",
			Instance:   "inst",
			Node:       "node",
			Type:       "qemu",
			VMID:       100,
		},
	}
	guestsByVMID := map[string][]GuestLookup{
		"100": {guestsByKey[key]},
	}

	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	alert, exists := m.activeAlerts["backup-age-"+sanitizeAlertKey(key)]
	m.mu.RUnlock()
	if !exists {
		t.Fatalf("expected backup age alert to be created")
	}
	if alert.Level != AlertLevelCritical {
		t.Fatalf("expected critical backup alert, got %s", alert.Level)
	}

	// Recent backup clears alert
	storageBackups[0].Time = now
	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	_, exists = m.activeAlerts["backup-age-"+sanitizeAlertKey(key)]
	m.mu.RUnlock()
	if exists {
		t.Fatalf("expected backup-age alert to clear after fresh backup")
	}
}

func TestCheckBackupsRespectsOverrides(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  7,
		CriticalDays: 14,
	}
	m.config.TimeThreshold = 0
	m.mu.Unlock()

	now := time.Now()
	storageBackups := []models.StorageBackup{
		{
			ID:       "inst-node-100-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     100,
			Time:     now.Add(-10 * 24 * time.Hour), // Triggers Warning (10 > 7)
		},
	}

	key := BuildGuestKey("inst", "node", 100)
	resourceID := "inst:node:100"
	guestsByKey := map[string]GuestLookup{
		key: {
			ResourceID: resourceID,
			Name:       "app-server",
			Instance:   "inst",
			Node:       "node",
			Type:       "qemu",
			VMID:       100,
		},
	}
	guestsByVMID := map[string][]GuestLookup{
		"100": {guestsByKey[key]},
	}

	// 1. Verify warning alert is created with defaults
	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
	m.mu.RLock()
	alert, exists := m.activeAlerts["backup-age-"+sanitizeAlertKey(key)]
	m.mu.RUnlock()
	if !exists {
		t.Fatalf("expected backup warning alert")
	}
	if alert.Level != AlertLevelWarning {
		t.Fatalf("expected warning alert, got %s", alert.Level)
	}

	// 2. Apply override to disable backup alerts for this guest
	cfg := m.GetConfig()
	cfg.Overrides = map[string]ThresholdConfig{
		resourceID: {
			Backup: &BackupAlertConfig{Enabled: false},
		},
	}
	m.UpdateConfig(cfg)

	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
	m.mu.RLock()
	_, exists = m.activeAlerts["backup-age-"+sanitizeAlertKey(key)]
	m.mu.RUnlock()
	if exists {
		t.Fatalf("expected backup alert to be cleared/suppressed by override")
	}

	// 3. Apply override to change thresholds
	cfg.Overrides[resourceID] = ThresholdConfig{
		Backup: &BackupAlertConfig{
			Enabled:      true,
			WarningDays:  15, // 10 < 15, so no alert
			CriticalDays: 20,
		},
	}
	m.UpdateConfig(cfg)
	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
	m.mu.RLock()
	_, exists = m.activeAlerts["backup-age-"+sanitizeAlertKey(key)]
	m.mu.RUnlock()
	if exists {
		t.Fatalf("expected no backup alert with increased thresholds in override")
	}

	// 4. Test global guest disable
	cfg.Overrides[resourceID] = ThresholdConfig{
		Disabled: true,
	}
	m.UpdateConfig(cfg)
	storageBackups[0].Time = now.Add(-30 * 24 * time.Hour) // Way past defaults
	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
	m.mu.RLock()
	_, exists = m.activeAlerts["backup-age-"+sanitizeAlertKey(key)]
	m.mu.RUnlock()
	if exists {
		t.Fatalf("expected no backup alert for globally disabled guest")
	}
}

func TestCheckBackupsHandlesPbsOnlyGuests(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  3,
		CriticalDays: 5,
	}
	m.mu.Unlock()

	now := time.Now()
	pbsBackups := []models.PBSBackup{
		{
			ID:         "pbs-backup-999-0",
			Instance:   "pbs-main",
			Datastore:  "backup-store",
			BackupType: "qemu",
			VMID:       "999",
			BackupTime: now.Add(-6 * 24 * time.Hour),
		},
	}

	// Include a live sentinel guest so hasLiveInventory is true and orphan detection runs.
	sentinelKey := BuildGuestKey("inst", "snode", 9999)
	guestsByKey := map[string]GuestLookup{
		sentinelKey: {ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999},
	}
	guestsByVMID := map[string][]GuestLookup{
		"9999": {guestsByKey[sentinelKey]},
	}

	m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	found := false
	for id, alert := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-orphaned-") {
			found = true
			if alert.Level != AlertLevelWarning {
				t.Fatalf("expected PBS orphaned backup alert to be warning, got %s", alert.Level)
			}
			break
		}
	}
	m.mu.RUnlock()
	if !found {
		t.Fatalf("expected PBS orphaned backup alert to be created")
	}
}

func TestCheckBackupsDisambiguatesWithNamespace(t *testing.T) {
	// Test that when multiple guests have the same VMID from different instances,
	// the namespace is used to match the backup to the correct guest.
	// This addresses issue #1095, where users have multiple PVE instances with
	// overlapping VMIDs and separate PBS instances backing them up.
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  3,
		CriticalDays: 5,
	}
	m.mu.Unlock()

	now := time.Now()

	// Two guests with the same VMID (100) but on different instances
	guestsByKey := map[string]GuestLookup{
		"pve-node1-100": {
			ResourceID: "qemu/100",
			Name:       "webserver-pve",
			Instance:   "pve",
			Node:       "node1",
			Type:       "qemu",
			VMID:       100,
		},
		"pve-nat-node2-100": {
			ResourceID: "qemu/100",
			Name:       "webserver-nat",
			Instance:   "pve-nat",
			Node:       "node2",
			Type:       "qemu",
			VMID:       100,
		},
	}

	// Both guests have VMID "100"
	guestsByVMID := map[string][]GuestLookup{
		"100": {
			guestsByKey["pve-node1-100"],
			guestsByKey["pve-nat-node2-100"],
		},
	}

	// PBS backup with namespace "nat" should match the "pve-nat" instance
	pbsBackups := []models.PBSBackup{
		{
			ID:         "pbs-backup-100-nat",
			Instance:   "pbs-main",
			Datastore:  "backup-store",
			Namespace:  "nat", // This namespace should match "pve-nat"
			BackupType: "qemu",
			VMID:       "100",
			BackupTime: now.Add(-6 * 24 * time.Hour), // Critical
		},
	}

	m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()

	// Should find an alert keyed to the pve-nat instance (node2), not pve (node1)
	expectedKey := "backup-age-pve-nat-node2-100"
	alert, exists := m.activeAlerts[expectedKey]
	if !exists {
		// List what keys we do have for debugging
		var keys []string
		for k := range m.activeAlerts {
			keys = append(keys, k)
		}
		t.Fatalf("expected alert with key %q not found; found keys: %v", expectedKey, keys)
	}

	if alert.ResourceName != "webserver-nat backup" {
		t.Errorf("expected ResourceName 'webserver-nat backup', got %q", alert.ResourceName)
	}
	if alert.Instance != "pve-nat" {
		t.Errorf("expected Instance 'pve-nat', got %q", alert.Instance)
	}
}

// TestCheckBackupsVMIDCollisionNonMatchingNamespace verifies that when multiple guests
// share a VMID and the PBS backup namespace matches none of them, the alert uses the
// generic PBS key rather than falsely attributing it to a specific guest.
func TestCheckBackupsVMIDCollisionNonMatchingNamespace(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  3,
		CriticalDays: 5,
	}
	m.mu.Unlock()

	now := time.Now()

	guestsByKey := map[string]GuestLookup{
		"pve1-node1-100": {
			ResourceID: "qemu/100",
			Name:       "vm-pve1",
			Instance:   "pve1",
			Node:       "node1",
			Type:       "qemu",
			VMID:       100,
		},
		"pve2-node2-100": {
			ResourceID: "qemu/100",
			Name:       "vm-pve2",
			Instance:   "pve2",
			Node:       "node2",
			Type:       "qemu",
			VMID:       100,
		},
	}

	guestsByVMID := map[string][]GuestLookup{
		"100": {
			guestsByKey["pve1-node1-100"],
			guestsByKey["pve2-node2-100"],
		},
	}

	// PBS backup with namespace "staging" — matches neither pve1 nor pve2
	pbsBackups := []models.PBSBackup{
		{
			ID:         "pbs-100",
			Instance:   "pbs-main",
			Datastore:  "backup-store",
			Namespace:  "staging",
			BackupType: "qemu",
			VMID:       "100",
			BackupTime: now.Add(-6 * 24 * time.Hour),
		},
	}

	m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()

	// Should NOT have a guest-specific alert key
	for key := range m.activeAlerts {
		if key == "backup-age-pve1-node1-100" || key == "backup-age-pve2-node2-100" {
			t.Errorf("should not attribute ambiguous backup to a specific guest, but found key %q", key)
		}
	}

	// Should have a generic PBS alert key
	expectedKey := "backup-age-pbs-pbs-main-qemu-100"
	if _, exists := m.activeAlerts[expectedKey]; !exists {
		var keys []string
		for k := range m.activeAlerts {
			keys = append(keys, k)
		}
		t.Errorf("expected generic PBS alert key %q, found keys: %v", expectedKey, keys)
	}
}

// TestCheckBackupsVMIDCollisionNoNamespace verifies that when multiple guests
// share a VMID and the PBS backup has no namespace, the alert uses the generic PBS key.
func TestCheckBackupsVMIDCollisionNoNamespace(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  3,
		CriticalDays: 5,
	}
	m.mu.Unlock()

	now := time.Now()

	guestsByKey := map[string]GuestLookup{
		"pve1-node1-100": {
			ResourceID: "qemu/100",
			Name:       "vm-pve1",
			Instance:   "pve1",
			Node:       "node1",
			Type:       "qemu",
			VMID:       100,
		},
		"pve2-node2-100": {
			ResourceID: "qemu/100",
			Name:       "vm-pve2",
			Instance:   "pve2",
			Node:       "node2",
			Type:       "qemu",
			VMID:       100,
		},
	}

	guestsByVMID := map[string][]GuestLookup{
		"100": {
			guestsByKey["pve1-node1-100"],
			guestsByKey["pve2-node2-100"],
		},
	}

	// PBS backup with NO namespace
	pbsBackups := []models.PBSBackup{
		{
			ID:         "pbs-100",
			Instance:   "pbs-main",
			Datastore:  "backup-store",
			Namespace:  "",
			BackupType: "qemu",
			VMID:       "100",
			BackupTime: now.Add(-6 * 24 * time.Hour),
		},
	}

	m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()

	// Should NOT have a guest-specific alert key
	for key := range m.activeAlerts {
		if key == "backup-age-pve1-node1-100" || key == "backup-age-pve2-node2-100" {
			t.Errorf("should not attribute ambiguous backup to a specific guest, but found key %q", key)
		}
	}

	// Should have a generic PBS alert key
	expectedKey := "backup-age-pbs-pbs-main-qemu-100"
	if _, exists := m.activeAlerts[expectedKey]; !exists {
		var keys []string
		for k := range m.activeAlerts {
			keys = append(keys, k)
		}
		t.Errorf("expected generic PBS alert key %q, found keys: %v", expectedKey, keys)
	}
}

func TestCheckBackupsHandlesPmgBackups(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:      true,
		WarningDays:  5,
		CriticalDays: 7,
	}
	m.mu.Unlock()

	now := time.Now()
	pmgBackups := []models.PMGBackup{
		{
			ID:         "pmg-backup-mail-01",
			Instance:   "mail",
			Node:       "mail-gateway",
			Filename:   "pmg-backup_2024-01-01.tgz",
			BackupTime: now.Add(-8 * 24 * time.Hour),
			Size:       123456,
		},
	}

	m.CheckBackups(nil, nil, pmgBackups, map[string]GuestLookup{}, map[string][]GuestLookup{}, nil)

	m.mu.RLock()
	found := false
	for id, alert := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-age-") {
			found = true
			if alert.Level != AlertLevelCritical {
				t.Fatalf("expected PMG backup alert to be critical")
			}
			break
		}
	}
	m.mu.RUnlock()
	if !found {
		t.Fatalf("expected PMG backup alert to be created")
	}
}

func TestCheckBackupsSkipsOrphanedWhenDisabled(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	alertOrphaned := false
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   3,
		CriticalDays:  5,
		AlertOrphaned: &alertOrphaned,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	now := time.Now()
	storageBackups := []models.StorageBackup{
		{
			ID:       "inst-node-200-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     200,
			Time:     now.Add(-6 * 24 * time.Hour),
		},
	}

	// Include a live sentinel guest so hasLiveInventory is true and orphan detection runs.
	sentinelKey := BuildGuestKey("inst", "snode", 9999)
	guestsByKey := map[string]GuestLookup{
		sentinelKey: {ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999},
	}
	guestsByVMID := map[string][]GuestLookup{
		"9999": {guestsByKey[sentinelKey]},
	}

	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()
	for id := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-age-") {
			t.Fatalf("expected orphaned backup to be skipped, found alert %s", id)
		}
		if strings.HasPrefix(id, "backup-orphaned-") {
			t.Fatalf("expected orphaned backup alert to be suppressed when alertOrphaned=false, found alert %s", id)
		}
	}
}

func TestCheckBackupsCreatesOrphanedAlert(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	alertOrphaned := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &alertOrphaned,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	now := time.Now()
	// Backup is only 1 day old — well below both age thresholds.
	storageBackups := []models.StorageBackup{
		{
			ID:       "inst-node-200-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     200,
			Time:     now.Add(-1 * 24 * time.Hour),
		},
	}

	// Include a live sentinel guest so hasLiveInventory is true and orphan detection runs.
	// VMID 200 is still orphaned because it's not in the inventory.
	sentinelKey := BuildGuestKey("inst", "snode", 9999)
	guestsByKey := map[string]GuestLookup{
		sentinelKey: {ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999},
	}
	guestsByVMID := map[string][]GuestLookup{
		"9999": {guestsByKey[sentinelKey]},
	}

	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()
	found := false
	for id, alert := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-orphaned-") {
			found = true
			if alert.Type != "backup-orphaned" {
				t.Fatalf("expected alert type backup-orphaned, got %s", alert.Type)
			}
			if alert.Level != AlertLevelWarning {
				t.Fatalf("expected alert level warning, got %s", alert.Level)
			}
			if alert.Metadata == nil || alert.Metadata["orphaned"] != true {
				t.Fatalf("expected metadata orphaned=true")
			}
			if alert.Metadata["vmid"] != "200" {
				t.Fatalf("expected metadata vmid=200, got %v", alert.Metadata["vmid"])
			}
		}
	}
	if !found {
		t.Fatalf("expected a backup-orphaned alert to be created for orphaned VMID 200")
	}
	// Also verify no backup-age alert was created (below thresholds).
	for id := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-age-") {
			t.Fatalf("expected no backup-age alert for orphan below age threshold, found %s", id)
		}
	}
}

func TestCheckBackupsOrphanedAlertClearsWhenGuestReappears(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	alertOrphaned := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &alertOrphaned,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	now := time.Now()
	storageBackups := []models.StorageBackup{
		{
			ID:       "inst-node-300-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     300,
			Time:     now.Add(-1 * 24 * time.Hour),
		},
	}

	// Include a live sentinel guest on the same instance so orphan detection runs.
	sentinelKey := BuildGuestKey("inst", "snode", 9999)
	sentinelByKey := map[string]GuestLookup{
		sentinelKey: {ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999},
	}
	sentinelByVMID := map[string][]GuestLookup{
		"9999": {sentinelByKey[sentinelKey]},
	}

	// First cycle: guest 300 absent (only sentinel present) → orphaned alert fires.
	m.CheckBackups(storageBackups, nil, nil, sentinelByKey, sentinelByVMID, nil)

	m.mu.RLock()
	orphanedFound := false
	for id := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-orphaned-") {
			orphanedFound = true
		}
	}
	m.mu.RUnlock()
	if !orphanedFound {
		t.Fatalf("expected orphaned alert after first cycle")
	}

	// Second cycle: guest reappears in inventory → orphaned alert should clear.
	guestKey := BuildGuestKey("inst", "node", 300)
	guestsByKey := map[string]GuestLookup{
		guestKey: {ResourceID: "qemu/300", Name: "restored-vm", Instance: "inst", Node: "node", Type: "qemu", VMID: 300},
	}
	guestsByVMID := map[string][]GuestLookup{
		"300": {guestsByKey[guestKey]},
	}
	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()
	for id := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-orphaned-") {
			t.Fatalf("expected orphaned alert to be cleared after guest reappears, found %s", id)
		}
	}
}

func TestCheckBackupsOrphanedIgnoresVMIDs(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	alertOrphaned := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &alertOrphaned,
		IgnoreVMIDs:   []string{"20*"},
	}
	m.mu.Unlock()

	now := time.Now()
	storageBackups := []models.StorageBackup{
		{
			ID:       "inst-node-200-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     200,
			Time:     now.Add(-1 * 24 * time.Hour),
		},
		{
			ID:       "inst-node-300-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     300,
			Time:     now.Add(-1 * 24 * time.Hour),
		},
	}

	// Include a live sentinel guest so hasLiveInventory is true and orphan detection runs.
	sentinelKey := BuildGuestKey("inst", "snode", 9999)
	guestsByKey := map[string]GuestLookup{
		sentinelKey: {ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999},
	}
	guestsByVMID := map[string][]GuestLookup{
		"9999": {guestsByKey[sentinelKey]},
	}

	// Both are orphaned (not in inventory), but VMID 200 matches ignore pattern "20*".
	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()
	for id := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-orphaned-") && strings.Contains(id, "200") {
			t.Fatalf("expected orphaned alert for VMID 200 to be suppressed by ignoreVMIDs, found %s", id)
		}
	}
	found300 := false
	for id := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-orphaned-") && strings.Contains(id, "300") {
			found300 = true
		}
	}
	if !found300 {
		t.Fatalf("expected orphaned alert for VMID 300 (not in ignore list)")
	}
}

func TestCheckBackupsOrphanedWithZeroAgeThresholds(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	alertOrphaned := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   0,
		CriticalDays:  0,
		AlertOrphaned: &alertOrphaned,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	now := time.Now()
	storageBackups := []models.StorageBackup{
		{
			ID:       "inst-node-400-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     400,
			Time:     now.Add(-1 * 24 * time.Hour),
		},
	}

	// Include a live sentinel guest so hasLiveInventory is true and orphan detection runs.
	sentinelKey := BuildGuestKey("inst", "snode", 9999)
	guestsByKey := map[string]GuestLookup{
		sentinelKey: {ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999},
	}
	guestsByVMID := map[string][]GuestLookup{
		"9999": {guestsByKey[sentinelKey]},
	}

	// Orphaned guest with zero age thresholds — should still fire orphaned alert.
	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()
	found := false
	for id := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-orphaned-") {
			found = true
		}
	}
	if !found {
		t.Fatalf("expected backup-orphaned alert even with zero age thresholds")
	}
	// No backup-age alerts should exist since thresholds are 0.
	for id := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-age-") {
			t.Fatalf("expected no backup-age alert with zero thresholds, found %s", id)
		}
	}
}

func TestCheckBackupsOrphanedWithPersistedMetadata(t *testing.T) {
	// When a guest is deleted, enrichWithPersistedMetadata adds an entry to
	// guestsByVMID with an empty ResourceID (just name/type metadata for display).
	// This must NOT suppress orphaned alerts — only live guests (ResourceID != "")
	// indicate the guest is still in inventory.
	m := newTestManager(t)
	m.ClearActiveAlerts()

	alertOrphaned := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &alertOrphaned,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	now := time.Now()
	storageBackups := []models.StorageBackup{
		{
			ID:       "inst-node-500-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     500,
			Time:     now.Add(-1 * 24 * time.Hour),
		},
	}

	// Simulate persisted metadata for deleted guest: entry exists in
	// guestsByVMID but with empty ResourceID (no live guest).
	// Include a live sentinel guest on the same instance so orphan detection runs.
	sentinelKey := BuildGuestKey("inst", "snode", 9999)
	guestsByKey := map[string]GuestLookup{
		sentinelKey: {ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999},
	}
	guestsByVMID := map[string][]GuestLookup{
		"500":  {{Name: "deleted-vm", Instance: "inst", Node: "node", Type: "qemu", VMID: 500}},
		"9999": {guestsByKey[sentinelKey]},
	}

	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()
	found := false
	for id := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-orphaned-") {
			found = true
		}
	}
	if !found {
		t.Fatalf("expected backup-orphaned alert even when guestsByVMID has metadata-only entry (no ResourceID)")
	}
}

func TestCheckBackupsOrphanedSkippedWhenNoLiveInventory(t *testing.T) {
	// When no live guests exist (empty maps or only persisted metadata),
	// orphan detection is skipped entirely to avoid false positives during
	// startup race / auth failure / inventory outage.
	m := newTestManager(t)
	m.ClearActiveAlerts()

	alertOrphaned := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &alertOrphaned,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	now := time.Now()
	storageBackups := []models.StorageBackup{
		{
			ID:       "inst-node-600-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     600,
			Time:     now.Add(-1 * 24 * time.Hour),
		},
	}

	// Completely empty guest maps — no live inventory.
	m.CheckBackups(storageBackups, nil, nil, map[string]GuestLookup{}, map[string][]GuestLookup{}, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()
	for id := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-orphaned-") {
			t.Fatalf("expected no orphaned alerts when guest inventory is empty (startup race guard), found %s", id)
		}
	}
}

func TestCheckBackupsOrphanedPreservedWhenNoLiveInventory(t *testing.T) {
	// When a legitimate orphan alert already exists and inventory becomes
	// unavailable (auth failure, restart), the alert should be preserved
	// rather than cleared — we can't confirm it's resolved.
	m := newTestManager(t)
	m.ClearActiveAlerts()

	alertOrphaned := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   7,
		CriticalDays:  14,
		AlertOrphaned: &alertOrphaned,
		IgnoreVMIDs:   []string{},
	}
	m.mu.Unlock()

	now := time.Now()
	storageBackups := []models.StorageBackup{
		{
			ID:       "inst-node-700-backup",
			Storage:  "local",
			Node:     "node",
			Instance: "inst",
			Type:     "qemu",
			VMID:     700,
			Time:     now.Add(-1 * 24 * time.Hour),
		},
	}

	// First cycle: with live inventory → orphan alert fires.
	sentinelKey := BuildGuestKey("inst", "snode", 9999)
	guestsByKey := map[string]GuestLookup{
		sentinelKey: {ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "inst", Node: "snode", Type: "qemu", VMID: 9999},
	}
	guestsByVMID := map[string][]GuestLookup{
		"9999": {guestsByKey[sentinelKey]},
	}
	m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)

	m.mu.RLock()
	orphanFound := false
	for id := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-orphaned-") {
			orphanFound = true
		}
	}
	m.mu.RUnlock()
	if !orphanFound {
		t.Fatalf("expected orphan alert after first cycle with live inventory")
	}

	// Second cycle: inventory disappears (empty maps) — orphan alert must be preserved.
	m.CheckBackups(storageBackups, nil, nil, map[string]GuestLookup{}, map[string][]GuestLookup{}, nil)

	m.mu.RLock()
	defer m.mu.RUnlock()
	preserved := false
	for id := range m.activeAlerts {
		if strings.HasPrefix(id, "backup-orphaned-") {
			preserved = true
		}
	}
	if !preserved {
		t.Fatalf("expected orphan alert to be preserved when inventory is unavailable, but it was cleared")
	}
}

func TestCheckBackupsOrphanedCrossInstanceVMID(t *testing.T) {
|
|
// Instance A's guest (VMID 600) is deleted, but instance B has a live
|
|
// guest with the same VMID. The storage backup from instance A should
|
|
// still fire an orphaned alert — the live guest on instance B is irrelevant.
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
alertOrphaned := true
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.config.BackupDefaults = BackupAlertConfig{
|
|
Enabled: true,
|
|
WarningDays: 7,
|
|
CriticalDays: 14,
|
|
AlertOrphaned: &alertOrphaned,
|
|
IgnoreVMIDs: []string{},
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
now := time.Now()
|
|
// Storage backup from instance A.
|
|
storageBackups := []models.StorageBackup{
|
|
{
|
|
ID: "instA-nodeA-600-backup",
|
|
Storage: "local",
|
|
Node: "nodeA",
|
|
Instance: "instA",
|
|
Type: "qemu",
|
|
VMID: 600,
|
|
Time: now.Add(-1 * 24 * time.Hour),
|
|
},
|
|
}
|
|
|
|
// Instance B has a live guest with the same VMID.
|
|
// Instance A also has a sentinel guest so its inventory is considered populated.
|
|
keyB := BuildGuestKey("instB", "nodeB", 600)
|
|
sentinelA := BuildGuestKey("instA", "nodeA", 9999)
|
|
guestsByKey := map[string]GuestLookup{
|
|
keyB: {ResourceID: "qemu/600", Name: "vm-instB", Instance: "instB", Node: "nodeB", Type: "qemu", VMID: 600},
|
|
sentinelA: {ResourceID: "qemu/9999", Name: "sentinel-vm", Instance: "instA", Node: "nodeA", Type: "qemu", VMID: 9999},
|
|
}
|
|
guestsByVMID := map[string][]GuestLookup{
|
|
"600": {guestsByKey[keyB]},
|
|
"9999": {guestsByKey[sentinelA]},
|
|
}
|
|
|
|
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
found := false
|
|
for id := range m.activeAlerts {
|
|
if strings.HasPrefix(id, "backup-orphaned-") {
|
|
found = true
|
|
}
|
|
}
|
|
if !found {
|
|
t.Fatalf("expected backup-orphaned alert for instA even though instB has a live guest with the same VMID")
|
|
}
|
|
}
|
|
|
|
func TestCheckBackupsSkipsPVEOrphanDetectionUntilTemplateInventoryReady(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
alertOrphaned := true
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.config.BackupDefaults = BackupAlertConfig{
|
|
Enabled: true,
|
|
WarningDays: 7,
|
|
CriticalDays: 14,
|
|
AlertOrphaned: &alertOrphaned,
|
|
IgnoreVMIDs: []string{},
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
now := time.Now()
|
|
storageBackups := []models.StorageBackup{
|
|
{
|
|
ID: "instA-node2-700-backup",
|
|
Storage: "local",
|
|
Node: "node2",
|
|
Instance: "instA",
|
|
Type: "qemu",
|
|
VMID: 700,
|
|
Time: now.Add(-1 * 24 * time.Hour),
|
|
},
|
|
}
|
|
|
|
// Simulate the startup/concurrency window where the instance has enough live
|
|
// guest data to satisfy the legacy readiness heuristic, but template inventory
|
|
// has not been populated yet. This backup must not be treated as orphaned.
|
|
sentinelKey := BuildGuestKey("instA", "node3", 9999)
|
|
guestsByKey := map[string]GuestLookup{
|
|
sentinelKey: {
|
|
ResourceID: "qemu/9999",
|
|
Name: "sentinel-vm",
|
|
Instance: "instA",
|
|
Node: "node3",
|
|
Type: "qemu",
|
|
VMID: 9999,
|
|
},
|
|
}
|
|
guestsByVMID := map[string][]GuestLookup{
|
|
"9999": {guestsByKey[sentinelKey]},
|
|
}
|
|
|
|
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, map[string]bool{})
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
for id := range m.activeAlerts {
|
|
if strings.HasPrefix(id, "backup-orphaned-") {
|
|
t.Fatalf("expected no orphaned alert before template inventory is ready, found %s", id)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestCheckBackupsIgnoresVMIDs(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
alertOrphaned := true
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.config.BackupDefaults = BackupAlertConfig{
|
|
Enabled: true,
|
|
WarningDays: 1,
|
|
CriticalDays: 2,
|
|
AlertOrphaned: &alertOrphaned,
|
|
IgnoreVMIDs: []string{"10*"},
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
now := time.Now()
|
|
storageBackups := []models.StorageBackup{
|
|
{
|
|
ID: "inst-node-101-backup",
|
|
Storage: "local",
|
|
Node: "node",
|
|
Instance: "inst",
|
|
Type: "qemu",
|
|
VMID: 101,
|
|
Time: now.Add(-3 * 24 * time.Hour),
|
|
},
|
|
{
|
|
ID: "inst-node-200-backup",
|
|
Storage: "local",
|
|
Node: "node",
|
|
Instance: "inst",
|
|
Type: "qemu",
|
|
VMID: 200,
|
|
Time: now.Add(-3 * 24 * time.Hour),
|
|
},
|
|
}
|
|
|
|
keyIgnored := BuildGuestKey("inst", "node", 101)
|
|
keyAllowed := BuildGuestKey("inst", "node", 200)
|
|
guestsByKey := map[string]GuestLookup{
|
|
keyIgnored: {ResourceID: "qemu/101", Name: "ignored-vm", Instance: "inst", Node: "node", Type: "qemu", VMID: 101},
|
|
keyAllowed: {ResourceID: "qemu/200", Name: "allowed-vm", Instance: "inst", Node: "node", Type: "qemu", VMID: 200},
|
|
}
|
|
guestsByVMID := map[string][]GuestLookup{
|
|
"101": {guestsByKey[keyIgnored]},
|
|
"200": {guestsByKey[keyAllowed]},
|
|
}
|
|
|
|
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
|
|
|
|
m.mu.RLock()
|
|
_, ignoredExists := m.activeAlerts["backup-age-"+sanitizeAlertKey(keyIgnored)]
|
|
_, allowedExists := m.activeAlerts["backup-age-"+sanitizeAlertKey(keyAllowed)]
|
|
m.mu.RUnlock()
|
|
|
|
if ignoredExists {
|
|
t.Fatalf("expected backup alert for ignored VMID to be suppressed")
|
|
}
|
|
if !allowedExists {
|
|
t.Fatalf("expected backup alert for non-ignored VMID")
|
|
}
|
|
}
|
|
|
|
func TestCheckDockerHostIgnoresContainersByPrefix(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.config.DockerIgnoredContainerPrefixes = []string{"runner-"}
|
|
m.mu.Unlock()
|
|
|
|
container := models.DockerContainer{
|
|
ID: "1234567890ab",
|
|
Name: "runner-auto-1",
|
|
State: "exited",
|
|
Status: "Exited (0) 3 seconds ago",
|
|
}
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-ephemeral",
|
|
Hostname: "ci-host",
|
|
DisplayName: "CI Host",
|
|
Containers: []models.DockerContainer{container},
|
|
}
|
|
|
|
resourceID := dockerResourceID(host.ID, container.ID)
|
|
alertID := fmt.Sprintf("docker-container-state-%s", resourceID)
|
|
|
|
// Run twice to satisfy the confirmation threshold when not ignored
|
|
m.CheckDockerHost(host)
|
|
m.CheckDockerHost(host)
|
|
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatalf("expected no state alert for ignored container")
|
|
}
|
|
if _, exists := m.dockerStateConfirm[resourceID]; exists {
|
|
t.Fatalf("expected no state confirmation tracking for ignored container")
|
|
}
|
|
}
|
|
|
|
func TestDockerServiceReplicaAlerts(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
m.mu.RLock()
|
|
cfg := m.config
|
|
m.mu.RUnlock()
|
|
cfg.Enabled = true
|
|
m.UpdateConfig(cfg)
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-1",
|
|
DisplayName: "Prod Swarm",
|
|
Hostname: "swarm-prod",
|
|
Services: []models.DockerService{
|
|
{
|
|
ID: "svc-1",
|
|
Name: "web",
|
|
DesiredTasks: 4,
|
|
RunningTasks: 2,
|
|
Mode: "replicated",
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerServiceResourceID(host.ID, "svc-1", "web")
|
|
alertID := fmt.Sprintf("docker-service-health-%s", resourceID)
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatalf("expected service alert %s to be raised", alertID)
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Fatalf("expected critical severity, got %s", alert.Level)
|
|
}
|
|
if missing, ok := alert.Metadata["missingTasks"].(int); !ok || missing != 2 {
|
|
t.Fatalf("expected missingTasks metadata to be 2, got %v", alert.Metadata["missingTasks"])
|
|
}
|
|
|
|
// Resolve by restoring replicas
|
|
host.Services[0].RunningTasks = 4
|
|
m.CheckDockerHost(host)
|
|
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatalf("expected service alert %s to be cleared when replicas restored", alertID)
|
|
}
|
|
}
|
|
|
|
func TestDockerServiceAlertDoesNotRenotifyWhenUnchanged(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
cfg := m.GetConfig()
|
|
cfg.Enabled = true
|
|
cfg.ActivationState = ActivationActive
|
|
cfg.Schedule.MaxAlertsHour = 100
|
|
m.UpdateConfig(cfg)
|
|
|
|
dispatched := make(chan string, 4)
|
|
m.SetAlertCallback(func(alert *Alert) {
|
|
dispatched <- alert.ID
|
|
})
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-1",
|
|
DisplayName: "Prod Swarm",
|
|
Hostname: "swarm-prod",
|
|
Services: []models.DockerService{
|
|
{
|
|
ID: "svc-1",
|
|
Name: "web",
|
|
DesiredTasks: 4,
|
|
RunningTasks: 2,
|
|
Mode: "replicated",
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
select {
|
|
case <-dispatched:
|
|
case <-time.After(1 * time.Second):
|
|
t.Fatal("expected initial docker service alert notification")
|
|
}
|
|
|
|
// Same degraded state should update LastSeen/value but not re-notify every poll.
|
|
m.CheckDockerHost(host)
|
|
|
|
select {
|
|
case id := <-dispatched:
|
|
t.Fatalf("expected no second notification for unchanged service alert, got %s", id)
|
|
case <-time.After(250 * time.Millisecond):
|
|
}
|
|
}
|
|
|
|
func TestDockerServiceAlertPreservesLastNotifiedWhenUnchanged(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
cfg := m.GetConfig()
|
|
cfg.Enabled = true
|
|
cfg.ActivationState = ActivationActive
|
|
cfg.Schedule.MaxAlertsHour = 100
|
|
m.UpdateConfig(cfg)
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-1",
|
|
DisplayName: "Prod Swarm",
|
|
Hostname: "swarm-prod",
|
|
Services: []models.DockerService{
|
|
{
|
|
ID: "svc-1",
|
|
Name: "web",
|
|
DesiredTasks: 4,
|
|
RunningTasks: 2,
|
|
Mode: "replicated",
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerServiceResourceID(host.ID, "svc-1", "web")
|
|
alertID := fmt.Sprintf("docker-service-health-%s", resourceID)
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatalf("expected service alert %s to be raised", alertID)
|
|
}
|
|
|
|
notifiedAt := time.Now().Add(-2 * time.Minute).UTC()
|
|
alert.LastNotified = ¬ifiedAt
|
|
|
|
// Same degraded state should keep LastNotified while refreshing state.
|
|
m.CheckDockerHost(host)
|
|
|
|
updated, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatalf("expected service alert %s to remain active", alertID)
|
|
}
|
|
if updated.LastNotified == nil {
|
|
t.Fatal("expected LastNotified to be preserved, got nil")
|
|
}
|
|
if !updated.LastNotified.Equal(notifiedAt) {
|
|
t.Fatalf("expected LastNotified %s, got %s", notifiedAt, updated.LastNotified)
|
|
}
|
|
}
|
|
|
|
func TestDockerServiceAlertRenotifiesOnEscalationToCritical(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
cfg := m.GetConfig()
|
|
cfg.Enabled = true
|
|
cfg.ActivationState = ActivationActive
|
|
cfg.Schedule.MaxAlertsHour = 100
|
|
cfg.DockerDefaults.ServiceWarnGapPct = 10
|
|
cfg.DockerDefaults.ServiceCritGapPct = 50
|
|
m.UpdateConfig(cfg)
|
|
|
|
dispatched := make(chan AlertLevel, 4)
|
|
m.SetAlertCallback(func(alert *Alert) {
|
|
dispatched <- alert.Level
|
|
})
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-1",
|
|
DisplayName: "Prod Swarm",
|
|
Hostname: "swarm-prod",
|
|
Services: []models.DockerService{
|
|
{
|
|
ID: "svc-1",
|
|
Name: "web",
|
|
DesiredTasks: 4,
|
|
RunningTasks: 3, // 25% missing -> warning
|
|
Mode: "replicated",
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
select {
|
|
case level := <-dispatched:
|
|
if level != AlertLevelWarning {
|
|
t.Fatalf("expected warning notification first, got %s", level)
|
|
}
|
|
case <-time.After(1 * time.Second):
|
|
t.Fatal("expected initial warning notification")
|
|
}
|
|
|
|
// Escalate from warning to critical: should notify again.
|
|
host.Services[0].RunningTasks = 1 // 75% missing -> critical
|
|
m.CheckDockerHost(host)
|
|
|
|
select {
|
|
case level := <-dispatched:
|
|
if level != AlertLevelCritical {
|
|
t.Fatalf("expected critical escalation notification, got %s", level)
|
|
}
|
|
case <-time.After(1 * time.Second):
|
|
t.Fatal("expected escalation notification")
|
|
}
|
|
}
|
|
|
|
func TestDockerServiceUpdateStateAlert(t *testing.T) {
|
|
m := newTestManager(t)
|
|
cfg := m.GetConfig()
|
|
cfg.Enabled = true
|
|
m.UpdateConfig(cfg)
|
|
|
|
now := time.Now()
|
|
host := models.DockerHost{
|
|
ID: "host-update",
|
|
DisplayName: "Swarm",
|
|
Hostname: "swarm.local",
|
|
Services: []models.DockerService{
|
|
{
|
|
ID: "svc-update",
|
|
Name: "api",
|
|
DesiredTasks: 1,
|
|
RunningTasks: 1,
|
|
UpdateStatus: &models.DockerServiceUpdate{
|
|
State: "rollback_failed",
|
|
Message: "Rollback failed",
|
|
CompletedAt: &now,
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerServiceResourceID(host.ID, "svc-update", "api")
|
|
alertID := fmt.Sprintf("docker-service-health-%s", resourceID)
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatalf("expected docker service alert %s to be raised", alertID)
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Fatalf("expected critical severity for rollback failure, got %s", alert.Level)
|
|
}
|
|
if state, ok := alert.Metadata["updateState"].(string); !ok || state != "rollback_failed" {
|
|
t.Fatalf("expected updateState metadata to be rollback_failed, got %v", alert.Metadata["updateState"])
|
|
}
|
|
}
|
|
|
|
func TestDockerContainerStateUsesDockerDefaults(t *testing.T) {
|
|
m := newTestManager(t)
|
|
cfg := m.GetConfig()
|
|
cfg.DockerDefaults.StatePoweredOffSeverity = AlertLevelCritical
|
|
m.UpdateConfig(cfg)
|
|
|
|
container := models.DockerContainer{
|
|
ID: "container-1",
|
|
Name: "web",
|
|
State: "exited",
|
|
Status: "Exited (1) seconds ago",
|
|
}
|
|
host := models.DockerHost{
|
|
ID: "host-1",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{container},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(host.ID, container.ID)
|
|
alertID := fmt.Sprintf("docker-container-state-%s", resourceID)
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatalf("expected docker container state alert %s to be raised", alertID)
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Fatalf("expected critical severity from docker defaults, got %s", alert.Level)
|
|
}
|
|
}
|
|
|
|
func TestDockerContainerStateRespectsDisableDefault(t *testing.T) {
|
|
m := newTestManager(t)
|
|
cfg := m.GetConfig()
|
|
cfg.DockerDefaults.StateDisableConnectivity = true
|
|
m.UpdateConfig(cfg)
|
|
|
|
container := models.DockerContainer{
|
|
ID: "container-2",
|
|
Name: "batch",
|
|
State: "exited",
|
|
Status: "Exited (0) seconds ago",
|
|
}
|
|
host := models.DockerHost{
|
|
ID: "host-2",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.example",
|
|
Containers: []models.DockerContainer{container},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(host.ID, container.ID)
|
|
alertID := fmt.Sprintf("docker-container-state-%s", resourceID)
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatalf("did not expect docker container state alert when defaults disable connectivity")
|
|
}
|
|
}
|
|
|
|
func TestDockerContainerMemoryLimitHysteresis(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
hostID := "host-mem"
|
|
containerID := "container-mem"
|
|
hostHigh := models.DockerHost{
|
|
ID: hostID,
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.mem",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: containerID,
|
|
Name: "memory-hog",
|
|
State: "running",
|
|
Status: "Up 10 minutes",
|
|
MemoryUsage: 96 * 1024 * 1024,
|
|
MemoryLimit: 100 * 1024 * 1024,
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(hostHigh)
|
|
|
|
resourceID := dockerResourceID(hostID, containerID)
|
|
alertID := fmt.Sprintf("docker-container-memory-limit-%s", resourceID)
|
|
if _, exists := m.activeAlerts[alertID]; !exists {
|
|
t.Fatalf("expected memory limit alert to be raised")
|
|
}
|
|
|
|
hostLow := models.DockerHost{
|
|
ID: hostID,
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.mem",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: containerID,
|
|
Name: "memory-hog",
|
|
State: "running",
|
|
Status: "Up 12 minutes",
|
|
MemoryUsage: 80 * 1024 * 1024,
|
|
MemoryLimit: 100 * 1024 * 1024,
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(hostLow)
|
|
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatalf("expected memory limit alert to clear after usage dropped below hysteresis threshold")
|
|
}
|
|
}
|
|
|
|
func TestDockerContainerDiskUsageAlert(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
cfg := m.GetConfig()
|
|
cfg.Enabled = true
|
|
cfg.TimeThreshold = 0
|
|
if cfg.TimeThresholds == nil {
|
|
cfg.TimeThresholds = make(map[string]int)
|
|
}
|
|
cfg.TimeThresholds["docker"] = 0
|
|
cfg.TimeThresholds["guest"] = 0
|
|
cfg.DockerDefaults.Disk = HysteresisThreshold{Trigger: 75, Clear: 65}
|
|
m.UpdateConfig(cfg)
|
|
|
|
const gib = 1024 * 1024 * 1024
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-disk",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.disk",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: "container-disk",
|
|
Name: "disk-hog",
|
|
State: "running",
|
|
Status: "Up 5 minutes",
|
|
WritableLayerBytes: int64(8 * gib),
|
|
RootFilesystemBytes: int64(10 * gib),
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
|
|
alertID := fmt.Sprintf("%s-%s", resourceID, "disk")
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatalf("expected docker container disk alert %s to be raised", alertID)
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Fatalf("expected warning severity for disk usage alert, got %s", alert.Level)
|
|
}
|
|
if alert.Metadata == nil {
|
|
t.Fatalf("expected disk alert metadata to be populated")
|
|
}
|
|
if percent, ok := alert.Metadata["diskPercent"].(float64); !ok || percent < 79.5 || percent > 80.5 {
|
|
t.Fatalf("expected diskPercent metadata to be ~80%%, got %v", alert.Metadata["diskPercent"])
|
|
}
|
|
if used, ok := alert.Metadata["writableLayerBytes"].(int64); !ok || used != int64(8*gib) {
|
|
t.Fatalf("expected writableLayerBytes metadata to be %d, got %v", int64(8*gib), alert.Metadata["writableLayerBytes"])
|
|
}
|
|
|
|
// Drop usage below the clear threshold and ensure the alert resolves.
|
|
host.Containers[0].WritableLayerBytes = int64(4 * gib)
|
|
m.CheckDockerHost(host)
|
|
|
|
if _, stillActive := m.activeAlerts[alertID]; stillActive {
|
|
t.Fatalf("expected docker container disk alert %s to clear after usage dropped", alertID)
|
|
}
|
|
}
|
|
|
|
func TestUpdateConfigClampsDockerServiceCriticalGap(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
m := newTestManager(t)
|
|
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
GuestDefaults: ThresholdConfig{},
|
|
NodeDefaults: ThresholdConfig{},
|
|
HostDefaults: ThresholdConfig{},
|
|
StorageDefault: HysteresisThreshold{},
|
|
DockerDefaults: DockerThresholdConfig{
|
|
ServiceWarnGapPct: 35,
|
|
ServiceCritGapPct: 20,
|
|
},
|
|
PMGDefaults: PMGThresholdConfig{},
|
|
SnapshotDefaults: SnapshotAlertConfig{},
|
|
BackupDefaults: BackupAlertConfig{},
|
|
Overrides: make(map[string]ThresholdConfig),
|
|
Schedule: ScheduleConfig{},
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if m.config.DockerDefaults.ServiceWarnGapPct != 35 {
|
|
t.Fatalf("expected warning gap to remain 35, got %d", m.config.DockerDefaults.ServiceWarnGapPct)
|
|
}
|
|
if m.config.DockerDefaults.ServiceCritGapPct != 35 {
|
|
t.Fatalf("expected critical gap to be clamped to 35, got %d", m.config.DockerDefaults.ServiceCritGapPct)
|
|
}
|
|
}
|
|
|
|
func TestDockerServiceAlertUsesClampedCriticalGap(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
GuestDefaults: ThresholdConfig{},
|
|
NodeDefaults: ThresholdConfig{},
|
|
HostDefaults: ThresholdConfig{},
|
|
StorageDefault: HysteresisThreshold{},
|
|
DockerDefaults: DockerThresholdConfig{
|
|
ServiceWarnGapPct: 20,
|
|
ServiceCritGapPct: 5,
|
|
},
|
|
PMGDefaults: PMGThresholdConfig{},
|
|
SnapshotDefaults: SnapshotAlertConfig{},
|
|
BackupDefaults: BackupAlertConfig{},
|
|
Overrides: make(map[string]ThresholdConfig),
|
|
Schedule: ScheduleConfig{},
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
host := models.DockerHost{
|
|
ID: "docker-host-1",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker-host.local",
|
|
Services: []models.DockerService{
|
|
{
|
|
ID: "svc-123",
|
|
Name: "api",
|
|
DesiredTasks: 10,
|
|
RunningTasks: 7,
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerServiceResourceID(host.ID, "svc-123", "api")
|
|
alertID := fmt.Sprintf("docker-service-health-%s", resourceID)
|
|
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatalf("expected docker service alert %s to be raised", alertID)
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Fatalf("expected critical severity when replicas 7/10, got %s", alert.Level)
|
|
}
|
|
if pct, ok := alert.Metadata["percentMissing"].(float64); !ok || math.Abs(pct-30.0) > 0.01 {
|
|
t.Fatalf("expected percentMissing metadata ~30, got %v", alert.Metadata["percentMissing"])
|
|
}
|
|
}
|
|
|
|
// TestNormalizeHostDefaultsPreservesZeroTrigger verifies that setting
|
|
// Host Agent thresholds to 0 is preserved (fixes GitHub issue #864).
|
|
// Setting a threshold to 0 should disable alerting for that metric.
|
|
func TestNormalizeHostDefaultsPreservesZeroTrigger(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("nil HostDefaults get factory defaults", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
HostDefaults: ThresholdConfig{}, // Empty - needs defaults
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if m.config.HostDefaults.CPU == nil {
|
|
t.Fatal("CPU defaults should be set")
|
|
}
|
|
if m.config.HostDefaults.CPU.Trigger != 80 {
|
|
t.Errorf("CPU trigger = %v, want 80", m.config.HostDefaults.CPU.Trigger)
|
|
}
|
|
if m.config.HostDefaults.Memory == nil {
|
|
t.Fatal("Memory defaults should be set")
|
|
}
|
|
if m.config.HostDefaults.Memory.Trigger != 85 {
|
|
t.Errorf("Memory trigger = %v, want 85", m.config.HostDefaults.Memory.Trigger)
|
|
}
|
|
if m.config.HostDefaults.Disk == nil {
|
|
t.Fatal("Disk defaults should be set")
|
|
}
|
|
if m.config.HostDefaults.Disk.Trigger != 90 {
|
|
t.Errorf("Disk trigger = %v, want 90", m.config.HostDefaults.Disk.Trigger)
|
|
}
|
|
})
|
|
|
|
t.Run("Trigger=0 preserved to disable alerting", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Set Memory to 0 to disable memory alerting for host agents
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
HostDefaults: ThresholdConfig{
|
|
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
|
|
Memory: &HysteresisThreshold{Trigger: 0, Clear: 0}, // Disabled
|
|
Disk: &HysteresisThreshold{Trigger: 90, Clear: 85},
|
|
},
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
// Memory threshold should remain at 0 (disabled), not reset to default
|
|
if m.config.HostDefaults.Memory == nil {
|
|
t.Fatal("Memory defaults should be preserved (not nil)")
|
|
}
|
|
if m.config.HostDefaults.Memory.Trigger != 0 {
|
|
t.Errorf("Memory trigger = %v, want 0 (disabled)", m.config.HostDefaults.Memory.Trigger)
|
|
}
|
|
if m.config.HostDefaults.Memory.Clear != 0 {
|
|
t.Errorf("Memory clear = %v, want 0 (disabled)", m.config.HostDefaults.Memory.Clear)
|
|
}
|
|
|
|
// CPU and Disk should still have their values
|
|
if m.config.HostDefaults.CPU.Trigger != 80 {
|
|
t.Errorf("CPU trigger = %v, want 80", m.config.HostDefaults.CPU.Trigger)
|
|
}
|
|
if m.config.HostDefaults.Disk.Trigger != 90 {
|
|
t.Errorf("Disk trigger = %v, want 90", m.config.HostDefaults.Disk.Trigger)
|
|
}
|
|
})
|
|
|
|
t.Run("Trigger=0 sets Clear=0 automatically", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Set CPU to 0 with a non-zero Clear - Clear should be normalized to 0
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
HostDefaults: ThresholdConfig{
|
|
CPU: &HysteresisThreshold{Trigger: 0, Clear: 50}, // Clear should become 0
|
|
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
|
|
Disk: &HysteresisThreshold{Trigger: 0, Clear: 75}, // Clear should become 0
|
|
},
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if m.config.HostDefaults.CPU.Clear != 0 {
|
|
t.Errorf("CPU clear = %v, want 0 when trigger is 0", m.config.HostDefaults.CPU.Clear)
|
|
}
|
|
if m.config.HostDefaults.Disk.Clear != 0 {
|
|
t.Errorf("Disk clear = %v, want 0 when trigger is 0", m.config.HostDefaults.Disk.Clear)
|
|
}
|
|
})
|
|
|
|
t.Run("missing Clear computed from Trigger", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
HostDefaults: ThresholdConfig{
|
|
CPU: &HysteresisThreshold{Trigger: 90, Clear: 0}, // Clear should be computed
|
|
Memory: &HysteresisThreshold{Trigger: 95, Clear: 0}, // Clear should be computed
|
|
Disk: &HysteresisThreshold{Trigger: 92, Clear: 0}, // Clear should be computed
|
|
},
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
// Clear should be Trigger - 5
|
|
if m.config.HostDefaults.CPU.Clear != 85 {
|
|
t.Errorf("CPU clear = %v, want 85", m.config.HostDefaults.CPU.Clear)
|
|
}
|
|
if m.config.HostDefaults.Memory.Clear != 90 {
|
|
t.Errorf("Memory clear = %v, want 90", m.config.HostDefaults.Memory.Clear)
|
|
}
|
|
if m.config.HostDefaults.Disk.Clear != 87 {
|
|
t.Errorf("Disk clear = %v, want 87", m.config.HostDefaults.Disk.Clear)
|
|
}
|
|
})
|
|
}
|
|
|
|
// TestNormalizeStorageDefaultsPreservesZeroTrigger verifies that setting
|
|
// StorageDefault threshold to 0 is preserved to disable storage alerting.
|
|
func TestNormalizeStorageDefaultsPreservesZeroTrigger(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("negative trigger gets factory defaults", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
StorageDefault: HysteresisThreshold{Trigger: -1, Clear: 0},
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if m.config.StorageDefault.Trigger != 85 {
|
|
t.Errorf("StorageDefault trigger = %v, want 85", m.config.StorageDefault.Trigger)
|
|
}
|
|
if m.config.StorageDefault.Clear != 80 {
|
|
t.Errorf("StorageDefault clear = %v, want 80", m.config.StorageDefault.Clear)
|
|
}
|
|
})
|
|
|
|
t.Run("Trigger=0 preserved to disable storage alerting", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
StorageDefault: HysteresisThreshold{Trigger: 0, Clear: 0},
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if m.config.StorageDefault.Trigger != 0 {
|
|
t.Errorf("StorageDefault trigger = %v, want 0 (disabled)", m.config.StorageDefault.Trigger)
|
|
}
|
|
if m.config.StorageDefault.Clear != 0 {
|
|
t.Errorf("StorageDefault clear = %v, want 0 (disabled)", m.config.StorageDefault.Clear)
|
|
}
|
|
})
|
|
|
|
t.Run("missing Clear computed from Trigger", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
StorageDefault: HysteresisThreshold{Trigger: 90, Clear: 0},
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if m.config.StorageDefault.Trigger != 90 {
|
|
t.Errorf("StorageDefault trigger = %v, want 90", m.config.StorageDefault.Trigger)
|
|
}
|
|
if m.config.StorageDefault.Clear != 85 {
|
|
t.Errorf("StorageDefault clear = %v, want 85 (trigger - 5)", m.config.StorageDefault.Clear)
|
|
}
|
|
})
|
|
}
|
|
|
|
// TestNormalizeNodeDefaultsTemperaturePreservesZeroTrigger verifies that setting
|
|
// NodeDefaults.Temperature threshold to 0 is preserved to disable temperature alerting.
|
|
func TestNormalizeNodeDefaultsTemperaturePreservesZeroTrigger(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("nil Temperature gets factory defaults", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
NodeDefaults: ThresholdConfig{}, // Empty - Temperature needs defaults
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if m.config.NodeDefaults.Temperature == nil {
|
|
t.Fatal("Temperature defaults should be set")
|
|
}
|
|
if m.config.NodeDefaults.Temperature.Trigger != 80 {
|
|
t.Errorf("Temperature trigger = %v, want 80", m.config.NodeDefaults.Temperature.Trigger)
|
|
}
|
|
if m.config.NodeDefaults.Temperature.Clear != 75 {
|
|
t.Errorf("Temperature clear = %v, want 75", m.config.NodeDefaults.Temperature.Clear)
|
|
}
|
|
})
|
|
|
|
t.Run("Trigger=0 preserved to disable temperature alerting", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
NodeDefaults: ThresholdConfig{
|
|
Temperature: &HysteresisThreshold{Trigger: 0, Clear: 0},
|
|
},
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if m.config.NodeDefaults.Temperature == nil {
|
|
t.Fatal("Temperature should be preserved (not nil)")
|
|
}
|
|
if m.config.NodeDefaults.Temperature.Trigger != 0 {
|
|
t.Errorf("Temperature trigger = %v, want 0 (disabled)", m.config.NodeDefaults.Temperature.Trigger)
|
|
}
|
|
if m.config.NodeDefaults.Temperature.Clear != 0 {
|
|
t.Errorf("Temperature clear = %v, want 0 (disabled)", m.config.NodeDefaults.Temperature.Clear)
|
|
}
|
|
})
|
|
|
|
t.Run("missing Clear computed from Trigger", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
NodeDefaults: ThresholdConfig{
|
|
Temperature: &HysteresisThreshold{Trigger: 85, Clear: 0},
|
|
},
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if m.config.NodeDefaults.Temperature.Trigger != 85 {
|
|
t.Errorf("Temperature trigger = %v, want 85", m.config.NodeDefaults.Temperature.Trigger)
|
|
}
|
|
if m.config.NodeDefaults.Temperature.Clear != 80 {
|
|
t.Errorf("Temperature clear = %v, want 80 (trigger - 5)", m.config.NodeDefaults.Temperature.Clear)
|
|
}
|
|
})
|
|
}
|
|
|
|
// TestNormalizeDockerThresholdPreservesZeroTrigger verifies that Docker
|
|
// container thresholds can be set to 0 to disable alerting.
|
|
func TestNormalizeDockerThresholdPreservesZeroTrigger(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("Trigger=0 disables Docker CPU alerting", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
DockerDefaults: DockerThresholdConfig{
|
|
CPU: HysteresisThreshold{Trigger: 0, Clear: 0},
|
|
Memory: HysteresisThreshold{Trigger: 85, Clear: 80},
|
|
Disk: HysteresisThreshold{Trigger: 85, Clear: 80},
|
|
},
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if m.config.DockerDefaults.CPU.Trigger != 0 {
|
|
t.Errorf("Docker CPU trigger = %v, want 0 (disabled)", m.config.DockerDefaults.CPU.Trigger)
|
|
}
|
|
if m.config.DockerDefaults.Memory.Trigger != 85 {
|
|
t.Errorf("Docker Memory trigger = %v, want 85", m.config.DockerDefaults.Memory.Trigger)
|
|
}
|
|
})
|
|
|
|
t.Run("negative trigger replaced with defaults", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
cfg := AlertConfig{
|
|
Enabled: true,
|
|
DockerDefaults: DockerThresholdConfig{
|
|
CPU: HysteresisThreshold{Trigger: -5, Clear: 0},
|
|
Memory: HysteresisThreshold{Trigger: -10, Clear: 0},
|
|
Disk: HysteresisThreshold{Trigger: -1, Clear: 0},
|
|
},
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if m.config.DockerDefaults.CPU.Trigger != 80 {
|
|
t.Errorf("Docker CPU trigger = %v, want 80 (default)", m.config.DockerDefaults.CPU.Trigger)
|
|
}
|
|
if m.config.DockerDefaults.Memory.Trigger != 85 {
|
|
t.Errorf("Docker Memory trigger = %v, want 85 (default)", m.config.DockerDefaults.Memory.Trigger)
|
|
}
|
|
if m.config.DockerDefaults.Disk.Trigger != 85 {
|
|
t.Errorf("Docker Disk trigger = %v, want 85 (default)", m.config.DockerDefaults.Disk.Trigger)
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestNormalizeDockerIgnoredPrefixes(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
input []string
|
|
expected []string
|
|
}{
|
|
{
|
|
name: "nil input",
|
|
input: nil,
|
|
expected: nil,
|
|
},
|
|
{
|
|
name: "blank entries removed",
|
|
input: []string{"", " ", "\t"},
|
|
expected: nil,
|
|
},
|
|
{
|
|
name: "trims and deduplicates preserving first occurrence casing",
|
|
input: []string{" Foo ", "foo", "Bar", " bar ", "Baz"},
|
|
expected: []string{"Foo", "Bar", "Baz"},
|
|
},
|
|
{
|
|
name: "already normalized list remains unchanged",
|
|
input: []string{"alpha", "beta"},
|
|
expected: []string{"alpha", "beta"},
|
|
},
|
|
}
|
|
|
|
for _, tc := range tests {
|
|
tc := tc
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
got := NormalizeDockerIgnoredPrefixes(tc.input)
|
|
if !reflect.DeepEqual(got, tc.expected) {
|
|
t.Fatalf("expected %v, got %v", tc.expected, got)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCheckDockerHostIgnoredPrefixClearsExistingAlerts(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
container := models.DockerContainer{
|
|
ID: "abc123456789",
|
|
Name: "runner-job-1",
|
|
State: "exited",
|
|
Status: "Exited (1) 10 seconds ago",
|
|
}
|
|
host := models.DockerHost{
|
|
ID: "docker-host",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker-host.local",
|
|
Containers: []models.DockerContainer{container},
|
|
}
|
|
resourceID := dockerResourceID(host.ID, container.ID)
|
|
stateAlertID := fmt.Sprintf("docker-container-state-%s", resourceID)
|
|
healthAlertID := fmt.Sprintf("docker-container-health-%s", resourceID)
|
|
restartAlertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
|
|
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.config.DockerIgnoredContainerPrefixes = []string{"runner-"}
|
|
m.activeAlerts[stateAlertID] = &Alert{ID: stateAlertID, ResourceID: resourceID}
|
|
m.activeAlerts[healthAlertID] = &Alert{ID: healthAlertID, ResourceID: resourceID}
|
|
m.activeAlerts[restartAlertID] = &Alert{ID: restartAlertID, ResourceID: resourceID}
|
|
m.dockerStateConfirm[resourceID] = 2
|
|
m.dockerRestartTracking[resourceID] = &dockerRestartRecord{}
|
|
m.dockerLastExitCode[resourceID] = 137
|
|
m.mu.Unlock()
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if _, exists := m.activeAlerts[stateAlertID]; exists {
|
|
t.Fatalf("expected state alert cleared for ignored container")
|
|
}
|
|
if _, exists := m.activeAlerts[healthAlertID]; exists {
|
|
t.Fatalf("expected health alert cleared for ignored container")
|
|
}
|
|
if _, exists := m.activeAlerts[restartAlertID]; exists {
|
|
t.Fatalf("expected restart alert cleared for ignored container")
|
|
}
|
|
if _, exists := m.dockerStateConfirm[resourceID]; exists {
|
|
t.Fatalf("expected state confirmation tracking cleared")
|
|
}
|
|
if _, exists := m.dockerRestartTracking[resourceID]; exists {
|
|
t.Fatalf("expected restart tracking cleared")
|
|
}
|
|
if _, exists := m.dockerLastExitCode[resourceID]; exists {
|
|
t.Fatalf("expected last exit code cleared")
|
|
}
|
|
}
|
|
|
|
func TestUpdateConfigNormalizesDockerIgnoredPrefixes(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("nil input remains nil", func(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
m := newTestManager(t)
|
|
m.UpdateConfig(AlertConfig{})
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if m.config.DockerIgnoredContainerPrefixes != nil {
|
|
t.Fatalf("expected nil prefixes, got %v", m.config.DockerIgnoredContainerPrefixes)
|
|
}
|
|
})
|
|
|
|
t.Run("duplicates trimmed and deduplicated", func(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
m := newTestManager(t)
|
|
cfg := AlertConfig{
|
|
DockerIgnoredContainerPrefixes: []string{
|
|
" Foo ",
|
|
"foo",
|
|
"Bar",
|
|
},
|
|
}
|
|
|
|
m.UpdateConfig(cfg)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
expected := []string{"Foo", "Bar"}
|
|
if !reflect.DeepEqual(m.config.DockerIgnoredContainerPrefixes, expected) {
|
|
t.Fatalf("expected normalized prefixes %v, got %v", expected, m.config.DockerIgnoredContainerPrefixes)
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestMatchesDockerIgnoredPrefix(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
containerName string
|
|
containerID string
|
|
prefixes []string
|
|
want bool
|
|
}{
|
|
{name: "empty prefixes", containerName: "runner-123", containerID: "abc", prefixes: nil, want: false},
|
|
{name: "match with name", containerName: "runner-123", containerID: "abc", prefixes: []string{"runner-"}, want: true},
|
|
{name: "match with id", containerName: "app", containerID: "abc123", prefixes: []string{"abc"}, want: true},
|
|
{name: "trimmed comparison", containerName: "runner-job", containerID: "abc", prefixes: []string{" runner- "}, want: true},
|
|
{name: "case insensitive", containerName: "Runner-Job", containerID: "abc", prefixes: []string{"runner-"}, want: true},
|
|
{name: "no match", containerName: "service", containerID: "xyz", prefixes: []string{"runner-"}, want: false},
|
|
{name: "skips empty prefix in list", containerName: "runner-job", containerID: "abc", prefixes: []string{"", "runner-"}, want: true},
|
|
{name: "all empty prefixes returns false", containerName: "runner-job", containerID: "abc", prefixes: []string{"", " ", ""}, want: false},
|
|
{name: "empty name matches id", containerName: "", containerID: "runner-123", prefixes: []string{"runner-"}, want: true},
|
|
{name: "empty id matches name", containerName: "runner-job", containerID: "", prefixes: []string{"runner-"}, want: true},
|
|
{name: "both empty no match", containerName: "", containerID: "", prefixes: []string{"runner-"}, want: false},
|
|
}
|
|
|
|
for _, tc := range tests {
|
|
tc := tc
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
if got := matchesDockerIgnoredPrefix(tc.containerName, tc.containerID, tc.prefixes); got != tc.want {
|
|
t.Fatalf("matchesDockerIgnoredPrefix(%q, %q, %v) = %v, want %v", tc.containerName, tc.containerID, tc.prefixes, got, tc.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestDockerInstanceName(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
host models.DockerHost
|
|
want string
|
|
}{
|
|
{name: "uses display name", host: models.DockerHost{DisplayName: "Prod Host"}, want: "Docker:Prod Host"},
|
|
{name: "falls back to hostname", host: models.DockerHost{Hostname: "docker.local"}, want: "Docker:docker.local"},
|
|
{name: "defaults when empty", host: models.DockerHost{}, want: "Docker"},
|
|
}
|
|
|
|
for _, tc := range tests {
|
|
tc := tc
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
if got := dockerInstanceName(tc.host); got != tc.want {
|
|
t.Fatalf("dockerInstanceName(%+v) = %q, want %q", tc.host, got, tc.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestDockerContainerDisplayName(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
container models.DockerContainer
|
|
want string
|
|
}{
|
|
{name: "trims whitespace", container: models.DockerContainer{Name: " app "}, want: "app"},
|
|
{name: "strips leading slash", container: models.DockerContainer{Name: "/runner"}, want: "runner"},
|
|
{name: "falls back to id truncated", container: models.DockerContainer{ID: "0123456789abcdef"}, want: "0123456789ab"},
|
|
}
|
|
|
|
for _, tc := range tests {
|
|
tc := tc
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
if got := dockerContainerDisplayName(tc.container); got != tc.want {
|
|
t.Fatalf("dockerContainerDisplayName(%+v) = %q, want %q", tc.container, got, tc.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestDockerResourceID(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
hostID string
|
|
containerID string
|
|
want string
|
|
}{
|
|
{name: "both ids present", hostID: "host1", containerID: "abc", want: "docker:host1/abc"},
|
|
{name: "missing host id", hostID: "", containerID: "abc", want: "docker:container/abc"},
|
|
{name: "missing container id", hostID: "host1", containerID: "", want: "docker:host1"},
|
|
{name: "both missing", hostID: "", containerID: "", want: "docker:unknown"},
|
|
}
|
|
|
|
for _, tc := range tests {
|
|
tc := tc
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
if got := dockerResourceID(tc.hostID, tc.containerID); got != tc.want {
|
|
t.Fatalf("dockerResourceID(%q, %q) = %q, want %q", tc.hostID, tc.containerID, got, tc.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestHasKnownFirmwareBug(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
model string
|
|
want bool
|
|
}{
|
|
{name: "Samsung 980 with SSD prefix", model: "Samsung SSD 980 1TB", want: true},
|
|
{name: "Samsung 980 without SSD prefix", model: "Samsung 980 PRO 2TB", want: true},
|
|
{name: "Samsung 990 with SSD prefix", model: "Samsung SSD 990 PRO 2TB", want: true},
|
|
{name: "Samsung 990 without SSD prefix", model: "Samsung 990 EVO 1TB", want: true},
|
|
{name: "Samsung 980 lowercase", model: "samsung ssd 980 1tb", want: true},
|
|
{name: "Samsung 990 mixed case", model: "SAMSUNG 990 PRO", want: true},
|
|
{name: "Samsung 970 (not affected)", model: "Samsung SSD 970 EVO Plus", want: false},
|
|
{name: "Samsung 870 (not affected)", model: "Samsung 870 QVO", want: false},
|
|
{name: "Other manufacturer", model: "WD Blue SN570", want: false},
|
|
{name: "Empty model", model: "", want: false},
|
|
}
|
|
|
|
for _, tc := range tests {
|
|
tc := tc
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
if got := hasKnownFirmwareBug(tc.model); got != tc.want {
|
|
t.Fatalf("hasKnownFirmwareBug(%q) = %v, want %v", tc.model, got, tc.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCheckDiskHealthSkipsSamsung980FalseAlerts(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
// Samsung 980 reporting FAILED health (firmware bug) but actually healthy
|
|
disk := proxmox.Disk{
|
|
DevPath: "/dev/nvme0n1",
|
|
Model: "Samsung SSD 980 1TB",
|
|
Serial: "S649NF0R123456",
|
|
Type: "nvme",
|
|
Health: "FAILED", // False report due to firmware bug
|
|
Wearout: 99, // Drive is actually healthy with 99% life remaining
|
|
Size: 1000204886016,
|
|
}
|
|
|
|
// Should not create an alert for health status
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
m.mu.RLock()
|
|
healthAlertID := "disk-health-test-instance-pve-node1-/dev/nvme0n1"
|
|
if _, exists := m.activeAlerts[healthAlertID]; exists {
|
|
m.mu.RUnlock()
|
|
t.Fatalf("expected no health alert for Samsung 980 with known firmware bug")
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
// Now test that wearout alerts still work for these drives
|
|
disk.Wearout = 5 // Low wearout should still trigger alert
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
m.mu.RLock()
|
|
wearoutAlertID := "disk-wearout-test-instance-pve-node1-/dev/nvme0n1"
|
|
if _, exists := m.activeAlerts[wearoutAlertID]; !exists {
|
|
m.mu.RUnlock()
|
|
t.Fatalf("expected wearout alert to still work for Samsung 980")
|
|
}
|
|
m.mu.RUnlock()
|
|
}
|
|
|
|
func TestCheckDiskHealthClearsExistingSamsung980Alerts(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
disk := proxmox.Disk{
|
|
DevPath: "/dev/nvme0n1",
|
|
Model: "Samsung SSD 990 PRO 2TB",
|
|
Serial: "S6Z0NF0R654321",
|
|
Type: "nvme",
|
|
Health: "FAILED",
|
|
Wearout: 98,
|
|
Size: 2000398934016,
|
|
}
|
|
|
|
alertID := "disk-health-test-instance-pve-node1-/dev/nvme0n1"
|
|
|
|
// Manually create an existing alert (simulating alert from before the fix)
|
|
m.mu.Lock()
|
|
m.activeAlerts[alertID] = &Alert{
|
|
ID: alertID,
|
|
Type: "disk-health",
|
|
Level: AlertLevelCritical,
|
|
ResourceID: "pve-node1-/dev/nvme0n1",
|
|
ResourceName: "Samsung SSD 990 PRO 2TB (/dev/nvme0n1)",
|
|
Node: "pve-node1",
|
|
Instance: "test-instance",
|
|
Message: "Disk health check failed: FAILED",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
// Check disk health - should clear the existing false alert
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatalf("expected existing Samsung 990 health alert to be cleared")
|
|
}
|
|
}
|
|
|
|
func TestCheckDiskHealthHealthyDiskNoAlert(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
// Non-Samsung disk with PASSED health should not create alert
|
|
disk := proxmox.Disk{
|
|
DevPath: "/dev/sda",
|
|
Model: "Western Digital WD40EFZX",
|
|
Serial: "WD-WCC4E0123456",
|
|
Type: "hdd",
|
|
Health: "PASSED",
|
|
Wearout: 0, // N/A for HDD
|
|
Size: 4000787030016,
|
|
}
|
|
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
m.mu.RLock()
|
|
healthAlertID := "disk-health-test-instance-pve-node1-/dev/sda"
|
|
if _, exists := m.activeAlerts[healthAlertID]; exists {
|
|
m.mu.RUnlock()
|
|
t.Fatalf("expected no health alert for healthy disk with PASSED status")
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
// Also test with "OK" status
|
|
disk.Health = "OK"
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
if _, exists := m.activeAlerts[healthAlertID]; exists {
|
|
t.Fatalf("expected no health alert for healthy disk with OK status")
|
|
}
|
|
}
|
|
|
|
func TestCheckDiskHealthFailedDiskCreatesAlert(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
// Non-Samsung disk with FAILED health should create alert
|
|
disk := proxmox.Disk{
|
|
DevPath: "/dev/sdb",
|
|
Model: "Seagate ST2000DM008",
|
|
Serial: "ZA123456",
|
|
Type: "hdd",
|
|
Health: "FAILED",
|
|
Wearout: 0,
|
|
Size: 2000398934016,
|
|
}
|
|
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
healthAlertID := "disk-health-test-instance-pve-node1-/dev/sdb"
|
|
alert, exists := m.activeAlerts[healthAlertID]
|
|
if !exists {
|
|
t.Fatalf("expected health alert to be created for failed disk")
|
|
}
|
|
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical alert level, got %s", alert.Level)
|
|
}
|
|
if alert.Type != "disk-health" {
|
|
t.Errorf("expected type disk-health, got %s", alert.Type)
|
|
}
|
|
if alert.Node != "pve-node1" {
|
|
t.Errorf("expected node pve-node1, got %s", alert.Node)
|
|
}
|
|
if alert.Instance != "test-instance" {
|
|
t.Errorf("expected instance test-instance, got %s", alert.Instance)
|
|
}
|
|
}
|
|
|
|
func TestCheckDiskHealthRecoveryAlertCleared(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
disk := proxmox.Disk{
|
|
DevPath: "/dev/sdc",
|
|
Model: "Intel SSDSC2BB480G4",
|
|
Serial: "BTWL123456789",
|
|
Type: "ssd",
|
|
Health: "FAILED",
|
|
Wearout: 50,
|
|
Size: 480103981056,
|
|
}
|
|
|
|
// First check creates alert
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
healthAlertID := "disk-health-test-instance-pve-node1-/dev/sdc"
|
|
m.mu.RLock()
|
|
if _, exists := m.activeAlerts[healthAlertID]; !exists {
|
|
m.mu.RUnlock()
|
|
t.Fatalf("expected health alert to be created")
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
// Disk health recovers
|
|
disk.Health = "PASSED"
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
if _, exists := m.activeAlerts[healthAlertID]; exists {
|
|
t.Fatalf("expected health alert to be cleared after recovery")
|
|
}
|
|
}
|
|
|
|
func TestCheckDiskHealthLowWearoutCreatesAlert(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
// SSD with low wearout (less than 10% life remaining)
|
|
disk := proxmox.Disk{
|
|
DevPath: "/dev/nvme1n1",
|
|
Model: "Crucial CT1000MX500",
|
|
Serial: "12345678ABCD",
|
|
Type: "nvme",
|
|
Health: "PASSED",
|
|
Wearout: 5, // Only 5% life remaining
|
|
Size: 1000204886016,
|
|
}
|
|
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
wearoutAlertID := "disk-wearout-test-instance-pve-node1-/dev/nvme1n1"
|
|
alert, exists := m.activeAlerts[wearoutAlertID]
|
|
if !exists {
|
|
t.Fatalf("expected wearout alert to be created for disk with low life remaining")
|
|
}
|
|
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("expected warning alert level, got %s", alert.Level)
|
|
}
|
|
if alert.Type != "disk-wearout" {
|
|
t.Errorf("expected type disk-wearout, got %s", alert.Type)
|
|
}
|
|
if alert.Value != 5 {
|
|
t.Errorf("expected value 5, got %f", alert.Value)
|
|
}
|
|
if alert.Threshold != 10.0 {
|
|
t.Errorf("expected threshold 10.0, got %f", alert.Threshold)
|
|
}
|
|
}
|
|
|
|
func TestCheckDiskHealthWearoutAlertUpdatesOnSubsequentChecks(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
disk := proxmox.Disk{
|
|
DevPath: "/dev/nvme2n1",
|
|
Model: "Kingston SA2000M8",
|
|
Serial: "50026B768A123456",
|
|
Type: "nvme",
|
|
Health: "PASSED",
|
|
Wearout: 8,
|
|
Size: 500107862016,
|
|
}
|
|
|
|
// First check creates alert
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
wearoutAlertID := "disk-wearout-test-instance-pve-node1-/dev/nvme2n1"
|
|
m.mu.RLock()
|
|
alert, exists := m.activeAlerts[wearoutAlertID]
|
|
if !exists {
|
|
m.mu.RUnlock()
|
|
t.Fatalf("expected wearout alert to be created")
|
|
}
|
|
firstLastSeen := alert.LastSeen
|
|
m.mu.RUnlock()
|
|
|
|
// Wait a moment to ensure time difference
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
// Wearout decreases further
|
|
disk.Wearout = 6
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
alert, exists = m.activeAlerts[wearoutAlertID]
|
|
if !exists {
|
|
t.Fatalf("expected wearout alert to still exist")
|
|
}
|
|
|
|
if !alert.LastSeen.After(firstLastSeen) {
|
|
t.Errorf("expected LastSeen to be updated, got %v (original: %v)", alert.LastSeen, firstLastSeen)
|
|
}
|
|
if alert.Value != 6 {
|
|
t.Errorf("expected value to be updated to 6, got %f", alert.Value)
|
|
}
|
|
}
|
|
|
|
func TestCheckDiskHealthWearoutRecoveryAlertCleared(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
disk := proxmox.Disk{
|
|
DevPath: "/dev/sdd",
|
|
Model: "ADATA SU800",
|
|
Serial: "2J012345678",
|
|
Type: "ssd",
|
|
Health: "PASSED",
|
|
Wearout: 5,
|
|
Size: 256060514304,
|
|
}
|
|
|
|
// First check creates wearout alert
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
wearoutAlertID := "disk-wearout-test-instance-pve-node1-/dev/sdd"
|
|
m.mu.RLock()
|
|
if _, exists := m.activeAlerts[wearoutAlertID]; !exists {
|
|
m.mu.RUnlock()
|
|
t.Fatalf("expected wearout alert to be created")
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
// Wearout recovers (replaced drive, or misread corrected)
|
|
disk.Wearout = 95
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
if _, exists := m.activeAlerts[wearoutAlertID]; exists {
|
|
t.Fatalf("expected wearout alert to be cleared after recovery")
|
|
}
|
|
}
|
|
|
|
func TestCheckDiskHealthEmptyOrUnknownHealthNoAlert(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
disk := proxmox.Disk{
|
|
DevPath: "/dev/sde",
|
|
Model: "Generic USB Storage",
|
|
Serial: "USB123456",
|
|
Type: "hdd",
|
|
Health: "", // Empty health - SMART not supported
|
|
Wearout: 0,
|
|
Size: 128043712512,
|
|
}
|
|
|
|
healthAlertID := "disk-health-test-instance-pve-node1-/dev/sde"
|
|
|
|
// Empty health should not create alert
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
m.mu.RLock()
|
|
if _, exists := m.activeAlerts[healthAlertID]; exists {
|
|
m.mu.RUnlock()
|
|
t.Fatalf("expected no health alert for disk with empty health status")
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
// UNKNOWN health should not create alert
|
|
disk.Health = "UNKNOWN"
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
m.mu.RLock()
|
|
if _, exists := m.activeAlerts[healthAlertID]; exists {
|
|
m.mu.RUnlock()
|
|
t.Fatalf("expected no health alert for disk with UNKNOWN health status")
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
// Lowercase "unknown" should also not create alert (normalized to uppercase)
|
|
disk.Health = "unknown"
|
|
m.CheckDiskHealth("test-instance", "pve-node1", disk)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
if _, exists := m.activeAlerts[healthAlertID]; exists {
|
|
t.Fatalf("expected no health alert for disk with lowercase unknown health status")
|
|
}
|
|
}
|
|
|
|
func TestDisableAllStorageClearsExistingAlerts(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
storageID := "local-lvm"
|
|
|
|
// Start with configuration that allows storage alerts
|
|
initialConfig := AlertConfig{
|
|
Enabled: true,
|
|
DisableAllStorage: false,
|
|
StorageDefault: HysteresisThreshold{Trigger: 80, Clear: 75},
|
|
TimeThreshold: 0,
|
|
TimeThresholds: map[string]int{},
|
|
NodeDefaults: ThresholdConfig{
|
|
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
|
|
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
|
|
Disk: &HysteresisThreshold{Trigger: 90, Clear: 85},
|
|
},
|
|
GuestDefaults: ThresholdConfig{
|
|
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
|
|
},
|
|
Overrides: make(map[string]ThresholdConfig),
|
|
}
|
|
m.UpdateConfig(initialConfig)
|
|
m.mu.Lock()
|
|
m.config.TimeThreshold = 0
|
|
m.config.TimeThresholds = map[string]int{}
|
|
m.config.ActivationState = ActivationActive
|
|
m.mu.Unlock()
|
|
|
|
var dispatched []*Alert
|
|
done := make(chan struct{}, 1)
|
|
var resolved []string
|
|
resolvedDone := make(chan struct{}, 1)
|
|
m.SetAlertCallback(func(alert *Alert) {
|
|
dispatched = append(dispatched, alert)
|
|
select {
|
|
case done <- struct{}{}:
|
|
default:
|
|
}
|
|
})
|
|
m.SetResolvedCallback(func(alertID string) {
|
|
resolved = append(resolved, alertID)
|
|
select {
|
|
case resolvedDone <- struct{}{}:
|
|
default:
|
|
}
|
|
})
|
|
|
|
storage := models.Storage{
|
|
ID: storageID,
|
|
Name: "local-lvm",
|
|
Usage: 90.0,
|
|
Status: "available",
|
|
}
|
|
|
|
// Initial check should trigger an alert
|
|
m.CheckStorage(storage)
|
|
select {
|
|
case <-done:
|
|
case <-time.After(100 * time.Millisecond):
|
|
t.Fatalf("did not receive initial alert dispatch")
|
|
}
|
|
if len(dispatched) != 1 {
|
|
t.Fatalf("expected 1 alert before disabling storage, got %d", len(dispatched))
|
|
}
|
|
|
|
// Apply config with DisableAllStorage enabled
|
|
disabledConfig := initialConfig
|
|
disabledConfig.DisableAllStorage = true
|
|
m.UpdateConfig(disabledConfig)
|
|
m.mu.Lock()
|
|
m.config.TimeThreshold = 0
|
|
m.config.TimeThresholds = map[string]int{}
|
|
m.config.ActivationState = ActivationActive
|
|
m.mu.Unlock()
|
|
|
|
// Clear dispatched slice to capture only post-disable notifications
|
|
dispatched = dispatched[:0]
|
|
done = make(chan struct{}, 1)
|
|
|
|
// Re-run CheckStorage with high usage; no alert should be dispatched
|
|
m.CheckStorage(storage)
|
|
select {
|
|
case <-done:
|
|
t.Fatalf("expected no alerts after disabling all storage, but callback fired")
|
|
case <-time.After(100 * time.Millisecond):
|
|
// No callback fired as expected
|
|
}
|
|
|
|
// Active alerts should be cleared by reevaluateActiveAlertsLocked
|
|
m.mu.RLock()
|
|
activeCount := len(m.activeAlerts)
|
|
m.mu.RUnlock()
|
|
if activeCount != 0 {
|
|
t.Fatalf("expected active alerts to be cleared after disabling all storage, got %d", activeCount)
|
|
}
|
|
|
|
// Resolved callback should have fired
|
|
select {
|
|
case <-resolvedDone:
|
|
case <-time.After(100 * time.Millisecond):
|
|
t.Fatalf("expected resolved callback to fire after disabling all storage")
|
|
}
|
|
expectedAlertID := fmt.Sprintf("%s-usage", storageID)
|
|
if len(resolved) != 1 || resolved[0] != expectedAlertID {
|
|
t.Fatalf("expected resolved callback for %s, got %v", expectedAlertID, resolved)
|
|
}
|
|
|
|
// Pending alert should be cleared
|
|
m.mu.RLock()
|
|
_, isPending := m.pendingAlerts[expectedAlertID]
|
|
m.mu.RUnlock()
|
|
if isPending {
|
|
t.Fatalf("expected pending alert entry to be cleared after disabling all storage")
|
|
}
|
|
}
|
|
|
|
func TestUpdateConfigPreservesZeroDockerThresholds(t *testing.T) {
|
|
t.Helper()
|
|
|
|
m := newTestManager(t)
|
|
config := m.GetConfig()
|
|
config.DockerDefaults.Memory = HysteresisThreshold{Trigger: 0, Clear: 0}
|
|
|
|
m.UpdateConfig(config)
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
if m.config.DockerDefaults.Memory.Trigger != 0 {
|
|
t.Fatalf("expected docker memory trigger to remain 0 when disabled, got %.1f", m.config.DockerDefaults.Memory.Trigger)
|
|
}
|
|
if m.config.DockerDefaults.Memory.Clear != 0 {
|
|
t.Fatalf("expected docker memory clear to remain 0 when disabled, got %.1f", m.config.DockerDefaults.Memory.Clear)
|
|
}
|
|
}
|
|
|
|
func TestReevaluateClearsDockerContainerAlertWhenOverrideDisabled(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
resourceID := "docker:host-1/container-1"
|
|
alertID := resourceID + "-memory"
|
|
|
|
resolved := make(chan string, 1)
|
|
m.SetResolvedCallback(func(id string) {
|
|
resolved <- id
|
|
})
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts[alertID] = &Alert{
|
|
ID: alertID,
|
|
Type: "memory",
|
|
ResourceID: resourceID,
|
|
ResourceName: "qbittorrent",
|
|
Instance: "Docker",
|
|
Metadata: map[string]interface{}{
|
|
"resourceType": "Docker Container",
|
|
},
|
|
Threshold: 80,
|
|
Value: 90,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
config := m.GetConfig()
|
|
config.Overrides = map[string]ThresholdConfig{
|
|
resourceID: {
|
|
Disabled: true,
|
|
},
|
|
}
|
|
config.ActivationState = ActivationActive
|
|
|
|
m.UpdateConfig(config)
|
|
|
|
select {
|
|
case got := <-resolved:
|
|
if got != alertID {
|
|
t.Fatalf("resolved callback fired for unexpected alert %s", got)
|
|
}
|
|
case <-time.After(200 * time.Millisecond):
|
|
t.Fatalf("expected alert to be resolved when docker container override is disabled")
|
|
}
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts[alertID]
|
|
m.mu.RUnlock()
|
|
if exists {
|
|
t.Fatalf("expected docker container alert to be cleared when override is disabled")
|
|
}
|
|
}
|
|
|
|
func TestReevaluateClearsDockerContainerAlertWhenIgnoredPrefixAdded(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
resourceID := "docker:host-2/container-abc123"
|
|
alertID := resourceID + "-cpu"
|
|
|
|
resolved := make(chan string, 1)
|
|
m.SetResolvedCallback(func(id string) {
|
|
resolved <- id
|
|
})
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts[alertID] = &Alert{
|
|
ID: alertID,
|
|
Type: "cpu",
|
|
ResourceID: resourceID,
|
|
ResourceName: "qbittorrentvpn",
|
|
Instance: "Docker",
|
|
Metadata: map[string]interface{}{
|
|
"resourceType": "Docker Container",
|
|
"containerId": "abc123",
|
|
"containerName": "qbittorrentvpn",
|
|
},
|
|
Threshold: 80,
|
|
Value: 95,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
config := m.GetConfig()
|
|
config.DockerIgnoredContainerPrefixes = []string{"qbit"}
|
|
config.ActivationState = ActivationActive
|
|
|
|
m.UpdateConfig(config)
|
|
|
|
select {
|
|
case got := <-resolved:
|
|
if got != alertID {
|
|
t.Fatalf("resolved callback fired for unexpected alert %s", got)
|
|
}
|
|
case <-time.After(200 * time.Millisecond):
|
|
t.Fatalf("expected alert to be resolved after adding ignored prefix")
|
|
}
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts[alertID]
|
|
m.mu.RUnlock()
|
|
if exists {
|
|
t.Fatalf("expected docker container alert to be cleared when ignored prefix is configured")
|
|
}
|
|
}
|
|
|
|
func TestBuildGuestKey(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
instance string
|
|
node string
|
|
vmid int
|
|
want string
|
|
}{
|
|
{
|
|
name: "different instance and node",
|
|
instance: "cluster-1",
|
|
node: "pve-node",
|
|
vmid: 100,
|
|
want: "cluster-1:pve-node:100",
|
|
},
|
|
{
|
|
name: "same instance and node",
|
|
instance: "pve-node",
|
|
node: "pve-node",
|
|
vmid: 200,
|
|
want: "pve-node:pve-node:200",
|
|
},
|
|
{
|
|
name: "empty instance uses node",
|
|
instance: "",
|
|
node: "pve-node",
|
|
vmid: 300,
|
|
want: "pve-node:pve-node:300",
|
|
},
|
|
{
|
|
name: "whitespace instance uses node",
|
|
instance: " ",
|
|
node: "pve-node",
|
|
vmid: 400,
|
|
want: "pve-node:pve-node:400",
|
|
},
|
|
{
|
|
name: "instance with whitespace trimmed",
|
|
instance: " cluster-1 ",
|
|
node: "pve-node",
|
|
vmid: 500,
|
|
want: "cluster-1:pve-node:500",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
// t.Parallel()
|
|
got := BuildGuestKey(tt.instance, tt.node, tt.vmid)
|
|
if got != tt.want {
|
|
t.Errorf("BuildGuestKey(%q, %q, %d) = %q, want %q", tt.instance, tt.node, tt.vmid, got, tt.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCheckFlapping(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
flappingEnabled bool
|
|
threshold int
|
|
windowSeconds int
|
|
cooldownMinutes int
|
|
historyEntries int // number of state changes to simulate before the test call
|
|
expectFlapping bool
|
|
expectNewFlapping bool // should this trigger a new flapping detection (vs already flapping)
|
|
}{
|
|
{
|
|
name: "disabled returns false",
|
|
flappingEnabled: false,
|
|
threshold: 5,
|
|
windowSeconds: 300,
|
|
historyEntries: 10, // way over threshold
|
|
expectFlapping: false,
|
|
},
|
|
{
|
|
name: "below threshold returns false",
|
|
flappingEnabled: true,
|
|
threshold: 5,
|
|
windowSeconds: 300,
|
|
historyEntries: 2, // only 2 + 1 (test call) = 3 < 5
|
|
expectFlapping: false,
|
|
},
|
|
{
|
|
name: "at threshold triggers new flapping",
|
|
flappingEnabled: true,
|
|
threshold: 5,
|
|
windowSeconds: 300,
|
|
cooldownMinutes: 15,
|
|
historyEntries: 4, // 4 + 1 (test call) = 5 == threshold
|
|
expectFlapping: true,
|
|
expectNewFlapping: true,
|
|
},
|
|
{
|
|
name: "above threshold triggers flapping",
|
|
flappingEnabled: true,
|
|
threshold: 5,
|
|
windowSeconds: 300,
|
|
cooldownMinutes: 15,
|
|
historyEntries: 6, // 6 + 1 = 7 > 5
|
|
expectFlapping: true,
|
|
expectNewFlapping: true,
|
|
},
|
|
{
|
|
name: "single state change below threshold",
|
|
flappingEnabled: true,
|
|
threshold: 5,
|
|
windowSeconds: 300,
|
|
historyEntries: 0, // only the test call = 1 < 5
|
|
expectFlapping: false,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
m := newTestManager(t)
|
|
|
|
// Configure flapping settings
|
|
m.mu.Lock()
|
|
m.config.FlappingEnabled = tt.flappingEnabled
|
|
m.config.FlappingThreshold = tt.threshold
|
|
m.config.FlappingWindowSeconds = tt.windowSeconds
|
|
m.config.FlappingCooldownMinutes = tt.cooldownMinutes
|
|
|
|
alertID := "test-alert-" + tt.name
|
|
|
|
// Add history entries within the time window
|
|
now := time.Now()
|
|
for i := 0; i < tt.historyEntries; i++ {
|
|
m.flappingHistory[alertID] = append(m.flappingHistory[alertID], now.Add(-time.Duration(i)*time.Second))
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
// Call checkFlappingLocked
|
|
m.mu.Lock()
|
|
result := m.checkFlappingLocked(alertID)
|
|
m.mu.Unlock()
|
|
|
|
if result != tt.expectFlapping {
|
|
t.Errorf("checkFlappingLocked() = %v, want %v", result, tt.expectFlapping)
|
|
}
|
|
|
|
// Check if flapping was newly detected
|
|
m.mu.RLock()
|
|
isFlappingActive := m.flappingActive[alertID]
|
|
_, hasSuppression := m.suppressedUntil[alertID]
|
|
m.mu.RUnlock()
|
|
|
|
if tt.expectNewFlapping {
|
|
if !isFlappingActive {
|
|
t.Errorf("expected flappingActive[%s] to be true", alertID)
|
|
}
|
|
if !hasSuppression {
|
|
t.Errorf("expected suppressedUntil[%s] to be set", alertID)
|
|
}
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCheckFlappingAlreadyFlapping(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
m := newTestManager(t)
|
|
|
|
alertID := "already-flapping-alert"
|
|
|
|
m.mu.Lock()
|
|
m.config.FlappingEnabled = true
|
|
m.config.FlappingThreshold = 3
|
|
m.config.FlappingWindowSeconds = 300
|
|
m.config.FlappingCooldownMinutes = 15
|
|
|
|
// Pre-set flapping state
|
|
m.flappingActive[alertID] = true
|
|
existingSuppression := time.Now().Add(10 * time.Minute)
|
|
m.suppressedUntil[alertID] = existingSuppression
|
|
|
|
// Add history to exceed threshold
|
|
now := time.Now()
|
|
m.flappingHistory[alertID] = []time.Time{
|
|
now.Add(-10 * time.Second),
|
|
now.Add(-5 * time.Second),
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
// Call checkFlappingLocked - should return true but NOT update suppression
|
|
m.mu.Lock()
|
|
result := m.checkFlappingLocked(alertID)
|
|
m.mu.Unlock()
|
|
|
|
if !result {
|
|
t.Errorf("checkFlappingLocked() = false, want true for already flapping alert")
|
|
}
|
|
|
|
// Verify suppression time was NOT updated (existing suppression should remain)
|
|
m.mu.RLock()
|
|
currentSuppression := m.suppressedUntil[alertID]
|
|
m.mu.RUnlock()
|
|
|
|
if !currentSuppression.Equal(existingSuppression) {
|
|
t.Errorf("suppressedUntil was updated from %v to %v; should remain unchanged for already-flapping alert",
|
|
existingSuppression, currentSuppression)
|
|
}
|
|
}
|
|
|
|
func TestCheckFlappingWindowExpiry(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
m := newTestManager(t)
|
|
|
|
alertID := "window-expiry-alert"
|
|
|
|
m.mu.Lock()
|
|
m.config.FlappingEnabled = true
|
|
m.config.FlappingThreshold = 3
|
|
m.config.FlappingWindowSeconds = 60 // 1 minute window
|
|
|
|
// Add old history entries outside the window
|
|
now := time.Now()
|
|
m.flappingHistory[alertID] = []time.Time{
|
|
now.Add(-5 * time.Minute), // outside 1 minute window
|
|
now.Add(-4 * time.Minute), // outside 1 minute window
|
|
now.Add(-3 * time.Minute), // outside 1 minute window
|
|
now.Add(-2 * time.Minute), // outside 1 minute window
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
// Call checkFlappingLocked - old entries should be pruned
|
|
m.mu.Lock()
|
|
result := m.checkFlappingLocked(alertID)
|
|
historyLen := len(m.flappingHistory[alertID])
|
|
m.mu.Unlock()
|
|
|
|
if result {
|
|
t.Errorf("checkFlappingLocked() = true, want false (old entries should be pruned)")
|
|
}
|
|
|
|
// Only the current call should remain in history
|
|
if historyLen != 1 {
|
|
t.Errorf("history length = %d, want 1 (old entries should be pruned)", historyLen)
|
|
}
|
|
}
|
|
|
|
func TestGetGlobalMetricTimeThreshold(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
metricTimeThresholds map[string]map[string]int
|
|
metricType string
|
|
wantDelay int
|
|
wantFound bool
|
|
}{
|
|
{
|
|
name: "empty MetricTimeThresholds returns false",
|
|
metricTimeThresholds: nil,
|
|
metricType: "cpu",
|
|
wantDelay: 0,
|
|
wantFound: false,
|
|
},
|
|
{
|
|
name: "no all key returns false",
|
|
metricTimeThresholds: map[string]map[string]int{"specific": {"cpu": 60}},
|
|
metricType: "cpu",
|
|
wantDelay: 0,
|
|
wantFound: false,
|
|
},
|
|
{
|
|
name: "empty all map returns false",
|
|
metricTimeThresholds: map[string]map[string]int{"all": {}},
|
|
metricType: "cpu",
|
|
wantDelay: 0,
|
|
wantFound: false,
|
|
},
|
|
{
|
|
name: "empty metricType returns false",
|
|
metricTimeThresholds: map[string]map[string]int{"all": {"cpu": 60}},
|
|
metricType: "",
|
|
wantDelay: 0,
|
|
wantFound: false,
|
|
},
|
|
{
|
|
name: "whitespace metricType returns false",
|
|
metricTimeThresholds: map[string]map[string]int{"all": {"cpu": 60}},
|
|
metricType: " ",
|
|
wantDelay: 0,
|
|
wantFound: false,
|
|
},
|
|
{
|
|
name: "direct metric match",
|
|
metricTimeThresholds: map[string]map[string]int{"all": {"cpu": 120, "memory": 90}},
|
|
metricType: "cpu",
|
|
wantDelay: 120,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "metric match case insensitive",
|
|
metricTimeThresholds: map[string]map[string]int{"all": {"cpu": 120}},
|
|
metricType: "CPU",
|
|
wantDelay: 120,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "metric match with whitespace",
|
|
metricTimeThresholds: map[string]map[string]int{"all": {"cpu": 120}},
|
|
metricType: " cpu ",
|
|
wantDelay: 120,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "default fallback",
|
|
metricTimeThresholds: map[string]map[string]int{"all": {"default": 30}},
|
|
metricType: "unknown",
|
|
wantDelay: 30,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "_default fallback",
|
|
metricTimeThresholds: map[string]map[string]int{"all": {"_default": 45}},
|
|
metricType: "unknown",
|
|
wantDelay: 45,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "wildcard fallback",
|
|
metricTimeThresholds: map[string]map[string]int{"all": {"*": 15}},
|
|
metricType: "unknown",
|
|
wantDelay: 15,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "direct match takes precedence over default",
|
|
metricTimeThresholds: map[string]map[string]int{"all": {"cpu": 120, "default": 30}},
|
|
metricType: "cpu",
|
|
wantDelay: 120,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "no match and no fallback returns false",
|
|
metricTimeThresholds: map[string]map[string]int{"all": {"cpu": 120, "memory": 90}},
|
|
metricType: "disk",
|
|
wantDelay: 0,
|
|
wantFound: false,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.MetricTimeThresholds = tt.metricTimeThresholds
|
|
m.mu.Unlock()
|
|
|
|
m.mu.RLock()
|
|
gotDelay, gotFound := m.getGlobalMetricTimeThreshold(tt.metricType)
|
|
m.mu.RUnlock()
|
|
|
|
if gotDelay != tt.wantDelay {
|
|
t.Errorf("getGlobalMetricTimeThreshold() delay = %d, want %d", gotDelay, tt.wantDelay)
|
|
}
|
|
if gotFound != tt.wantFound {
|
|
t.Errorf("getGlobalMetricTimeThreshold() found = %v, want %v", gotFound, tt.wantFound)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestGetBaseTimeThreshold(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
timeThresholds map[string]int
|
|
timeThreshold int // global fallback
|
|
resourceType string
|
|
wantDelay int
|
|
wantFound bool
|
|
}{
|
|
{
|
|
name: "nil TimeThresholds returns global TimeThreshold",
|
|
timeThresholds: nil,
|
|
timeThreshold: 60,
|
|
resourceType: "guest",
|
|
wantDelay: 60,
|
|
wantFound: false,
|
|
},
|
|
{
|
|
name: "direct resource type match",
|
|
timeThresholds: map[string]int{"guest": 120, "node": 90},
|
|
timeThreshold: 60,
|
|
resourceType: "guest",
|
|
wantDelay: 120,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "canonical key match for vm",
|
|
timeThresholds: map[string]int{"guest": 120},
|
|
timeThreshold: 60,
|
|
resourceType: "vm",
|
|
wantDelay: 120,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "canonical key match for container",
|
|
timeThresholds: map[string]int{"guest": 120},
|
|
timeThreshold: 60,
|
|
resourceType: "container",
|
|
wantDelay: 120,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "all fallback when no specific match",
|
|
timeThresholds: map[string]int{"all": 45},
|
|
timeThreshold: 60,
|
|
resourceType: "storage",
|
|
wantDelay: 45,
|
|
wantFound: false, // "all" returns found=false
|
|
},
|
|
{
|
|
name: "specific match takes precedence over all",
|
|
timeThresholds: map[string]int{"storage": 30, "all": 45},
|
|
timeThreshold: 60,
|
|
resourceType: "storage",
|
|
wantDelay: 30,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "no match and no all returns global threshold",
|
|
timeThresholds: map[string]int{"guest": 120},
|
|
timeThreshold: 60,
|
|
resourceType: "storage",
|
|
wantDelay: 60,
|
|
wantFound: false,
|
|
},
|
|
{
|
|
name: "empty TimeThresholds returns global threshold",
|
|
timeThresholds: map[string]int{},
|
|
timeThreshold: 60,
|
|
resourceType: "guest",
|
|
wantDelay: 60,
|
|
wantFound: false,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.TimeThresholds = tt.timeThresholds
|
|
m.config.TimeThreshold = tt.timeThreshold
|
|
m.mu.Unlock()
|
|
|
|
m.mu.RLock()
|
|
gotDelay, gotFound := m.getBaseTimeThreshold(tt.resourceType)
|
|
m.mu.RUnlock()
|
|
|
|
if gotDelay != tt.wantDelay {
|
|
t.Errorf("getBaseTimeThreshold() delay = %d, want %d", gotDelay, tt.wantDelay)
|
|
}
|
|
if gotFound != tt.wantFound {
|
|
t.Errorf("getBaseTimeThreshold() found = %v, want %v", gotFound, tt.wantFound)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestGetMetricTimeThreshold(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
metricTimeThresholds map[string]map[string]int
|
|
resourceType string
|
|
metricType string
|
|
wantDelay int
|
|
wantFound bool
|
|
}{
|
|
{
|
|
name: "empty MetricTimeThresholds returns false",
|
|
metricTimeThresholds: nil,
|
|
resourceType: "guest",
|
|
metricType: "cpu",
|
|
wantDelay: 0,
|
|
wantFound: false,
|
|
},
|
|
{
|
|
name: "empty metricType returns false",
|
|
metricTimeThresholds: map[string]map[string]int{"guest": {"cpu": 60}},
|
|
resourceType: "guest",
|
|
metricType: "",
|
|
wantDelay: 0,
|
|
wantFound: false,
|
|
},
|
|
{
|
|
name: "whitespace metricType returns false",
|
|
metricTimeThresholds: map[string]map[string]int{"guest": {"cpu": 60}},
|
|
resourceType: "guest",
|
|
metricType: " ",
|
|
wantDelay: 0,
|
|
wantFound: false,
|
|
},
|
|
{
|
|
name: "direct match on resourceType and metricType",
|
|
metricTimeThresholds: map[string]map[string]int{"guest": {"cpu": 120, "memory": 90}},
|
|
resourceType: "guest",
|
|
metricType: "cpu",
|
|
wantDelay: 120,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "canonical key match vm to guest",
|
|
metricTimeThresholds: map[string]map[string]int{"guest": {"cpu": 120}},
|
|
resourceType: "vm",
|
|
metricType: "cpu",
|
|
wantDelay: 120,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "canonical key match container to guest",
|
|
metricTimeThresholds: map[string]map[string]int{"guest": {"memory": 90}},
|
|
resourceType: "container",
|
|
metricType: "memory",
|
|
wantDelay: 90,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "default fallback within resourceType",
|
|
metricTimeThresholds: map[string]map[string]int{"guest": {"default": 30}},
|
|
resourceType: "guest",
|
|
metricType: "unknown",
|
|
wantDelay: 30,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "_default fallback within resourceType",
|
|
metricTimeThresholds: map[string]map[string]int{"guest": {"_default": 45}},
|
|
resourceType: "guest",
|
|
metricType: "unknown",
|
|
wantDelay: 45,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "wildcard fallback within resourceType",
|
|
metricTimeThresholds: map[string]map[string]int{"guest": {"*": 15}},
|
|
resourceType: "guest",
|
|
metricType: "unknown",
|
|
wantDelay: 15,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "direct match takes precedence over default",
|
|
metricTimeThresholds: map[string]map[string]int{"guest": {"cpu": 120, "default": 30}},
|
|
resourceType: "guest",
|
|
metricType: "cpu",
|
|
wantDelay: 120,
|
|
wantFound: true,
|
|
},
|
|
{
|
|
name: "no match for resourceType returns false",
|
|
metricTimeThresholds: map[string]map[string]int{"node": {"cpu": 60}},
|
|
resourceType: "guest",
|
|
metricType: "cpu",
|
|
wantDelay: 0,
|
|
wantFound: false,
|
|
},
|
|
{
|
|
name: "empty perType map skipped",
|
|
metricTimeThresholds: map[string]map[string]int{"guest": {}},
|
|
resourceType: "guest",
|
|
metricType: "cpu",
|
|
wantDelay: 0,
|
|
wantFound: false,
|
|
},
|
|
{
|
|
name: "metricType case insensitive",
|
|
metricTimeThresholds: map[string]map[string]int{"guest": {"cpu": 120}},
|
|
resourceType: "guest",
|
|
metricType: "CPU",
|
|
wantDelay: 120,
|
|
wantFound: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.MetricTimeThresholds = tt.metricTimeThresholds
|
|
m.mu.Unlock()
|
|
|
|
m.mu.RLock()
|
|
gotDelay, gotFound := m.getMetricTimeThreshold(tt.resourceType, tt.metricType)
|
|
m.mu.RUnlock()
|
|
|
|
if gotDelay != tt.wantDelay {
|
|
t.Errorf("getMetricTimeThreshold() delay = %d, want %d", gotDelay, tt.wantDelay)
|
|
}
|
|
if gotFound != tt.wantFound {
|
|
t.Errorf("getMetricTimeThreshold() found = %v, want %v", gotFound, tt.wantFound)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCheckRateLimit(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("no rate limit when MaxAlertsHour is zero", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Schedule.MaxAlertsHour = 0
|
|
m.mu.Unlock()
|
|
|
|
m.mu.Lock()
|
|
result := m.checkRateLimit("test-alert")
|
|
m.mu.Unlock()
|
|
|
|
if !result {
|
|
t.Errorf("checkRateLimit() = false, want true when MaxAlertsHour is 0")
|
|
}
|
|
})
|
|
|
|
t.Run("no rate limit when MaxAlertsHour is negative", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Schedule.MaxAlertsHour = -1
|
|
m.mu.Unlock()
|
|
|
|
m.mu.Lock()
|
|
result := m.checkRateLimit("test-alert")
|
|
m.mu.Unlock()
|
|
|
|
if !result {
|
|
t.Errorf("checkRateLimit() = false, want true when MaxAlertsHour is negative")
|
|
}
|
|
})
|
|
|
|
t.Run("allows alerts under rate limit", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Schedule.MaxAlertsHour = 5
|
|
m.mu.Unlock()
|
|
|
|
// First 5 alerts should be allowed
|
|
for i := 0; i < 5; i++ {
|
|
m.mu.Lock()
|
|
result := m.checkRateLimit("test-alert")
|
|
m.mu.Unlock()
|
|
|
|
if !result {
|
|
t.Errorf("checkRateLimit() call %d = false, want true (under limit)", i+1)
|
|
}
|
|
}
|
|
})
|
|
|
|
t.Run("blocks alerts at rate limit", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Schedule.MaxAlertsHour = 3
|
|
m.mu.Unlock()
|
|
|
|
// Use up the rate limit
|
|
for i := 0; i < 3; i++ {
|
|
m.mu.Lock()
|
|
_ = m.checkRateLimit("test-alert")
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
// Fourth alert should be blocked
|
|
m.mu.Lock()
|
|
result := m.checkRateLimit("test-alert")
|
|
m.mu.Unlock()
|
|
|
|
if result {
|
|
t.Errorf("checkRateLimit() = true, want false (at rate limit)")
|
|
}
|
|
})
|
|
|
|
t.Run("different alert IDs have separate limits", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Schedule.MaxAlertsHour = 2
|
|
m.mu.Unlock()
|
|
|
|
// Use up limit for alert-1
|
|
for i := 0; i < 2; i++ {
|
|
m.mu.Lock()
|
|
_ = m.checkRateLimit("alert-1")
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
// alert-2 should still be allowed
|
|
m.mu.Lock()
|
|
result := m.checkRateLimit("alert-2")
|
|
m.mu.Unlock()
|
|
|
|
if !result {
|
|
t.Errorf("checkRateLimit(alert-2) = false, want true (separate limit)")
|
|
}
|
|
})
|
|
|
|
t.Run("old entries are cleaned up", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Schedule.MaxAlertsHour = 2
|
|
|
|
// Pre-populate with old entries (more than 1 hour ago)
|
|
oldTime := time.Now().Add(-2 * time.Hour)
|
|
m.alertRateLimit["test-alert"] = []time.Time{oldTime, oldTime}
|
|
m.mu.Unlock()
|
|
|
|
// Should be allowed because old entries are cleaned up
|
|
m.mu.Lock()
|
|
result := m.checkRateLimit("test-alert")
|
|
m.mu.Unlock()
|
|
|
|
if !result {
|
|
t.Errorf("checkRateLimit() = false, want true (old entries should be cleaned)")
|
|
}
|
|
})
|
|
|
|
t.Run("mixed old and recent entries", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Schedule.MaxAlertsHour = 2
|
|
|
|
// Pre-populate with 1 old and 1 recent entry
|
|
oldTime := time.Now().Add(-2 * time.Hour)
|
|
recentTime := time.Now().Add(-30 * time.Minute)
|
|
m.alertRateLimit["test-alert"] = []time.Time{oldTime, recentTime}
|
|
m.mu.Unlock()
|
|
|
|
// First call should be allowed (1 recent + 1 new = 2)
|
|
m.mu.Lock()
|
|
result1 := m.checkRateLimit("test-alert")
|
|
m.mu.Unlock()
|
|
|
|
if !result1 {
|
|
t.Errorf("checkRateLimit() call 1 = false, want true")
|
|
}
|
|
|
|
// Second call should be blocked (2 recent + 1 new would exceed 2)
|
|
m.mu.Lock()
|
|
result2 := m.checkRateLimit("test-alert")
|
|
m.mu.Unlock()
|
|
|
|
if result2 {
|
|
t.Errorf("checkRateLimit() call 2 = true, want false (at limit)")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestApplyRelaxedGuestThresholds(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("nil thresholds get defaults", func(t *testing.T) {
|
|
// t.Parallel()
|
|
cfg := ThresholdConfig{
|
|
CPU: nil,
|
|
Memory: nil,
|
|
Disk: nil,
|
|
}
|
|
|
|
result := applyRelaxedGuestThresholds(cfg)
|
|
|
|
if result.CPU == nil {
|
|
t.Fatal("expected CPU threshold to be set")
|
|
}
|
|
if result.CPU.Trigger != 95 {
|
|
t.Errorf("CPU.Trigger = %v, want 95", result.CPU.Trigger)
|
|
}
|
|
if result.CPU.Clear != 90 {
|
|
t.Errorf("CPU.Clear = %v, want 90", result.CPU.Clear)
|
|
}
|
|
|
|
if result.Memory == nil {
|
|
t.Fatal("expected Memory threshold to be set")
|
|
}
|
|
if result.Memory.Trigger != 92 {
|
|
t.Errorf("Memory.Trigger = %v, want 92", result.Memory.Trigger)
|
|
}
|
|
|
|
if result.Disk == nil {
|
|
t.Fatal("expected Disk threshold to be set")
|
|
}
|
|
if result.Disk.Trigger != 95 {
|
|
t.Errorf("Disk.Trigger = %v, want 95", result.Disk.Trigger)
|
|
}
|
|
})
|
|
|
|
t.Run("low thresholds raised to minimum", func(t *testing.T) {
|
|
// t.Parallel()
|
|
cfg := ThresholdConfig{
|
|
CPU: &HysteresisThreshold{Trigger: 50, Clear: 45},
|
|
Memory: &HysteresisThreshold{Trigger: 60, Clear: 55},
|
|
Disk: &HysteresisThreshold{Trigger: 70, Clear: 65},
|
|
}
|
|
|
|
result := applyRelaxedGuestThresholds(cfg)
|
|
|
|
if result.CPU.Trigger != 95 {
|
|
t.Errorf("CPU.Trigger = %v, want 95 (raised to minimum)", result.CPU.Trigger)
|
|
}
|
|
if result.Memory.Trigger != 92 {
|
|
t.Errorf("Memory.Trigger = %v, want 92 (raised to minimum)", result.Memory.Trigger)
|
|
}
|
|
if result.Disk.Trigger != 95 {
|
|
t.Errorf("Disk.Trigger = %v, want 95 (raised to minimum)", result.Disk.Trigger)
|
|
}
|
|
})
|
|
|
|
t.Run("high thresholds unchanged", func(t *testing.T) {
|
|
// t.Parallel()
|
|
cfg := ThresholdConfig{
|
|
CPU: &HysteresisThreshold{Trigger: 98, Clear: 93},
|
|
Memory: &HysteresisThreshold{Trigger: 95, Clear: 90},
|
|
Disk: &HysteresisThreshold{Trigger: 99, Clear: 94},
|
|
}
|
|
|
|
result := applyRelaxedGuestThresholds(cfg)
|
|
|
|
if result.CPU.Trigger != 98 {
|
|
t.Errorf("CPU.Trigger = %v, want 98 (unchanged)", result.CPU.Trigger)
|
|
}
|
|
if result.Memory.Trigger != 95 {
|
|
t.Errorf("Memory.Trigger = %v, want 95 (unchanged)", result.Memory.Trigger)
|
|
}
|
|
if result.Disk.Trigger != 99 {
|
|
t.Errorf("Disk.Trigger = %v, want 99 (unchanged)", result.Disk.Trigger)
|
|
}
|
|
})
|
|
|
|
t.Run("clear adjusted when too close to trigger", func(t *testing.T) {
|
|
// t.Parallel()
|
|
cfg := ThresholdConfig{
|
|
CPU: &HysteresisThreshold{Trigger: 95, Clear: 96}, // Clear >= Trigger
|
|
}
|
|
|
|
result := applyRelaxedGuestThresholds(cfg)
|
|
|
|
if result.CPU.Clear >= result.CPU.Trigger {
|
|
t.Errorf("CPU.Clear = %v should be less than Trigger = %v", result.CPU.Clear, result.CPU.Trigger)
|
|
}
|
|
if result.CPU.Clear != 90 {
|
|
t.Errorf("CPU.Clear = %v, want 90 (Trigger - 5)", result.CPU.Clear)
|
|
}
|
|
})
|
|
|
|
t.Run("clear clamped at zero when it would go negative", func(t *testing.T) {
|
|
// t.Parallel()
|
|
// Create a threshold where Trigger is above min but Clear would go negative
|
|
// The adjust function sets Clear = Trigger - 5 if Clear >= Trigger
|
|
// Then clamps to 0 if Clear < 0
|
|
// Since all triggers get raised to 95/92/95, the negative clamp path
|
|
// won't be hit in normal use. Test the logic directly with a config
|
|
// that has Trigger exactly at minimum and Clear at minimum
|
|
cfg := ThresholdConfig{
|
|
CPU: &HysteresisThreshold{Trigger: 95, Clear: 3},
|
|
}
|
|
|
|
result := applyRelaxedGuestThresholds(cfg)
|
|
|
|
// Clear at 3 is valid (less than Trigger 95), should stay at 3
|
|
if result.CPU.Trigger != 95 {
|
|
t.Errorf("CPU.Trigger = %v, want 95", result.CPU.Trigger)
|
|
}
|
|
if result.CPU.Clear != 3 {
|
|
t.Errorf("CPU.Clear = %v, want 3 (unchanged since < Trigger)", result.CPU.Clear)
|
|
}
|
|
})
|
|
|
|
t.Run("original config unchanged", func(t *testing.T) {
|
|
// t.Parallel()
|
|
original := ThresholdConfig{
|
|
CPU: &HysteresisThreshold{Trigger: 50, Clear: 45},
|
|
}
|
|
|
|
_ = applyRelaxedGuestThresholds(original)
|
|
|
|
// Original should be unchanged
|
|
if original.CPU.Trigger != 50 {
|
|
t.Errorf("original CPU.Trigger = %v, want 50 (should be unchanged)", original.CPU.Trigger)
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestShouldNotifyAfterCooldown(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("cooldown disabled allows notification", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Schedule.Cooldown = 0
|
|
m.mu.Unlock()
|
|
|
|
alert := &Alert{
|
|
ID: "test-alert",
|
|
LastNotified: nil,
|
|
}
|
|
|
|
if !m.shouldNotifyAfterCooldown(alert) {
|
|
t.Error("expected true when cooldown is 0")
|
|
}
|
|
})
|
|
|
|
t.Run("negative cooldown allows notification", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Schedule.Cooldown = -5
|
|
m.mu.Unlock()
|
|
|
|
now := time.Now()
|
|
alert := &Alert{
|
|
ID: "test-alert",
|
|
LastNotified: &now,
|
|
}
|
|
|
|
if !m.shouldNotifyAfterCooldown(alert) {
|
|
t.Error("expected true when cooldown is negative")
|
|
}
|
|
})
|
|
|
|
t.Run("first notification allowed when never notified", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Schedule.Cooldown = 30 // 30 minutes
|
|
m.mu.Unlock()
|
|
|
|
alert := &Alert{
|
|
ID: "test-alert",
|
|
LastNotified: nil,
|
|
}
|
|
|
|
if !m.shouldNotifyAfterCooldown(alert) {
|
|
t.Error("expected true when alert has never been notified")
|
|
}
|
|
})
|
|
|
|
t.Run("notification blocked during cooldown period", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Schedule.Cooldown = 30 // 30 minutes
|
|
m.mu.Unlock()
|
|
|
|
lastNotified := time.Now().Add(-10 * time.Minute) // Notified 10 minutes ago
|
|
alert := &Alert{
|
|
ID: "test-alert",
|
|
LastNotified: &lastNotified,
|
|
}
|
|
|
|
if m.shouldNotifyAfterCooldown(alert) {
|
|
t.Error("expected false when still in cooldown period")
|
|
}
|
|
})
|
|
|
|
t.Run("notification allowed after cooldown expires", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Schedule.Cooldown = 30 // 30 minutes
|
|
m.mu.Unlock()
|
|
|
|
lastNotified := time.Now().Add(-45 * time.Minute) // Notified 45 minutes ago
|
|
alert := &Alert{
|
|
ID: "test-alert",
|
|
LastNotified: &lastNotified,
|
|
}
|
|
|
|
if !m.shouldNotifyAfterCooldown(alert) {
|
|
t.Error("expected true after cooldown period expires")
|
|
}
|
|
})
|
|
|
|
t.Run("notification allowed at exact cooldown boundary", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Schedule.Cooldown = 30 // 30 minutes
|
|
m.mu.Unlock()
|
|
|
|
lastNotified := time.Now().Add(-30 * time.Minute) // Exactly 30 minutes ago
|
|
alert := &Alert{
|
|
ID: "test-alert",
|
|
LastNotified: &lastNotified,
|
|
}
|
|
|
|
if !m.shouldNotifyAfterCooldown(alert) {
|
|
t.Error("expected true at exact cooldown boundary (>=)")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestDockerServiceDisplayName(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
service models.DockerService
|
|
expected string
|
|
}{
|
|
{
|
|
name: "returns name when present",
|
|
service: models.DockerService{Name: "my-service", ID: "abc123456789xyz"},
|
|
expected: "my-service",
|
|
},
|
|
{
|
|
name: "returns trimmed name",
|
|
service: models.DockerService{Name: " my-service ", ID: "abc123456789xyz"},
|
|
expected: "my-service",
|
|
},
|
|
{
|
|
name: "returns truncated ID when name is empty",
|
|
service: models.DockerService{Name: "", ID: "abc123456789xyz"},
|
|
expected: "abc123456789",
|
|
},
|
|
{
|
|
name: "returns full short ID when less than 12 chars",
|
|
service: models.DockerService{Name: "", ID: "abc123"},
|
|
expected: "abc123",
|
|
},
|
|
{
|
|
name: "returns trimmed ID",
|
|
service: models.DockerService{Name: "", ID: " abc123456789xyz "},
|
|
expected: "abc123456789",
|
|
},
|
|
{
|
|
name: "returns 'service' when both name and ID empty",
|
|
service: models.DockerService{Name: "", ID: ""},
|
|
expected: "service",
|
|
},
|
|
{
|
|
name: "returns 'service' when both whitespace only",
|
|
service: models.DockerService{Name: " ", ID: " "},
|
|
expected: "service",
|
|
},
|
|
{
|
|
name: "prefers name over ID",
|
|
service: models.DockerService{Name: "preferred", ID: "not-this-id"},
|
|
expected: "preferred",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
// t.Parallel()
|
|
result := dockerServiceDisplayName(tt.service)
|
|
if result != tt.expected {
|
|
t.Errorf("dockerServiceDisplayName(%+v) = %q, want %q", tt.service, result, tt.expected)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestDockerServiceResourceID(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
hostID string
|
|
serviceID string
|
|
serviceName string
|
|
expected string
|
|
}{
|
|
{
|
|
name: "with host and service ID",
|
|
hostID: "host-1",
|
|
serviceID: "svc-123",
|
|
serviceName: "my-service",
|
|
expected: "docker:host-1/service/svc-123",
|
|
},
|
|
{
|
|
name: "without host ID uses service prefix only",
|
|
hostID: "",
|
|
serviceID: "svc-123",
|
|
serviceName: "my-service",
|
|
expected: "docker-service:svc-123",
|
|
},
|
|
{
|
|
name: "whitespace host ID treated as empty",
|
|
hostID: " ",
|
|
serviceID: "svc-123",
|
|
serviceName: "my-service",
|
|
expected: "docker-service:svc-123",
|
|
},
|
|
{
|
|
name: "derives ID from service name when ID empty",
|
|
hostID: "host-1",
|
|
serviceID: "",
|
|
serviceName: "My Service",
|
|
expected: "docker:host-1/service/my-service",
|
|
},
|
|
{
|
|
name: "special chars in name replaced with dash",
|
|
hostID: "host-1",
|
|
serviceID: "",
|
|
serviceName: "my/service:v1.0",
|
|
expected: "docker:host-1/service/my-service-v1-0",
|
|
},
|
|
{
|
|
name: "backslash and colon replaced",
|
|
hostID: "host-1",
|
|
serviceID: "",
|
|
serviceName: "path\\to:service",
|
|
expected: "docker:host-1/service/path-to-service",
|
|
},
|
|
{
|
|
name: "preserves alphanumeric and underscore",
|
|
hostID: "host-1",
|
|
serviceID: "",
|
|
serviceName: "my_service_123",
|
|
expected: "docker:host-1/service/my_service_123",
|
|
},
|
|
{
|
|
name: "preserves hyphens",
|
|
hostID: "host-1",
|
|
serviceID: "",
|
|
serviceName: "my-service-name",
|
|
expected: "docker:host-1/service/my-service-name",
|
|
},
|
|
{
|
|
name: "trims leading/trailing dashes and underscores",
|
|
hostID: "host-1",
|
|
serviceID: "",
|
|
serviceName: "---my-service___",
|
|
expected: "docker:host-1/service/my-service",
|
|
},
|
|
{
|
|
name: "truncates long derived ID to 32 chars",
|
|
hostID: "host-1",
|
|
serviceID: "",
|
|
serviceName: "this-is-a-very-long-service-name-that-exceeds-the-limit",
|
|
expected: "docker:host-1/service/this-is-a-very-long-service-name",
|
|
},
|
|
{
|
|
name: "uses 'service' when name is all special chars",
|
|
hostID: "host-1",
|
|
serviceID: "",
|
|
serviceName: "!!!@@@###",
|
|
expected: "docker:host-1/service/service",
|
|
},
|
|
{
|
|
name: "uses 'service' when both ID and name empty",
|
|
hostID: "host-1",
|
|
serviceID: "",
|
|
serviceName: "",
|
|
expected: "docker:host-1/service/service",
|
|
},
|
|
{
|
|
name: "uses 'service' when both ID and name whitespace",
|
|
hostID: "host-1",
|
|
serviceID: " ",
|
|
serviceName: " ",
|
|
expected: "docker:host-1/service/service",
|
|
},
|
|
{
|
|
name: "no host and derived name",
|
|
hostID: "",
|
|
serviceID: "",
|
|
serviceName: "webserver",
|
|
expected: "docker-service:webserver",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
// t.Parallel()
|
|
result := dockerServiceResourceID(tt.hostID, tt.serviceID, tt.serviceName)
|
|
if result != tt.expected {
|
|
t.Errorf("dockerServiceResourceID(%q, %q, %q) = %q, want %q",
|
|
tt.hostID, tt.serviceID, tt.serviceName, result, tt.expected)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestClearStorageOfflineAlert(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("clears existing offline alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "storage-1",
|
|
Name: "local-lvm",
|
|
Node: "pve1",
|
|
}
|
|
|
|
alertID := fmt.Sprintf("storage-offline-%s", storage.ID)
|
|
|
|
// Create an existing offline alert
|
|
m.mu.Lock()
|
|
m.activeAlerts[alertID] = &Alert{
|
|
ID: alertID,
|
|
Type: "storage-offline",
|
|
Level: "critical",
|
|
StartTime: time.Now().Add(-10 * time.Minute),
|
|
}
|
|
m.offlineConfirmations[storage.ID] = 3
|
|
m.mu.Unlock()
|
|
|
|
resolvedCh := make(chan string, 1)
|
|
m.SetResolvedCallback(func(id string) {
|
|
resolvedCh <- id
|
|
})
|
|
|
|
m.clearStorageOfflineAlert(storage)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts[alertID]
|
|
_, confirmExists := m.offlineConfirmations[storage.ID]
|
|
m.mu.RUnlock()
|
|
|
|
if alertExists {
|
|
t.Error("expected alert to be cleared")
|
|
}
|
|
if confirmExists {
|
|
t.Error("expected offline confirmation to be cleared")
|
|
}
|
|
select {
|
|
case resolvedID := <-resolvedCh:
|
|
if resolvedID != alertID {
|
|
t.Errorf("expected resolved callback with %q, got %q", alertID, resolvedID)
|
|
}
|
|
case <-time.After(2 * time.Second):
|
|
t.Error("expected resolved callback to be called")
|
|
}
|
|
})
|
|
|
|
t.Run("noop when no alert exists", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "storage-2",
|
|
Name: "local-zfs",
|
|
Node: "pve1",
|
|
}
|
|
|
|
var callbackCalled bool
|
|
m.SetResolvedCallback(func(id string) {
|
|
callbackCalled = true
|
|
})
|
|
|
|
m.clearStorageOfflineAlert(storage)
|
|
|
|
if callbackCalled {
|
|
t.Error("expected no callback when no alert exists")
|
|
}
|
|
})
|
|
|
|
t.Run("clears offline confirmation even when no alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "storage-3",
|
|
Name: "ceph-pool",
|
|
Node: "pve2",
|
|
}
|
|
|
|
// Set confirmation without alert
|
|
m.mu.Lock()
|
|
m.offlineConfirmations[storage.ID] = 2
|
|
m.mu.Unlock()
|
|
|
|
m.clearStorageOfflineAlert(storage)
|
|
|
|
m.mu.RLock()
|
|
_, confirmExists := m.offlineConfirmations[storage.ID]
|
|
m.mu.RUnlock()
|
|
|
|
if confirmExists {
|
|
t.Error("expected offline confirmation to be cleared")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestClearHostMetricAlerts(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("clears specified metrics", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
hostID := "my-host"
|
|
resourceID := fmt.Sprintf("host:%s", hostID)
|
|
|
|
// Create alerts for cpu and memory
|
|
m.mu.Lock()
|
|
m.activeAlerts[fmt.Sprintf("%s-cpu", resourceID)] = &Alert{ID: fmt.Sprintf("%s-cpu", resourceID)}
|
|
m.activeAlerts[fmt.Sprintf("%s-memory", resourceID)] = &Alert{ID: fmt.Sprintf("%s-memory", resourceID)}
|
|
m.activeAlerts[fmt.Sprintf("%s-disk", resourceID)] = &Alert{ID: fmt.Sprintf("%s-disk", resourceID)}
|
|
m.mu.Unlock()
|
|
|
|
m.clearHostMetricAlerts(hostID, "cpu", "disk")
|
|
|
|
m.mu.RLock()
|
|
_, cpuExists := m.activeAlerts[fmt.Sprintf("%s-cpu", resourceID)]
|
|
_, memExists := m.activeAlerts[fmt.Sprintf("%s-memory", resourceID)]
|
|
_, diskExists := m.activeAlerts[fmt.Sprintf("%s-disk", resourceID)]
|
|
m.mu.RUnlock()
|
|
|
|
if cpuExists {
|
|
t.Error("expected cpu alert to be cleared")
|
|
}
|
|
if !memExists {
|
|
t.Error("expected memory alert to remain (not specified)")
|
|
}
|
|
if diskExists {
|
|
t.Error("expected disk alert to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("defaults to cpu and memory when no metrics specified", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
hostID := "default-host"
|
|
resourceID := fmt.Sprintf("host:%s", hostID)
|
|
|
|
// Create alerts
|
|
m.mu.Lock()
|
|
m.activeAlerts[fmt.Sprintf("%s-cpu", resourceID)] = &Alert{ID: fmt.Sprintf("%s-cpu", resourceID)}
|
|
m.activeAlerts[fmt.Sprintf("%s-memory", resourceID)] = &Alert{ID: fmt.Sprintf("%s-memory", resourceID)}
|
|
m.activeAlerts[fmt.Sprintf("%s-disk", resourceID)] = &Alert{ID: fmt.Sprintf("%s-disk", resourceID)}
|
|
m.mu.Unlock()
|
|
|
|
m.clearHostMetricAlerts(hostID) // No metrics specified
|
|
|
|
m.mu.RLock()
|
|
_, cpuExists := m.activeAlerts[fmt.Sprintf("%s-cpu", resourceID)]
|
|
_, memExists := m.activeAlerts[fmt.Sprintf("%s-memory", resourceID)]
|
|
_, diskExists := m.activeAlerts[fmt.Sprintf("%s-disk", resourceID)]
|
|
m.mu.RUnlock()
|
|
|
|
if cpuExists {
|
|
t.Error("expected cpu alert to be cleared (default)")
|
|
}
|
|
if memExists {
|
|
t.Error("expected memory alert to be cleared (default)")
|
|
}
|
|
if !diskExists {
|
|
t.Error("expected disk alert to remain (not in defaults)")
|
|
}
|
|
})
|
|
|
|
t.Run("empty hostID is noop", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Create an alert that should not be touched
|
|
m.mu.Lock()
|
|
m.activeAlerts["host:unknown-cpu"] = &Alert{ID: "host:unknown-cpu"}
|
|
m.mu.Unlock()
|
|
|
|
m.clearHostMetricAlerts("", "cpu")
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["host:unknown-cpu"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Error("expected alert to remain when hostID is empty")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestClearHostDiskAlerts(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("clears all disk alerts for host", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
hostID := "disk-host"
|
|
resourceID := fmt.Sprintf("host:%s", hostID)
|
|
|
|
// Create disk alerts with the expected ResourceID format
|
|
m.mu.Lock()
|
|
m.activeAlerts["disk1-alert"] = &Alert{
|
|
ID: "disk1-alert",
|
|
ResourceID: fmt.Sprintf("%s/disk:sda", resourceID),
|
|
}
|
|
m.activeAlerts["disk2-alert"] = &Alert{
|
|
ID: "disk2-alert",
|
|
ResourceID: fmt.Sprintf("%s/disk:sdb", resourceID),
|
|
}
|
|
m.activeAlerts["cpu-alert"] = &Alert{
|
|
ID: "cpu-alert",
|
|
ResourceID: fmt.Sprintf("%s-cpu", resourceID),
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.clearHostDiskAlerts(hostID)
|
|
|
|
m.mu.RLock()
|
|
_, disk1Exists := m.activeAlerts["disk1-alert"]
|
|
_, disk2Exists := m.activeAlerts["disk2-alert"]
|
|
_, cpuExists := m.activeAlerts["cpu-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if disk1Exists {
|
|
t.Error("expected disk1 alert to be cleared")
|
|
}
|
|
if disk2Exists {
|
|
t.Error("expected disk2 alert to be cleared")
|
|
}
|
|
if !cpuExists {
|
|
t.Error("expected cpu alert to remain (not a disk alert)")
|
|
}
|
|
})
|
|
|
|
t.Run("empty hostID is noop", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Create an alert
|
|
m.mu.Lock()
|
|
m.activeAlerts["disk-alert"] = &Alert{
|
|
ID: "disk-alert",
|
|
ResourceID: "host:unknown/disk:sda",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.clearHostDiskAlerts("")
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["disk-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Error("expected alert to remain when hostID is empty")
|
|
}
|
|
})
|
|
|
|
t.Run("skips nil alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
hostID := "nil-test"
|
|
resourceID := fmt.Sprintf("host:%s", hostID)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["nil-alert"] = nil
|
|
m.activeAlerts["real-alert"] = &Alert{
|
|
ID: "real-alert",
|
|
ResourceID: fmt.Sprintf("%s/disk:sda", resourceID),
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
// Should not panic
|
|
m.clearHostDiskAlerts(hostID)
|
|
|
|
m.mu.RLock()
|
|
_, realExists := m.activeAlerts["real-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if realExists {
|
|
t.Error("expected real alert to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("noop when no matching alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["other-alert"] = &Alert{
|
|
ID: "other-alert",
|
|
ResourceID: "host:other-host/disk:sda",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.clearHostDiskAlerts("my-host")
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["other-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Error("expected other host's alert to remain")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestCleanupHostDiskAlerts(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("clears alerts not in seen set", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
host := models.Host{ID: "host-1"}
|
|
resourceID := fmt.Sprintf("host:%s", host.ID)
|
|
|
|
// Create disk alerts
|
|
m.mu.Lock()
|
|
m.activeAlerts["disk-sda"] = &Alert{
|
|
ID: "disk-sda",
|
|
ResourceID: fmt.Sprintf("%s/disk:sda", resourceID),
|
|
}
|
|
m.activeAlerts["disk-sdb"] = &Alert{
|
|
ID: "disk-sdb",
|
|
ResourceID: fmt.Sprintf("%s/disk:sdb", resourceID),
|
|
}
|
|
m.activeAlerts["disk-sdc"] = &Alert{
|
|
ID: "disk-sdc",
|
|
ResourceID: fmt.Sprintf("%s/disk:sdc", resourceID),
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
// Only sda and sdb are in the seen set
|
|
seen := map[string]struct{}{
|
|
fmt.Sprintf("%s/disk:sda", resourceID): {},
|
|
fmt.Sprintf("%s/disk:sdb", resourceID): {},
|
|
}
|
|
|
|
m.cleanupHostDiskAlerts(host, seen)
|
|
|
|
m.mu.RLock()
|
|
_, sdaExists := m.activeAlerts["disk-sda"]
|
|
_, sdbExists := m.activeAlerts["disk-sdb"]
|
|
_, sdcExists := m.activeAlerts["disk-sdc"]
|
|
m.mu.RUnlock()
|
|
|
|
if !sdaExists {
|
|
t.Error("expected sda alert to remain (in seen set)")
|
|
}
|
|
if !sdbExists {
|
|
t.Error("expected sdb alert to remain (in seen set)")
|
|
}
|
|
if sdcExists {
|
|
t.Error("expected sdc alert to be cleared (not in seen set)")
|
|
}
|
|
})
|
|
|
|
t.Run("empty host ID is noop", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["disk-alert"] = &Alert{
|
|
ID: "disk-alert",
|
|
ResourceID: "host:unknown/disk:sda",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
host := models.Host{ID: ""}
|
|
m.cleanupHostDiskAlerts(host, nil)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["disk-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Error("expected alert to remain when host ID is empty")
|
|
}
|
|
})
|
|
|
|
t.Run("skips nil alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
host := models.Host{ID: "host-2"}
|
|
resourceID := fmt.Sprintf("host:%s", host.ID)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["nil-alert"] = nil
|
|
m.activeAlerts["real-alert"] = &Alert{
|
|
ID: "real-alert",
|
|
ResourceID: fmt.Sprintf("%s/disk:sda", resourceID),
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
seen := map[string]struct{}{} // Empty seen set
|
|
|
|
// Should not panic
|
|
m.cleanupHostDiskAlerts(host, seen)
|
|
|
|
m.mu.RLock()
|
|
_, realExists := m.activeAlerts["real-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if realExists {
|
|
t.Error("expected real alert to be cleared (not in seen set)")
|
|
}
|
|
})
|
|
|
|
t.Run("skips non-matching prefix", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
host := models.Host{ID: "host-3"}
|
|
resourceID := fmt.Sprintf("host:%s", host.ID)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["cpu-alert"] = &Alert{
|
|
ID: "cpu-alert",
|
|
ResourceID: fmt.Sprintf("%s-cpu", resourceID), // Not a disk alert
|
|
}
|
|
m.activeAlerts["disk-alert"] = &Alert{
|
|
ID: "disk-alert",
|
|
ResourceID: fmt.Sprintf("%s/disk:sda", resourceID),
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
seen := map[string]struct{}{} // Empty seen set
|
|
|
|
m.cleanupHostDiskAlerts(host, seen)
|
|
|
|
m.mu.RLock()
|
|
_, cpuExists := m.activeAlerts["cpu-alert"]
|
|
_, diskExists := m.activeAlerts["disk-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if !cpuExists {
|
|
t.Error("expected cpu alert to remain (not a disk alert)")
|
|
}
|
|
if diskExists {
|
|
t.Error("expected disk alert to be cleared")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestHandleDockerHostRemovedEmptyID(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Create some alerts that should not be touched
|
|
m.mu.Lock()
|
|
m.activeAlerts["docker-host-offline-host1"] = &Alert{ID: "docker-host-offline-host1"}
|
|
m.dockerOfflineCount["host1"] = 3
|
|
m.mu.Unlock()
|
|
|
|
// Call with empty ID - should be noop
|
|
host := models.DockerHost{ID: ""}
|
|
m.HandleDockerHostRemoved(host)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["docker-host-offline-host1"]
|
|
_, countExists := m.dockerOfflineCount["host1"]
|
|
m.mu.RUnlock()
|
|
|
|
if !alertExists {
|
|
t.Error("expected alert to remain when host ID is empty")
|
|
}
|
|
if !countExists {
|
|
t.Error("expected offline count to remain when host ID is empty")
|
|
}
|
|
}
|
|
|
|
func TestHandleDockerHostOnline(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("clears offline alert and tracking", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{ID: "docker-host-1", DisplayName: "My Host"}
|
|
alertID := fmt.Sprintf("docker-host-offline-%s", host.ID)
|
|
|
|
// Set up offline alert and tracking
|
|
m.mu.Lock()
|
|
m.activeAlerts[alertID] = &Alert{ID: alertID, ResourceID: fmt.Sprintf("docker:%s", host.ID)}
|
|
m.dockerOfflineCount[host.ID] = 5
|
|
m.mu.Unlock()
|
|
|
|
m.HandleDockerHostOnline(host)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts[alertID]
|
|
_, countExists := m.dockerOfflineCount[host.ID]
|
|
m.mu.RUnlock()
|
|
|
|
if alertExists {
|
|
t.Error("expected offline alert to be cleared")
|
|
}
|
|
if countExists {
|
|
t.Error("expected offline count to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("noop when no offline alert exists", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{ID: "docker-host-2"}
|
|
|
|
// Set up only tracking, no alert
|
|
m.mu.Lock()
|
|
m.dockerOfflineCount[host.ID] = 2
|
|
m.mu.Unlock()
|
|
|
|
m.HandleDockerHostOnline(host)
|
|
|
|
m.mu.RLock()
|
|
_, countExists := m.dockerOfflineCount[host.ID]
|
|
m.mu.RUnlock()
|
|
|
|
if countExists {
|
|
t.Error("expected offline count to be cleared even without alert")
|
|
}
|
|
})
|
|
|
|
t.Run("empty host ID is noop", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Create some data that should not be touched
|
|
m.mu.Lock()
|
|
m.activeAlerts["docker-host-offline-other"] = &Alert{ID: "docker-host-offline-other"}
|
|
m.dockerOfflineCount["other"] = 3
|
|
m.mu.Unlock()
|
|
|
|
host := models.DockerHost{ID: ""}
|
|
m.HandleDockerHostOnline(host)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["docker-host-offline-other"]
|
|
_, countExists := m.dockerOfflineCount["other"]
|
|
m.mu.RUnlock()
|
|
|
|
if !alertExists {
|
|
t.Error("expected other alert to remain when host ID is empty")
|
|
}
|
|
if !countExists {
|
|
t.Error("expected other count to remain when host ID is empty")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestCleanupDockerContainerAlerts(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("clears alerts not in seen set", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{ID: "docker-host-1"}
|
|
prefix := fmt.Sprintf("docker:%s/", host.ID)
|
|
|
|
// Create container alerts
|
|
m.mu.Lock()
|
|
m.activeAlerts["container1-alert"] = &Alert{
|
|
ID: "container1-alert",
|
|
ResourceID: prefix + "container1",
|
|
}
|
|
m.activeAlerts["container2-alert"] = &Alert{
|
|
ID: "container2-alert",
|
|
ResourceID: prefix + "container2",
|
|
}
|
|
m.activeAlerts["container3-alert"] = &Alert{
|
|
ID: "container3-alert",
|
|
ResourceID: prefix + "container3",
|
|
}
|
|
m.dockerStateConfirm[prefix+"container1"] = 2
|
|
m.dockerStateConfirm[prefix+"container2"] = 1
|
|
m.dockerStateConfirm[prefix+"container3"] = 3
|
|
m.mu.Unlock()
|
|
|
|
// Only container1 and container2 are in seen set
|
|
seen := map[string]struct{}{
|
|
prefix + "container1": {},
|
|
prefix + "container2": {},
|
|
}
|
|
|
|
m.cleanupDockerContainerAlerts(host, seen)
|
|
|
|
m.mu.RLock()
|
|
_, c1Exists := m.activeAlerts["container1-alert"]
|
|
_, c2Exists := m.activeAlerts["container2-alert"]
|
|
_, c3Exists := m.activeAlerts["container3-alert"]
|
|
_, s1Exists := m.dockerStateConfirm[prefix+"container1"]
|
|
_, s2Exists := m.dockerStateConfirm[prefix+"container2"]
|
|
_, s3Exists := m.dockerStateConfirm[prefix+"container3"]
|
|
m.mu.RUnlock()
|
|
|
|
if !c1Exists {
|
|
t.Error("expected container1 alert to remain (in seen set)")
|
|
}
|
|
if !c2Exists {
|
|
t.Error("expected container2 alert to remain (in seen set)")
|
|
}
|
|
if c3Exists {
|
|
t.Error("expected container3 alert to be cleared (not in seen set)")
|
|
}
|
|
if !s1Exists {
|
|
t.Error("expected container1 state confirm to remain (in seen set)")
|
|
}
|
|
if !s2Exists {
|
|
t.Error("expected container2 state confirm to remain (in seen set)")
|
|
}
|
|
if s3Exists {
|
|
t.Error("expected container3 state confirm to be cleared (not in seen set)")
|
|
}
|
|
})
|
|
|
|
t.Run("skips alerts from other hosts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{ID: "host-a"}
|
|
|
|
// Create alert for a different host
|
|
m.mu.Lock()
|
|
m.activeAlerts["other-host-alert"] = &Alert{
|
|
ID: "other-host-alert",
|
|
ResourceID: "docker:host-b/container1",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
seen := map[string]struct{}{} // Empty seen set
|
|
|
|
m.cleanupDockerContainerAlerts(host, seen)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["other-host-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Error("expected other host's alert to remain")
|
|
}
|
|
})
|
|
|
|
t.Run("handles empty seen set", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{ID: "host-c"}
|
|
prefix := fmt.Sprintf("docker:%s/", host.ID)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["to-clear"] = &Alert{
|
|
ID: "to-clear",
|
|
ResourceID: prefix + "container1",
|
|
}
|
|
m.dockerStateConfirm[prefix+"container1"] = 1
|
|
m.mu.Unlock()
|
|
|
|
m.cleanupDockerContainerAlerts(host, map[string]struct{}{})
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["to-clear"]
|
|
_, stateExists := m.dockerStateConfirm[prefix+"container1"]
|
|
m.mu.RUnlock()
|
|
|
|
if alertExists {
|
|
t.Error("expected alert to be cleared with empty seen set")
|
|
}
|
|
if stateExists {
|
|
t.Error("expected state confirm to be cleared with empty seen set")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestSafeCallEscalateCallback(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("calls callback with alert and level", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
var receivedAlert *Alert
|
|
var receivedLevel int
|
|
done := make(chan struct{})
|
|
|
|
m.SetEscalateCallback(func(alert *Alert, level int) {
|
|
receivedAlert = alert
|
|
receivedLevel = level
|
|
close(done)
|
|
})
|
|
|
|
alert := &Alert{
|
|
ID: "test-alert",
|
|
Type: "test",
|
|
ResourceName: "resource-1",
|
|
}
|
|
|
|
m.safeCallEscalateCallback(alert, 2)
|
|
|
|
select {
|
|
case <-done:
|
|
if receivedAlert == nil {
|
|
t.Fatal("expected alert to be received")
|
|
}
|
|
if receivedAlert.ID != "test-alert" {
|
|
t.Errorf("expected alert ID 'test-alert', got %q", receivedAlert.ID)
|
|
}
|
|
if receivedLevel != 2 {
|
|
t.Errorf("expected level 2, got %d", receivedLevel)
|
|
}
|
|
case <-time.After(1 * time.Second):
|
|
t.Fatal("callback not called within timeout")
|
|
}
|
|
})
|
|
|
|
t.Run("noop when callback is nil", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
// No callback set
|
|
|
|
alert := &Alert{ID: "test-alert"}
|
|
|
|
// Should not panic
|
|
m.safeCallEscalateCallback(alert, 1)
|
|
})
|
|
|
|
t.Run("recovers from panic in callback", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
done := make(chan struct{})
|
|
m.SetEscalateCallback(func(alert *Alert, level int) {
|
|
defer close(done)
|
|
panic("test panic")
|
|
})
|
|
|
|
alert := &Alert{ID: "panic-test"}
|
|
|
|
// Should not panic the caller
|
|
m.safeCallEscalateCallback(alert, 1)
|
|
|
|
select {
|
|
case <-done:
|
|
// Callback ran (and panicked, but recovered)
|
|
case <-time.After(1 * time.Second):
|
|
t.Fatal("callback not called within timeout")
|
|
}
|
|
})
|
|
|
|
t.Run("clones alert to prevent modification", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
var receivedAlert *Alert
|
|
done := make(chan struct{})
|
|
|
|
m.SetEscalateCallback(func(alert *Alert, level int) {
|
|
receivedAlert = alert
|
|
close(done)
|
|
})
|
|
|
|
original := &Alert{
|
|
ID: "original-alert",
|
|
ResourceName: "original-resource",
|
|
}
|
|
|
|
m.safeCallEscalateCallback(original, 1)
|
|
|
|
select {
|
|
case <-done:
|
|
// Modify original after callback started
|
|
original.ResourceName = "modified"
|
|
|
|
// Received alert should be a clone, not affected by modification
|
|
if receivedAlert.ID != "original-alert" {
|
|
t.Errorf("expected cloned alert ID")
|
|
}
|
|
case <-time.After(1 * time.Second):
|
|
t.Fatal("callback not called within timeout")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestSafeCallResolvedCallback(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("calls callback with alert ID synchronously", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
var receivedID string
|
|
m.SetResolvedCallback(func(alertID string) {
|
|
receivedID = alertID
|
|
})
|
|
|
|
m.safeCallResolvedCallback("test-alert-123", false)
|
|
|
|
if receivedID != "test-alert-123" {
|
|
t.Errorf("expected alert ID 'test-alert-123', got %q", receivedID)
|
|
}
|
|
})
|
|
|
|
t.Run("calls callback asynchronously", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
var receivedID string
|
|
done := make(chan struct{})
|
|
|
|
m.SetResolvedCallback(func(alertID string) {
|
|
receivedID = alertID
|
|
close(done)
|
|
})
|
|
|
|
m.safeCallResolvedCallback("async-alert", true)
|
|
|
|
select {
|
|
case <-done:
|
|
if receivedID != "async-alert" {
|
|
t.Errorf("expected alert ID 'async-alert', got %q", receivedID)
|
|
}
|
|
case <-time.After(1 * time.Second):
|
|
t.Fatal("async callback not called within timeout")
|
|
}
|
|
})
|
|
|
|
t.Run("noop when callback is nil", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
// No callback set
|
|
|
|
// Should not panic
|
|
m.safeCallResolvedCallback("test-alert", false)
|
|
m.safeCallResolvedCallback("test-alert", true)
|
|
})
|
|
|
|
t.Run("recovers from panic in sync callback", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.SetResolvedCallback(func(alertID string) {
|
|
panic("test panic")
|
|
})
|
|
|
|
// Should not panic the caller
|
|
m.safeCallResolvedCallback("panic-test", false)
|
|
})
|
|
|
|
t.Run("recovers from panic in async callback", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
done := make(chan struct{})
|
|
m.SetResolvedCallback(func(alertID string) {
|
|
defer close(done)
|
|
panic("async panic")
|
|
})
|
|
|
|
m.safeCallResolvedCallback("async-panic", true)
|
|
|
|
select {
|
|
case <-done:
|
|
// Callback ran (and panicked, but recovered)
|
|
case <-time.After(1 * time.Second):
|
|
t.Fatal("async callback not called within timeout")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestHandleHostOnline(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("clears offline alert and confirmation tracking", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
host := models.Host{ID: "host-1", Hostname: "my-host"}
|
|
alertID := fmt.Sprintf("host-offline-%s", host.ID)
|
|
resourceKey := fmt.Sprintf("host:%s", host.ID)
|
|
|
|
// Set up offline alert and tracking
|
|
m.mu.Lock()
|
|
m.activeAlerts[alertID] = &Alert{ID: alertID, ResourceID: resourceKey}
|
|
m.offlineConfirmations[resourceKey] = 5
|
|
m.mu.Unlock()
|
|
|
|
m.HandleHostOnline(host)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts[alertID]
|
|
_, confirmExists := m.offlineConfirmations[resourceKey]
|
|
m.mu.RUnlock()
|
|
|
|
if alertExists {
|
|
t.Error("expected offline alert to be cleared")
|
|
}
|
|
if confirmExists {
|
|
t.Error("expected offline confirmation to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("clears confirmation even without alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
host := models.Host{ID: "host-2"}
|
|
resourceKey := fmt.Sprintf("host:%s", host.ID)
|
|
|
|
// Only tracking, no alert
|
|
m.mu.Lock()
|
|
m.offlineConfirmations[resourceKey] = 2
|
|
m.mu.Unlock()
|
|
|
|
m.HandleHostOnline(host)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.offlineConfirmations[resourceKey]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected offline confirmation to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("empty host ID is noop", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Create data that should not be touched
|
|
m.mu.Lock()
|
|
m.activeAlerts["host-offline-other"] = &Alert{ID: "host-offline-other"}
|
|
m.offlineConfirmations["host:other"] = 3
|
|
m.mu.Unlock()
|
|
|
|
host := models.Host{ID: ""}
|
|
m.HandleHostOnline(host)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["host-offline-other"]
|
|
_, confirmExists := m.offlineConfirmations["host:other"]
|
|
m.mu.RUnlock()
|
|
|
|
if !alertExists {
|
|
t.Error("expected other alert to remain when host ID is empty")
|
|
}
|
|
if !confirmExists {
|
|
t.Error("expected other confirmation to remain when host ID is empty")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestAcknowledgeAlertNotFound(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
err := m.AcknowledgeAlert("nonexistent-alert", "user1")
|
|
|
|
if err == nil {
|
|
t.Fatal("expected error when acknowledging nonexistent alert")
|
|
}
|
|
if !strings.Contains(err.Error(), "alert not found") {
|
|
t.Errorf("expected 'alert not found' error, got: %v", err)
|
|
}
|
|
}
|
|
|
|
func TestUnacknowledgeAlertNotFound(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
err := m.UnacknowledgeAlert("nonexistent-alert")
|
|
|
|
if err == nil {
|
|
t.Fatal("expected error when unacknowledging nonexistent alert")
|
|
}
|
|
if !strings.Contains(err.Error(), "alert not found") {
|
|
t.Errorf("expected 'alert not found' error, got: %v", err)
|
|
}
|
|
}
|
|
|
|
func TestUnacknowledgeAlertSuccess(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Create and acknowledge an alert first
|
|
alertID := "test-alert-123"
|
|
now := time.Now()
|
|
m.activeAlerts[alertID] = &Alert{
|
|
ID: alertID,
|
|
Acknowledged: true,
|
|
AckTime: &now,
|
|
AckUser: "user1",
|
|
}
|
|
m.ackState[alertID] = ackRecord{acknowledged: true, user: "user1", time: now}
|
|
|
|
// Unacknowledge the alert
|
|
err := m.UnacknowledgeAlert(alertID)
|
|
|
|
if err != nil {
|
|
t.Fatalf("unexpected error unacknowledging alert: %v", err)
|
|
}
|
|
|
|
// Verify alert state was updated
|
|
alert := m.activeAlerts[alertID]
|
|
if alert.Acknowledged {
|
|
t.Error("expected Acknowledged to be false")
|
|
}
|
|
if alert.AckTime != nil {
|
|
t.Error("expected AckTime to be nil")
|
|
}
|
|
if alert.AckUser != "" {
|
|
t.Errorf("expected AckUser to be empty, got: %s", alert.AckUser)
|
|
}
|
|
|
|
// Verify ackState was removed
|
|
if _, exists := m.ackState[alertID]; exists {
|
|
t.Error("expected ackState entry to be deleted")
|
|
}
|
|
}
|
|
|
|
func TestClearActiveAlertsEmptyMaps(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Ensure maps are empty initially
|
|
if len(m.activeAlerts) != 0 {
|
|
t.Fatalf("expected activeAlerts to be empty, got %d", len(m.activeAlerts))
|
|
}
|
|
if len(m.pendingAlerts) != 0 {
|
|
t.Fatalf("expected pendingAlerts to be empty, got %d", len(m.pendingAlerts))
|
|
}
|
|
|
|
// Call ClearActiveAlerts on empty manager - should return early without panic
|
|
m.ClearActiveAlerts()
|
|
|
|
// Verify maps are still empty (function returned early)
|
|
if len(m.activeAlerts) != 0 {
|
|
t.Errorf("expected activeAlerts to remain empty, got %d", len(m.activeAlerts))
|
|
}
|
|
}
|
|
|
|
func TestClearActiveAlertsWithExistingAlerts(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Populate various maps with test data
|
|
m.mu.Lock()
|
|
m.activeAlerts["test-alert-1"] = &Alert{ID: "test-alert-1", Type: "cpu-usage"}
|
|
m.activeAlerts["test-alert-2"] = &Alert{ID: "test-alert-2", Type: "memory-usage"}
|
|
m.pendingAlerts["pending-1"] = time.Now()
|
|
m.recentAlerts["recent-1"] = &Alert{ID: "recent-1", Type: "disk-usage"}
|
|
m.suppressedUntil["suppressed-1"] = time.Now().Add(time.Hour)
|
|
m.alertRateLimit["rate-1"] = []time.Time{time.Now()}
|
|
m.nodeOfflineCount["node-1"] = 3
|
|
m.offlineConfirmations["node-1"] = 2
|
|
m.dockerOfflineCount["docker-1"] = 1
|
|
m.dockerStateConfirm["docker-1"] = 1
|
|
m.ackState["test-alert-1"] = ackRecord{acknowledged: true, user: "testuser", time: time.Now()}
|
|
m.mu.Unlock()
|
|
|
|
m.resolvedMutex.Lock()
|
|
m.recentlyResolved["resolved-1"] = &ResolvedAlert{Alert: &Alert{ID: "resolved-1"}, ResolvedTime: time.Now()}
|
|
m.resolvedMutex.Unlock()
|
|
|
|
// Call ClearActiveAlerts
|
|
m.ClearActiveAlerts()
|
|
|
|
// Give goroutine time to run SaveActiveAlerts
|
|
time.Sleep(50 * time.Millisecond)
|
|
|
|
// Verify all maps are cleared
|
|
m.mu.RLock()
|
|
if len(m.activeAlerts) != 0 {
|
|
t.Errorf("expected activeAlerts to be empty, got %d", len(m.activeAlerts))
|
|
}
|
|
if len(m.pendingAlerts) != 0 {
|
|
t.Errorf("expected pendingAlerts to be empty, got %d", len(m.pendingAlerts))
|
|
}
|
|
if len(m.recentAlerts) != 0 {
|
|
t.Errorf("expected recentAlerts to be empty, got %d", len(m.recentAlerts))
|
|
}
|
|
if len(m.suppressedUntil) != 0 {
|
|
t.Errorf("expected suppressedUntil to be empty, got %d", len(m.suppressedUntil))
|
|
}
|
|
if len(m.alertRateLimit) != 0 {
|
|
t.Errorf("expected alertRateLimit to be empty, got %d", len(m.alertRateLimit))
|
|
}
|
|
if len(m.nodeOfflineCount) != 0 {
|
|
t.Errorf("expected nodeOfflineCount to be empty, got %d", len(m.nodeOfflineCount))
|
|
}
|
|
if len(m.offlineConfirmations) != 0 {
|
|
t.Errorf("expected offlineConfirmations to be empty, got %d", len(m.offlineConfirmations))
|
|
}
|
|
if len(m.dockerOfflineCount) != 0 {
|
|
t.Errorf("expected dockerOfflineCount to be empty, got %d", len(m.dockerOfflineCount))
|
|
}
|
|
if len(m.dockerStateConfirm) != 0 {
|
|
t.Errorf("expected dockerStateConfirm to be empty, got %d", len(m.dockerStateConfirm))
|
|
}
|
|
if len(m.ackState) != 0 {
|
|
t.Errorf("expected ackState to be empty, got %d", len(m.ackState))
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
m.resolvedMutex.RLock()
|
|
if len(m.recentlyResolved) != 0 {
|
|
t.Errorf("expected recentlyResolved to be empty, got %d", len(m.recentlyResolved))
|
|
}
|
|
m.resolvedMutex.RUnlock()
|
|
}
|
|
|
|
func TestClearBackupAlertsLocked(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("clears backup-age and backup-orphaned alerts only", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add a backup-age alert
|
|
m.activeAlerts["backup-alert-1"] = &Alert{
|
|
ID: "backup-alert-1",
|
|
Type: "backup-age",
|
|
}
|
|
// Add a non-backup alert
|
|
m.activeAlerts["cpu-alert-1"] = &Alert{
|
|
ID: "cpu-alert-1",
|
|
Type: "cpu",
|
|
}
|
|
// Add another backup-age alert
|
|
m.activeAlerts["backup-alert-2"] = &Alert{
|
|
ID: "backup-alert-2",
|
|
Type: "backup-age",
|
|
}
|
|
// Add a backup-orphaned alert
|
|
m.activeAlerts["backup-orphaned-1"] = &Alert{
|
|
ID: "backup-orphaned-1",
|
|
Type: "backup-orphaned",
|
|
}
|
|
|
|
if len(m.activeAlerts) != 4 {
|
|
t.Fatalf("expected 4 alerts, got %d", len(m.activeAlerts))
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.clearBackupAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Should have removed backup-age and backup-orphaned alerts, keeping cpu alert
|
|
if len(m.activeAlerts) != 1 {
|
|
t.Errorf("expected 1 alert remaining, got %d", len(m.activeAlerts))
|
|
}
|
|
if _, exists := m.activeAlerts["cpu-alert-1"]; !exists {
|
|
t.Error("expected cpu-alert-1 to remain")
|
|
}
|
|
if _, exists := m.activeAlerts["backup-alert-1"]; exists {
|
|
t.Error("expected backup-alert-1 to be cleared")
|
|
}
|
|
if _, exists := m.activeAlerts["backup-alert-2"]; exists {
|
|
t.Error("expected backup-alert-2 to be cleared")
|
|
}
|
|
if _, exists := m.activeAlerts["backup-orphaned-1"]; exists {
|
|
t.Error("expected backup-orphaned-1 to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("handles nil alert in map", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add a nil alert entry
|
|
m.activeAlerts["nil-alert"] = nil
|
|
// Add a valid backup-age alert
|
|
m.activeAlerts["backup-alert"] = &Alert{
|
|
ID: "backup-alert",
|
|
Type: "backup-age",
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.clearBackupAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Should have skipped nil and removed backup-age
|
|
if len(m.activeAlerts) != 1 {
|
|
t.Errorf("expected 1 alert remaining, got %d", len(m.activeAlerts))
|
|
}
|
|
// Nil entry should remain
|
|
if _, exists := m.activeAlerts["nil-alert"]; !exists {
|
|
t.Error("expected nil-alert entry to remain (nil check should skip it)")
|
|
}
|
|
})
|
|
|
|
t.Run("empty alerts map is no-op", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.clearBackupAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
if len(m.activeAlerts) != 0 {
|
|
t.Errorf("expected 0 alerts, got %d", len(m.activeAlerts))
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestClearBackupAlerts(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add a backup-age alert
|
|
m.activeAlerts["backup-alert"] = &Alert{
|
|
ID: "backup-alert",
|
|
Type: "backup-age",
|
|
}
|
|
// Add a non-backup alert
|
|
m.activeAlerts["cpu-alert"] = &Alert{
|
|
ID: "cpu-alert",
|
|
Type: "cpu",
|
|
}
|
|
|
|
// Call the public method (handles locking internally)
|
|
m.clearBackupAlerts()
|
|
|
|
// Only cpu alert should remain
|
|
if len(m.activeAlerts) != 1 {
|
|
t.Errorf("expected 1 alert remaining, got %d", len(m.activeAlerts))
|
|
}
|
|
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
|
|
t.Error("expected cpu-alert to remain")
|
|
}
|
|
}
|
|
|
|
func TestClearSnapshotAlertsForInstanceLocked(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("clears snapshot alerts for specific instance", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add snapshot alerts for different instances
|
|
m.activeAlerts["snap-inst1"] = &Alert{
|
|
ID: "snap-inst1",
|
|
Type: "snapshot-age",
|
|
Instance: "instance1",
|
|
}
|
|
m.activeAlerts["snap-inst2"] = &Alert{
|
|
ID: "snap-inst2",
|
|
Type: "snapshot-age",
|
|
Instance: "instance2",
|
|
}
|
|
// Add a non-snapshot alert
|
|
m.activeAlerts["cpu-alert"] = &Alert{
|
|
ID: "cpu-alert",
|
|
Type: "cpu",
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.clearSnapshotAlertsForInstanceLocked("instance1")
|
|
m.mu.Unlock()
|
|
|
|
// Should keep instance2 snapshot and cpu alert
|
|
if len(m.activeAlerts) != 2 {
|
|
t.Errorf("expected 2 alerts remaining, got %d", len(m.activeAlerts))
|
|
}
|
|
if _, exists := m.activeAlerts["snap-inst1"]; exists {
|
|
t.Error("expected snap-inst1 to be cleared")
|
|
}
|
|
if _, exists := m.activeAlerts["snap-inst2"]; !exists {
|
|
t.Error("expected snap-inst2 to remain")
|
|
}
|
|
})
|
|
|
|
t.Run("clears all snapshot alerts when instance is empty", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add snapshot alerts for different instances
|
|
m.activeAlerts["snap-inst1"] = &Alert{
|
|
ID: "snap-inst1",
|
|
Type: "snapshot-age",
|
|
Instance: "instance1",
|
|
}
|
|
m.activeAlerts["snap-inst2"] = &Alert{
|
|
ID: "snap-inst2",
|
|
Type: "snapshot-age",
|
|
Instance: "instance2",
|
|
}
|
|
// Add a non-snapshot alert
|
|
m.activeAlerts["cpu-alert"] = &Alert{
|
|
ID: "cpu-alert",
|
|
Type: "cpu",
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.clearSnapshotAlertsForInstanceLocked("")
|
|
m.mu.Unlock()
|
|
|
|
// Should keep only cpu alert
|
|
if len(m.activeAlerts) != 1 {
|
|
t.Errorf("expected 1 alert remaining, got %d", len(m.activeAlerts))
|
|
}
|
|
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
|
|
t.Error("expected cpu-alert to remain")
|
|
}
|
|
})
|
|
|
|
t.Run("handles nil alert in map", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add nil entry and valid snapshot alert
|
|
m.activeAlerts["nil-alert"] = nil
|
|
m.activeAlerts["snap-alert"] = &Alert{
|
|
ID: "snap-alert",
|
|
Type: "snapshot-age",
|
|
Instance: "inst1",
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.clearSnapshotAlertsForInstanceLocked("inst1")
|
|
m.mu.Unlock()
|
|
|
|
// Nil entry should remain, snapshot should be cleared
|
|
if len(m.activeAlerts) != 1 {
|
|
t.Errorf("expected 1 alert remaining, got %d", len(m.activeAlerts))
|
|
}
|
|
if _, exists := m.activeAlerts["nil-alert"]; !exists {
|
|
t.Error("expected nil-alert entry to remain")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestClearSnapshotAlertsForInstance(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add a snapshot alert
|
|
m.activeAlerts["snap-alert"] = &Alert{
|
|
ID: "snap-alert",
|
|
Type: "snapshot-age",
|
|
Instance: "instance1",
|
|
}
|
|
|
|
// Call the public method (handles locking internally)
|
|
m.clearSnapshotAlertsForInstance("instance1")
|
|
|
|
if len(m.activeAlerts) != 0 {
|
|
t.Errorf("expected 0 alerts remaining, got %d", len(m.activeAlerts))
|
|
}
|
|
}
|
|
|
|
func TestApplyGlobalOfflineSettingsLocked(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("DisableAllNodesOffline clears node offline alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add node offline alerts
|
|
m.activeAlerts["node-offline-node1"] = &Alert{ID: "node-offline-node1", Type: "offline"}
|
|
m.activeAlerts["node-offline-node2"] = &Alert{ID: "node-offline-node2", Type: "offline"}
|
|
// Add non-node alert
|
|
m.activeAlerts["cpu-alert"] = &Alert{ID: "cpu-alert", Type: "cpu"}
|
|
// Add to nodeOfflineCount
|
|
m.nodeOfflineCount["node1"] = 3
|
|
m.nodeOfflineCount["node2"] = 2
|
|
|
|
m.config.DisableAllNodesOffline = true
|
|
|
|
m.mu.Lock()
|
|
m.applyGlobalOfflineSettingsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Node alerts should be cleared
|
|
if _, exists := m.activeAlerts["node-offline-node1"]; exists {
|
|
t.Error("expected node-offline-node1 to be cleared")
|
|
}
|
|
if _, exists := m.activeAlerts["node-offline-node2"]; exists {
|
|
t.Error("expected node-offline-node2 to be cleared")
|
|
}
|
|
// Non-node alert should remain
|
|
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
|
|
t.Error("expected cpu-alert to remain")
|
|
}
|
|
// nodeOfflineCount should be reset
|
|
if len(m.nodeOfflineCount) != 0 {
|
|
t.Errorf("expected nodeOfflineCount to be empty, got %d entries", len(m.nodeOfflineCount))
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllPBSOffline clears PBS offline alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add PBS offline alerts
|
|
m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1", ResourceID: "pbs1", Type: "offline"}
|
|
// Add non-PBS alert
|
|
m.activeAlerts["cpu-alert"] = &Alert{ID: "cpu-alert", Type: "cpu"}
|
|
// Add to offlineConfirmations
|
|
m.offlineConfirmations["pbs1"] = 3
|
|
|
|
m.config.DisableAllPBSOffline = true
|
|
|
|
m.mu.Lock()
|
|
m.applyGlobalOfflineSettingsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// PBS alert should be cleared
|
|
if _, exists := m.activeAlerts["pbs-offline-pbs1"]; exists {
|
|
t.Error("expected pbs-offline-pbs1 to be cleared")
|
|
}
|
|
// Non-PBS alert should remain
|
|
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
|
|
t.Error("expected cpu-alert to remain")
|
|
}
|
|
// offlineConfirmations for PBS should be removed
|
|
if _, exists := m.offlineConfirmations["pbs1"]; exists {
|
|
t.Error("expected offlineConfirmations for pbs1 to be removed")
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllGuestsOffline clears guest powered off alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add guest powered off alerts
|
|
m.activeAlerts["guest-powered-off-vm1"] = &Alert{ID: "guest-powered-off-vm1", ResourceID: "vm1", Type: "powered-off"}
|
|
// Add non-guest alert
|
|
m.activeAlerts["cpu-alert"] = &Alert{ID: "cpu-alert", Type: "cpu"}
|
|
// Add to offlineConfirmations
|
|
m.offlineConfirmations["vm1"] = 2
|
|
|
|
m.config.DisableAllGuestsOffline = true
|
|
|
|
m.mu.Lock()
|
|
m.applyGlobalOfflineSettingsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Guest alert should be cleared
|
|
if _, exists := m.activeAlerts["guest-powered-off-vm1"]; exists {
|
|
t.Error("expected guest-powered-off-vm1 to be cleared")
|
|
}
|
|
// Non-guest alert should remain
|
|
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
|
|
t.Error("expected cpu-alert to remain")
|
|
}
|
|
// offlineConfirmations for guest should be removed
|
|
if _, exists := m.offlineConfirmations["vm1"]; exists {
|
|
t.Error("expected offlineConfirmations for vm1 to be removed")
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllDockerHostsOffline clears docker host alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add docker host offline alerts
|
|
m.activeAlerts["docker-host-offline-host1"] = &Alert{ID: "docker-host-offline-host1", Type: "offline"}
|
|
// Add non-docker host alert
|
|
m.activeAlerts["cpu-alert"] = &Alert{ID: "cpu-alert", Type: "cpu"}
|
|
// Add to dockerOfflineCount
|
|
m.dockerOfflineCount["host1"] = 3
|
|
|
|
m.config.DisableAllDockerHostsOffline = true
|
|
|
|
m.mu.Lock()
|
|
m.applyGlobalOfflineSettingsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Docker host alert should be cleared
|
|
if _, exists := m.activeAlerts["docker-host-offline-host1"]; exists {
|
|
t.Error("expected docker-host-offline-host1 to be cleared")
|
|
}
|
|
// Non-docker host alert should remain
|
|
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
|
|
t.Error("expected cpu-alert to remain")
|
|
}
|
|
// dockerOfflineCount should be reset
|
|
if len(m.dockerOfflineCount) != 0 {
|
|
t.Errorf("expected dockerOfflineCount to be empty, got %d entries", len(m.dockerOfflineCount))
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllDockerContainers clears docker container alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add docker container alerts
|
|
m.activeAlerts["docker-container-unhealthy-c1"] = &Alert{ID: "docker-container-unhealthy-c1", Type: "unhealthy"}
|
|
m.activeAlerts["docker-container-exited-c2"] = &Alert{ID: "docker-container-exited-c2", Type: "exited"}
|
|
// Add non-container alert
|
|
m.activeAlerts["cpu-alert"] = &Alert{ID: "cpu-alert", Type: "cpu"}
|
|
// Add tracking state
|
|
m.dockerStateConfirm["c1"] = 2
|
|
m.dockerRestartTracking["c1"] = &dockerRestartRecord{count: 5}
|
|
m.dockerLastExitCode["c1"] = 137
|
|
|
|
m.config.DisableAllDockerContainers = true
|
|
|
|
m.mu.Lock()
|
|
m.applyGlobalOfflineSettingsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Docker container alerts should be cleared
|
|
if _, exists := m.activeAlerts["docker-container-unhealthy-c1"]; exists {
|
|
t.Error("expected docker-container-unhealthy-c1 to be cleared")
|
|
}
|
|
if _, exists := m.activeAlerts["docker-container-exited-c2"]; exists {
|
|
t.Error("expected docker-container-exited-c2 to be cleared")
|
|
}
|
|
// Non-container alert should remain
|
|
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
|
|
t.Error("expected cpu-alert to remain")
|
|
}
|
|
// Tracking state should be reset
|
|
if len(m.dockerStateConfirm) != 0 {
|
|
t.Errorf("expected dockerStateConfirm to be empty, got %d entries", len(m.dockerStateConfirm))
|
|
}
|
|
if len(m.dockerRestartTracking) != 0 {
|
|
t.Errorf("expected dockerRestartTracking to be empty, got %d entries", len(m.dockerRestartTracking))
|
|
}
|
|
if len(m.dockerLastExitCode) != 0 {
|
|
t.Errorf("expected dockerLastExitCode to be empty, got %d entries", len(m.dockerLastExitCode))
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllDockerServices clears docker service alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add docker service alerts
|
|
m.activeAlerts["docker-service-unhealthy-svc1"] = &Alert{ID: "docker-service-unhealthy-svc1", Type: "unhealthy"}
|
|
// Add non-service alert
|
|
m.activeAlerts["cpu-alert"] = &Alert{ID: "cpu-alert", Type: "cpu"}
|
|
|
|
m.config.DisableAllDockerServices = true
|
|
|
|
m.mu.Lock()
|
|
m.applyGlobalOfflineSettingsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Docker service alert should be cleared
|
|
if _, exists := m.activeAlerts["docker-service-unhealthy-svc1"]; exists {
|
|
t.Error("expected docker-service-unhealthy-svc1 to be cleared")
|
|
}
|
|
// Non-service alert should remain
|
|
if _, exists := m.activeAlerts["cpu-alert"]; !exists {
|
|
t.Error("expected cpu-alert to remain")
|
|
}
|
|
})
|
|
|
|
t.Run("no settings enabled does nothing", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add various alerts
|
|
m.activeAlerts["node-offline-node1"] = &Alert{ID: "node-offline-node1", Type: "offline"}
|
|
m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1", Type: "offline"}
|
|
m.activeAlerts["docker-container-unhealthy-c1"] = &Alert{ID: "docker-container-unhealthy-c1", Type: "unhealthy"}
|
|
|
|
// All disable settings are false by default
|
|
|
|
m.mu.Lock()
|
|
m.applyGlobalOfflineSettingsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// All alerts should remain
|
|
if len(m.activeAlerts) != 3 {
|
|
t.Errorf("expected 3 alerts to remain, got %d", len(m.activeAlerts))
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestHandleHostOffline(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("empty host ID returns early", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.config.Enabled = true
|
|
|
|
host := models.Host{ID: "", Hostname: "test-host"}
|
|
m.HandleHostOffline(host)
|
|
|
|
// No alert should be created
|
|
if len(m.activeAlerts) != 0 {
|
|
t.Errorf("expected 0 alerts, got %d", len(m.activeAlerts))
|
|
}
|
|
})
|
|
|
|
t.Run("alerts disabled returns early", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.config.Enabled = false
|
|
|
|
host := models.Host{ID: "host1", Hostname: "test-host"}
|
|
m.HandleHostOffline(host)
|
|
|
|
// No alert should be created
|
|
if len(m.activeAlerts) != 0 {
|
|
t.Errorf("expected 0 alerts, got %d", len(m.activeAlerts))
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllHostsOffline clears alert and returns", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.config.Enabled = true
|
|
m.config.DisableAllHostsOffline = true
|
|
|
|
// Pre-create an alert and confirmation
|
|
alertID := "host-offline-host1"
|
|
m.activeAlerts[alertID] = &Alert{ID: alertID, Type: "host-offline"}
|
|
m.offlineConfirmations["host:host1"] = 5
|
|
|
|
host := models.Host{ID: "host1", Hostname: "test-host"}
|
|
m.HandleHostOffline(host)
|
|
|
|
// Alert should be cleared and confirmations removed
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Error("expected alert to be cleared")
|
|
}
|
|
if _, exists := m.offlineConfirmations["host:host1"]; exists {
|
|
t.Error("expected offlineConfirmations to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("override DisableConnectivity clears alert and returns", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.config.Enabled = true
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"host1": {DisableConnectivity: true},
|
|
}
|
|
|
|
// Pre-create an alert and confirmation
|
|
alertID := "host-offline-host1"
|
|
m.activeAlerts[alertID] = &Alert{ID: alertID, Type: "host-offline"}
|
|
m.offlineConfirmations["host:host1"] = 5
|
|
|
|
host := models.Host{ID: "host1", Hostname: "test-host"}
|
|
m.HandleHostOffline(host)
|
|
|
|
// Alert should be cleared and confirmations removed
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Error("expected alert to be cleared")
|
|
}
|
|
if _, exists := m.offlineConfirmations["host:host1"]; exists {
|
|
t.Error("expected offlineConfirmations to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("override Disabled clears alert and returns", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.config.Enabled = true
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"host1": {Disabled: true},
|
|
}
|
|
|
|
host := models.Host{ID: "host1", Hostname: "test-host"}
|
|
m.HandleHostOffline(host)
|
|
|
|
// No alert should be created
|
|
if len(m.activeAlerts) != 0 {
|
|
t.Errorf("expected 0 alerts, got %d", len(m.activeAlerts))
|
|
}
|
|
})
|
|
|
|
t.Run("existing alert updates LastSeen", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.config.Enabled = true
|
|
|
|
alertID := "host-offline-host1"
|
|
oldTime := time.Now().Add(-1 * time.Hour)
|
|
m.activeAlerts[alertID] = &Alert{ID: alertID, Type: "host-offline", LastSeen: oldTime}
|
|
|
|
host := models.Host{ID: "host1", Hostname: "test-host"}
|
|
m.HandleHostOffline(host)
|
|
|
|
// LastSeen should be updated
|
|
alert := m.activeAlerts[alertID]
|
|
if alert.LastSeen.Before(time.Now().Add(-1 * time.Minute)) {
|
|
t.Errorf("expected LastSeen to be updated to recent time, got %v", alert.LastSeen)
|
|
}
|
|
})
|
|
|
|
t.Run("insufficient confirmations waits", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.config.Enabled = true
|
|
|
|
host := models.Host{ID: "host1", Hostname: "test-host"}
|
|
|
|
// First two calls should not create alert
|
|
m.HandleHostOffline(host)
|
|
if len(m.activeAlerts) != 0 {
|
|
t.Errorf("expected 0 alerts after 1st call, got %d", len(m.activeAlerts))
|
|
}
|
|
if m.offlineConfirmations["host:host1"] != 1 {
|
|
t.Errorf("expected 1 confirmation, got %d", m.offlineConfirmations["host:host1"])
|
|
}
|
|
|
|
m.HandleHostOffline(host)
|
|
if len(m.activeAlerts) != 0 {
|
|
t.Errorf("expected 0 alerts after 2nd call, got %d", len(m.activeAlerts))
|
|
}
|
|
if m.offlineConfirmations["host:host1"] != 2 {
|
|
t.Errorf("expected 2 confirmations, got %d", m.offlineConfirmations["host:host1"])
|
|
}
|
|
})
|
|
|
|
t.Run("sufficient confirmations creates alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.config.Enabled = true
|
|
|
|
host := models.Host{
|
|
ID: "host1",
|
|
Hostname: "test-host",
|
|
DisplayName: "Test Host",
|
|
Platform: "linux",
|
|
OSName: "Ubuntu",
|
|
OSVersion: "22.04",
|
|
}
|
|
|
|
// Make 3 calls to reach required confirmations
|
|
m.HandleHostOffline(host)
|
|
m.HandleHostOffline(host)
|
|
m.HandleHostOffline(host)
|
|
|
|
// Alert should now be created
|
|
alertID := "host-offline-host1"
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatal("expected alert to be created after 3 confirmations")
|
|
}
|
|
if alert.Type != "host-offline" {
|
|
t.Errorf("expected type 'host-offline', got '%s'", alert.Type)
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected level Critical, got '%s'", alert.Level)
|
|
}
|
|
if alert.ResourceName == "" {
|
|
t.Error("expected ResourceName to be set")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestReevaluateActiveAlertsLocked(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("empty alerts map is no-op", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.reevaluateActiveAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
if len(m.activeAlerts) != 0 {
|
|
t.Errorf("expected 0 alerts, got %d", len(m.activeAlerts))
|
|
}
|
|
})
|
|
|
|
t.Run("alert with insufficient ID parts is skipped", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Alert ID without dash separator
|
|
m.activeAlerts["singlepart"] = &Alert{ID: "singlepart", Type: "cpu", Value: 90}
|
|
|
|
m.mu.Lock()
|
|
m.reevaluateActiveAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Alert should remain (skipped due to ID format)
|
|
if _, exists := m.activeAlerts["singlepart"]; !exists {
|
|
t.Error("expected singlepart alert to remain")
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllPMG resolves PMG queue alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add PMG queue alert
|
|
m.activeAlerts["pmg-queue-cpu"] = &Alert{
|
|
ID: "pmg-queue-cpu",
|
|
Type: "queue-depth",
|
|
}
|
|
|
|
m.config.DisableAllPMG = true
|
|
|
|
m.mu.Lock()
|
|
m.reevaluateActiveAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// PMG alert should be resolved
|
|
if _, exists := m.activeAlerts["pmg-queue-cpu"]; exists {
|
|
t.Error("expected PMG alert to be resolved")
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllHosts resolves Host alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add host alert with resourceType metadata
|
|
m.activeAlerts["host-1-cpu"] = &Alert{
|
|
ID: "host-1-cpu",
|
|
Type: "cpu",
|
|
Value: 90,
|
|
Metadata: map[string]interface{}{
|
|
"resourceType": "Host",
|
|
},
|
|
}
|
|
|
|
m.config.DisableAllHosts = true
|
|
|
|
m.mu.Lock()
|
|
m.reevaluateActiveAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Host alert should be resolved
|
|
if _, exists := m.activeAlerts["host-1-cpu"]; exists {
|
|
t.Error("expected Host alert to be resolved")
|
|
}
|
|
})
|
|
|
|
t.Run("Docker host offline alerts are skipped", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add docker host offline alert
|
|
m.activeAlerts["docker-host-1-offline"] = &Alert{
|
|
ID: "docker-host-1-offline",
|
|
Type: "docker-host-offline",
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.reevaluateActiveAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Docker host offline alert should remain (skipped)
|
|
if _, exists := m.activeAlerts["docker-host-1-offline"]; !exists {
|
|
t.Error("expected docker-host-offline alert to remain")
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllDockerHosts resolves dockerhost alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add dockerhost metric alert
|
|
m.activeAlerts["dockerhost-1-cpu"] = &Alert{
|
|
ID: "dockerhost-1-cpu",
|
|
Type: "cpu",
|
|
Value: 90,
|
|
Metadata: map[string]interface{}{
|
|
"resourceType": "dockerhost",
|
|
},
|
|
}
|
|
|
|
m.config.DisableAllDockerHosts = true
|
|
|
|
m.mu.Lock()
|
|
m.reevaluateActiveAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Dockerhost alert should be resolved
|
|
if _, exists := m.activeAlerts["dockerhost-1-cpu"]; exists {
|
|
t.Error("expected dockerhost alert to be resolved")
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllNodes resolves Node alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add node alert with Instance = "Node"
|
|
m.activeAlerts["node1-cpu"] = &Alert{
|
|
ID: "node1-cpu",
|
|
Type: "cpu",
|
|
Value: 90,
|
|
Instance: "Node",
|
|
}
|
|
|
|
m.config.DisableAllNodes = true
|
|
|
|
m.mu.Lock()
|
|
m.reevaluateActiveAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Node alert should be resolved
|
|
if _, exists := m.activeAlerts["node1-cpu"]; exists {
|
|
t.Error("expected Node alert to be resolved")
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllStorage resolves Storage alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add storage alert with Instance = "Storage"
|
|
m.activeAlerts["storage1-usage"] = &Alert{
|
|
ID: "storage1-usage",
|
|
Type: "usage",
|
|
Value: 90,
|
|
Instance: "Storage",
|
|
}
|
|
|
|
m.config.DisableAllStorage = true
|
|
|
|
m.mu.Lock()
|
|
m.reevaluateActiveAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Storage alert should be resolved
|
|
if _, exists := m.activeAlerts["storage1-usage"]; exists {
|
|
t.Error("expected Storage alert to be resolved")
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllPBS resolves PBS alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add PBS alert with Instance = "PBS"
|
|
m.activeAlerts["pbs1-cpu"] = &Alert{
|
|
ID: "pbs1-cpu",
|
|
Type: "cpu",
|
|
Value: 90,
|
|
Instance: "PBS",
|
|
}
|
|
|
|
m.config.DisableAllPBS = true
|
|
|
|
m.mu.Lock()
|
|
m.reevaluateActiveAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// PBS alert should be resolved
|
|
if _, exists := m.activeAlerts["pbs1-cpu"]; exists {
|
|
t.Error("expected PBS alert to be resolved")
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllGuests resolves Guest alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add guest alert with Instance set to something other than "Node"/"Storage"/"PBS"
|
|
// Note: If both Instance and Node are empty, it matches the node branch
|
|
m.activeAlerts["guest1-cpu"] = &Alert{
|
|
ID: "guest1-cpu",
|
|
Type: "cpu",
|
|
Value: 90,
|
|
Instance: "qemu/100", // Guest instance
|
|
Node: "pve1", // Different from Instance, so doesn't match node branch
|
|
}
|
|
|
|
m.config.DisableAllGuests = true
|
|
|
|
m.mu.Lock()
|
|
m.reevaluateActiveAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Guest alert should be resolved
|
|
if _, exists := m.activeAlerts["guest1-cpu"]; exists {
|
|
t.Error("expected Guest alert to be resolved")
|
|
}
|
|
})
|
|
|
|
t.Run("alert with disabled override is resolved", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add guest alert with override
|
|
m.activeAlerts["guest1-cpu"] = &Alert{
|
|
ID: "guest1-cpu",
|
|
Type: "cpu",
|
|
Value: 90,
|
|
Instance: "qemu/100",
|
|
Node: "pve1",
|
|
}
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"guest1": {Disabled: true},
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.reevaluateActiveAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Alert should be resolved due to disabled override
|
|
if _, exists := m.activeAlerts["guest1-cpu"]; exists {
|
|
t.Error("expected alert with disabled override to be resolved")
|
|
}
|
|
})
|
|
|
|
t.Run("alert below clear threshold is resolved", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add guest alert below new clear threshold
|
|
m.activeAlerts["guest1-cpu"] = &Alert{
|
|
ID: "guest1-cpu",
|
|
Type: "cpu",
|
|
Value: 70, // Below clear threshold
|
|
Threshold: 80,
|
|
Instance: "qemu/100",
|
|
Node: "pve1",
|
|
}
|
|
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 75}
|
|
|
|
m.mu.Lock()
|
|
m.reevaluateActiveAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Alert should be resolved (value 70 < clear 75)
|
|
if _, exists := m.activeAlerts["guest1-cpu"]; exists {
|
|
t.Error("expected alert below clear threshold to be resolved")
|
|
}
|
|
})
|
|
|
|
t.Run("alert between clear and trigger is resolved on config change", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Add guest alert between clear and new higher trigger
|
|
m.activeAlerts["guest1-cpu"] = &Alert{
|
|
ID: "guest1-cpu",
|
|
Type: "cpu",
|
|
Value: 85, // Between clear (75) and new trigger (90)
|
|
Threshold: 80,
|
|
Instance: "qemu/100",
|
|
Node: "pve1",
|
|
}
|
|
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 90, Clear: 75}
|
|
|
|
m.mu.Lock()
|
|
m.reevaluateActiveAlertsLocked()
|
|
m.mu.Unlock()
|
|
|
|
// Alert should be resolved (value 85 < trigger 90)
|
|
if _, exists := m.activeAlerts["guest1-cpu"]; exists {
|
|
t.Error("expected alert between thresholds to be resolved")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestHandleHostRemoved(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("empty host ID is no-op", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.activeAlerts["host-offline-host1"] = &Alert{ID: "host-offline-host1"}
|
|
m.mu.Unlock()
|
|
|
|
// Empty ID host
|
|
m.HandleHostRemoved(models.Host{ID: ""})
|
|
|
|
// Alert should still exist
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["host-offline-host1"]
|
|
m.mu.RUnlock()
|
|
if !exists {
|
|
t.Error("expected alert to remain when empty host ID passed")
|
|
}
|
|
})
|
|
|
|
t.Run("clears host offline alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.activeAlerts["host-offline-host1"] = &Alert{
|
|
ID: "host-offline-host1",
|
|
ResourceID: "host:host1",
|
|
}
|
|
m.offlineConfirmations["host:host1"] = 5
|
|
m.mu.Unlock()
|
|
|
|
m.HandleHostRemoved(models.Host{ID: "host1", Hostname: "testhost"})
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["host-offline-host1"]
|
|
_, confirmExists := m.offlineConfirmations["host:host1"]
|
|
m.mu.RUnlock()
|
|
|
|
if alertExists {
|
|
t.Error("expected host offline alert to be cleared")
|
|
}
|
|
if confirmExists {
|
|
t.Error("expected offline confirmations to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("clears host metric alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
// Add CPU and memory alerts for host
|
|
m.activeAlerts["host:host1-cpu"] = &Alert{
|
|
ID: "host:host1-cpu",
|
|
ResourceID: "host:host1",
|
|
}
|
|
m.activeAlerts["host:host1-memory"] = &Alert{
|
|
ID: "host:host1-memory",
|
|
ResourceID: "host:host1",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.HandleHostRemoved(models.Host{ID: "host1", Hostname: "testhost"})
|
|
|
|
m.mu.RLock()
|
|
_, cpuExists := m.activeAlerts["host:host1-cpu"]
|
|
_, memExists := m.activeAlerts["host:host1-memory"]
|
|
m.mu.RUnlock()
|
|
|
|
if cpuExists {
|
|
t.Error("expected host CPU alert to be cleared")
|
|
}
|
|
if memExists {
|
|
t.Error("expected host memory alert to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("clears host disk alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
// Add disk alerts for host
|
|
m.activeAlerts["host:host1/disk:sda-usage"] = &Alert{
|
|
ID: "host:host1/disk:sda-usage",
|
|
ResourceID: "host:host1/disk:sda",
|
|
}
|
|
m.activeAlerts["host:host1/disk:sdb-usage"] = &Alert{
|
|
ID: "host:host1/disk:sdb-usage",
|
|
ResourceID: "host:host1/disk:sdb",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.HandleHostRemoved(models.Host{ID: "host1", Hostname: "testhost"})
|
|
|
|
m.mu.RLock()
|
|
_, sda := m.activeAlerts["host:host1/disk:sda-usage"]
|
|
_, sdb := m.activeAlerts["host:host1/disk:sdb-usage"]
|
|
m.mu.RUnlock()
|
|
|
|
if sda {
|
|
t.Error("expected host disk sda alert to be cleared")
|
|
}
|
|
if sdb {
|
|
t.Error("expected host disk sdb alert to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("clears all alert types together", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
// Add multiple alert types
|
|
m.activeAlerts["host-offline-host1"] = &Alert{ID: "host-offline-host1", ResourceID: "host:host1"}
|
|
m.activeAlerts["host:host1-cpu"] = &Alert{ID: "host:host1-cpu", ResourceID: "host:host1"}
|
|
m.activeAlerts["host:host1-memory"] = &Alert{ID: "host:host1-memory", ResourceID: "host:host1"}
|
|
m.activeAlerts["host:host1/disk:sda-usage"] = &Alert{ID: "host:host1/disk:sda-usage", ResourceID: "host:host1/disk:sda"}
|
|
m.offlineConfirmations["host:host1"] = 3
|
|
m.mu.Unlock()
|
|
|
|
m.HandleHostRemoved(models.Host{ID: "host1", Hostname: "testhost"})
|
|
|
|
m.mu.RLock()
|
|
alertCount := 0
|
|
for id := range m.activeAlerts {
|
|
if strings.Contains(id, "host1") {
|
|
alertCount++
|
|
}
|
|
}
|
|
_, confirmExists := m.offlineConfirmations["host:host1"]
|
|
m.mu.RUnlock()
|
|
|
|
if alertCount > 0 {
|
|
t.Errorf("expected all host1 alerts to be cleared, got %d remaining", alertCount)
|
|
}
|
|
if confirmExists {
|
|
t.Error("expected offline confirmations to be cleared")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestReevaluateGuestAlert(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("no active alerts is no-op", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 70}
|
|
m.mu.Unlock()
|
|
|
|
// No alerts exist - should not panic
|
|
m.ReevaluateGuestAlert(nil, "guest1")
|
|
|
|
m.mu.RLock()
|
|
count := len(m.activeAlerts)
|
|
m.mu.RUnlock()
|
|
if count != 0 {
|
|
t.Errorf("expected 0 alerts, got %d", count)
|
|
}
|
|
})
|
|
|
|
t.Run("clears alert when threshold disabled (nil)", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.activeAlerts["guest1-cpu"] = &Alert{
|
|
ID: "guest1-cpu",
|
|
Type: "cpu",
|
|
Value: 90,
|
|
}
|
|
m.config.GuestDefaults.CPU = nil // Disabled
|
|
m.mu.Unlock()
|
|
|
|
m.ReevaluateGuestAlert(nil, "guest1")
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["guest1-cpu"]
|
|
m.mu.RUnlock()
|
|
if exists {
|
|
t.Error("expected alert to be cleared when threshold is nil")
|
|
}
|
|
})
|
|
|
|
t.Run("clears alert when trigger is zero", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.activeAlerts["guest1-memory"] = &Alert{
|
|
ID: "guest1-memory",
|
|
Type: "memory",
|
|
Value: 85,
|
|
}
|
|
m.config.GuestDefaults.Memory = &HysteresisThreshold{Trigger: 0, Clear: 0}
|
|
m.mu.Unlock()
|
|
|
|
m.ReevaluateGuestAlert(nil, "guest1")
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["guest1-memory"]
|
|
m.mu.RUnlock()
|
|
if exists {
|
|
t.Error("expected alert to be cleared when trigger is 0")
|
|
}
|
|
})
|
|
|
|
t.Run("clears alert when value below clear threshold", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.activeAlerts["guest1-cpu"] = &Alert{
|
|
ID: "guest1-cpu",
|
|
Type: "cpu",
|
|
Value: 65, // Below clear threshold of 70
|
|
}
|
|
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 70}
|
|
m.mu.Unlock()
|
|
|
|
m.ReevaluateGuestAlert(nil, "guest1")
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["guest1-cpu"]
|
|
m.mu.RUnlock()
|
|
if exists {
|
|
t.Error("expected alert to be cleared when value below clear threshold")
|
|
}
|
|
})
|
|
|
|
t.Run("clears alert when value below trigger threshold", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.activeAlerts["guest1-disk"] = &Alert{
|
|
ID: "guest1-disk",
|
|
Type: "disk",
|
|
Value: 75, // Below trigger of 80
|
|
}
|
|
m.config.GuestDefaults.Disk = &HysteresisThreshold{Trigger: 80, Clear: 70}
|
|
m.mu.Unlock()
|
|
|
|
m.ReevaluateGuestAlert(nil, "guest1")
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["guest1-disk"]
|
|
m.mu.RUnlock()
|
|
if exists {
|
|
t.Error("expected alert to be cleared when value below trigger")
|
|
}
|
|
})
|
|
|
|
t.Run("keeps alert when value above both thresholds", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.activeAlerts["guest1-cpu"] = &Alert{
|
|
ID: "guest1-cpu",
|
|
Type: "cpu",
|
|
Value: 90, // Above both trigger (80) and clear (70)
|
|
}
|
|
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 70}
|
|
m.mu.Unlock()
|
|
|
|
m.ReevaluateGuestAlert(nil, "guest1")
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["guest1-cpu"]
|
|
m.mu.RUnlock()
|
|
if !exists {
|
|
t.Error("expected alert to remain when value above thresholds")
|
|
}
|
|
})
|
|
|
|
t.Run("processes all metric types", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
// Add alerts for all metric types with values below threshold
|
|
metrics := []string{"cpu", "memory", "disk", "diskRead", "diskWrite", "networkIn", "networkOut"}
|
|
for _, metric := range metrics {
|
|
m.activeAlerts[fmt.Sprintf("guest1-%s", metric)] = &Alert{
|
|
ID: fmt.Sprintf("guest1-%s", metric),
|
|
Type: metric,
|
|
Value: 50, // Below threshold
|
|
}
|
|
}
|
|
threshold := &HysteresisThreshold{Trigger: 80, Clear: 70}
|
|
m.config.GuestDefaults.CPU = threshold
|
|
m.config.GuestDefaults.Memory = threshold
|
|
m.config.GuestDefaults.Disk = threshold
|
|
m.config.GuestDefaults.DiskRead = threshold
|
|
m.config.GuestDefaults.DiskWrite = threshold
|
|
m.config.GuestDefaults.NetworkIn = threshold
|
|
m.config.GuestDefaults.NetworkOut = threshold
|
|
m.mu.Unlock()
|
|
|
|
m.ReevaluateGuestAlert(nil, "guest1")
|
|
|
|
m.mu.RLock()
|
|
remaining := len(m.activeAlerts)
|
|
m.mu.RUnlock()
|
|
if remaining != 0 {
|
|
t.Errorf("expected all alerts to be cleared, got %d remaining", remaining)
|
|
}
|
|
})
|
|
|
|
t.Run("clears pending alert when threshold disabled", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.activeAlerts["guest1-cpu"] = &Alert{
|
|
ID: "guest1-cpu",
|
|
Type: "cpu",
|
|
Value: 90,
|
|
}
|
|
m.pendingAlerts["guest1-cpu"] = time.Now() // pendingAlerts is map[string]time.Time
|
|
m.config.GuestDefaults.CPU = nil // Disabled
|
|
m.mu.Unlock()
|
|
|
|
m.ReevaluateGuestAlert(nil, "guest1")
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["guest1-cpu"]
|
|
_, pendingExists := m.pendingAlerts["guest1-cpu"]
|
|
m.mu.RUnlock()
|
|
if alertExists {
|
|
t.Error("expected active alert to be cleared")
|
|
}
|
|
if pendingExists {
|
|
t.Error("expected pending alert to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("uses clear equals trigger when clear is zero", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.activeAlerts["guest1-cpu"] = &Alert{
|
|
ID: "guest1-cpu",
|
|
Type: "cpu",
|
|
Value: 75, // Below trigger of 80
|
|
}
|
|
// Clear is 0, so it should use trigger (80) as clear threshold
|
|
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 0}
|
|
m.mu.Unlock()
|
|
|
|
m.ReevaluateGuestAlert(nil, "guest1")
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["guest1-cpu"]
|
|
m.mu.RUnlock()
|
|
if exists {
|
|
t.Error("expected alert to be cleared when value below trigger (used as clear)")
|
|
}
|
|
})
|
|
|
|
t.Run("ignores alerts for different guests", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.activeAlerts["guest1-cpu"] = &Alert{
|
|
ID: "guest1-cpu",
|
|
Type: "cpu",
|
|
Value: 50, // Below threshold
|
|
}
|
|
m.activeAlerts["guest2-cpu"] = &Alert{
|
|
ID: "guest2-cpu",
|
|
Type: "cpu",
|
|
Value: 50, // Below threshold
|
|
}
|
|
m.config.GuestDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 70}
|
|
m.mu.Unlock()
|
|
|
|
// Only reevaluate guest1
|
|
m.ReevaluateGuestAlert(nil, "guest1")
|
|
|
|
m.mu.RLock()
|
|
_, guest1Exists := m.activeAlerts["guest1-cpu"]
|
|
_, guest2Exists := m.activeAlerts["guest2-cpu"]
|
|
m.mu.RUnlock()
|
|
|
|
if guest1Exists {
|
|
t.Error("expected guest1 alert to be cleared")
|
|
}
|
|
if !guest2Exists {
|
|
t.Error("expected guest2 alert to remain (not reevaluated)")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestHandleDockerHostOffline(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("empty host ID is no-op", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
initialCount := len(m.activeAlerts)
|
|
m.mu.Unlock()
|
|
|
|
m.HandleDockerHostOffline(models.DockerHost{ID: ""})
|
|
|
|
m.mu.RLock()
|
|
finalCount := len(m.activeAlerts)
|
|
m.mu.RUnlock()
|
|
if finalCount != initialCount {
|
|
t.Error("expected no change when empty host ID passed")
|
|
}
|
|
})
|
|
|
|
t.Run("disabled alerts is no-op", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = false
|
|
m.mu.Unlock()
|
|
|
|
m.HandleDockerHostOffline(models.DockerHost{ID: "docker1", DisplayName: "Docker Host 1"})
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["docker-host-offline-docker1"]
|
|
m.mu.RUnlock()
|
|
if exists {
|
|
t.Error("expected no alert when alerts are disabled")
|
|
}
|
|
})
|
|
|
|
t.Run("DisableAllDockerHostsOffline clears tracking and alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.config.DisableAllDockerHostsOffline = true
|
|
m.dockerOfflineCount["docker1"] = 5
|
|
m.activeAlerts["docker-host-offline-docker1"] = &Alert{ID: "docker-host-offline-docker1"}
|
|
m.mu.Unlock()
|
|
|
|
m.HandleDockerHostOffline(models.DockerHost{ID: "docker1", DisplayName: "Docker Host 1"})
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["docker-host-offline-docker1"]
|
|
_, countExists := m.dockerOfflineCount["docker1"]
|
|
m.mu.RUnlock()
|
|
if alertExists {
|
|
t.Error("expected alert to be cleared")
|
|
}
|
|
if countExists {
|
|
t.Error("expected offline count to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("override DisableConnectivity clears tracking and alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"docker1": {DisableConnectivity: true},
|
|
}
|
|
m.dockerOfflineCount["docker1"] = 3
|
|
m.activeAlerts["docker-host-offline-docker1"] = &Alert{ID: "docker-host-offline-docker1"}
|
|
m.mu.Unlock()
|
|
|
|
m.HandleDockerHostOffline(models.DockerHost{ID: "docker1", DisplayName: "Docker Host 1"})
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["docker-host-offline-docker1"]
|
|
_, countExists := m.dockerOfflineCount["docker1"]
|
|
m.mu.RUnlock()
|
|
if alertExists {
|
|
t.Error("expected alert to be cleared with override")
|
|
}
|
|
if countExists {
|
|
t.Error("expected offline count to be cleared with override")
|
|
}
|
|
})
|
|
|
|
t.Run("existing alert updates LastSeen", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
oldTime := time.Now().Add(-1 * time.Hour)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.activeAlerts["docker-host-offline-docker1"] = &Alert{
|
|
ID: "docker-host-offline-docker1",
|
|
LastSeen: oldTime,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.HandleDockerHostOffline(models.DockerHost{ID: "docker1", DisplayName: "Docker Host 1"})
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["docker-host-offline-docker1"]
|
|
m.mu.RUnlock()
|
|
if alert == nil {
|
|
t.Fatal("expected alert to exist")
|
|
}
|
|
if !alert.LastSeen.After(oldTime) {
|
|
t.Error("expected LastSeen to be updated")
|
|
}
|
|
})
	t.Run("requires 3 confirmations before alert", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.Enabled = true
		m.mu.Unlock()

		host := models.DockerHost{ID: "docker1", DisplayName: "Docker Host 1", Hostname: "docker-server"}

		// First call - confirmation 1
		m.HandleDockerHostOffline(host)
		m.mu.RLock()
		count1 := m.dockerOfflineCount["docker1"]
		alert1 := m.activeAlerts["docker-host-offline-docker1"]
		m.mu.RUnlock()
		if count1 != 1 {
			t.Errorf("expected count 1, got %d", count1)
		}
		if alert1 != nil {
			t.Error("expected no alert after 1 confirmation")
		}

		// Second call - confirmation 2
		m.HandleDockerHostOffline(host)
		m.mu.RLock()
		count2 := m.dockerOfflineCount["docker1"]
		alert2 := m.activeAlerts["docker-host-offline-docker1"]
		m.mu.RUnlock()
		if count2 != 2 {
			t.Errorf("expected count 2, got %d", count2)
		}
		if alert2 != nil {
			t.Error("expected no alert after 2 confirmations")
		}

		// Third call - confirmation 3 - should create alert
		m.HandleDockerHostOffline(host)
		m.mu.RLock()
		count3 := m.dockerOfflineCount["docker1"]
		alert3 := m.activeAlerts["docker-host-offline-docker1"]
		m.mu.RUnlock()
		if count3 != 3 {
			t.Errorf("expected count 3, got %d", count3)
		}
		if alert3 == nil {
			t.Fatal("expected alert after 3 confirmations")
		}
		if alert3.Type != "docker-host-offline" {
			t.Errorf("expected type docker-host-offline, got %s", alert3.Type)
		}
		if alert3.Level != AlertLevelCritical {
			t.Errorf("expected critical level, got %s", alert3.Level)
		}
	})

	t.Run("alert has correct metadata", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.Enabled = true
		m.dockerOfflineCount["docker1"] = 2 // Pre-set to trigger on next call
		m.mu.Unlock()

		host := models.DockerHost{
			ID:          "docker1",
			DisplayName: "My Docker Host",
			Hostname:    "docker-server.local",
			AgentID:     "agent-123",
		}

		m.HandleDockerHostOffline(host)

		m.mu.RLock()
		alert := m.activeAlerts["docker-host-offline-docker1"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected alert to be created")
		}
		if alert.ResourceID != "docker:docker1" {
			t.Errorf("expected resourceID docker:docker1, got %s", alert.ResourceID)
		}
		if alert.ResourceName != "My Docker Host" {
			t.Errorf("expected resourceName 'My Docker Host', got %s", alert.ResourceName)
		}
		if alert.Node != "docker-server.local" {
			t.Errorf("expected node docker-server.local, got %s", alert.Node)
		}
		if alert.Metadata["resourceType"] != "DockerHost" {
			t.Errorf("expected metadata resourceType DockerHost, got %v", alert.Metadata["resourceType"])
		}
		if alert.Metadata["hostId"] != "docker1" {
			t.Errorf("expected metadata hostId docker1, got %v", alert.Metadata["hostId"])
		}
		if alert.Metadata["agentId"] != "agent-123" {
			t.Errorf("expected metadata agentId agent-123, got %v", alert.Metadata["agentId"])
		}
	})
}

func TestSetMetricHooks(t *testing.T) {
	// NOT parallel - modifies package-level state
	// Save existing state and restore after test
	oldFired := recordAlertFired
	oldResolved := recordAlertResolved
	oldSuppressed := recordAlertSuppressed
	oldAcknowledged := recordAlertAcknowledged
	defer func() {
		recordAlertFired = oldFired
		recordAlertResolved = oldResolved
		recordAlertSuppressed = oldSuppressed
		recordAlertAcknowledged = oldAcknowledged
	}()

	t.Run("sets all hooks", func(t *testing.T) {
		var firedCalled, resolvedCalled, suppressedCalled, acknowledgedCalled bool

		SetMetricHooks(
			func(a *Alert) { firedCalled = true },
			func(a *Alert) { resolvedCalled = true },
			func(s string) { suppressedCalled = true },
			func() { acknowledgedCalled = true },
		)

		// Verify hooks are set by calling them (if they were nil, this would panic)
		if recordAlertFired != nil {
			recordAlertFired(&Alert{})
		}
		if recordAlertResolved != nil {
			recordAlertResolved(&Alert{})
		}
		if recordAlertSuppressed != nil {
			recordAlertSuppressed("test")
		}
		if recordAlertAcknowledged != nil {
			recordAlertAcknowledged()
		}

		if !firedCalled {
			t.Error("expected fired hook to be called")
		}
		if !resolvedCalled {
			t.Error("expected resolved hook to be called")
		}
		if !suppressedCalled {
			t.Error("expected suppressed hook to be called")
		}
		if !acknowledgedCalled {
			t.Error("expected acknowledged hook to be called")
		}
	})

	t.Run("nil hooks are safe", func(t *testing.T) {
		SetMetricHooks(nil, nil, nil, nil)

		// Should not panic
		if recordAlertFired != nil {
			t.Error("expected fired hook to be nil")
		}
		if recordAlertResolved != nil {
			t.Error("expected resolved hook to be nil")
		}
	})
}

func TestNotifyExistingAlert(t *testing.T) {
	// t.Parallel()

	t.Run("non-existent alert is no-op", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Should not panic
		m.NotifyExistingAlert("non-existent-alert")
	})

	t.Run("existing alert dispatches notification", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		dispatchedCh := make(chan bool, 1)
		m.SetAlertCallback(func(a *Alert) {
			dispatchedCh <- true
		})

		m.mu.Lock()
		m.config.Enabled = true
		m.config.ActivationState = ActivationActive // Must be active to dispatch
		m.activeAlerts["test-alert"] = &Alert{
			ID:    "test-alert",
			Type:  "test",
			Level: AlertLevelWarning,
		}
		m.mu.Unlock()

		m.NotifyExistingAlert("test-alert")

		// Wait for async dispatch with timeout
		select {
		case <-dispatchedCh:
			// Success
		case <-time.After(1 * time.Second):
			t.Error("expected alert callback to be called (timeout)")
		}
	})
}

func TestGetResolvedAlert(t *testing.T) {
	// t.Parallel()

	t.Run("returns nil for non-existent alert", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		result := m.GetResolvedAlert("non-existent")
		if result != nil {
			t.Error("expected nil for non-existent alert")
		}
	})

	t.Run("returns nil for nil resolved entry", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.resolvedMutex.Lock()
		m.recentlyResolved["test"] = nil
		m.resolvedMutex.Unlock()

		result := m.GetResolvedAlert("test")
		if result != nil {
			t.Error("expected nil for nil resolved entry")
		}
	})

	t.Run("returns nil when Alert is nil", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.resolvedMutex.Lock()
		m.recentlyResolved["test"] = &ResolvedAlert{Alert: nil}
		m.resolvedMutex.Unlock()

		result := m.GetResolvedAlert("test")
		if result != nil {
			t.Error("expected nil when Alert is nil")
		}
	})

	t.Run("returns cloned alert", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		resolvedTime := time.Now()
		m.resolvedMutex.Lock()
		m.recentlyResolved["test"] = &ResolvedAlert{
			Alert: &Alert{
				ID:           "test",
				Type:         "cpu",
				Level:        AlertLevelWarning,
				ResourceID:   "res1",
				ResourceName: "Resource 1",
			},
			ResolvedTime: resolvedTime,
		}
		m.resolvedMutex.Unlock()

		result := m.GetResolvedAlert("test")
		if result == nil {
			t.Fatal("expected non-nil result")
		}
		if result.Alert.ID != "test" {
			t.Errorf("expected ID test, got %s", result.Alert.ID)
		}
		if result.ResolvedTime != resolvedTime {
			t.Error("expected resolved time to match")
		}
	})
}

func TestGetAlertHistory(t *testing.T) {
	// t.Parallel()

	t.Run("returns history from history manager", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Add some alerts to history
		m.historyManager.AddAlert(Alert{ID: "alert1", Type: "cpu"})
		m.historyManager.AddAlert(Alert{ID: "alert2", Type: "memory"})

		history := m.GetAlertHistory(10)
		if len(history) < 2 {
			t.Errorf("expected at least 2 history entries, got %d", len(history))
		}
	})

	t.Run("respects limit", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Add alerts
		for i := 0; i < 5; i++ {
			m.historyManager.AddAlert(Alert{ID: fmt.Sprintf("alert%d", i), Type: "test"})
		}

		history := m.GetAlertHistory(2)
		if len(history) > 2 {
			t.Errorf("expected max 2 entries, got %d", len(history))
		}
	})
}

func TestGetAlertHistorySince(t *testing.T) {
	// t.Parallel()

	t.Run("zero time returns all history", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.historyManager.AddAlert(Alert{ID: "alert1", Type: "cpu"})

		history := m.GetAlertHistorySince(time.Time{}, 10)
		if len(history) == 0 {
			t.Error("expected history entries for zero time")
		}
	})

	t.Run("filters by time", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Add an alert
		m.historyManager.AddAlert(Alert{ID: "alert1", Type: "cpu", StartTime: time.Now()})

		// Query for alerts after now + 1 hour (should return none)
		future := time.Now().Add(1 * time.Hour)
		history := m.GetAlertHistorySince(future, 10)
		if len(history) != 0 {
			t.Errorf("expected 0 entries for future time, got %d", len(history))
		}
	})
}

func TestClearAlertHistory(t *testing.T) {
	// t.Parallel()

	t.Run("clears all history", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Add some alerts
		m.historyManager.AddAlert(Alert{ID: "alert1", Type: "cpu"})
		m.historyManager.AddAlert(Alert{ID: "alert2", Type: "memory"})

		err := m.ClearAlertHistory()
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}

		history := m.GetAlertHistory(10)
		if len(history) != 0 {
			t.Errorf("expected 0 entries after clear, got %d", len(history))
		}
	})
}

func TestClearNodeOfflineAlert(t *testing.T) {
	// t.Parallel()

	t.Run("no alert and no count is no-op", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		node := models.Node{ID: "node1", Name: "Node 1"}
		m.clearNodeOfflineAlert(node)

		m.mu.RLock()
		alertCount := len(m.activeAlerts)
		m.mu.RUnlock()
		if alertCount != 0 {
			t.Errorf("expected 0 alerts, got %d", alertCount)
		}
	})

	t.Run("resets offline count when node comes online", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.nodeOfflineCount["node1"] = 5
		m.mu.Unlock()

		node := models.Node{ID: "node1", Name: "Node 1"}
		m.clearNodeOfflineAlert(node)

		m.mu.RLock()
		_, exists := m.nodeOfflineCount["node1"]
		m.mu.RUnlock()
		if exists {
			t.Error("expected offline count to be cleared")
		}
	})

	t.Run("clears existing alert and adds to resolved", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		resolvedCh := make(chan struct{}, 1)
		m.SetResolvedCallback(func(alertID string) {
			resolvedCh <- struct{}{}
		})

		m.mu.Lock()
		m.nodeOfflineCount["node1"] = 3
		m.activeAlerts["node-offline-node1"] = &Alert{
			ID:        "node-offline-node1",
			Type:      "offline",
			StartTime: time.Now().Add(-10 * time.Minute),
		}
		m.mu.Unlock()

		node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}
		m.clearNodeOfflineAlert(node)

		m.mu.RLock()
		_, alertExists := m.activeAlerts["node-offline-node1"]
		_, countExists := m.nodeOfflineCount["node1"]
		m.mu.RUnlock()

		if alertExists {
			t.Error("expected alert to be cleared")
		}
		if countExists {
			t.Error("expected offline count to be cleared")
		}

		// Check resolved
		m.resolvedMutex.RLock()
		resolved := m.recentlyResolved["node-offline-node1"]
		m.resolvedMutex.RUnlock()
		if resolved == nil {
			t.Error("expected alert to be added to recently resolved")
		}
		select {
		case <-resolvedCh:
		case <-time.After(2 * time.Second):
			t.Error("expected resolved callback to be called")
		}
	})
}

// TestClearOfflineAlertNoDeadlock is a regression test for a deadlock introduced
// by commit 07b4765b. The resolved callback (handleAlertResolved) calls
// ShouldSuppressResolvedNotification which acquires m.mu.RLock(). If the
// clear*OfflineAlert functions call the callback synchronously while holding
// m.mu.Lock(), Go's non-reentrant RWMutex deadlocks.
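//
// For illustration only (hypothetical names, not the exact production code),
// the problematic shape looks roughly like:
//
//	m.mu.Lock()
//	delete(m.activeAlerts, alertID)
//	onResolved(alertID) // callback re-acquires m.mu.RLock() -> deadlock
//	m.mu.Unlock()
//
// The clear functions must therefore invoke the callback outside the critical
// section; the cases below verify that they do.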
func TestClearOfflineAlertNoDeadlock(t *testing.T) {
	// t.Parallel()

	type testCase struct {
		name    string
		setupFn func(m *Manager)
		clearFn func(m *Manager)
	}

	cases := []testCase{
		{
			name: "clearNodeOfflineAlert",
			setupFn: func(m *Manager) {
				m.mu.Lock()
				m.activeAlerts["node-offline-node1"] = &Alert{
					ID:        "node-offline-node1",
					Type:      "offline",
					StartTime: time.Now().Add(-5 * time.Minute),
				}
				m.mu.Unlock()
			},
			clearFn: func(m *Manager) {
				m.clearNodeOfflineAlert(models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"})
			},
		},
		{
			name: "clearPBSOfflineAlert",
			setupFn: func(m *Manager) {
				m.mu.Lock()
				m.activeAlerts["pbs-offline-pbs1"] = &Alert{
					ID:        "pbs-offline-pbs1",
					Type:      "offline",
					StartTime: time.Now().Add(-5 * time.Minute),
				}
				m.mu.Unlock()
			},
			clearFn: func(m *Manager) {
				m.clearPBSOfflineAlert(models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "host1"})
			},
		},
		{
			name: "clearPMGOfflineAlert",
			setupFn: func(m *Manager) {
				m.mu.Lock()
				m.activeAlerts["pmg-offline-pmg1"] = &Alert{
					ID:        "pmg-offline-pmg1",
					Type:      "offline",
					StartTime: time.Now().Add(-5 * time.Minute),
				}
				m.mu.Unlock()
			},
			clearFn: func(m *Manager) {
				m.clearPMGOfflineAlert(models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "host1"})
			},
		},
		{
			name: "clearStorageOfflineAlert",
			setupFn: func(m *Manager) {
				m.mu.Lock()
				m.activeAlerts["storage-offline-stor1"] = &Alert{
					ID:        "storage-offline-stor1",
					Type:      "offline",
					StartTime: time.Now().Add(-5 * time.Minute),
				}
				m.mu.Unlock()
			},
			clearFn: func(m *Manager) {
				m.clearStorageOfflineAlert(models.Storage{ID: "stor1", Name: "Storage 1", Node: "node1"})
			},
		},
		{
			name: "clearGuestPoweredOffAlert",
			setupFn: func(m *Manager) {
				m.mu.Lock()
				m.activeAlerts["guest-powered-off-vm100"] = &Alert{
					ID:        "guest-powered-off-vm100",
					Type:      "powered-off",
					StartTime: time.Now().Add(-5 * time.Minute),
				}
				m.mu.Unlock()
			},
			clearFn: func(m *Manager) {
				m.clearGuestPoweredOffAlert("vm100", "TestVM")
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			m := newTestManager(t)

			// Simulate what handleAlertResolved does in production:
			// it calls ShouldSuppressResolvedNotification which acquires m.mu.RLock().
			// Before the fix, this deadlocked because the caller held m.mu.Lock().
			done := make(chan struct{})
			m.SetResolvedCallback(func(alertID string) {
				_ = m.ShouldSuppressResolvedNotification(&Alert{ID: alertID})
				close(done)
			})

			tc.setupFn(m)
			tc.clearFn(m)

			select {
			case <-done:
				// Callback completed without deadlock
			case <-time.After(3 * time.Second):
				t.Fatal("deadlock: resolved callback did not complete within 3 seconds")
			}
		})
	}
}

func TestClearPBSOfflineAlert(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("no alert and no count is no-op", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1"}
|
|
m.clearPBSOfflineAlert(pbs)
|
|
|
|
m.mu.RLock()
|
|
alertCount := len(m.activeAlerts)
|
|
m.mu.RUnlock()
|
|
if alertCount != 0 {
|
|
t.Errorf("expected 0 alerts, got %d", alertCount)
|
|
}
|
|
})
|
|
|
|
t.Run("resets offline confirmation count", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.offlineConfirmations["pbs1"] = 5
|
|
m.mu.Unlock()
|
|
|
|
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1"}
|
|
m.clearPBSOfflineAlert(pbs)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.offlineConfirmations["pbs1"]
|
|
m.mu.RUnlock()
|
|
if exists {
|
|
t.Error("expected offline confirmation count to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("clears existing alert and adds to resolved", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
resolvedCh := make(chan struct{}, 1)
|
|
m.SetResolvedCallback(func(alertID string) {
|
|
resolvedCh <- struct{}{}
|
|
})
|
|
|
|
m.mu.Lock()
|
|
m.offlineConfirmations["pbs1"] = 3
|
|
m.activeAlerts["pbs-offline-pbs1"] = &Alert{
|
|
ID: "pbs-offline-pbs1",
|
|
Type: "offline",
|
|
StartTime: time.Now().Add(-5 * time.Minute),
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "pbs.local"}
|
|
m.clearPBSOfflineAlert(pbs)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["pbs-offline-pbs1"]
|
|
_, countExists := m.offlineConfirmations["pbs1"]
|
|
m.mu.RUnlock()
|
|
|
|
if alertExists {
|
|
t.Error("expected alert to be cleared")
|
|
}
|
|
if countExists {
|
|
t.Error("expected offline confirmation count to be cleared")
|
|
}
|
|
|
|
// Check resolved
|
|
m.resolvedMutex.RLock()
|
|
resolved := m.recentlyResolved["pbs-offline-pbs1"]
|
|
m.resolvedMutex.RUnlock()
|
|
if resolved == nil {
|
|
t.Error("expected alert to be added to recently resolved")
|
|
}
|
|
select {
|
|
case <-resolvedCh:
|
|
case <-time.After(2 * time.Second):
|
|
t.Error("expected resolved callback to be called")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestClearPMGOfflineAlert(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("no alert and no count is no-op", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
|
|
m.clearPMGOfflineAlert(pmg)
|
|
|
|
m.mu.RLock()
|
|
alertCount := len(m.activeAlerts)
|
|
m.mu.RUnlock()
|
|
if alertCount != 0 {
|
|
t.Errorf("expected 0 alerts, got %d", alertCount)
|
|
}
|
|
})
|
|
|
|
t.Run("resets offline confirmation count", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.offlineConfirmations["pmg1"] = 5
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
|
|
m.clearPMGOfflineAlert(pmg)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.offlineConfirmations["pmg1"]
|
|
m.mu.RUnlock()
|
|
if exists {
|
|
t.Error("expected offline confirmation count to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("clears existing alert and adds to resolved", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
resolvedCh := make(chan struct{}, 1)
|
|
m.SetResolvedCallback(func(alertID string) {
|
|
resolvedCh <- struct{}{}
|
|
})
|
|
|
|
m.mu.Lock()
|
|
m.offlineConfirmations["pmg1"] = 3
|
|
m.activeAlerts["pmg-offline-pmg1"] = &Alert{
|
|
ID: "pmg-offline-pmg1",
|
|
Type: "offline",
|
|
StartTime: time.Now().Add(-5 * time.Minute),
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "pmg.local"}
|
|
m.clearPMGOfflineAlert(pmg)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["pmg-offline-pmg1"]
|
|
_, countExists := m.offlineConfirmations["pmg1"]
|
|
m.mu.RUnlock()
|
|
|
|
if alertExists {
|
|
t.Error("expected alert to be cleared")
|
|
}
|
|
if countExists {
|
|
t.Error("expected offline confirmation count to be cleared")
|
|
}
|
|
|
|
// Check resolved
|
|
m.resolvedMutex.RLock()
|
|
resolved := m.recentlyResolved["pmg-offline-pmg1"]
|
|
m.resolvedMutex.RUnlock()
|
|
if resolved == nil {
|
|
t.Error("expected alert to be added to recently resolved")
|
|
}
|
|
select {
|
|
case <-resolvedCh:
|
|
case <-time.After(2 * time.Second):
|
|
t.Error("expected resolved callback to be called")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestCheckNodeOffline(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("override DisableConnectivity clears alert and returns", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"node1": {DisableConnectivity: true},
|
|
}
|
|
m.activeAlerts["node-offline-node1"] = &Alert{ID: "node-offline-node1"}
|
|
m.nodeOfflineCount["node1"] = 5
|
|
m.mu.Unlock()
|
|
|
|
node := models.Node{ID: "node1", Name: "Node 1"}
|
|
m.checkNodeOffline(node)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["node-offline-node1"]
|
|
_, countExists := m.nodeOfflineCount["node1"]
|
|
m.mu.RUnlock()
|
|
|
|
if alertExists {
|
|
t.Error("expected alert to be cleared when connectivity disabled")
|
|
}
|
|
if countExists {
|
|
t.Error("expected offline count to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("existing alert updates LastSeen", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
oldTime := time.Now().Add(-1 * time.Hour)
|
|
m.mu.Lock()
|
|
m.activeAlerts["node-offline-node1"] = &Alert{
|
|
ID: "node-offline-node1",
|
|
StartTime: oldTime,
|
|
LastSeen: oldTime,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
node := models.Node{ID: "node1", Name: "Node 1"}
|
|
m.checkNodeOffline(node)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["node-offline-node1"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to exist")
|
|
}
|
|
if !alert.LastSeen.After(oldTime) {
|
|
t.Error("expected LastSeen to be updated")
|
|
}
|
|
if !alert.StartTime.Equal(oldTime) {
|
|
t.Error("expected StartTime to remain unchanged")
|
|
}
|
|
})
|
|
|
|
t.Run("insufficient confirmations waits", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}
|
|
|
|
// First call - count 1
|
|
m.checkNodeOffline(node)
|
|
m.mu.RLock()
|
|
count1 := m.nodeOfflineCount["node1"]
|
|
alert1 := m.activeAlerts["node-offline-node1"]
|
|
m.mu.RUnlock()
|
|
if count1 != 1 {
|
|
t.Errorf("expected count 1, got %d", count1)
|
|
}
|
|
if alert1 != nil {
|
|
t.Error("expected no alert after 1 confirmation")
|
|
}
|
|
|
|
// Second call - count 2
|
|
m.checkNodeOffline(node)
|
|
m.mu.RLock()
|
|
count2 := m.nodeOfflineCount["node1"]
|
|
alert2 := m.activeAlerts["node-offline-node1"]
|
|
m.mu.RUnlock()
|
|
if count2 != 2 {
|
|
t.Errorf("expected count 2, got %d", count2)
|
|
}
|
|
if alert2 != nil {
|
|
t.Error("expected no alert after 2 confirmations")
|
|
}
|
|
})
|
|
|
|
t.Run("creates alert after 3 confirmations", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.nodeOfflineCount["node1"] = 2 // Pre-set to trigger on next call
|
|
m.mu.Unlock()
|
|
|
|
node := models.Node{
|
|
ID: "node1",
|
|
Name: "Node 1",
|
|
Instance: "pve1",
|
|
Status: "offline",
|
|
ConnectionHealth: "disconnected",
|
|
}
|
|
|
|
m.checkNodeOffline(node)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["node-offline-node1"]
|
|
count := m.nodeOfflineCount["node1"]
|
|
m.mu.RUnlock()
|
|
|
|
if count != 3 {
|
|
t.Errorf("expected count 3, got %d", count)
|
|
}
|
|
if alert == nil {
|
|
t.Fatal("expected alert after 3 confirmations")
|
|
}
|
|
if alert.Type != "connectivity" {
|
|
t.Errorf("expected type connectivity, got %s", alert.Type)
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
if alert.ResourceID != "node1" {
|
|
t.Errorf("expected resourceID node1, got %s", alert.ResourceID)
|
|
}
|
|
})
|
|
|
|
t.Run("alert added to history", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.nodeOfflineCount["node1"] = 2
|
|
m.mu.Unlock()
|
|
|
|
node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}
|
|
m.checkNodeOffline(node)
|
|
|
|
// Check history
|
|
history := m.GetAlertHistory(10)
|
|
found := false
|
|
for _, h := range history {
|
|
if h.ID == "node-offline-node1" {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found {
|
|
t.Error("expected alert to be added to history")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestCheckPBSOffline(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("override Disabled clears alert and returns", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"pbs1": {Disabled: true},
|
|
}
|
|
m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1"}
|
|
m.mu.Unlock()
|
|
|
|
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1"}
|
|
m.checkPBSOffline(pbs)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["pbs-offline-pbs1"]
|
|
m.mu.RUnlock()
|
|
|
|
if alertExists {
|
|
t.Error("expected alert to be cleared when disabled")
|
|
}
|
|
})
|
|
|
|
t.Run("override DisableConnectivity clears alert and returns", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"pbs1": {DisableConnectivity: true},
|
|
}
|
|
m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1"}
|
|
m.mu.Unlock()
|
|
|
|
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1"}
|
|
m.checkPBSOffline(pbs)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["pbs-offline-pbs1"]
|
|
m.mu.RUnlock()
|
|
|
|
if alertExists {
|
|
t.Error("expected alert to be cleared when connectivity disabled")
|
|
}
|
|
})
|
|
|
|
t.Run("insufficient confirmations waits", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1"}
|
|
|
|
// First two calls - not enough confirmations
|
|
m.checkPBSOffline(pbs)
|
|
m.checkPBSOffline(pbs)
|
|
|
|
m.mu.RLock()
|
|
count := m.offlineConfirmations["pbs1"]
|
|
_, alertExists := m.activeAlerts["pbs-offline-pbs1"]
|
|
m.mu.RUnlock()
|
|
|
|
if count != 2 {
|
|
t.Errorf("expected count 2, got %d", count)
|
|
}
|
|
if alertExists {
|
|
t.Error("expected no alert after 2 confirmations")
|
|
}
|
|
})
|
|
|
|
t.Run("creates alert after 3 confirmations", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.offlineConfirmations["pbs1"] = 2
|
|
m.mu.Unlock()
|
|
|
|
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "pbs.local"}
|
|
m.checkPBSOffline(pbs)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pbs-offline-pbs1"]
|
|
count := m.offlineConfirmations["pbs1"]
|
|
m.mu.RUnlock()
|
|
|
|
if count != 3 {
|
|
t.Errorf("expected count 3, got %d", count)
|
|
}
|
|
if alert == nil {
|
|
t.Fatal("expected alert after 3 confirmations")
|
|
}
|
|
if alert.Type != "offline" {
|
|
t.Errorf("expected type offline, got %s", alert.Type)
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("existing alert updates LastSeen", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
oldTime := time.Now().Add(-1 * time.Hour)
|
|
m.mu.Lock()
|
|
m.offlineConfirmations["pbs1"] = 3
|
|
m.activeAlerts["pbs-offline-pbs1"] = &Alert{
|
|
ID: "pbs-offline-pbs1",
|
|
LastSeen: oldTime,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1"}
|
|
m.checkPBSOffline(pbs)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pbs-offline-pbs1"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to exist")
|
|
}
|
|
if !alert.LastSeen.After(oldTime) {
|
|
t.Error("expected LastSeen to be updated")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestCheckPMGOffline(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("override Disabled clears alert and returns", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"pmg1": {Disabled: true},
|
|
}
|
|
m.activeAlerts["pmg-offline-pmg1"] = &Alert{ID: "pmg-offline-pmg1"}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
|
|
m.checkPMGOffline(pmg)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["pmg-offline-pmg1"]
|
|
m.mu.RUnlock()
|
|
|
|
if alertExists {
|
|
t.Error("expected alert to be cleared when disabled")
|
|
}
|
|
})
|
|
|
|
t.Run("override DisableConnectivity clears alert and returns", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"pmg1": {DisableConnectivity: true},
|
|
}
|
|
m.activeAlerts["pmg-offline-pmg1"] = &Alert{ID: "pmg-offline-pmg1"}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
|
|
m.checkPMGOffline(pmg)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["pmg-offline-pmg1"]
|
|
m.mu.RUnlock()
|
|
|
|
if alertExists {
|
|
t.Error("expected alert to be cleared when connectivity disabled")
|
|
}
|
|
})
|
|
|
|
t.Run("insufficient confirmations waits", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
|
|
|
|
// First two calls - not enough confirmations
|
|
m.checkPMGOffline(pmg)
|
|
m.checkPMGOffline(pmg)
|
|
|
|
m.mu.RLock()
|
|
count := m.offlineConfirmations["pmg1"]
|
|
_, alertExists := m.activeAlerts["pmg-offline-pmg1"]
|
|
m.mu.RUnlock()
|
|
|
|
if count != 2 {
|
|
t.Errorf("expected count 2, got %d", count)
|
|
}
|
|
if alertExists {
|
|
t.Error("expected no alert after 2 confirmations")
|
|
}
|
|
})
|
|
|
|
t.Run("creates alert after 3 confirmations", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
m.mu.Lock()
|
|
m.offlineConfirmations["pmg1"] = 2
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "pmg.local"}
|
|
m.checkPMGOffline(pmg)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg-offline-pmg1"]
|
|
count := m.offlineConfirmations["pmg1"]
|
|
m.mu.RUnlock()
|
|
|
|
if count != 3 {
|
|
t.Errorf("expected count 3, got %d", count)
|
|
}
|
|
if alert == nil {
|
|
t.Fatal("expected alert after 3 confirmations")
|
|
}
|
|
if alert.Type != "offline" {
|
|
t.Errorf("expected type offline, got %s", alert.Type)
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("existing alert updates LastSeen", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
oldTime := time.Now().Add(-1 * time.Hour)
|
|
m.mu.Lock()
|
|
m.offlineConfirmations["pmg1"] = 3
|
|
m.activeAlerts["pmg-offline-pmg1"] = &Alert{
|
|
ID: "pmg-offline-pmg1",
|
|
LastSeen: oldTime,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
|
|
m.checkPMGOffline(pmg)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg-offline-pmg1"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to exist")
|
|
}
|
|
if !alert.LastSeen.After(oldTime) {
|
|
t.Error("expected LastSeen to be updated")
|
|
}
|
|
})
|
|
}
|
|
|
|
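// calculateTrimmedBaseline, as exercised by the cases below (a summary
// inferred from the test expectations rather than from the implementation):
// fewer than 12 samples yields (0, false); 12-23 samples yields the simple
// mean; 24+ samples are sorted, the 2 lowest and 2 highest are dropped, and
// the trimmed mean is returned unless it differs from the median by more
// than 40%, in which case the median is used instead.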
func TestCalculateTrimmedBaseline(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("less than 12 samples returns untrustworthy", func(t *testing.T) {
|
|
// t.Parallel()
|
|
samples := []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}
|
|
baseline, trustworthy := calculateTrimmedBaseline(samples)
|
|
if trustworthy {
|
|
t.Error("expected untrustworthy with less than 12 samples")
|
|
}
|
|
if baseline != 0 {
|
|
t.Errorf("expected baseline 0, got %f", baseline)
|
|
}
|
|
})
|
|
|
|
t.Run("empty samples returns untrustworthy", func(t *testing.T) {
|
|
// t.Parallel()
|
|
samples := []float64{}
|
|
baseline, trustworthy := calculateTrimmedBaseline(samples)
|
|
if trustworthy {
|
|
t.Error("expected untrustworthy with empty samples")
|
|
}
|
|
if baseline != 0 {
|
|
t.Errorf("expected baseline 0, got %f", baseline)
|
|
}
|
|
})
|
|
|
|
t.Run("12-23 samples uses simple mean", func(t *testing.T) {
|
|
// t.Parallel()
|
|
// 12 samples summing to 78
|
|
samples := []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
|
|
baseline, trustworthy := calculateTrimmedBaseline(samples)
|
|
if !trustworthy {
|
|
t.Error("expected trustworthy with 12 samples")
|
|
}
|
|
// Mean of 1-12 is (1+2+...+12)/12 = 78/12 = 6.5
|
|
if baseline != 6.5 {
|
|
t.Errorf("expected baseline 6.5, got %f", baseline)
|
|
}
|
|
})
|
|
|
|
t.Run("24+ samples uses trimmed mean", func(t *testing.T) {
|
|
// t.Parallel()
|
|
// 24 identical values - trimmed mean should equal value
|
|
samples := make([]float64, 24)
|
|
for i := range samples {
|
|
samples[i] = 10.0
|
|
}
|
|
baseline, trustworthy := calculateTrimmedBaseline(samples)
|
|
if !trustworthy {
|
|
t.Error("expected trustworthy with 24 samples")
|
|
}
|
|
if baseline != 10.0 {
|
|
t.Errorf("expected baseline 10.0, got %f", baseline)
|
|
}
|
|
})
|
|
|
|
t.Run("24+ samples falls back to median when diff > 40%", func(t *testing.T) {
|
|
// t.Parallel()
|
|
// Create samples where trimmed mean differs significantly from median
|
|
// Mostly 10s with some extreme outliers that survive trimming
|
|
samples := make([]float64, 24)
|
|
for i := range samples {
|
|
if i < 4 {
|
|
samples[i] = 100.0 // Extreme high values
|
|
} else {
|
|
samples[i] = 10.0 // Normal values
|
|
}
|
|
}
|
|
// After sorting: 10,10,...,10,100,100,100,100
|
|
// Median is 10 (middle values are 10s)
|
|
// Trimmed mean (drop 2 highest and 2 lowest): still has 2 100s
|
|
// So trimmed mean > median * 1.4, should fall back to median
|
|
baseline, trustworthy := calculateTrimmedBaseline(samples)
|
|
if !trustworthy {
|
|
t.Error("expected trustworthy")
|
|
}
|
|
// Should use median (10) due to large diff
|
|
if baseline != 10.0 {
|
|
t.Errorf("expected baseline 10.0 (median fallback), got %f", baseline)
|
|
}
|
|
})
|
|
|
|
t.Run("24+ samples uses trimmed mean when diff <= 40%", func(t *testing.T) {
|
|
// t.Parallel()
|
|
// Sequential values with minimal outlier effect
|
|
samples := make([]float64, 24)
|
|
for i := range samples {
|
|
samples[i] = float64(i + 1) // 1,2,3,...,24
|
|
}
|
|
baseline, trustworthy := calculateTrimmedBaseline(samples)
|
|
if !trustworthy {
|
|
t.Error("expected trustworthy")
|
|
}
|
|
// Median of 1-24 is (12+13)/2 = 12.5
|
|
// Trimmed mean of 3-22 is (3+4+...+22)/20 = 250/20 = 12.5
|
|
// Both are close, should use trimmed mean
|
|
if baseline != 12.5 {
|
|
t.Errorf("expected baseline 12.5, got %f", baseline)
|
|
}
|
|
})
|
|
|
|
t.Run("odd length array uses middle element for median", func(t *testing.T) {
|
|
// t.Parallel()
|
|
// 25 samples: an odd-length array
|
|
samples := make([]float64, 25)
|
|
for i := range samples {
|
|
samples[i] = float64(i + 1) // 1,2,3,...,25
|
|
}
|
|
baseline, trustworthy := calculateTrimmedBaseline(samples)
|
|
if !trustworthy {
|
|
t.Error("expected trustworthy")
|
|
}
|
|
// Median of sorted 1-25 is the 13th element = 13
|
|
// Trimmed mean excludes top/bottom 2: 3..23 = 21 elements, sum = (3+23)*21/2 = 273, mean = 13
|
|
// Both are 13, diff is 0%, should use trimmed mean = 13
|
|
if baseline != 13.0 {
|
|
t.Errorf("expected baseline 13.0, got %f", baseline)
|
|
}
|
|
})
|
|
|
|
t.Run("trimmed mean less than median triggers diff calculation", func(t *testing.T) {
|
|
// t.Parallel()
|
|
// Create samples where trimmed mean < median but within 40%
|
|
// High outliers at top (excluded by trim), low values in middle
|
|
samples := make([]float64, 24)
|
|
// First 2 (will be trimmed): very low
|
|
samples[0], samples[1] = 1, 2
|
|
// Middle 20: mostly 50 but some variance
|
|
for i := 2; i < 22; i++ {
|
|
samples[i] = 50.0
|
|
}
|
|
// Last 2 (will be trimmed): very high
|
|
samples[22], samples[23] = 100, 200
|
|
|
|
baseline, trustworthy := calculateTrimmedBaseline(samples)
|
|
if !trustworthy {
|
|
t.Error("expected trustworthy")
|
|
}
|
|
// After sorting: 1, 2, 50x20, 100, 200
|
|
// Median of even array: (50+50)/2 = 50
|
|
// Trimmed mean: 50x20/20 = 50
|
|
// Should return 50
|
|
if baseline != 50.0 {
|
|
t.Errorf("expected baseline 50.0, got %f", baseline)
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestCreateOrUpdateNodeAlert(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("creates new alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
|
|
m.createOrUpdateNodeAlert(
|
|
"pmg1-node-queue",
|
|
pmg,
|
|
"mail-node1",
|
|
"pmg-node-queue",
|
|
AlertLevelWarning,
|
|
100,
|
|
50,
|
|
"Queue depth high",
|
|
)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-node-queue"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to be created")
|
|
}
|
|
if alert.Type != "pmg-node-queue" {
|
|
t.Errorf("expected type pmg-node-queue, got %s", alert.Type)
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("expected warning level, got %s", alert.Level)
|
|
}
|
|
if alert.Value != 100 {
|
|
t.Errorf("expected value 100, got %f", alert.Value)
|
|
}
|
|
if alert.Threshold != 50 {
|
|
t.Errorf("expected threshold 50, got %f", alert.Threshold)
|
|
}
|
|
if alert.Node != "mail-node1" {
|
|
t.Errorf("expected node mail-node1, got %s", alert.Node)
|
|
}
|
|
})
|
|
|
|
t.Run("updates existing alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
oldTime := time.Now().Add(-1 * time.Hour)
|
|
m.mu.Lock()
|
|
m.activeAlerts["pmg1-node-queue"] = &Alert{
|
|
ID: "pmg1-node-queue",
|
|
Value: 50,
|
|
Threshold: 40,
|
|
Level: AlertLevelWarning,
|
|
Message: "Old message",
|
|
LastSeen: oldTime,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1"}
|
|
m.createOrUpdateNodeAlert(
|
|
"pmg1-node-queue",
|
|
pmg,
|
|
"mail-node1",
|
|
"pmg-node-queue",
|
|
AlertLevelCritical,
|
|
200,
|
|
100,
|
|
"New message",
|
|
)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-node-queue"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to exist")
|
|
}
|
|
if alert.Value != 200 {
|
|
t.Errorf("expected value 200, got %f", alert.Value)
|
|
}
|
|
if alert.Threshold != 100 {
|
|
t.Errorf("expected threshold 100, got %f", alert.Threshold)
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
if alert.Message != "New message" {
|
|
t.Errorf("expected 'New message', got %s", alert.Message)
|
|
}
|
|
if !alert.LastSeen.After(oldTime) {
|
|
t.Error("expected LastSeen to be updated")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestCheckPMGQueueDepths(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("no thresholds configured does not create alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 1000, Deferred: 500, Hold: 300}},
|
|
},
|
|
}
|
|
|
|
// No thresholds configured (all 0)
|
|
defaults := PMGThresholdConfig{}
|
|
m.checkPMGQueueDepths(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
totalAlerts := len(m.activeAlerts)
|
|
m.mu.RUnlock()
|
|
|
|
if totalAlerts != 0 {
|
|
t.Errorf("expected no alerts when no thresholds configured, got %d", totalAlerts)
|
|
}
|
|
})
|
|
|
|
t.Run("total queue warning alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Host: "pmg-server",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 300}},
|
|
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Total: 250}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
QueueTotalWarning: 500,
|
|
QueueTotalCritical: 1000,
|
|
}
|
|
m.checkPMGQueueDepths(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-queue-total"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected warning alert to be created")
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("expected warning level, got %s", alert.Level)
|
|
}
|
|
if alert.Value != 550 {
|
|
t.Errorf("expected value 550, got %f", alert.Value)
|
|
}
|
|
})
|
|
|
|
t.Run("total queue critical alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 600}},
|
|
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Total: 500}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
QueueTotalWarning: 500,
|
|
QueueTotalCritical: 1000,
|
|
}
|
|
m.checkPMGQueueDepths(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-queue-total"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected critical alert to be created")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
if alert.Value != 1100 {
|
|
t.Errorf("expected value 1100, got %f", alert.Value)
|
|
}
|
|
})
|
|
|
|
t.Run("deferred queue warning alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Deferred: 150}},
|
|
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Deferred: 100}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
DeferredQueueWarn: 200,
|
|
DeferredQueueCritical: 500,
|
|
}
|
|
m.checkPMGQueueDepths(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-queue-deferred"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected deferred alert to be created")
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("expected warning level, got %s", alert.Level)
|
|
}
|
|
if alert.Value != 250 {
|
|
t.Errorf("expected value 250, got %f", alert.Value)
|
|
}
|
|
})
|
|
|
|
t.Run("deferred queue critical alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Deferred: 300}},
|
|
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Deferred: 250}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
DeferredQueueWarn: 200,
|
|
DeferredQueueCritical: 500,
|
|
}
|
|
m.checkPMGQueueDepths(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-queue-deferred"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected critical alert to be created")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("hold queue warning alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Hold: 75}},
|
|
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Hold: 50}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
HoldQueueWarn: 100,
|
|
HoldQueueCritical: 300,
|
|
}
|
|
m.checkPMGQueueDepths(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-queue-hold"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected hold alert to be created")
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("expected warning level, got %s", alert.Level)
|
|
}
|
|
if alert.Value != 125 {
|
|
t.Errorf("expected value 125, got %f", alert.Value)
|
|
}
|
|
})
|
|
|
|
t.Run("hold queue critical alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Hold: 200}},
|
|
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Hold: 150}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
HoldQueueWarn: 100,
|
|
HoldQueueCritical: 300,
|
|
}
|
|
m.checkPMGQueueDepths(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-queue-hold"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected critical alert to be created")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("updates existing alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-1 * time.Hour)
|
|
m.mu.Lock()
|
|
m.activeAlerts["pmg1-queue-total"] = &Alert{
|
|
ID: "pmg1-queue-total",
|
|
Value: 400,
|
|
Level: AlertLevelWarning,
|
|
LastSeen: oldTime,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 1200}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
QueueTotalWarning: 500,
|
|
QueueTotalCritical: 1000,
|
|
}
|
|
m.checkPMGQueueDepths(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-queue-total"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to exist")
|
|
}
|
|
if alert.Value != 1200 {
|
|
t.Errorf("expected value 1200, got %f", alert.Value)
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
if !alert.LastSeen.After(oldTime) {
|
|
t.Error("expected LastSeen to be updated")
|
|
}
|
|
})
|
|
|
|
t.Run("below threshold clears alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["pmg1-queue-total"] = &Alert{ID: "pmg1-queue-total"}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 100}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
QueueTotalWarning: 500,
|
|
QueueTotalCritical: 1000,
|
|
}
|
|
m.checkPMGQueueDepths(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["pmg1-queue-total"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected alert to be cleared when below threshold")
|
|
}
|
|
})
|
|
|
|
t.Run("nil QueueStatus is handled", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: nil},
|
|
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Total: 100}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
QueueTotalWarning: 500,
|
|
}
|
|
// Should not panic
|
|
m.checkPMGQueueDepths(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["pmg1-queue-total"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected no alert with total below threshold")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestCheckPMGOldestMessage(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("no thresholds configured returns early", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 7200}}, // 2 hours
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{} // No thresholds
|
|
m.checkPMGOldestMessage(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["pmg1-oldest-message"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected no alert when no thresholds configured")
|
|
}
|
|
})
|
|
|
|
t.Run("no messages clears existing alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["pmg1-oldest-message"] = &Alert{ID: "pmg1-oldest-message"}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 0}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
OldestMessageWarnMins: 30,
|
|
OldestMessageCritMins: 60,
|
|
}
|
|
m.checkPMGOldestMessage(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["pmg1-oldest-message"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected alert to be cleared when no messages in queue")
|
|
}
|
|
})
|
|
|
|
t.Run("warning alert when message age exceeds warning threshold", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Host: "pmg-server",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 2400}}, // 40 minutes
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
OldestMessageWarnMins: 30,
|
|
OldestMessageCritMins: 60,
|
|
}
|
|
m.checkPMGOldestMessage(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-oldest-message"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected warning alert to be created")
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("expected warning level, got %s", alert.Level)
|
|
}
|
|
if alert.Value != 40 {
|
|
t.Errorf("expected value 40 minutes, got %f", alert.Value)
|
|
}
|
|
if alert.Threshold != 30 {
|
|
t.Errorf("expected threshold 30, got %f", alert.Threshold)
|
|
}
|
|
})
|
|
|
|
t.Run("critical alert when message age exceeds critical threshold", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 4200}}, // 70 minutes
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
OldestMessageWarnMins: 30,
|
|
OldestMessageCritMins: 60,
|
|
}
|
|
m.checkPMGOldestMessage(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-oldest-message"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected critical alert to be created")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
if alert.Threshold != 60 {
|
|
t.Errorf("expected threshold 60, got %f", alert.Threshold)
|
|
}
|
|
})
|
|
|
|
t.Run("below threshold clears alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["pmg1-oldest-message"] = &Alert{ID: "pmg1-oldest-message"}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 900}}, // 15 minutes
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
OldestMessageWarnMins: 30,
|
|
OldestMessageCritMins: 60,
|
|
}
|
|
m.checkPMGOldestMessage(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["pmg1-oldest-message"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected alert to be cleared when below threshold")
|
|
}
|
|
})
|
|
|
|
t.Run("finds oldest across multiple nodes", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 1200}}, // 20 minutes
|
|
{Name: "node2", QueueStatus: &models.PMGQueueStatus{OldestAge: 3000}}, // 50 minutes (oldest)
|
|
{Name: "node3", QueueStatus: &models.PMGQueueStatus{OldestAge: 600}}, // 10 minutes
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
OldestMessageWarnMins: 30,
|
|
OldestMessageCritMins: 60,
|
|
}
|
|
m.checkPMGOldestMessage(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-oldest-message"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to be created")
|
|
}
|
|
if alert.Value != 50 {
|
|
t.Errorf("expected value 50 (oldest across nodes), got %f", alert.Value)
|
|
}
|
|
})
|
|
|
|
t.Run("updates existing alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-1 * time.Hour)
|
|
m.mu.Lock()
|
|
m.activeAlerts["pmg1-oldest-message"] = &Alert{
|
|
ID: "pmg1-oldest-message",
|
|
Value: 40,
|
|
Level: AlertLevelWarning,
|
|
LastSeen: oldTime,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 4800}}, // 80 minutes
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
OldestMessageWarnMins: 30,
|
|
OldestMessageCritMins: 60,
|
|
}
|
|
m.checkPMGOldestMessage(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-oldest-message"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to exist")
|
|
}
|
|
if alert.Value != 80 {
|
|
t.Errorf("expected value 80, got %f", alert.Value)
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
if !alert.LastSeen.After(oldTime) {
|
|
t.Error("expected LastSeen to be updated")
|
|
}
|
|
})
|
|
|
|
t.Run("nil QueueStatus is handled", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: nil},
|
|
{Name: "node2", QueueStatus: &models.PMGQueueStatus{OldestAge: 2400}}, // 40 minutes
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
OldestMessageWarnMins: 30,
|
|
}
|
|
// Should not panic and should use the valid node's data
|
|
m.checkPMGOldestMessage(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-oldest-message"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to be created from valid node data")
|
|
}
|
|
if alert.Value != 40 {
|
|
t.Errorf("expected value 40, got %f", alert.Value)
|
|
}
|
|
})
|
|
}
|
|
|
|
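// checkStorageOffline, per the cases below, requires two consecutive offline
// polls before raising a warning-level "offline" alert, and it clears (and
// never creates) the alert when a per-storage Disabled override is set.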
func TestCheckStorageOffline(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("first poll increments confirmation but does not create alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "local-lvm",
|
|
Name: "Local LVM",
|
|
Node: "pve-node1",
|
|
}
|
|
|
|
m.checkStorageOffline(storage)
|
|
|
|
m.mu.RLock()
|
|
confirmCount := m.offlineConfirmations["local-lvm"]
|
|
_, alertExists := m.activeAlerts["storage-offline-local-lvm"]
|
|
m.mu.RUnlock()
|
|
|
|
if confirmCount != 1 {
|
|
t.Errorf("expected confirmation count 1, got %d", confirmCount)
|
|
}
|
|
if alertExists {
|
|
t.Error("expected no alert on first poll")
|
|
}
|
|
})
|
|
|
|
t.Run("second poll creates alert after confirmation", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "local-lvm",
|
|
Name: "Local LVM",
|
|
Node: "pve-node1",
|
|
Instance: "pve-instance",
|
|
}
|
|
|
|
// First poll - confirmation
|
|
m.checkStorageOffline(storage)
|
|
// Second poll - should create alert
|
|
m.checkStorageOffline(storage)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["storage-offline-local-lvm"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to be created after second poll")
|
|
}
|
|
if alert.Type != "offline" {
|
|
t.Errorf("expected type 'offline', got %s", alert.Type)
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("expected warning level, got %s", alert.Level)
|
|
}
|
|
if alert.ResourceID != "local-lvm" {
|
|
t.Errorf("expected resource ID 'local-lvm', got %s", alert.ResourceID)
|
|
}
|
|
if alert.Node != "pve-node1" {
|
|
t.Errorf("expected node 'pve-node1', got %s", alert.Node)
|
|
}
|
|
})
|
|
|
|
t.Run("existing alert updates LastSeen", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-1 * time.Hour)
|
|
m.mu.Lock()
|
|
m.offlineConfirmations["local-lvm"] = 5 // Already confirmed
|
|
m.activeAlerts["storage-offline-local-lvm"] = &Alert{
|
|
ID: "storage-offline-local-lvm",
|
|
LastSeen: oldTime,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
storage := models.Storage{
|
|
ID: "local-lvm",
|
|
Name: "Local LVM",
|
|
Node: "pve-node1",
|
|
}
|
|
|
|
m.checkStorageOffline(storage)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["storage-offline-local-lvm"]
|
|
m.mu.RUnlock()
|
|
|
|
if !alert.LastSeen.After(oldTime) {
|
|
t.Error("expected LastSeen to be updated")
|
|
}
|
|
})
|
|
|
|
t.Run("disabled storage clears existing alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Pre-create an alert
|
|
m.mu.Lock()
|
|
m.activeAlerts["storage-offline-local-lvm"] = &Alert{ID: "storage-offline-local-lvm"}
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"local-lvm": {Disabled: true},
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
storage := models.Storage{
|
|
ID: "local-lvm",
|
|
Name: "Local LVM",
|
|
Node: "pve-node1",
|
|
}
|
|
|
|
m.checkStorageOffline(storage)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["storage-offline-local-lvm"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected alert to be cleared when storage is disabled")
|
|
}
|
|
})
|
|
|
|
t.Run("disabled storage does not create alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"local-lvm": {Disabled: true},
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
storage := models.Storage{
|
|
ID: "local-lvm",
|
|
Name: "Local LVM",
|
|
Node: "pve-node1",
|
|
}
|
|
|
|
// Multiple polls should not create alert
|
|
m.checkStorageOffline(storage)
|
|
m.checkStorageOffline(storage)
|
|
m.checkStorageOffline(storage)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["storage-offline-local-lvm"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected no alert when storage is disabled")
|
|
}
|
|
})
|
|
}
|
|
|
|
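// checkGuestPoweredOff, per the cases below, also requires two consecutive
// confirmations before raising a "powered-off" alert (warning by default,
// overridable via PoweredOffSeverity), records a monitorOnly metadata flag,
// and clears any alert when Disabled or DisableConnectivity overrides apply.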
func TestCheckGuestPoweredOff(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("first poll increments confirmation but does not create alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)
|
|
|
|
m.mu.RLock()
|
|
confirmCount := m.offlineConfirmations["vm100"]
|
|
_, alertExists := m.activeAlerts["guest-powered-off-vm100"]
|
|
m.mu.RUnlock()
|
|
|
|
if confirmCount != 1 {
|
|
t.Errorf("expected confirmation count 1, got %d", confirmCount)
|
|
}
|
|
if alertExists {
|
|
t.Error("expected no alert on first poll")
|
|
}
|
|
})
|
|
|
|
t.Run("second poll creates alert after confirmation", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// First poll - confirmation
|
|
m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)
|
|
// Second poll - should create alert
|
|
m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["guest-powered-off-vm100"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to be created after second poll")
|
|
}
|
|
if alert.Type != "powered-off" {
|
|
t.Errorf("expected type 'powered-off', got %s", alert.Type)
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("expected warning level (default severity), got %s", alert.Level)
|
|
}
|
|
if alert.ResourceID != "vm100" {
|
|
t.Errorf("expected resource ID 'vm100', got %s", alert.ResourceID)
|
|
}
|
|
})
|
|
|
|
t.Run("existing alert updates LastSeen and level", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-1 * time.Hour)
|
|
m.mu.Lock()
|
|
m.activeAlerts["guest-powered-off-vm100"] = &Alert{
|
|
ID: "guest-powered-off-vm100",
|
|
LastSeen: oldTime,
|
|
Level: AlertLevelWarning,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["guest-powered-off-vm100"]
|
|
m.mu.RUnlock()
|
|
|
|
if !alert.LastSeen.After(oldTime) {
|
|
t.Error("expected LastSeen to be updated")
|
|
}
|
|
})
|
|
|
|
t.Run("monitorOnly flag is set in metadata", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// First poll
|
|
m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", true)
|
|
// Second poll - creates alert
|
|
m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", true)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["guest-powered-off-vm100"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to be created")
|
|
}
|
|
if alert.Metadata == nil {
|
|
t.Fatal("expected metadata to be set")
|
|
}
|
|
if monitorOnly, ok := alert.Metadata["monitorOnly"].(bool); !ok || !monitorOnly {
|
|
t.Error("expected monitorOnly to be true")
|
|
}
|
|
})
|
|
|
|
t.Run("disabled guest clears existing alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Pre-create an alert and confirmation count
|
|
m.mu.Lock()
|
|
m.activeAlerts["guest-powered-off-vm100"] = &Alert{ID: "guest-powered-off-vm100"}
|
|
m.offlineConfirmations["vm100"] = 5
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"vm100": {Disabled: true},
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)
|
|
|
|
m.mu.RLock()
|
|
_, alertExists := m.activeAlerts["guest-powered-off-vm100"]
|
|
_, confirmExists := m.offlineConfirmations["vm100"]
|
|
m.mu.RUnlock()
|
|
|
|
if alertExists {
|
|
t.Error("expected alert to be cleared when guest is disabled")
|
|
}
|
|
if confirmExists {
|
|
t.Error("expected confirmation count to be cleared")
|
|
}
|
|
})
|
|
|
|
t.Run("disableConnectivity clears existing alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Pre-create an alert
|
|
m.mu.Lock()
|
|
m.activeAlerts["guest-powered-off-vm100"] = &Alert{ID: "guest-powered-off-vm100"}
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"vm100": {DisableConnectivity: true},
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["guest-powered-off-vm100"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected alert to be cleared when connectivity is disabled")
|
|
}
|
|
})
|
|
|
|
t.Run("uses override severity when configured", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.config.Overrides = map[string]ThresholdConfig{
|
|
"vm100": {PoweredOffSeverity: AlertLevelCritical},
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
// First poll
|
|
m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)
|
|
// Second poll
|
|
m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["guest-powered-off-vm100"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to be created")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level from override, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("uses default severity when no override", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.config.GuestDefaults.PoweredOffSeverity = AlertLevelCritical
|
|
m.mu.Unlock()
|
|
|
|
// First poll
|
|
m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)
|
|
// Second poll
|
|
m.checkGuestPoweredOff("vm100", "TestVM", "pve-node1", "pve-instance", "VM", false)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["guest-powered-off-vm100"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to be created")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level from defaults, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("container type in message", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// First poll
|
|
m.checkGuestPoweredOff("ct200", "TestContainer", "pve-node1", "pve-instance", "Container", false)
|
|
// Second poll
|
|
m.checkGuestPoweredOff("ct200", "TestContainer", "pve-node1", "pve-instance", "Container", false)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["guest-powered-off-ct200"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to be created")
|
|
}
|
|
if !strings.Contains(alert.Message, "Container") {
|
|
t.Errorf("expected message to contain 'Container', got %s", alert.Message)
|
|
}
|
|
if !strings.Contains(alert.Message, "TestContainer") {
|
|
t.Errorf("expected message to contain 'TestContainer', got %s", alert.Message)
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestCleanup(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("auto-acknowledges old alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-3 * time.Hour)
|
|
m.mu.Lock()
|
|
m.config.AutoAcknowledgeAfterHours = 2
|
|
m.activeAlerts["old-alert"] = &Alert{
|
|
ID: "old-alert",
|
|
StartTime: oldTime,
|
|
Acknowledged: false,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["old-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to exist")
|
|
}
|
|
if !alert.Acknowledged {
|
|
t.Error("expected alert to be auto-acknowledged")
|
|
}
|
|
if alert.AckUser != "system-auto" {
|
|
t.Errorf("expected AckUser 'system-auto', got %s", alert.AckUser)
|
|
}
|
|
})
|
|
|
|
t.Run("removes old acknowledged alerts by TTL", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldAckTime := time.Now().Add(-10 * 24 * time.Hour) // 10 days ago
|
|
m.mu.Lock()
|
|
m.config.MaxAcknowledgedAgeDays = 7
|
|
m.activeAlerts["ack-alert"] = &Alert{
|
|
ID: "ack-alert",
|
|
Acknowledged: true,
|
|
AckTime: &oldAckTime,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["ack-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected acknowledged alert to be removed by TTL")
|
|
}
|
|
})
|
|
|
|
t.Run("removes old unacknowledged alerts by TTL", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-40 * 24 * time.Hour) // 40 days ago
|
|
m.mu.Lock()
|
|
m.config.MaxAlertAgeDays = 30
|
|
m.config.AutoAcknowledgeAfterHours = 0 // Disable auto-acknowledge to test TTL
|
|
m.activeAlerts["old-unack-alert"] = &Alert{
|
|
ID: "old-unack-alert",
|
|
StartTime: oldTime,
|
|
Acknowledged: false,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["old-unack-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected old unacknowledged alert to be removed by TTL")
|
|
}
|
|
})
|
|
|
|
t.Run("removes acknowledged alerts by maxAge fallback", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldAckTime := time.Now().Add(-2 * time.Hour)
|
|
m.mu.Lock()
|
|
m.activeAlerts["ack-fallback"] = &Alert{
|
|
ID: "ack-fallback",
|
|
Acknowledged: true,
|
|
AckTime: &oldAckTime,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["ack-fallback"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected acknowledged alert to be removed by maxAge fallback")
|
|
}
|
|
})
|
|
|
|
t.Run("keeps acknowledged alerts that are still active", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldAckTime := time.Now().Add(-2 * time.Hour)
|
|
recentSeen := time.Now().Add(-5 * time.Minute)
|
|
m.mu.Lock()
|
|
m.activeAlerts["ack-active"] = &Alert{
|
|
ID: "ack-active",
|
|
Acknowledged: true,
|
|
AckTime: &oldAckTime,
|
|
LastSeen: recentSeen,
|
|
StartTime: recentSeen,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["ack-active"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Error("expected acknowledged active alert to remain")
|
|
}
|
|
})
|
|
|
|
t.Run("cleans up old recent alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-10 * time.Minute)
|
|
m.mu.Lock()
|
|
m.recentAlerts["recent-old"] = &Alert{
|
|
ID: "recent-old",
|
|
StartTime: oldTime,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.recentAlerts["recent-old"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected old recent alert to be cleaned up")
|
|
}
|
|
})
|
|
|
|
t.Run("cleans up expired suppressions", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.suppressedUntil["suppressed-alert"] = time.Now().Add(-1 * time.Hour)
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.suppressedUntil["suppressed-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected expired suppression to be cleaned up")
|
|
}
|
|
})
|
|
|
|
t.Run("cleans up old rate limit entries", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.alertRateLimit["rate-limited"] = []time.Time{
|
|
time.Now().Add(-2 * time.Hour), // Old, should be removed
|
|
time.Now().Add(-30 * time.Minute), // Recent, should remain
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
times := m.alertRateLimit["rate-limited"]
|
|
m.mu.RUnlock()
|
|
|
|
if len(times) != 1 {
|
|
t.Errorf("expected 1 recent time, got %d", len(times))
|
|
}
|
|
})
|
|
|
|
t.Run("removes empty rate limit entries", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.alertRateLimit["all-old"] = []time.Time{
|
|
time.Now().Add(-2 * time.Hour),
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.alertRateLimit["all-old"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected empty rate limit entry to be removed")
|
|
}
|
|
})
|
|
|
|
t.Run("cleans up old recently resolved alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.resolvedMutex.Lock()
|
|
m.recentlyResolved["old-resolved"] = &ResolvedAlert{
|
|
Alert: &Alert{ID: "old-resolved"},
|
|
ResolvedTime: time.Now().Add(-10 * time.Minute),
|
|
}
|
|
m.resolvedMutex.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.resolvedMutex.Lock()
|
|
_, exists := m.recentlyResolved["old-resolved"]
|
|
m.resolvedMutex.Unlock()
|
|
|
|
if exists {
|
|
t.Error("expected old recently resolved alert to be cleaned up")
|
|
}
|
|
})
|
|
|
|
t.Run("cleans up stale pending alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.pendingAlerts["stale-pending"] = time.Now().Add(-15 * time.Minute)
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.pendingAlerts["stale-pending"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected stale pending alert to be cleaned up")
|
|
}
|
|
})
|
|
|
|
t.Run("cleans up flapping history for inactive alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.flappingHistory["inactive-alert"] = []time.Time{
|
|
time.Now().Add(-30 * time.Minute),
|
|
}
|
|
m.flappingActive["inactive-alert"] = true
|
|
// No active alert, no suppression
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
_, historyExists := m.flappingHistory["inactive-alert"]
|
|
_, activeExists := m.flappingActive["inactive-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if historyExists {
|
|
t.Error("expected flapping history to be cleaned up")
|
|
}
|
|
if activeExists {
|
|
t.Error("expected flapping active flag to be cleaned up")
|
|
}
|
|
})
|
|
|
|
t.Run("cleans up stale Docker restart tracking", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.dockerRestartTracking["stale-container"] = &dockerRestartRecord{
|
|
lastChecked: time.Now().Add(-25 * time.Hour),
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.dockerRestartTracking["stale-container"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected stale Docker restart tracking to be cleaned up")
|
|
}
|
|
})
|
|
|
|
t.Run("cleans up stale PMG anomaly trackers", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.pmgAnomalyTrackers["stale-pmg"] = &pmgAnomalyTracker{
|
|
LastSampleTime: time.Now().Add(-25 * time.Hour),
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.pmgAnomalyTrackers["stale-pmg"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected stale PMG anomaly tracker to be cleaned up")
|
|
}
|
|
})
|
|
|
|
t.Run("cleans up empty PMG quarantine history", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.pmgQuarantineHistory["empty-pmg"] = []pmgQuarantineSnapshot{}
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.pmgQuarantineHistory["empty-pmg"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected empty PMG quarantine history to be cleaned up")
|
|
}
|
|
})
|
|
|
|
t.Run("cleans up stale PMG quarantine history", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.pmgQuarantineHistory["stale-pmg"] = []pmgQuarantineSnapshot{
|
|
{Timestamp: time.Now().Add(-8 * 24 * time.Hour)},
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.Cleanup(1 * time.Hour)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.pmgQuarantineHistory["stale-pmg"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected stale PMG quarantine history to be cleaned up")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestConvertLegacyThreshold(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("nil input returns nil", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
result := m.convertLegacyThreshold(nil)
|
|
|
|
if result != nil {
|
|
t.Error("expected nil result for nil input")
|
|
}
|
|
})
|
|
|
|
t.Run("zero value returns nil", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
zero := 0.0
|
|
result := m.convertLegacyThreshold(&zero)
|
|
|
|
if result != nil {
|
|
t.Error("expected nil result for zero value")
|
|
}
|
|
})
|
|
|
|
t.Run("negative value returns nil", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
neg := -5.0
|
|
result := m.convertLegacyThreshold(&neg)
|
|
|
|
if result != nil {
|
|
t.Error("expected nil result for negative value")
|
|
}
|
|
})
|
|
|
|
t.Run("positive value with default margin", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
threshold := 80.0
|
|
result := m.convertLegacyThreshold(&threshold)
|
|
|
|
if result == nil {
|
|
t.Fatal("expected non-nil result")
|
|
}
|
|
if result.Trigger != 80.0 {
|
|
t.Errorf("expected trigger 80.0, got %f", result.Trigger)
|
|
}
|
|
if result.Clear != 75.0 { // 80 - 5 (default margin)
|
|
t.Errorf("expected clear 75.0, got %f", result.Clear)
|
|
}
|
|
})
|
|
|
|
t.Run("positive value with custom margin", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.config.HysteresisMargin = 10.0
|
|
m.mu.Unlock()
|
|
|
|
threshold := 80.0
|
|
result := m.convertLegacyThreshold(&threshold)
|
|
|
|
if result == nil {
|
|
t.Fatal("expected non-nil result")
|
|
}
|
|
if result.Trigger != 80.0 {
|
|
t.Errorf("expected trigger 80.0, got %f", result.Trigger)
|
|
}
|
|
if result.Clear != 70.0 { // 80 - 10 (custom margin)
|
|
t.Errorf("expected clear 70.0, got %f", result.Clear)
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestCheckEscalations(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("does nothing when escalation is disabled", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-2 * time.Hour)
|
|
m.mu.Lock()
|
|
m.config.ActivationState = ActivationActive
|
|
m.config.Schedule.Escalation.Enabled = false
|
|
m.config.Schedule.Escalation.Levels = []EscalationLevel{
|
|
{After: 30, Notify: "email"},
|
|
}
|
|
m.activeAlerts["test-alert"] = &Alert{
|
|
ID: "test-alert",
|
|
StartTime: oldTime,
|
|
LastEscalation: 0,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.checkEscalations()
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["test-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert.LastEscalation != 0 {
|
|
t.Errorf("expected no escalation when disabled, got %d", alert.LastEscalation)
|
|
}
|
|
})
|
|
|
|
t.Run("does nothing when alerts are globally disabled", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-2 * time.Hour)
|
|
m.mu.Lock()
|
|
m.config.Enabled = false
|
|
m.config.ActivationState = ActivationActive
|
|
m.config.Schedule.Escalation.Enabled = true
|
|
m.config.Schedule.Escalation.Levels = []EscalationLevel{
|
|
{After: 30, Notify: "email"},
|
|
}
|
|
m.activeAlerts["global-disabled-alert"] = &Alert{
|
|
ID: "global-disabled-alert",
|
|
StartTime: oldTime,
|
|
LastEscalation: 0,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.checkEscalations()
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["global-disabled-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert.LastEscalation != 0 {
|
|
t.Errorf("expected no escalation when alerts are globally disabled, got %d", alert.LastEscalation)
|
|
}
|
|
})
|
|
|
|
t.Run("does nothing when activation state is pending", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-2 * time.Hour)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.config.ActivationState = ActivationPending
|
|
m.config.Schedule.Escalation.Enabled = true
|
|
m.config.Schedule.Escalation.Levels = []EscalationLevel{
|
|
{After: 30, Notify: "email"},
|
|
}
|
|
m.activeAlerts["pending-alert"] = &Alert{
|
|
ID: "pending-alert",
|
|
StartTime: oldTime,
|
|
LastEscalation: 0,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.checkEscalations()
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pending-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert.LastEscalation != 0 {
|
|
t.Errorf("expected no escalation when activation is pending, got %d", alert.LastEscalation)
|
|
}
|
|
})
|
|
|
|
t.Run("does nothing when activation state is snoozed", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-2 * time.Hour)
|
|
m.mu.Lock()
|
|
m.config.Enabled = true
|
|
m.config.ActivationState = ActivationSnoozed
|
|
m.config.Schedule.Escalation.Enabled = true
|
|
m.config.Schedule.Escalation.Levels = []EscalationLevel{
|
|
{After: 30, Notify: "email"},
|
|
}
|
|
m.activeAlerts["snoozed-alert"] = &Alert{
|
|
ID: "snoozed-alert",
|
|
StartTime: oldTime,
|
|
LastEscalation: 0,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.checkEscalations()
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["snoozed-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert.LastEscalation != 0 {
|
|
t.Errorf("expected no escalation when activation is snoozed, got %d", alert.LastEscalation)
|
|
}
|
|
})
|
|
|
|
t.Run("skips acknowledged alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-2 * time.Hour)
|
|
m.mu.Lock()
|
|
m.config.ActivationState = ActivationActive
|
|
m.config.Schedule.Escalation.Enabled = true
|
|
m.config.Schedule.Escalation.Levels = []EscalationLevel{
|
|
{After: 30, Notify: "email"},
|
|
}
|
|
m.activeAlerts["ack-alert"] = &Alert{
|
|
ID: "ack-alert",
|
|
StartTime: oldTime,
|
|
LastEscalation: 0,
|
|
Acknowledged: true,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.checkEscalations()
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["ack-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert.LastEscalation != 0 {
|
|
t.Error("expected no escalation for acknowledged alert")
|
|
}
|
|
})
|
|
|
|
t.Run("escalates alert after threshold time", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-45 * time.Minute) // 45 minutes ago
|
|
m.mu.Lock()
|
|
m.config.ActivationState = ActivationActive
|
|
m.config.Schedule.Escalation.Enabled = true
|
|
m.config.Schedule.Escalation.Levels = []EscalationLevel{
|
|
{After: 30, Notify: "email"}, // 30 minutes
|
|
{After: 60, Notify: "webhook"}, // 60 minutes
|
|
}
|
|
m.activeAlerts["escalate-alert"] = &Alert{
|
|
ID: "escalate-alert",
|
|
StartTime: oldTime,
|
|
LastEscalation: 0,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.checkEscalations()
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["escalate-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert.LastEscalation != 1 {
|
|
t.Errorf("expected escalation to level 1, got %d", alert.LastEscalation)
|
|
}
|
|
if len(alert.EscalationTimes) != 1 {
|
|
t.Errorf("expected 1 escalation time, got %d", len(alert.EscalationTimes))
|
|
}
|
|
})
|
|
|
|
t.Run("escalates to multiple levels", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-90 * time.Minute) // 90 minutes ago
|
|
m.mu.Lock()
|
|
m.config.ActivationState = ActivationActive
|
|
m.config.Schedule.Escalation.Enabled = true
|
|
m.config.Schedule.Escalation.Levels = []EscalationLevel{
|
|
{After: 30, Notify: "email"}, // 30 minutes
|
|
{After: 60, Notify: "webhook"}, // 60 minutes
|
|
}
|
|
m.activeAlerts["multi-escalate"] = &Alert{
|
|
ID: "multi-escalate",
|
|
StartTime: oldTime,
|
|
LastEscalation: 0,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.checkEscalations()
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["multi-escalate"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert.LastEscalation != 2 {
|
|
t.Errorf("expected escalation to level 2, got %d", alert.LastEscalation)
|
|
}
|
|
if len(alert.EscalationTimes) != 2 {
|
|
t.Errorf("expected 2 escalation times, got %d", len(alert.EscalationTimes))
|
|
}
|
|
})
|
|
|
|
t.Run("does not re-escalate already escalated level", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-45 * time.Minute)
|
|
m.mu.Lock()
|
|
m.config.ActivationState = ActivationActive
|
|
m.config.Schedule.Escalation.Enabled = true
|
|
m.config.Schedule.Escalation.Levels = []EscalationLevel{
|
|
{After: 30, Notify: "email"},
|
|
}
|
|
m.activeAlerts["already-escalated"] = &Alert{
|
|
ID: "already-escalated",
|
|
StartTime: oldTime,
|
|
LastEscalation: 1,
|
|
EscalationTimes: []time.Time{time.Now().Add(-10 * time.Minute)},
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.checkEscalations()
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["already-escalated"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert.LastEscalation != 1 {
|
|
t.Errorf("expected escalation to remain at 1, got %d", alert.LastEscalation)
|
|
}
|
|
if len(alert.EscalationTimes) != 1 {
|
|
t.Errorf("expected 1 escalation time (unchanged), got %d", len(alert.EscalationTimes))
|
|
}
|
|
})
|
|
|
|
t.Run("does not escalate before threshold time", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
recentTime := time.Now().Add(-10 * time.Minute) // Only 10 minutes ago
|
|
m.mu.Lock()
|
|
m.config.ActivationState = ActivationActive
|
|
m.config.Schedule.Escalation.Enabled = true
|
|
m.config.Schedule.Escalation.Levels = []EscalationLevel{
|
|
{After: 30, Notify: "email"}, // 30 minutes threshold
|
|
}
|
|
m.activeAlerts["recent-alert"] = &Alert{
|
|
ID: "recent-alert",
|
|
StartTime: recentTime,
|
|
LastEscalation: 0,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.checkEscalations()
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["recent-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert.LastEscalation != 0 {
|
|
t.Error("expected no escalation for recent alert")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestCleanupAlertsForNodes(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("removes alerts for non-existent nodes", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["alert-old-node"] = &Alert{
|
|
ID: "alert-old-node",
|
|
Node: "old-node",
|
|
}
|
|
m.activeAlerts["alert-valid-node"] = &Alert{
|
|
ID: "alert-valid-node",
|
|
Node: "valid-node",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
existingNodes := map[string]bool{
|
|
"valid-node": true,
|
|
}
|
|
|
|
m.CleanupAlertsForNodes(existingNodes)
|
|
|
|
// Give async save goroutine time to complete
|
|
time.Sleep(50 * time.Millisecond)
|
|
|
|
m.mu.RLock()
|
|
_, oldExists := m.activeAlerts["alert-old-node"]
|
|
_, validExists := m.activeAlerts["alert-valid-node"]
|
|
m.mu.RUnlock()
|
|
|
|
if oldExists {
|
|
t.Error("expected alert for old node to be removed")
|
|
}
|
|
if !validExists {
|
|
t.Error("expected alert for valid node to remain")
|
|
}
|
|
})
|
|
|
|
t.Run("skips Docker alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["docker-container-state"] = &Alert{
|
|
ID: "docker-container-state",
|
|
ResourceID: "docker:host1:container1",
|
|
Node: "non-existent-node",
|
|
}
|
|
m.activeAlerts["alert-with-docker-resource"] = &Alert{
|
|
ID: "alert-with-docker-resource",
|
|
ResourceID: "docker:host2:container2",
|
|
Node: "non-existent-node",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
existingNodes := map[string]bool{}
|
|
|
|
m.CleanupAlertsForNodes(existingNodes)
|
|
|
|
m.mu.RLock()
|
|
_, dockerExists := m.activeAlerts["docker-container-state"]
|
|
_, dockerResourceExists := m.activeAlerts["alert-with-docker-resource"]
|
|
m.mu.RUnlock()
|
|
|
|
if !dockerExists {
|
|
t.Error("expected docker alert to be preserved")
|
|
}
|
|
if !dockerResourceExists {
|
|
t.Error("expected alert with docker resource to be preserved")
|
|
}
|
|
})
|
|
|
|
t.Run("skips PBS alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["pbs-offline-test"] = &Alert{
|
|
ID: "pbs-offline-test",
|
|
Node: "non-existent-node",
|
|
}
|
|
m.activeAlerts["pbs-backup-alert"] = &Alert{
|
|
ID: "pbs-backup-alert",
|
|
Type: "pbs-offline",
|
|
Node: "non-existent-node",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
existingNodes := map[string]bool{}
|
|
|
|
m.CleanupAlertsForNodes(existingNodes)
|
|
|
|
m.mu.RLock()
|
|
_, pbsExists := m.activeAlerts["pbs-offline-test"]
|
|
_, pbsTypeExists := m.activeAlerts["pbs-backup-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if !pbsExists {
|
|
t.Error("expected pbs-prefixed alert to be preserved")
|
|
}
|
|
if !pbsTypeExists {
|
|
t.Error("expected pbs-offline type alert to be preserved")
|
|
}
|
|
})
|
|
|
|
t.Run("removes alerts with empty node", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["empty-node-alert"] = &Alert{
|
|
ID: "empty-node-alert",
|
|
Node: "",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
existingNodes := map[string]bool{
|
|
"valid-node": true,
|
|
}
|
|
|
|
m.CleanupAlertsForNodes(existingNodes)
|
|
|
|
time.Sleep(50 * time.Millisecond)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["empty-node-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected alert with empty node to be removed")
|
|
}
|
|
})
|
|
|
|
t.Run("handles nil alert in map", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["nil-alert"] = nil
|
|
m.activeAlerts["valid-alert"] = &Alert{
|
|
ID: "valid-alert",
|
|
Node: "valid-node",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
existingNodes := map[string]bool{
|
|
"valid-node": true,
|
|
}
|
|
|
|
// Should not panic
|
|
m.CleanupAlertsForNodes(existingNodes)
|
|
|
|
m.mu.RLock()
|
|
_, validExists := m.activeAlerts["valid-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if !validExists {
|
|
t.Error("expected valid alert to remain")
|
|
}
|
|
})
|
|
|
|
t.Run("no cleanup needed logs correctly", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["valid-alert"] = &Alert{
|
|
ID: "valid-alert",
|
|
Node: "valid-node",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
existingNodes := map[string]bool{
|
|
"valid-node": true,
|
|
}
|
|
|
|
// Should not panic and should not remove any alerts
|
|
m.CleanupAlertsForNodes(existingNodes)
|
|
|
|
m.mu.RLock()
|
|
count := len(m.activeAlerts)
|
|
m.mu.RUnlock()
|
|
|
|
if count != 1 {
|
|
t.Errorf("expected 1 alert, got %d", count)
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestCheckZFSPoolHealth(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("nil ZFSPool returns early", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "local-zfs",
|
|
Name: "Local ZFS",
|
|
Node: "pve-node1",
|
|
ZFSPool: nil,
|
|
}
|
|
|
|
// Should not panic
|
|
m.checkZFSPoolHealth(storage)
|
|
|
|
m.mu.RLock()
|
|
count := len(m.activeAlerts)
|
|
m.mu.RUnlock()
|
|
|
|
if count != 0 {
|
|
t.Errorf("expected no alerts for nil pool, got %d", count)
|
|
}
|
|
})
|
|
|
|
t.Run("ONLINE pool does not create state alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "local-zfs",
|
|
Name: "Local ZFS",
|
|
Node: "pve-node1",
|
|
ZFSPool: &models.ZFSPool{
|
|
Name: "rpool",
|
|
State: "ONLINE",
|
|
},
|
|
}
|
|
|
|
m.checkZFSPoolHealth(storage)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["zfs-pool-state-local-zfs"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected no state alert for ONLINE pool")
|
|
}
|
|
})
|
|
|
|
t.Run("DEGRADED pool creates warning alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "local-zfs",
|
|
Name: "Local ZFS",
|
|
Node: "pve-node1",
|
|
Instance: "pve-instance",
|
|
ZFSPool: &models.ZFSPool{
|
|
Name: "rpool",
|
|
State: "DEGRADED",
|
|
},
|
|
}
|
|
|
|
m.checkZFSPoolHealth(storage)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["zfs-pool-state-local-zfs"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected state alert for DEGRADED pool")
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("expected warning level, got %s", alert.Level)
|
|
}
|
|
if alert.Type != "zfs-pool-state" {
|
|
t.Errorf("expected type 'zfs-pool-state', got %s", alert.Type)
|
|
}
|
|
})
|
|
|
|
t.Run("FAULTED pool creates critical alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "local-zfs",
|
|
Name: "Local ZFS",
|
|
Node: "pve-node1",
|
|
ZFSPool: &models.ZFSPool{
|
|
Name: "rpool",
|
|
State: "FAULTED",
|
|
},
|
|
}
|
|
|
|
m.checkZFSPoolHealth(storage)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["zfs-pool-state-local-zfs"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected state alert for FAULTED pool")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("UNAVAIL pool creates critical alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "local-zfs",
|
|
Name: "Local ZFS",
|
|
Node: "pve-node1",
|
|
ZFSPool: &models.ZFSPool{
|
|
Name: "rpool",
|
|
State: "UNAVAIL",
|
|
},
|
|
}
|
|
|
|
m.checkZFSPoolHealth(storage)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["zfs-pool-state-local-zfs"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected state alert for UNAVAIL pool")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("pool coming back ONLINE clears state alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Pre-create a state alert
|
|
m.mu.Lock()
|
|
m.activeAlerts["zfs-pool-state-local-zfs"] = &Alert{
|
|
ID: "zfs-pool-state-local-zfs",
|
|
Level: AlertLevelWarning,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
storage := models.Storage{
|
|
ID: "local-zfs",
|
|
Name: "Local ZFS",
|
|
Node: "pve-node1",
|
|
ZFSPool: &models.ZFSPool{
|
|
Name: "rpool",
|
|
State: "ONLINE",
|
|
},
|
|
}
|
|
|
|
m.checkZFSPoolHealth(storage)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["zfs-pool-state-local-zfs"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected state alert to be cleared when pool is ONLINE")
|
|
}
|
|
})
|
|
|
|
t.Run("pool with errors creates error alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "local-zfs",
|
|
Name: "Local ZFS",
|
|
Node: "pve-node1",
|
|
ZFSPool: &models.ZFSPool{
|
|
Name: "rpool",
|
|
State: "ONLINE",
|
|
ReadErrors: 5,
|
|
WriteErrors: 2,
|
|
ChecksumErrors: 1,
|
|
},
|
|
}
|
|
|
|
m.checkZFSPoolHealth(storage)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["zfs-pool-errors-local-zfs"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected errors alert for pool with errors")
|
|
}
|
|
if alert.Type != "zfs-pool-errors" {
|
|
t.Errorf("expected type 'zfs-pool-errors', got %s", alert.Type)
|
|
}
|
|
if alert.Value != 8 { // 5 + 2 + 1
|
|
t.Errorf("expected value 8, got %f", alert.Value)
|
|
}
|
|
})
|
|
|
|
t.Run("pool error count increase updates alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-1 * time.Hour)
|
|
m.mu.Lock()
|
|
m.activeAlerts["zfs-pool-errors-local-zfs"] = &Alert{
|
|
ID: "zfs-pool-errors-local-zfs",
|
|
Value: 5,
|
|
StartTime: oldTime,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
storage := models.Storage{
|
|
ID: "local-zfs",
|
|
Name: "Local ZFS",
|
|
Node: "pve-node1",
|
|
ZFSPool: &models.ZFSPool{
|
|
Name: "rpool",
|
|
State: "ONLINE",
|
|
ReadErrors: 10,
|
|
WriteErrors: 0,
|
|
ChecksumErrors: 0,
|
|
},
|
|
}
|
|
|
|
m.checkZFSPoolHealth(storage)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["zfs-pool-errors-local-zfs"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected errors alert to exist")
|
|
}
|
|
if alert.Value != 10 {
|
|
t.Errorf("expected value 10, got %f", alert.Value)
|
|
}
|
|
// Start time should be preserved
|
|
if !alert.StartTime.Equal(oldTime) {
|
|
t.Error("expected StartTime to be preserved on update")
|
|
}
|
|
})
|
|
|
|
t.Run("pool with no errors clears error alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["zfs-pool-errors-local-zfs"] = &Alert{
|
|
ID: "zfs-pool-errors-local-zfs",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
storage := models.Storage{
|
|
ID: "local-zfs",
|
|
Name: "Local ZFS",
|
|
Node: "pve-node1",
|
|
ZFSPool: &models.ZFSPool{
|
|
Name: "rpool",
|
|
State: "ONLINE",
|
|
ReadErrors: 0,
|
|
WriteErrors: 0,
|
|
ChecksumErrors: 0,
|
|
},
|
|
}
|
|
|
|
m.checkZFSPoolHealth(storage)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["zfs-pool-errors-local-zfs"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected errors alert to be cleared when no errors")
|
|
}
|
|
})
|
|
|
|
t.Run("device with errors creates device alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "local-zfs",
|
|
Name: "Local ZFS",
|
|
Node: "pve-node1",
|
|
ZFSPool: &models.ZFSPool{
|
|
Name: "rpool",
|
|
State: "ONLINE",
|
|
Devices: []models.ZFSDevice{
|
|
{Name: "sda", State: "ONLINE", ReadErrors: 3, WriteErrors: 0, ChecksumErrors: 0},
|
|
},
|
|
},
|
|
}
|
|
|
|
m.checkZFSPoolHealth(storage)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["zfs-device-local-zfs-sda"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected device alert for device with errors")
|
|
}
|
|
if alert.Type != "zfs-device" {
|
|
t.Errorf("expected type 'zfs-device', got %s", alert.Type)
|
|
}
|
|
})
|
|
|
|
t.Run("device in FAULTED state creates critical alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "local-zfs",
|
|
Name: "Local ZFS",
|
|
Node: "pve-node1",
|
|
ZFSPool: &models.ZFSPool{
|
|
Name: "rpool",
|
|
State: "DEGRADED",
|
|
Devices: []models.ZFSDevice{
|
|
{Name: "sda", State: "FAULTED"},
|
|
},
|
|
},
|
|
}
|
|
|
|
m.checkZFSPoolHealth(storage)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["zfs-device-local-zfs-sda"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected device alert for FAULTED device")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level for FAULTED device, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("healthy device clears device alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts["zfs-device-local-zfs-sda"] = &Alert{
|
|
ID: "zfs-device-local-zfs-sda",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
storage := models.Storage{
|
|
ID: "local-zfs",
|
|
Name: "Local ZFS",
|
|
Node: "pve-node1",
|
|
ZFSPool: &models.ZFSPool{
|
|
Name: "rpool",
|
|
State: "ONLINE",
|
|
Devices: []models.ZFSDevice{
|
|
{Name: "sda", State: "ONLINE", ReadErrors: 0, WriteErrors: 0, ChecksumErrors: 0},
|
|
},
|
|
},
|
|
}
|
|
|
|
m.checkZFSPoolHealth(storage)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["zfs-device-local-zfs-sda"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected device alert to be cleared for healthy device")
|
|
}
|
|
})
|
|
|
|
t.Run("SPARE device in normal state does not create alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
storage := models.Storage{
|
|
ID: "local-zfs",
|
|
Name: "Local ZFS",
|
|
Node: "pve-node1",
|
|
ZFSPool: &models.ZFSPool{
|
|
Name: "rpool",
|
|
State: "ONLINE",
|
|
Devices: []models.ZFSDevice{
|
|
{Name: "sdb", State: "SPARE", ReadErrors: 0, WriteErrors: 0, ChecksumErrors: 0},
|
|
},
|
|
},
|
|
}
|
|
|
|
m.checkZFSPoolHealth(storage)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["zfs-device-local-zfs-sdb"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected no alert for SPARE device without errors")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestCheckPMGNodeQueues(t *testing.T) {
|
|
// t.Parallel()
|
|
|
|
t.Run("empty nodes returns early", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
QueueTotalWarning: 100,
|
|
}
|
|
|
|
// Should not panic
|
|
m.checkPMGNodeQueues(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
count := len(m.activeAlerts)
|
|
m.mu.RUnlock()
|
|
|
|
if count != 0 {
|
|
t.Errorf("expected no alerts for empty nodes, got %d", count)
|
|
}
|
|
})
|
|
|
|
t.Run("nil QueueStatus is skipped", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: nil},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
QueueTotalWarning: 100,
|
|
}
|
|
|
|
m.checkPMGNodeQueues(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
count := len(m.activeAlerts)
|
|
m.mu.RUnlock()
|
|
|
|
if count != 0 {
|
|
t.Errorf("expected no alerts for nil QueueStatus, got %d", count)
|
|
}
|
|
})
|
|
|
|
t.Run("total queue warning alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 80}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
QueueTotalWarning: 100, // 60% scaled = 60
|
|
QueueTotalCritical: 200, // 80% scaled = 160
|
|
}
|
|
|
|
m.checkPMGNodeQueues(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-node1-queue-total"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected total queue warning alert")
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("expected warning level, got %s", alert.Level)
|
|
}
|
|
if alert.Value != 80 {
|
|
t.Errorf("expected value 80, got %f", alert.Value)
|
|
}
|
|
})
|
|
|
|
t.Run("total queue critical alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 200}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
QueueTotalWarning: 100, // 60% scaled = 60
|
|
QueueTotalCritical: 200, // 80% scaled = 160
|
|
}
|
|
|
|
m.checkPMGNodeQueues(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-node1-queue-total"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected total queue critical alert")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("deferred queue warning alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Deferred: 40}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
DeferredQueueWarn: 50, // 60% scaled = 30
|
|
DeferredQueueCritical: 100,
|
|
}
|
|
|
|
m.checkPMGNodeQueues(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-node1-queue-deferred"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected deferred queue warning alert")
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("expected warning level, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("hold queue warning alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Hold: 25}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
HoldQueueWarn: 30, // 60% scaled = 18
|
|
HoldQueueCritical: 60,
|
|
}
|
|
|
|
m.checkPMGNodeQueues(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-node1-queue-hold"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected hold queue warning alert")
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("expected warning level, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("oldest message age warning alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{OldestAge: 2400}}, // 40 minutes
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
OldestMessageWarnMins: 50, // 60% scaled = 30 minutes
|
|
OldestMessageCritMins: 90,
|
|
}
|
|
|
|
m.checkPMGNodeQueues(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-node1-oldest-message"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected oldest message warning alert")
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("expected warning level, got %s", alert.Level)
|
|
}
|
|
if alert.Value != 40 { // 2400 seconds / 60 = 40 minutes
|
|
t.Errorf("expected value 40, got %f", alert.Value)
|
|
}
|
|
})
|
|
|
|
t.Run("below threshold clears alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
// Pre-create an alert
|
|
m.mu.Lock()
|
|
m.activeAlerts["pmg1-node1-queue-total"] = &Alert{ID: "pmg1-node1-queue-total"}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 10}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
QueueTotalWarning: 100, // 60% scaled = 60
|
|
QueueTotalCritical: 200,
|
|
}
|
|
|
|
m.checkPMGNodeQueues(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["pmg1-node1-queue-total"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("expected alert to be cleared when below threshold")
|
|
}
|
|
})
|
|
|
|
t.Run("outlier detection adds note to message", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 10}},
|
|
{Name: "node2", QueueStatus: &models.PMGQueueStatus{Total: 10}},
|
|
{Name: "node3", QueueStatus: &models.PMGQueueStatus{Total: 100}}, // outlier
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
QueueTotalWarning: 100, // 60% scaled = 60
|
|
QueueTotalCritical: 200,
|
|
}
|
|
|
|
m.checkPMGNodeQueues(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-node3-queue-total"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert for outlier node")
|
|
}
|
|
if !strings.Contains(alert.Message, "outlier") {
|
|
t.Errorf("expected message to contain 'outlier', got %s", alert.Message)
|
|
}
|
|
})
|
|
|
|
t.Run("no thresholds configured does not create alerts", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 1000, Deferred: 500, Hold: 300}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{} // All zero
|
|
|
|
m.checkPMGNodeQueues(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
count := len(m.activeAlerts)
|
|
m.mu.RUnlock()
|
|
|
|
if count != 0 {
|
|
t.Errorf("expected no alerts when no thresholds configured, got %d", count)
|
|
}
|
|
})
|
|
|
|
t.Run("updates existing alert", func(t *testing.T) {
|
|
// t.Parallel()
|
|
m := newTestManager(t)
|
|
|
|
oldTime := time.Now().Add(-1 * time.Hour)
|
|
m.mu.Lock()
|
|
m.activeAlerts["pmg1-node1-queue-total"] = &Alert{
|
|
ID: "pmg1-node1-queue-total",
|
|
Value: 60,
|
|
Level: AlertLevelWarning,
|
|
LastSeen: oldTime,
|
|
StartTime: oldTime,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "PMG 1",
|
|
Nodes: []models.PMGNodeStatus{
|
|
{Name: "node1", QueueStatus: &models.PMGQueueStatus{Total: 200}},
|
|
},
|
|
}
|
|
|
|
defaults := PMGThresholdConfig{
|
|
QueueTotalWarning: 100, // 60% scaled = 60
|
|
QueueTotalCritical: 200, // 80% scaled = 160
|
|
}
|
|
|
|
m.checkPMGNodeQueues(pmg, defaults)
|
|
|
|
m.mu.RLock()
|
|
alert := m.activeAlerts["pmg1-node1-queue-total"]
|
|
m.mu.RUnlock()
|
|
|
|
if alert == nil {
|
|
t.Fatal("expected alert to exist")
|
|
}
|
|
if alert.Value != 200 {
|
|
t.Errorf("expected value 200, got %f", alert.Value)
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("expected critical level, got %s", alert.Level)
|
|
}
|
|
if !alert.LastSeen.After(oldTime) {
|
|
t.Error("expected LastSeen to be updated")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestDockerContainerHealthAlert(t *testing.T) {
|
|
t.Run("healthy container - no alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-health-1",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: "container-1",
|
|
Name: "healthy-app",
|
|
State: "running",
|
|
Status: "Up 10 minutes",
|
|
Health: "healthy",
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
|
|
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatal("expected no health alert for healthy container")
|
|
}
|
|
})
|
|
|
|
t.Run("container with empty health - no alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-health-2",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: "container-2",
|
|
Name: "no-health-check",
|
|
State: "running",
|
|
Status: "Up 10 minutes",
|
|
Health: "",
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
|
|
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatal("expected no health alert for container with empty health")
|
|
}
|
|
})
|
|
|
|
t.Run("container with none health - no alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-health-3",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: "container-3",
|
|
Name: "no-health-check",
|
|
State: "running",
|
|
Status: "Up 10 minutes",
|
|
Health: "none",
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
|
|
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatal("expected no health alert for container with none health")
|
|
}
|
|
})
|
|
|
|
t.Run("container starting - no alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-health-4",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: "container-4",
|
|
Name: "starting-app",
|
|
State: "running",
|
|
Status: "Up 5 seconds",
|
|
Health: "starting",
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
|
|
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatal("expected no health alert for starting container")
|
|
}
|
|
})
|
|
|
|
t.Run("unhealthy container - critical alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-health-5",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: "container-5",
|
|
Name: "unhealthy-app",
|
|
State: "running",
|
|
Status: "Up 10 minutes (unhealthy)",
|
|
Health: "unhealthy",
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
|
|
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatal("expected health alert for unhealthy container")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Fatalf("expected critical alert for unhealthy container, got %s", alert.Level)
|
|
}
|
|
if alert.Type != "docker-container-health" {
|
|
t.Fatalf("expected alert type docker-container-health, got %s", alert.Type)
|
|
}
|
|
})
|
|
|
|
t.Run("container with other health status - warning alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-health-6",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: "container-6",
|
|
Name: "degraded-app",
|
|
State: "running",
|
|
Status: "Up 10 minutes",
|
|
Health: "degraded",
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
|
|
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatal("expected health alert for degraded container")
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Fatalf("expected warning alert for non-unhealthy bad status, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("alert cleared when container becomes healthy", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
hostID := "host-health-7"
|
|
containerID := "container-7"
|
|
|
|
// First check with unhealthy container
|
|
hostUnhealthy := models.DockerHost{
|
|
ID: hostID,
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: containerID,
|
|
Name: "recovering-app",
|
|
State: "running",
|
|
Status: "Up 10 minutes (unhealthy)",
|
|
Health: "unhealthy",
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(hostUnhealthy)
|
|
|
|
resourceID := dockerResourceID(hostID, containerID)
|
|
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
|
|
if _, exists := m.activeAlerts[alertID]; !exists {
|
|
t.Fatal("expected health alert to be raised")
|
|
}
|
|
|
|
// Now container becomes healthy
|
|
hostHealthy := models.DockerHost{
|
|
ID: hostID,
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: containerID,
|
|
Name: "recovering-app",
|
|
State: "running",
|
|
Status: "Up 15 minutes",
|
|
Health: "healthy",
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(hostHealthy)
|
|
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatal("expected health alert to be cleared when container became healthy")
|
|
}
|
|
})
|
|
}
|
|
|
|
func TestDockerContainerOOMKillAlert(t *testing.T) {
|
|
t.Run("running container - no alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-oom-1",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: "container-1",
|
|
Name: "running-app",
|
|
State: "running",
|
|
Status: "Up 10 minutes",
|
|
ExitCode: 0,
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
|
|
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatal("expected no OOM alert for running container")
|
|
}
|
|
})
|
|
|
|
t.Run("exited container with non-137 exit code - no alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-oom-2",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: "container-2",
|
|
Name: "normal-exit-app",
|
|
State: "exited",
|
|
Status: "Exited (1) 5 minutes ago",
|
|
ExitCode: 1,
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
|
|
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatal("expected no OOM alert for container with exit code 1")
|
|
}
|
|
})
|
|
|
|
t.Run("exited container with exit code 137 - critical OOM alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-oom-3",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: "container-3",
|
|
Name: "oom-killed-app",
|
|
State: "exited",
|
|
Status: "Exited (137) 1 minute ago",
|
|
ExitCode: 137,
|
|
MemoryUsage: 512 * 1024 * 1024,
|
|
MemoryLimit: 512 * 1024 * 1024,
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
|
|
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatal("expected OOM alert for container with exit code 137")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Fatalf("expected critical OOM alert, got %s", alert.Level)
|
|
}
|
|
if alert.Type != "docker-container-oom-kill" {
|
|
t.Fatalf("expected alert type docker-container-oom-kill, got %s", alert.Type)
|
|
}
|
|
})
|
|
|
|
t.Run("dead container with exit code 137 - critical OOM alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
host := models.DockerHost{
|
|
ID: "host-oom-dead",
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: "container-dead",
|
|
Name: "dead-oom-app",
|
|
State: "dead",
|
|
Status: "Dead",
|
|
ExitCode: 137,
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
|
|
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatal("expected OOM alert for dead container with exit code 137")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Fatalf("expected critical OOM alert, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("repeated 137 exit code - no new alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
hostID := "host-oom-4"
|
|
containerID := "container-4"
|
|
|
|
host := models.DockerHost{
|
|
ID: hostID,
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: containerID,
|
|
Name: "oom-killed-app",
|
|
State: "exited",
|
|
Status: "Exited (137) 1 minute ago",
|
|
ExitCode: 137,
|
|
},
|
|
},
|
|
}
|
|
|
|
// First check - should create alert
|
|
m.CheckDockerHost(host)
|
|
|
|
resourceID := dockerResourceID(hostID, containerID)
|
|
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
|
|
alert1, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatal("expected OOM alert on first check")
|
|
}
|
|
startTime := alert1.StartTime
|
|
|
|
// Second check with same exit code - should not create new alert
|
|
m.CheckDockerHost(host)
|
|
|
|
alert2, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
t.Fatal("expected OOM alert to still exist on second check")
|
|
}
|
|
if alert2.StartTime != startTime {
|
|
t.Fatal("expected alert start time to be preserved (not a new alert)")
|
|
}
|
|
})
|
|
|
|
t.Run("container recovers - alert cleared", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
hostID := "host-oom-5"
|
|
containerID := "container-5"
|
|
|
|
// First check with OOM killed container
|
|
hostOOM := models.DockerHost{
|
|
ID: hostID,
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: containerID,
|
|
Name: "recovering-app",
|
|
State: "exited",
|
|
Status: "Exited (137) 1 minute ago",
|
|
ExitCode: 137,
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(hostOOM)
|
|
|
|
resourceID := dockerResourceID(hostID, containerID)
|
|
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
|
|
if _, exists := m.activeAlerts[alertID]; !exists {
|
|
t.Fatal("expected OOM alert to be raised")
|
|
}
|
|
|
|
// Container is restarted and running again
|
|
hostRunning := models.DockerHost{
|
|
ID: hostID,
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: containerID,
|
|
Name: "recovering-app",
|
|
State: "running",
|
|
Status: "Up 30 seconds",
|
|
ExitCode: 0,
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(hostRunning)
|
|
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatal("expected OOM alert to be cleared when container started running")
|
|
}
|
|
})
|
|
|
|
t.Run("container exits with different code - alert cleared", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
hostID := "host-oom-6"
|
|
containerID := "container-6"
|
|
|
|
// First check with OOM killed container
|
|
hostOOM := models.DockerHost{
|
|
ID: hostID,
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: containerID,
|
|
Name: "multi-exit-app",
|
|
State: "exited",
|
|
Status: "Exited (137) 1 minute ago",
|
|
ExitCode: 137,
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(hostOOM)
|
|
|
|
resourceID := dockerResourceID(hostID, containerID)
|
|
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
|
|
if _, exists := m.activeAlerts[alertID]; !exists {
|
|
t.Fatal("expected OOM alert to be raised")
|
|
}
|
|
|
|
// Container exits with different exit code (normal error)
|
|
hostNormalExit := models.DockerHost{
|
|
ID: hostID,
|
|
DisplayName: "Docker Host",
|
|
Hostname: "docker.local",
|
|
Containers: []models.DockerContainer{
|
|
{
|
|
ID: containerID,
|
|
Name: "multi-exit-app",
|
|
State: "exited",
|
|
Status: "Exited (1) 30 seconds ago",
|
|
ExitCode: 1,
|
|
},
|
|
},
|
|
}
|
|
|
|
m.CheckDockerHost(hostNormalExit)
|
|
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
t.Fatal("expected OOM alert to be cleared when container exited with different code")
|
|
}
|
|
})
|
|
}
|
|
|
|
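// TestDockerContainerRestartLoopAlert covers the restart-loop detection path of
// CheckDockerHost: the first check only initializes tracking, restarts at or below
// the configured threshold stay quiet, exceeding the threshold raises a critical
// alert, and the alert clears once tracked restarts age out of the window.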
func TestDockerContainerRestartLoopAlert(t *testing.T) {
	t.Run("first check - no alert", func(t *testing.T) {
		m := newTestManager(t)

		host := models.DockerHost{
			ID: "host-restart-1",
			DisplayName: "Docker Host",
			Hostname: "docker.local",
			Containers: []models.DockerContainer{
				{
					ID: "container-1",
					Name: "first-check-app",
					State: "running",
					Status: "Up 10 minutes",
					RestartCount: 5, // Even with high restart count, first check just initializes
				},
			},
		}

		m.CheckDockerHost(host)

		resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
		alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
		if _, exists := m.activeAlerts[alertID]; exists {
			t.Fatal("expected no restart loop alert on first check (just initializes tracking)")
		}

		// Verify tracking was initialized
		m.mu.Lock()
		record, exists := m.dockerRestartTracking[resourceID]
		m.mu.Unlock()
		if !exists {
			t.Fatal("expected tracking record to be initialized")
		}
		if record.lastCount != 5 {
			t.Fatalf("expected lastCount=5, got %d", record.lastCount)
		}
	})

	t.Run("stable restart count - no alert", func(t *testing.T) {
		m := newTestManager(t)

		hostID := "host-restart-2"
		containerID := "container-2"

		host := models.DockerHost{
			ID: hostID,
			DisplayName: "Docker Host",
			Hostname: "docker.local",
			Containers: []models.DockerContainer{
				{
					ID: containerID,
					Name: "stable-app",
					State: "running",
					Status: "Up 10 minutes",
					RestartCount: 2,
				},
			},
		}

		// First check - initializes tracking
		m.CheckDockerHost(host)

		// Second check - same restart count
		m.CheckDockerHost(host)

		// Third check - still same restart count
		m.CheckDockerHost(host)

		resourceID := dockerResourceID(hostID, containerID)
		alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
		if _, exists := m.activeAlerts[alertID]; exists {
			t.Fatal("expected no restart loop alert for stable container")
		}
	})

	t.Run("restarts under threshold - no alert", func(t *testing.T) {
		m := newTestManager(t)
		// Configure threshold to 3 (default)
		m.config.DockerDefaults.RestartCount = 3
		m.config.DockerDefaults.RestartWindow = 300

		hostID := "host-restart-3"
		containerID := "container-3"

		// First check - initializes with RestartCount=0
		host := models.DockerHost{
			ID: hostID,
			DisplayName: "Docker Host",
			Hostname: "docker.local",
			Containers: []models.DockerContainer{
				{
					ID: containerID,
					Name: "under-threshold-app",
					State: "running",
					Status: "Up 10 minutes",
					RestartCount: 0,
				},
			},
		}
		m.CheckDockerHost(host)

		// Container restarts twice (under threshold of 3)
		host.Containers[0].RestartCount = 2
		m.CheckDockerHost(host)

		// One more restart (now at 3, threshold is >3 so still no alert)
		host.Containers[0].RestartCount = 3
		m.CheckDockerHost(host)

		resourceID := dockerResourceID(hostID, containerID)
		alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
		if _, exists := m.activeAlerts[alertID]; exists {
			t.Fatal("expected no restart loop alert when restarts <= threshold")
		}

		// Verify we tracked 3 restarts
		m.mu.Lock()
		record := m.dockerRestartTracking[resourceID]
		recentCount := len(record.times)
		m.mu.Unlock()
		if recentCount != 3 {
			t.Fatalf("expected 3 tracked restarts, got %d", recentCount)
		}
	})

	t.Run("hits restart loop threshold - alert raised", func(t *testing.T) {
		m := newTestManager(t)
		// Configure threshold to 3 (alert when >3)
		m.config.DockerDefaults.RestartCount = 3
		m.config.DockerDefaults.RestartWindow = 300

		hostID := "host-restart-4"
		containerID := "container-4"

		// First check - initializes with RestartCount=0
		host := models.DockerHost{
			ID: hostID,
			DisplayName: "Docker Host",
			Hostname: "docker.local",
			Containers: []models.DockerContainer{
				{
					ID: containerID,
					Name: "restart-loop-app",
					State: "running",
					Status: "Up 1 minute",
					RestartCount: 0,
				},
			},
		}
		m.CheckDockerHost(host)

		// Container restarts 4 times (exceeds threshold of 3)
		host.Containers[0].RestartCount = 4
		m.CheckDockerHost(host)

		resourceID := dockerResourceID(hostID, containerID)
		alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
		alert, exists := m.activeAlerts[alertID]
		if !exists {
			t.Fatal("expected restart loop alert when restarts > threshold")
		}
		if alert.Level != AlertLevelCritical {
			t.Fatalf("expected critical alert, got %s", alert.Level)
		}
		if alert.Type != "docker-container-restart-loop" {
			t.Fatalf("expected alert type docker-container-restart-loop, got %s", alert.Type)
		}

		// Verify metadata
		if alert.Metadata["restartCount"] != 4 {
			t.Fatalf("expected restartCount=4 in metadata, got %v", alert.Metadata["restartCount"])
		}
		if alert.Metadata["recentRestarts"] != 4 {
			t.Fatalf("expected recentRestarts=4 in metadata, got %v", alert.Metadata["recentRestarts"])
		}
	})

	t.Run("restart loop recovery - alert cleared", func(t *testing.T) {
		m := newTestManager(t)
		// Configure short window for testing
		m.config.DockerDefaults.RestartCount = 3
		m.config.DockerDefaults.RestartWindow = 1 // 1 second window for testing

		hostID := "host-restart-5"
		containerID := "container-5"
		resourceID := dockerResourceID(hostID, containerID)

		// Manually set up a restart loop state
		m.mu.Lock()
		now := time.Now()
		m.dockerRestartTracking[resourceID] = &dockerRestartRecord{
			count: 5,
			lastCount: 5,
			times: []time.Time{now, now, now, now}, // 4 recent restarts
			lastChecked: now,
		}
		m.mu.Unlock()

		// Create initial alert
		host := models.DockerHost{
			ID: hostID,
			DisplayName: "Docker Host",
			Hostname: "docker.local",
			Containers: []models.DockerContainer{
				{
					ID: containerID,
					Name: "recovering-app",
					State: "running",
					Status: "Up 1 minute",
					RestartCount: 5,
				},
			},
		}
		m.CheckDockerHost(host)

		alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
		if _, exists := m.activeAlerts[alertID]; !exists {
			t.Fatal("expected restart loop alert to be raised initially")
		}

		// Wait for time window to pass
		time.Sleep(1100 * time.Millisecond)

		// Check again with same restart count - old restarts should be cleaned up
		m.CheckDockerHost(host)

		if _, exists := m.activeAlerts[alertID]; exists {
			t.Fatal("expected restart loop alert to be cleared after window passes")
		}
	})

	t.Run("incremental restarts trigger alert", func(t *testing.T) {
		m := newTestManager(t)
		m.config.DockerDefaults.RestartCount = 2
		m.config.DockerDefaults.RestartWindow = 300

		hostID := "host-restart-6"
		containerID := "container-6"

		host := models.DockerHost{
			ID: hostID,
			DisplayName: "Docker Host",
			Hostname: "docker.local",
			Containers: []models.DockerContainer{
				{
					ID: containerID,
					Name: "incremental-restart-app",
					State: "running",
					Status: "Up 1 minute",
					RestartCount: 0,
				},
			},
		}

		// First check - initializes
		m.CheckDockerHost(host)

		resourceID := dockerResourceID(hostID, containerID)
		alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)

		// Restart 1
		host.Containers[0].RestartCount = 1
		m.CheckDockerHost(host)
		if _, exists := m.activeAlerts[alertID]; exists {
			t.Fatal("expected no alert after 1 restart")
		}

		// Restart 2
		host.Containers[0].RestartCount = 2
		m.CheckDockerHost(host)
		if _, exists := m.activeAlerts[alertID]; exists {
			t.Fatal("expected no alert after 2 restarts (threshold is >2)")
		}

		// Restart 3 - exceeds threshold
		host.Containers[0].RestartCount = 3
		m.CheckDockerHost(host)
		if _, exists := m.activeAlerts[alertID]; !exists {
			t.Fatal("expected alert after 3 restarts (>2 threshold)")
		}
	})

	t.Run("alert preserves start time on updates", func(t *testing.T) {
		m := newTestManager(t)
		m.config.DockerDefaults.RestartCount = 2
		m.config.DockerDefaults.RestartWindow = 300

		hostID := "host-restart-7"
		containerID := "container-7"

		host := models.DockerHost{
			ID: hostID,
			DisplayName: "Docker Host",
			Hostname: "docker.local",
			Containers: []models.DockerContainer{
				{
					ID: containerID,
					Name: "preserve-time-app",
					State: "running",
					Status: "Up 1 minute",
					RestartCount: 0,
				},
			},
		}

		// Initialize and trigger alert
		m.CheckDockerHost(host)
		host.Containers[0].RestartCount = 5
		m.CheckDockerHost(host)

		resourceID := dockerResourceID(hostID, containerID)
		alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
		alert1, exists := m.activeAlerts[alertID]
		if !exists {
			t.Fatal("expected alert to be raised")
		}
		startTime1 := alert1.StartTime

		// More restarts - alert should update but preserve start time
		time.Sleep(10 * time.Millisecond)
		host.Containers[0].RestartCount = 7
		m.CheckDockerHost(host)

		alert2, exists := m.activeAlerts[alertID]
		if !exists {
			t.Fatal("expected alert to still exist")
		}
		if !alert2.StartTime.Equal(startTime1) {
			t.Fatalf("expected start time to be preserved, got %v vs %v", alert2.StartTime, startTime1)
		}
	})
}

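// TestApplyThresholdOverride exercises how per-resource overrides are merged onto
// base thresholds, including legacy single-value conversion, hysteresis defaults,
// and Note handling (set, trim, clear).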
func TestApplyThresholdOverride(t *testing.T) {
	t.Run("empty override returns base unchanged", func(t *testing.T) {
		m := newTestManager(t)
		base := ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
			Memory: &HysteresisThreshold{Trigger: 90, Clear: 85},
		}
		override := ThresholdConfig{}
		result := m.applyThresholdOverride(base, override)

		if result.CPU == nil || result.CPU.Trigger != 80 || result.CPU.Clear != 75 {
			t.Errorf("expected CPU to match base, got %+v", result.CPU)
		}
		if result.Memory == nil || result.Memory.Trigger != 90 || result.Memory.Clear != 85 {
			t.Errorf("expected Memory to match base, got %+v", result.Memory)
		}
		if result.Disabled {
			t.Error("expected Disabled to remain false")
		}
	})

	t.Run("Disabled flag override", func(t *testing.T) {
		m := newTestManager(t)
		base := ThresholdConfig{Disabled: false}
		override := ThresholdConfig{Disabled: true}
		result := m.applyThresholdOverride(base, override)

		if !result.Disabled {
			t.Error("expected Disabled to be true after override")
		}
	})

	t.Run("DisableConnectivity override", func(t *testing.T) {
		m := newTestManager(t)
		base := ThresholdConfig{DisableConnectivity: false}
		override := ThresholdConfig{DisableConnectivity: true}
		result := m.applyThresholdOverride(base, override)

		if !result.DisableConnectivity {
			t.Error("expected DisableConnectivity to be true after override")
		}
	})

	t.Run("CPU threshold override", func(t *testing.T) {
		m := newTestManager(t)
		base := ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
		}
		override := ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 95, Clear: 90},
		}
		result := m.applyThresholdOverride(base, override)

		if result.CPU == nil {
			t.Fatal("expected CPU to be set")
		}
		if result.CPU.Trigger != 95 || result.CPU.Clear != 90 {
			t.Errorf("expected CPU override values, got Trigger=%v Clear=%v", result.CPU.Trigger, result.CPU.Clear)
		}
	})

	t.Run("legacy CPU threshold conversion", func(t *testing.T) {
		m := newTestManager(t)
		m.config.HysteresisMargin = 5.0
		base := ThresholdConfig{}
		legacyVal := 85.0
		override := ThresholdConfig{
			CPULegacy: &legacyVal,
		}
		result := m.applyThresholdOverride(base, override)

		if result.CPU == nil {
			t.Fatal("expected CPU to be converted from legacy")
		}
		if result.CPU.Trigger != 85.0 {
			t.Errorf("expected Trigger=85, got %v", result.CPU.Trigger)
		}
		if result.CPU.Clear != 80.0 {
			t.Errorf("expected Clear=80 (85-5 margin), got %v", result.CPU.Clear)
		}
	})

	t.Run("modern CPU takes precedence over legacy", func(t *testing.T) {
		m := newTestManager(t)
		legacyVal := 70.0
		base := ThresholdConfig{}
		override := ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 95, Clear: 90},
			CPULegacy: &legacyVal,
		}
		result := m.applyThresholdOverride(base, override)

		if result.CPU.Trigger != 95 {
			t.Errorf("expected modern CPU to take precedence, got Trigger=%v", result.CPU.Trigger)
		}
	})

	t.Run("multiple metrics override", func(t *testing.T) {
		m := newTestManager(t)
		base := ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
			Memory: &HysteresisThreshold{Trigger: 80, Clear: 75},
			Disk: &HysteresisThreshold{Trigger: 80, Clear: 75},
		}
		override := ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 90, Clear: 85},
			Memory: &HysteresisThreshold{Trigger: 95, Clear: 90},
			NetworkIn: &HysteresisThreshold{Trigger: 100, Clear: 95},
			NetworkOut: &HysteresisThreshold{Trigger: 200, Clear: 190},
		}
		result := m.applyThresholdOverride(base, override)

		if result.CPU.Trigger != 90 {
			t.Errorf("expected CPU override, got %v", result.CPU.Trigger)
		}
		if result.Memory.Trigger != 95 {
			t.Errorf("expected Memory override, got %v", result.Memory.Trigger)
		}
		// Disk should remain unchanged (not in override)
		if result.Disk.Trigger != 80 {
			t.Errorf("expected Disk unchanged, got %v", result.Disk.Trigger)
		}
		if result.NetworkIn == nil || result.NetworkIn.Trigger != 100 {
			t.Errorf("expected NetworkIn to be added, got %+v", result.NetworkIn)
		}
		if result.NetworkOut == nil || result.NetworkOut.Trigger != 200 {
			t.Errorf("expected NetworkOut to be added, got %+v", result.NetworkOut)
		}
	})

	t.Run("Note override", func(t *testing.T) {
		m := newTestManager(t)
		base := ThresholdConfig{}
		note := "test note"
		override := ThresholdConfig{Note: &note}
		result := m.applyThresholdOverride(base, override)

		if result.Note == nil || *result.Note != "test note" {
			t.Errorf("expected Note to be set, got %v", result.Note)
		}
	})

	t.Run("Note cleared when empty string", func(t *testing.T) {
		m := newTestManager(t)
		existingNote := "existing note"
		base := ThresholdConfig{Note: &existingNote}
		emptyNote := ""
		override := ThresholdConfig{Note: &emptyNote}
		result := m.applyThresholdOverride(base, override)

		if result.Note != nil {
			t.Errorf("expected Note to be nil when empty string override, got %v", *result.Note)
		}
	})

	t.Run("Note trimmed of whitespace", func(t *testing.T) {
		m := newTestManager(t)
		base := ThresholdConfig{}
		note := " trimmed note "
		override := ThresholdConfig{Note: &note}
		result := m.applyThresholdOverride(base, override)

		if result.Note == nil || *result.Note != "trimmed note" {
			t.Errorf("expected Note to be trimmed, got %v", result.Note)
		}
	})

	t.Run("whitespace-only Note becomes nil", func(t *testing.T) {
		m := newTestManager(t)
		existingNote := "existing"
		base := ThresholdConfig{Note: &existingNote}
		whitespaceNote := " "
		override := ThresholdConfig{Note: &whitespaceNote}
		result := m.applyThresholdOverride(base, override)

		if result.Note != nil {
			t.Errorf("expected whitespace-only Note to become nil, got %v", *result.Note)
		}
	})

	t.Run("all metric types with legacy conversion", func(t *testing.T) {
		m := newTestManager(t)
		m.config.HysteresisMargin = 5.0
		base := ThresholdConfig{}

		val80 := 80.0
		val90 := 90.0
		val100 := 100.0
		val200 := 200.0
		override := ThresholdConfig{
			MemoryLegacy: &val80,
			DiskLegacy: &val90,
			DiskReadLegacy: &val100,
			DiskWriteLegacy: &val100,
			NetworkInLegacy: &val200,
			NetworkOutLegacy: &val200,
		}
		result := m.applyThresholdOverride(base, override)

		if result.Memory == nil || result.Memory.Trigger != 80 {
			t.Errorf("expected Memory converted, got %+v", result.Memory)
		}
		if result.Disk == nil || result.Disk.Trigger != 90 {
			t.Errorf("expected Disk converted, got %+v", result.Disk)
		}
		if result.DiskRead == nil || result.DiskRead.Trigger != 100 {
			t.Errorf("expected DiskRead converted, got %+v", result.DiskRead)
		}
		if result.DiskWrite == nil || result.DiskWrite.Trigger != 100 {
			t.Errorf("expected DiskWrite converted, got %+v", result.DiskWrite)
		}
		if result.NetworkIn == nil || result.NetworkIn.Trigger != 200 {
			t.Errorf("expected NetworkIn converted, got %+v", result.NetworkIn)
		}
		if result.NetworkOut == nil || result.NetworkOut.Trigger != 200 {
			t.Errorf("expected NetworkOut converted, got %+v", result.NetworkOut)
		}
	})

	t.Run("Temperature and Usage override", func(t *testing.T) {
		m := newTestManager(t)
		base := ThresholdConfig{}
		override := ThresholdConfig{
			Temperature: &HysteresisThreshold{Trigger: 85, Clear: 80},
			Usage: &HysteresisThreshold{Trigger: 90, Clear: 85},
		}
		result := m.applyThresholdOverride(base, override)

		if result.Temperature == nil || result.Temperature.Trigger != 85 {
			t.Errorf("expected Temperature override, got %+v", result.Temperature)
		}
		if result.Usage == nil || result.Usage.Trigger != 90 {
			t.Errorf("expected Usage override, got %+v", result.Usage)
		}
	})

	t.Run("ensureHysteresisThreshold fills missing Clear", func(t *testing.T) {
		m := newTestManager(t)
		base := ThresholdConfig{}
		override := ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80, Clear: 0}, // Clear not set
		}
		result := m.applyThresholdOverride(base, override)

		if result.CPU == nil {
			t.Fatal("expected CPU to be set")
		}
		// ensureHysteresisThreshold sets Clear to Trigger - 5 when Clear <= 0
		if result.CPU.Clear != 75 {
			t.Errorf("expected Clear to be 75 (80-5 default), got %v", result.CPU.Clear)
		}
	})
}

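// TestSuppressGuestAlerts verifies that suppressing a guest clears its active
// alerts (matched by exact ResourceID and by "guest/..." prefix) plus all
// auxiliary tracking maps, while leaving other guests' alerts untouched.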
func TestSuppressGuestAlerts(t *testing.T) {
	t.Run("no alerts for guest returns false", func(t *testing.T) {
		m := newTestManager(t)

		result := m.suppressGuestAlerts("vm100")
		if result {
			t.Error("expected false when no alerts exist for guest")
		}
	})

	t.Run("active alert with exact ResourceID match clears and returns true", func(t *testing.T) {
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["vm100-cpu"] = &Alert{
			ID: "vm100-cpu",
			ResourceID: "vm100",
			Type: "cpu",
		}
		m.mu.Unlock()

		result := m.suppressGuestAlerts("vm100")
		if !result {
			t.Error("expected true when active alert was cleared")
		}

		m.mu.RLock()
		if _, exists := m.activeAlerts["vm100-cpu"]; exists {
			t.Error("expected alert to be cleared from activeAlerts")
		}
		m.mu.RUnlock()
	})

	t.Run("active alert with prefix match clears", func(t *testing.T) {
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["vm100/disk1-disk"] = &Alert{
			ID: "vm100/disk1-disk",
			ResourceID: "vm100/disk1",
			Type: "disk",
		}
		m.mu.Unlock()

		result := m.suppressGuestAlerts("vm100")
		if !result {
			t.Error("expected true when active alert was cleared")
		}

		m.mu.RLock()
		if _, exists := m.activeAlerts["vm100/disk1-disk"]; exists {
			t.Error("expected alert with prefix match to be cleared")
		}
		m.mu.RUnlock()
	})

	t.Run("clears from all auxiliary maps", func(t *testing.T) {
		m := newTestManager(t)

		now := time.Now()
		m.mu.Lock()
		m.activeAlerts["vm100-cpu"] = &Alert{
			ID: "vm100-cpu",
			ResourceID: "vm100",
			Type: "cpu",
		}
		m.pendingAlerts["vm100-memory"] = now
		m.recentAlerts["vm100-disk"] = &Alert{ID: "vm100-disk", ResourceID: "vm100"}
		m.suppressedUntil["vm100-network"] = now.Add(time.Hour)
		m.alertRateLimit["vm100-io"] = []time.Time{now}
		m.offlineConfirmations["vm100"] = 1
		m.mu.Unlock()

		result := m.suppressGuestAlerts("vm100")
		if !result {
			t.Error("expected true when active alert was cleared")
		}

		m.mu.RLock()
		defer m.mu.RUnlock()

		if _, exists := m.activeAlerts["vm100-cpu"]; exists {
			t.Error("expected activeAlerts to be cleared")
		}
		if _, exists := m.pendingAlerts["vm100-memory"]; exists {
			t.Error("expected pendingAlerts to be cleared")
		}
		if _, exists := m.recentAlerts["vm100-disk"]; exists {
			t.Error("expected recentAlerts to be cleared")
		}
		if _, exists := m.suppressedUntil["vm100-network"]; exists {
			t.Error("expected suppressedUntil to be cleared")
		}
		if _, exists := m.alertRateLimit["vm100-io"]; exists {
			t.Error("expected alertRateLimit to be cleared")
		}
		if _, exists := m.offlineConfirmations["vm100"]; exists {
			t.Error("expected offlineConfirmations to be cleared")
		}
	})

	t.Run("multiple alerts cleared", func(t *testing.T) {
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["vm100-cpu"] = &Alert{
			ID: "vm100-cpu",
			ResourceID: "vm100",
			Type: "cpu",
		}
		m.activeAlerts["vm100-memory"] = &Alert{
			ID: "vm100-memory",
			ResourceID: "vm100",
			Type: "memory",
		}
		m.activeAlerts["vm100/disk0-disk"] = &Alert{
			ID: "vm100/disk0-disk",
			ResourceID: "vm100/disk0",
			Type: "disk",
		}
		// Also add an alert for a different guest that should NOT be cleared
		m.activeAlerts["vm200-cpu"] = &Alert{
			ID: "vm200-cpu",
			ResourceID: "vm200",
			Type: "cpu",
		}
		m.mu.Unlock()

		result := m.suppressGuestAlerts("vm100")
		if !result {
			t.Error("expected true when alerts were cleared")
		}

		m.mu.RLock()
		defer m.mu.RUnlock()

		if _, exists := m.activeAlerts["vm100-cpu"]; exists {
			t.Error("expected vm100-cpu to be cleared")
		}
		if _, exists := m.activeAlerts["vm100-memory"]; exists {
			t.Error("expected vm100-memory to be cleared")
		}
		if _, exists := m.activeAlerts["vm100/disk0-disk"]; exists {
			t.Error("expected vm100/disk0-disk to be cleared")
		}
		if _, exists := m.activeAlerts["vm200-cpu"]; !exists {
			t.Error("expected vm200-cpu to NOT be cleared")
		}
	})

	t.Run("clears auxiliary maps even without active alerts", func(t *testing.T) {
		m := newTestManager(t)

		now := time.Now()
		m.mu.Lock()
		// No active alerts, but has entries in auxiliary maps
		m.pendingAlerts["vm100-memory"] = now
		m.recentAlerts["vm100-disk"] = &Alert{ID: "vm100-disk", ResourceID: "vm100"}
		m.suppressedUntil["vm100-network"] = now.Add(time.Hour)
		m.alertRateLimit["vm100-io"] = []time.Time{now}
		m.offlineConfirmations["vm100"] = 1
		m.mu.Unlock()

		result := m.suppressGuestAlerts("vm100")
		// Returns false because no active alerts were cleared
		if result {
			t.Error("expected false when no active alerts were cleared")
		}

		m.mu.RLock()
		defer m.mu.RUnlock()

		// But auxiliary maps should still be cleared
		if _, exists := m.pendingAlerts["vm100-memory"]; exists {
			t.Error("expected pendingAlerts to be cleared")
		}
		if _, exists := m.recentAlerts["vm100-disk"]; exists {
			t.Error("expected recentAlerts to be cleared")
		}
		if _, exists := m.suppressedUntil["vm100-network"]; exists {
			t.Error("expected suppressedUntil to be cleared")
		}
		if _, exists := m.alertRateLimit["vm100-io"]; exists {
			t.Error("expected alertRateLimit to be cleared")
		}
		if _, exists := m.offlineConfirmations["vm100"]; exists {
			t.Error("expected offlineConfirmations to be cleared")
		}
	})
}

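// TestGuestHasMonitorOnlyAlerts checks detection of the monitorOnly metadata flag
// on a guest's active alerts, in both bool and string form.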
func TestGuestHasMonitorOnlyAlerts(t *testing.T) {
	t.Run("no alerts returns false", func(t *testing.T) {
		m := newTestManager(t)

		result := m.guestHasMonitorOnlyAlerts("vm100")
		if result {
			t.Error("expected false when no alerts exist")
		}
	})

	t.Run("has non-monitor-only alert returns false", func(t *testing.T) {
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["vm100-cpu"] = &Alert{
			ID: "vm100-cpu",
			ResourceID: "vm100",
			Type: "cpu",
			Metadata: nil, // No metadata means not monitor-only
		}
		m.mu.Unlock()

		result := m.guestHasMonitorOnlyAlerts("vm100")
		if result {
			t.Error("expected false when alert is not monitor-only")
		}
	})

	t.Run("has monitor-only alert with bool metadata returns true", func(t *testing.T) {
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["vm100-cpu"] = &Alert{
			ID: "vm100-cpu",
			ResourceID: "vm100",
			Type: "cpu",
			Metadata: map[string]interface{}{
				"monitorOnly": true,
			},
		}
		m.mu.Unlock()

		result := m.guestHasMonitorOnlyAlerts("vm100")
		if !result {
			t.Error("expected true when monitor-only alert exists")
		}
	})

	t.Run("has monitor-only alert with string metadata returns true", func(t *testing.T) {
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["vm100-cpu"] = &Alert{
			ID: "vm100-cpu",
			ResourceID: "vm100",
			Type: "cpu",
			Metadata: map[string]interface{}{
				"monitorOnly": "true",
			},
		}
		m.mu.Unlock()

		result := m.guestHasMonitorOnlyAlerts("vm100")
		if !result {
			t.Error("expected true when monitor-only alert exists (string metadata)")
		}
	})

	t.Run("alert for different guest not matched", func(t *testing.T) {
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["vm200-cpu"] = &Alert{
			ID: "vm200-cpu",
			ResourceID: "vm200",
			Type: "cpu",
			Metadata: map[string]interface{}{
				"monitorOnly": true,
			},
		}
		m.mu.Unlock()

		result := m.guestHasMonitorOnlyAlerts("vm100")
		if result {
			t.Error("expected false when monitor-only alert is for different guest")
		}
	})

	t.Run("monitorOnly false returns false", func(t *testing.T) {
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["vm100-cpu"] = &Alert{
			ID: "vm100-cpu",
			ResourceID: "vm100",
			Type: "cpu",
			Metadata: map[string]interface{}{
				"monitorOnly": false,
			},
		}
		m.mu.Unlock()

		result := m.guestHasMonitorOnlyAlerts("vm100")
		if result {
			t.Error("expected false when monitorOnly is explicitly false")
		}
	})
}

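// TestCheckNode covers node-level checks: global disable flags, offline and
// connection-health handling, and CPU, memory, disk, and temperature threshold
// evaluation, including per-node overrides.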
func TestCheckNode(t *testing.T) {
	// t.Parallel()

	t.Run("returns early when alerts disabled", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.Enabled = false
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			CPU: 0.95, // Would trigger alert if enabled
			Status: "online",
		}

		m.CheckNode(node)

		m.mu.RLock()
		alertCount := len(m.activeAlerts)
		m.mu.RUnlock()

		if alertCount != 0 {
			t.Errorf("expected no alerts when disabled, got %d", alertCount)
		}
	})

	t.Run("DisableAllNodes clears existing alerts", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Pre-create alerts that should be cleared
		m.mu.Lock()
		m.activeAlerts["node1-cpu"] = &Alert{ID: "node1-cpu", ResourceID: "node1", Type: "cpu"}
		m.activeAlerts["node1-memory"] = &Alert{ID: "node1-memory", ResourceID: "node1", Type: "memory"}
		m.activeAlerts["node1-disk"] = &Alert{ID: "node1-disk", ResourceID: "node1", Type: "disk"}
		m.activeAlerts["node1-temperature"] = &Alert{ID: "node1-temperature", ResourceID: "node1", Type: "temperature"}
		m.activeAlerts["node-offline-node1"] = &Alert{ID: "node-offline-node1", ResourceID: "node1", Type: "connectivity"}
		m.nodeOfflineCount["node1"] = 5
		m.config.DisableAllNodes = true
		m.mu.Unlock()

		node := models.Node{ID: "node1", Name: "Node 1", Status: "online"}
		m.CheckNode(node)

		m.mu.RLock()
		_, cpuExists := m.activeAlerts["node1-cpu"]
		_, memExists := m.activeAlerts["node1-memory"]
		_, diskExists := m.activeAlerts["node1-disk"]
		_, tempExists := m.activeAlerts["node1-temperature"]
		_, offlineExists := m.activeAlerts["node-offline-node1"]
		_, countExists := m.nodeOfflineCount["node1"]
		m.mu.RUnlock()

		if cpuExists {
			t.Error("expected cpu alert to be cleared")
		}
		if memExists {
			t.Error("expected memory alert to be cleared")
		}
		if diskExists {
			t.Error("expected disk alert to be cleared")
		}
		if tempExists {
			t.Error("expected temperature alert to be cleared")
		}
		if offlineExists {
			t.Error("expected offline alert to be cleared")
		}
		if countExists {
			t.Error("expected offline count to be cleared")
		}
	})

	t.Run("DisableNodesOffline clears tracking and offline alerts", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Pre-create offline alert and tracking
		m.mu.Lock()
		m.activeAlerts["node-offline-node1"] = &Alert{ID: "node-offline-node1", ResourceID: "node1", Type: "connectivity"}
		m.nodeOfflineCount["node1"] = 3
		m.config.DisableAllNodesOffline = true
		m.mu.Unlock()

		node := models.Node{ID: "node1", Name: "Node 1", Status: "offline"}
		m.CheckNode(node)

		m.mu.RLock()
		_, alertExists := m.activeAlerts["node-offline-node1"]
		_, countExists := m.nodeOfflineCount["node1"]
		m.mu.RUnlock()

		if alertExists {
			t.Error("expected offline alert to be cleared")
		}
		if countExists {
			t.Error("expected offline count to be cleared")
		}
	})

	t.Run("offline node triggers offline check", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Pre-set count to trigger alert on this call
		m.mu.Lock()
		m.nodeOfflineCount["node1"] = 2
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Instance: "pve1",
			Status: "offline",
		}
		m.CheckNode(node)

		m.mu.RLock()
		alert := m.activeAlerts["node-offline-node1"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected offline alert to be created")
		}
		if alert.Type != "connectivity" {
			t.Errorf("expected type connectivity, got %s", alert.Type)
		}
	})

	t.Run("node with connection error triggers offline check", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.nodeOfflineCount["node1"] = 2
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Instance: "pve1",
			Status: "online",
			ConnectionHealth: "error",
		}
		m.CheckNode(node)

		m.mu.RLock()
		alert := m.activeAlerts["node-offline-node1"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected offline alert for connection error")
		}
	})

	t.Run("node with connection failed triggers offline check", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.nodeOfflineCount["node1"] = 2
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Instance: "pve1",
			Status: "online",
			ConnectionHealth: "failed",
		}
		m.CheckNode(node)

		m.mu.RLock()
		alert := m.activeAlerts["node-offline-node1"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected offline alert for connection failed")
		}
	})

	t.Run("online node clears offline alert", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Pre-create offline alert
		m.mu.Lock()
		m.activeAlerts["node-offline-node1"] = &Alert{
			ID: "node-offline-node1",
			ResourceID: "node1",
			Type: "connectivity",
		}
		m.nodeOfflineCount["node1"] = 5
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Instance: "pve1",
			Status: "online",
			ConnectionHealth: "connected",
		}
		m.CheckNode(node)

		m.mu.RLock()
		_, alertExists := m.activeAlerts["node-offline-node1"]
		_, countExists := m.nodeOfflineCount["node1"]
		m.mu.RUnlock()

		if alertExists {
			t.Error("expected offline alert to be cleared")
		}
		if countExists {
			t.Error("expected offline count to be cleared")
		}
	})

	t.Run("online node triggers metric checks", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Set thresholds that will trigger and disable time threshold
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.NodeDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Instance: "pve1",
			Status: "online",
			CPU: 0.95, // 95% - above trigger
		}
		m.CheckNode(node)

		m.mu.RLock()
		alert := m.activeAlerts["node1-cpu"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected cpu alert to be created")
		}
		if alert.Type != "cpu" {
			t.Errorf("expected type cpu, got %s", alert.Type)
		}
	})

	t.Run("offline node skips metric checks", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.NodeDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Status: "offline",
			CPU: 0.95, // Would trigger if checked
		}
		m.CheckNode(node)

		m.mu.RLock()
		_, cpuExists := m.activeAlerts["node1-cpu"]
		m.mu.RUnlock()

		if cpuExists {
			t.Error("expected no cpu alert for offline node")
		}
	})

	t.Run("applies override thresholds", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.NodeDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.config.Overrides = map[string]ThresholdConfig{
			"node1": {
				CPU: &HysteresisThreshold{Trigger: 99.0, Clear: 90.0}, // Higher threshold
			},
		}
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Instance: "pve1",
			Status: "online",
			CPU: 0.95, // 95% - below override trigger of 99%
		}
		m.CheckNode(node)

		m.mu.RLock()
		_, cpuExists := m.activeAlerts["node1-cpu"]
		m.mu.RUnlock()

		if cpuExists {
			t.Error("expected no alert due to higher override threshold")
		}
	})

	t.Run("checks temperature with package temp", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.NodeDefaults = ThresholdConfig{
			Temperature: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Instance: "pve1",
			Status: "online",
			Temperature: &models.Temperature{
				Available: true,
				CPUPackage: 90.0, // Above trigger
				CPUMax: 85.0,
			},
		}
		m.CheckNode(node)

		m.mu.RLock()
		alert := m.activeAlerts["node1-temperature"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected temperature alert")
		}
	})

	t.Run("checks temperature with max temp fallback", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.NodeDefaults = ThresholdConfig{
			Temperature: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Instance: "pve1",
			Status: "online",
			Temperature: &models.Temperature{
				Available: true,
				CPUPackage: 0, // Zero - will use max
				CPUMax: 90.0, // Above trigger
			},
		}
		m.CheckNode(node)

		m.mu.RLock()
		alert := m.activeAlerts["node1-temperature"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected temperature alert using max temp fallback")
		}
	})

	t.Run("skips temperature when not available", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.NodeDefaults = ThresholdConfig{
			Temperature: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Instance: "pve1",
			Status: "online",
			Temperature: &models.Temperature{
				Available: false, // Not available
				CPUPackage: 90.0,
			},
		}
		m.CheckNode(node)

		m.mu.RLock()
		_, tempExists := m.activeAlerts["node1-temperature"]
		m.mu.RUnlock()

		if tempExists {
			t.Error("expected no temperature alert when not available")
		}
	})

	t.Run("skips temperature when nil", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.NodeDefaults = ThresholdConfig{
			Temperature: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Instance: "pve1",
			Status: "online",
			Temperature: nil, // Nil temperature
		}
		m.CheckNode(node)

		m.mu.RLock()
		_, tempExists := m.activeAlerts["node1-temperature"]
		m.mu.RUnlock()

		if tempExists {
			t.Error("expected no temperature alert when temp is nil")
		}
	})

	t.Run("skips temperature when threshold nil", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// No temperature threshold set
		m.mu.Lock()
		m.config.NodeDefaults = ThresholdConfig{
			Temperature: nil,
		}
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Instance: "pve1",
			Status: "online",
			Temperature: &models.Temperature{
				Available: true,
				CPUPackage: 90.0,
			},
		}
		m.CheckNode(node)

		m.mu.RLock()
		_, tempExists := m.activeAlerts["node1-temperature"]
		m.mu.RUnlock()

		if tempExists {
			t.Error("expected no temperature alert when threshold nil")
		}
	})

	t.Run("checks memory metric", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.NodeDefaults = ThresholdConfig{
			Memory: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Instance: "pve1",
			Status: "online",
			Memory: models.Memory{
				Usage: 95.0, // Above trigger
			},
		}
		m.CheckNode(node)

		m.mu.RLock()
		alert := m.activeAlerts["node1-memory"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected memory alert")
		}
	})

	t.Run("checks disk metric", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.NodeDefaults = ThresholdConfig{
			Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		node := models.Node{
			ID: "node1",
			Name: "Node 1",
			Instance: "pve1",
			Status: "online",
			Disk: models.Disk{
				Usage: 95.0, // Above trigger
			},
		}
		m.CheckNode(node)

		m.mu.RLock()
		alert := m.activeAlerts["node1-disk"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected disk alert")
		}
	})
}

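// TestCheckGuest covers guest-level checks for VMs and containers: disable flags
// and suppression tags, powered-off handling, and per-metric threshold evaluation
// including individual disks and disk/network I/O rates.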
func TestCheckGuest(t *testing.T) {
	// t.Parallel()

	t.Run("returns early when alerts disabled", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.Enabled = false
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			CPU: 0.95,
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		alertCount := len(m.activeAlerts)
		m.mu.RUnlock()

		if alertCount != 0 {
			t.Errorf("expected no alerts when disabled, got %d", alertCount)
		}
	})

	t.Run("returns early when all guests disabled", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.DisableAllGuests = true
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			CPU: 0.95,
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		alertCount := len(m.activeAlerts)
		m.mu.RUnlock()

		if alertCount != 0 {
			t.Errorf("expected no alerts when all guests disabled, got %d", alertCount)
		}
	})

	t.Run("handles VM type correctly", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.GuestDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			CPU: 0.95, // 95%
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		alert := m.activeAlerts["vm100-cpu"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected cpu alert for VM")
		}
	})

	t.Run("handles Container type correctly", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.GuestDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		ct := models.Container{
			ID: "ct101",
			Name: "TestCT",
			Node: "node1",
			Status: "running",
			CPU: 0.95, // 95%
		}

		m.CheckGuest(ct, "pve1")

		m.mu.RLock()
		alert := m.activeAlerts["ct101-cpu"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected cpu alert for Container")
		}
	})

	t.Run("returns for unsupported guest type", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Pass a string which is unsupported
		m.CheckGuest("invalid", "pve1")

		m.mu.RLock()
		alertCount := len(m.activeAlerts)
		m.mu.RUnlock()

		if alertCount != 0 {
			t.Errorf("expected no alerts for unsupported type, got %d", alertCount)
		}
	})

	t.Run("suppresses alerts with pulse-no-alerts tag", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Pre-create an alert
		m.mu.Lock()
		m.activeAlerts["vm100-cpu"] = &Alert{
			ID: "vm100-cpu",
			ResourceID: "vm100",
			Type: "cpu",
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			CPU: 0.95,
			Tags: []string{"pulse-no-alerts"},
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		_, exists := m.activeAlerts["vm100-cpu"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected alert to be suppressed with pulse-no-alerts tag")
		}
	})

	t.Run("stopped guest triggers powered-off check", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Pre-set confirmation count to trigger alert
		m.mu.Lock()
		m.offlineConfirmations["vm100"] = 2
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "stopped",
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		alert := m.activeAlerts["guest-powered-off-vm100"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected powered-off alert for stopped guest")
		}
	})

	t.Run("stopped guest with DisableAllGuestsOffline clears tracking", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.DisableAllGuestsOffline = true
		m.offlineConfirmations["vm100"] = 5
		m.activeAlerts["guest-powered-off-vm100"] = &Alert{
			ID: "guest-powered-off-vm100",
			ResourceID: "vm100",
			Type: "powered-off",
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "stopped",
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		_, alertExists := m.activeAlerts["guest-powered-off-vm100"]
		_, countExists := m.offlineConfirmations["vm100"]
		m.mu.RUnlock()

		if alertExists {
			t.Error("expected powered-off alert to be cleared")
		}
		if countExists {
			t.Error("expected offline count to be cleared")
		}
	})

	t.Run("paused guest clears powered-off alert", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["guest-powered-off-vm100"] = &Alert{
			ID: "guest-powered-off-vm100",
			ResourceID: "vm100",
			Type: "powered-off",
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "paused",
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		_, exists := m.activeAlerts["guest-powered-off-vm100"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected powered-off alert to be cleared for paused guest")
		}
	})

	t.Run("non-running guest clears metric alerts", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["vm100-cpu"] = &Alert{
			ID: "vm100-cpu",
			ResourceID: "vm100",
			Type: "cpu",
		}
		m.activeAlerts["vm100-memory"] = &Alert{
			ID: "vm100-memory",
			ResourceID: "vm100",
			Type: "memory",
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "stopped",
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		_, cpuExists := m.activeAlerts["vm100-cpu"]
		_, memExists := m.activeAlerts["vm100-memory"]
		m.mu.RUnlock()

		if cpuExists {
			t.Error("expected cpu alert to be cleared for non-running guest")
		}
		if memExists {
			t.Error("expected memory alert to be cleared for non-running guest")
		}
	})

	t.Run("running guest clears powered-off alert", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["guest-powered-off-vm100"] = &Alert{
			ID: "guest-powered-off-vm100",
			ResourceID: "vm100",
			Type: "powered-off",
		}
		m.offlineConfirmations["vm100"] = 5
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		_, exists := m.activeAlerts["guest-powered-off-vm100"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected powered-off alert to be cleared for running guest")
		}
	})

	t.Run("disabled thresholds clear existing alerts", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["vm100-cpu"] = &Alert{
			ID: "vm100-cpu",
			ResourceID: "vm100",
			Type: "cpu",
		}
		m.config.Overrides = map[string]ThresholdConfig{
			"vm100": {Disabled: true},
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		_, exists := m.activeAlerts["vm100-cpu"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected alert to be cleared when guest has alerts disabled")
		}
	})

	t.Run("checks memory metric", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.GuestDefaults = ThresholdConfig{
			Memory: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			Memory: models.Memory{Usage: 95.0},
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		alert := m.activeAlerts["vm100-memory"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected memory alert")
		}
	})

	t.Run("checks disk metric", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.GuestDefaults = ThresholdConfig{
			Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			Disk: models.Disk{Usage: 95.0},
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		alert := m.activeAlerts["vm100-disk"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected disk alert")
		}
	})

	t.Run("checks individual disks", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.GuestDefaults = ThresholdConfig{
			Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			Disks: []models.Disk{
				{Mountpoint: "/", Usage: 95.0, Total: 100},
				{Mountpoint: "/data", Usage: 50.0, Total: 100},
			},
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		// Check that alert for high disk was created
		var foundDiskAlert bool
		for alertID := range m.activeAlerts {
			if strings.Contains(alertID, "vm100-disk-") {
				foundDiskAlert = true
				break
			}
		}
		m.mu.RUnlock()

		if !foundDiskAlert {
			t.Fatal("expected individual disk alert")
		}
	})

	t.Run("skips disk with zero total", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.GuestDefaults = ThresholdConfig{
			Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			Disks: []models.Disk{
				{Mountpoint: "/", Usage: 95.0, Total: 0}, // Zero total - should skip
			},
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		var foundDiskAlert bool
		for alertID := range m.activeAlerts {
			if strings.Contains(alertID, "vm100-disk-") {
				foundDiskAlert = true
				break
			}
		}
		m.mu.RUnlock()

		if foundDiskAlert {
			t.Error("expected no disk alert for disk with zero total")
		}
	})

	t.Run("skips disk with negative usage", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.GuestDefaults = ThresholdConfig{
			Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			Disks: []models.Disk{
				{Mountpoint: "/", Usage: -1.0, Total: 100}, // Negative usage - should skip
			},
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		var foundDiskAlert bool
		for alertID := range m.activeAlerts {
			if strings.Contains(alertID, "vm100-disk-") {
				foundDiskAlert = true
				break
			}
		}
		m.mu.RUnlock()

		if foundDiskAlert {
			t.Error("expected no disk alert for disk with negative usage")
		}
	})

	t.Run("checks diskRead metric", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.GuestDefaults = ThresholdConfig{
			DiskRead: &HysteresisThreshold{Trigger: 100.0, Clear: 80.0}, // MB/s
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			DiskRead: 200 * 1024 * 1024, // 200 MB/s in bytes
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		alert := m.activeAlerts["vm100-diskRead"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected diskRead alert")
		}
	})

	t.Run("checks diskWrite metric", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.GuestDefaults = ThresholdConfig{
			DiskWrite: &HysteresisThreshold{Trigger: 100.0, Clear: 80.0}, // MB/s
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			DiskWrite: 200 * 1024 * 1024, // 200 MB/s in bytes
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		alert := m.activeAlerts["vm100-diskWrite"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected diskWrite alert")
		}
	})

	t.Run("checks networkIn metric", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.GuestDefaults = ThresholdConfig{
			NetworkIn: &HysteresisThreshold{Trigger: 100.0, Clear: 80.0}, // MB/s
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			NetworkIn: 200 * 1024 * 1024, // 200 MB/s in bytes
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		alert := m.activeAlerts["vm100-networkIn"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected networkIn alert")
		}
	})

	t.Run("checks networkOut metric", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.GuestDefaults = ThresholdConfig{
			NetworkOut: &HysteresisThreshold{Trigger: 100.0, Clear: 80.0}, // MB/s
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			NetworkOut: 200 * 1024 * 1024, // 200 MB/s in bytes
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		alert := m.activeAlerts["vm100-networkOut"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected networkOut alert")
		}
	})

	t.Run("applies relaxed thresholds with pulse-relaxed tag", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.GuestDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		// CPU at 90% - would trigger normally but relaxed threshold is 95%
		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			CPU: 0.90, // 90%
			Tags: []string{"pulse-relaxed"},
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		_, exists := m.activeAlerts["vm100-cpu"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected no alert due to relaxed thresholds")
		}
	})

	t.Run("disk uses device as label fallback", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.GuestDefaults = ThresholdConfig{
			Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		vm := models.VM{
			ID: "vm100",
			Name: "TestVM",
			Node: "node1",
			Status: "running",
			Disks: []models.Disk{
				{Device: "sda1", Usage: 95.0, Total: 100}, // No mountpoint, has device
			},
		}

		m.CheckGuest(vm, "pve1")

		m.mu.RLock()
		var foundDiskAlert bool
		for alertID := range m.activeAlerts {
			if strings.Contains(alertID, "vm100-disk-") {
				foundDiskAlert = true
				break
			}
		}
		m.mu.RUnlock()

		if !foundDiskAlert {
			t.Fatal("expected disk alert using device as label")
		}
	})

	t.Run("disk uses index as label when no mountpoint or device", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
|
|
m.config.GuestDefaults = ThresholdConfig{
|
|
Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
vm := models.VM{
|
|
ID: "vm100",
|
|
Name: "TestVM",
|
|
Node: "node1",
|
|
Status: "running",
|
|
Disks: []models.Disk{
|
|
{Usage: 95.0, Total: 100}, // No mountpoint or device
|
|
},
|
|
}
|
|
|
|
m.CheckGuest(vm, "pve1")
|
|
|
|
m.mu.RLock()
|
|
var foundDiskAlert bool
|
|
for alertID := range m.activeAlerts {
|
|
if strings.Contains(alertID, "vm100-disk-") {
|
|
foundDiskAlert = true
|
|
break
|
|
}
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
if !foundDiskAlert {
|
|
t.Fatal("expected disk alert using index as label")
|
|
}
|
|
})
|
|
}
|
|
|
|
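// TestCheckHostComprehensive covers CheckHost: early returns for empty host IDs
// and disabled alerting, override and nil-threshold handling, RAID health states,
// per-disk checks, offline-alert clearing, and tag metadata propagation.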
func TestCheckHostComprehensive(t *testing.T) {
	// t.Parallel()

	t.Run("returns early for empty host ID", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.HostDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		host := models.Host{
			ID: "",
			CPUUsage: 95.0,
		}

		m.CheckHost(host)

		m.mu.RLock()
		alertCount := len(m.activeAlerts)
		m.mu.RUnlock()

		if alertCount != 0 {
			t.Errorf("expected no alerts for empty host ID, got %d", alertCount)
		}
	})

	t.Run("returns early when alerts disabled", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)
		m.mu.Lock()
		m.config.Enabled = false
		m.mu.Unlock()

		host := models.Host{
			ID: "host1",
			CPUUsage: 95.0,
		}

		m.CheckHost(host)

		m.mu.RLock()
		alertCount := len(m.activeAlerts)
		m.mu.RUnlock()

		if alertCount != 0 {
			t.Errorf("expected no alerts when disabled, got %d", alertCount)
		}
	})

	t.Run("DisableAllHosts clears existing alerts", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["host:host1-cpu"] = &Alert{ID: "host:host1-cpu", ResourceID: "host:host1", Type: "cpu"}
		m.activeAlerts["host:host1-memory"] = &Alert{ID: "host:host1-memory", ResourceID: "host:host1", Type: "memory"}
		m.config.DisableAllHosts = true
		m.mu.Unlock()

		host := models.Host{
			ID: "host1",
			CPUUsage: 95.0,
		}

		m.CheckHost(host)

		m.mu.RLock()
		_, cpuExists := m.activeAlerts["host:host1-cpu"]
		_, memExists := m.activeAlerts["host:host1-memory"]
		m.mu.RUnlock()

		if cpuExists {
			t.Error("expected cpu alert to be cleared")
		}
		if memExists {
			t.Error("expected memory alert to be cleared")
		}
	})

	t.Run("override with Disabled clears alerts", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["host:host1-cpu"] = &Alert{ID: "host:host1-cpu", ResourceID: "host:host1", Type: "cpu"}
		m.config.Overrides = map[string]ThresholdConfig{
			"host1": {Disabled: true},
		}
		m.mu.Unlock()

		host := models.Host{
			ID: "host1",
			CPUUsage: 95.0,
		}

		m.CheckHost(host)

		m.mu.RLock()
		_, exists := m.activeAlerts["host:host1-cpu"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected alert to be cleared when host has alerts disabled")
		}
	})

	t.Run("clears CPU alerts when threshold nil", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["host:host1-cpu"] = &Alert{ID: "host:host1-cpu", ResourceID: "host:host1", Type: "cpu"}
		m.config.HostDefaults = ThresholdConfig{
			CPU: nil, // No CPU threshold
		}
		m.mu.Unlock()

		host := models.Host{
			ID: "host1",
			CPUUsage: 95.0,
		}

		m.CheckHost(host)

		m.mu.RLock()
		_, exists := m.activeAlerts["host:host1-cpu"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected CPU alert to be cleared when threshold is nil")
		}
	})

	t.Run("clears memory alerts when threshold nil", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["host:host1-memory"] = &Alert{ID: "host:host1-memory", ResourceID: "host:host1", Type: "memory"}
		m.config.HostDefaults = ThresholdConfig{
			Memory: nil, // No memory threshold
		}
		m.mu.Unlock()

		host := models.Host{
			ID: "host1",
			Memory: models.Memory{
				Usage: 95.0,
			},
		}

		m.CheckHost(host)

		m.mu.RLock()
		_, exists := m.activeAlerts["host:host1-memory"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected memory alert to be cleared when threshold is nil")
		}
	})

	t.Run("clears disk alerts when threshold nil", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		// Disk alert ID format: {resourceID}-disk where resourceID is host:hostID/disk:mountpoint
		alertID := "host:host1/disk:/-disk"
		m.mu.Lock()
		m.activeAlerts[alertID] = &Alert{ID: alertID, ResourceID: "host:host1/disk:/", Type: "disk"}
		m.config.HostDefaults = ThresholdConfig{
			Disk: nil, // No disk threshold
		}
		m.mu.Unlock()

		host := models.Host{
			ID: "host1",
			Disks: []models.Disk{
				{Mountpoint: "/", Usage: 95.0, Total: 100},
			},
		}

		m.CheckHost(host)

		m.mu.RLock()
		_, exists := m.activeAlerts[alertID]
		m.mu.RUnlock()

		if exists {
			t.Error("expected disk alert to be cleared when threshold is nil")
		}
	})

	t.Run("RAID degraded creates critical alert", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		host := models.Host{
			ID: "host1",
			Hostname: "testhost",
			RAID: []models.HostRAIDArray{
				{
					Device: "/dev/md2", // Note: md0/md1 are skipped for Synology compatibility
					Level: "raid1",
					State: "degraded",
					TotalDevices: 2,
					ActiveDevices: 1,
					FailedDevices: 1,
				},
			},
		}

		m.CheckHost(host)

		m.mu.RLock()
		alert := m.activeAlerts["host-host1-raid-md2"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected RAID degraded alert")
		}
		if alert.Level != AlertLevelCritical {
			t.Errorf("expected critical level, got %s", alert.Level)
		}
	})

	t.Run("RAID rebuilding creates warning alert", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		host := models.Host{
			ID: "host1",
			Hostname: "testhost",
			RAID: []models.HostRAIDArray{
				{
					Device: "/dev/md2", // Note: md0/md1 are skipped for Synology compatibility
					Level: "raid1",
					State: "recovering",
					TotalDevices: 2,
					ActiveDevices: 2,
					FailedDevices: 0,
					RebuildPercent: 50.0,
				},
			},
		}

		m.CheckHost(host)

		m.mu.RLock()
		alert := m.activeAlerts["host-host1-raid-md2"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected RAID rebuilding alert")
		}
		if alert.Level != AlertLevelWarning {
			t.Errorf("expected warning level, got %s", alert.Level)
		}
	})

	t.Run("RAID healthy clears alert", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["host-host1-raid-md2"] = &Alert{
			ID: "host-host1-raid-md2",
			Type: "raid",
			Level: AlertLevelCritical,
		}
		m.mu.Unlock()

		host := models.Host{
			ID: "host1",
			Hostname: "testhost",
			RAID: []models.HostRAIDArray{
				{
					Device: "/dev/md2", // Note: md0/md1 are skipped for Synology compatibility
					Level: "raid1",
					State: "active",
					TotalDevices: 2,
					ActiveDevices: 2,
					FailedDevices: 0,
				},
			},
		}

		m.CheckHost(host)

		m.mu.RLock()
		_, exists := m.activeAlerts["host-host1-raid-md2"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected RAID alert to be cleared for healthy array")
		}
	})

	t.Run("RAID with failed devices triggers degraded", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		host := models.Host{
			ID: "host1",
			Hostname: "testhost",
			RAID: []models.HostRAIDArray{
				{
					Device: "/dev/md2", // Note: md0/md1 are skipped for Synology compatibility
					Level: "raid1",
					State: "active", // State might say active but with failed devices
					TotalDevices: 2,
					ActiveDevices: 1,
					FailedDevices: 1, // This triggers degraded alert
				},
			},
		}

		m.CheckHost(host)

		m.mu.RLock()
		alert := m.activeAlerts["host-host1-raid-md2"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected RAID alert for failed devices")
		}
		if alert.Level != AlertLevelCritical {
			t.Errorf("expected critical level for failed devices, got %s", alert.Level)
		}
	})

	t.Run("RAID resync triggers rebuilding alert", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		host := models.Host{
			ID: "host1",
			Hostname: "testhost",
			RAID: []models.HostRAIDArray{
				{
					Device: "/dev/md2", // Note: md0/md1 are skipped for Synology compatibility
					Level: "raid1",
					State: "resync",
					TotalDevices: 2,
					ActiveDevices: 2,
					FailedDevices: 0,
				},
			},
		}

		m.CheckHost(host)

		m.mu.RLock()
		alert := m.activeAlerts["host-host1-raid-md2"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected RAID rebuilding alert for resync")
		}
		if alert.Level != AlertLevelWarning {
			t.Errorf("expected warning level for resync, got %s", alert.Level)
		}
	})

	t.Run("existing RAID alert not duplicated", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		originalTime := time.Now().Add(-1 * time.Hour)
		m.mu.Lock()
		m.activeAlerts["host-host1-raid-md2"] = &Alert{
			ID: "host-host1-raid-md2",
			Type: "raid",
			Level: AlertLevelCritical,
			StartTime: originalTime,
		}
		m.mu.Unlock()

		host := models.Host{
			ID: "host1",
			Hostname: "testhost",
			RAID: []models.HostRAIDArray{
				{
					Device: "/dev/md2", // Note: md0/md1 are skipped for Synology compatibility
					Level: "raid1",
					State: "degraded",
					TotalDevices: 2,
					ActiveDevices: 1,
					FailedDevices: 1,
				},
			},
		}

		m.CheckHost(host)

		m.mu.RLock()
		alert := m.activeAlerts["host-host1-raid-md2"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected RAID alert to still exist")
		}
		// The alert should preserve its original start time
		if !alert.StartTime.Equal(originalTime) {
			t.Error("expected alert start time to be preserved")
		}
	})

	t.Run("applies override thresholds", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.HostDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.config.Overrides = map[string]ThresholdConfig{
			"host1": {
				CPU: &HysteresisThreshold{Trigger: 99.0, Clear: 95.0}, // Higher threshold
			},
		}
		m.mu.Unlock()

		host := models.Host{
			ID: "host1",
			Hostname: "testhost",
			CPUUsage: 95.0, // Below override trigger
		}

		m.CheckHost(host)

		m.mu.RLock()
		_, exists := m.activeAlerts["host:host1-cpu"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected no alert due to higher override threshold")
		}
	})

	t.Run("checks multiple disks", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.HostDefaults = ThresholdConfig{
			Disk: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		host := models.Host{
			ID: "host1",
			Hostname: "testhost",
			Disks: []models.Disk{
				{Mountpoint: "/", Usage: 95.0, Total: 100},
				{Mountpoint: "/data", Usage: 50.0, Total: 100}, // Below threshold
			},
		}

		m.CheckHost(host)

		m.mu.RLock()
		var diskAlertCount int
		for alertID := range m.activeAlerts {
			// Disk alert ID format: host:hostID/disk:label-disk
			if strings.Contains(alertID, "host:host1/disk:") {
				diskAlertCount++
			}
		}
		m.mu.RUnlock()

		if diskAlertCount != 1 {
			t.Errorf("expected 1 disk alert, got %d", diskAlertCount)
		}
	})

	t.Run("clears offline alert when host comes online", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		resourceKey := hostResourceID("host1")
		m.mu.Lock()
		m.activeAlerts["host-offline-host1"] = &Alert{
			ID: "host-offline-host1",
			Type: "connectivity",
		}
		m.offlineConfirmations[resourceKey] = 5
		m.mu.Unlock()

		host := models.Host{
			ID: "host1",
			Hostname: "testhost",
		}

		m.CheckHost(host)

		m.mu.RLock()
		_, alertExists := m.activeAlerts["host-offline-host1"]
		_, countExists := m.offlineConfirmations[resourceKey]
		m.mu.RUnlock()

		if alertExists {
			t.Error("expected offline alert to be cleared")
		}
		if countExists {
			t.Error("expected offline count to be cleared")
		}
	})

	t.Run("includes tags in metadata", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.HostDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		host := models.Host{
			ID: "host1",
			Hostname: "testhost",
			CPUUsage: 95.0,
			Tags: []string{"production", "critical"},
		}

		m.CheckHost(host)

		m.mu.RLock()
		alert := m.activeAlerts["host:host1-cpu"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected CPU alert")
		}
		if alert.Metadata == nil {
			t.Fatal("expected metadata in alert")
		}
		tags, ok := alert.Metadata["tags"].([]string)
		if !ok || len(tags) != 2 {
			t.Error("expected tags in metadata")
		}
	})
}

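// TestCheckPBSComprehensive covers CheckPBS: disabled alerting, per-instance
// overrides, CPU and memory thresholds while online, skipped metrics while
// offline, and offline/connection-health alert creation and clearing.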
func TestCheckPBSComprehensive(t *testing.T) {
	// t.Parallel()

	t.Run("returns early when alerts disabled", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.Enabled = false
		m.mu.Unlock()

		pbs := models.PBSInstance{
			ID: "pbs1",
			Name: "testpbs",
			CPU: 95.0,
		}

		m.CheckPBS(pbs)

		m.mu.RLock()
		alertCount := len(m.activeAlerts)
		m.mu.RUnlock()

		if alertCount != 0 {
			t.Errorf("expected no alerts when disabled, got %d", alertCount)
		}
	})

	t.Run("DisableAllPBS clears existing alerts", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["pbs1-cpu"] = &Alert{ID: "pbs1-cpu", Type: "cpu"}
		m.activeAlerts["pbs1-memory"] = &Alert{ID: "pbs1-memory", Type: "memory"}
		m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1", Type: "connectivity"}
		m.offlineConfirmations["pbs1"] = 3
		m.config.DisableAllPBS = true
		m.mu.Unlock()

		pbs := models.PBSInstance{
			ID: "pbs1",
			Name: "testpbs",
		}

		m.CheckPBS(pbs)

		m.mu.RLock()
		_, cpuExists := m.activeAlerts["pbs1-cpu"]
		_, memExists := m.activeAlerts["pbs1-memory"]
		_, offlineExists := m.activeAlerts["pbs-offline-pbs1"]
		_, confirmExists := m.offlineConfirmations["pbs1"]
		m.mu.RUnlock()

		if cpuExists {
			t.Error("expected CPU alert to be cleared")
		}
		if memExists {
			t.Error("expected memory alert to be cleared")
		}
		if offlineExists {
			t.Error("expected offline alert to be cleared")
		}
		if confirmExists {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("override with Disabled clears alerts", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["pbs1-cpu"] = &Alert{ID: "pbs1-cpu", Type: "cpu"}
		m.activeAlerts["pbs1-memory"] = &Alert{ID: "pbs1-memory", Type: "memory"}
		m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1", Type: "connectivity"}
		m.offlineConfirmations["pbs1"] = 3
		m.config.Overrides = map[string]ThresholdConfig{
			"pbs1": {Disabled: true},
		}
		m.mu.Unlock()

		pbs := models.PBSInstance{
			ID: "pbs1",
			Name: "testpbs",
		}

		m.CheckPBS(pbs)

		m.mu.RLock()
		_, cpuExists := m.activeAlerts["pbs1-cpu"]
		_, memExists := m.activeAlerts["pbs1-memory"]
		_, offlineExists := m.activeAlerts["pbs-offline-pbs1"]
		_, confirmExists := m.offlineConfirmations["pbs1"]
		m.mu.RUnlock()

		if cpuExists {
			t.Error("expected CPU alert to be cleared")
		}
		if memExists {
			t.Error("expected memory alert to be cleared")
		}
		if offlineExists {
			t.Error("expected offline alert to be cleared")
		}
		if confirmExists {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("DisableAllPBSOffline clears offline alert", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1", Type: "connectivity"}
		m.offlineConfirmations["pbs1"] = 3
		m.config.DisableAllPBSOffline = true
		m.mu.Unlock()

		pbs := models.PBSInstance{
			ID: "pbs1",
			Name: "testpbs",
			Status: "offline",
		}

		m.CheckPBS(pbs)

		m.mu.RLock()
		_, offlineExists := m.activeAlerts["pbs-offline-pbs1"]
		_, confirmExists := m.offlineConfirmations["pbs1"]
		m.mu.RUnlock()

		if offlineExists {
			t.Error("expected offline alert to be cleared when DisableAllPBSOffline is true")
		}
		if confirmExists {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("checks CPU threshold when online", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.NodeDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		pbs := models.PBSInstance{
			ID: "pbs1",
			Name: "testpbs",
			Host: "pbshost",
			Status: "online",
			CPU: 95.0,
		}

		m.CheckPBS(pbs)

		m.mu.RLock()
		alert := m.activeAlerts["pbs1-cpu"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected CPU alert")
		}
	})

	t.Run("checks memory threshold when online", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.NodeDefaults = ThresholdConfig{
			Memory: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		pbs := models.PBSInstance{
			ID: "pbs1",
			Name: "testpbs",
			Host: "pbshost",
			Status: "online",
			Memory: 95.0,
		}

		m.CheckPBS(pbs)

		m.mu.RLock()
		alert := m.activeAlerts["pbs1-memory"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected memory alert")
		}
	})

	t.Run("skips metrics when PBS is offline", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.NodeDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
			Memory: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.mu.Unlock()

		pbs := models.PBSInstance{
			ID: "pbs1",
			Name: "testpbs",
			Status: "offline",
			CPU: 95.0,
			Memory: 95.0,
		}

		m.CheckPBS(pbs)

		m.mu.RLock()
		_, cpuExists := m.activeAlerts["pbs1-cpu"]
		_, memExists := m.activeAlerts["pbs1-memory"]
		m.mu.RUnlock()

		if cpuExists {
			t.Error("expected no CPU alert when offline")
		}
		if memExists {
			t.Error("expected no memory alert when offline")
		}
	})

	t.Run("applies override thresholds", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.NodeDefaults = ThresholdConfig{
			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 70.0},
		}
		m.config.Overrides = map[string]ThresholdConfig{
			"pbs1": {
				CPU: &HysteresisThreshold{Trigger: 99.0, Clear: 95.0}, // Higher threshold
			},
		}
		m.mu.Unlock()

		pbs := models.PBSInstance{
			ID: "pbs1",
			Name: "testpbs",
			Status: "online",
			CPU: 95.0, // Below override trigger
		}

		m.CheckPBS(pbs)

		m.mu.RLock()
		_, exists := m.activeAlerts["pbs1-cpu"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected no alert due to higher override threshold")
		}
	})

	t.Run("checks offline status", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		// Pre-populate confirmation count to bypass waiting period
		m.offlineConfirmations["pbs1"] = 2
		m.mu.Unlock()

		pbs := models.PBSInstance{
			ID: "pbs1",
			Name: "testpbs",
			Status: "offline",
		}

		m.CheckPBS(pbs)

		m.mu.RLock()
		alert := m.activeAlerts["pbs-offline-pbs1"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected offline alert")
		}
		if alert.Type != "offline" {
			t.Errorf("expected offline type, got %s", alert.Type)
		}
	})

	t.Run("checks connection health error", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		// Pre-populate confirmation count to bypass waiting period
		m.offlineConfirmations["pbs1"] = 2
		m.mu.Unlock()

		pbs := models.PBSInstance{
			ID: "pbs1",
			Name: "testpbs",
			Status: "online",
			ConnectionHealth: "error",
		}

		m.CheckPBS(pbs)

		m.mu.RLock()
		alert := m.activeAlerts["pbs-offline-pbs1"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected offline alert for connection health error")
		}
	})

	t.Run("checks connection health unhealthy", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		// Pre-populate confirmation count to bypass waiting period
		m.offlineConfirmations["pbs1"] = 2
		m.mu.Unlock()

		pbs := models.PBSInstance{
			ID: "pbs1",
			Name: "testpbs",
			Status: "online",
			ConnectionHealth: "unhealthy",
		}

		m.CheckPBS(pbs)

		m.mu.RLock()
		alert := m.activeAlerts["pbs-offline-pbs1"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected offline alert for connection health unhealthy")
		}
	})

	t.Run("clears offline alert when back online", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1", Type: "connectivity"}
		m.offlineConfirmations["pbs1"] = 5
		m.mu.Unlock()

		pbs := models.PBSInstance{
			ID: "pbs1",
			Name: "testpbs",
			Status: "online",
			ConnectionHealth: "healthy",
		}

		m.CheckPBS(pbs)

		m.mu.RLock()
		_, offlineExists := m.activeAlerts["pbs-offline-pbs1"]
		_, confirmExists := m.offlineConfirmations["pbs1"]
		m.mu.RUnlock()

		if offlineExists {
			t.Error("expected offline alert to be cleared when back online")
		}
		if confirmExists {
			t.Error("expected offline confirmation to be cleared")
		}
	})
}

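// TestCheckPMGComprehensive covers CheckPMG: disabled alerting, per-instance
// overrides, offline and connection-health alerts, and skipping queue metrics
// while the instance is offline.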
func TestCheckPMGComprehensive(t *testing.T) {
	// t.Parallel()

	t.Run("returns early when alerts disabled", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.Enabled = false
		m.mu.Unlock()

		pmg := models.PMGInstance{
			ID: "pmg1",
			Name: "testpmg",
		}

		m.CheckPMG(pmg)

		m.mu.RLock()
		alertCount := len(m.activeAlerts)
		m.mu.RUnlock()

		if alertCount != 0 {
			t.Errorf("expected no alerts when disabled, got %d", alertCount)
		}
	})

	t.Run("DisableAllPMG clears existing alerts", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["pmg1-queue-total"] = &Alert{ID: "pmg1-queue-total", Type: "queue-total"}
		m.activeAlerts["pmg1-queue-deferred"] = &Alert{ID: "pmg1-queue-deferred", Type: "queue-deferred"}
		m.activeAlerts["pmg1-queue-hold"] = &Alert{ID: "pmg1-queue-hold", Type: "queue-hold"}
		m.activeAlerts["pmg1-oldest-message"] = &Alert{ID: "pmg1-oldest-message", Type: "oldest-message"}
		m.activeAlerts["pmg-offline-pmg1"] = &Alert{ID: "pmg-offline-pmg1", Type: "connectivity"}
		m.offlineConfirmations["pmg1"] = 3
		m.config.DisableAllPMG = true
		m.mu.Unlock()

		pmg := models.PMGInstance{
			ID: "pmg1",
			Name: "testpmg",
		}

		m.CheckPMG(pmg)

		m.mu.RLock()
		_, queueTotalExists := m.activeAlerts["pmg1-queue-total"]
		_, queueDeferredExists := m.activeAlerts["pmg1-queue-deferred"]
		_, queueHoldExists := m.activeAlerts["pmg1-queue-hold"]
		_, oldestMsgExists := m.activeAlerts["pmg1-oldest-message"]
		_, offlineExists := m.activeAlerts["pmg-offline-pmg1"]
		_, confirmExists := m.offlineConfirmations["pmg1"]
		m.mu.RUnlock()

		if queueTotalExists {
			t.Error("expected queue-total alert to be cleared")
		}
		if queueDeferredExists {
			t.Error("expected queue-deferred alert to be cleared")
		}
		if queueHoldExists {
			t.Error("expected queue-hold alert to be cleared")
		}
		if oldestMsgExists {
			t.Error("expected oldest-message alert to be cleared")
		}
		if offlineExists {
			t.Error("expected offline alert to be cleared")
		}
		if confirmExists {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("override with Disabled clears alerts", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["pmg1-queue-total"] = &Alert{ID: "pmg1-queue-total", Type: "queue-total"}
		m.activeAlerts["pmg1-oldest-message"] = &Alert{ID: "pmg1-oldest-message", Type: "oldest-message"}
		m.activeAlerts["pmg-offline-pmg1"] = &Alert{ID: "pmg-offline-pmg1", Type: "connectivity"}
		m.offlineConfirmations["pmg1"] = 3
		m.config.Overrides = map[string]ThresholdConfig{
			"pmg1": {Disabled: true},
		}
		m.mu.Unlock()

		pmg := models.PMGInstance{
			ID: "pmg1",
			Name: "testpmg",
		}

		m.CheckPMG(pmg)

		m.mu.RLock()
		_, queueExists := m.activeAlerts["pmg1-queue-total"]
		_, oldestExists := m.activeAlerts["pmg1-oldest-message"]
		_, offlineExists := m.activeAlerts["pmg-offline-pmg1"]
		_, confirmExists := m.offlineConfirmations["pmg1"]
		m.mu.RUnlock()

		if queueExists {
			t.Error("expected queue alert to be cleared")
		}
		if oldestExists {
			t.Error("expected oldest-message alert to be cleared")
		}
		if offlineExists {
			t.Error("expected offline alert to be cleared")
		}
		if confirmExists {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("DisableAllPMGOffline clears offline alert", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["pmg-offline-pmg1"] = &Alert{ID: "pmg-offline-pmg1", Type: "connectivity"}
		m.offlineConfirmations["pmg1"] = 3
		m.config.DisableAllPMGOffline = true
		m.mu.Unlock()

		pmg := models.PMGInstance{
			ID: "pmg1",
			Name: "testpmg",
			Status: "offline",
		}

		m.CheckPMG(pmg)

		m.mu.RLock()
		_, offlineExists := m.activeAlerts["pmg-offline-pmg1"]
		_, confirmExists := m.offlineConfirmations["pmg1"]
		m.mu.RUnlock()

		if offlineExists {
			t.Error("expected offline alert to be cleared when DisableAllPMGOffline is true")
		}
		if confirmExists {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("checks offline status", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		// Pre-populate confirmation count to bypass waiting period (3 required)
		m.offlineConfirmations["pmg1"] = 2
		m.mu.Unlock()

		pmg := models.PMGInstance{
			ID: "pmg1",
			Name: "testpmg",
			Status: "offline",
		}

		m.CheckPMG(pmg)

		m.mu.RLock()
		alert := m.activeAlerts["pmg-offline-pmg1"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected offline alert")
		}
		if alert.Type != "offline" {
			t.Errorf("expected offline type, got %s", alert.Type)
		}
	})

	t.Run("checks connection health error", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		// Pre-populate confirmation count to bypass waiting period
		m.offlineConfirmations["pmg1"] = 2
		m.mu.Unlock()

		pmg := models.PMGInstance{
			ID: "pmg1",
			Name: "testpmg",
			Status: "online",
			ConnectionHealth: "error",
		}

		m.CheckPMG(pmg)

		m.mu.RLock()
		alert := m.activeAlerts["pmg-offline-pmg1"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected offline alert for connection health error")
		}
	})

	t.Run("checks connection health unhealthy", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		// Pre-populate confirmation count to bypass waiting period
		m.offlineConfirmations["pmg1"] = 2
		m.mu.Unlock()

		pmg := models.PMGInstance{
			ID: "pmg1",
			Name: "testpmg",
			Status: "online",
			ConnectionHealth: "unhealthy",
		}

		m.CheckPMG(pmg)

		m.mu.RLock()
		alert := m.activeAlerts["pmg-offline-pmg1"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected offline alert for connection health unhealthy")
		}
	})

	t.Run("clears offline alert when back online", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["pmg-offline-pmg1"] = &Alert{ID: "pmg-offline-pmg1", Type: "connectivity"}
		m.offlineConfirmations["pmg1"] = 5
		m.mu.Unlock()

		pmg := models.PMGInstance{
			ID: "pmg1",
			Name: "testpmg",
			Status: "online",
			ConnectionHealth: "healthy",
		}

		m.CheckPMG(pmg)

		m.mu.RLock()
		_, offlineExists := m.activeAlerts["pmg-offline-pmg1"]
		_, confirmExists := m.offlineConfirmations["pmg1"]
		m.mu.RUnlock()

		if offlineExists {
			t.Error("expected offline alert to be cleared when back online")
		}
		if confirmExists {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("skips metrics when PMG is offline", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		pmg := models.PMGInstance{
			ID: "pmg1",
			Name: "testpmg",
			Status: "offline",
		}

		m.CheckPMG(pmg)

		m.mu.RLock()
		var queueAlertCount int
		for alertID := range m.activeAlerts {
			if strings.Contains(alertID, "pmg1-queue") || strings.Contains(alertID, "pmg1-oldest") {
				queueAlertCount++
			}
		}
		m.mu.RUnlock()

		if queueAlertCount != 0 {
			t.Error("expected no queue alerts when offline")
		}
	})
}

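// TestCheckStorageComprehensive covers CheckStorage: disabled alerting,
// usage-threshold and override handling (including legacy shared-storage
// override IDs), and offline/unavailable status alerts and their clearing.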
func TestCheckStorageComprehensive(t *testing.T) {
	// t.Parallel()

	t.Run("returns early when alerts disabled", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.Enabled = false
		m.mu.Unlock()

		storage := models.Storage{
			ID: "storage1",
			Name: "teststorage",
			Status: "active",
			Usage: 95.0,
		}

		m.CheckStorage(storage)

		m.mu.RLock()
		alertCount := len(m.activeAlerts)
		m.mu.RUnlock()

		if alertCount != 0 {
			t.Errorf("expected no alerts when disabled, got %d", alertCount)
		}
	})

	t.Run("DisableAllStorage clears existing alerts", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["storage1-usage"] = &Alert{ID: "storage1-usage", Type: "usage"}
		m.activeAlerts["storage-offline-storage1"] = &Alert{ID: "storage-offline-storage1", Type: "connectivity"}
		m.config.DisableAllStorage = true
		m.mu.Unlock()

		storage := models.Storage{
			ID: "storage1",
			Name: "teststorage",
			Status: "active",
		}

		m.CheckStorage(storage)

		m.mu.RLock()
		_, usageExists := m.activeAlerts["storage1-usage"]
		_, offlineExists := m.activeAlerts["storage-offline-storage1"]
		m.mu.RUnlock()

		if usageExists {
			t.Error("expected usage alert to be cleared")
		}
		if offlineExists {
			t.Error("expected offline alert to be cleared")
		}
	})

	t.Run("override with Disabled clears alerts", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["storage1-usage"] = &Alert{ID: "storage1-usage", Type: "usage"}
		m.activeAlerts["storage-offline-storage1"] = &Alert{ID: "storage-offline-storage1", Type: "connectivity"}
		m.config.Overrides = map[string]ThresholdConfig{
			"storage1": {Disabled: true},
		}
		m.mu.Unlock()

		storage := models.Storage{
			ID: "storage1",
			Name: "teststorage",
			Status: "active",
		}

		m.CheckStorage(storage)

		m.mu.RLock()
		_, usageExists := m.activeAlerts["storage1-usage"]
		_, offlineExists := m.activeAlerts["storage-offline-storage1"]
		m.mu.RUnlock()

		if usageExists {
			t.Error("expected usage alert to be cleared")
		}
		if offlineExists {
			t.Error("expected offline alert to be cleared")
		}
	})

	t.Run("checks usage threshold", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.StorageDefault = HysteresisThreshold{Trigger: 80.0, Clear: 70.0}
		m.mu.Unlock()

		storage := models.Storage{
			ID: "storage1",
			Name: "teststorage",
			Node: "node1",
			Status: "active",
			Usage: 95.0,
		}

		m.CheckStorage(storage)

		m.mu.RLock()
		alert := m.activeAlerts["storage1-usage"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected usage alert")
		}
	})

	t.Run("applies override threshold", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.StorageDefault = HysteresisThreshold{Trigger: 80.0, Clear: 70.0}
		overrideThreshold := HysteresisThreshold{Trigger: 99.0, Clear: 95.0}
		m.config.Overrides = map[string]ThresholdConfig{
			"storage1": {Usage: &overrideThreshold},
		}
		m.mu.Unlock()

		storage := models.Storage{
			ID: "storage1",
			Name: "teststorage",
			Status: "active",
			Usage: 95.0, // Below override threshold
		}

		m.CheckStorage(storage)

		m.mu.RLock()
		_, exists := m.activeAlerts["storage1-usage"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected no alert due to higher override threshold")
		}
	})

	t.Run("applies legacy shared storage override threshold", func(t *testing.T) {
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.StorageDefault = HysteresisThreshold{Trigger: 80.0, Clear: 70.0}
		overrideThreshold := HysteresisThreshold{Trigger: 10.0, Clear: 5.0}
		m.config.Overrides = map[string]ThresholdConfig{
			"Main-pve1-ceph-pool": {Usage: &overrideThreshold},
		}
		m.mu.Unlock()

		storage := models.Storage{
			ID: "Main-cluster-ceph-pool",
			Name: "ceph-pool",
			Node: "cluster",
			Instance: "Main",
			Status: "available",
			Usage: 20.0,
			Shared: true,
			Nodes: []string{"pve1", "pve2"},
			NodeIDs: []string{"Main-pve1", "Main-pve2"},
		}

		m.CheckStorage(storage)

		m.mu.RLock()
		alert := m.activeAlerts["Main-cluster-ceph-pool-usage"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected usage alert when legacy shared-storage override matches canonical storage ID")
		}
	})

	t.Run("skips usage check when offline", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.StorageDefault = HysteresisThreshold{Trigger: 80.0, Clear: 70.0}
		m.mu.Unlock()

		storage := models.Storage{
			ID: "storage1",
			Name: "teststorage",
			Status: "offline",
			Usage: 95.0,
		}

		m.CheckStorage(storage)

		m.mu.RLock()
		_, exists := m.activeAlerts["storage1-usage"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected no usage alert when offline")
		}
	})

	t.Run("skips usage check when unavailable", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.StorageDefault = HysteresisThreshold{Trigger: 80.0, Clear: 70.0}
		m.mu.Unlock()

		storage := models.Storage{
			ID: "storage1",
			Name: "teststorage",
			Status: "unavailable",
			Usage: 95.0,
		}

		m.CheckStorage(storage)

		m.mu.RLock()
		_, exists := m.activeAlerts["storage1-usage"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected no usage alert when unavailable")
		}
	})

	t.Run("checks offline status", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		// Pre-populate confirmation count (requires 2)
		m.offlineConfirmations["storage1"] = 1
		m.mu.Unlock()

		storage := models.Storage{
			ID: "storage1",
			Name: "teststorage",
			Status: "offline",
		}

		m.CheckStorage(storage)

		m.mu.RLock()
		alert := m.activeAlerts["storage-offline-storage1"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected offline alert")
		}
	})

	t.Run("checks unavailable status", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		// Pre-populate confirmation count (requires 2)
		m.offlineConfirmations["storage1"] = 1
		m.mu.Unlock()

		storage := models.Storage{
			ID: "storage1",
			Name: "teststorage",
			Status: "unavailable",
		}

		m.CheckStorage(storage)

		m.mu.RLock()
		alert := m.activeAlerts["storage-offline-storage1"]
		m.mu.RUnlock()

		if alert == nil {
			t.Fatal("expected offline alert for unavailable status")
		}
	})

	t.Run("clears offline alert when back online", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.activeAlerts["storage-offline-storage1"] = &Alert{ID: "storage-offline-storage1", Type: "connectivity"}
		m.offlineConfirmations["storage1"] = 5
		m.mu.Unlock()

		storage := models.Storage{
			ID: "storage1",
			Name: "teststorage",
			Status: "active",
		}

		m.CheckStorage(storage)

		m.mu.RLock()
		_, offlineExists := m.activeAlerts["storage-offline-storage1"]
		_, confirmExists := m.offlineConfirmations["storage1"]
		m.mu.RUnlock()

		if offlineExists {
			t.Error("expected offline alert to be cleared when back online")
		}
		if confirmExists {
			t.Error("expected offline confirmation to be cleared")
		}
	})

	t.Run("skips usage check when usage is zero", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		m.mu.Lock()
		m.config.TimeThreshold = 0
		m.config.TimeThresholds = map[string]int{}
		m.config.StorageDefault = HysteresisThreshold{Trigger: 80.0, Clear: 70.0}
		m.mu.Unlock()

		storage := models.Storage{
			ID: "storage1",
			Name: "teststorage",
			Status: "active",
			Usage: 0, // No usage data
		}

		m.CheckStorage(storage)

		m.mu.RLock()
		_, exists := m.activeAlerts["storage1-usage"]
		m.mu.RUnlock()

		if exists {
			t.Error("expected no usage alert when usage is zero")
		}
	})
}

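// TestDispatchAlert covers dispatchAlert: nil callback and nil alert handling,
// pending and snoozed activation states, monitor-only alerts, synchronous and
// asynchronous dispatch, and cloning of the alert before the callback runs.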
func TestDispatchAlert(t *testing.T) {
	// t.Parallel()

	t.Run("returns false when onAlert is nil", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		alert := &Alert{
			ID: "test-alert",
			Type: "cpu",
		}

		result := m.dispatchAlert(alert, false)

		if result {
			t.Error("expected false when onAlert callback is nil")
		}
	})

	t.Run("returns false when alert is nil", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		called := false
		m.SetAlertCallback(func(a *Alert) {
			called = true
		})

		result := m.dispatchAlert(nil, false)

		if result {
			t.Error("expected false when alert is nil")
		}
		if called {
			t.Error("callback should not be called for nil alert")
		}
	})

	t.Run("returns false when activation state is pending", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		called := false
		m.SetAlertCallback(func(a *Alert) {
			called = true
		})

		m.mu.Lock()
		m.config.ActivationState = ActivationPending
		m.mu.Unlock()

		alert := &Alert{
			ID: "test-alert",
			Type: "cpu",
		}

		result := m.dispatchAlert(alert, false)

		if result {
			t.Error("expected false when activation is pending")
		}
		if called {
			t.Error("callback should not be called when pending")
		}
	})

	t.Run("returns false when activation state is snoozed", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		called := false
		m.SetAlertCallback(func(a *Alert) {
			called = true
		})

		m.mu.Lock()
		m.config.ActivationState = ActivationSnoozed
		m.mu.Unlock()

		alert := &Alert{
			ID: "test-alert",
			Type: "cpu",
		}

		result := m.dispatchAlert(alert, false)

		if result {
			t.Error("expected false when activation is snoozed")
		}
		if called {
			t.Error("callback should not be called when snoozed")
		}
	})

	t.Run("returns false for monitor-only alert", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		called := false
		m.SetAlertCallback(func(a *Alert) {
			called = true
		})

		m.mu.Lock()
		m.config.ActivationState = ActivationActive
		m.mu.Unlock()

		alert := &Alert{
			ID: "test-alert",
			Type: "cpu",
			Metadata: map[string]interface{}{"monitorOnly": true},
		}

		result := m.dispatchAlert(alert, false)

		if result {
			t.Error("expected false for monitor-only alert")
		}
		if called {
			t.Error("callback should not be called for monitor-only alert")
		}
	})

	t.Run("dispatches synchronously when async is false", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		var receivedAlert *Alert
		m.SetAlertCallback(func(a *Alert) {
			receivedAlert = a
		})

		m.mu.Lock()
		m.config.ActivationState = ActivationActive
		m.mu.Unlock()

		alert := &Alert{
			ID: "test-alert",
			Type: "cpu",
			ResourceName: "testvm",
		}

		result := m.dispatchAlert(alert, false)

		if !result {
			t.Error("expected true for successful dispatch")
		}
		if receivedAlert == nil {
			t.Fatal("callback should have been called")
		}
		if receivedAlert.ID != alert.ID {
			t.Error("alert ID should match")
		}
	})

	t.Run("dispatches asynchronously when async is true", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		var receivedAlert *Alert
		done := make(chan struct{})
		m.SetAlertCallback(func(a *Alert) {
			receivedAlert = a
			close(done)
		})

		m.mu.Lock()
		m.config.ActivationState = ActivationActive
		m.mu.Unlock()

		alert := &Alert{
			ID: "test-alert",
			Type: "cpu",
			ResourceName: "testvm",
		}

		result := m.dispatchAlert(alert, true)

		if !result {
			t.Error("expected true for successful dispatch")
		}

		// Wait for async callback
		select {
		case <-done:
			// Success
		case <-time.After(time.Second):
			t.Fatal("async callback not called within timeout")
		}

		if receivedAlert == nil {
			t.Fatal("callback should have been called")
		}
		if receivedAlert.ID != alert.ID {
			t.Error("alert ID should match")
		}
	})

	t.Run("clones alert before dispatch", func(t *testing.T) {
		// t.Parallel()
		m := newTestManager(t)

		var receivedAlert *Alert
		m.SetAlertCallback(func(a *Alert) {
			receivedAlert = a
		})

		m.mu.Lock()
		m.config.ActivationState = ActivationActive
		m.mu.Unlock()

		alert := &Alert{
			ID: "test-alert",
			Type: "cpu",
			ResourceName: "testvm",
		}

		m.dispatchAlert(alert, false)

		if receivedAlert == alert {
			t.Error("alert should be cloned, not passed directly")
		}
	})
}

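// TestPreserveAlertState covers preserveAlertState: carrying acknowledgement
// and escalation state over from an existing alert, falling back to ackState
// for new alerts, and leaving brand-new alerts untouched.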
func TestPreserveAlertState(t *testing.T) {
	t.Run("nil updated alert is handled", func(t *testing.T) {
		m := newTestManager(t)
		// Should not panic
		m.preserveAlertState("test-id", nil)
	})

	t.Run("preserves state from existing alert", func(t *testing.T) {
		m := newTestManager(t)

		ackTime := time.Now().Add(-30 * time.Minute)
		existing := &Alert{
			ID: "test-alert",
			Type: "cpu",
			StartTime: time.Now().Add(-1 * time.Hour),
			Acknowledged: true,
			AckUser: "testuser",
			AckTime: &ackTime,
			LastEscalation: 2,
			EscalationTimes: []time.Time{time.Now().Add(-25 * time.Minute)},
		}

		m.mu.Lock()
		m.activeAlerts["test-alert"] = existing
		m.mu.Unlock()

		updated := &Alert{
			ID: "test-alert",
			Type: "cpu",
			StartTime: time.Now(), // Different start time
		}

		m.preserveAlertState("test-alert", updated)

		if !updated.StartTime.Equal(existing.StartTime) {
			t.Error("StartTime should be preserved from existing alert")
		}
		if !updated.Acknowledged {
			t.Error("Acknowledged should be preserved")
		}
		if updated.AckUser != "testuser" {
			t.Errorf("AckUser should be preserved, got %s", updated.AckUser)
		}
		if updated.AckTime == nil || !updated.AckTime.Equal(ackTime) {
			t.Error("AckTime should be preserved")
		}
		if updated.LastEscalation != 2 {
			t.Error("LastEscalation should be preserved")
		}
		if len(updated.EscalationTimes) != 1 {
			t.Error("EscalationTimes should be preserved")
		}
	})

	t.Run("falls back to ackState for new alert", func(t *testing.T) {
		m := newTestManager(t)

		ackTime := time.Now().Add(-15 * time.Minute)
		m.mu.Lock()
		m.ackState["test-alert"] = ackRecord{
			acknowledged: true,
			user: "fallbackuser",
			time: ackTime,
		}
		m.mu.Unlock()

		updated := &Alert{
			ID: "test-alert",
			Type: "cpu",
			StartTime: time.Now(),
		}

		m.preserveAlertState("test-alert", updated)

		if !updated.Acknowledged {
			t.Error("Acknowledged should be set from ackState")
		}
		if updated.AckUser != "fallbackuser" {
			t.Errorf("AckUser should be from ackState, got %s", updated.AckUser)
		}
		if updated.AckTime == nil || !updated.AckTime.Equal(ackTime) {
			t.Error("AckTime should be from ackState")
		}
	})

	t.Run("no state to preserve for new alert", func(t *testing.T) {
		m := newTestManager(t)

		startTime := time.Now()
		updated := &Alert{
			ID: "new-alert",
			Type: "cpu",
			StartTime: startTime,
		}

		m.preserveAlertState("new-alert", updated)

		if !updated.StartTime.Equal(startTime) {
			t.Error("StartTime should remain unchanged for new alert")
		}
		if updated.Acknowledged {
			t.Error("Acknowledged should remain false for new alert")
		}
	})
}

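// TestCheckPMGQuarantineBacklog covers quarantine backlog alerts: clearing on
// nil quarantine data, warning and critical absolute thresholds, growth-rate
// thresholds, updates to existing alerts, and virus-queue alerts.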
func TestCheckPMGQuarantineBacklog(t *testing.T) {
|
|
t.Run("nil quarantine clears alerts", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
// Create an existing quarantine alert
|
|
m.mu.Lock()
|
|
m.activeAlerts["pmg1-quarantine-spam"] = &Alert{
|
|
ID: "pmg1-quarantine-spam",
|
|
Type: "quarantine-spam",
|
|
}
|
|
m.activeAlerts["pmg1-quarantine-virus"] = &Alert{
|
|
ID: "pmg1-quarantine-virus",
|
|
Type: "quarantine-virus",
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "pmg-server",
|
|
Host: "pmg.example.com",
|
|
Quarantine: nil,
|
|
}
|
|
|
|
m.checkPMGQuarantineBacklog(pmg, PMGThresholdConfig{})
|
|
|
|
m.mu.RLock()
|
|
_, spamExists := m.activeAlerts["pmg1-quarantine-spam"]
|
|
_, virusExists := m.activeAlerts["pmg1-quarantine-virus"]
|
|
m.mu.RUnlock()
|
|
|
|
if spamExists {
|
|
t.Error("spam alert should be cleared when quarantine is nil")
|
|
}
|
|
if virusExists {
|
|
t.Error("virus alert should be cleared when quarantine is nil")
|
|
}
|
|
})
|
|
|
|
t.Run("warning threshold triggers alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
m.mu.Lock()
|
|
m.pmgQuarantineHistory = make(map[string][]pmgQuarantineSnapshot)
|
|
m.config.ActivationState = ActivationActive
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "pmg-server",
|
|
Host: "pmg.example.com",
|
|
Quarantine: &models.PMGQuarantineTotals{
|
|
Spam: 2500, // Above warning threshold
|
|
Virus: 100,
|
|
},
|
|
}
|
|
|
|
thresholds := PMGThresholdConfig{
|
|
QuarantineSpamWarn: 2000,
|
|
QuarantineSpamCritical: 5000,
|
|
QuarantineVirusWarn: 2000,
|
|
QuarantineVirusCritical: 5000,
|
|
}
|
|
|
|
m.checkPMGQuarantineBacklog(pmg, thresholds)
|
|
|
|
m.mu.RLock()
|
|
alert, exists := m.activeAlerts["pmg1-quarantine-spam"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Fatal("spam quarantine warning alert should be created")
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("alert level should be warning, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("critical threshold triggers alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
m.mu.Lock()
|
|
m.pmgQuarantineHistory = make(map[string][]pmgQuarantineSnapshot)
|
|
m.config.ActivationState = ActivationActive
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "pmg-server",
|
|
Host: "pmg.example.com",
|
|
Quarantine: &models.PMGQuarantineTotals{
|
|
Spam: 6000, // Above critical threshold
|
|
Virus: 100,
|
|
},
|
|
}
|
|
|
|
thresholds := PMGThresholdConfig{
|
|
QuarantineSpamWarn: 2000,
|
|
QuarantineSpamCritical: 5000,
|
|
QuarantineVirusWarn: 2000,
|
|
QuarantineVirusCritical: 5000,
|
|
}
|
|
|
|
m.checkPMGQuarantineBacklog(pmg, thresholds)
|
|
|
|
m.mu.RLock()
|
|
alert, exists := m.activeAlerts["pmg1-quarantine-spam"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Fatal("spam quarantine critical alert should be created")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("alert level should be critical, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
t.Run("below threshold clears alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
m.mu.Lock()
|
|
m.pmgQuarantineHistory = make(map[string][]pmgQuarantineSnapshot)
|
|
m.activeAlerts["pmg1-quarantine-spam"] = &Alert{
|
|
ID: "pmg1-quarantine-spam",
|
|
Type: "quarantine-spam",
|
|
Level: AlertLevelWarning,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "pmg-server",
|
|
Host: "pmg.example.com",
|
|
Quarantine: &models.PMGQuarantineTotals{
|
|
Spam: 500, // Below warning threshold
|
|
Virus: 100,
|
|
},
|
|
}
|
|
|
|
thresholds := PMGThresholdConfig{
|
|
QuarantineSpamWarn: 2000,
|
|
QuarantineSpamCritical: 5000,
|
|
}
|
|
|
|
m.checkPMGQuarantineBacklog(pmg, thresholds)
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["pmg1-quarantine-spam"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("spam quarantine alert should be cleared when below threshold")
|
|
}
|
|
})
|
|
|
|
t.Run("growth rate triggers warning alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
m.mu.Lock()
|
|
m.config.ActivationState = ActivationActive
|
|
// Set up history from ~2 hours ago
|
|
m.pmgQuarantineHistory = map[string][]pmgQuarantineSnapshot{
|
|
"pmg1": {
|
|
{
|
|
Spam: 1000,
|
|
Virus: 100,
|
|
Timestamp: time.Now().Add(-2 * time.Hour),
|
|
},
|
|
},
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "pmg-server",
|
|
Host: "pmg.example.com",
|
|
Quarantine: &models.PMGQuarantineTotals{
|
|
Spam: 1500, // 50% growth (500 messages)
|
|
Virus: 100,
|
|
},
|
|
}
|
|
|
|
thresholds := PMGThresholdConfig{
|
|
QuarantineSpamWarn: 10000, // High absolute threshold (won't trigger)
|
|
QuarantineSpamCritical: 20000,
|
|
QuarantineGrowthWarnPct: 25, // 25% growth warning
|
|
QuarantineGrowthWarnMin: 250, // Minimum 250 messages
|
|
QuarantineGrowthCritPct: 50, // 50% growth critical
|
|
QuarantineGrowthCritMin: 500, // Minimum 500 messages
|
|
}
|
|
|
|
m.checkPMGQuarantineBacklog(pmg, thresholds)
|
|
|
|
m.mu.RLock()
|
|
alert, exists := m.activeAlerts["pmg1-quarantine-spam"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Fatal("spam quarantine growth alert should be created")
|
|
}
|
|
if alert.Level != AlertLevelCritical {
|
|
t.Errorf("alert level should be critical due to 50%% growth + 500 messages, got %s", alert.Level)
|
|
}
|
|
})
|
|
|
|
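	// An existing alert for the same resource should have its value refreshed in place when the
	// backlog changes, rather than being duplicated.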
t.Run("updates existing alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
m.mu.Lock()
|
|
m.pmgQuarantineHistory = make(map[string][]pmgQuarantineSnapshot)
|
|
m.config.ActivationState = ActivationActive
|
|
m.activeAlerts["pmg1-quarantine-spam"] = &Alert{
|
|
ID: "pmg1-quarantine-spam",
|
|
Type: "quarantine-spam",
|
|
Level: AlertLevelWarning,
|
|
Value: 2500,
|
|
Threshold: 2000,
|
|
LastSeen: time.Now().Add(-5 * time.Minute),
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "pmg-server",
|
|
Host: "pmg.example.com",
|
|
Quarantine: &models.PMGQuarantineTotals{
|
|
Spam: 3000, // Higher spam count
|
|
Virus: 100,
|
|
},
|
|
}
|
|
|
|
thresholds := PMGThresholdConfig{
|
|
QuarantineSpamWarn: 2000,
|
|
QuarantineSpamCritical: 5000,
|
|
}
|
|
|
|
m.checkPMGQuarantineBacklog(pmg, thresholds)
|
|
|
|
m.mu.RLock()
|
|
alert, exists := m.activeAlerts["pmg1-quarantine-spam"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Fatal("spam quarantine alert should still exist")
|
|
}
|
|
if alert.Value != 3000 {
|
|
t.Errorf("alert value should be updated to 3000, got %.0f", alert.Value)
|
|
}
|
|
})
|
|
|
|
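	// The virus quarantine counter uses its own thresholds and its own alert ID, independent of spam.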
t.Run("virus quarantine alert", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
m.mu.Lock()
|
|
m.pmgQuarantineHistory = make(map[string][]pmgQuarantineSnapshot)
|
|
m.config.ActivationState = ActivationActive
|
|
m.mu.Unlock()
|
|
|
|
pmg := models.PMGInstance{
|
|
ID: "pmg1",
|
|
Name: "pmg-server",
|
|
Host: "pmg.example.com",
|
|
Quarantine: &models.PMGQuarantineTotals{
|
|
Spam: 100,
|
|
Virus: 3000, // Above virus warning threshold
|
|
},
|
|
}
|
|
|
|
thresholds := PMGThresholdConfig{
|
|
QuarantineSpamWarn: 2000,
|
|
QuarantineSpamCritical: 5000,
|
|
QuarantineVirusWarn: 2000,
|
|
QuarantineVirusCritical: 5000,
|
|
}
|
|
|
|
m.checkPMGQuarantineBacklog(pmg, thresholds)
|
|
|
|
m.mu.RLock()
|
|
alert, exists := m.activeAlerts["pmg1-quarantine-virus"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Fatal("virus quarantine warning alert should be created")
|
|
}
|
|
if alert.Level != AlertLevelWarning {
|
|
t.Errorf("alert level should be warning, got %s", alert.Level)
|
|
}
|
|
})
|
|
}
|
|
|
|
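// TestLoadActiveAlerts exercises LoadActiveAlerts: the round-trip happy path via
// SaveActiveAlerts, expiry of stale and long-acknowledged alerts, restoration of
// acknowledgment state, invalid JSON handling, duplicate alert IDs, and migration of
// legacy guest resource IDs to the canonical format.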
func TestLoadActiveAlerts(t *testing.T) {
	t.Run("no file returns nil error", func(t *testing.T) {
		m := newTestManager(t)
		m.ClearActiveAlerts()

		err := m.LoadActiveAlerts()
		if err != nil {
			t.Errorf("expected no error when file doesn't exist, got %v", err)
		}
	})

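	// Round-trip: save an alert, wipe the in-memory map, and confirm LoadActiveAlerts restores it.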
t.Run("loads alerts from valid file", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
// Create an alert and save it
|
|
startTime := time.Now().Add(-30 * time.Minute)
|
|
alert := &Alert{
|
|
ID: "test-load-alert",
|
|
Type: "cpu",
|
|
Level: AlertLevelWarning,
|
|
ResourceID: "test-resource",
|
|
ResourceName: "test-vm",
|
|
Node: "node1",
|
|
Instance: "pve1",
|
|
Message: "Test alert",
|
|
Value: 85.0,
|
|
Threshold: 80.0,
|
|
StartTime: startTime,
|
|
LastSeen: time.Now(),
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts[alert.ID] = alert
|
|
m.mu.Unlock()
|
|
|
|
// Save to disk
|
|
_ = m.SaveActiveAlerts()
|
|
|
|
// Clear in-memory map only (don't use ClearActiveAlerts which triggers async save)
|
|
m.mu.Lock()
|
|
m.activeAlerts = make(map[string]*Alert)
|
|
m.mu.Unlock()
|
|
|
|
err := m.LoadActiveAlerts()
|
|
if err != nil {
|
|
t.Fatalf("failed to load alerts: %v", err)
|
|
}
|
|
|
|
m.mu.RLock()
|
|
loaded, exists := m.activeAlerts["test-load-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Fatal("alert should be loaded from file")
|
|
}
|
|
if loaded.Type != "cpu" {
|
|
t.Errorf("loaded alert type should be cpu, got %s", loaded.Type)
|
|
}
|
|
if loaded.Value != 85.0 {
|
|
t.Errorf("loaded alert value should be 85.0, got %.1f", loaded.Value)
|
|
}
|
|
})
|
|
|
|
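	// Alerts older than 24 hours (here both StartTime and LastSeen are 25h in the past) are
	// treated as stale and dropped on load.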
t.Run("skips old alerts", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
// Create an old alert (>24 hours)
|
|
startTime := time.Now().Add(-25 * time.Hour)
|
|
alert := &Alert{
|
|
ID: "old-alert",
|
|
Type: "cpu",
|
|
Level: AlertLevelWarning,
|
|
ResourceID: "test-resource",
|
|
ResourceName: "test-vm",
|
|
StartTime: startTime,
|
|
LastSeen: startTime,
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts[alert.ID] = alert
|
|
m.mu.Unlock()
|
|
|
|
// Save to disk
|
|
_ = m.SaveActiveAlerts()
|
|
|
|
// Clear in-memory map only (don't use ClearActiveAlerts which triggers async save)
|
|
m.mu.Lock()
|
|
m.activeAlerts = make(map[string]*Alert)
|
|
m.mu.Unlock()
|
|
|
|
err := m.LoadActiveAlerts()
|
|
if err != nil {
|
|
t.Fatalf("failed to load alerts: %v", err)
|
|
}
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["old-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("old alert (>24h) should be skipped during load")
|
|
}
|
|
})
|
|
|
|
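	// An alert acknowledged more than an hour ago is not restored into activeAlerts, but its
	// acknowledgment record must survive in ackState so a reappearing alert does not re-fire.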
t.Run("skips old acknowledged alerts", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
// Create an alert acknowledged >1 hour ago
|
|
startTime := time.Now().Add(-30 * time.Minute)
|
|
ackTime := time.Now().Add(-2 * time.Hour)
|
|
alert := &Alert{
|
|
ID: "old-ack-alert",
|
|
Type: "cpu",
|
|
Level: AlertLevelWarning,
|
|
ResourceID: "test-resource",
|
|
ResourceName: "test-vm",
|
|
StartTime: startTime,
|
|
LastSeen: time.Now(),
|
|
Acknowledged: true,
|
|
AckTime: &ackTime,
|
|
AckUser: "testuser",
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts[alert.ID] = alert
|
|
m.mu.Unlock()
|
|
|
|
// Save to disk
|
|
_ = m.SaveActiveAlerts()
|
|
|
|
// Clear in-memory map only (don't use ClearActiveAlerts which triggers async save)
|
|
m.mu.Lock()
|
|
m.activeAlerts = make(map[string]*Alert)
|
|
m.mu.Unlock()
|
|
|
|
err := m.LoadActiveAlerts()
|
|
if err != nil {
|
|
t.Fatalf("failed to load alerts: %v", err)
|
|
}
|
|
|
|
m.mu.RLock()
|
|
_, exists := m.activeAlerts["old-ack-alert"]
|
|
ackRecord, ackExists := m.ackState["old-ack-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if exists {
|
|
t.Error("old acknowledged alert (>1h) should be skipped from activeAlerts")
|
|
}
|
|
|
|
// But ackState should be preserved so the alert doesn't retrigger if it reappears
|
|
if !ackExists {
|
|
t.Error("ackState should be preserved for old acknowledged alerts to prevent retriggering")
|
|
}
|
|
if ackExists && !ackRecord.acknowledged {
|
|
t.Error("ackState.acknowledged should be true")
|
|
}
|
|
if ackExists && ackRecord.user != "testuser" {
|
|
t.Errorf("ackState.user should be 'testuser', got %q", ackRecord.user)
|
|
}
|
|
})
|
|
|
|
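	// A recently acknowledged alert should come back with both the Acknowledged flag set on the
	// alert itself and a matching entry in ackState.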
t.Run("restores acknowledgment state", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
|
|
// Create an acknowledged alert
|
|
startTime := time.Now().Add(-10 * time.Minute)
|
|
ackTime := time.Now().Add(-5 * time.Minute)
|
|
alert := &Alert{
|
|
ID: "ack-alert",
|
|
Type: "cpu",
|
|
Level: AlertLevelWarning,
|
|
ResourceID: "test-resource",
|
|
ResourceName: "test-vm",
|
|
StartTime: startTime,
|
|
LastSeen: time.Now(),
|
|
Acknowledged: true,
|
|
AckTime: &ackTime,
|
|
AckUser: "testuser",
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.activeAlerts[alert.ID] = alert
|
|
m.mu.Unlock()
|
|
|
|
// Save to disk
|
|
_ = m.SaveActiveAlerts()
|
|
|
|
// Clear in-memory maps only (don't use ClearActiveAlerts which triggers async save)
|
|
m.mu.Lock()
|
|
m.activeAlerts = make(map[string]*Alert)
|
|
m.ackState = make(map[string]ackRecord)
|
|
m.mu.Unlock()
|
|
|
|
err := m.LoadActiveAlerts()
|
|
if err != nil {
|
|
t.Fatalf("failed to load alerts: %v", err)
|
|
}
|
|
|
|
m.mu.RLock()
|
|
loaded, exists := m.activeAlerts["ack-alert"]
|
|
ackRecord, hasAckRecord := m.ackState["ack-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Fatal("acknowledged alert should be loaded")
|
|
}
|
|
if !loaded.Acknowledged {
|
|
t.Error("loaded alert should be acknowledged")
|
|
}
|
|
if loaded.AckUser != "testuser" {
|
|
t.Errorf("loaded alert AckUser should be testuser, got %s", loaded.AckUser)
|
|
}
|
|
if !hasAckRecord {
|
|
t.Error("ackState should be restored for acknowledged alert")
|
|
}
|
|
if !ackRecord.acknowledged {
|
|
t.Error("ackState should show acknowledged=true")
|
|
}
|
|
})
|
|
|
|
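	// Corrupt persisted state should surface as an error rather than being silently ignored.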
t.Run("invalid JSON returns error", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
// Write invalid JSON to the alerts file
|
|
alertsDir := filepath.Join(utils.GetDataDir(), "alerts")
|
|
if err := os.MkdirAll(alertsDir, 0755); err != nil {
|
|
t.Fatalf("failed to create alerts dir: %v", err)
|
|
}
|
|
|
|
alertsFile := filepath.Join(alertsDir, "active-alerts.json")
|
|
if err := os.WriteFile(alertsFile, []byte("invalid json"), 0644); err != nil {
|
|
t.Fatalf("failed to write invalid json: %v", err)
|
|
}
|
|
|
|
err := m.LoadActiveAlerts()
|
|
if err == nil {
|
|
t.Error("expected error for invalid JSON")
|
|
}
|
|
})
|
|
|
|
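	// If the persisted file somehow contains two alerts with the same ID, the first entry wins.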
t.Run("skips duplicate alerts", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
// Write JSON with duplicate alert IDs
|
|
alertsDir := filepath.Join(utils.GetDataDir(), "alerts")
|
|
if err := os.MkdirAll(alertsDir, 0755); err != nil {
|
|
t.Fatalf("failed to create alerts dir: %v", err)
|
|
}
|
|
|
|
startTime := time.Now().Add(-10 * time.Minute)
|
|
alerts := []Alert{
|
|
{ID: "dup-alert", Type: "cpu", StartTime: startTime, LastSeen: time.Now()},
|
|
{ID: "dup-alert", Type: "memory", StartTime: startTime, LastSeen: time.Now()},
|
|
}
|
|
|
|
data, _ := json.Marshal(alerts)
|
|
alertsFile := filepath.Join(alertsDir, "active-alerts.json")
|
|
if err := os.WriteFile(alertsFile, data, 0644); err != nil {
|
|
t.Fatalf("failed to write alerts json: %v", err)
|
|
}
|
|
|
|
err := m.LoadActiveAlerts()
|
|
if err != nil {
|
|
t.Fatalf("failed to load alerts: %v", err)
|
|
}
|
|
|
|
m.mu.RLock()
|
|
alert, exists := m.activeAlerts["dup-alert"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Fatal("alert should exist after load")
|
|
}
|
|
// First one wins
|
|
if alert.Type != "cpu" {
|
|
t.Errorf("first alert should win, got type %s", alert.Type)
|
|
}
|
|
})
|
|
|
|
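	// Alerts persisted under the legacy "instance-vmid" resource ID format should be rewritten to
	// the canonical key produced by BuildGuestKey, leaving no stale entry under the old ID.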
t.Run("migrates legacy guest alert resource IDs to canonical format", func(t *testing.T) {
|
|
m := newTestManager(t)
|
|
m.ClearActiveAlerts()
|
|
|
|
alertsDir := filepath.Join(utils.GetDataDir(), "alerts")
|
|
if err := os.MkdirAll(alertsDir, 0755); err != nil {
|
|
t.Fatalf("failed to create alerts dir: %v", err)
|
|
}
|
|
|
|
startTime := time.Now().Add(-10 * time.Minute)
|
|
legacyResourceID := "pve1-100"
|
|
canonicalResourceID := BuildGuestKey("pve1", "node1", 100)
|
|
alerts := []Alert{
|
|
{
|
|
ID: legacyResourceID + "-cpu",
|
|
Type: "cpu",
|
|
Level: AlertLevelWarning,
|
|
ResourceID: legacyResourceID,
|
|
ResourceName: "test-vm",
|
|
Node: "node1",
|
|
Instance: "pve1",
|
|
StartTime: startTime,
|
|
LastSeen: time.Now(),
|
|
},
|
|
}
|
|
|
|
data, _ := json.Marshal(alerts)
|
|
alertsFile := filepath.Join(alertsDir, "active-alerts.json")
|
|
if err := os.WriteFile(alertsFile, data, 0644); err != nil {
|
|
t.Fatalf("failed to write alerts json: %v", err)
|
|
}
|
|
|
|
err := m.LoadActiveAlerts()
|
|
if err != nil {
|
|
t.Fatalf("failed to load alerts: %v", err)
|
|
}
|
|
|
|
m.mu.RLock()
|
|
alert, exists := m.activeAlerts[canonicalResourceID+"-cpu"]
|
|
_, oldExists := m.activeAlerts[legacyResourceID+"-cpu"]
|
|
m.mu.RUnlock()
|
|
|
|
if !exists {
|
|
t.Fatal("expected canonical guest alert to be loaded")
|
|
}
|
|
if oldExists {
|
|
t.Fatal("expected legacy guest alert ID to be replaced")
|
|
}
|
|
if alert.ResourceID != canonicalResourceID {
|
|
t.Fatalf("expected resource ID %q, got %q", canonicalResourceID, alert.ResourceID)
|
|
}
|
|
if alert.ID != canonicalResourceID+"-cpu" {
|
|
t.Fatalf("expected alert ID %q, got %q", canonicalResourceID+"-cpu", alert.ID)
|
|
}
|
|
})
|
|
}
|
|
|
|
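// TestNamespaceMatchesInstance is a table-driven test for namespaceMatchesInstance. As the cases
// below document, matching is case-insensitive, ignores separator characters such as '-' and '_',
// and succeeds when one normalized value is a suffix of the other; the "no match" groups cover
// substrings that are not suffixes, unrelated names, and empty inputs (see issue #1095 for the
// motivating scenarios).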
func TestNamespaceMatchesInstance(t *testing.T) {
	tests := []struct {
		name      string
		namespace string
		instance  string
		expected  bool
	}{
		// Exact matches
		{"exact match", "pve", "pve", true},
		{"exact match with numbers", "pve1", "pve1", true},

		// Suffix matches (namespace is suffix of instance)
		{"namespace suffix of instance", "nat", "pve-nat", true},
		{"namespace suffix of instance no dash", "nat", "pvenat", true},

		// Suffix matches (instance is suffix of namespace)
		{"instance suffix of namespace", "pvebackups", "pve", false},  // "pve" is not suffix of "pvebackups"
		{"instance suffix of namespace 2", "backupspve", "pve", true}, // "pve" IS suffix of "backupspve"

		// Case insensitive
		{"case insensitive exact", "PVE", "pve", true},
		{"case insensitive suffix", "NAT", "pve-nat", true},

		// Special characters ignored
		{"special chars in namespace", "pve_nat", "pvenat", true},
		{"special chars in instance", "pvenat", "pve-nat", true},
		{"both have special chars", "pve-1", "pve_1", true},

		// No matches - substring but not suffix
		{"no match substring not suffix", "production", "my-production-server", false}, // "production" is not suffix of "myproductionserver"
		{"no match pve not suffix of pvenat", "pve", "pve-nat", false},                 // "pve" is not suffix of "pvenat"

		// No matches
		{"no match", "production", "staging", false},
		{"no match different names", "pve1", "pve2", false},
		{"no match partial mismatch", "abc", "xyz", false},

		// Empty values
		{"empty namespace", "", "pve", false},
		{"empty instance", "pve", "", false},
		{"both empty", "", "", false},

		// Real-world scenarios from issue #1095
		{"pve namespace with pve instance", "pve", "pve", true},
		{"nat namespace with pve-nat instance", "nat", "pve-nat", true},
		{"pve1 namespace with pve1 instance", "pve1", "pve1", true},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := namespaceMatchesInstance(tt.namespace, tt.instance)
			if result != tt.expected {
				t.Errorf("namespaceMatchesInstance(%q, %q) = %v, want %v",
					tt.namespace, tt.instance, result, tt.expected)
			}
		})
	}
}