Improve tests for internal/alerts package

- Fix TestSaveHistoryWithRetry_WriteError to be robust on root
- Add TestOnAlert to history_test.go
- Add pmg_anomaly_test.go for PMG anomaly detection coverage
- Add cleanup_test.go for tracking map cleanup coverage
- Extend filter_evaluation_test.go to cover all guest threshold logic
This commit is contained in:
rcourtman 2026-01-02 23:46:35 +00:00
parent 3b48c4acbb
commit fbbefa4546
4 changed files with 600 additions and 21 deletions

View file

@ -0,0 +1,91 @@
package alerts
import (
"testing"
"time"
)
// TestCleanupStaleMaps seeds the manager's tracking maps with stale, recent
// and alert-backed entries, runs cleanupStaleMaps, and verifies which keys
// are still present afterwards.
func TestCleanupStaleMaps(t *testing.T) {
	t.Parallel()

	m := newTestManager(t)

	now := time.Now()
	stale := now.Add(-25 * time.Hour)
	fresh := now.Add(-1 * time.Hour)

	// outcome pairs the observed presence of a key with the expected one.
	type outcome struct {
		found bool
		want  bool
		msg   string
	}
	report := func(results []outcome) {
		for _, r := range results {
			if r.found != r.want {
				t.Error(r.msg)
			}
		}
	}

	// Phase 1: seed every map, then clean up once.
	m.mu.Lock()
	m.flappingHistory["stale-flapping"] = []time.Time{stale}
	m.flappingActive["stale-flapping"] = true
	m.flappingHistory["recent-flapping"] = []time.Time{fresh}
	// An entry with no timestamps at all is expected to be dropped.
	m.flappingHistory["empty-flapping"] = []time.Time{}
	// A stale entry backed by an active alert is expected to be kept.
	m.flappingHistory["active-alert-flapping"] = []time.Time{stale}
	m.activeAlerts["active-alert-flapping"] = &Alert{ID: "active-alert-flapping"}

	m.suppressedUntil["expired-suppression"] = stale
	m.suppressedUntil["active-suppression"] = now.Add(1 * time.Hour)

	m.pendingAlerts["stale-pending"] = stale
	m.pendingAlerts["recent-pending"] = fresh
	m.mu.Unlock()

	m.cleanupStaleMaps()

	// Snapshot presence under the lock; report after releasing it.
	m.mu.Lock()
	_, staleHistory := m.flappingHistory["stale-flapping"]
	_, staleFlag := m.flappingActive["stale-flapping"]
	_, recentHistory := m.flappingHistory["recent-flapping"]
	_, emptyHistory := m.flappingHistory["empty-flapping"]
	_, expiredSuppression := m.suppressedUntil["expired-suppression"]
	_, activeSuppression := m.suppressedUntil["active-suppression"]
	_, stalePending := m.pendingAlerts["stale-pending"]
	_, recentPending := m.pendingAlerts["recent-pending"]
	_, alertBackedHistory := m.flappingHistory["active-alert-flapping"]
	m.mu.Unlock()

	report([]outcome{
		{staleHistory, false, "stale-flapping should get removed"},
		{staleFlag, false, "stale-flapping active flag should get removed"},
		{recentHistory, true, "recent-flapping should NOT get removed"},
		{emptyHistory, false, "empty-flapping should get removed"},
		{expiredSuppression, false, "expired-suppression should get removed"},
		{activeSuppression, true, "active-suppression should NOT get removed"},
		{stalePending, false, "stale-pending should get removed"},
		{recentPending, true, "recent-pending should NOT get removed"},
		{alertBackedHistory, true, "active-alert-flapping should NOT get removed"},
	})

	// Phase 2: a stale pending entry backed by an active alert must survive.
	m.mu.Lock()
	m.pendingAlerts["active-alert-pending"] = stale
	m.activeAlerts["active-alert-pending"] = &Alert{ID: "active-alert-pending"}
	m.mu.Unlock()

	m.cleanupStaleMaps()

	m.mu.Lock()
	_, alertBackedPending := m.pendingAlerts["active-alert-pending"]
	m.mu.Unlock()

	report([]outcome{
		{alertBackedPending, true, "active-alert-pending should NOT get removed"},
	})
}

View file

@ -1685,3 +1685,218 @@ func TestGetGuestThresholds(t *testing.T) {
}
})
}
// TestExtractGuestMetrics_Default passes a plain string to extractGuestMetrics
// and expects the ok result to be false.
func TestExtractGuestMetrics_Default(t *testing.T) {
	t.Parallel()

	if _, ok := extractGuestMetrics("invalid-type"); ok {
		t.Error("extractGuestMetrics should return false for invalid type")
	}
}
func TestGetGuestThresholds_AllFields(t *testing.T) {
t.Parallel()
m := NewManager()
// Define a custom rule that sets all fields
trigger := 90.0
clear := 85.0
threshold := &HysteresisThreshold{Trigger: trigger, Clear: clear}
rule := CustomAlertRule{
ID: "rule-1",
Name: "All Fields Rule",
Enabled: true,
Priority: 100,
FilterConditions: FilterStack{
LogicalOperator: "AND",
Filters: []FilterCondition{
{Type: "text", Field: "name", Value: "test-guest"},
},
},
Thresholds: ThresholdConfig{
CPU: threshold,
Memory: threshold,
Disk: threshold,
DiskRead: threshold,
DiskWrite: threshold,
NetworkIn: threshold,
NetworkOut: threshold,
DisableConnectivity: true,
Backup: &BackupAlertConfig{Enabled: true},
Snapshot: &SnapshotAlertConfig{Enabled: true},
},
}
m.config.CustomRules = []CustomAlertRule{rule}
guest := models.VM{ID: "guest-1", Name: "test-guest"}
thresholds := m.getGuestThresholds(guest, "guest-1")
if thresholds.CPU == nil || thresholds.CPU.Trigger != trigger {
t.Error("CPU threshold not applied")
}
if thresholds.Memory == nil || thresholds.Memory.Trigger != trigger {
t.Error("Memory threshold not applied")
}
if thresholds.Disk == nil || thresholds.Disk.Trigger != trigger {
t.Error("Disk threshold not applied")
}
if thresholds.DiskRead == nil || thresholds.DiskRead.Trigger != trigger {
t.Error("DiskRead threshold not applied")
}
if thresholds.DiskWrite == nil || thresholds.DiskWrite.Trigger != trigger {
t.Error("DiskWrite threshold not applied")
}
if thresholds.NetworkIn == nil || thresholds.NetworkIn.Trigger != trigger {
t.Error("NetworkIn threshold not applied")
}
if thresholds.NetworkOut == nil || thresholds.NetworkOut.Trigger != trigger {
t.Error("NetworkOut threshold not applied")
}
if !thresholds.DisableConnectivity {
t.Error("DisableConnectivity not applied")
}
if thresholds.Backup == nil || !thresholds.Backup.Enabled {
t.Error("Backup config not applied")
}
if thresholds.Snapshot == nil || !thresholds.Snapshot.Enabled {
t.Error("Snapshot config not applied")
}
}
// TestGetGuestThresholds_LegacyFields installs a custom rule that only sets
// the *Legacy threshold fields and verifies that getGuestThresholds exposes
// each of them as a threshold whose Trigger equals the legacy value.
func TestGetGuestThresholds_LegacyFields(t *testing.T) {
	t.Parallel()

	legacy := 95.0

	m := NewManager()
	m.config.CustomRules = []CustomAlertRule{{
		ID:       "rule-legacy",
		Name:     "Legacy Fields Rule",
		Enabled:  true,
		Priority: 200,
		FilterConditions: FilterStack{
			LogicalOperator: "AND",
			Filters: []FilterCondition{
				{Type: "text", Field: "name", Value: "test-guest"},
			},
		},
		Thresholds: ThresholdConfig{
			CPULegacy:        &legacy,
			MemoryLegacy:     &legacy,
			DiskLegacy:       &legacy,
			DiskReadLegacy:   &legacy,
			DiskWriteLegacy:  &legacy,
			NetworkInLegacy:  &legacy,
			NetworkOutLegacy: &legacy,
		},
	}}

	got := m.getGuestThresholds(models.VM{ID: "guest-1", Name: "test-guest"}, "guest-1")

	// expect reports msg as a test error when the condition does not hold.
	expect := func(ok bool, msg string) {
		if !ok {
			t.Error(msg)
		}
	}

	expect(got.CPU != nil && got.CPU.Trigger == legacy, "Legacy CPU threshold not applied")
	expect(got.Memory != nil && got.Memory.Trigger == legacy, "Legacy Memory threshold not applied")
	expect(got.Disk != nil && got.Disk.Trigger == legacy, "Legacy Disk threshold not applied")
	expect(got.DiskRead != nil && got.DiskRead.Trigger == legacy, "Legacy DiskRead threshold not applied")
	expect(got.DiskWrite != nil && got.DiskWrite.Trigger == legacy, "Legacy DiskWrite threshold not applied")
	expect(got.NetworkIn != nil && got.NetworkIn.Trigger == legacy, "Legacy NetworkIn threshold not applied")
	expect(got.NetworkOut != nil && got.NetworkOut.Trigger == legacy, "Legacy NetworkOut threshold not applied")
}
// TestGetGuestThresholds_Override registers a per-guest override keyed by
// "guest-1" that sets every threshold field, then verifies that the CPU
// trigger, the Disabled and DisableConnectivity flags and the Backup config
// are reflected in the result of getGuestThresholds.
func TestGetGuestThresholds_Override(t *testing.T) {
	t.Parallel()
	m := NewManager()
	trigger := 88.0
	threshold := &HysteresisThreshold{Trigger: trigger, Clear: trigger - 5.0}
	m.config.Overrides = map[string]ThresholdConfig{
		"guest-1": {
			CPU:                 threshold,
			Memory:              threshold,
			Disk:                threshold,
			DiskRead:            threshold,
			DiskWrite:           threshold,
			NetworkIn:           threshold,
			NetworkOut:          threshold,
			Disabled:            true,
			DisableConnectivity: true,
			Backup:              &BackupAlertConfig{Enabled: true},
			Snapshot:            &SnapshotAlertConfig{Enabled: true},
		},
	}
	guest := models.VM{ID: "guest-1", Name: "test-guest"}
	thresholds := m.getGuestThresholds(guest, "guest-1")
	// Guard against a nil CPU threshold so a missing override is reported as
	// a test failure rather than a nil-pointer panic.
	if thresholds.CPU == nil || thresholds.CPU.Trigger != trigger {
		t.Error("Override CPU not applied")
	}
	if !thresholds.Disabled {
		t.Error("Override Disabled not applied")
	}
	if !thresholds.DisableConnectivity {
		t.Error("Override DisableConnectivity not applied")
	}
	if thresholds.Backup == nil {
		t.Error("Override Backup not applied")
	}
}
// TestGetGuestThresholds_OverrideLegacy registers a per-guest override that
// only sets the *Legacy fields and verifies that the CPU threshold returned
// by getGuestThresholds carries the legacy value as its Trigger.
func TestGetGuestThresholds_OverrideLegacy(t *testing.T) {
	t.Parallel()

	legacy := 77.0

	m := NewManager()
	m.config.Overrides = map[string]ThresholdConfig{
		"guest-1": {
			CPULegacy:        &legacy,
			MemoryLegacy:     &legacy,
			DiskLegacy:       &legacy,
			DiskReadLegacy:   &legacy,
			DiskWriteLegacy:  &legacy,
			NetworkInLegacy:  &legacy,
			NetworkOutLegacy: &legacy,
		},
	}

	got := m.getGuestThresholds(models.VM{ID: "guest-1", Name: "test-guest"}, "guest-1")

	if cpu := got.CPU; cpu == nil || cpu.Trigger != legacy {
		t.Error("Override Legacy CPU not applied")
	}
}
// TestGetGuestThresholds_InvalidGuest calls getGuestThresholds with a value
// that is not a guest struct. The test makes no assertion about the returned
// thresholds; it only passes if the call and the field read below complete
// without panicking.
func TestGetGuestThresholds_InvalidGuest(t *testing.T) {
	t.Parallel()
	m := NewManager()
	// Per the original author's note this is expected to fall back to the
	// defaults (hitting the default case in tryLegacyOverrideMigration), and
	// the defaults may contain nil pointers, so no value is asserted.
	thresholds := m.getGuestThresholds("invalid-guest-struct", "guest-1")
	// Read a field explicitly instead of using an empty if-branch.
	_ = thresholds.CPU
}

View file

@ -128,6 +128,35 @@ func TestAddAlert(t *testing.T) {
}
}
// TestOnAlert registers a callback through OnAlert, adds one alert with
// AddAlert, and verifies that the callback ran and received that alert.
func TestOnAlert(t *testing.T) {
	t.Parallel()

	const wantID = "callback-test"

	hm := newTestHistoryManager(t)

	var (
		invoked  bool
		received Alert
	)
	hm.OnAlert(func(a Alert) {
		invoked = true
		received = a
	})

	hm.AddAlert(Alert{ID: wantID, Type: "cpu"})

	if !invoked {
		t.Error("Callback was not called")
	}
	if received.ID != wantID {
		t.Errorf("Callback received wrong alert ID: %s", received.ID)
	}
}
func TestGetHistory_WithLimit(t *testing.T) {
t.Parallel()
@ -568,37 +597,26 @@ func TestSaveHistoryWithRetry_SingleRetry(t *testing.T) {
}
}
func TestSaveHistoryWithRetry_ReadOnlyDirectory(t *testing.T) {
func TestSaveHistoryWithRetry_WriteError(t *testing.T) {
t.Parallel()
tempDir := t.TempDir()
// Create subdirectory and make it read-only
readOnlyDir := filepath.Join(tempDir, "readonly")
if err := os.MkdirAll(readOnlyDir, 0755); err != nil {
t.Fatalf("Failed to create readonly dir: %v", err)
}
hm := &HistoryManager{
dataDir: readOnlyDir,
historyFile: filepath.Join(readOnlyDir, HistoryFileName),
backupFile: filepath.Join(readOnlyDir, HistoryBackupFileName),
history: []HistoryEntry{{Alert: Alert{ID: "test"}, Timestamp: time.Now()}},
stopChan: make(chan struct{}),
dataDir: tempDir,
// Point to a file in a non-existent subdirectory
// os.WriteFile does not create parent directories, so this will fail
historyFile: filepath.Join(tempDir, "nonexistent_dir", HistoryFileName),
backupFile: filepath.Join(tempDir, HistoryBackupFileName),
history: []HistoryEntry{{Alert: Alert{ID: "test"}, Timestamp: time.Now()}},
saveInterval: 5 * time.Minute,
stopChan: make(chan struct{}),
}
// Make directory read-only
if err := os.Chmod(readOnlyDir, 0444); err != nil {
t.Fatalf("Failed to make dir readonly: %v", err)
}
t.Cleanup(func() {
os.Chmod(readOnlyDir, 0755) // Restore for cleanup
})
// Should fail after retries
err := hm.saveHistoryWithRetry(2)
if err == nil {
t.Error("saveHistoryWithRetry should fail on read-only directory")
t.Error("saveHistoryWithRetry should fail when parent directory does not exist")
}
}

View file

@ -0,0 +1,255 @@
package alerts
import (
"testing"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
)
// TestCheckPMGAnomalies_QuietSite walks a low-traffic PMG instance through
// warmup, a first spike (pending only), a second spike (warning alert) and a
// return to normal (alert cleared).
func TestCheckPMGAnomalies_QuietSite(t *testing.T) {
	t.Parallel()

	m := newTestManager(t)

	// feed submits a single mail-count sample for instance "pmg1"; only the
	// SpamIn value varies between calls.
	feed := func(at time.Time, spamIn int) {
		m.checkPMGAnomalies(models.PMGInstance{
			ID:   "pmg1",
			Name: "PMG 1",
			MailCount: []models.PMGMailCountPoint{{
				Timestamp: at,
				SpamIn:    float64(spamIn),
				SpamOut:   5.0,
				VirusIn:   2.0,
				VirusOut:  0.0,
			}},
		}, PMGThresholdConfig{})
	}

	// 1. Warmup: 24 hourly samples of steady traffic (10 spam/hour).
	warmupStart := time.Now().Add(-24 * time.Hour)
	for hour := 0; hour < 24; hour++ {
		feed(warmupStart.Add(time.Duration(hour)*time.Hour), 10)
	}
	if len(m.GetActiveAlerts()) != 0 {
		t.Error("Expected no alerts after steady traffic warmup")
	}

	// 2. First spike (100 spam/hour). The expectations below assume the
	// detector's "quiet site" rules (baseline < 40: WarnDelta = baseline + 60,
	// CritDelta = baseline + 120) - TODO confirm against checkPMGAnomalies.
	// With a baseline of ~10 that gives 70 < 100 < 130, i.e. a warning.
	spikeAt := time.Now()
	feed(spikeAt, 100)

	m.mu.Lock()
	_, pending := m.pendingAlerts["pmg-anomaly-pmg1-spamIn"]
	m.mu.Unlock()

	if !pending {
		t.Error("Expected pending alert for first spike")
	}
	if len(m.GetActiveAlerts()) != 0 {
		t.Error("Expected no active alerts for first spike")
	}

	// 3. A second consecutive spike is expected to confirm the alert.
	feed(spikeAt.Add(1*time.Hour), 110)

	if active := m.GetActiveAlerts(); len(active) == 1 {
		first := active[0]
		if first.Type != "anomaly-spamIn" {
			t.Errorf("Expected anomaly-spamIn alert, got %s", first.Type)
		}
		if first.Level != AlertLevelWarning {
			t.Errorf("Expected Warning level (quiet site < 120 delta), got %s", first.Level)
		}
	} else {
		t.Errorf("Expected 1 alert after second spike, got %d", len(active))
	}

	// 4. Back to normal traffic: the alert is expected to clear.
	feed(spikeAt.Add(2*time.Hour), 10)
	if len(m.GetActiveAlerts()) != 0 {
		t.Error("Expected alert to clear after return to normal")
	}
}
// TestCheckPMGAnomalies_NormalSite warms up a PMG instance at 50 spam/hour,
// then feeds two consecutive samples of 210 and expects a pending entry after
// the first and a single Warning alert after the second.
func TestCheckPMGAnomalies_NormalSite(t *testing.T) {
	t.Parallel()

	m := newTestManager(t)

	// feed submits a single mail-count sample for instance "pmg2"; only the
	// SpamIn value varies between calls.
	feed := func(at time.Time, spamIn int) {
		m.checkPMGAnomalies(models.PMGInstance{
			ID:   "pmg2",
			Name: "PMG 2",
			MailCount: []models.PMGMailCountPoint{{
				Timestamp: at,
				SpamIn:    float64(spamIn),
				SpamOut:   5,
				VirusIn:   2,
				VirusOut:  0,
			}},
		}, PMGThresholdConfig{})
	}

	// 1. Warmup: 24 hourly samples at 50 spam/hour, which the test treats as
	// a "normal" site (baseline >= 40).
	const steady = 50
	warmupStart := time.Now().Add(-24 * time.Hour)
	for hour := 0; hour < 24; hour++ {
		feed(warmupStart.Add(time.Duration(hour)*time.Hour), steady)
	}

	// 2. Warning spike. Assumed normal-site rules (TODO confirm against
	// checkPMGAnomalies): WarnRatio = 1.8 (50*1.8 = 90) and
	// WarnDelta = baseline + 150 (= 200); both must be exceeded, so 210.
	const spike = 210
	spikeAt := time.Now()
	feed(spikeAt, spike) // first spike: pending only

	m.mu.Lock()
	_, pending := m.pendingAlerts["pmg-anomaly-pmg2-spamIn"]
	m.mu.Unlock()

	if !pending {
		t.Error("Expected pending alert for first spike")
	}

	feed(spikeAt.Add(1*time.Hour), spike) // second spike: confirms the alert

	if active := m.GetActiveAlerts(); len(active) == 1 {
		if level := active[0].Level; level != AlertLevelWarning {
			t.Errorf("Expected Warning level, got %s", level)
		}
	} else {
		t.Errorf("Expected 1 alert, got %d", len(active))
	}
}
// TestCheckPMGAnomalies_NormalSite_Critical warms up a PMG instance at
// 50 spam/hour, then feeds two consecutive samples of 360 and expects a
// single Critical alert.
func TestCheckPMGAnomalies_NormalSite_Critical(t *testing.T) {
	t.Parallel()

	m := newTestManager(t)

	// feed submits a single mail-count sample for instance "pmg3"; only the
	// SpamIn value varies between calls.
	feed := func(at time.Time, spamIn int) {
		m.checkPMGAnomalies(models.PMGInstance{
			ID:   "pmg3",
			Name: "PMG 3",
			MailCount: []models.PMGMailCountPoint{{
				Timestamp: at,
				SpamIn:    float64(spamIn),
				SpamOut:   5,
				VirusIn:   2,
				VirusOut:  0,
			}},
		}, PMGThresholdConfig{})
	}

	// Warmup: 24 hourly samples at the steady rate.
	const steady = 50
	warmupStart := time.Now().Add(-24 * time.Hour)
	for hour := 0; hour < 24; hour++ {
		feed(warmupStart.Add(time.Duration(hour)*time.Hour), steady)
	}

	// Assumed normal-site rules (TODO confirm against checkPMGAnomalies):
	// CritRatio = 2.5 (50*2.5 = 125) and CritDelta = baseline + 300 (= 350);
	// both must be exceeded, so 360.
	const spike = 360
	spikeAt := time.Now()
	feed(spikeAt, spike)                  // first spike: pending only
	feed(spikeAt.Add(1*time.Hour), spike) // second spike: confirms the alert

	if active := m.GetActiveAlerts(); len(active) == 1 {
		if level := active[0].Level; level != AlertLevelCritical {
			t.Errorf("Expected Critical level, got %s", level)
		}
	} else {
		t.Errorf("Expected 1 alert, got %d", len(active))
	}
}
// TestCheckPMGAnomalies_QuietSite_Critical warms up a PMG instance at
// 10 spam/hour, then feeds two consecutive samples of 140 and expects a
// single Critical alert.
func TestCheckPMGAnomalies_QuietSite_Critical(t *testing.T) {
	t.Parallel()

	m := newTestManager(t)

	// feed submits a single mail-count sample for instance "pmg1-crit"; only
	// the SpamIn value varies between calls.
	feed := func(at time.Time, spamIn int) {
		m.checkPMGAnomalies(models.PMGInstance{
			ID:   "pmg1-crit",
			Name: "PMG 1 Critical",
			MailCount: []models.PMGMailCountPoint{{
				Timestamp: at,
				SpamIn:    float64(spamIn),
				SpamOut:   5,
				VirusIn:   2,
				VirusOut:  0,
			}},
		}, PMGThresholdConfig{})
	}

	// Warmup: 24 hourly samples at a steady 10 spam/hour (baseline = 10).
	const steady = 10
	warmupStart := time.Now().Add(-24 * time.Hour)
	for hour := 0; hour < 24; hour++ {
		feed(warmupStart.Add(time.Duration(hour)*time.Hour), steady)
	}

	// Assumed quiet-site rule (baseline < 40; TODO confirm against
	// checkPMGAnomalies): CritDelta = baseline + 120 = 130, so feed 140.
	const spike = 140
	spikeAt := time.Now()
	feed(spikeAt, spike)                  // first spike: pending only
	feed(spikeAt.Add(1*time.Hour), spike) // second spike: confirms the alert

	if active := m.GetActiveAlerts(); len(active) == 1 {
		if level := active[0].Level; level != AlertLevelCritical {
			t.Errorf("Expected Critical level, got %s", level)
		}
	} else {
		t.Errorf("Expected 1 alert, got %d", len(active))
	}
}