diff --git a/internal/alerts/cleanup_test.go b/internal/alerts/cleanup_test.go
new file mode 100644
index 000000000..5e6dea3ed
--- /dev/null
+++ b/internal/alerts/cleanup_test.go
@@ -0,0 +1,95 @@
+package alerts
+
+import (
+	"testing"
+	"time"
+)
+
+// TestCleanupStaleMaps seeds the manager's flapping, suppression and
+// pending-alert maps with stale, recent and active-alert-backed entries, runs
+// cleanupStaleMaps, and checks that only the stale, expired or empty entries
+// without a matching active alert were removed.
+func TestCleanupStaleMaps(t *testing.T) {
+	t.Parallel()
+	m := newTestManager(t)
+
+	// oldTime marks entries expected to be purged; recentTime marks survivors.
+	oldTime := time.Now().Add(-25 * time.Hour)
+	recentTime := time.Now().Add(-1 * time.Hour)
+
+	m.mu.Lock()
+	// Flapping history
+	m.flappingHistory["stale-flapping"] = []time.Time{oldTime}
+	m.flappingHistory["recent-flapping"] = []time.Time{recentTime}
+	m.flappingActive["stale-flapping"] = true
+
+	// Suppressed until
+	m.suppressedUntil["expired-suppression"] = oldTime
+	m.suppressedUntil["active-suppression"] = time.Now().Add(1 * time.Hour)
+
+	// Pending alerts
+	m.pendingAlerts["stale-pending"] = oldTime
+	m.pendingAlerts["recent-pending"] = recentTime
+
+	// Should not be cleaned if active alert exists
+	m.flappingHistory["active-alert-flapping"] = []time.Time{oldTime}
+	m.activeAlerts["active-alert-flapping"] = &Alert{ID: "active-alert-flapping"}
+
+	// Empty history should be cleaned
+	m.flappingHistory["empty-flapping"] = []time.Time{}
+
+	m.mu.Unlock()
+
+	// Run cleanup (the lock is released first; the test does not hold it here)
+	m.cleanupStaleMaps()
+
+	// Verify
+	m.mu.Lock()
+	if _, exists := m.flappingHistory["stale-flapping"]; exists {
+		t.Error("stale-flapping should get removed")
+	}
+	if _, exists := m.flappingActive["stale-flapping"]; exists {
+		t.Error("stale-flapping active flag should get removed")
+	}
+	if _, exists := m.flappingHistory["recent-flapping"]; !exists {
+		t.Error("recent-flapping should NOT get removed")
+	}
+	if _, exists := m.flappingHistory["empty-flapping"]; exists {
+		t.Error("empty-flapping should get removed")
+	}
+
+	if _, exists := m.suppressedUntil["expired-suppression"]; exists {
+		t.Error("expired-suppression should get removed")
+	}
+	if _, exists := m.suppressedUntil["active-suppression"]; !exists {
+		t.Error("active-suppression should NOT get removed")
+	}
+
+	if _, exists := m.pendingAlerts["stale-pending"]; exists {
+		t.Error("stale-pending should get removed")
+	}
+	if _, exists := m.pendingAlerts["recent-pending"]; !exists {
+		t.Error("recent-pending should NOT get removed")
+	}
+
+	if _, exists := m.flappingHistory["active-alert-flapping"]; !exists {
+		t.Error("active-alert-flapping should NOT get removed")
+	}
+	m.mu.Unlock()
+
+	// Should not be cleaned if active alert exists for pending
+	m.mu.Lock()
+	m.pendingAlerts["active-alert-pending"] = oldTime
+	m.activeAlerts["active-alert-pending"] = &Alert{ID: "active-alert-pending"}
+	m.mu.Unlock()
+
+	// Run cleanup
+	m.cleanupStaleMaps()
+
+	// Verify
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	if _, exists := m.pendingAlerts["active-alert-pending"]; !exists {
+		t.Error("active-alert-pending should NOT get removed")
+	}
+}
diff --git a/internal/alerts/filter_evaluation_test.go b/internal/alerts/filter_evaluation_test.go
index 5fe5d485c..4c9002700 100644
--- a/internal/alerts/filter_evaluation_test.go
+++ b/internal/alerts/filter_evaluation_test.go
@@ -1685,3 +1685,223 @@ func TestGetGuestThresholds(t *testing.T) {
 		}
 	})
 }
+
+// TestExtractGuestMetrics_Default checks that extractGuestMetrics reports
+// ok == false when given a value that is not a supported guest type.
+func TestExtractGuestMetrics_Default(t *testing.T) {
+	t.Parallel()
+	if _, ok := extractGuestMetrics("invalid-type"); ok {
+		t.Error("extractGuestMetrics should return false for invalid type")
+	}
+}
+
+// TestGetGuestThresholds_AllFields verifies that a matching, enabled custom
+// rule has every threshold field it sets reflected in the result of
+// getGuestThresholds.
+func TestGetGuestThresholds_AllFields(t *testing.T) {
+	t.Parallel()
+	m := NewManager()
+
+	// A single threshold is shared by every metric so one trigger value can be
+	// asserted below. It is named clearValue to avoid shadowing the clear builtin.
+	trigger := 90.0
+	clearValue := 85.0
+	threshold := &HysteresisThreshold{Trigger: trigger, Clear: clearValue}
+
+	rule := CustomAlertRule{
+		ID:       "rule-1",
+		Name:     "All Fields Rule",
+		Enabled:  true,
+		Priority: 100,
+		FilterConditions: FilterStack{
+			LogicalOperator: "AND",
+			Filters: []FilterCondition{
+				{Type: "text", Field: "name", Value: "test-guest"},
+			},
+		},
+		Thresholds: ThresholdConfig{
+			CPU:                 threshold,
+			Memory:              threshold,
+			Disk:                threshold,
+			DiskRead:            threshold,
+			DiskWrite:           threshold,
+			NetworkIn:           threshold,
+			NetworkOut:          threshold,
+			DisableConnectivity: true,
+			Backup:              &BackupAlertConfig{Enabled: true},
+			Snapshot:            &SnapshotAlertConfig{Enabled: true},
+		},
+	}
+
+	m.config.CustomRules = []CustomAlertRule{rule}
+
+	guest := models.VM{ID: "guest-1", Name: "test-guest"}
+	thresholds := m.getGuestThresholds(guest, "guest-1")
+
+	metrics := []struct {
+		name string
+		got  *HysteresisThreshold
+	}{
+		{"CPU", thresholds.CPU},
+		{"Memory", thresholds.Memory},
+		{"Disk", thresholds.Disk},
+		{"DiskRead", thresholds.DiskRead},
+		{"DiskWrite", thresholds.DiskWrite},
+		{"NetworkIn", thresholds.NetworkIn},
+		{"NetworkOut", thresholds.NetworkOut},
+	}
+	for _, metric := range metrics {
+		if metric.got == nil || metric.got.Trigger != trigger {
+			t.Errorf("%s threshold not applied", metric.name)
+		}
+	}
+	if !thresholds.DisableConnectivity {
+		t.Error("DisableConnectivity not applied")
+	}
+	if thresholds.Backup == nil || !thresholds.Backup.Enabled {
+		t.Error("Backup config not applied")
+	}
+	if thresholds.Snapshot == nil || !thresholds.Snapshot.Enabled {
+		t.Error("Snapshot config not applied")
+	}
+}
+
+// TestGetGuestThresholds_LegacyFields verifies that a matching custom rule
+// which only sets the legacy scalar fields still produces thresholds whose
+// Trigger equals the legacy value.
+func TestGetGuestThresholds_LegacyFields(t *testing.T) {
+	t.Parallel()
+	m := NewManager()
+
+	legacyValue := 95.0
+
+	ruleLegacy := CustomAlertRule{
+		ID:       "rule-legacy",
+		Name:     "Legacy Fields Rule",
+		Enabled:  true,
+		Priority: 200,
+		FilterConditions: FilterStack{
+			LogicalOperator: "AND",
+			Filters: []FilterCondition{
+				{Type: "text", Field: "name", Value: "test-guest"},
+			},
+		},
+		Thresholds: ThresholdConfig{
+			CPULegacy:        &legacyValue,
+			MemoryLegacy:     &legacyValue,
+			DiskLegacy:       &legacyValue,
+			DiskReadLegacy:   &legacyValue,
+			DiskWriteLegacy:  &legacyValue,
+			NetworkInLegacy:  &legacyValue,
+			NetworkOutLegacy: &legacyValue,
+		},
+	}
+
+	m.config.CustomRules = []CustomAlertRule{ruleLegacy}
+
+	guest := models.VM{ID: "guest-1", Name: "test-guest"}
+	thresholds := m.getGuestThresholds(guest, "guest-1")
+
+	metrics := []struct {
+		name string
+		got  *HysteresisThreshold
+	}{
+		{"CPU", thresholds.CPU},
+		{"Memory", thresholds.Memory},
+		{"Disk", thresholds.Disk},
+		{"DiskRead", thresholds.DiskRead},
+		{"DiskWrite", thresholds.DiskWrite},
+		{"NetworkIn", thresholds.NetworkIn},
+		{"NetworkOut", thresholds.NetworkOut},
+	}
+	for _, metric := range metrics {
+		if metric.got == nil || metric.got.Trigger != legacyValue {
+			t.Errorf("Legacy %s threshold not applied", metric.name)
+		}
+	}
+}
+
+// TestGetGuestThresholds_Override verifies that a per-guest override keyed by
+// the guest ID is reflected in the result of getGuestThresholds.
+func TestGetGuestThresholds_Override(t *testing.T) {
+	t.Parallel()
+	m := NewManager()
+
+	trigger := 88.0
+	threshold := &HysteresisThreshold{Trigger: trigger, Clear: trigger - 5.0}
+
+	m.config.Overrides = map[string]ThresholdConfig{
+		"guest-1": {
+			CPU:                 threshold,
+			Memory:              threshold,
+			Disk:                threshold,
+			DiskRead:            threshold,
+			DiskWrite:           threshold,
+			NetworkIn:           threshold,
+			NetworkOut:          threshold,
+			Disabled:            true,
+			DisableConnectivity: true,
+			Backup:              &BackupAlertConfig{Enabled: true},
+			Snapshot:            &SnapshotAlertConfig{Enabled: true},
+		},
+	}
+
+	guest := models.VM{ID: "guest-1", Name: "test-guest"}
+	thresholds := m.getGuestThresholds(guest, "guest-1")
+
+	// Check for nil first so a regression fails the test instead of panicking.
+	if thresholds.CPU == nil || thresholds.CPU.Trigger != trigger {
+		t.Error("Override CPU not applied")
+	}
+	if !thresholds.Disabled {
+		t.Error("Override Disabled not applied")
+	}
+	if !thresholds.DisableConnectivity {
+		t.Error("Override DisableConnectivity not applied")
+	}
+	if thresholds.Backup == nil {
+		t.Error("Override Backup not applied")
+	}
+}
+
+// TestGetGuestThresholds_OverrideLegacy verifies that a per-guest override
+// using only the legacy scalar fields yields a CPU threshold whose Trigger
+// equals the legacy value.
+func TestGetGuestThresholds_OverrideLegacy(t *testing.T) {
+	t.Parallel()
+	m := NewManager()
+
+	legacyValue := 77.0
+
+	m.config.Overrides = map[string]ThresholdConfig{
+		"guest-1": {
+			CPULegacy:        &legacyValue,
+			MemoryLegacy:     &legacyValue,
+			DiskLegacy:       &legacyValue,
+			DiskReadLegacy:   &legacyValue,
+			DiskWriteLegacy:  &legacyValue,
+			NetworkInLegacy:  &legacyValue,
+			NetworkOutLegacy: &legacyValue,
+		},
+	}
+
+	guest := models.VM{ID: "guest-1", Name: "test-guest"}
+	thresholds := m.getGuestThresholds(guest, "guest-1")
+
+	if thresholds.CPU == nil || thresholds.CPU.Trigger != legacyValue {
+		t.Error("Override Legacy CPU not applied")
+	}
+}
+
+// TestGetGuestThresholds_InvalidGuest passes a value that is not a guest
+// struct and only checks that getGuestThresholds returns without panicking.
+func TestGetGuestThresholds_InvalidGuest(t *testing.T) {
+	t.Parallel()
+	m := NewManager()
+
+	// The original author's note says the defaults contain nil pointers, so
+	// there is no field worth asserting on; reaching the end of the test is
+	// the success condition (per that note, this exercises the default case
+	// in tryLegacyOverrideMigration).
+	_ = m.getGuestThresholds("invalid-guest-struct", "guest-1")
+}
diff --git a/internal/alerts/history_test.go b/internal/alerts/history_test.go
index e089a5e38..4c53accf1 100644
--- a/internal/alerts/history_test.go
+++ b/internal/alerts/history_test.go
@@ -128,6 +128,41 @@ func TestAddAlert(t *testing.T) {
 	}
 }
 
+// TestOnAlert verifies that a callback registered through OnAlert is invoked
+// with the alert passed to AddAlert.
+//
+// NOTE(review): the assertions read called/capturedAlert without
+// synchronization, which assumes AddAlert runs the callback synchronously —
+// confirm against the HistoryManager implementation.
+func TestOnAlert(t *testing.T) {
+	t.Parallel()
+
+	hm := newTestHistoryManager(t)
+
+	called := false
+	var capturedAlert Alert
+
+	hm.OnAlert(func(alert Alert) {
+		called = true
+		capturedAlert = alert
+	})
+
+	alert := Alert{
+		ID:   "callback-test",
+		Type: "cpu",
+	}
+
+	hm.AddAlert(alert)
+
+	if !called {
+		t.Error("Callback was not called")
+	}
+
+	if capturedAlert.ID != "callback-test" {
+		t.Errorf("Callback received wrong alert ID: %s", capturedAlert.ID)
+	}
+}
+
 func TestGetHistory_WithLimit(t *testing.T) {
 	t.Parallel()
 
@@ -568,37 +603,29 @@ func TestSaveHistoryWithRetry_SingleRetry(t *testing.T) {
 	}
 }
 
-func TestSaveHistoryWithRetry_ReadOnlyDirectory(t *testing.T) {
+// TestSaveHistoryWithRetry_WriteError verifies that saveHistoryWithRetry
+// returns an error when the history file lives in a directory that does not
+// exist, without relying on directory permissions.
+func TestSaveHistoryWithRetry_WriteError(t *testing.T) {
 	t.Parallel()
 
 	tempDir := t.TempDir()
 
-	// Create subdirectory and make it read-only
-	readOnlyDir := filepath.Join(tempDir, "readonly")
-	if err := os.MkdirAll(readOnlyDir, 0755); err != nil {
-		t.Fatalf("Failed to create readonly dir: %v", err)
-	}
-
 	hm := &HistoryManager{
-		dataDir:     readOnlyDir,
-		historyFile: filepath.Join(readOnlyDir, HistoryFileName),
-		backupFile:  filepath.Join(readOnlyDir, HistoryBackupFileName),
-		history:     []HistoryEntry{{Alert: Alert{ID: "test"}, Timestamp: time.Now()}},
-		stopChan:    make(chan struct{}),
+		dataDir: tempDir,
+		// Point to a file in a non-existent subdirectory
+		// os.WriteFile does not create parent directories, so this will fail
+		historyFile:  filepath.Join(tempDir, "nonexistent_dir", HistoryFileName),
+		backupFile:   filepath.Join(tempDir, HistoryBackupFileName),
+		history:      []HistoryEntry{{Alert: Alert{ID: "test"}, Timestamp: time.Now()}},
+		saveInterval: 5 * time.Minute,
+		stopChan:     make(chan struct{}),
 	}
 
-	// Make directory read-only
-	if err := os.Chmod(readOnlyDir, 0444); err != nil {
-		t.Fatalf("Failed to make dir readonly: %v", err)
-	}
-	t.Cleanup(func() {
-		os.Chmod(readOnlyDir, 0755) // Restore for cleanup
-	})
-
 	// Should fail after retries
 	err := hm.saveHistoryWithRetry(2)
 	if err == nil {
-		t.Error("saveHistoryWithRetry should fail on read-only directory")
+		t.Error("saveHistoryWithRetry should fail when parent directory does not exist")
 	}
 }
 
diff --git a/internal/alerts/pmg_anomaly_test.go b/internal/alerts/pmg_anomaly_test.go
new file mode 100644
index 000000000..aa32608c0
--- /dev/null
+++ b/internal/alerts/pmg_anomaly_test.go
@@ -0,0 +1,188 @@
+package alerts
+
+import (
+	"testing"
+	"time"
+
+	"github.com/rcourtman/pulse-go-rewrite/internal/models"
+)
+
+// newPMGSpamFeeder returns a function that submits one mail-count sample for
+// the PMG instance identified by id and name to m.checkPMGAnomalies, using an
+// empty PMGThresholdConfig. Only SpamIn varies between calls; SpamOut,
+// VirusIn and VirusOut are held constant.
+func newPMGSpamFeeder(m *Manager, id, name string) func(timestamp time.Time, spamIn int) {
+	return func(timestamp time.Time, spamIn int) {
+		sample := models.PMGMailCountPoint{
+			Timestamp: timestamp,
+			SpamIn:    float64(spamIn),
+			SpamOut:   5,
+			VirusIn:   2,
+			VirusOut:  0,
+		}
+		pmg := models.PMGInstance{
+			ID:        id,
+			Name:      name,
+			MailCount: []models.PMGMailCountPoint{sample},
+		}
+		m.checkPMGAnomalies(pmg, PMGThresholdConfig{})
+	}
+}
+
+// warmupPMGSpamBaseline feeds 24 hourly samples of constant spamIn traffic,
+// the first one timestamped 24 hours before now, as the warm-up phase of a
+// test.
+func warmupPMGSpamBaseline(feed func(time.Time, int), spamIn int) {
+	start := time.Now().Add(-24 * time.Hour)
+	for i := 0; i < 24; i++ {
+		feed(start.Add(time.Duration(i)*time.Hour), spamIn)
+	}
+}
+
+// isPMGAnomalyPending reports whether key is present in m.pendingAlerts,
+// reading the map under m.mu.
+func isPMGAnomalyPending(m *Manager, key string) bool {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	_, pending := m.pendingAlerts[key]
+	return pending
+}
+
+// TestCheckPMGAnomalies_QuietSite walks a low-traffic instance through the
+// spamIn anomaly lifecycle: a steady warm-up, a first spike that is only
+// recorded as pending, a second spike that raises a Warning alert, and a
+// return to the warm-up level that clears it.
+func TestCheckPMGAnomalies_QuietSite(t *testing.T) {
+	t.Parallel()
+	m := newTestManager(t)
+	feedSample := newPMGSpamFeeder(m, "pmg1", "PMG 1")
+
+	// 1. Warmup: 24 samples of steady traffic (10 spam/hour).
+	warmupPMGSpamBaseline(feedSample, 10)
+	if len(m.GetActiveAlerts()) != 0 {
+		t.Error("Expected no alerts after steady traffic warmup")
+	}
+
+	// 2. Trigger pending: feed a spike of 100 spam/hour.
+	// Thresholds as this test understands them (they are defined by the code
+	// under test, not in this file): a baseline below 40 is a "quiet site" with
+	// WarnDelta = baseline + 60 and CritDelta = baseline + 120. A baseline of 10
+	// gives 70 and 130, so 100 is expected to qualify as Warning, not Critical.
+	spikeTime := time.Now()
+	feedSample(spikeTime, 100)
+
+	if !isPMGAnomalyPending(m, "pmg-anomaly-pmg1-spamIn") {
+		t.Error("Expected pending alert for first spike")
+	}
+	if len(m.GetActiveAlerts()) != 0 {
+		t.Error("Expected no active alerts for first spike")
+	}
+
+	// 3. Confirm alert: a second spike (110 is still below the 130 CritDelta).
+	feedSample(spikeTime.Add(1*time.Hour), 110)
+
+	// Errorf rather than Fatalf so the clearing step below still runs.
+	active := m.GetActiveAlerts()
+	if len(active) != 1 {
+		t.Errorf("Expected 1 alert after second spike, got %d", len(active))
+	} else {
+		if active[0].Type != "anomaly-spamIn" {
+			t.Errorf("Expected anomaly-spamIn alert, got %s", active[0].Type)
+		}
+		if active[0].Level != AlertLevelWarning {
+			t.Errorf("Expected Warning level (quiet site < 120 delta), got %s", active[0].Level)
+		}
+	}
+
+	// 4. Clear alert: return to normal (10 spam/hour).
+	feedSample(spikeTime.Add(2*time.Hour), 10)
+	if len(m.GetActiveAlerts()) != 0 {
+		t.Error("Expected alert to clear after return to normal")
+	}
+}
+
+// TestCheckPMGAnomalies_NormalSite verifies that, after a warm-up at 50
+// spam/hour, two consecutive samples of 210 first register as pending and then
+// raise a single Warning alert.
+func TestCheckPMGAnomalies_NormalSite(t *testing.T) {
+	t.Parallel()
+	m := newTestManager(t)
+	feedSample := newPMGSpamFeeder(m, "pmg2", "PMG 2")
+
+	// 1. Warmup: steady 50 spam/hour, which this test treats as a "normal"
+	// site (baseline >= 40).
+	warmupPMGSpamBaseline(feedSample, 50)
+
+	// 2. Trigger Warning. Thresholds as this test understands them:
+	// WarnRatio = 1.8 (50 * 1.8 = 90) and WarnDelta = baseline + 150 (= 200).
+	// Both must be exceeded, so the spike has to be above 200.
+	spikeTime := time.Now()
+	val := 210
+	feedSample(spikeTime, val) // Pending
+
+	if !isPMGAnomalyPending(m, "pmg-anomaly-pmg2-spamIn") {
+		t.Error("Expected pending alert for first spike")
+	}
+
+	feedSample(spikeTime.Add(1*time.Hour), val) // Alert
+
+	active := m.GetActiveAlerts()
+	if len(active) != 1 {
+		t.Fatalf("Expected 1 alert, got %d", len(active))
+	}
+	if active[0].Level != AlertLevelWarning {
+		t.Errorf("Expected Warning level, got %s", active[0].Level)
+	}
+}
+
+// TestCheckPMGAnomalies_NormalSite_Critical verifies that, after a warm-up at
+// 50 spam/hour, two consecutive samples of 360 raise a single Critical alert.
+func TestCheckPMGAnomalies_NormalSite_Critical(t *testing.T) {
+	t.Parallel()
+	m := newTestManager(t)
+	feedSample := newPMGSpamFeeder(m, "pmg3", "PMG 3")
+
+	warmupPMGSpamBaseline(feedSample, 50)
+
+	// Thresholds as this test understands them for a normal site:
+	// CritRatio = 2.5 (50 * 2.5 = 125) and CritDelta = baseline + 300 (= 350).
+	// Both must be exceeded, so the spike has to be above 350.
+	spikeTime := time.Now()
+	val := 360
+	feedSample(spikeTime, val)                  // Pending
+	feedSample(spikeTime.Add(1*time.Hour), val) // Alert
+
+	active := m.GetActiveAlerts()
+	if len(active) != 1 {
+		t.Fatalf("Expected 1 alert, got %d", len(active))
+	}
+	if active[0].Level != AlertLevelCritical {
+		t.Errorf("Expected Critical level, got %s", active[0].Level)
+	}
+}
+
+// TestCheckPMGAnomalies_QuietSite_Critical verifies that, after a warm-up at
+// 10 spam/hour, two consecutive samples of 140 raise a single Critical alert.
+func TestCheckPMGAnomalies_QuietSite_Critical(t *testing.T) {
+	t.Parallel()
+	m := newTestManager(t)
+	feedSample := newPMGSpamFeeder(m, "pmg1-crit", "PMG 1 Critical")
+
+	// Warmup: steady 10 spam/hour (baseline = 10).
+	warmupPMGSpamBaseline(feedSample, 10)
+
+	// Quiet site (baseline < 40), as this test understands the thresholds:
+	// CritDelta = baseline + 120 = 130, so 140 is expected to be Critical.
+	spikeTime := time.Now()
+	val := 140
+	feedSample(spikeTime, val)                  // Pending
+	feedSample(spikeTime.Add(1*time.Hour), val) // Alert
+
+	active := m.GetActiveAlerts()
+	if len(active) != 1 {
+		t.Fatalf("Expected 1 alert, got %d", len(active))
+	}
+	if active[0].Level != AlertLevelCritical {
+		t.Errorf("Expected Critical level, got %s", active[0].Level)
+	}
+}