From a86c7120cfb3894c424da422371ebb2924bf8b5b Mon Sep 17 00:00:00 2001
From: rcourtman
Date: Sun, 12 Apr 2026 22:04:10 +0100
Subject: [PATCH] Debounce recovery for poll-driven offline alerts

---
 internal/alerts/alerts.go      |  94 ++++++++++++--
 internal/alerts/alerts_test.go | 223 +++++++++++++++++++++++++++++++++
 2 files changed, 310 insertions(+), 7 deletions(-)

diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go
index c29485db9..64f1babcc 100644
--- a/internal/alerts/alerts.go
+++ b/internal/alerts/alerts.go
@@ -519,13 +519,14 @@ type Manager struct {
 	// Time threshold tracking
 	pendingAlerts map[string]time.Time // Track when thresholds were first exceeded
 	// Offline confirmation tracking
-	nodeOfflineCount      map[string]int                  // Track consecutive offline counts for nodes (legacy)
-	offlineConfirmations  map[string]int                  // Track consecutive offline counts for all resources
-	dockerOfflineCount    map[string]int                  // Track consecutive offline counts for Docker hosts
-	dockerStateConfirm    map[string]int                  // Track consecutive state confirmations for Docker containers
-	dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
-	dockerLastExitCode    map[string]int                  // Track last exit code for OOM detection
-	dockerUpdateFirstSeen map[string]time.Time            // Track when image updates were first detected for alert delay
+	nodeOfflineCount             map[string]int                  // Track consecutive offline counts for nodes (legacy)
+	offlineConfirmations         map[string]int                  // Track consecutive offline counts for all resources
+	offlineRecoveryConfirmations map[string]int                  // Track consecutive healthy confirmations before clearing poll-driven offline alerts
+	dockerOfflineCount           map[string]int                  // Track consecutive offline counts for Docker hosts
+	dockerStateConfirm           map[string]int                  // Track consecutive state confirmations for Docker containers
+	dockerRestartTracking        map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
+	dockerLastExitCode           map[string]int                  // Track last exit code for OOM detection
+	dockerUpdateFirstSeen        map[string]time.Time            // Track when image updates were first detected for alert delay
 	// Stable identity tracking prevents update-delay resets when host IDs churn.
 	dockerUpdateFirstSeenByIdentity map[string]time.Time
 	// PMG quarantine growth tracking
@@ -591,6 +592,7 @@ func NewManagerWithDataDir(dataDir string) *Manager {
 		pendingAlerts:                   make(map[string]time.Time),
 		nodeOfflineCount:                make(map[string]int),
 		offlineConfirmations:            make(map[string]int),
+		offlineRecoveryConfirmations:    make(map[string]int),
 		dockerOfflineCount:              make(map[string]int),
 		dockerStateConfirm:              make(map[string]int),
 		dockerRestartTracking:           make(map[string]*dockerRestartRecord),
@@ -7496,6 +7498,7 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
 		m.historyManager.UpdateAlertLastSeen(alertID, alert.LastSeen)
 	}
 	delete(m.activeAlerts, alertID)
+	delete(m.offlineRecoveryConfirmations, alertID)
 	// NOTE: Don't delete ackState here - preserve it so if the same alert
 	// reappears (e.g., powered-off VM during backup), the acknowledgement
 	// is restored via preserveAlertState. ackState is cleaned up in Cleanup().
@@ -7506,6 +7509,22 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
 	}
 }
 
+func (m *Manager) confirmOfflineRecoveryNoLock(alertID string, required int) (int, bool) {
+	if required <= 1 {
+		delete(m.offlineRecoveryConfirmations, alertID)
+		return required, true
+	}
+
+	m.offlineRecoveryConfirmations[alertID]++
+	confirmations := m.offlineRecoveryConfirmations[alertID]
+	if confirmations < required {
+		return confirmations, false
+	}
+
+	delete(m.offlineRecoveryConfirmations, alertID)
+	return confirmations, true
+}
+
 // GetActiveAlerts returns all active alerts
 func (m *Manager) GetActiveAlerts() []Alert {
 	m.mu.RLock()
@@ -7628,6 +7647,7 @@ func (m *Manager) checkNodeOffline(node models.Node) {
 	defer m.mu.Unlock()
 
 	// Check if node connectivity alerts are disabled
+	delete(m.offlineRecoveryConfirmations, alertID)
 	if override, exists := m.config.Overrides[node.ID]; exists && override.DisableConnectivity {
 		// Node connectivity alerts are disabled, clear any existing alert and return
 		if _, alertExists := m.activeAlerts[alertID]; alertExists {
@@ -7734,6 +7754,18 @@ func (m *Manager) clearNodeOfflineAlert(node models.Node) {
 	// Check if offline alert exists
 	alert, exists := m.activeAlerts[alertID]
 	if !exists {
+		delete(m.offlineRecoveryConfirmations, alertID)
+		return
+	}
+
+	const requiredRecoveryCount = 3
+	recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
+	if !confirmed {
+		log.Debug().
+			Str("node", node.Name).
+			Int("confirmations", recoveryCount).
+			Int("required", requiredRecoveryCount).
+			Msg("Node appears back online, waiting for recovery confirmation")
 		return
 	}
 
@@ -7765,6 +7797,7 @@ func (m *Manager) checkPBSOffline(pbs models.PBSInstance) {
 	defer m.mu.Unlock()
 
 	// Check if PBS offline alerts are disabled via disableConnectivity flag
+	delete(m.offlineRecoveryConfirmations, alertID)
 	if override, exists := m.config.Overrides[pbs.ID]; exists && (override.Disabled || override.DisableConnectivity) {
 		// PBS connectivity alerts are disabled, clear any existing alert and return
 		if _, alertExists := m.activeAlerts[alertID]; alertExists {
@@ -7852,6 +7885,18 @@ func (m *Manager) clearPBSOfflineAlert(pbs models.PBSInstance) {
 	// Check if offline alert exists
 	alert, exists := m.activeAlerts[alertID]
 	if !exists {
+		delete(m.offlineRecoveryConfirmations, alertID)
+		return
+	}
+
+	const requiredRecoveryCount = 3
+	recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
+	if !confirmed {
+		log.Debug().
+			Str("pbs", pbs.Name).
+			Int("confirmations", recoveryCount).
+			Int("required", requiredRecoveryCount).
+			Msg("PBS appears back online, waiting for recovery confirmation")
 		return
 	}
 
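(Reviewer aside, not part of the patch: the debounce semantics that confirmOfflineRecoveryNoLock adds can be read in isolation. The sketch below strips away the Manager, its mutex, and the zerolog calls; the recoveryDebouncer type, file name, and standalone shape are illustrative assumptions, but the increment/threshold/reset behaviour mirrors the hunks above: healthy polls accumulate per alert ID, progress is discarded whenever the resource is seen offline again or the alert is removed, and a threshold of 1 or less confirms immediately.)

// sketch_recovery_debounce.go - illustrative only, not part of the patch.
package main

import "fmt"

// recoveryDebouncer mirrors the offlineRecoveryConfirmations map plus
// confirmOfflineRecoveryNoLock, without the Manager or its locking.
type recoveryDebouncer struct {
	counts map[string]int
}

// confirm returns true once `required` consecutive healthy polls have been
// recorded for alertID, resetting the counter at that point. A required
// value of 1 or less confirms immediately, matching the fast path above.
func (d *recoveryDebouncer) confirm(alertID string, required int) bool {
	if required <= 1 {
		delete(d.counts, alertID)
		return true
	}
	d.counts[alertID]++
	if d.counts[alertID] < required {
		return false
	}
	delete(d.counts, alertID)
	return true
}

// reset models what the checkXxxOffline paths do when a resource is seen
// offline again: partial recovery progress is discarded.
func (d *recoveryDebouncer) reset(alertID string) {
	delete(d.counts, alertID)
}

func main() {
	d := &recoveryDebouncer{counts: map[string]int{}}
	fmt.Println(d.confirm("node-offline-node1", 3)) // false (1 of 3)
	fmt.Println(d.confirm("node-offline-node1", 3)) // false (2 of 3)
	d.reset("node-offline-node1")                   // node flapped back offline
	fmt.Println(d.confirm("node-offline-node1", 3)) // false (1 of 3 again)
	fmt.Println(d.confirm("node-offline-node1", 3)) // false (2 of 3)
	fmt.Println(d.confirm("node-offline-node1", 3)) // true - alert would resolve here
}

The clear paths in this patch feed the threshold as requiredRecoveryCount: 3 for nodes, PBS, and PMG, and 2 for storage, so a single healthy poll can no longer resolve a poll-driven offline alert.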
+ Msg("PBS appears back online, waiting for recovery confirmation") return } @@ -7883,6 +7928,7 @@ func (m *Manager) checkPMGOffline(pmg models.PMGInstance) { defer m.mu.Unlock() // Check if PMG offline alerts are disabled via disableConnectivity flag + delete(m.offlineRecoveryConfirmations, alertID) if override, exists := m.config.Overrides[pmg.ID]; exists && (override.Disabled || override.DisableConnectivity) { // PMG connectivity alerts are disabled, clear any existing alert and return if _, alertExists := m.activeAlerts[alertID]; alertExists { @@ -7970,6 +8016,18 @@ func (m *Manager) clearPMGOfflineAlert(pmg models.PMGInstance) { // Check if offline alert exists alert, exists := m.activeAlerts[alertID] if !exists { + delete(m.offlineRecoveryConfirmations, alertID) + return + } + + const requiredRecoveryCount = 3 + recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount) + if !confirmed { + log.Debug(). + Str("pmg", pmg.Name). + Int("confirmations", recoveryCount). + Int("required", requiredRecoveryCount). + Msg("PMG appears back online, waiting for recovery confirmation") return } @@ -9009,6 +9067,7 @@ func (m *Manager) checkStorageOffline(storage models.Storage) { defer m.mu.Unlock() // Check if storage offline alerts are disabled + delete(m.offlineRecoveryConfirmations, alertID) if override, exists, _ := findStorageOverride(m.config.Overrides, storage); exists && override.Disabled { // Storage alerts are disabled, clear any existing alert and return if _, alertExists := m.activeAlerts[alertID]; alertExists { @@ -9097,6 +9156,18 @@ func (m *Manager) clearStorageOfflineAlert(storage models.Storage) { // Check if offline alert exists alert, exists := m.activeAlerts[alertID] if !exists { + delete(m.offlineRecoveryConfirmations, alertID) + return + } + + const requiredRecoveryCount = 2 + recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount) + if !confirmed { + log.Debug(). + Str("storage", storage.Name). + Int("confirmations", recoveryCount). + Int("required", requiredRecoveryCount). 
+ Msg("Storage appears back online, waiting for recovery confirmation") return } @@ -10158,6 +10229,7 @@ func (m *Manager) ClearActiveAlerts() { m.alertRateLimit = make(map[string][]time.Time) m.nodeOfflineCount = make(map[string]int) m.offlineConfirmations = make(map[string]int) + m.offlineRecoveryConfirmations = make(map[string]int) m.dockerOfflineCount = make(map[string]int) m.dockerStateConfirm = make(map[string]int) m.dockerRestartTracking = make(map[string]*dockerRestartRecord) @@ -10275,6 +10347,14 @@ func (m *Manager) cleanupStaleMaps() { } } + // Clean up recovery confirmation counts for alerts that are no longer active + for alertID := range m.offlineRecoveryConfirmations { + if _, exists := m.activeAlerts[alertID]; !exists { + delete(m.offlineRecoveryConfirmations, alertID) + cleaned++ + } + } + // Clean up node offline counts (legacy) for nodeID := range m.nodeOfflineCount { hasRelatedAlert := false diff --git a/internal/alerts/alerts_test.go b/internal/alerts/alerts_test.go index 4790a2188..e781d4c14 100644 --- a/internal/alerts/alerts_test.go +++ b/internal/alerts/alerts_test.go @@ -5183,6 +5183,20 @@ func TestClearStorageOfflineAlert(t *testing.T) { resolvedCh <- id }) + m.clearStorageOfflineAlert(storage) + m.mu.RLock() + _, alertStillActive := m.activeAlerts[alertID] + m.mu.RUnlock() + if !alertStillActive { + t.Fatal("expected alert to remain active until recovery is confirmed") + } + + select { + case <-resolvedCh: + t.Fatal("expected no resolved callback before recovery is confirmed") + default: + } + m.clearStorageOfflineAlert(storage) m.mu.RLock() @@ -8141,6 +8155,28 @@ func TestClearNodeOfflineAlert(t *testing.T) { m.mu.Unlock() node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"} + m.clearNodeOfflineAlert(node) + m.mu.RLock() + _, alertStillActive := m.activeAlerts["node-offline-node1"] + m.mu.RUnlock() + if !alertStillActive { + t.Fatal("expected alert to remain active until recovery is confirmed") + } + + select { + case <-resolvedCh: + t.Fatal("expected no resolved callback before recovery is confirmed") + default: + } + + m.clearNodeOfflineAlert(node) + m.mu.RLock() + _, alertStillActive = m.activeAlerts["node-offline-node1"] + m.mu.RUnlock() + if !alertStillActive { + t.Fatal("expected alert to remain active until final recovery confirmation") + } + m.clearNodeOfflineAlert(node) m.mu.RLock() @@ -8194,6 +8230,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) { Type: "offline", StartTime: time.Now().Add(-5 * time.Minute), } + m.offlineRecoveryConfirmations["node-offline-node1"] = 2 m.mu.Unlock() }, clearFn: func(m *Manager) { @@ -8209,6 +8246,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) { Type: "offline", StartTime: time.Now().Add(-5 * time.Minute), } + m.offlineRecoveryConfirmations["pbs-offline-pbs1"] = 2 m.mu.Unlock() }, clearFn: func(m *Manager) { @@ -8224,6 +8262,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) { Type: "offline", StartTime: time.Now().Add(-5 * time.Minute), } + m.offlineRecoveryConfirmations["pmg-offline-pmg1"] = 2 m.mu.Unlock() }, clearFn: func(m *Manager) { @@ -8239,6 +8278,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) { Type: "offline", StartTime: time.Now().Add(-5 * time.Minute), } + m.offlineRecoveryConfirmations["storage-offline-stor1"] = 1 m.mu.Unlock() }, clearFn: func(m *Manager) { @@ -8342,6 +8382,28 @@ func TestClearPBSOfflineAlert(t *testing.T) { m.mu.Unlock() pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "pbs.local"} + m.clearPBSOfflineAlert(pbs) + m.mu.RLock() + _, 
+	_, alertStillActive := m.activeAlerts["pbs-offline-pbs1"]
+	m.mu.RUnlock()
+	if !alertStillActive {
+		t.Fatal("expected alert to remain active until recovery is confirmed")
+	}
+
+	select {
+	case <-resolvedCh:
+		t.Fatal("expected no resolved callback before recovery is confirmed")
+	default:
+	}
+
+	m.clearPBSOfflineAlert(pbs)
+	m.mu.RLock()
+	_, alertStillActive = m.activeAlerts["pbs-offline-pbs1"]
+	m.mu.RUnlock()
+	if !alertStillActive {
+		t.Fatal("expected alert to remain active until final recovery confirmation")
+	}
+
 	m.clearPBSOfflineAlert(pbs)
 
 	m.mu.RLock()
@@ -8425,6 +8487,28 @@ func TestClearPMGOfflineAlert(t *testing.T) {
 	m.mu.Unlock()
 
 	pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "pmg.local"}
+	m.clearPMGOfflineAlert(pmg)
+	m.mu.RLock()
+	_, alertStillActive := m.activeAlerts["pmg-offline-pmg1"]
+	m.mu.RUnlock()
+	if !alertStillActive {
+		t.Fatal("expected alert to remain active until recovery is confirmed")
+	}
+
+	select {
+	case <-resolvedCh:
+		t.Fatal("expected no resolved callback before recovery is confirmed")
+	default:
+	}
+
+	m.clearPMGOfflineAlert(pmg)
+	m.mu.RLock()
+	_, alertStillActive = m.activeAlerts["pmg-offline-pmg1"]
+	m.mu.RUnlock()
+	if !alertStillActive {
+		t.Fatal("expected alert to remain active until final recovery confirmation")
+	}
+
 	m.clearPMGOfflineAlert(pmg)
 
 	m.mu.RLock()
@@ -13370,6 +13454,22 @@ func TestCheckNode(t *testing.T) {
 		Status:           "online",
 		ConnectionHealth: "connected",
 	}
+	m.CheckNode(node)
+	m.mu.RLock()
+	_, alertStillActive := m.activeAlerts["node-offline-node1"]
+	m.mu.RUnlock()
+	if !alertStillActive {
+		t.Fatal("expected offline alert to remain until recovery is confirmed")
+	}
+
+	m.CheckNode(node)
+	m.mu.RLock()
+	_, alertStillActive = m.activeAlerts["node-offline-node1"]
+	m.mu.RUnlock()
+	if !alertStillActive {
+		t.Fatal("expected offline alert to remain until final recovery confirmation")
+	}
+
 	m.CheckNode(node)
 
 	m.mu.RLock()
@@ -15429,6 +15529,22 @@ func TestCheckPBSComprehensive(t *testing.T) {
 
 			ConnectionHealth: "healthy",
 		}
+		m.CheckPBS(pbs)
+		m.mu.RLock()
+		_, offlineStillActive := m.activeAlerts["pbs-offline-pbs1"]
+		m.mu.RUnlock()
+		if !offlineStillActive {
+			t.Fatal("expected offline alert to remain until recovery is confirmed")
+		}
+
+		m.CheckPBS(pbs)
+		m.mu.RLock()
+		_, offlineStillActive = m.activeAlerts["pbs-offline-pbs1"]
+		m.mu.RUnlock()
+		if !offlineStillActive {
+			t.Fatal("expected offline alert to remain until final recovery confirmation")
+		}
+
 		m.CheckPBS(pbs)
 
 		m.mu.RLock()
@@ -15443,6 +15559,89 @@ func TestCheckPBSComprehensive(t *testing.T) {
 			t.Error("expected offline confirmation to be cleared")
 		}
 	})
+
+	t.Run("transient healthy poll does not re-arm offline alert notifications", func(t *testing.T) {
+		m := newTestManager(t)
+
+		alertsCh := make(chan string, 2)
+		resolvedCh := make(chan string, 1)
+
+		m.mu.Lock()
+		m.config.ActivationState = ActivationActive
+		m.offlineConfirmations["pbs1"] = 2
+		m.mu.Unlock()
+
+		m.SetAlertCallback(func(alert *Alert) {
+			if alert != nil {
+				alertsCh <- alert.ID
+			}
+		})
+		m.SetResolvedCallback(func(alertID string) {
+			resolvedCh <- alertID
+		})
+
+		offlinePBS := models.PBSInstance{
+			ID:     "pbs1",
+			Name:   "testpbs",
+			Status: "offline",
+		}
+		onlinePBS := models.PBSInstance{
+			ID:               "pbs1",
+			Name:             "testpbs",
+			Status:           "online",
+			ConnectionHealth: "healthy",
+		}
+
+		m.CheckPBS(offlinePBS)
+		select {
+		case alertID := <-alertsCh:
+			if alertID != "pbs-offline-pbs1" {
+				t.Fatalf("expected initial PBS offline notification, got %q", alertID)
+			}
+		case <-time.After(2 * time.Second):
+			t.Fatal("expected initial PBS offline notification")
+		}
+
+		m.CheckPBS(onlinePBS)
+
+		m.mu.RLock()
+		_, stillActive := m.activeAlerts["pbs-offline-pbs1"]
+		recoveryCount := m.offlineRecoveryConfirmations["pbs-offline-pbs1"]
+		m.mu.RUnlock()
+
+		if !stillActive {
+			t.Fatal("expected transient healthy poll to keep the offline alert active")
+		}
+		if recoveryCount != 1 {
+			t.Fatalf("expected recovery confirmation count 1 after transient healthy poll, got %d", recoveryCount)
+		}
+
+		select {
+		case resolvedID := <-resolvedCh:
+			t.Fatalf("expected no recovery notification from a single healthy poll, got %q", resolvedID)
+		default:
+		}
+
+		m.CheckPBS(offlinePBS)
+
+		m.mu.RLock()
+		_, stillActive = m.activeAlerts["pbs-offline-pbs1"]
+		_, recoveryTracked := m.offlineRecoveryConfirmations["pbs-offline-pbs1"]
+		m.mu.RUnlock()
+
+		if !stillActive {
+			t.Fatal("expected offline alert to remain active after connectivity drops again")
+		}
+		if recoveryTracked {
+			t.Fatal("expected transient recovery tracking to reset once PBS is offline again")
+		}
+
+		select {
+		case alertID := <-alertsCh:
+			t.Fatalf("expected no duplicate offline notification while alert stays active, got %q", alertID)
+		case <-time.After(200 * time.Millisecond):
+		}
+	})
 }
 
 func TestCheckPMGComprehensive(t *testing.T) {
@@ -15694,6 +15893,22 @@ func TestCheckPMGComprehensive(t *testing.T) {
 
 			ConnectionHealth: "healthy",
 		}
+		m.CheckPMG(pmg)
+		m.mu.RLock()
+		_, offlineStillActive := m.activeAlerts["pmg-offline-pmg1"]
+		m.mu.RUnlock()
+		if !offlineStillActive {
+			t.Fatal("expected offline alert to remain until recovery is confirmed")
+		}
+
+		m.CheckPMG(pmg)
+		m.mu.RLock()
+		_, offlineStillActive = m.activeAlerts["pmg-offline-pmg1"]
+		m.mu.RUnlock()
+		if !offlineStillActive {
+			t.Fatal("expected offline alert to remain until final recovery confirmation")
+		}
+
 		m.CheckPMG(pmg)
 
 		m.mu.RLock()
@@ -16049,6 +16264,14 @@ func TestCheckStorageComprehensive(t *testing.T) {
 
 			Status: "active",
 		}
+		m.CheckStorage(storage)
+		m.mu.RLock()
+		_, offlineStillActive := m.activeAlerts["storage-offline-storage1"]
+		m.mu.RUnlock()
+		if !offlineStillActive {
+			t.Fatal("expected offline alert to remain until recovery is confirmed")
+		}
+
 		m.CheckStorage(storage)
 
 		m.mu.RLock()
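(Reviewer aside, not part of the patch: the poll sequence the new TestCheckPBSComprehensive subtest exercises, spelled out against the hunks above. An offline poll raises pbs-offline-pbs1 and fires the alert callback once. A single healthy poll bumps offlineRecoveryConfirmations["pbs-offline-pbs1"] to 1 of 3, so the alert stays active and the resolved callback is not invoked. A further offline poll hits the delete at the top of checkPBSOffline, discarding the partial recovery progress, and because the alert never resolved, no duplicate offline notification is emitted. From that point, three consecutive healthy polls are needed to clear the alert; storage follows the same flow with a threshold of 2.)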