Debounce recovery for poll-driven offline alerts
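Poll-driven offline alerts previously resolved on the first healthy poll, so a single transient success on a flapping connection could clear an alert and immediately re-raise it, producing duplicate notifications. Recovery is now debounced: node, PBS, and PMG offline alerts clear only after three consecutive healthy polls, storage after two, and any offline poll in between resets the counter.

A minimal sketch of the counter pattern, simplified here into a standalone program for illustration (the committed code keeps the map on Manager as offlineRecoveryConfirmations and calls it with m.mu held):

package main

import "fmt"

// confirmations counts consecutive healthy polls per alert ID.
var confirmations = make(map[string]int)

// confirmRecovery records one healthy poll and reports whether alertID has
// accumulated `required` consecutive confirmations. Callers delete the key
// whenever the resource polls offline again, resetting the debounce.
func confirmRecovery(alertID string, required int) bool {
	if required <= 1 {
		delete(confirmations, alertID) // no debounce requested: resolve immediately
		return true
	}
	confirmations[alertID]++
	if confirmations[alertID] < required {
		return false // alert stays active; wait for more healthy polls
	}
	delete(confirmations, alertID) // confirmed: reset for the next offline episode
	return true
}

func main() {
	for poll := 1; poll <= 3; poll++ {
		fmt.Printf("healthy poll %d -> resolved: %v\n", poll, confirmRecovery("node-offline-node1", 3))
	}
	// healthy poll 1 -> resolved: false
	// healthy poll 2 -> resolved: false
	// healthy poll 3 -> resolved: true
}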

rcourtman 2026-04-12 22:04:10 +01:00
parent 005f64182f
commit a86c7120cf
2 changed files with 310 additions and 7 deletions

@@ -519,13 +519,14 @@ type Manager struct {
// Time threshold tracking
pendingAlerts map[string]time.Time // Track when thresholds were first exceeded
// Offline confirmation tracking
-	nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy)
-	offlineConfirmations map[string]int // Track consecutive offline counts for all resources
-	dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts
-	dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers
-	dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
-	dockerLastExitCode map[string]int // Track last exit code for OOM detection
-	dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay
+	nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy)
+	offlineConfirmations map[string]int // Track consecutive offline counts for all resources
+	offlineRecoveryConfirmations map[string]int // Track consecutive healthy confirmations before clearing poll-driven offline alerts
+	dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts
+	dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers
+	dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
+	dockerLastExitCode map[string]int // Track last exit code for OOM detection
+	dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay
// Stable identity tracking prevents update-delay resets when host IDs churn.
dockerUpdateFirstSeenByIdentity map[string]time.Time
// PMG quarantine growth tracking
@@ -591,6 +592,7 @@ func NewManagerWithDataDir(dataDir string) *Manager {
pendingAlerts: make(map[string]time.Time),
nodeOfflineCount: make(map[string]int),
offlineConfirmations: make(map[string]int),
offlineRecoveryConfirmations: make(map[string]int),
dockerOfflineCount: make(map[string]int),
dockerStateConfirm: make(map[string]int),
dockerRestartTracking: make(map[string]*dockerRestartRecord),
@@ -7496,6 +7498,7 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
m.historyManager.UpdateAlertLastSeen(alertID, alert.LastSeen)
}
delete(m.activeAlerts, alertID)
delete(m.offlineRecoveryConfirmations, alertID)
// NOTE: Don't delete ackState here - preserve it so if the same alert
// reappears (e.g., powered-off VM during backup), the acknowledgement
// is restored via preserveAlertState. ackState is cleaned up in Cleanup().
@@ -7506,6 +7509,22 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
}
}
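// confirmOfflineRecoveryNoLock records one healthy poll for alertID and reports
// the updated confirmation count plus whether it has reached required, clearing
// the counter once recovery is confirmed. Callers must hold m.mu.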
func (m *Manager) confirmOfflineRecoveryNoLock(alertID string, required int) (int, bool) {
if required <= 1 {
delete(m.offlineRecoveryConfirmations, alertID)
return required, true
}
m.offlineRecoveryConfirmations[alertID]++
confirmations := m.offlineRecoveryConfirmations[alertID]
if confirmations < required {
return confirmations, false
}
delete(m.offlineRecoveryConfirmations, alertID)
return confirmations, true
}
// GetActiveAlerts returns all active alerts
func (m *Manager) GetActiveAlerts() []Alert {
m.mu.RLock()
@@ -7628,6 +7647,7 @@ func (m *Manager) checkNodeOffline(node models.Node) {
defer m.mu.Unlock()
// Check if node connectivity alerts are disabled
delete(m.offlineRecoveryConfirmations, alertID)
if override, exists := m.config.Overrides[node.ID]; exists && override.DisableConnectivity {
// Node connectivity alerts are disabled, clear any existing alert and return
if _, alertExists := m.activeAlerts[alertID]; alertExists {
@@ -7734,6 +7754,18 @@ func (m *Manager) clearNodeOfflineAlert(node models.Node) {
// Check if offline alert exists
alert, exists := m.activeAlerts[alertID]
if !exists {
delete(m.offlineRecoveryConfirmations, alertID)
return
}
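	// Debounce: require three consecutive healthy polls before resolving.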
const requiredRecoveryCount = 3
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
if !confirmed {
log.Debug().
Str("node", node.Name).
Int("confirmations", recoveryCount).
Int("required", requiredRecoveryCount).
Msg("Node appears back online, waiting for recovery confirmation")
return
}
@@ -7765,6 +7797,7 @@ func (m *Manager) checkPBSOffline(pbs models.PBSInstance) {
defer m.mu.Unlock()
// Check if PBS offline alerts are disabled via disableConnectivity flag
delete(m.offlineRecoveryConfirmations, alertID)
if override, exists := m.config.Overrides[pbs.ID]; exists && (override.Disabled || override.DisableConnectivity) {
// PBS connectivity alerts are disabled, clear any existing alert and return
if _, alertExists := m.activeAlerts[alertID]; alertExists {
@@ -7852,6 +7885,18 @@ func (m *Manager) clearPBSOfflineAlert(pbs models.PBSInstance) {
// Check if offline alert exists
alert, exists := m.activeAlerts[alertID]
if !exists {
delete(m.offlineRecoveryConfirmations, alertID)
return
}
const requiredRecoveryCount = 3
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
if !confirmed {
log.Debug().
Str("pbs", pbs.Name).
Int("confirmations", recoveryCount).
Int("required", requiredRecoveryCount).
Msg("PBS appears back online, waiting for recovery confirmation")
return
}
@@ -7883,6 +7928,7 @@ func (m *Manager) checkPMGOffline(pmg models.PMGInstance) {
defer m.mu.Unlock()
// Check if PMG offline alerts are disabled via disableConnectivity flag
delete(m.offlineRecoveryConfirmations, alertID)
if override, exists := m.config.Overrides[pmg.ID]; exists && (override.Disabled || override.DisableConnectivity) {
// PMG connectivity alerts are disabled, clear any existing alert and return
if _, alertExists := m.activeAlerts[alertID]; alertExists {
@@ -7970,6 +8016,18 @@ func (m *Manager) clearPMGOfflineAlert(pmg models.PMGInstance) {
// Check if offline alert exists
alert, exists := m.activeAlerts[alertID]
if !exists {
delete(m.offlineRecoveryConfirmations, alertID)
return
}
const requiredRecoveryCount = 3
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
if !confirmed {
log.Debug().
Str("pmg", pmg.Name).
Int("confirmations", recoveryCount).
Int("required", requiredRecoveryCount).
Msg("PMG appears back online, waiting for recovery confirmation")
return
}
@@ -9009,6 +9067,7 @@ func (m *Manager) checkStorageOffline(storage models.Storage) {
defer m.mu.Unlock()
// Check if storage offline alerts are disabled
delete(m.offlineRecoveryConfirmations, alertID)
if override, exists, _ := findStorageOverride(m.config.Overrides, storage); exists && override.Disabled {
// Storage alerts are disabled, clear any existing alert and return
if _, alertExists := m.activeAlerts[alertID]; alertExists {
@@ -9097,6 +9156,18 @@ func (m *Manager) clearStorageOfflineAlert(storage models.Storage) {
// Check if offline alert exists
alert, exists := m.activeAlerts[alertID]
if !exists {
delete(m.offlineRecoveryConfirmations, alertID)
return
}
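	// Storage debounces with two consecutive healthy polls, one fewer than nodes, PBS, and PMG.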
const requiredRecoveryCount = 2
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
if !confirmed {
log.Debug().
Str("storage", storage.Name).
Int("confirmations", recoveryCount).
Int("required", requiredRecoveryCount).
Msg("Storage appears back online, waiting for recovery confirmation")
return
}
@@ -10158,6 +10229,7 @@ func (m *Manager) ClearActiveAlerts() {
m.alertRateLimit = make(map[string][]time.Time)
m.nodeOfflineCount = make(map[string]int)
m.offlineConfirmations = make(map[string]int)
m.offlineRecoveryConfirmations = make(map[string]int)
m.dockerOfflineCount = make(map[string]int)
m.dockerStateConfirm = make(map[string]int)
m.dockerRestartTracking = make(map[string]*dockerRestartRecord)
@@ -10275,6 +10347,14 @@ func (m *Manager) cleanupStaleMaps() {
}
}
// Clean up recovery confirmation counts for alerts that are no longer active
for alertID := range m.offlineRecoveryConfirmations {
if _, exists := m.activeAlerts[alertID]; !exists {
delete(m.offlineRecoveryConfirmations, alertID)
cleaned++
}
}
// Clean up node offline counts (legacy)
for nodeID := range m.nodeOfflineCount {
hasRelatedAlert := false

@@ -5183,6 +5183,20 @@ func TestClearStorageOfflineAlert(t *testing.T) {
resolvedCh <- id
})
m.clearStorageOfflineAlert(storage)
m.mu.RLock()
_, alertStillActive := m.activeAlerts[alertID]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until recovery is confirmed")
}
select {
case <-resolvedCh:
t.Fatal("expected no resolved callback before recovery is confirmed")
default:
}
m.clearStorageOfflineAlert(storage)
m.mu.RLock()
@@ -8141,6 +8155,28 @@ func TestClearNodeOfflineAlert(t *testing.T) {
m.mu.Unlock()
node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}
m.clearNodeOfflineAlert(node)
m.mu.RLock()
_, alertStillActive := m.activeAlerts["node-offline-node1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until recovery is confirmed")
}
select {
case <-resolvedCh:
t.Fatal("expected no resolved callback before recovery is confirmed")
default:
}
m.clearNodeOfflineAlert(node)
m.mu.RLock()
_, alertStillActive = m.activeAlerts["node-offline-node1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until final recovery confirmation")
}
m.clearNodeOfflineAlert(node)
m.mu.RLock()
@@ -8194,6 +8230,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
Type: "offline",
StartTime: time.Now().Add(-5 * time.Minute),
}
m.offlineRecoveryConfirmations["node-offline-node1"] = 2
m.mu.Unlock()
},
clearFn: func(m *Manager) {
@@ -8209,6 +8246,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
Type: "offline",
StartTime: time.Now().Add(-5 * time.Minute),
}
m.offlineRecoveryConfirmations["pbs-offline-pbs1"] = 2
m.mu.Unlock()
},
clearFn: func(m *Manager) {
@@ -8224,6 +8262,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
Type: "offline",
StartTime: time.Now().Add(-5 * time.Minute),
}
m.offlineRecoveryConfirmations["pmg-offline-pmg1"] = 2
m.mu.Unlock()
},
clearFn: func(m *Manager) {
@@ -8239,6 +8278,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
Type: "offline",
StartTime: time.Now().Add(-5 * time.Minute),
}
m.offlineRecoveryConfirmations["storage-offline-stor1"] = 1
m.mu.Unlock()
},
clearFn: func(m *Manager) {
@@ -8342,6 +8382,28 @@ func TestClearPBSOfflineAlert(t *testing.T) {
m.mu.Unlock()
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "pbs.local"}
m.clearPBSOfflineAlert(pbs)
m.mu.RLock()
_, alertStillActive := m.activeAlerts["pbs-offline-pbs1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until recovery is confirmed")
}
select {
case <-resolvedCh:
t.Fatal("expected no resolved callback before recovery is confirmed")
default:
}
m.clearPBSOfflineAlert(pbs)
m.mu.RLock()
_, alertStillActive = m.activeAlerts["pbs-offline-pbs1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until final recovery confirmation")
}
m.clearPBSOfflineAlert(pbs)
m.mu.RLock()
@@ -8425,6 +8487,28 @@ func TestClearPMGOfflineAlert(t *testing.T) {
m.mu.Unlock()
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "pmg.local"}
m.clearPMGOfflineAlert(pmg)
m.mu.RLock()
_, alertStillActive := m.activeAlerts["pmg-offline-pmg1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until recovery is confirmed")
}
select {
case <-resolvedCh:
t.Fatal("expected no resolved callback before recovery is confirmed")
default:
}
m.clearPMGOfflineAlert(pmg)
m.mu.RLock()
_, alertStillActive = m.activeAlerts["pmg-offline-pmg1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until final recovery confirmation")
}
m.clearPMGOfflineAlert(pmg)
m.mu.RLock()
@@ -13370,6 +13454,22 @@ func TestCheckNode(t *testing.T) {
Status: "online",
ConnectionHealth: "connected",
}
m.CheckNode(node)
m.mu.RLock()
_, alertStillActive := m.activeAlerts["node-offline-node1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected offline alert to remain until recovery is confirmed")
}
m.CheckNode(node)
m.mu.RLock()
_, alertStillActive = m.activeAlerts["node-offline-node1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected offline alert to remain until final recovery confirmation")
}
m.CheckNode(node)
m.mu.RLock()
@@ -15429,6 +15529,22 @@ func TestCheckPBSComprehensive(t *testing.T) {
ConnectionHealth: "healthy",
}
m.CheckPBS(pbs)
m.mu.RLock()
_, offlineStillActive := m.activeAlerts["pbs-offline-pbs1"]
m.mu.RUnlock()
if !offlineStillActive {
t.Fatal("expected offline alert to remain until recovery is confirmed")
}
m.CheckPBS(pbs)
m.mu.RLock()
_, offlineStillActive = m.activeAlerts["pbs-offline-pbs1"]
m.mu.RUnlock()
if !offlineStillActive {
t.Fatal("expected offline alert to remain until final recovery confirmation")
}
m.CheckPBS(pbs)
m.mu.RLock()
@@ -15443,6 +15559,89 @@ func TestCheckPBSComprehensive(t *testing.T) {
t.Error("expected offline confirmation to be cleared")
}
})
t.Run("transient healthy poll does not re-arm offline alert notifications", func(t *testing.T) {
m := newTestManager(t)
alertsCh := make(chan string, 2)
resolvedCh := make(chan string, 1)
m.mu.Lock()
m.config.ActivationState = ActivationActive
m.offlineConfirmations["pbs1"] = 2
m.mu.Unlock()
m.SetAlertCallback(func(alert *Alert) {
if alert != nil {
alertsCh <- alert.ID
}
})
m.SetResolvedCallback(func(alertID string) {
resolvedCh <- alertID
})
offlinePBS := models.PBSInstance{
ID: "pbs1",
Name: "testpbs",
Status: "offline",
}
onlinePBS := models.PBSInstance{
ID: "pbs1",
Name: "testpbs",
Status: "online",
ConnectionHealth: "healthy",
}
m.CheckPBS(offlinePBS)
select {
case alertID := <-alertsCh:
if alertID != "pbs-offline-pbs1" {
t.Fatalf("expected initial PBS offline notification, got %q", alertID)
}
case <-time.After(2 * time.Second):
t.Fatal("expected initial PBS offline notification")
}
m.CheckPBS(onlinePBS)
m.mu.RLock()
_, stillActive := m.activeAlerts["pbs-offline-pbs1"]
recoveryCount := m.offlineRecoveryConfirmations["pbs-offline-pbs1"]
m.mu.RUnlock()
if !stillActive {
t.Fatal("expected transient healthy poll to keep the offline alert active")
}
if recoveryCount != 1 {
t.Fatalf("expected recovery confirmation count 1 after transient healthy poll, got %d", recoveryCount)
}
select {
case resolvedID := <-resolvedCh:
t.Fatalf("expected no recovery notification from a single healthy poll, got %q", resolvedID)
default:
}
m.CheckPBS(offlinePBS)
m.mu.RLock()
_, stillActive = m.activeAlerts["pbs-offline-pbs1"]
_, recoveryTracked := m.offlineRecoveryConfirmations["pbs-offline-pbs1"]
m.mu.RUnlock()
if !stillActive {
t.Fatal("expected offline alert to remain active after connectivity drops again")
}
if recoveryTracked {
t.Fatal("expected transient recovery tracking to reset once PBS is offline again")
}
select {
case alertID := <-alertsCh:
t.Fatalf("expected no duplicate offline notification while alert stays active, got %q", alertID)
case <-time.After(200 * time.Millisecond):
}
})
}
func TestCheckPMGComprehensive(t *testing.T) {
@@ -15694,6 +15893,22 @@ func TestCheckPMGComprehensive(t *testing.T) {
ConnectionHealth: "healthy",
}
m.CheckPMG(pmg)
m.mu.RLock()
_, offlineStillActive := m.activeAlerts["pmg-offline-pmg1"]
m.mu.RUnlock()
if !offlineStillActive {
t.Fatal("expected offline alert to remain until recovery is confirmed")
}
m.CheckPMG(pmg)
m.mu.RLock()
_, offlineStillActive = m.activeAlerts["pmg-offline-pmg1"]
m.mu.RUnlock()
if !offlineStillActive {
t.Fatal("expected offline alert to remain until final recovery confirmation")
}
m.CheckPMG(pmg)
m.mu.RLock()
@@ -16049,6 +16264,14 @@ func TestCheckStorageComprehensive(t *testing.T) {
Status: "active",
}
m.CheckStorage(storage)
m.mu.RLock()
_, offlineStillActive := m.activeAlerts["storage-offline-storage1"]
m.mu.RUnlock()
if !offlineStillActive {
t.Fatal("expected offline alert to remain until recovery is confirmed")
}
m.CheckStorage(storage)
m.mu.RLock()