Debounce recovery for poll-driven offline alerts
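Poll-driven offline alerts previously resolved on the first healthy poll, so a single transient success on a flapping connection could clear an alert and immediately re-raise it, producing duplicate notifications. Recovery is now debounced: node, PBS, and PMG offline alerts clear only after three consecutive healthy polls, storage after two, and any offline poll in between resets the counter.

A minimal sketch of the counter pattern, simplified here into a standalone program for illustration (the committed code keeps the map on Manager as offlineRecoveryConfirmations and calls it with m.mu held):

package main

import "fmt"

// confirmations counts consecutive healthy polls per alert ID.
var confirmations = make(map[string]int)

// confirmRecovery records one healthy poll and reports whether alertID has
// accumulated `required` consecutive confirmations. Callers delete the key
// whenever the resource polls offline again, resetting the debounce.
func confirmRecovery(alertID string, required int) bool {
	if required <= 1 {
		delete(confirmations, alertID) // no debounce requested: resolve immediately
		return true
	}
	confirmations[alertID]++
	if confirmations[alertID] < required {
		return false // alert stays active; wait for more healthy polls
	}
	delete(confirmations, alertID) // confirmed: reset for the next offline episode
	return true
}

func main() {
	for poll := 1; poll <= 3; poll++ {
		fmt.Printf("healthy poll %d -> resolved: %v\n", poll, confirmRecovery("node-offline-node1", 3))
	}
	// healthy poll 1 -> resolved: false
	// healthy poll 2 -> resolved: false
	// healthy poll 3 -> resolved: true
}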

rcourtman 2026-04-12 22:04:10 +01:00
parent 005f64182f
commit a86c7120cf
2 changed files with 310 additions and 7 deletions

@@ -519,13 +519,14 @@ type Manager struct {
// Time threshold tracking
pendingAlerts map[string]time.Time // Track when thresholds were first exceeded
// Offline confirmation tracking
-	nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy)
-	offlineConfirmations map[string]int // Track consecutive offline counts for all resources
-	dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts
-	dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers
-	dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
-	dockerLastExitCode map[string]int // Track last exit code for OOM detection
-	dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay
+	nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy)
+	offlineConfirmations map[string]int // Track consecutive offline counts for all resources
+	offlineRecoveryConfirmations map[string]int // Track consecutive healthy confirmations before clearing poll-driven offline alerts
+	dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts
+	dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers
+	dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
+	dockerLastExitCode map[string]int // Track last exit code for OOM detection
+	dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay
// Stable identity tracking prevents update-delay resets when host IDs churn.
dockerUpdateFirstSeenByIdentity map[string]time.Time
// PMG quarantine growth tracking
@@ -591,6 +592,7 @@ func NewManagerWithDataDir(dataDir string) *Manager {
pendingAlerts: make(map[string]time.Time),
nodeOfflineCount: make(map[string]int),
offlineConfirmations: make(map[string]int),
offlineRecoveryConfirmations: make(map[string]int),
dockerOfflineCount: make(map[string]int),
dockerStateConfirm: make(map[string]int),
dockerRestartTracking: make(map[string]*dockerRestartRecord),
@@ -7496,6 +7498,7 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
m.historyManager.UpdateAlertLastSeen(alertID, alert.LastSeen)
}
delete(m.activeAlerts, alertID)
delete(m.offlineRecoveryConfirmations, alertID)
// NOTE: Don't delete ackState here - preserve it so if the same alert
// reappears (e.g., powered-off VM during backup), the acknowledgement
// is restored via preserveAlertState. ackState is cleaned up in Cleanup().
@@ -7506,6 +7509,22 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
}
}
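// confirmOfflineRecoveryNoLock records one healthy poll for alertID and reports
// the updated confirmation count plus whether it has reached required, clearing
// the counter once recovery is confirmed. Callers must hold m.mu.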
func (m *Manager) confirmOfflineRecoveryNoLock(alertID string, required int) (int, bool) {
if required <= 1 {
delete(m.offlineRecoveryConfirmations, alertID)
return required, true
}
m.offlineRecoveryConfirmations[alertID]++
confirmations := m.offlineRecoveryConfirmations[alertID]
if confirmations < required {
return confirmations, false
}
delete(m.offlineRecoveryConfirmations, alertID)
return confirmations, true
}
// GetActiveAlerts returns all active alerts
func (m *Manager) GetActiveAlerts() []Alert {
m.mu.RLock()
@@ -7628,6 +7647,7 @@ func (m *Manager) checkNodeOffline(node models.Node) {
defer m.mu.Unlock()
// Check if node connectivity alerts are disabled
delete(m.offlineRecoveryConfirmations, alertID)
if override, exists := m.config.Overrides[node.ID]; exists && override.DisableConnectivity {
// Node connectivity alerts are disabled, clear any existing alert and return
if _, alertExists := m.activeAlerts[alertID]; alertExists {
@@ -7734,6 +7754,18 @@ func (m *Manager) clearNodeOfflineAlert(node models.Node) {
// Check if offline alert exists
alert, exists := m.activeAlerts[alertID]
if !exists {
delete(m.offlineRecoveryConfirmations, alertID)
return
}
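	// Debounce: require three consecutive healthy polls before resolving.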
const requiredRecoveryCount = 3
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
if !confirmed {
log.Debug().
Str("node", node.Name).
Int("confirmations", recoveryCount).
Int("required", requiredRecoveryCount).
Msg("Node appears back online, waiting for recovery confirmation")
return
}
@@ -7765,6 +7797,7 @@ func (m *Manager) checkPBSOffline(pbs models.PBSInstance) {
defer m.mu.Unlock()
// Check if PBS offline alerts are disabled via disableConnectivity flag
delete(m.offlineRecoveryConfirmations, alertID)
if override, exists := m.config.Overrides[pbs.ID]; exists && (override.Disabled || override.DisableConnectivity) {
// PBS connectivity alerts are disabled, clear any existing alert and return
if _, alertExists := m.activeAlerts[alertID]; alertExists {
@@ -7852,6 +7885,18 @@ func (m *Manager) clearPBSOfflineAlert(pbs models.PBSInstance) {
// Check if offline alert exists
alert, exists := m.activeAlerts[alertID]
if !exists {
delete(m.offlineRecoveryConfirmations, alertID)
return
}
const requiredRecoveryCount = 3
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
if !confirmed {
log.Debug().
Str("pbs", pbs.Name).
Int("confirmations", recoveryCount).
Int("required", requiredRecoveryCount).
Msg("PBS appears back online, waiting for recovery confirmation")
return
}
@@ -7883,6 +7928,7 @@ func (m *Manager) checkPMGOffline(pmg models.PMGInstance) {
defer m.mu.Unlock()
// Check if PMG offline alerts are disabled via disableConnectivity flag
delete(m.offlineRecoveryConfirmations, alertID)
if override, exists := m.config.Overrides[pmg.ID]; exists && (override.Disabled || override.DisableConnectivity) {
// PMG connectivity alerts are disabled, clear any existing alert and return
if _, alertExists := m.activeAlerts[alertID]; alertExists {
@@ -7970,6 +8016,18 @@ func (m *Manager) clearPMGOfflineAlert(pmg models.PMGInstance) {
// Check if offline alert exists
alert, exists := m.activeAlerts[alertID]
if !exists {
delete(m.offlineRecoveryConfirmations, alertID)
return
}
const requiredRecoveryCount = 3
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
if !confirmed {
log.Debug().
Str("pmg", pmg.Name).
Int("confirmations", recoveryCount).
Int("required", requiredRecoveryCount).
Msg("PMG appears back online, waiting for recovery confirmation")
return
}
@@ -9009,6 +9067,7 @@ func (m *Manager) checkStorageOffline(storage models.Storage) {
defer m.mu.Unlock()
// Check if storage offline alerts are disabled
delete(m.offlineRecoveryConfirmations, alertID)
if override, exists, _ := findStorageOverride(m.config.Overrides, storage); exists && override.Disabled {
// Storage alerts are disabled, clear any existing alert and return
if _, alertExists := m.activeAlerts[alertID]; alertExists {
@@ -9097,6 +9156,18 @@ func (m *Manager) clearStorageOfflineAlert(storage models.Storage) {
// Check if offline alert exists
alert, exists := m.activeAlerts[alertID]
if !exists {
delete(m.offlineRecoveryConfirmations, alertID)
return
}
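	// Storage debounces with two consecutive healthy polls, one fewer than nodes, PBS, and PMG.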
const requiredRecoveryCount = 2
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
if !confirmed {
log.Debug().
Str("storage", storage.Name).
Int("confirmations", recoveryCount).
Int("required", requiredRecoveryCount).
Msg("Storage appears back online, waiting for recovery confirmation")
return
}
@@ -10158,6 +10229,7 @@ func (m *Manager) ClearActiveAlerts() {
m.alertRateLimit = make(map[string][]time.Time)
m.nodeOfflineCount = make(map[string]int)
m.offlineConfirmations = make(map[string]int)
m.offlineRecoveryConfirmations = make(map[string]int)
m.dockerOfflineCount = make(map[string]int)
m.dockerStateConfirm = make(map[string]int)
m.dockerRestartTracking = make(map[string]*dockerRestartRecord)
@@ -10275,6 +10347,14 @@ func (m *Manager) cleanupStaleMaps() {
}
}
// Clean up recovery confirmation counts for alerts that are no longer active
for alertID := range m.offlineRecoveryConfirmations {
if _, exists := m.activeAlerts[alertID]; !exists {
delete(m.offlineRecoveryConfirmations, alertID)
cleaned++
}
}
// Clean up node offline counts (legacy)
for nodeID := range m.nodeOfflineCount {
hasRelatedAlert := false

@@ -5183,6 +5183,20 @@ func TestClearStorageOfflineAlert(t *testing.T) {
resolvedCh <- id
})
m.clearStorageOfflineAlert(storage)
m.mu.RLock()
_, alertStillActive := m.activeAlerts[alertID]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until recovery is confirmed")
}
select {
case <-resolvedCh:
t.Fatal("expected no resolved callback before recovery is confirmed")
default:
}
m.clearStorageOfflineAlert(storage)
m.mu.RLock()
@@ -8141,6 +8155,28 @@ func TestClearNodeOfflineAlert(t *testing.T) {
m.mu.Unlock()
node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}
m.clearNodeOfflineAlert(node)
m.mu.RLock()
_, alertStillActive := m.activeAlerts["node-offline-node1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until recovery is confirmed")
}
select {
case <-resolvedCh:
t.Fatal("expected no resolved callback before recovery is confirmed")
default:
}
m.clearNodeOfflineAlert(node)
m.mu.RLock()
_, alertStillActive = m.activeAlerts["node-offline-node1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until final recovery confirmation")
}
m.clearNodeOfflineAlert(node)
m.mu.RLock()
@@ -8194,6 +8230,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
Type: "offline",
StartTime: time.Now().Add(-5 * time.Minute),
}
m.offlineRecoveryConfirmations["node-offline-node1"] = 2
m.mu.Unlock()
},
clearFn: func(m *Manager) {
@@ -8209,6 +8246,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
Type: "offline",
StartTime: time.Now().Add(-5 * time.Minute),
}
m.offlineRecoveryConfirmations["pbs-offline-pbs1"] = 2
m.mu.Unlock()
},
clearFn: func(m *Manager) {
@@ -8224,6 +8262,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
Type: "offline",
StartTime: time.Now().Add(-5 * time.Minute),
}
m.offlineRecoveryConfirmations["pmg-offline-pmg1"] = 2
m.mu.Unlock()
},
clearFn: func(m *Manager) {
@@ -8239,6 +8278,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
Type: "offline",
StartTime: time.Now().Add(-5 * time.Minute),
}
m.offlineRecoveryConfirmations["storage-offline-stor1"] = 1
m.mu.Unlock()
},
clearFn: func(m *Manager) {
@@ -8342,6 +8382,28 @@ func TestClearPBSOfflineAlert(t *testing.T) {
m.mu.Unlock()
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "pbs.local"}
m.clearPBSOfflineAlert(pbs)
m.mu.RLock()
_, alertStillActive := m.activeAlerts["pbs-offline-pbs1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until recovery is confirmed")
}
select {
case <-resolvedCh:
t.Fatal("expected no resolved callback before recovery is confirmed")
default:
}
m.clearPBSOfflineAlert(pbs)
m.mu.RLock()
_, alertStillActive = m.activeAlerts["pbs-offline-pbs1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until final recovery confirmation")
}
m.clearPBSOfflineAlert(pbs)
m.mu.RLock()
@@ -8425,6 +8487,28 @@ func TestClearPMGOfflineAlert(t *testing.T) {
m.mu.Unlock()
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "pmg.local"}
m.clearPMGOfflineAlert(pmg)
m.mu.RLock()
_, alertStillActive := m.activeAlerts["pmg-offline-pmg1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until recovery is confirmed")
}
select {
case <-resolvedCh:
t.Fatal("expected no resolved callback before recovery is confirmed")
default:
}
m.clearPMGOfflineAlert(pmg)
m.mu.RLock()
_, alertStillActive = m.activeAlerts["pmg-offline-pmg1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected alert to remain active until final recovery confirmation")
}
m.clearPMGOfflineAlert(pmg)
m.mu.RLock()
@@ -13370,6 +13454,22 @@ func TestCheckNode(t *testing.T) {
Status: "online",
ConnectionHealth: "connected",
}
m.CheckNode(node)
m.mu.RLock()
_, alertStillActive := m.activeAlerts["node-offline-node1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected offline alert to remain until recovery is confirmed")
}
m.CheckNode(node)
m.mu.RLock()
_, alertStillActive = m.activeAlerts["node-offline-node1"]
m.mu.RUnlock()
if !alertStillActive {
t.Fatal("expected offline alert to remain until final recovery confirmation")
}
m.CheckNode(node)
m.mu.RLock()
@@ -15429,6 +15529,22 @@ func TestCheckPBSComprehensive(t *testing.T) {
ConnectionHealth: "healthy",
}
m.CheckPBS(pbs)
m.mu.RLock()
_, offlineStillActive := m.activeAlerts["pbs-offline-pbs1"]
m.mu.RUnlock()
if !offlineStillActive {
t.Fatal("expected offline alert to remain until recovery is confirmed")
}
m.CheckPBS(pbs)
m.mu.RLock()
_, offlineStillActive = m.activeAlerts["pbs-offline-pbs1"]
m.mu.RUnlock()
if !offlineStillActive {
t.Fatal("expected offline alert to remain until final recovery confirmation")
}
m.CheckPBS(pbs)
m.mu.RLock()
@@ -15443,6 +15559,89 @@ func TestCheckPBSComprehensive(t *testing.T) {
t.Error("expected offline confirmation to be cleared")
}
})
t.Run("transient healthy poll does not re-arm offline alert notifications", func(t *testing.T) {
m := newTestManager(t)
alertsCh := make(chan string, 2)
resolvedCh := make(chan string, 1)
m.mu.Lock()
m.config.ActivationState = ActivationActive
m.offlineConfirmations["pbs1"] = 2
m.mu.Unlock()
m.SetAlertCallback(func(alert *Alert) {
if alert != nil {
alertsCh <- alert.ID
}
})
m.SetResolvedCallback(func(alertID string) {
resolvedCh <- alertID
})
offlinePBS := models.PBSInstance{
ID: "pbs1",
Name: "testpbs",
Status: "offline",
}
onlinePBS := models.PBSInstance{
ID: "pbs1",
Name: "testpbs",
Status: "online",
ConnectionHealth: "healthy",
}
m.CheckPBS(offlinePBS)
select {
case alertID := <-alertsCh:
if alertID != "pbs-offline-pbs1" {
t.Fatalf("expected initial PBS offline notification, got %q", alertID)
}
case <-time.After(2 * time.Second):
t.Fatal("expected initial PBS offline notification")
}
m.CheckPBS(onlinePBS)
m.mu.RLock()
_, stillActive := m.activeAlerts["pbs-offline-pbs1"]
recoveryCount := m.offlineRecoveryConfirmations["pbs-offline-pbs1"]
m.mu.RUnlock()
if !stillActive {
t.Fatal("expected transient healthy poll to keep the offline alert active")
}
if recoveryCount != 1 {
t.Fatalf("expected recovery confirmation count 1 after transient healthy poll, got %d", recoveryCount)
}
select {
case resolvedID := <-resolvedCh:
t.Fatalf("expected no recovery notification from a single healthy poll, got %q", resolvedID)
default:
}
m.CheckPBS(offlinePBS)
m.mu.RLock()
_, stillActive = m.activeAlerts["pbs-offline-pbs1"]
_, recoveryTracked := m.offlineRecoveryConfirmations["pbs-offline-pbs1"]
m.mu.RUnlock()
if !stillActive {
t.Fatal("expected offline alert to remain active after connectivity drops again")
}
if recoveryTracked {
t.Fatal("expected transient recovery tracking to reset once PBS is offline again")
}
select {
case alertID := <-alertsCh:
t.Fatalf("expected no duplicate offline notification while alert stays active, got %q", alertID)
case <-time.After(200 * time.Millisecond):
}
})
}
func TestCheckPMGComprehensive(t *testing.T) {
@@ -15694,6 +15893,22 @@ func TestCheckPMGComprehensive(t *testing.T) {
ConnectionHealth: "healthy",
}
m.CheckPMG(pmg)
m.mu.RLock()
_, offlineStillActive := m.activeAlerts["pmg-offline-pmg1"]
m.mu.RUnlock()
if !offlineStillActive {
t.Fatal("expected offline alert to remain until recovery is confirmed")
}
m.CheckPMG(pmg)
m.mu.RLock()
_, offlineStillActive = m.activeAlerts["pmg-offline-pmg1"]
m.mu.RUnlock()
if !offlineStillActive {
t.Fatal("expected offline alert to remain until final recovery confirmation")
}
m.CheckPMG(pmg)
m.mu.RLock()
@@ -16049,6 +16264,14 @@ func TestCheckStorageComprehensive(t *testing.T) {
Status: "active",
}
m.CheckStorage(storage)
m.mu.RLock()
_, offlineStillActive := m.activeAlerts["storage-offline-storage1"]
m.mu.RUnlock()
if !offlineStillActive {
t.Fatal("expected offline alert to remain until recovery is confirmed")
}
m.CheckStorage(storage)
m.mu.RLock()