mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 03:20:11 +00:00
Debounce recovery for poll-driven offline alerts
This commit is contained in:
parent
005f64182f
commit
a86c7120cf
2 changed files with 310 additions and 7 deletions
|
|
@ -519,13 +519,14 @@ type Manager struct {
|
||||||
// Time threshold tracking
|
// Time threshold tracking
|
||||||
pendingAlerts map[string]time.Time // Track when thresholds were first exceeded
|
pendingAlerts map[string]time.Time // Track when thresholds were first exceeded
|
||||||
// Offline confirmation tracking
|
// Offline confirmation tracking
|
||||||
nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy)
|
nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy)
|
||||||
offlineConfirmations map[string]int // Track consecutive offline counts for all resources
|
offlineConfirmations map[string]int // Track consecutive offline counts for all resources
|
||||||
dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts
|
offlineRecoveryConfirmations map[string]int // Track consecutive healthy confirmations before clearing poll-driven offline alerts
|
||||||
dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers
|
dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts
|
||||||
dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
|
dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers
|
||||||
dockerLastExitCode map[string]int // Track last exit code for OOM detection
|
dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
|
||||||
dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay
|
dockerLastExitCode map[string]int // Track last exit code for OOM detection
|
||||||
|
dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay
|
||||||
// Stable identity tracking prevents update-delay resets when host IDs churn.
|
// Stable identity tracking prevents update-delay resets when host IDs churn.
|
||||||
dockerUpdateFirstSeenByIdentity map[string]time.Time
|
dockerUpdateFirstSeenByIdentity map[string]time.Time
|
||||||
// PMG quarantine growth tracking
|
// PMG quarantine growth tracking
|
||||||
|
|
@ -591,6 +592,7 @@ func NewManagerWithDataDir(dataDir string) *Manager {
|
||||||
pendingAlerts: make(map[string]time.Time),
|
pendingAlerts: make(map[string]time.Time),
|
||||||
nodeOfflineCount: make(map[string]int),
|
nodeOfflineCount: make(map[string]int),
|
||||||
offlineConfirmations: make(map[string]int),
|
offlineConfirmations: make(map[string]int),
|
||||||
|
offlineRecoveryConfirmations: make(map[string]int),
|
||||||
dockerOfflineCount: make(map[string]int),
|
dockerOfflineCount: make(map[string]int),
|
||||||
dockerStateConfirm: make(map[string]int),
|
dockerStateConfirm: make(map[string]int),
|
||||||
dockerRestartTracking: make(map[string]*dockerRestartRecord),
|
dockerRestartTracking: make(map[string]*dockerRestartRecord),
|
||||||
|
|
@ -7496,6 +7498,7 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
|
||||||
m.historyManager.UpdateAlertLastSeen(alertID, alert.LastSeen)
|
m.historyManager.UpdateAlertLastSeen(alertID, alert.LastSeen)
|
||||||
}
|
}
|
||||||
delete(m.activeAlerts, alertID)
|
delete(m.activeAlerts, alertID)
|
||||||
|
delete(m.offlineRecoveryConfirmations, alertID)
|
||||||
// NOTE: Don't delete ackState here - preserve it so if the same alert
|
// NOTE: Don't delete ackState here - preserve it so if the same alert
|
||||||
// reappears (e.g., powered-off VM during backup), the acknowledgement
|
// reappears (e.g., powered-off VM during backup), the acknowledgement
|
||||||
// is restored via preserveAlertState. ackState is cleaned up in Cleanup().
|
// is restored via preserveAlertState. ackState is cleaned up in Cleanup().
|
||||||
|
|
@ -7506,6 +7509,22 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *Manager) confirmOfflineRecoveryNoLock(alertID string, required int) (int, bool) {
|
||||||
|
if required <= 1 {
|
||||||
|
delete(m.offlineRecoveryConfirmations, alertID)
|
||||||
|
return required, true
|
||||||
|
}
|
||||||
|
|
||||||
|
m.offlineRecoveryConfirmations[alertID]++
|
||||||
|
confirmations := m.offlineRecoveryConfirmations[alertID]
|
||||||
|
if confirmations < required {
|
||||||
|
return confirmations, false
|
||||||
|
}
|
||||||
|
|
||||||
|
delete(m.offlineRecoveryConfirmations, alertID)
|
||||||
|
return confirmations, true
|
||||||
|
}
|
||||||
|
|
||||||
// GetActiveAlerts returns all active alerts
|
// GetActiveAlerts returns all active alerts
|
||||||
func (m *Manager) GetActiveAlerts() []Alert {
|
func (m *Manager) GetActiveAlerts() []Alert {
|
||||||
m.mu.RLock()
|
m.mu.RLock()
|
||||||
|
|
@ -7628,6 +7647,7 @@ func (m *Manager) checkNodeOffline(node models.Node) {
|
||||||
defer m.mu.Unlock()
|
defer m.mu.Unlock()
|
||||||
|
|
||||||
// Check if node connectivity alerts are disabled
|
// Check if node connectivity alerts are disabled
|
||||||
|
delete(m.offlineRecoveryConfirmations, alertID)
|
||||||
if override, exists := m.config.Overrides[node.ID]; exists && override.DisableConnectivity {
|
if override, exists := m.config.Overrides[node.ID]; exists && override.DisableConnectivity {
|
||||||
// Node connectivity alerts are disabled, clear any existing alert and return
|
// Node connectivity alerts are disabled, clear any existing alert and return
|
||||||
if _, alertExists := m.activeAlerts[alertID]; alertExists {
|
if _, alertExists := m.activeAlerts[alertID]; alertExists {
|
||||||
|
|
@ -7734,6 +7754,18 @@ func (m *Manager) clearNodeOfflineAlert(node models.Node) {
|
||||||
// Check if offline alert exists
|
// Check if offline alert exists
|
||||||
alert, exists := m.activeAlerts[alertID]
|
alert, exists := m.activeAlerts[alertID]
|
||||||
if !exists {
|
if !exists {
|
||||||
|
delete(m.offlineRecoveryConfirmations, alertID)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
const requiredRecoveryCount = 3
|
||||||
|
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
|
||||||
|
if !confirmed {
|
||||||
|
log.Debug().
|
||||||
|
Str("node", node.Name).
|
||||||
|
Int("confirmations", recoveryCount).
|
||||||
|
Int("required", requiredRecoveryCount).
|
||||||
|
Msg("Node appears back online, waiting for recovery confirmation")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -7765,6 +7797,7 @@ func (m *Manager) checkPBSOffline(pbs models.PBSInstance) {
|
||||||
defer m.mu.Unlock()
|
defer m.mu.Unlock()
|
||||||
|
|
||||||
// Check if PBS offline alerts are disabled via disableConnectivity flag
|
// Check if PBS offline alerts are disabled via disableConnectivity flag
|
||||||
|
delete(m.offlineRecoveryConfirmations, alertID)
|
||||||
if override, exists := m.config.Overrides[pbs.ID]; exists && (override.Disabled || override.DisableConnectivity) {
|
if override, exists := m.config.Overrides[pbs.ID]; exists && (override.Disabled || override.DisableConnectivity) {
|
||||||
// PBS connectivity alerts are disabled, clear any existing alert and return
|
// PBS connectivity alerts are disabled, clear any existing alert and return
|
||||||
if _, alertExists := m.activeAlerts[alertID]; alertExists {
|
if _, alertExists := m.activeAlerts[alertID]; alertExists {
|
||||||
|
|
@ -7852,6 +7885,18 @@ func (m *Manager) clearPBSOfflineAlert(pbs models.PBSInstance) {
|
||||||
// Check if offline alert exists
|
// Check if offline alert exists
|
||||||
alert, exists := m.activeAlerts[alertID]
|
alert, exists := m.activeAlerts[alertID]
|
||||||
if !exists {
|
if !exists {
|
||||||
|
delete(m.offlineRecoveryConfirmations, alertID)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
const requiredRecoveryCount = 3
|
||||||
|
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
|
||||||
|
if !confirmed {
|
||||||
|
log.Debug().
|
||||||
|
Str("pbs", pbs.Name).
|
||||||
|
Int("confirmations", recoveryCount).
|
||||||
|
Int("required", requiredRecoveryCount).
|
||||||
|
Msg("PBS appears back online, waiting for recovery confirmation")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -7883,6 +7928,7 @@ func (m *Manager) checkPMGOffline(pmg models.PMGInstance) {
|
||||||
defer m.mu.Unlock()
|
defer m.mu.Unlock()
|
||||||
|
|
||||||
// Check if PMG offline alerts are disabled via disableConnectivity flag
|
// Check if PMG offline alerts are disabled via disableConnectivity flag
|
||||||
|
delete(m.offlineRecoveryConfirmations, alertID)
|
||||||
if override, exists := m.config.Overrides[pmg.ID]; exists && (override.Disabled || override.DisableConnectivity) {
|
if override, exists := m.config.Overrides[pmg.ID]; exists && (override.Disabled || override.DisableConnectivity) {
|
||||||
// PMG connectivity alerts are disabled, clear any existing alert and return
|
// PMG connectivity alerts are disabled, clear any existing alert and return
|
||||||
if _, alertExists := m.activeAlerts[alertID]; alertExists {
|
if _, alertExists := m.activeAlerts[alertID]; alertExists {
|
||||||
|
|
@ -7970,6 +8016,18 @@ func (m *Manager) clearPMGOfflineAlert(pmg models.PMGInstance) {
|
||||||
// Check if offline alert exists
|
// Check if offline alert exists
|
||||||
alert, exists := m.activeAlerts[alertID]
|
alert, exists := m.activeAlerts[alertID]
|
||||||
if !exists {
|
if !exists {
|
||||||
|
delete(m.offlineRecoveryConfirmations, alertID)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
const requiredRecoveryCount = 3
|
||||||
|
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
|
||||||
|
if !confirmed {
|
||||||
|
log.Debug().
|
||||||
|
Str("pmg", pmg.Name).
|
||||||
|
Int("confirmations", recoveryCount).
|
||||||
|
Int("required", requiredRecoveryCount).
|
||||||
|
Msg("PMG appears back online, waiting for recovery confirmation")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -9009,6 +9067,7 @@ func (m *Manager) checkStorageOffline(storage models.Storage) {
|
||||||
defer m.mu.Unlock()
|
defer m.mu.Unlock()
|
||||||
|
|
||||||
// Check if storage offline alerts are disabled
|
// Check if storage offline alerts are disabled
|
||||||
|
delete(m.offlineRecoveryConfirmations, alertID)
|
||||||
if override, exists, _ := findStorageOverride(m.config.Overrides, storage); exists && override.Disabled {
|
if override, exists, _ := findStorageOverride(m.config.Overrides, storage); exists && override.Disabled {
|
||||||
// Storage alerts are disabled, clear any existing alert and return
|
// Storage alerts are disabled, clear any existing alert and return
|
||||||
if _, alertExists := m.activeAlerts[alertID]; alertExists {
|
if _, alertExists := m.activeAlerts[alertID]; alertExists {
|
||||||
|
|
@ -9097,6 +9156,18 @@ func (m *Manager) clearStorageOfflineAlert(storage models.Storage) {
|
||||||
// Check if offline alert exists
|
// Check if offline alert exists
|
||||||
alert, exists := m.activeAlerts[alertID]
|
alert, exists := m.activeAlerts[alertID]
|
||||||
if !exists {
|
if !exists {
|
||||||
|
delete(m.offlineRecoveryConfirmations, alertID)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
const requiredRecoveryCount = 2
|
||||||
|
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
|
||||||
|
if !confirmed {
|
||||||
|
log.Debug().
|
||||||
|
Str("storage", storage.Name).
|
||||||
|
Int("confirmations", recoveryCount).
|
||||||
|
Int("required", requiredRecoveryCount).
|
||||||
|
Msg("Storage appears back online, waiting for recovery confirmation")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -10158,6 +10229,7 @@ func (m *Manager) ClearActiveAlerts() {
|
||||||
m.alertRateLimit = make(map[string][]time.Time)
|
m.alertRateLimit = make(map[string][]time.Time)
|
||||||
m.nodeOfflineCount = make(map[string]int)
|
m.nodeOfflineCount = make(map[string]int)
|
||||||
m.offlineConfirmations = make(map[string]int)
|
m.offlineConfirmations = make(map[string]int)
|
||||||
|
m.offlineRecoveryConfirmations = make(map[string]int)
|
||||||
m.dockerOfflineCount = make(map[string]int)
|
m.dockerOfflineCount = make(map[string]int)
|
||||||
m.dockerStateConfirm = make(map[string]int)
|
m.dockerStateConfirm = make(map[string]int)
|
||||||
m.dockerRestartTracking = make(map[string]*dockerRestartRecord)
|
m.dockerRestartTracking = make(map[string]*dockerRestartRecord)
|
||||||
|
|
@ -10275,6 +10347,14 @@ func (m *Manager) cleanupStaleMaps() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clean up recovery confirmation counts for alerts that are no longer active
|
||||||
|
for alertID := range m.offlineRecoveryConfirmations {
|
||||||
|
if _, exists := m.activeAlerts[alertID]; !exists {
|
||||||
|
delete(m.offlineRecoveryConfirmations, alertID)
|
||||||
|
cleaned++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Clean up node offline counts (legacy)
|
// Clean up node offline counts (legacy)
|
||||||
for nodeID := range m.nodeOfflineCount {
|
for nodeID := range m.nodeOfflineCount {
|
||||||
hasRelatedAlert := false
|
hasRelatedAlert := false
|
||||||
|
|
|
||||||
|
|
@ -5183,6 +5183,20 @@ func TestClearStorageOfflineAlert(t *testing.T) {
|
||||||
resolvedCh <- id
|
resolvedCh <- id
|
||||||
})
|
})
|
||||||
|
|
||||||
|
m.clearStorageOfflineAlert(storage)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, alertStillActive := m.activeAlerts[alertID]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !alertStillActive {
|
||||||
|
t.Fatal("expected alert to remain active until recovery is confirmed")
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-resolvedCh:
|
||||||
|
t.Fatal("expected no resolved callback before recovery is confirmed")
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
m.clearStorageOfflineAlert(storage)
|
m.clearStorageOfflineAlert(storage)
|
||||||
|
|
||||||
m.mu.RLock()
|
m.mu.RLock()
|
||||||
|
|
@ -8141,6 +8155,28 @@ func TestClearNodeOfflineAlert(t *testing.T) {
|
||||||
m.mu.Unlock()
|
m.mu.Unlock()
|
||||||
|
|
||||||
node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}
|
node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}
|
||||||
|
m.clearNodeOfflineAlert(node)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, alertStillActive := m.activeAlerts["node-offline-node1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !alertStillActive {
|
||||||
|
t.Fatal("expected alert to remain active until recovery is confirmed")
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-resolvedCh:
|
||||||
|
t.Fatal("expected no resolved callback before recovery is confirmed")
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
|
m.clearNodeOfflineAlert(node)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, alertStillActive = m.activeAlerts["node-offline-node1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !alertStillActive {
|
||||||
|
t.Fatal("expected alert to remain active until final recovery confirmation")
|
||||||
|
}
|
||||||
|
|
||||||
m.clearNodeOfflineAlert(node)
|
m.clearNodeOfflineAlert(node)
|
||||||
|
|
||||||
m.mu.RLock()
|
m.mu.RLock()
|
||||||
|
|
@ -8194,6 +8230,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
|
||||||
Type: "offline",
|
Type: "offline",
|
||||||
StartTime: time.Now().Add(-5 * time.Minute),
|
StartTime: time.Now().Add(-5 * time.Minute),
|
||||||
}
|
}
|
||||||
|
m.offlineRecoveryConfirmations["node-offline-node1"] = 2
|
||||||
m.mu.Unlock()
|
m.mu.Unlock()
|
||||||
},
|
},
|
||||||
clearFn: func(m *Manager) {
|
clearFn: func(m *Manager) {
|
||||||
|
|
@ -8209,6 +8246,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
|
||||||
Type: "offline",
|
Type: "offline",
|
||||||
StartTime: time.Now().Add(-5 * time.Minute),
|
StartTime: time.Now().Add(-5 * time.Minute),
|
||||||
}
|
}
|
||||||
|
m.offlineRecoveryConfirmations["pbs-offline-pbs1"] = 2
|
||||||
m.mu.Unlock()
|
m.mu.Unlock()
|
||||||
},
|
},
|
||||||
clearFn: func(m *Manager) {
|
clearFn: func(m *Manager) {
|
||||||
|
|
@ -8224,6 +8262,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
|
||||||
Type: "offline",
|
Type: "offline",
|
||||||
StartTime: time.Now().Add(-5 * time.Minute),
|
StartTime: time.Now().Add(-5 * time.Minute),
|
||||||
}
|
}
|
||||||
|
m.offlineRecoveryConfirmations["pmg-offline-pmg1"] = 2
|
||||||
m.mu.Unlock()
|
m.mu.Unlock()
|
||||||
},
|
},
|
||||||
clearFn: func(m *Manager) {
|
clearFn: func(m *Manager) {
|
||||||
|
|
@ -8239,6 +8278,7 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
|
||||||
Type: "offline",
|
Type: "offline",
|
||||||
StartTime: time.Now().Add(-5 * time.Minute),
|
StartTime: time.Now().Add(-5 * time.Minute),
|
||||||
}
|
}
|
||||||
|
m.offlineRecoveryConfirmations["storage-offline-stor1"] = 1
|
||||||
m.mu.Unlock()
|
m.mu.Unlock()
|
||||||
},
|
},
|
||||||
clearFn: func(m *Manager) {
|
clearFn: func(m *Manager) {
|
||||||
|
|
@ -8342,6 +8382,28 @@ func TestClearPBSOfflineAlert(t *testing.T) {
|
||||||
m.mu.Unlock()
|
m.mu.Unlock()
|
||||||
|
|
||||||
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "pbs.local"}
|
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "pbs.local"}
|
||||||
|
m.clearPBSOfflineAlert(pbs)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, alertStillActive := m.activeAlerts["pbs-offline-pbs1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !alertStillActive {
|
||||||
|
t.Fatal("expected alert to remain active until recovery is confirmed")
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-resolvedCh:
|
||||||
|
t.Fatal("expected no resolved callback before recovery is confirmed")
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
|
m.clearPBSOfflineAlert(pbs)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, alertStillActive = m.activeAlerts["pbs-offline-pbs1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !alertStillActive {
|
||||||
|
t.Fatal("expected alert to remain active until final recovery confirmation")
|
||||||
|
}
|
||||||
|
|
||||||
m.clearPBSOfflineAlert(pbs)
|
m.clearPBSOfflineAlert(pbs)
|
||||||
|
|
||||||
m.mu.RLock()
|
m.mu.RLock()
|
||||||
|
|
@ -8425,6 +8487,28 @@ func TestClearPMGOfflineAlert(t *testing.T) {
|
||||||
m.mu.Unlock()
|
m.mu.Unlock()
|
||||||
|
|
||||||
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "pmg.local"}
|
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "pmg.local"}
|
||||||
|
m.clearPMGOfflineAlert(pmg)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, alertStillActive := m.activeAlerts["pmg-offline-pmg1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !alertStillActive {
|
||||||
|
t.Fatal("expected alert to remain active until recovery is confirmed")
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-resolvedCh:
|
||||||
|
t.Fatal("expected no resolved callback before recovery is confirmed")
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
|
m.clearPMGOfflineAlert(pmg)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, alertStillActive = m.activeAlerts["pmg-offline-pmg1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !alertStillActive {
|
||||||
|
t.Fatal("expected alert to remain active until final recovery confirmation")
|
||||||
|
}
|
||||||
|
|
||||||
m.clearPMGOfflineAlert(pmg)
|
m.clearPMGOfflineAlert(pmg)
|
||||||
|
|
||||||
m.mu.RLock()
|
m.mu.RLock()
|
||||||
|
|
@ -13370,6 +13454,22 @@ func TestCheckNode(t *testing.T) {
|
||||||
Status: "online",
|
Status: "online",
|
||||||
ConnectionHealth: "connected",
|
ConnectionHealth: "connected",
|
||||||
}
|
}
|
||||||
|
m.CheckNode(node)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, alertStillActive := m.activeAlerts["node-offline-node1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !alertStillActive {
|
||||||
|
t.Fatal("expected offline alert to remain until recovery is confirmed")
|
||||||
|
}
|
||||||
|
|
||||||
|
m.CheckNode(node)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, alertStillActive = m.activeAlerts["node-offline-node1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !alertStillActive {
|
||||||
|
t.Fatal("expected offline alert to remain until final recovery confirmation")
|
||||||
|
}
|
||||||
|
|
||||||
m.CheckNode(node)
|
m.CheckNode(node)
|
||||||
|
|
||||||
m.mu.RLock()
|
m.mu.RLock()
|
||||||
|
|
@ -15429,6 +15529,22 @@ func TestCheckPBSComprehensive(t *testing.T) {
|
||||||
ConnectionHealth: "healthy",
|
ConnectionHealth: "healthy",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
m.CheckPBS(pbs)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, offlineStillActive := m.activeAlerts["pbs-offline-pbs1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !offlineStillActive {
|
||||||
|
t.Fatal("expected offline alert to remain until recovery is confirmed")
|
||||||
|
}
|
||||||
|
|
||||||
|
m.CheckPBS(pbs)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, offlineStillActive = m.activeAlerts["pbs-offline-pbs1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !offlineStillActive {
|
||||||
|
t.Fatal("expected offline alert to remain until final recovery confirmation")
|
||||||
|
}
|
||||||
|
|
||||||
m.CheckPBS(pbs)
|
m.CheckPBS(pbs)
|
||||||
|
|
||||||
m.mu.RLock()
|
m.mu.RLock()
|
||||||
|
|
@ -15443,6 +15559,89 @@ func TestCheckPBSComprehensive(t *testing.T) {
|
||||||
t.Error("expected offline confirmation to be cleared")
|
t.Error("expected offline confirmation to be cleared")
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
t.Run("transient healthy poll does not re-arm offline alert notifications", func(t *testing.T) {
|
||||||
|
m := newTestManager(t)
|
||||||
|
|
||||||
|
alertsCh := make(chan string, 2)
|
||||||
|
resolvedCh := make(chan string, 1)
|
||||||
|
|
||||||
|
m.mu.Lock()
|
||||||
|
m.config.ActivationState = ActivationActive
|
||||||
|
m.offlineConfirmations["pbs1"] = 2
|
||||||
|
m.mu.Unlock()
|
||||||
|
|
||||||
|
m.SetAlertCallback(func(alert *Alert) {
|
||||||
|
if alert != nil {
|
||||||
|
alertsCh <- alert.ID
|
||||||
|
}
|
||||||
|
})
|
||||||
|
m.SetResolvedCallback(func(alertID string) {
|
||||||
|
resolvedCh <- alertID
|
||||||
|
})
|
||||||
|
|
||||||
|
offlinePBS := models.PBSInstance{
|
||||||
|
ID: "pbs1",
|
||||||
|
Name: "testpbs",
|
||||||
|
Status: "offline",
|
||||||
|
}
|
||||||
|
onlinePBS := models.PBSInstance{
|
||||||
|
ID: "pbs1",
|
||||||
|
Name: "testpbs",
|
||||||
|
Status: "online",
|
||||||
|
ConnectionHealth: "healthy",
|
||||||
|
}
|
||||||
|
|
||||||
|
m.CheckPBS(offlinePBS)
|
||||||
|
select {
|
||||||
|
case alertID := <-alertsCh:
|
||||||
|
if alertID != "pbs-offline-pbs1" {
|
||||||
|
t.Fatalf("expected initial PBS offline notification, got %q", alertID)
|
||||||
|
}
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
t.Fatal("expected initial PBS offline notification")
|
||||||
|
}
|
||||||
|
|
||||||
|
m.CheckPBS(onlinePBS)
|
||||||
|
|
||||||
|
m.mu.RLock()
|
||||||
|
_, stillActive := m.activeAlerts["pbs-offline-pbs1"]
|
||||||
|
recoveryCount := m.offlineRecoveryConfirmations["pbs-offline-pbs1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
|
||||||
|
if !stillActive {
|
||||||
|
t.Fatal("expected transient healthy poll to keep the offline alert active")
|
||||||
|
}
|
||||||
|
if recoveryCount != 1 {
|
||||||
|
t.Fatalf("expected recovery confirmation count 1 after transient healthy poll, got %d", recoveryCount)
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case resolvedID := <-resolvedCh:
|
||||||
|
t.Fatalf("expected no recovery notification from a single healthy poll, got %q", resolvedID)
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
|
m.CheckPBS(offlinePBS)
|
||||||
|
|
||||||
|
m.mu.RLock()
|
||||||
|
_, stillActive = m.activeAlerts["pbs-offline-pbs1"]
|
||||||
|
_, recoveryTracked := m.offlineRecoveryConfirmations["pbs-offline-pbs1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
|
||||||
|
if !stillActive {
|
||||||
|
t.Fatal("expected offline alert to remain active after connectivity drops again")
|
||||||
|
}
|
||||||
|
if recoveryTracked {
|
||||||
|
t.Fatal("expected transient recovery tracking to reset once PBS is offline again")
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case alertID := <-alertsCh:
|
||||||
|
t.Fatalf("expected no duplicate offline notification while alert stays active, got %q", alertID)
|
||||||
|
case <-time.After(200 * time.Millisecond):
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCheckPMGComprehensive(t *testing.T) {
|
func TestCheckPMGComprehensive(t *testing.T) {
|
||||||
|
|
@ -15694,6 +15893,22 @@ func TestCheckPMGComprehensive(t *testing.T) {
|
||||||
ConnectionHealth: "healthy",
|
ConnectionHealth: "healthy",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
m.CheckPMG(pmg)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, offlineStillActive := m.activeAlerts["pmg-offline-pmg1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !offlineStillActive {
|
||||||
|
t.Fatal("expected offline alert to remain until recovery is confirmed")
|
||||||
|
}
|
||||||
|
|
||||||
|
m.CheckPMG(pmg)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, offlineStillActive = m.activeAlerts["pmg-offline-pmg1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !offlineStillActive {
|
||||||
|
t.Fatal("expected offline alert to remain until final recovery confirmation")
|
||||||
|
}
|
||||||
|
|
||||||
m.CheckPMG(pmg)
|
m.CheckPMG(pmg)
|
||||||
|
|
||||||
m.mu.RLock()
|
m.mu.RLock()
|
||||||
|
|
@ -16049,6 +16264,14 @@ func TestCheckStorageComprehensive(t *testing.T) {
|
||||||
Status: "active",
|
Status: "active",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
m.CheckStorage(storage)
|
||||||
|
m.mu.RLock()
|
||||||
|
_, offlineStillActive := m.activeAlerts["storage-offline-storage1"]
|
||||||
|
m.mu.RUnlock()
|
||||||
|
if !offlineStillActive {
|
||||||
|
t.Fatal("expected offline alert to remain until recovery is confirmed")
|
||||||
|
}
|
||||||
|
|
||||||
m.CheckStorage(storage)
|
m.CheckStorage(storage)
|
||||||
|
|
||||||
m.mu.RLock()
|
m.mu.RLock()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue