mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-19 07:54:10 +00:00
fix: add confirmation requirement for node connectivity alerts
Requires 3 consecutive offline polls (~15 seconds) before triggering connectivity alerts, to prevent false positives from transient cluster communication issues.
This commit is contained in:
parent
fc1bd556b6
commit
3e17e290fe
1 changed file with 39 additions and 4 deletions
|
|
@ -188,6 +188,8 @@ type Manager struct {
|
|||
resolvedMutex sync.RWMutex
|
||||
// Time threshold tracking
|
||||
pendingAlerts map[string]time.Time // Track when thresholds were first exceeded
|
||||
// Node offline confirmation tracking
|
||||
nodeOfflineCount map[string]int // Track consecutive offline counts for nodes
|
||||
}
|
||||
|
||||
// NewManager creates a new alert manager
|
||||
|
|
@ -202,6 +204,7 @@ func NewManager() *Manager {
|
|||
suppressedUntil: make(map[string]time.Time),
|
||||
recentlyResolved: make(map[string]*ResolvedAlert),
|
||||
pendingAlerts: make(map[string]time.Time),
|
||||
nodeOfflineCount: make(map[string]int),
|
||||
config: AlertConfig{
|
||||
Enabled: true,
|
||||
GuestDefaults: ThresholdConfig{
|
||||
|
|
@ -834,7 +837,7 @@ func (m *Manager) ClearAlertHistory() error {
|
|||
return m.historyManager.ClearAllHistory()
|
||||
}
|
||||
|
||||
// checkNodeOffline creates an alert for offline nodes
|
||||
// checkNodeOffline creates an alert for offline nodes after confirmation
|
||||
func (m *Manager) checkNodeOffline(node models.Node) {
|
||||
alertID := fmt.Sprintf("node-offline-%s", node.ID)
|
||||
|
||||
|
|
@ -848,7 +851,29 @@ func (m *Manager) checkNodeOffline(node models.Node) {
|
|||
return
|
||||
}
|
||||
|
||||
// Create new offline alert
|
||||
// Increment offline count
|
||||
m.nodeOfflineCount[node.ID]++
|
||||
offlineCount := m.nodeOfflineCount[node.ID]
|
||||
|
||||
log.Debug().
|
||||
Str("node", node.Name).
|
||||
Str("instance", node.Instance).
|
||||
Int("offlineCount", offlineCount).
|
||||
Msg("Node offline detection count")
|
||||
|
||||
// Require 3 consecutive offline polls (~15 seconds) before alerting
|
||||
// This prevents false positives from transient cluster communication issues
|
||||
const requiredOfflineCount = 3
|
||||
if offlineCount < requiredOfflineCount {
|
||||
log.Info().
|
||||
Str("node", node.Name).
|
||||
Int("count", offlineCount).
|
||||
Int("required", requiredOfflineCount).
|
||||
Msg("Node appears offline, waiting for confirmation")
|
||||
return
|
||||
}
|
||||
|
||||
// Create new offline alert after confirmation
|
||||
alert := &Alert{
|
||||
ID: alertID,
|
||||
Type: "connectivity",
|
||||
|
|
@ -870,7 +895,7 @@ func (m *Manager) checkNodeOffline(node models.Node) {
|
|||
// Add to history
|
||||
m.historyManager.AddAlert(*alert)
|
||||
|
||||
// Send notification immediately for offline nodes
|
||||
// Send notification after confirmation
|
||||
if m.onAlert != nil {
|
||||
m.onAlert(alert)
|
||||
}
|
||||
|
|
@ -881,7 +906,8 @@ func (m *Manager) checkNodeOffline(node models.Node) {
|
|||
Str("instance", node.Instance).
|
||||
Str("status", node.Status).
|
||||
Str("connectionHealth", node.ConnectionHealth).
|
||||
Msg("CRITICAL: Node is offline")
|
||||
Int("confirmedAfter", requiredOfflineCount).
|
||||
Msg("CRITICAL: Node is offline (confirmed)")
|
||||
}
|
||||
|
||||
// clearNodeOfflineAlert removes offline alert when node comes back online
|
||||
|
|
@ -891,6 +917,15 @@ func (m *Manager) clearNodeOfflineAlert(node models.Node) {
|
|||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
// Reset offline count when node comes back online
|
||||
if m.nodeOfflineCount[node.ID] > 0 {
|
||||
log.Debug().
|
||||
Str("node", node.Name).
|
||||
Int("previousCount", m.nodeOfflineCount[node.ID]).
|
||||
Msg("Node back online, resetting offline count")
|
||||
delete(m.nodeOfflineCount, node.ID)
|
||||
}
|
||||
|
||||
// Check if offline alert exists
|
||||
alert, exists := m.activeAlerts[alertID]
|
||||
if !exists {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue