mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-19 16:27:37 +00:00
Fix webhook alerts persisting when DisableAll* flags are enabled
The original fix in c6c0ac63e only handled per-resource overrides when
thresholds were disabled (trigger <= 0 or Disabled=true). It did not
handle global DisableAll* flags (DisableAllStorage, DisableAllNodes,
DisableAllGuests, etc.).
When a user toggled a DisableAll* flag from false to true:
- Check* functions returned early without processing
- Existing active alerts remained in m.activeAlerts map
- Those alerts continued generating webhook notifications
- reevaluateActiveAlertsLocked didn't check DisableAll* flags
This commit fixes the issue by:
1. Updating reevaluateActiveAlertsLocked to check all DisableAll* flags
and resolve alerts for those resource types during config updates
2. Adding alert cleanup to Check* functions before early returns:
- CheckStorage: clears usage and offline alerts
- CheckNode: clears cpu/memory/disk/temperature and offline alerts
- CheckPMG: clears queue/message alerts and offline alerts
- CheckPBS: clears cpu/memory and offline alerts
- CheckHost: calls existing cleanup helpers
3. Adding comprehensive test coverage for DisableAllStorage scenario
Related to #561
This commit is contained in:
parent
2c3768341a
commit
4891f06e76
2 changed files with 275 additions and 1 deletions
|
|
@ -1355,6 +1355,23 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
|
|||
}
|
||||
}
|
||||
|
||||
// Check for PMG alerts by Type
|
||||
if alert.Type == "queue-depth" || alert.Type == "queue-deferred" || alert.Type == "queue-hold" || alert.Type == "message-age" {
|
||||
// This is a PMG alert
|
||||
if m.config.DisableAllPMG {
|
||||
alertsToResolve = append(alertsToResolve, alertID)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Check for Host alerts by resourceType
|
||||
if resourceTypeMeta == "host" {
|
||||
if m.config.DisableAllHosts {
|
||||
alertsToResolve = append(alertsToResolve, alertID)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if alert.Type == "docker-host-offline" ||
|
||||
strings.HasPrefix(alertID, "docker-container-health-") ||
|
||||
strings.HasPrefix(alertID, "docker-container-state-") ||
|
||||
|
|
@ -1366,10 +1383,20 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
|
|||
}
|
||||
|
||||
if resourceTypeMeta == "dockerhost" {
|
||||
// Check if all Docker host alerts are disabled
|
||||
if m.config.DisableAllDockerHosts {
|
||||
alertsToResolve = append(alertsToResolve, alertID)
|
||||
continue
|
||||
}
|
||||
// No threshold evaluation for Docker hosts (connectivity handled separately)
|
||||
continue
|
||||
}
|
||||
if resourceTypeMeta == "docker container" {
|
||||
// Check if all Docker container alerts are disabled
|
||||
if m.config.DisableAllDockerContainers {
|
||||
alertsToResolve = append(alertsToResolve, alertID)
|
||||
continue
|
||||
}
|
||||
thresholds := m.config.GuestDefaults
|
||||
if override, exists := m.config.Overrides[resourceID]; exists {
|
||||
thresholds = m.applyThresholdOverride(thresholds, override)
|
||||
|
|
@ -1381,6 +1408,11 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
|
|||
// We need to check what kind of resource this is
|
||||
if threshold == nil && (alert.Instance == "Node" || alert.Instance == alert.Node) {
|
||||
// This is a node alert
|
||||
// Check if all node alerts are disabled
|
||||
if m.config.DisableAllNodes {
|
||||
alertsToResolve = append(alertsToResolve, alertID)
|
||||
continue
|
||||
}
|
||||
thresholds := m.config.NodeDefaults
|
||||
if override, exists := m.config.Overrides[resourceID]; exists {
|
||||
thresholds = m.applyThresholdOverride(thresholds, override)
|
||||
|
|
@ -1388,6 +1420,11 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
|
|||
threshold = getThresholdForMetric(thresholds, metricType)
|
||||
} else if threshold == nil && (alert.Instance == "Storage" || strings.Contains(alert.ResourceID, ":storage/")) {
|
||||
// This is a storage alert
|
||||
// Check if all storage alerts are disabled
|
||||
if m.config.DisableAllStorage {
|
||||
alertsToResolve = append(alertsToResolve, alertID)
|
||||
continue
|
||||
}
|
||||
if override, exists := m.config.Overrides[resourceID]; exists && override.Usage != nil {
|
||||
threshold = override.Usage
|
||||
} else {
|
||||
|
|
@ -1395,6 +1432,11 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
|
|||
}
|
||||
} else if threshold == nil && alert.Instance == "PBS" {
|
||||
// This is a PBS alert
|
||||
// Check if all PBS alerts are disabled
|
||||
if m.config.DisableAllPBS {
|
||||
alertsToResolve = append(alertsToResolve, alertID)
|
||||
continue
|
||||
}
|
||||
thresholds := m.config.NodeDefaults
|
||||
if override, exists := m.config.Overrides[resourceID]; exists {
|
||||
if override.CPU != nil && metricType == "cpu" {
|
||||
|
|
@ -1410,6 +1452,11 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
|
|||
|
||||
if threshold == nil {
|
||||
// This is a guest (qemu/lxc) alert
|
||||
// Check if all guest alerts are disabled
|
||||
if m.config.DisableAllGuests {
|
||||
alertsToResolve = append(alertsToResolve, alertID)
|
||||
continue
|
||||
}
|
||||
// We need to evaluate custom rules, but we don't have the guest object here.
|
||||
// For now, we'll mark these alerts for re-evaluation by the monitor.
|
||||
// The next poll cycle will properly evaluate them with custom rules.
|
||||
|
|
@ -2096,6 +2143,32 @@ func (m *Manager) CheckNode(node models.Node) {
|
|||
}
|
||||
if m.config.DisableAllNodes {
|
||||
m.mu.RUnlock()
|
||||
// Clear any existing node alerts when all node alerts are disabled
|
||||
m.mu.Lock()
|
||||
// Clear offline tracking
|
||||
delete(m.nodeOfflineCount, node.ID)
|
||||
// Clear all possible node alert types
|
||||
alertTypes := []string{"cpu", "memory", "disk", "temperature"}
|
||||
for _, alertType := range alertTypes {
|
||||
alertID := fmt.Sprintf("%s-%s", node.ID, alertType)
|
||||
if _, exists := m.activeAlerts[alertID]; exists {
|
||||
m.clearAlertNoLock(alertID)
|
||||
log.Info().
|
||||
Str("alertID", alertID).
|
||||
Str("node", node.Name).
|
||||
Msg("Cleared node alert - all node alerts disabled")
|
||||
}
|
||||
}
|
||||
// Clear offline alert
|
||||
offlineAlertID := fmt.Sprintf("node-offline-%s", node.ID)
|
||||
if _, exists := m.activeAlerts[offlineAlertID]; exists {
|
||||
m.clearAlertNoLock(offlineAlertID)
|
||||
log.Info().
|
||||
Str("alertID", offlineAlertID).
|
||||
Str("node", node.Name).
|
||||
Msg("Cleared offline alert - all node alerts disabled")
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return
|
||||
}
|
||||
disableNodesOffline := m.config.DisableAllNodesOffline
|
||||
|
|
@ -2230,7 +2303,14 @@ func (m *Manager) CheckHost(host models.Host) {
|
|||
override, hasOverride := m.config.Overrides[host.ID]
|
||||
m.mu.RUnlock()
|
||||
|
||||
if !alertsEnabled || disableAllHosts {
|
||||
if !alertsEnabled {
|
||||
return
|
||||
}
|
||||
|
||||
if disableAllHosts {
|
||||
// Clear any existing host alerts when all host alerts are disabled
|
||||
m.clearHostMetricAlerts(host.ID)
|
||||
m.clearHostDiskAlerts(host.ID)
|
||||
return
|
||||
}
|
||||
|
||||
|
|
@ -2515,6 +2595,38 @@ func (m *Manager) CheckPBS(pbs models.PBSInstance) {
|
|||
}
|
||||
if m.config.DisableAllPBS {
|
||||
m.mu.RUnlock()
|
||||
// Clear any existing PBS alerts when all PBS alerts are disabled
|
||||
m.mu.Lock()
|
||||
// Reset offline confirmation tracking
|
||||
delete(m.offlineConfirmations, pbs.ID)
|
||||
// Clear CPU alert
|
||||
cpuAlertID := fmt.Sprintf("%s-cpu", pbs.ID)
|
||||
if _, exists := m.activeAlerts[cpuAlertID]; exists {
|
||||
m.clearAlertNoLock(cpuAlertID)
|
||||
log.Info().
|
||||
Str("alertID", cpuAlertID).
|
||||
Str("pbs", pbs.Name).
|
||||
Msg("Cleared CPU alert - all PBS alerts disabled")
|
||||
}
|
||||
// Clear Memory alert
|
||||
memAlertID := fmt.Sprintf("%s-memory", pbs.ID)
|
||||
if _, exists := m.activeAlerts[memAlertID]; exists {
|
||||
m.clearAlertNoLock(memAlertID)
|
||||
log.Info().
|
||||
Str("alertID", memAlertID).
|
||||
Str("pbs", pbs.Name).
|
||||
Msg("Cleared Memory alert - all PBS alerts disabled")
|
||||
}
|
||||
// Clear offline alert
|
||||
offlineAlertID := fmt.Sprintf("pbs-offline-%s", pbs.ID)
|
||||
if _, exists := m.activeAlerts[offlineAlertID]; exists {
|
||||
m.clearAlertNoLock(offlineAlertID)
|
||||
log.Info().
|
||||
Str("alertID", offlineAlertID).
|
||||
Str("pbs", pbs.Name).
|
||||
Msg("Cleared offline alert - all PBS alerts disabled")
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return
|
||||
}
|
||||
|
||||
|
|
@ -2607,6 +2719,32 @@ func (m *Manager) CheckPMG(pmg models.PMGInstance) {
|
|||
}
|
||||
if m.config.DisableAllPMG {
|
||||
m.mu.RUnlock()
|
||||
// Clear any existing PMG alerts when all PMG alerts are disabled
|
||||
m.mu.Lock()
|
||||
// Reset offline confirmation tracking
|
||||
delete(m.offlineConfirmations, pmg.ID)
|
||||
// Clear all possible PMG alert types
|
||||
alertTypes := []string{"queue-total", "queue-deferred", "queue-hold", "oldest-message"}
|
||||
for _, alertType := range alertTypes {
|
||||
alertID := fmt.Sprintf("%s-%s", pmg.ID, alertType)
|
||||
if _, exists := m.activeAlerts[alertID]; exists {
|
||||
m.clearAlertNoLock(alertID)
|
||||
log.Info().
|
||||
Str("alertID", alertID).
|
||||
Str("pmg", pmg.Name).
|
||||
Msg("Cleared PMG alert - all PMG alerts disabled")
|
||||
}
|
||||
}
|
||||
// Clear offline alert
|
||||
offlineAlertID := fmt.Sprintf("pmg-offline-%s", pmg.ID)
|
||||
if _, exists := m.activeAlerts[offlineAlertID]; exists {
|
||||
m.clearAlertNoLock(offlineAlertID)
|
||||
log.Info().
|
||||
Str("alertID", offlineAlertID).
|
||||
Str("pmg", pmg.Name).
|
||||
Msg("Cleared offline alert - all PMG alerts disabled")
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return
|
||||
}
|
||||
|
||||
|
|
@ -3767,6 +3905,25 @@ func (m *Manager) CheckStorage(storage models.Storage) {
|
|||
}
|
||||
if m.config.DisableAllStorage {
|
||||
m.mu.RUnlock()
|
||||
// Clear any existing storage alerts when all storage alerts are disabled
|
||||
m.mu.Lock()
|
||||
usageAlertID := fmt.Sprintf("%s-usage", storage.ID)
|
||||
if _, exists := m.activeAlerts[usageAlertID]; exists {
|
||||
m.clearAlertNoLock(usageAlertID)
|
||||
log.Info().
|
||||
Str("alertID", usageAlertID).
|
||||
Str("storage", storage.Name).
|
||||
Msg("Cleared usage alert - all storage alerts disabled")
|
||||
}
|
||||
offlineAlertID := fmt.Sprintf("storage-offline-%s", storage.ID)
|
||||
if _, exists := m.activeAlerts[offlineAlertID]; exists {
|
||||
m.clearAlertNoLock(offlineAlertID)
|
||||
log.Info().
|
||||
Str("alertID", offlineAlertID).
|
||||
Str("storage", storage.Name).
|
||||
Msg("Cleared offline alert - all storage alerts disabled")
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1655,3 +1655,120 @@ func TestCheckDiskHealthClearsExistingSamsung980Alerts(t *testing.T) {
|
|||
t.Fatalf("expected existing Samsung 990 health alert to be cleared")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDisableAllStorageClearsExistingAlerts(t *testing.T) {
|
||||
m := NewManager()
|
||||
|
||||
storageID := "local-lvm"
|
||||
|
||||
// Start with configuration that allows storage alerts
|
||||
initialConfig := AlertConfig{
|
||||
Enabled: true,
|
||||
DisableAllStorage: false,
|
||||
StorageDefault: HysteresisThreshold{Trigger: 80, Clear: 75},
|
||||
TimeThreshold: 0,
|
||||
TimeThresholds: map[string]int{},
|
||||
NodeDefaults: ThresholdConfig{
|
||||
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
|
||||
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
|
||||
Disk: &HysteresisThreshold{Trigger: 90, Clear: 85},
|
||||
},
|
||||
GuestDefaults: ThresholdConfig{
|
||||
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
|
||||
},
|
||||
Overrides: make(map[string]ThresholdConfig),
|
||||
}
|
||||
m.UpdateConfig(initialConfig)
|
||||
m.mu.Lock()
|
||||
m.config.TimeThreshold = 0
|
||||
m.config.TimeThresholds = map[string]int{}
|
||||
m.config.ActivationState = ActivationActive
|
||||
m.mu.Unlock()
|
||||
|
||||
var dispatched []*Alert
|
||||
done := make(chan struct{}, 1)
|
||||
var resolved []string
|
||||
resolvedDone := make(chan struct{}, 1)
|
||||
m.SetAlertCallback(func(alert *Alert) {
|
||||
dispatched = append(dispatched, alert)
|
||||
select {
|
||||
case done <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
})
|
||||
m.SetResolvedCallback(func(alertID string) {
|
||||
resolved = append(resolved, alertID)
|
||||
select {
|
||||
case resolvedDone <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
})
|
||||
|
||||
storage := models.Storage{
|
||||
ID: storageID,
|
||||
Name: "local-lvm",
|
||||
Usage: 90.0,
|
||||
Status: "available",
|
||||
}
|
||||
|
||||
// Initial check should trigger an alert
|
||||
m.CheckStorage(storage)
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(100 * time.Millisecond):
|
||||
t.Fatalf("did not receive initial alert dispatch")
|
||||
}
|
||||
if len(dispatched) != 1 {
|
||||
t.Fatalf("expected 1 alert before disabling storage, got %d", len(dispatched))
|
||||
}
|
||||
|
||||
// Apply config with DisableAllStorage enabled
|
||||
disabledConfig := initialConfig
|
||||
disabledConfig.DisableAllStorage = true
|
||||
m.UpdateConfig(disabledConfig)
|
||||
m.mu.Lock()
|
||||
m.config.TimeThreshold = 0
|
||||
m.config.TimeThresholds = map[string]int{}
|
||||
m.config.ActivationState = ActivationActive
|
||||
m.mu.Unlock()
|
||||
|
||||
// Clear dispatched slice to capture only post-disable notifications
|
||||
dispatched = dispatched[:0]
|
||||
done = make(chan struct{}, 1)
|
||||
|
||||
// Re-run CheckStorage with high usage; no alert should be dispatched
|
||||
m.CheckStorage(storage)
|
||||
select {
|
||||
case <-done:
|
||||
t.Fatalf("expected no alerts after disabling all storage, but callback fired")
|
||||
case <-time.After(100 * time.Millisecond):
|
||||
// No callback fired as expected
|
||||
}
|
||||
|
||||
// Active alerts should be cleared by reevaluateActiveAlertsLocked
|
||||
m.mu.RLock()
|
||||
activeCount := len(m.activeAlerts)
|
||||
m.mu.RUnlock()
|
||||
if activeCount != 0 {
|
||||
t.Fatalf("expected active alerts to be cleared after disabling all storage, got %d", activeCount)
|
||||
}
|
||||
|
||||
// Resolved callback should have fired
|
||||
select {
|
||||
case <-resolvedDone:
|
||||
case <-time.After(100 * time.Millisecond):
|
||||
t.Fatalf("expected resolved callback to fire after disabling all storage")
|
||||
}
|
||||
expectedAlertID := fmt.Sprintf("%s-usage", storageID)
|
||||
if len(resolved) != 1 || resolved[0] != expectedAlertID {
|
||||
t.Fatalf("expected resolved callback for %s, got %v", expectedAlertID, resolved)
|
||||
}
|
||||
|
||||
// Pending alert should be cleared
|
||||
m.mu.RLock()
|
||||
_, isPending := m.pendingAlerts[expectedAlertID]
|
||||
m.mu.RUnlock()
|
||||
if isPending {
|
||||
t.Fatalf("expected pending alert entry to be cleared after disabling all storage")
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue