Fix webhook alerts persisting when DisableAll* flags are enabled

The original fix in c6c0ac63e only handled per-resource overrides when
thresholds were disabled (trigger <= 0 or Disabled=true). It did not
handle global DisableAll* flags (DisableAllStorage, DisableAllNodes,
DisableAllGuests, etc.).

When a user toggled a DisableAll* flag from false to true:
- Check* functions returned early without processing
- Existing active alerts remained in m.activeAlerts map
- Those alerts continued generating webhook notifications
- reevaluateActiveAlertsLocked didn't check DisableAll* flags

This commit fixes the issue by:

1. Updating reevaluateActiveAlertsLocked to check all DisableAll* flags
   and resolve alerts for those resource types during config updates

2. Adding alert cleanup to Check* functions before early returns:
   - CheckStorage: clears usage and offline alerts
   - CheckNode: clears cpu/memory/disk/temperature and offline alerts
   - CheckPMG: clears queue/message alerts and offline alerts
   - CheckPBS: clears cpu/memory and offline alerts
   - CheckHost: calls existing cleanup helpers

3. Adding comprehensive test coverage for DisableAllStorage scenario

Related to #561
This commit is contained in:
rcourtman 2025-11-06 21:17:56 +00:00
parent 2c3768341a
commit 4891f06e76
2 changed files with 275 additions and 1 deletions

View file

@ -1355,6 +1355,23 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
}
}
// Check for PMG alerts by Type
if alert.Type == "queue-depth" || alert.Type == "queue-deferred" || alert.Type == "queue-hold" || alert.Type == "message-age" {
// This is a PMG alert
if m.config.DisableAllPMG {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
}
// Check for Host alerts by resourceType
if resourceTypeMeta == "host" {
if m.config.DisableAllHosts {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
}
if alert.Type == "docker-host-offline" ||
strings.HasPrefix(alertID, "docker-container-health-") ||
strings.HasPrefix(alertID, "docker-container-state-") ||
@ -1366,10 +1383,20 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
}
if resourceTypeMeta == "dockerhost" {
// Check if all Docker host alerts are disabled
if m.config.DisableAllDockerHosts {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
// No threshold evaluation for Docker hosts (connectivity handled separately)
continue
}
if resourceTypeMeta == "docker container" {
// Check if all Docker container alerts are disabled
if m.config.DisableAllDockerContainers {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
thresholds := m.config.GuestDefaults
if override, exists := m.config.Overrides[resourceID]; exists {
thresholds = m.applyThresholdOverride(thresholds, override)
@ -1381,6 +1408,11 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
// We need to check what kind of resource this is
if threshold == nil && (alert.Instance == "Node" || alert.Instance == alert.Node) {
// This is a node alert
// Check if all node alerts are disabled
if m.config.DisableAllNodes {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
thresholds := m.config.NodeDefaults
if override, exists := m.config.Overrides[resourceID]; exists {
thresholds = m.applyThresholdOverride(thresholds, override)
@ -1388,6 +1420,11 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
threshold = getThresholdForMetric(thresholds, metricType)
} else if threshold == nil && (alert.Instance == "Storage" || strings.Contains(alert.ResourceID, ":storage/")) {
// This is a storage alert
// Check if all storage alerts are disabled
if m.config.DisableAllStorage {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
if override, exists := m.config.Overrides[resourceID]; exists && override.Usage != nil {
threshold = override.Usage
} else {
@ -1395,6 +1432,11 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
}
} else if threshold == nil && alert.Instance == "PBS" {
// This is a PBS alert
// Check if all PBS alerts are disabled
if m.config.DisableAllPBS {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
thresholds := m.config.NodeDefaults
if override, exists := m.config.Overrides[resourceID]; exists {
if override.CPU != nil && metricType == "cpu" {
@ -1410,6 +1452,11 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
if threshold == nil {
// This is a guest (qemu/lxc) alert
// Check if all guest alerts are disabled
if m.config.DisableAllGuests {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
// We need to evaluate custom rules, but we don't have the guest object here.
// For now, we'll mark these alerts for re-evaluation by the monitor.
// The next poll cycle will properly evaluate them with custom rules.
@ -2096,6 +2143,32 @@ func (m *Manager) CheckNode(node models.Node) {
}
if m.config.DisableAllNodes {
m.mu.RUnlock()
// Clear any existing node alerts when all node alerts are disabled
m.mu.Lock()
// Clear offline tracking
delete(m.nodeOfflineCount, node.ID)
// Clear all possible node alert types
alertTypes := []string{"cpu", "memory", "disk", "temperature"}
for _, alertType := range alertTypes {
alertID := fmt.Sprintf("%s-%s", node.ID, alertType)
if _, exists := m.activeAlerts[alertID]; exists {
m.clearAlertNoLock(alertID)
log.Info().
Str("alertID", alertID).
Str("node", node.Name).
Msg("Cleared node alert - all node alerts disabled")
}
}
// Clear offline alert
offlineAlertID := fmt.Sprintf("node-offline-%s", node.ID)
if _, exists := m.activeAlerts[offlineAlertID]; exists {
m.clearAlertNoLock(offlineAlertID)
log.Info().
Str("alertID", offlineAlertID).
Str("node", node.Name).
Msg("Cleared offline alert - all node alerts disabled")
}
m.mu.Unlock()
return
}
disableNodesOffline := m.config.DisableAllNodesOffline
@ -2230,7 +2303,14 @@ func (m *Manager) CheckHost(host models.Host) {
override, hasOverride := m.config.Overrides[host.ID]
m.mu.RUnlock()
if !alertsEnabled || disableAllHosts {
if !alertsEnabled {
return
}
if disableAllHosts {
// Clear any existing host alerts when all host alerts are disabled
m.clearHostMetricAlerts(host.ID)
m.clearHostDiskAlerts(host.ID)
return
}
@ -2515,6 +2595,38 @@ func (m *Manager) CheckPBS(pbs models.PBSInstance) {
}
if m.config.DisableAllPBS {
m.mu.RUnlock()
// Clear any existing PBS alerts when all PBS alerts are disabled
m.mu.Lock()
// Reset offline confirmation tracking
delete(m.offlineConfirmations, pbs.ID)
// Clear CPU alert
cpuAlertID := fmt.Sprintf("%s-cpu", pbs.ID)
if _, exists := m.activeAlerts[cpuAlertID]; exists {
m.clearAlertNoLock(cpuAlertID)
log.Info().
Str("alertID", cpuAlertID).
Str("pbs", pbs.Name).
Msg("Cleared CPU alert - all PBS alerts disabled")
}
// Clear Memory alert
memAlertID := fmt.Sprintf("%s-memory", pbs.ID)
if _, exists := m.activeAlerts[memAlertID]; exists {
m.clearAlertNoLock(memAlertID)
log.Info().
Str("alertID", memAlertID).
Str("pbs", pbs.Name).
Msg("Cleared Memory alert - all PBS alerts disabled")
}
// Clear offline alert
offlineAlertID := fmt.Sprintf("pbs-offline-%s", pbs.ID)
if _, exists := m.activeAlerts[offlineAlertID]; exists {
m.clearAlertNoLock(offlineAlertID)
log.Info().
Str("alertID", offlineAlertID).
Str("pbs", pbs.Name).
Msg("Cleared offline alert - all PBS alerts disabled")
}
m.mu.Unlock()
return
}
@ -2607,6 +2719,32 @@ func (m *Manager) CheckPMG(pmg models.PMGInstance) {
}
if m.config.DisableAllPMG {
m.mu.RUnlock()
// Clear any existing PMG alerts when all PMG alerts are disabled
m.mu.Lock()
// Reset offline confirmation tracking
delete(m.offlineConfirmations, pmg.ID)
// Clear all possible PMG alert types
alertTypes := []string{"queue-total", "queue-deferred", "queue-hold", "oldest-message"}
for _, alertType := range alertTypes {
alertID := fmt.Sprintf("%s-%s", pmg.ID, alertType)
if _, exists := m.activeAlerts[alertID]; exists {
m.clearAlertNoLock(alertID)
log.Info().
Str("alertID", alertID).
Str("pmg", pmg.Name).
Msg("Cleared PMG alert - all PMG alerts disabled")
}
}
// Clear offline alert
offlineAlertID := fmt.Sprintf("pmg-offline-%s", pmg.ID)
if _, exists := m.activeAlerts[offlineAlertID]; exists {
m.clearAlertNoLock(offlineAlertID)
log.Info().
Str("alertID", offlineAlertID).
Str("pmg", pmg.Name).
Msg("Cleared offline alert - all PMG alerts disabled")
}
m.mu.Unlock()
return
}
@ -3767,6 +3905,25 @@ func (m *Manager) CheckStorage(storage models.Storage) {
}
if m.config.DisableAllStorage {
m.mu.RUnlock()
// Clear any existing storage alerts when all storage alerts are disabled
m.mu.Lock()
usageAlertID := fmt.Sprintf("%s-usage", storage.ID)
if _, exists := m.activeAlerts[usageAlertID]; exists {
m.clearAlertNoLock(usageAlertID)
log.Info().
Str("alertID", usageAlertID).
Str("storage", storage.Name).
Msg("Cleared usage alert - all storage alerts disabled")
}
offlineAlertID := fmt.Sprintf("storage-offline-%s", storage.ID)
if _, exists := m.activeAlerts[offlineAlertID]; exists {
m.clearAlertNoLock(offlineAlertID)
log.Info().
Str("alertID", offlineAlertID).
Str("storage", storage.Name).
Msg("Cleared offline alert - all storage alerts disabled")
}
m.mu.Unlock()
return
}

View file

@ -1655,3 +1655,120 @@ func TestCheckDiskHealthClearsExistingSamsung980Alerts(t *testing.T) {
t.Fatalf("expected existing Samsung 990 health alert to be cleared")
}
}
func TestDisableAllStorageClearsExistingAlerts(t *testing.T) {
m := NewManager()
storageID := "local-lvm"
// Start with configuration that allows storage alerts
initialConfig := AlertConfig{
Enabled: true,
DisableAllStorage: false,
StorageDefault: HysteresisThreshold{Trigger: 80, Clear: 75},
TimeThreshold: 0,
TimeThresholds: map[string]int{},
NodeDefaults: ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
Disk: &HysteresisThreshold{Trigger: 90, Clear: 85},
},
GuestDefaults: ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
},
Overrides: make(map[string]ThresholdConfig),
}
m.UpdateConfig(initialConfig)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.ActivationState = ActivationActive
m.mu.Unlock()
var dispatched []*Alert
done := make(chan struct{}, 1)
var resolved []string
resolvedDone := make(chan struct{}, 1)
m.SetAlertCallback(func(alert *Alert) {
dispatched = append(dispatched, alert)
select {
case done <- struct{}{}:
default:
}
})
m.SetResolvedCallback(func(alertID string) {
resolved = append(resolved, alertID)
select {
case resolvedDone <- struct{}{}:
default:
}
})
storage := models.Storage{
ID: storageID,
Name: "local-lvm",
Usage: 90.0,
Status: "available",
}
// Initial check should trigger an alert
m.CheckStorage(storage)
select {
case <-done:
case <-time.After(100 * time.Millisecond):
t.Fatalf("did not receive initial alert dispatch")
}
if len(dispatched) != 1 {
t.Fatalf("expected 1 alert before disabling storage, got %d", len(dispatched))
}
// Apply config with DisableAllStorage enabled
disabledConfig := initialConfig
disabledConfig.DisableAllStorage = true
m.UpdateConfig(disabledConfig)
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.ActivationState = ActivationActive
m.mu.Unlock()
// Clear dispatched slice to capture only post-disable notifications
dispatched = dispatched[:0]
done = make(chan struct{}, 1)
// Re-run CheckStorage with high usage; no alert should be dispatched
m.CheckStorage(storage)
select {
case <-done:
t.Fatalf("expected no alerts after disabling all storage, but callback fired")
case <-time.After(100 * time.Millisecond):
// No callback fired as expected
}
// Active alerts should be cleared by reevaluateActiveAlertsLocked
m.mu.RLock()
activeCount := len(m.activeAlerts)
m.mu.RUnlock()
if activeCount != 0 {
t.Fatalf("expected active alerts to be cleared after disabling all storage, got %d", activeCount)
}
// Resolved callback should have fired
select {
case <-resolvedDone:
case <-time.After(100 * time.Millisecond):
t.Fatalf("expected resolved callback to fire after disabling all storage")
}
expectedAlertID := fmt.Sprintf("%s-usage", storageID)
if len(resolved) != 1 || resolved[0] != expectedAlertID {
t.Fatalf("expected resolved callback for %s, got %v", expectedAlertID, resolved)
}
// Pending alert should be cleared
m.mu.RLock()
_, isPending := m.pendingAlerts[expectedAlertID]
m.mu.RUnlock()
if isPending {
t.Fatalf("expected pending alert entry to be cleared after disabling all storage")
}
}