Fix recovery notifications silently disabled by config PUT (#1332)

Two fixes for missing recovery/resolved notifications:

1. API config PUT handler now preserves notifyOnResolve when the client
   omits it from the request body. Go decodes a missing bool as false,
   which silently disabled recovery notifications on older clients.

2. CancelAlert now always cleans up the cooldown record even when the
   alert has already left the pending buffer, preventing stale cooldown
   entries from suppressing future alert cycles.
This commit is contained in:
rcourtman 2026-03-09 11:15:26 +00:00
parent 572520ebc6
commit 9b531c547d
2 changed files with 43 additions and 13 deletions

View file

@ -3,6 +3,7 @@ package api
import (
"context"
"encoding/json"
"io"
"net/http"
"net/url"
"strconv"
@ -142,12 +143,32 @@ func (h *AlertHandlers) UpdateAlertConfig(w http.ResponseWriter, r *http.Request
// Limit request body to 64KB to prevent memory exhaustion
r.Body = http.MaxBytesReader(w, r.Body, 64*1024)
var config alerts.AlertConfig
if err := json.NewDecoder(r.Body).Decode(&config); err != nil {
body, err := io.ReadAll(r.Body)
if err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
var config alerts.AlertConfig
if err := json.Unmarshal(body, &config); err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
// Preserve notifyOnResolve when the client omits it from the request body.
// Go decodes a missing bool as false, which would silently disable recovery
// notifications. Use a *bool probe to distinguish "missing" from "explicitly false".
// Refs: #1259, #1332
var scheduleProbe struct {
Schedule struct {
NotifyOnResolve *bool `json:"notifyOnResolve"`
} `json:"schedule"`
}
if err := json.Unmarshal(body, &scheduleProbe); err == nil && scheduleProbe.Schedule.NotifyOnResolve == nil {
currentConfig := h.getMonitor(r.Context()).GetAlertManager().GetConfig()
config.Schedule.NotifyOnResolve = currentConfig.Schedule.NotifyOnResolve
}
// Migrate deprecated GroupingWindow to Grouping.Window if needed before applying.
groupWindow := config.Schedule.Grouping.Window
if groupWindow == 0 && config.Schedule.GroupingWindow != 0 {

View file

@ -871,11 +871,30 @@ func (n *NotificationManager) SendResolvedAlert(resolved *alerts.ResolvedAlert)
}
}
// CancelAlert removes pending notifications for a resolved alert
// CancelAlert removes pending notifications for a resolved alert and cleans
// up cooldown and queue state. Always cleans up cooldown and queued firing
// notifications even when the alert has already left the pending buffer
// (was already grouped and flushed to the queue). Refs: #1332
func (n *NotificationManager) CancelAlert(alertID string) {
n.mu.Lock()
defer n.mu.Unlock()
// Always clean up cooldown record for resolved alert, even if it
// already left the pending buffer (i.e. was already sent/grouped).
// Without this, a stale cooldown entry can suppress the next firing.
delete(n.lastNotified, alertID)
// Always cancel queued firing notifications for this alert. If the
// alert was flushed from the pending buffer to the persistent queue
// but hasn't been sent yet, we must cancel it to avoid delivering a
// stale "firing" notification after the alert has already resolved.
if n.queue != nil {
if err := n.queue.CancelByAlertIDs([]string{alertID}); err != nil {
log.Error().Err(err).Str("alertID", alertID).Msg("Failed to cancel queued notifications")
}
}
// Remove from the in-memory pending buffer if still there
if len(n.pendingAlerts) == 0 {
return
}
@ -910,16 +929,6 @@ func (n *NotificationManager) CancelAlert(alertID string) {
n.groupTimer = nil
}
// Clean up cooldown record for resolved alert
delete(n.lastNotified, alertID)
// Cancel any queued notifications containing this alert
if n.queue != nil {
if err := n.queue.CancelByAlertIDs([]string{alertID}); err != nil {
log.Error().Err(err).Str("alertID", alertID).Msg("Failed to cancel queued notifications")
}
}
log.Debug().
Str("alertID", alertID).
Int("remaining", len(n.pendingAlerts)).