From 9b531c547d8501ef8a117d3719e26bfbee58e4cb Mon Sep 17 00:00:00 2001 From: rcourtman Date: Mon, 9 Mar 2026 11:15:26 +0000 Subject: [PATCH] Fix recovery notifications silently disabled by config PUT (#1332) Two fixes for missing recovery/resolved notifications: 1. API config PUT handler now preserves notifyOnResolve when the client omits it from the request body. Go decodes a missing bool as false, which silently disabled recovery notifications on older clients. 2. CancelAlert now always cleans up the cooldown record even when the alert has already left the pending buffer, preventing stale cooldown entries from suppressing future alert cycles. --- internal/api/alerts.go | 25 ++++++++++++++++++-- internal/notifications/notifications.go | 31 ++++++++++++++++--------- 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/internal/api/alerts.go b/internal/api/alerts.go index 55606c964..cc33613ca 100644 --- a/internal/api/alerts.go +++ b/internal/api/alerts.go @@ -3,6 +3,7 @@ package api import ( "context" "encoding/json" + "io" "net/http" "net/url" "strconv" @@ -142,12 +143,32 @@ func (h *AlertHandlers) UpdateAlertConfig(w http.ResponseWriter, r *http.Request // Limit request body to 64KB to prevent memory exhaustion r.Body = http.MaxBytesReader(w, r.Body, 64*1024) - var config alerts.AlertConfig - if err := json.NewDecoder(r.Body).Decode(&config); err != nil { + body, err := io.ReadAll(r.Body) + if err != nil { http.Error(w, err.Error(), http.StatusBadRequest) return } + var config alerts.AlertConfig + if err := json.Unmarshal(body, &config); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + // Preserve notifyOnResolve when the client omits it from the request body. + // Go decodes a missing bool as false, which would silently disable recovery + // notifications. Use a *bool probe to distinguish "missing" from "explicitly false". + // Refs: #1259, #1332 + var scheduleProbe struct { + Schedule struct { + NotifyOnResolve *bool `json:"notifyOnResolve"` + } `json:"schedule"` + } + if err := json.Unmarshal(body, &scheduleProbe); err == nil && scheduleProbe.Schedule.NotifyOnResolve == nil { + currentConfig := h.getMonitor(r.Context()).GetAlertManager().GetConfig() + config.Schedule.NotifyOnResolve = currentConfig.Schedule.NotifyOnResolve + } + // Migrate deprecated GroupingWindow to Grouping.Window if needed before applying. groupWindow := config.Schedule.Grouping.Window if groupWindow == 0 && config.Schedule.GroupingWindow != 0 { diff --git a/internal/notifications/notifications.go b/internal/notifications/notifications.go index 01bf5bf8e..c749e91da 100644 --- a/internal/notifications/notifications.go +++ b/internal/notifications/notifications.go @@ -871,11 +871,30 @@ func (n *NotificationManager) SendResolvedAlert(resolved *alerts.ResolvedAlert) } } -// CancelAlert removes pending notifications for a resolved alert +// CancelAlert removes pending notifications for a resolved alert and cleans +// up cooldown and queue state. Always cleans up cooldown and queued firing +// notifications even when the alert has already left the pending buffer +// (was already grouped and flushed to the queue). Refs: #1332 func (n *NotificationManager) CancelAlert(alertID string) { n.mu.Lock() defer n.mu.Unlock() + // Always clean up cooldown record for resolved alert, even if it + // already left the pending buffer (i.e. was already sent/grouped). + // Without this, a stale cooldown entry can suppress the next firing. + delete(n.lastNotified, alertID) + + // Always cancel queued firing notifications for this alert. If the + // alert was flushed from the pending buffer to the persistent queue + // but hasn't been sent yet, we must cancel it to avoid delivering a + // stale "firing" notification after the alert has already resolved. + if n.queue != nil { + if err := n.queue.CancelByAlertIDs([]string{alertID}); err != nil { + log.Error().Err(err).Str("alertID", alertID).Msg("Failed to cancel queued notifications") + } + } + + // Remove from the in-memory pending buffer if still there if len(n.pendingAlerts) == 0 { return } @@ -910,16 +929,6 @@ func (n *NotificationManager) CancelAlert(alertID string) { n.groupTimer = nil } - // Clean up cooldown record for resolved alert - delete(n.lastNotified, alertID) - - // Cancel any queued notifications containing this alert - if n.queue != nil { - if err := n.queue.CancelByAlertIDs([]string{alertID}); err != nil { - log.Error().Err(err).Str("alertID", alertID).Msg("Failed to cancel queued notifications") - } - } - log.Debug(). Str("alertID", alertID). Int("remaining", len(n.pendingAlerts)).