mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-22 19:36:46 +00:00
Wire the alerts manager's new flapping-detected callback in the AI
intelligence initialization path. Two things happen each time a tracking
key first transitions into the flapping cooldown window:
1. A reliability-category finding is written directly to the findings
store via emitFlappingPostmortemFinding. Path B from the lane brief:
the finding is durable without depending on patrol synthesis, so
the operator sees the diagnosis the moment Pulse decides to
suppress. The finding ID is derived from the canonical tracking
key ("alert-flapping:<trackingKey>") so re-detection inside the
cooldown window folds into the existing record via the same-ID
branch of FindingsStore.Add -- one finding per flapping condition,
not one per dispatch.
2. A scoped FlappingPostmortemPatrolScope is enqueued on the trigger
manager so an actual patrol run can enrich the finding with deeper
context once it lands.
The finding body names the flapping threshold, window, and cooldown
the manager is currently configured with, plus an action hint
(widen threshold, raise cooldown, or stabilise the resource). That
turns the suppressed alert from silence into a closable item on the
FindingsPanel.
FindingCategoryReliability is reused; no new category, no parent/
child finding structure -- those are deferred per the lane brief.
108 lines
3.8 KiB
Go
108 lines
3.8 KiB
Go
package api
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/ai"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// emitFlappingPostmortemFinding writes a reliability finding to the findings
|
|
// store explaining what is flapping and why Pulse suppressed it. The finding
|
|
// ID is derived from the alert's canonical tracking key so re-detection
|
|
// inside the cooldown window updates the existing finding instead of
|
|
// creating a duplicate.
|
|
//
|
|
// This is Path B from the lane brief: emit the finding directly at callback
|
|
// time so it is durable without depending on patrol synthesis. The scoped
|
|
// patrol still runs in parallel to enrich context once it lands.
|
|
func emitFlappingPostmortemFinding(patrol *ai.PatrolService, alertManager *alerts.Manager, alert *alerts.Alert, trackingKey string) {
|
|
if patrol == nil || alert == nil || trackingKey == "" {
|
|
return
|
|
}
|
|
store := patrol.GetFindings()
|
|
if store == nil {
|
|
log.Debug().
|
|
Str("trackingKey", trackingKey).
|
|
Msg("Flapping postmortem: findings store unavailable, skipping finding emission")
|
|
return
|
|
}
|
|
|
|
cfg := alertManager.GetConfig()
|
|
now := time.Now()
|
|
|
|
resourceName := strings.TrimSpace(alert.ResourceName)
|
|
if resourceName == "" {
|
|
resourceName = strings.TrimSpace(alert.ResourceID)
|
|
}
|
|
if resourceName == "" {
|
|
resourceName = "resource"
|
|
}
|
|
alertType := strings.TrimSpace(alert.Type)
|
|
if alertType == "" {
|
|
alertType = "alert"
|
|
}
|
|
|
|
title := fmt.Sprintf("Alert %s on %s is flapping", alertType, resourceName)
|
|
description := fmt.Sprintf(
|
|
"Pulse detected this alert switching state %d or more times within %s and suppressed further notifications for %s. Suppression is in effect to avoid alarm-storm noise; the underlying condition has NOT been fixed.",
|
|
cfg.FlappingThreshold,
|
|
formatSecondsDuration(cfg.FlappingWindowSeconds),
|
|
formatMinutesDuration(cfg.FlappingCooldownMinutes),
|
|
)
|
|
recommendation := "Consider widening the threshold (so transient blips do not toggle the alert), raising the flapping cooldown, or stabilising the underlying resource. If the resource really is unstable, investigating why it crosses the threshold so often is the durable fix."
|
|
|
|
findingID := "alert-flapping:" + trackingKey
|
|
finding := &ai.Finding{
|
|
ID: findingID,
|
|
Key: findingID,
|
|
Severity: ai.FindingSeverityWarning,
|
|
Category: ai.FindingCategoryReliability,
|
|
ResourceID: alert.ResourceID,
|
|
ResourceName: resourceName,
|
|
ResourceType: alert.CanonicalKind,
|
|
Node: alert.Node,
|
|
Title: title,
|
|
Description: description,
|
|
Impact: "Alarm-storm flapping is being silenced. Real problems on this resource may be missed until you tune the alert or fix the instability.",
|
|
Recommendation: recommendation,
|
|
Evidence: fmt.Sprintf("trackingKey=%s alertType=%s threshold=%d windowSeconds=%d cooldownMinutes=%d", trackingKey, alertType, cfg.FlappingThreshold, cfg.FlappingWindowSeconds, cfg.FlappingCooldownMinutes),
|
|
Source: "alert-flapping",
|
|
DetectedAt: now,
|
|
LastSeenAt: now,
|
|
AlertIdentifier: alert.ID,
|
|
}
|
|
|
|
if store.Add(finding) {
|
|
log.Info().
|
|
Str("findingID", findingID).
|
|
Str("trackingKey", trackingKey).
|
|
Str("resourceID", alert.ResourceID).
|
|
Str("alertType", alertType).
|
|
Msg("Emitted flapping postmortem finding")
|
|
} else {
|
|
log.Debug().
|
|
Str("findingID", findingID).
|
|
Str("trackingKey", trackingKey).
|
|
Msg("Flapping postmortem finding deduped against existing record")
|
|
}
|
|
}
|
|
|
|
// formatSecondsDuration renders a whole number of seconds as a Go duration
// string (for example 300 becomes "5m0s"). Zero and negative inputs yield the
// generic phrase "the configured window" so the surrounding sentence still
// reads naturally when no window is set.
func formatSecondsDuration(seconds int) string {
	if seconds > 0 {
		window := time.Duration(seconds) * time.Second
		return window.String()
	}
	return "the configured window"
}
|
|
|
|
// formatMinutesDuration renders a whole number of minutes as a Go duration
// string (for example 15 becomes "15m0s"). Zero and negative inputs yield the
// generic phrase "the configured cooldown" so the surrounding sentence still
// reads naturally when no cooldown is set.
func formatMinutesDuration(minutes int) string {
	switch {
	case minutes <= 0:
		return "the configured cooldown"
	default:
		return (time.Duration(minutes) * time.Minute).String()
	}
}
|