mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-22 03:02:35 +00:00
Port v5 alert lifecycle and linked agent overrides
This commit is contained in:
parent
f94601e26f
commit
b73dab2b64
12 changed files with 1201 additions and 106 deletions
|
|
@ -82,6 +82,12 @@ runtime must migrate the active alert, history entry, acknowledgment record,
|
|||
suppression/rate-limit/flapping tracking, and guest per-disk metric identity
|
||||
to the current canonical state instead of reopening a duplicate alert or
|
||||
resolving only the stale node-scoped identity.
|
||||
That same guest-alert owner also has to retire per-disk guest alerts when the
|
||||
guest stops, disk alerting is disabled, or the reported disk set changes.
|
||||
Canonical guest disk identity is only valid while the guest still exposes that
|
||||
disk resource under the current thresholds, so runtime cleanup must remove
|
||||
stale `guestID-disk-*` state instead of leaving orphaned per-disk incidents in
|
||||
active alerts, resolved history, or later UI projections.
|
||||
That same alerts runtime also owns instance-scoped node display-name
|
||||
resolution. Raw node names are not globally unique across configured
|
||||
infrastructure instances, so cached node display names must key on instance +
|
||||
|
|
@ -211,6 +217,11 @@ keep schedules such as `00:00` to `23:59` active through the full final
|
|||
minute instead of expiring at `23:59:00`. Alert quiet-hours proofs should
|
||||
control time through the alert manager clock hook instead of depending on wall
|
||||
clock execution at whatever second the test runner happens to hit.
|
||||
Quiet-hours suppression also applies to alert delivery lifecycle, not only the
|
||||
initial raised notification. Resolved notifications must not fan out when the
|
||||
alert was never notified or was already acknowledged, and monitoring-driven
|
||||
escalation delivery must consult the same quiet-hours suppression path while
|
||||
still letting canonical escalation state reach websocket consumers.
|
||||
That schedule surface now also follows the same shell/runtime split as the
|
||||
other feature tabs: `frontend-modern/src/features/alerts/tabs/ScheduleTab.tsx`
|
||||
stays the render shell, while
|
||||
|
|
@ -231,6 +242,21 @@ presentation now also route through
|
|||
`frontend-modern/src/utils/alertIncidentPresentation.ts` instead of remaining
|
||||
duplicated inline across the alerts page and overview timeline surfaces.
|
||||
|
||||
Poll-driven connectivity recovery is also part of canonical alert truth.
|
||||
Resources that clear an offline alert from later healthy polls must require
|
||||
repeated healthy confirmations before resolving that alert instead of clearing
|
||||
on the first recovered sample; otherwise transient poll recovery reopens the
|
||||
same regression as false "back online" notifications and missing downtime
|
||||
signal. Nodes, PBS, and PMG require three healthy confirmations before
|
||||
resolution, while storage requires two.
|
||||
|
||||
Host-agent threshold ownership now follows the linked resource model.
|
||||
Explicit agent overrides still win, but when no host-agent override exists the
|
||||
alerts runtime must inherit linked node or guest overrides for that agent so
|
||||
metric and connectivity behavior match the logical machine the agent augments.
|
||||
Persisted host alerts must carry enough linked-resource metadata for
|
||||
reevaluation after threshold changes to honor that same inheritance rule.
|
||||
|
||||
Alert resource tables, grouped node headers, and alert override reconstruction
|
||||
now route resource-backed names through the shared policy-aware alerts helper
|
||||
so governed resources do not fall back to raw names when the thresholds editor
|
||||
|
|
|
|||
|
|
@ -95,6 +95,12 @@ This subsystem now sits under the dedicated core monitoring runtime lane so
|
|||
discovery, metrics-history correctness, and platform-specific runtime coverage
|
||||
can be governed as first-class product work instead of staying diluted inside
|
||||
architecture coherence.
|
||||
That same monitoring boundary also owns the escalation callback bridge into the
|
||||
alerts delivery layer. Monitor-owned escalation handling may still publish
|
||||
canonical escalation state to websocket consumers, but notification fan-out
|
||||
must defer quiet-hours and resolved-notification suppression policy to the
|
||||
alerts manager instead of bypassing that shared routing contract when monitor
|
||||
plumbs escalations outward.
|
||||
That same monitoring owner now also governs monitored-system usage readiness
|
||||
for commercial boundaries. A non-nil unified read-state is not sufficient when
|
||||
provider-owned supplemental inventories such as TrueNAS or VMware are still
|
||||
|
|
|
|||
|
|
@ -45,10 +45,12 @@ const (
|
|||
|
||||
// Cleanup intervals
|
||||
const (
|
||||
StaleTrackingThreshold = 24 * time.Hour
|
||||
RateLimitCleanupWindow = 1 * time.Hour
|
||||
alertsDirPerm = 0o700
|
||||
alertsFilePerm = 0o600
|
||||
StaleTrackingThreshold = 24 * time.Hour
|
||||
RateLimitCleanupWindow = 1 * time.Hour
|
||||
alertsDirPerm = 0o700
|
||||
alertsFilePerm = 0o600
|
||||
offlineRecoveryConfirmationsDefault = 3
|
||||
offlineRecoveryConfirmationsStorage = 2
|
||||
)
|
||||
|
||||
func normalizePoweredOffSeverity(level AlertLevel) AlertLevel {
|
||||
|
|
@ -555,13 +557,14 @@ type Manager struct {
|
|||
// Time threshold tracking
|
||||
pendingAlerts map[string]time.Time // Track when thresholds were first exceeded
|
||||
// Offline confirmation tracking
|
||||
nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy)
|
||||
offlineConfirmations map[string]int // Track consecutive offline counts for all resources
|
||||
dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts
|
||||
dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers
|
||||
dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
|
||||
dockerLastExitCode map[string]int // Track last exit code for OOM detection
|
||||
dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay
|
||||
nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy)
|
||||
offlineConfirmations map[string]int // Track consecutive offline counts for all resources
|
||||
offlineRecoveryConfirmations map[string]int // Track consecutive healthy confirmations before clearing poll-driven offline alerts
|
||||
dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts
|
||||
dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers
|
||||
dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
|
||||
dockerLastExitCode map[string]int // Track last exit code for OOM detection
|
||||
dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay
|
||||
// Stable identity tracking prevents update-delay resets when host IDs churn.
|
||||
dockerUpdateFirstSeenByIdentity map[string]time.Time
|
||||
// PMG quarantine growth tracking
|
||||
|
|
@ -642,6 +645,7 @@ func NewManagerWithDataDir(dataDir string) *Manager {
|
|||
pendingAlerts: make(map[string]time.Time),
|
||||
nodeOfflineCount: make(map[string]int),
|
||||
offlineConfirmations: make(map[string]int),
|
||||
offlineRecoveryConfirmations: make(map[string]int),
|
||||
dockerOfflineCount: make(map[string]int),
|
||||
dockerStateConfirm: make(map[string]int),
|
||||
dockerRestartTracking: make(map[string]*dockerRestartRecord),
|
||||
|
|
@ -2046,10 +2050,7 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
|
|||
alertsToResolve = append(alertsToResolve, alertID)
|
||||
continue
|
||||
}
|
||||
// Overrides are keyed by raw host ID (without the "agent:" prefix
|
||||
// that hostResourceID adds to the resource ID used in alert IDs).
|
||||
rawHostID := stripHostResourcePrefix(resourceID)
|
||||
thresholds := m.resolveResourceThresholds("agent", rawHostID)
|
||||
thresholds := m.resolveHostAlertThresholdsNoLock(alert, resourceID)
|
||||
if thresholds.Disabled {
|
||||
alertsToResolve = append(alertsToResolve, alertID)
|
||||
continue
|
||||
|
|
@ -2507,6 +2508,28 @@ func (m *Manager) shouldSuppressNotification(alert *Alert) (bool, string) {
|
|||
return false, ""
|
||||
}
|
||||
|
||||
// ShouldSuppressNotification checks if a notification should be suppressed
|
||||
// during quiet hours. This is used by paths that bypass dispatchAlert, such as escalation.
|
||||
func (m *Manager) ShouldSuppressNotification(alert *Alert) bool {
|
||||
if alert == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
m.mu.RLock()
|
||||
defer m.mu.RUnlock()
|
||||
|
||||
suppressed, reason := m.shouldSuppressNotification(alert)
|
||||
if suppressed {
|
||||
log.Debug().
|
||||
Str("alertID", alert.ID).
|
||||
Str("type", alert.Type).
|
||||
Str("level", string(alert.Level)).
|
||||
Str("quietHoursRule", reason).
|
||||
Msg("Notification suppressed during quiet hours")
|
||||
}
|
||||
return suppressed
|
||||
}
|
||||
|
||||
// ShouldSuppressResolvedNotification checks if a recovery notification should be suppressed
|
||||
// during quiet hours. Recovery notifications follow the same quiet hours rules as their
|
||||
// corresponding alerts - if the original alert would have been suppressed, so is the recovery.
|
||||
|
|
@ -2518,6 +2541,22 @@ func (m *Manager) ShouldSuppressResolvedNotification(alert *Alert) bool {
|
|||
m.mu.RLock()
|
||||
defer m.mu.RUnlock()
|
||||
|
||||
if alert.Acknowledged {
|
||||
log.Debug().
|
||||
Str("alertID", alert.ID).
|
||||
Str("type", alert.Type).
|
||||
Msg("Recovery notification suppressed for acknowledged alert")
|
||||
return true
|
||||
}
|
||||
|
||||
if alert.LastNotified == nil {
|
||||
log.Debug().
|
||||
Str("alertID", alert.ID).
|
||||
Str("type", alert.Type).
|
||||
Msg("Recovery notification suppressed because firing notification was never sent")
|
||||
return true
|
||||
}
|
||||
|
||||
suppressed, reason := m.shouldSuppressNotification(alert)
|
||||
if suppressed {
|
||||
log.Debug().
|
||||
|
|
@ -2698,23 +2737,7 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
|
|||
m.clearGuestPoweredOffAlert(guestID, name)
|
||||
}
|
||||
|
||||
// Clear all resource metric alerts (cpu, memory, disk, etc.) for non-running guests
|
||||
m.mu.Lock()
|
||||
alertsCleared := 0
|
||||
for storageKey, alert := range m.activeAlerts {
|
||||
alertID := effectiveAlertID(alert, storageKey)
|
||||
// Only clear resource metric alerts, not powered-off alerts
|
||||
if alert.ResourceID == guestID && alert.Type != "powered-off" {
|
||||
m.clearAlertNoLock(alertID)
|
||||
alertsCleared++
|
||||
log.Debug().
|
||||
Str("alertID", alertID).
|
||||
Str("guest", name).
|
||||
Str("status", status).
|
||||
Msg("Cleared metric alert for non-running guest")
|
||||
}
|
||||
}
|
||||
m.mu.Unlock()
|
||||
alertsCleared := m.clearGuestMetricAlerts(guestID)
|
||||
|
||||
if alertsCleared > 0 {
|
||||
log.Debug().
|
||||
|
|
@ -2722,6 +2745,7 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
|
|||
Str("status", status).
|
||||
Int("alertsCleared", alertsCleared).
|
||||
Msg("Cleared metric alerts for non-running guest")
|
||||
m.saveActiveAlertsAsync("guest-not-running")
|
||||
}
|
||||
return
|
||||
}
|
||||
|
|
@ -2744,18 +2768,13 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
|
|||
|
||||
// If alerts are disabled for this guest, clear any existing alerts and return
|
||||
if thresholds.Disabled {
|
||||
m.mu.Lock()
|
||||
for storageKey, alert := range m.activeAlerts {
|
||||
alertID := effectiveAlertID(alert, storageKey)
|
||||
if alert.ResourceID == guestID {
|
||||
m.clearAlertNoLock(alertID)
|
||||
log.Info().
|
||||
Str("alertID", alertID).
|
||||
Str("guest", name).
|
||||
Msg("Cleared alert - guest has alerts disabled")
|
||||
}
|
||||
if alertsCleared := m.clearGuestMetricAlerts(guestID); alertsCleared > 0 {
|
||||
log.Info().
|
||||
Str("guest", name).
|
||||
Int("alertsCleared", alertsCleared).
|
||||
Msg("Cleared guest metric alerts because alerts are disabled")
|
||||
m.saveActiveAlertsAsync("guest-alerts-disabled")
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return
|
||||
}
|
||||
|
||||
|
|
@ -2789,7 +2808,8 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
|
|||
}, thresholds, evalOpts)
|
||||
|
||||
if thresholds.Disk != nil && thresholds.Disk.Trigger > 0 && len(disks) > 0 {
|
||||
seenDisks := make(map[string]struct{})
|
||||
seenDiskKeys := make(map[string]struct{})
|
||||
seenDiskResources := make(map[string]struct{})
|
||||
for idx, disk := range disks {
|
||||
if disk.Total <= 0 {
|
||||
continue
|
||||
|
|
@ -2816,12 +2836,13 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
|
|||
}
|
||||
|
||||
// Avoid duplicate checks if two disks resolve to the same key
|
||||
if _, exists := seenDisks[sanitizedKey]; exists {
|
||||
if _, exists := seenDiskKeys[sanitizedKey]; exists {
|
||||
continue
|
||||
}
|
||||
seenDisks[sanitizedKey] = struct{}{}
|
||||
seenDiskKeys[sanitizedKey] = struct{}{}
|
||||
|
||||
perDiskResourceID := fmt.Sprintf("%s-disk-%s", guestID, sanitizedKey)
|
||||
seenDiskResources[perDiskResourceID] = struct{}{}
|
||||
message := fmt.Sprintf("%s disk (%s) at %.1f%%", guestType, label, disk.Usage)
|
||||
|
||||
log.Debug().
|
||||
|
|
@ -2867,6 +2888,11 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
|
|||
MonitorOnly: monitorOnly,
|
||||
})
|
||||
}
|
||||
if cleared := m.cleanupGuestDiskAlerts(guestID, seenDiskResources); cleared > 0 {
|
||||
m.saveActiveAlertsAsync("guest-disk-set-changed")
|
||||
}
|
||||
} else if cleared := m.cleanupGuestDiskAlerts(guestID, nil); cleared > 0 {
|
||||
m.saveActiveAlertsAsync("guest-disk-alerts-cleared")
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -3131,6 +3157,64 @@ func hostInstanceName(host models.Host) string {
|
|||
return "Agent"
|
||||
}
|
||||
|
||||
// resolveHostThresholdsNoLock resolves the effective thresholds for a host agent.
|
||||
// Explicit host-agent overrides win. Otherwise, linked node/guest overrides are
|
||||
// inherited so the host agent follows the logical resource it augments.
|
||||
// Callers must hold m.mu when reading config through this helper.
|
||||
func (m *Manager) resolveHostThresholdsNoLock(hostID, linkedNodeID, linkedVMID, linkedContainerID string) ThresholdConfig {
|
||||
base := m.defaultThresholdsForResourceType("agent")
|
||||
|
||||
if hostID = strings.TrimSpace(hostID); hostID != "" {
|
||||
if override, exists := m.config.Overrides[hostID]; exists {
|
||||
return m.applyThresholdOverride(base, override)
|
||||
}
|
||||
}
|
||||
|
||||
if linkedNodeID = strings.TrimSpace(linkedNodeID); linkedNodeID != "" {
|
||||
if override, exists := m.config.Overrides[linkedNodeID]; exists {
|
||||
return m.applyThresholdOverride(base, override)
|
||||
}
|
||||
}
|
||||
|
||||
if linkedVMID = strings.TrimSpace(linkedVMID); linkedVMID != "" {
|
||||
if override, exists := lookupGuestOverride(m.config.Overrides, nil, linkedVMID); exists {
|
||||
return m.applyThresholdOverride(base, override)
|
||||
}
|
||||
}
|
||||
|
||||
if linkedContainerID = strings.TrimSpace(linkedContainerID); linkedContainerID != "" {
|
||||
if override, exists := lookupGuestOverride(m.config.Overrides, nil, linkedContainerID); exists {
|
||||
return m.applyThresholdOverride(base, override)
|
||||
}
|
||||
}
|
||||
|
||||
return base
|
||||
}
|
||||
|
||||
// resolveHostAlertThresholdsNoLock resolves thresholds for persisted host-agent alerts.
|
||||
// Alert metadata carries the link context needed to inherit node/guest overrides.
|
||||
// Callers must hold m.mu when reading config through this helper.
|
||||
func (m *Manager) resolveHostAlertThresholdsNoLock(alert *Alert, resourceID string) ThresholdConfig {
|
||||
hostID := stripHostResourcePrefix(resourceID)
|
||||
if idx := strings.Index(hostID, "/"); idx >= 0 {
|
||||
hostID = hostID[:idx]
|
||||
}
|
||||
|
||||
linkedNodeID := ""
|
||||
linkedVMID := ""
|
||||
linkedContainerID := ""
|
||||
if alert != nil {
|
||||
if metadataHostID := metadataStringValue(alert.Metadata, "hostId"); metadataHostID != "" {
|
||||
hostID = metadataHostID
|
||||
}
|
||||
linkedNodeID = metadataStringValue(alert.Metadata, "linkedNodeId")
|
||||
linkedVMID = metadataStringValue(alert.Metadata, "linkedVmId")
|
||||
linkedContainerID = metadataStringValue(alert.Metadata, "linkedContainerId")
|
||||
}
|
||||
|
||||
return m.resolveHostThresholdsNoLock(hostID, linkedNodeID, linkedVMID, linkedContainerID)
|
||||
}
|
||||
|
||||
func sanitizeHostComponent(value string) string {
|
||||
value = strings.TrimSpace(strings.ToLower(value))
|
||||
if value == "" {
|
||||
|
|
@ -3227,7 +3311,7 @@ func (m *Manager) CheckHost(host models.Host) {
|
|||
m.mu.RLock()
|
||||
alertsEnabled := m.config.Enabled
|
||||
disableAllAgents := m.config.DisableAllAgents
|
||||
thresholds := m.resolveResourceThresholds("agent", host.ID)
|
||||
thresholds := m.resolveHostThresholdsNoLock(host.ID, host.LinkedNodeID, host.LinkedVMID, host.LinkedContainerID)
|
||||
m.mu.RUnlock()
|
||||
|
||||
if !alertsEnabled {
|
||||
|
|
@ -3267,6 +3351,15 @@ func (m *Manager) CheckHost(host models.Host) {
|
|||
"agentVersion": host.AgentVersion,
|
||||
"architecture": host.Architecture,
|
||||
}
|
||||
if linkedNodeID := strings.TrimSpace(host.LinkedNodeID); linkedNodeID != "" {
|
||||
baseMetadata["linkedNodeId"] = linkedNodeID
|
||||
}
|
||||
if linkedVMID := strings.TrimSpace(host.LinkedVMID); linkedVMID != "" {
|
||||
baseMetadata["linkedVmId"] = linkedVMID
|
||||
}
|
||||
if linkedContainerID := strings.TrimSpace(host.LinkedContainerID); linkedContainerID != "" {
|
||||
baseMetadata["linkedContainerId"] = linkedContainerID
|
||||
}
|
||||
if len(host.Tags) > 0 {
|
||||
baseMetadata["tags"] = append([]string(nil), host.Tags...)
|
||||
}
|
||||
|
|
@ -3590,7 +3683,7 @@ func (m *Manager) HandleHostOffline(host models.Host) {
|
|||
return
|
||||
}
|
||||
disableHostsOffline := m.config.DisableAllAgentsOffline
|
||||
thresholds := m.resolveResourceThresholds("agent", host.ID)
|
||||
thresholds := m.resolveHostThresholdsNoLock(host.ID, host.LinkedNodeID, host.LinkedVMID, host.LinkedContainerID)
|
||||
m.mu.RUnlock()
|
||||
|
||||
resourceKey := hostResourceID(host.ID)
|
||||
|
|
@ -3644,13 +3737,16 @@ func (m *Manager) HandleHostOffline(host models.Host) {
|
|||
Instance: instanceName,
|
||||
Message: fmt.Sprintf("Host '%s' is offline", resourceName),
|
||||
Metadata: map[string]interface{}{
|
||||
"resourceType": "agent",
|
||||
"hostId": host.ID,
|
||||
"hostname": host.Hostname,
|
||||
"displayName": host.DisplayName,
|
||||
"platform": host.Platform,
|
||||
"osName": host.OSName,
|
||||
"osVersion": host.OSVersion,
|
||||
"resourceType": "agent",
|
||||
"hostId": host.ID,
|
||||
"hostname": host.Hostname,
|
||||
"displayName": host.DisplayName,
|
||||
"platform": host.Platform,
|
||||
"osName": host.OSName,
|
||||
"osVersion": host.OSVersion,
|
||||
"linkedNodeId": strings.TrimSpace(host.LinkedNodeID),
|
||||
"linkedVmId": strings.TrimSpace(host.LinkedVMID),
|
||||
"linkedContainerId": strings.TrimSpace(host.LinkedContainerID),
|
||||
},
|
||||
AddToRecent: true,
|
||||
AddToHistory: true,
|
||||
|
|
@ -3760,6 +3856,72 @@ func (m *Manager) clearHostDiskAlerts(hostID string) {
|
|||
}
|
||||
}
|
||||
|
||||
func (m *Manager) clearGuestMetricAlerts(guestID string, metrics ...string) int {
|
||||
if guestID == "" {
|
||||
return 0
|
||||
}
|
||||
|
||||
allowedMetrics := make(map[string]struct{}, len(metrics))
|
||||
for _, metric := range metrics {
|
||||
metric = strings.TrimSpace(metric)
|
||||
if metric == "" {
|
||||
continue
|
||||
}
|
||||
allowedMetrics[metric] = struct{}{}
|
||||
}
|
||||
|
||||
perDiskPrefix := fmt.Sprintf("%s-disk-", guestID)
|
||||
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
cleared := 0
|
||||
for storageKey, alert := range m.activeAlerts {
|
||||
if alert == nil || alert.Type == "powered-off" {
|
||||
continue
|
||||
}
|
||||
if alert.ResourceID != guestID && !strings.HasPrefix(alert.ResourceID, perDiskPrefix) {
|
||||
continue
|
||||
}
|
||||
if len(allowedMetrics) > 0 {
|
||||
if _, ok := allowedMetrics[alert.Type]; !ok {
|
||||
continue
|
||||
}
|
||||
}
|
||||
m.clearAlertNoLock(storageKey)
|
||||
cleared++
|
||||
}
|
||||
|
||||
return cleared
|
||||
}
|
||||
|
||||
func (m *Manager) cleanupGuestDiskAlerts(guestID string, seen map[string]struct{}) int {
|
||||
if guestID == "" {
|
||||
return 0
|
||||
}
|
||||
|
||||
prefix := fmt.Sprintf("%s-disk-", guestID)
|
||||
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
cleared := 0
|
||||
for storageKey, alert := range m.activeAlerts {
|
||||
if alert == nil || !strings.HasPrefix(alert.ResourceID, prefix) {
|
||||
continue
|
||||
}
|
||||
if seen != nil {
|
||||
if _, exists := seen[alert.ResourceID]; exists {
|
||||
continue
|
||||
}
|
||||
}
|
||||
m.clearAlertNoLock(storageKey)
|
||||
cleared++
|
||||
}
|
||||
|
||||
return cleared
|
||||
}
|
||||
|
||||
func (m *Manager) cleanupHostDiskAlerts(host models.Host, seen map[string]struct{}) {
|
||||
if host.ID == "" {
|
||||
return
|
||||
|
|
@ -5502,9 +5664,19 @@ func (m *Manager) clearDockerContainerUpdateTracking(resourceID, trackingKey str
|
|||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
func (m *Manager) touchDockerContainerUpdateAlert(alertID string) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
if alert, exists := m.getActiveAlertNoLock(alertID); exists && alert != nil {
|
||||
alert.LastSeen = time.Now()
|
||||
}
|
||||
}
|
||||
|
||||
// checkDockerContainerImageUpdate checks if an image update has been pending for too long
|
||||
func (m *Manager) checkDockerContainerImageUpdate(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
|
||||
alertID := fmt.Sprintf("docker-container-update-%s", resourceID)
|
||||
canonicalAlertID := buildCanonicalStateID(resourceID, resourceID+"-image-update")
|
||||
updateTrackingKey := dockerUpdateTrackingKey(host, container)
|
||||
|
||||
// Check if update detection is enabled
|
||||
|
|
@ -5514,30 +5686,30 @@ func (m *Manager) checkDockerContainerImageUpdate(host models.DockerHost, contai
|
|||
|
||||
// Negative value means disabled
|
||||
if delayHours < 0 {
|
||||
m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update"))
|
||||
m.clearAlert(canonicalAlertID)
|
||||
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
|
||||
return
|
||||
}
|
||||
|
||||
// Check if this container has an update status reported
|
||||
if container.UpdateStatus == nil {
|
||||
// No update status - clear any tracking and alerts
|
||||
m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update"))
|
||||
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
|
||||
// Missing update status means the condition is unknown, not resolved.
|
||||
// Preserve any active alert and first-seen tracking until we see an affirmative clear.
|
||||
m.touchDockerContainerUpdateAlert(canonicalAlertID)
|
||||
return
|
||||
}
|
||||
|
||||
// Check for errors in update detection (don't alert on errors)
|
||||
if container.UpdateStatus.Error != "" {
|
||||
// Update check failed - clear alert but keep tracking
|
||||
m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update"))
|
||||
// A failed update check cannot confirm the pending update has been resolved.
|
||||
m.touchDockerContainerUpdateAlert(canonicalAlertID)
|
||||
return
|
||||
}
|
||||
|
||||
// Check if an update is available
|
||||
if !container.UpdateStatus.UpdateAvailable {
|
||||
// No update available - clear tracking and alert
|
||||
m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update"))
|
||||
m.clearAlert(canonicalAlertID)
|
||||
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
|
||||
return
|
||||
}
|
||||
|
|
@ -7495,8 +7667,10 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
|
|||
m.unregisterActiveAlertAliasNoLock(key, alert)
|
||||
}
|
||||
if exists {
|
||||
delete(m.offlineRecoveryConfirmations, key)
|
||||
delete(m.activeAlerts, key)
|
||||
}
|
||||
delete(m.offlineRecoveryConfirmations, alertID)
|
||||
// NOTE: Don't delete ackState here - preserve it so if the same alert
|
||||
// reappears (e.g., powered-off VM during backup), the acknowledgement
|
||||
// is restored via preserveAlertState. ackState is cleaned up in Cleanup().
|
||||
|
|
@ -7506,6 +7680,27 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
|
|||
}
|
||||
}
|
||||
|
||||
func (m *Manager) confirmOfflineRecoveryNoLock(alertID string, required int) (int, bool) {
|
||||
alertID = strings.TrimSpace(alertID)
|
||||
if alertID == "" {
|
||||
return 0, false
|
||||
}
|
||||
|
||||
if required <= 1 {
|
||||
delete(m.offlineRecoveryConfirmations, alertID)
|
||||
return required, true
|
||||
}
|
||||
|
||||
m.offlineRecoveryConfirmations[alertID]++
|
||||
confirmations := m.offlineRecoveryConfirmations[alertID]
|
||||
if confirmations < required {
|
||||
return confirmations, false
|
||||
}
|
||||
|
||||
delete(m.offlineRecoveryConfirmations, alertID)
|
||||
return confirmations, true
|
||||
}
|
||||
|
||||
// GetActiveAlerts returns all active alerts
|
||||
func (m *Manager) GetActiveAlerts() []Alert {
|
||||
if m == nil {
|
||||
|
|
@ -7817,8 +8012,9 @@ func (m *Manager) OnAlertHistory(cb AlertCallback) {
|
|||
}
|
||||
}
|
||||
|
||||
// clearResourceOfflineAlert removes an offline alert when a resource comes back online.
|
||||
func (m *Manager) clearResourceOfflineAlert(alertPrefix, resourceID, resourceName, host, resourceKind string) {
|
||||
// clearResourceOfflineAlert removes an offline alert when a poll-driven resource
|
||||
// stays healthy for enough consecutive polls to confirm recovery.
|
||||
func (m *Manager) clearResourceOfflineAlert(resourceID, resourceName, host, resourceKind string, requiredRecoveryCount int) {
|
||||
alertID := canonicalConnectivityStateID(resourceID)
|
||||
|
||||
m.mu.Lock()
|
||||
|
|
@ -7836,6 +8032,17 @@ func (m *Manager) clearResourceOfflineAlert(alertPrefix, resourceID, resourceNam
|
|||
// Check if offline alert exists
|
||||
alert, exists := m.getActiveAlertNoLock(alertID)
|
||||
if !exists {
|
||||
delete(m.offlineRecoveryConfirmations, alertID)
|
||||
return
|
||||
}
|
||||
|
||||
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
|
||||
if !confirmed {
|
||||
log.Debug().
|
||||
Str(strings.ToLower(resourceKind), resourceName).
|
||||
Int("confirmations", recoveryCount).
|
||||
Int("required", requiredRecoveryCount).
|
||||
Msg(resourceKind + " appears back online, waiting for recovery confirmation")
|
||||
return
|
||||
}
|
||||
|
||||
|
|
@ -7864,6 +8071,10 @@ func (m *Manager) clearResourceOfflineAlert(alertPrefix, resourceID, resourceNam
|
|||
func (m *Manager) checkNodeOffline(node models.Node) {
|
||||
alertID := fmt.Sprintf("node-offline-%s", node.ID)
|
||||
|
||||
m.mu.Lock()
|
||||
delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(node.ID))
|
||||
m.mu.Unlock()
|
||||
|
||||
thresholds := m.resolveResourceThresholds("node", node.ID)
|
||||
spec, err := buildCanonicalConnectivitySpec(node.ID, node.Name, unifiedresources.ResourceType("node"), AlertLevelCritical, 3, thresholds.Disabled || thresholds.DisableConnectivity)
|
||||
if err != nil {
|
||||
|
|
@ -7918,6 +8129,17 @@ func (m *Manager) clearNodeOfflineAlert(node models.Node) {
|
|||
// Check if offline alert exists
|
||||
alert, exists := m.getActiveAlertNoLock(alertID)
|
||||
if !exists {
|
||||
delete(m.offlineRecoveryConfirmations, alertID)
|
||||
return
|
||||
}
|
||||
|
||||
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, offlineRecoveryConfirmationsDefault)
|
||||
if !confirmed {
|
||||
log.Debug().
|
||||
Str("node", node.Name).
|
||||
Int("confirmations", recoveryCount).
|
||||
Int("required", offlineRecoveryConfirmationsDefault).
|
||||
Msg("Node appears back online, waiting for recovery confirmation")
|
||||
return
|
||||
}
|
||||
|
||||
|
|
@ -7944,6 +8166,10 @@ func (m *Manager) clearNodeOfflineAlert(node models.Node) {
|
|||
|
||||
// checkPBSOffline creates an alert for offline PBS instances
|
||||
func (m *Manager) checkPBSOffline(pbs models.PBSInstance) {
|
||||
m.mu.Lock()
|
||||
delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(pbs.ID))
|
||||
m.mu.Unlock()
|
||||
|
||||
thresholds := m.resolveResourceThresholds("pbs", pbs.ID)
|
||||
spec, err := buildCanonicalConnectivitySpec(pbs.ID, pbs.Name, unifiedresources.ResourceTypePBS, AlertLevelCritical, 3, thresholds.Disabled || thresholds.DisableConnectivity)
|
||||
if err != nil {
|
||||
|
|
@ -7979,11 +8205,15 @@ func (m *Manager) checkPBSOffline(pbs models.PBSInstance) {
|
|||
|
||||
// clearPBSOfflineAlert removes offline alert when PBS comes back online
|
||||
func (m *Manager) clearPBSOfflineAlert(pbs models.PBSInstance) {
|
||||
m.clearResourceOfflineAlert("pbs-offline", pbs.ID, pbs.Name, pbs.Host, "PBS")
|
||||
m.clearResourceOfflineAlert(pbs.ID, pbs.Name, pbs.Host, "PBS", offlineRecoveryConfirmationsDefault)
|
||||
}
|
||||
|
||||
// checkPMGOffline creates an alert for offline PMG instances
|
||||
func (m *Manager) checkPMGOffline(pmg models.PMGInstance) {
|
||||
m.mu.Lock()
|
||||
delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(pmg.ID))
|
||||
m.mu.Unlock()
|
||||
|
||||
m.mu.RLock()
|
||||
override, hasOverride := m.config.Overrides[pmg.ID]
|
||||
m.mu.RUnlock()
|
||||
|
|
@ -8022,7 +8252,7 @@ func (m *Manager) checkPMGOffline(pmg models.PMGInstance) {
|
|||
|
||||
// clearPMGOfflineAlert removes offline alert when PMG comes back online
|
||||
func (m *Manager) clearPMGOfflineAlert(pmg models.PMGInstance) {
|
||||
m.clearResourceOfflineAlert("pmg-offline", pmg.ID, pmg.Name, pmg.Host, "PMG")
|
||||
m.clearResourceOfflineAlert(pmg.ID, pmg.Name, pmg.Host, "PMG", offlineRecoveryConfirmationsDefault)
|
||||
}
|
||||
|
||||
// checkPMGQueueDepths checks PMG mail queue depths and creates alerts
|
||||
|
|
@ -8936,6 +9166,10 @@ func (m *Manager) checkAnomalyMetric(pmg models.PMGInstance, tracker *pmgAnomaly
|
|||
func (m *Manager) checkStorageOffline(storage models.Storage) {
|
||||
alertID := fmt.Sprintf("storage-offline-%s", storage.ID)
|
||||
|
||||
m.mu.Lock()
|
||||
delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(storage.ID))
|
||||
m.mu.Unlock()
|
||||
|
||||
thresholds := m.resolveResourceThresholds("storage", storage.ID)
|
||||
spec, err := buildCanonicalConnectivitySpec(storage.ID, storage.Name, unifiedresources.ResourceTypeStorage, AlertLevelWarning, 2, thresholds.Disabled || thresholds.DisableConnectivity)
|
||||
if err != nil {
|
||||
|
|
@ -8970,7 +9204,7 @@ func (m *Manager) checkStorageOffline(storage models.Storage) {
|
|||
|
||||
// clearStorageOfflineAlert removes offline alert when storage comes back online
|
||||
func (m *Manager) clearStorageOfflineAlert(storage models.Storage) {
|
||||
m.clearResourceOfflineAlert("storage-offline", storage.ID, storage.Name, storage.Node, "Storage")
|
||||
m.clearResourceOfflineAlert(storage.ID, storage.Name, storage.Node, "Storage", offlineRecoveryConfirmationsStorage)
|
||||
}
|
||||
|
||||
// checkGuestPoweredOff creates an alert for powered-off guests
|
||||
|
|
@ -10095,6 +10329,7 @@ func (m *Manager) ClearActiveAlerts() {
|
|||
m.alertRateLimit = make(map[string][]time.Time)
|
||||
m.nodeOfflineCount = make(map[string]int)
|
||||
m.offlineConfirmations = make(map[string]int)
|
||||
m.offlineRecoveryConfirmations = make(map[string]int)
|
||||
m.dockerOfflineCount = make(map[string]int)
|
||||
m.dockerStateConfirm = make(map[string]int)
|
||||
m.dockerRestartTracking = make(map[string]*dockerRestartRecord)
|
||||
|
|
@ -10215,6 +10450,13 @@ func (m *Manager) cleanupStaleMaps() {
|
|||
}
|
||||
}
|
||||
|
||||
for alertID := range m.offlineRecoveryConfirmations {
|
||||
if !m.hasActiveAlertNoLock(alertID) {
|
||||
delete(m.offlineRecoveryConfirmations, alertID)
|
||||
cleaned++
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up node offline counts (legacy)
|
||||
for nodeID := range m.nodeOfflineCount {
|
||||
hasRelatedAlert := false
|
||||
|
|
|
|||
|
|
@ -5097,6 +5097,18 @@ func TestClearStorageOfflineAlert(t *testing.T) {
|
|||
resolvedCh <- id
|
||||
})
|
||||
|
||||
m.clearStorageOfflineAlert(storage)
|
||||
m.mu.RLock()
|
||||
_, existsAfterFirst := testLookupActiveAlert(t, m, alertID)
|
||||
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[canonicalConnectivityStateID(storage.ID)]
|
||||
m.mu.RUnlock()
|
||||
if !existsAfterFirst {
|
||||
t.Fatal("expected storage alert to remain active until recovery confirmations are satisfied")
|
||||
}
|
||||
if recoveryCountAfterFirst != 1 {
|
||||
t.Fatalf("expected first storage recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
|
||||
}
|
||||
|
||||
m.clearStorageOfflineAlert(storage)
|
||||
|
||||
m.mu.RLock()
|
||||
|
|
@ -6787,6 +6799,28 @@ func TestHandleHostOffline(t *testing.T) {
|
|||
}
|
||||
})
|
||||
|
||||
t.Run("linked node override DisableConnectivity clears alert and returns", func(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
m.config.Enabled = true
|
||||
m.config.Overrides = map[string]ThresholdConfig{
|
||||
"cluster-node1": {DisableConnectivity: true},
|
||||
}
|
||||
|
||||
alertID := canonicalConnectivityStateID(hostResourceID("host1"))
|
||||
m.activeAlerts[alertID] = &Alert{ID: alertID, Type: "host-offline"}
|
||||
m.offlineConfirmations[hostResourceID("host1")] = 5
|
||||
|
||||
host := models.Host{ID: "host1", Hostname: "test-host", LinkedNodeID: "cluster-node1"}
|
||||
m.HandleHostOffline(host)
|
||||
|
||||
if testHasActiveAlert(t, m, alertID) {
|
||||
t.Error("expected alert to be cleared")
|
||||
}
|
||||
if _, exists := m.offlineConfirmations[hostResourceID("host1")]; exists {
|
||||
t.Error("expected offlineConfirmations to be cleared")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("override Disabled clears alert and returns", func(t *testing.T) {
|
||||
// t.Parallel()
|
||||
m := newTestManager(t)
|
||||
|
|
@ -8118,6 +8152,30 @@ func TestClearNodeOfflineAlert(t *testing.T) {
|
|||
m.mu.Unlock()
|
||||
|
||||
node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}
|
||||
m.clearNodeOfflineAlert(node)
|
||||
m.mu.RLock()
|
||||
_, existsAfterFirst := testLookupActiveAlert(t, m, state)
|
||||
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
|
||||
m.mu.RUnlock()
|
||||
if !existsAfterFirst {
|
||||
t.Fatal("expected node alert to remain active until recovery confirmations are satisfied")
|
||||
}
|
||||
if recoveryCountAfterFirst != 1 {
|
||||
t.Fatalf("expected first node recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
|
||||
}
|
||||
|
||||
m.clearNodeOfflineAlert(node)
|
||||
m.mu.RLock()
|
||||
_, existsAfterSecond := testLookupActiveAlert(t, m, state)
|
||||
recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state]
|
||||
m.mu.RUnlock()
|
||||
if !existsAfterSecond {
|
||||
t.Fatal("expected node alert to remain active after second recovery confirmation")
|
||||
}
|
||||
if recoveryCountAfterSecond != 2 {
|
||||
t.Fatalf("expected second node recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
|
||||
}
|
||||
|
||||
m.clearNodeOfflineAlert(node)
|
||||
|
||||
m.mu.RLock()
|
||||
|
|
@ -8174,7 +8232,10 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
|
|||
m.mu.Unlock()
|
||||
},
|
||||
clearFn: func(m *Manager) {
|
||||
m.clearNodeOfflineAlert(models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"})
|
||||
node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}
|
||||
m.clearNodeOfflineAlert(node)
|
||||
m.clearNodeOfflineAlert(node)
|
||||
m.clearNodeOfflineAlert(node)
|
||||
},
|
||||
},
|
||||
{
|
||||
|
|
@ -8189,7 +8250,10 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
|
|||
m.mu.Unlock()
|
||||
},
|
||||
clearFn: func(m *Manager) {
|
||||
m.clearPBSOfflineAlert(models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "host1"})
|
||||
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "host1"}
|
||||
m.clearPBSOfflineAlert(pbs)
|
||||
m.clearPBSOfflineAlert(pbs)
|
||||
m.clearPBSOfflineAlert(pbs)
|
||||
},
|
||||
},
|
||||
{
|
||||
|
|
@ -8204,7 +8268,10 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
|
|||
m.mu.Unlock()
|
||||
},
|
||||
clearFn: func(m *Manager) {
|
||||
m.clearPMGOfflineAlert(models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "host1"})
|
||||
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "host1"}
|
||||
m.clearPMGOfflineAlert(pmg)
|
||||
m.clearPMGOfflineAlert(pmg)
|
||||
m.clearPMGOfflineAlert(pmg)
|
||||
},
|
||||
},
|
||||
{
|
||||
|
|
@ -8219,7 +8286,9 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
|
|||
m.mu.Unlock()
|
||||
},
|
||||
clearFn: func(m *Manager) {
|
||||
m.clearStorageOfflineAlert(models.Storage{ID: "stor1", Name: "Storage 1", Node: "node1"})
|
||||
storage := models.Storage{ID: "stor1", Name: "Storage 1", Node: "node1"}
|
||||
m.clearStorageOfflineAlert(storage)
|
||||
m.clearStorageOfflineAlert(storage)
|
||||
},
|
||||
},
|
||||
{
|
||||
|
|
@ -8319,6 +8388,30 @@ func TestClearPBSOfflineAlert(t *testing.T) {
|
|||
m.mu.Unlock()
|
||||
|
||||
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "pbs.local"}
|
||||
m.clearPBSOfflineAlert(pbs)
|
||||
m.mu.RLock()
|
||||
_, existsAfterFirst := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
|
||||
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pbs1")]
|
||||
m.mu.RUnlock()
|
||||
if !existsAfterFirst {
|
||||
t.Fatal("expected PBS alert to remain active until recovery confirmations are satisfied")
|
||||
}
|
||||
if recoveryCountAfterFirst != 1 {
|
||||
t.Fatalf("expected first PBS recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
|
||||
}
|
||||
|
||||
m.clearPBSOfflineAlert(pbs)
|
||||
m.mu.RLock()
|
||||
_, existsAfterSecond := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
|
||||
recoveryCountAfterSecond := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pbs1")]
|
||||
m.mu.RUnlock()
|
||||
if !existsAfterSecond {
|
||||
t.Fatal("expected PBS alert to remain active after second recovery confirmation")
|
||||
}
|
||||
if recoveryCountAfterSecond != 2 {
|
||||
t.Fatalf("expected second PBS recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
|
||||
}
|
||||
|
||||
m.clearPBSOfflineAlert(pbs)
|
||||
|
||||
m.mu.RLock()
|
||||
|
|
@ -8402,6 +8495,30 @@ func TestClearPMGOfflineAlert(t *testing.T) {
|
|||
m.mu.Unlock()
|
||||
|
||||
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "pmg.local"}
|
||||
m.clearPMGOfflineAlert(pmg)
|
||||
m.mu.RLock()
|
||||
_, existsAfterFirst := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
|
||||
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pmg1")]
|
||||
m.mu.RUnlock()
|
||||
if !existsAfterFirst {
|
||||
t.Fatal("expected PMG alert to remain active until recovery confirmations are satisfied")
|
||||
}
|
||||
if recoveryCountAfterFirst != 1 {
|
||||
t.Fatalf("expected first PMG recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
|
||||
}
|
||||
|
||||
m.clearPMGOfflineAlert(pmg)
|
||||
m.mu.RLock()
|
||||
_, existsAfterSecond := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
|
||||
recoveryCountAfterSecond := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pmg1")]
|
||||
m.mu.RUnlock()
|
||||
if !existsAfterSecond {
|
||||
t.Fatal("expected PMG alert to remain active after second recovery confirmation")
|
||||
}
|
||||
if recoveryCountAfterSecond != 2 {
|
||||
t.Fatalf("expected second PMG recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
|
||||
}
|
||||
|
||||
m.clearPMGOfflineAlert(pmg)
|
||||
|
||||
m.mu.RLock()
|
||||
|
|
@ -13217,6 +13334,7 @@ func TestCheckNode(t *testing.T) {
|
|||
t.Run("online node clears offline alert", func(t *testing.T) {
|
||||
// t.Parallel()
|
||||
m := newTestManager(t)
|
||||
state := canonicalConnectivityStateID("node1")
|
||||
|
||||
// Pre-create offline alert
|
||||
m.mu.Lock()
|
||||
|
|
@ -13235,11 +13353,36 @@ func TestCheckNode(t *testing.T) {
|
|||
Status: "online",
|
||||
ConnectionHealth: "connected",
|
||||
}
|
||||
m.CheckNode(node)
|
||||
m.mu.RLock()
|
||||
_, alertExistsAfterFirst := testLookupActiveAlert(t, m, "node-offline-node1")
|
||||
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
|
||||
m.mu.RUnlock()
|
||||
if !alertExistsAfterFirst {
|
||||
t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied")
|
||||
}
|
||||
if recoveryCountAfterFirst != 1 {
|
||||
t.Fatalf("expected first recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
|
||||
}
|
||||
|
||||
m.CheckNode(node)
|
||||
m.mu.RLock()
|
||||
_, alertExistsAfterSecond := testLookupActiveAlert(t, m, "node-offline-node1")
|
||||
recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state]
|
||||
m.mu.RUnlock()
|
||||
if !alertExistsAfterSecond {
|
||||
t.Fatal("expected offline alert to remain active after second recovery confirmation")
|
||||
}
|
||||
if recoveryCountAfterSecond != 2 {
|
||||
t.Fatalf("expected second recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
|
||||
}
|
||||
|
||||
m.CheckNode(node)
|
||||
|
||||
m.mu.RLock()
|
||||
_, alertExists := testLookupActiveAlert(t, m, "node-offline-node1")
|
||||
_, countExists := m.nodeOfflineCount["node1"]
|
||||
_, recoveryExists := m.offlineRecoveryConfirmations[state]
|
||||
m.mu.RUnlock()
|
||||
|
||||
if alertExists {
|
||||
|
|
@ -13248,6 +13391,9 @@ func TestCheckNode(t *testing.T) {
|
|||
if countExists {
|
||||
t.Error("expected offline count to be cleared")
|
||||
}
|
||||
if recoveryExists {
|
||||
t.Error("expected offline recovery confirmations to be cleared")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("online node triggers metric checks", func(t *testing.T) {
|
||||
|
|
@ -14907,6 +15053,125 @@ func TestCheckHostComprehensive(t *testing.T) {
|
|||
t.Error("expected tags in metadata")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("inherits linked node overrides for host agent metrics", func(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
m.ClearActiveAlerts()
|
||||
|
||||
m.mu.Lock()
|
||||
m.config.TimeThresholds = map[string]int{}
|
||||
m.config.AgentDefaults = ThresholdConfig{
|
||||
Memory: &HysteresisThreshold{Trigger: 85.0, Clear: 80.0},
|
||||
}
|
||||
m.config.Overrides = map[string]ThresholdConfig{
|
||||
"cluster-node1": {
|
||||
Memory: &HysteresisThreshold{Trigger: 97.0, Clear: 92.0},
|
||||
},
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
host := models.Host{
|
||||
ID: "host-node1",
|
||||
DisplayName: "node1",
|
||||
Hostname: "node1",
|
||||
LinkedNodeID: "cluster-node1",
|
||||
Memory: models.Memory{
|
||||
Usage: 90.6,
|
||||
Total: 1024,
|
||||
Used: 928,
|
||||
Free: 96,
|
||||
},
|
||||
Status: "online",
|
||||
LastSeen: time.Now(),
|
||||
}
|
||||
|
||||
m.CheckHost(host)
|
||||
|
||||
m.mu.RLock()
|
||||
_, exists := m.activeAlerts[canonicalMetricStateID(hostResourceID(host.ID), "memory")]
|
||||
m.mu.RUnlock()
|
||||
|
||||
if exists {
|
||||
t.Fatal("expected linked node override to suppress host-agent memory alert")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("inherits linked guest overrides for host agent metrics", func(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
m.ClearActiveAlerts()
|
||||
|
||||
m.mu.Lock()
|
||||
m.config.TimeThresholds = map[string]int{}
|
||||
m.config.AgentDefaults = ThresholdConfig{
|
||||
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0},
|
||||
}
|
||||
m.config.Overrides = map[string]ThresholdConfig{
|
||||
"guest:Main:101": {
|
||||
CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0},
|
||||
},
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
host := models.Host{
|
||||
ID: "host-vm101",
|
||||
DisplayName: "vm101",
|
||||
Hostname: "vm101.local",
|
||||
LinkedVMID: "Main:node3:101",
|
||||
CPUUsage: 97.5,
|
||||
Status: "online",
|
||||
LastSeen: time.Now(),
|
||||
}
|
||||
|
||||
m.CheckHost(host)
|
||||
|
||||
m.mu.RLock()
|
||||
_, exists := m.activeAlerts[canonicalMetricStateID(hostResourceID(host.ID), "cpu")]
|
||||
m.mu.RUnlock()
|
||||
|
||||
if exists {
|
||||
t.Fatal("expected linked guest override to suppress host-agent cpu alert")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("prefers explicit host overrides over linked resource overrides", func(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
m.ClearActiveAlerts()
|
||||
|
||||
m.mu.Lock()
|
||||
m.config.TimeThresholds = map[string]int{}
|
||||
m.config.AgentDefaults = ThresholdConfig{
|
||||
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0},
|
||||
}
|
||||
m.config.Overrides = map[string]ThresholdConfig{
|
||||
"guest:Main:101": {
|
||||
CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0},
|
||||
},
|
||||
"host-vm101": {
|
||||
CPU: &HysteresisThreshold{Trigger: 90.0, Clear: 85.0},
|
||||
},
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
host := models.Host{
|
||||
ID: "host-vm101",
|
||||
DisplayName: "vm101",
|
||||
Hostname: "vm101.local",
|
||||
LinkedVMID: "Main:node3:101",
|
||||
CPUUsage: 97.5,
|
||||
Status: "online",
|
||||
LastSeen: time.Now(),
|
||||
}
|
||||
|
||||
m.CheckHost(host)
|
||||
|
||||
m.mu.RLock()
|
||||
alert := m.activeAlerts[canonicalMetricStateID(hostResourceID(host.ID), "cpu")]
|
||||
m.mu.RUnlock()
|
||||
|
||||
if alert == nil {
|
||||
t.Fatal("expected explicit host override to take precedence and trigger alert")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestCheckPBSComprehensive(t *testing.T) {
|
||||
|
|
@ -15332,6 +15597,7 @@ func TestCheckPBSComprehensive(t *testing.T) {
|
|||
t.Run("clears offline alert when back online", func(t *testing.T) {
|
||||
// t.Parallel()
|
||||
m := newTestManager(t)
|
||||
state := canonicalConnectivityStateID("pbs1")
|
||||
|
||||
m.mu.Lock()
|
||||
m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1", Type: "connectivity"}
|
||||
|
|
@ -15345,11 +15611,36 @@ func TestCheckPBSComprehensive(t *testing.T) {
|
|||
ConnectionHealth: "healthy",
|
||||
}
|
||||
|
||||
m.CheckPBS(pbs)
|
||||
m.mu.RLock()
|
||||
_, offlineExistsAfterFirst := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
|
||||
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
|
||||
m.mu.RUnlock()
|
||||
if !offlineExistsAfterFirst {
|
||||
t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied")
|
||||
}
|
||||
if recoveryCountAfterFirst != 1 {
|
||||
t.Fatalf("expected first PBS recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
|
||||
}
|
||||
|
||||
m.CheckPBS(pbs)
|
||||
m.mu.RLock()
|
||||
_, offlineExistsAfterSecond := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
|
||||
recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state]
|
||||
m.mu.RUnlock()
|
||||
if !offlineExistsAfterSecond {
|
||||
t.Fatal("expected offline alert to remain active after second recovery confirmation")
|
||||
}
|
||||
if recoveryCountAfterSecond != 2 {
|
||||
t.Fatalf("expected second PBS recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
|
||||
}
|
||||
|
||||
m.CheckPBS(pbs)
|
||||
|
||||
m.mu.RLock()
|
||||
_, offlineExists := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
|
||||
_, confirmExists := m.offlineConfirmations["pbs1"]
|
||||
_, recoveryExists := m.offlineRecoveryConfirmations[state]
|
||||
m.mu.RUnlock()
|
||||
|
||||
if offlineExists {
|
||||
|
|
@ -15358,6 +15649,9 @@ func TestCheckPBSComprehensive(t *testing.T) {
|
|||
if confirmExists {
|
||||
t.Error("expected offline confirmation to be cleared")
|
||||
}
|
||||
if recoveryExists {
|
||||
t.Error("expected offline recovery confirmations to be cleared")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -15664,6 +15958,7 @@ func TestCheckPMGComprehensive(t *testing.T) {
|
|||
t.Run("clears offline alert when back online", func(t *testing.T) {
|
||||
// t.Parallel()
|
||||
m := newTestManager(t)
|
||||
state := canonicalConnectivityStateID("pmg1")
|
||||
|
||||
m.mu.Lock()
|
||||
m.activeAlerts["pmg-offline-pmg1"] = &Alert{ID: "pmg-offline-pmg1", Type: "connectivity"}
|
||||
|
|
@ -15677,11 +15972,36 @@ func TestCheckPMGComprehensive(t *testing.T) {
|
|||
ConnectionHealth: "healthy",
|
||||
}
|
||||
|
||||
m.CheckPMG(pmg)
|
||||
m.mu.RLock()
|
||||
_, offlineExistsAfterFirst := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
|
||||
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
|
||||
m.mu.RUnlock()
|
||||
if !offlineExistsAfterFirst {
|
||||
t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied")
|
||||
}
|
||||
if recoveryCountAfterFirst != 1 {
|
||||
t.Fatalf("expected first PMG recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
|
||||
}
|
||||
|
||||
m.CheckPMG(pmg)
|
||||
m.mu.RLock()
|
||||
_, offlineExistsAfterSecond := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
|
||||
recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state]
|
||||
m.mu.RUnlock()
|
||||
if !offlineExistsAfterSecond {
|
||||
t.Fatal("expected offline alert to remain active after second recovery confirmation")
|
||||
}
|
||||
if recoveryCountAfterSecond != 2 {
|
||||
t.Fatalf("expected second PMG recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
|
||||
}
|
||||
|
||||
m.CheckPMG(pmg)
|
||||
|
||||
m.mu.RLock()
|
||||
_, offlineExists := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
|
||||
_, confirmExists := m.offlineConfirmations["pmg1"]
|
||||
_, recoveryExists := m.offlineRecoveryConfirmations[state]
|
||||
m.mu.RUnlock()
|
||||
|
||||
if offlineExists {
|
||||
|
|
@ -15690,6 +16010,9 @@ func TestCheckPMGComprehensive(t *testing.T) {
|
|||
if confirmExists {
|
||||
t.Error("expected offline confirmation to be cleared")
|
||||
}
|
||||
if recoveryExists {
|
||||
t.Error("expected offline recovery confirmations to be cleared")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("skips metrics when PMG is offline", func(t *testing.T) {
|
||||
|
|
@ -16008,6 +16331,7 @@ func TestCheckStorageComprehensive(t *testing.T) {
|
|||
t.Run("clears offline alert when back online", func(t *testing.T) {
|
||||
// t.Parallel()
|
||||
m := newTestManager(t)
|
||||
state := canonicalConnectivityStateID("storage1")
|
||||
|
||||
m.mu.Lock()
|
||||
m.activeAlerts["storage-offline-storage1"] = &Alert{ID: "storage-offline-storage1", Type: "connectivity"}
|
||||
|
|
@ -16020,11 +16344,24 @@ func TestCheckStorageComprehensive(t *testing.T) {
|
|||
Status: "active",
|
||||
}
|
||||
|
||||
m.CheckStorage(storage)
|
||||
m.mu.RLock()
|
||||
_, offlineExistsAfterFirst := testLookupActiveAlert(t, m, "storage-offline-storage1")
|
||||
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
|
||||
m.mu.RUnlock()
|
||||
if !offlineExistsAfterFirst {
|
||||
t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied")
|
||||
}
|
||||
if recoveryCountAfterFirst != 1 {
|
||||
t.Fatalf("expected first storage recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
|
||||
}
|
||||
|
||||
m.CheckStorage(storage)
|
||||
|
||||
m.mu.RLock()
|
||||
_, offlineExists := testLookupActiveAlert(t, m, "storage-offline-storage1")
|
||||
_, confirmExists := m.offlineConfirmations["storage1"]
|
||||
_, recoveryExists := m.offlineRecoveryConfirmations[state]
|
||||
m.mu.RUnlock()
|
||||
|
||||
if offlineExists {
|
||||
|
|
@ -16033,6 +16370,9 @@ func TestCheckStorageComprehensive(t *testing.T) {
|
|||
if confirmExists {
|
||||
t.Error("expected offline confirmation to be cleared")
|
||||
}
|
||||
if recoveryExists {
|
||||
t.Error("expected offline recovery confirmations to be cleared")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("skips usage check when usage is zero", func(t *testing.T) {
|
||||
|
|
|
|||
|
|
@ -79,6 +79,40 @@ func TestShouldSuppressNotificationQuietHours(t *testing.T) {
|
|||
t.Fatalf("expected storage alert suppression, got suppressed=%t reason=%q", suppressed, reason)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("public suppression helper mirrors quiet hours policy", func(t *testing.T) {
|
||||
m := newManagerWithQuietHoursSuppress(QuietHoursSuppression{Offline: true})
|
||||
alert := &Alert{ID: "offline-public", Type: "connectivity", Level: AlertLevelCritical}
|
||||
if !m.ShouldSuppressNotification(alert) {
|
||||
t.Fatal("expected public quiet-hours suppression helper to return true")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("resolved notifications are suppressed for acknowledged alerts", func(t *testing.T) {
|
||||
m := newManagerWithQuietHoursSuppress(QuietHoursSuppression{})
|
||||
alert := &Alert{
|
||||
ID: "resolved-ack",
|
||||
Type: "cpu",
|
||||
Level: AlertLevelCritical,
|
||||
Acknowledged: true,
|
||||
LastNotified: ptrTime(time.Now().Add(-time.Minute)),
|
||||
}
|
||||
if !m.ShouldSuppressResolvedNotification(alert) {
|
||||
t.Fatal("expected recovery notification to be suppressed for acknowledged alert")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("resolved notifications are suppressed when the firing alert was never notified", func(t *testing.T) {
|
||||
m := newManagerWithQuietHoursSuppress(QuietHoursSuppression{})
|
||||
alert := &Alert{
|
||||
ID: "resolved-unnotified",
|
||||
Type: "cpu",
|
||||
Level: AlertLevelCritical,
|
||||
}
|
||||
if !m.ShouldSuppressResolvedNotification(alert) {
|
||||
t.Fatal("expected recovery notification to be suppressed when LastNotified is nil")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestIsInQuietHours(t *testing.T) {
|
||||
|
|
|
|||
|
|
@ -156,6 +156,65 @@ func TestReevaluateActiveAlertsWithOverride(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestReevaluateActiveAlertsWithLinkedHostOverride(t *testing.T) {
|
||||
manager := NewManager()
|
||||
|
||||
manager.mu.Lock()
|
||||
manager.activeAlerts = make(map[string]*Alert)
|
||||
manager.mu.Unlock()
|
||||
|
||||
initialConfig := AlertConfig{
|
||||
Enabled: true,
|
||||
AgentDefaults: ThresholdConfig{
|
||||
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
|
||||
},
|
||||
Overrides: make(map[string]ThresholdConfig),
|
||||
}
|
||||
manager.UpdateConfig(initialConfig)
|
||||
|
||||
resourceID := hostResourceID("host-node1")
|
||||
alertID := canonicalMetricStateID(resourceID, "memory")
|
||||
alert := &Alert{
|
||||
ID: alertID,
|
||||
Type: "memory",
|
||||
Level: AlertLevelWarning,
|
||||
ResourceID: resourceID,
|
||||
ResourceName: "node1",
|
||||
Node: "node1",
|
||||
Instance: "linux",
|
||||
Message: "Host memory at 90.6%",
|
||||
Value: 90.6,
|
||||
Threshold: 85.0,
|
||||
StartTime: time.Now().Add(-5 * time.Minute),
|
||||
LastSeen: time.Now(),
|
||||
Metadata: map[string]interface{}{
|
||||
"resourceType": "agent",
|
||||
"hostId": "host-node1",
|
||||
"linkedNodeId": "cluster-node1",
|
||||
},
|
||||
}
|
||||
|
||||
manager.mu.Lock()
|
||||
manager.activeAlerts[alertID] = alert
|
||||
manager.mu.Unlock()
|
||||
|
||||
updatedConfig := initialConfig
|
||||
updatedConfig.Overrides["cluster-node1"] = ThresholdConfig{
|
||||
Memory: &HysteresisThreshold{Trigger: 97, Clear: 92},
|
||||
}
|
||||
manager.UpdateConfig(updatedConfig)
|
||||
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
manager.mu.RLock()
|
||||
_, alertStillActive := manager.activeAlerts[alertID]
|
||||
manager.mu.RUnlock()
|
||||
|
||||
if alertStillActive {
|
||||
t.Errorf("expected linked host alert to be resolved after linked node override increase")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReevaluateActiveAlertsUsesStableClusterGuestOverrideAcrossNodeMove(t *testing.T) {
|
||||
manager := NewManager()
|
||||
|
||||
|
|
|
|||
|
|
@ -369,6 +369,114 @@ func TestCheckGuestPerDiskAnnotatesCanonicalSpecMetadata(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestCheckGuestPerDiskCleansUpRemovedDiskAlerts(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
configureUnifiedEvalManager(t, m, unifiedEvalBaseConfig())
|
||||
|
||||
guestID := BuildGuestKey("pve1", "node1", 101)
|
||||
baseGuest := models.VM{
|
||||
ID: guestID,
|
||||
VMID: 101,
|
||||
Name: "app01",
|
||||
Node: "node1",
|
||||
Instance: "pve1",
|
||||
Status: "running",
|
||||
CPU: 0.20,
|
||||
Memory: models.Memory{Usage: 40},
|
||||
Disk: models.Disk{Usage: 40},
|
||||
}
|
||||
|
||||
withDisk := baseGuest
|
||||
withDisk.Disks = []models.Disk{
|
||||
{
|
||||
Mountpoint: "/",
|
||||
Device: "scsi0",
|
||||
Usage: 95,
|
||||
Total: 100,
|
||||
Used: 95,
|
||||
Free: 5,
|
||||
},
|
||||
}
|
||||
m.CheckGuest(withDisk, "pve1")
|
||||
|
||||
originalAlertID := canonicalMetricStateID(guestID+"-disk-scsi0", "disk")
|
||||
if activeAlert(t, m, originalAlertID) == nil {
|
||||
t.Fatalf("expected guest disk alert %q", originalAlertID)
|
||||
}
|
||||
|
||||
changedDisk := baseGuest
|
||||
changedDisk.Disks = []models.Disk{
|
||||
{
|
||||
Mountpoint: "/data",
|
||||
Device: "scsi1",
|
||||
Usage: 96,
|
||||
Total: 100,
|
||||
Used: 96,
|
||||
Free: 4,
|
||||
},
|
||||
}
|
||||
m.CheckGuest(changedDisk, "pve1")
|
||||
|
||||
newAlertID := canonicalMetricStateID(guestID+"-disk-data-scsi1", "disk")
|
||||
m.mu.RLock()
|
||||
_, oldExists := testLookupActiveAlert(t, m, originalAlertID)
|
||||
_, newExists := testLookupActiveAlert(t, m, newAlertID)
|
||||
m.mu.RUnlock()
|
||||
|
||||
if oldExists {
|
||||
t.Fatalf("expected stale guest disk alert %q to be cleared", originalAlertID)
|
||||
}
|
||||
if !newExists {
|
||||
t.Fatalf("expected replacement guest disk alert %q", newAlertID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckGuestClearsPerDiskAlertsWhenGuestStops(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
configureUnifiedEvalManager(t, m, unifiedEvalBaseConfig())
|
||||
|
||||
guestID := BuildGuestKey("pve1", "node1", 101)
|
||||
guest := models.VM{
|
||||
ID: guestID,
|
||||
VMID: 101,
|
||||
Name: "app01",
|
||||
Node: "node1",
|
||||
Instance: "pve1",
|
||||
Status: "running",
|
||||
CPU: 0.20,
|
||||
Memory: models.Memory{Usage: 40},
|
||||
Disk: models.Disk{Usage: 40},
|
||||
Disks: []models.Disk{
|
||||
{
|
||||
Mountpoint: "/",
|
||||
Device: "scsi0",
|
||||
Usage: 95,
|
||||
Total: 100,
|
||||
Used: 95,
|
||||
Free: 5,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
m.CheckGuest(guest, "pve1")
|
||||
|
||||
alertID := canonicalMetricStateID(guestID+"-disk-scsi0", "disk")
|
||||
if activeAlert(t, m, alertID) == nil {
|
||||
t.Fatalf("expected guest disk alert %q", alertID)
|
||||
}
|
||||
|
||||
guest.Status = "stopped"
|
||||
guest.Disks = nil
|
||||
m.CheckGuest(guest, "pve1")
|
||||
|
||||
m.mu.RLock()
|
||||
_, exists := testLookupActiveAlert(t, m, alertID)
|
||||
m.mu.RUnlock()
|
||||
if exists {
|
||||
t.Fatalf("expected guest disk alert %q to clear when guest stops", alertID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNodeTemperatureAnnotatesCanonicalSpecMetadata(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
configureUnifiedEvalManager(t, m, unifiedEvalBaseConfig())
|
||||
|
|
|
|||
|
|
@ -221,6 +221,122 @@ func TestCheckDockerContainerImageUpdate(t *testing.T) {
|
|||
}
|
||||
})
|
||||
|
||||
t.Run("missing update status preserves existing alert and tracking", func(t *testing.T) {
|
||||
testResourceID := "docker:" + hostID + "/container-unknown-status"
|
||||
container := models.DockerContainer{
|
||||
ID: "container-unknown-status",
|
||||
Name: "unknown-status-container",
|
||||
Image: "nginx:latest",
|
||||
UpdateStatus: &models.DockerContainerUpdateStatus{
|
||||
UpdateAvailable: true,
|
||||
CurrentDigest: "sha256:old",
|
||||
LatestDigest: "sha256:new",
|
||||
LastChecked: time.Now(),
|
||||
},
|
||||
}
|
||||
trackingKey := dockerUpdateTrackingKey(host, container)
|
||||
firstSeen := time.Now().Add(-25 * time.Hour)
|
||||
|
||||
m.mu.Lock()
|
||||
m.dockerUpdateFirstSeen[testResourceID] = firstSeen
|
||||
m.dockerUpdateFirstSeenByIdentity[trackingKey] = firstSeen
|
||||
m.mu.Unlock()
|
||||
|
||||
m.checkDockerContainerImageUpdate(host, container, testResourceID, "unknown-status-container", instanceName, nodeName)
|
||||
|
||||
canonicalAlertID := buildCanonicalStateID(testResourceID, testResourceID+"-image-update")
|
||||
m.mu.RLock()
|
||||
alert := testRequireActiveAlert(t, m, canonicalAlertID)
|
||||
if alert == nil {
|
||||
m.mu.RUnlock()
|
||||
t.Fatalf("expected update alert %q to be active", canonicalAlertID)
|
||||
}
|
||||
beforeLastSeen := alert.LastSeen
|
||||
m.mu.RUnlock()
|
||||
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
container.UpdateStatus = nil
|
||||
m.checkDockerContainerImageUpdate(host, container, testResourceID, "unknown-status-container", instanceName, nodeName)
|
||||
|
||||
m.mu.RLock()
|
||||
alert = testRequireActiveAlert(t, m, canonicalAlertID)
|
||||
if alert == nil {
|
||||
m.mu.RUnlock()
|
||||
t.Fatalf("expected update alert %q to remain active", canonicalAlertID)
|
||||
}
|
||||
_, tracked := m.dockerUpdateFirstSeen[testResourceID]
|
||||
_, trackedByIdentity := m.dockerUpdateFirstSeenByIdentity[trackingKey]
|
||||
afterLastSeen := alert.LastSeen
|
||||
m.mu.RUnlock()
|
||||
|
||||
if !tracked || !trackedByIdentity {
|
||||
t.Fatal("expected update tracking to remain when update status is temporarily unavailable")
|
||||
}
|
||||
if !afterLastSeen.After(beforeLastSeen) {
|
||||
t.Fatalf("expected LastSeen to refresh when update status is missing, before=%s after=%s", beforeLastSeen, afterLastSeen)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("update detection errors preserve existing alert and tracking", func(t *testing.T) {
|
||||
testResourceID := "docker:" + hostID + "/container-error-status"
|
||||
container := models.DockerContainer{
|
||||
ID: "container-error-status",
|
||||
Name: "error-status-container",
|
||||
Image: "nginx:latest",
|
||||
UpdateStatus: &models.DockerContainerUpdateStatus{
|
||||
UpdateAvailable: true,
|
||||
CurrentDigest: "sha256:old",
|
||||
LatestDigest: "sha256:new",
|
||||
LastChecked: time.Now(),
|
||||
},
|
||||
}
|
||||
trackingKey := dockerUpdateTrackingKey(host, container)
|
||||
firstSeen := time.Now().Add(-25 * time.Hour)
|
||||
|
||||
m.mu.Lock()
|
||||
m.dockerUpdateFirstSeen[testResourceID] = firstSeen
|
||||
m.dockerUpdateFirstSeenByIdentity[trackingKey] = firstSeen
|
||||
m.mu.Unlock()
|
||||
|
||||
m.checkDockerContainerImageUpdate(host, container, testResourceID, "error-status-container", instanceName, nodeName)
|
||||
|
||||
canonicalAlertID := buildCanonicalStateID(testResourceID, testResourceID+"-image-update")
|
||||
m.mu.RLock()
|
||||
alert := testRequireActiveAlert(t, m, canonicalAlertID)
|
||||
if alert == nil {
|
||||
m.mu.RUnlock()
|
||||
t.Fatalf("expected update alert %q to be active", canonicalAlertID)
|
||||
}
|
||||
beforeLastSeen := alert.LastSeen
|
||||
m.mu.RUnlock()
|
||||
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
container.UpdateStatus = &models.DockerContainerUpdateStatus{
|
||||
UpdateAvailable: true,
|
||||
Error: "rate limited",
|
||||
LastChecked: time.Now(),
|
||||
}
|
||||
m.checkDockerContainerImageUpdate(host, container, testResourceID, "error-status-container", instanceName, nodeName)
|
||||
|
||||
m.mu.RLock()
|
||||
alert = testRequireActiveAlert(t, m, canonicalAlertID)
|
||||
if alert == nil {
|
||||
m.mu.RUnlock()
|
||||
t.Fatalf("expected update alert %q to remain active after check error", canonicalAlertID)
|
||||
}
|
||||
_, tracked := m.dockerUpdateFirstSeen[testResourceID]
|
||||
_, trackedByIdentity := m.dockerUpdateFirstSeenByIdentity[trackingKey]
|
||||
afterLastSeen := alert.LastSeen
|
||||
m.mu.RUnlock()
|
||||
|
||||
if !tracked || !trackedByIdentity {
|
||||
t.Fatal("expected update tracking to remain after update check error")
|
||||
}
|
||||
if !afterLastSeen.After(beforeLastSeen) {
|
||||
t.Fatalf("expected LastSeen to refresh after update check error, before=%s after=%s", beforeLastSeen, afterLastSeen)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("disabled by negative delay hours", func(t *testing.T) {
|
||||
// Create manager with disabled update alerts
|
||||
m2 := NewManager()
|
||||
|
|
|
|||
|
|
@ -91,6 +91,33 @@ func TestGetStateRefreshesLiveAlertSnapshots(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestEscalationDeliveryDefersToCanonicalAlertSuppression(t *testing.T) {
|
||||
requiredSnippets := map[string][]string{
|
||||
"monitor.go": {
|
||||
"m.alertManager.SetEscalateCallback(func(alert *alerts.Alert, level int) {",
|
||||
"m.handleAlertEscalated(wsHub, alert, level)",
|
||||
},
|
||||
"monitor_alerts.go": {
|
||||
"func (m *Monitor) handleAlertEscalated(hub *websocket.Hub, alert *alerts.Alert, level int) {",
|
||||
"if m.alertManager.ShouldSuppressNotification(alert) {",
|
||||
"m.broadcastEscalatedAlert(hub, alert)",
|
||||
},
|
||||
}
|
||||
|
||||
for file, snippets := range requiredSnippets {
|
||||
data, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to read %s: %v", file, err)
|
||||
}
|
||||
source := string(data)
|
||||
for _, snippet := range snippets {
|
||||
if !strings.Contains(source, snippet) {
|
||||
t.Fatalf("%s must contain %q", file, snippet)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMonitoredSystemUsageReadinessGuardrailsRemainCanonical(t *testing.T) {
|
||||
requiredSnippets := map[string][]string{
|
||||
"monitor.go": {
|
||||
|
|
|
|||
|
|
@ -1772,41 +1772,7 @@ func (m *Monitor) Start(ctx context.Context, wsHub *websocket.Hub) {
|
|||
m.handleAlertUnacknowledged(alert, user)
|
||||
})
|
||||
m.alertManager.SetEscalateCallback(func(alert *alerts.Alert, level int) {
|
||||
log.Info().
|
||||
Str("alertID", alert.ID).
|
||||
Int("level", level).
|
||||
Msg("Alert escalated - sending notifications")
|
||||
|
||||
// Get escalation config
|
||||
config := m.alertManager.GetConfig()
|
||||
if level <= 0 || level > len(config.Schedule.Escalation.Levels) {
|
||||
return
|
||||
}
|
||||
|
||||
escalationLevel := config.Schedule.Escalation.Levels[level-1]
|
||||
|
||||
// Send notifications based on escalation level
|
||||
switch escalationLevel.Notify {
|
||||
case "email":
|
||||
// Only send email
|
||||
if emailConfig := m.notificationMgr.GetEmailConfig(); emailConfig.Enabled {
|
||||
m.notificationMgr.SendAlert(alert)
|
||||
}
|
||||
case "webhook":
|
||||
// Only send webhooks
|
||||
for _, webhook := range m.notificationMgr.GetWebhooks() {
|
||||
if webhook.Enabled {
|
||||
m.notificationMgr.SendAlert(alert)
|
||||
break
|
||||
}
|
||||
}
|
||||
case "all":
|
||||
// Send all notifications
|
||||
m.notificationMgr.SendAlert(alert)
|
||||
}
|
||||
|
||||
// Update WebSocket with escalation
|
||||
m.broadcastEscalatedAlert(wsHub, alert)
|
||||
m.handleAlertEscalated(wsHub, alert, level)
|
||||
})
|
||||
|
||||
// Create separate tickers for polling and broadcasting using the configured cadence
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
package monitoring
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
|
@ -276,3 +278,125 @@ func TestMonitor_HandleAlertResolved_NoQuietHoursSendsNotification(t *testing.T)
|
|||
// Should not crash, and notification should be dispatched (not suppressed)
|
||||
m.handleAlertResolved(alertID)
|
||||
}
|
||||
|
||||
func TestMonitor_HandleAlertEscalated_QuietHoursSuppressesNotification(t *testing.T) {
|
||||
t.Setenv("PULSE_DATA_DIR", t.TempDir())
|
||||
|
||||
requests := make(chan struct{}, 1)
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
requests <- struct{}{}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
notifMgr := notifications.NewNotificationManager("https://pulse.local")
|
||||
defer notifMgr.Stop()
|
||||
notifMgr.SetGroupingWindow(0)
|
||||
notifMgr.SetCooldown(0)
|
||||
if err := notifMgr.UpdateAllowedPrivateCIDRs("127.0.0.1/32"); err != nil {
|
||||
t.Fatalf("UpdateAllowedPrivateCIDRs: %v", err)
|
||||
}
|
||||
notifMgr.AddWebhook(notifications.WebhookConfig{
|
||||
ID: "quiet-hours-hook",
|
||||
Name: "quiet-hours",
|
||||
URL: server.URL,
|
||||
Enabled: true,
|
||||
})
|
||||
|
||||
mgr := alerts.NewManager()
|
||||
cfg := mgr.GetConfig()
|
||||
cfg.Schedule.Escalation.Levels = []alerts.EscalationLevel{{After: 1, Notify: "webhook"}}
|
||||
cfg.Schedule.QuietHours = alerts.QuietHours{
|
||||
Enabled: true,
|
||||
Start: "00:00",
|
||||
End: "23:59",
|
||||
Timezone: "UTC",
|
||||
Days: map[string]bool{
|
||||
"monday": true, "tuesday": true, "wednesday": true,
|
||||
"thursday": true, "friday": true, "saturday": true, "sunday": true,
|
||||
},
|
||||
Suppress: alerts.QuietHoursSuppression{Offline: true},
|
||||
}
|
||||
mgr.UpdateConfig(cfg)
|
||||
|
||||
m := &Monitor{
|
||||
notificationMgr: notifMgr,
|
||||
alertManager: mgr,
|
||||
}
|
||||
|
||||
alert := &alerts.Alert{
|
||||
ID: "escalated-offline",
|
||||
Type: "connectivity",
|
||||
Level: alerts.AlertLevelCritical,
|
||||
ResourceID: "node/pve-1",
|
||||
ResourceName: "pve-1",
|
||||
Node: "pve-1",
|
||||
Instance: "pve",
|
||||
Message: "Node offline",
|
||||
StartTime: time.Now(),
|
||||
}
|
||||
|
||||
m.handleAlertEscalated(nil, alert, 1)
|
||||
|
||||
select {
|
||||
case <-requests:
|
||||
t.Fatal("expected quiet hours to suppress escalated notification delivery")
|
||||
case <-time.After(500 * time.Millisecond):
|
||||
}
|
||||
}
|
||||
|
||||
func TestMonitor_HandleAlertEscalated_SendsNotificationWhenNotSuppressed(t *testing.T) {
|
||||
t.Setenv("PULSE_DATA_DIR", t.TempDir())
|
||||
|
||||
requests := make(chan struct{}, 1)
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
requests <- struct{}{}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
notifMgr := notifications.NewNotificationManager("https://pulse.local")
|
||||
defer notifMgr.Stop()
|
||||
notifMgr.SetGroupingWindow(0)
|
||||
notifMgr.SetCooldown(0)
|
||||
if err := notifMgr.UpdateAllowedPrivateCIDRs("127.0.0.1/32"); err != nil {
|
||||
t.Fatalf("UpdateAllowedPrivateCIDRs: %v", err)
|
||||
}
|
||||
notifMgr.AddWebhook(notifications.WebhookConfig{
|
||||
ID: "normal-hook",
|
||||
Name: "normal",
|
||||
URL: server.URL,
|
||||
Enabled: true,
|
||||
})
|
||||
|
||||
mgr := alerts.NewManager()
|
||||
cfg := mgr.GetConfig()
|
||||
cfg.Schedule.Escalation.Levels = []alerts.EscalationLevel{{After: 1, Notify: "webhook"}}
|
||||
cfg.Schedule.QuietHours.Enabled = false
|
||||
mgr.UpdateConfig(cfg)
|
||||
|
||||
m := &Monitor{
|
||||
notificationMgr: notifMgr,
|
||||
alertManager: mgr,
|
||||
}
|
||||
|
||||
alert := &alerts.Alert{
|
||||
ID: "escalated-normal",
|
||||
Type: "connectivity",
|
||||
Level: alerts.AlertLevelCritical,
|
||||
ResourceID: "node/pve-1",
|
||||
ResourceName: "pve-1",
|
||||
Node: "pve-1",
|
||||
Instance: "pve",
|
||||
Message: "Node offline",
|
||||
StartTime: time.Now(),
|
||||
}
|
||||
|
||||
m.handleAlertEscalated(nil, alert, 1)
|
||||
|
||||
select {
|
||||
case <-requests:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("expected escalated notification delivery")
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import (
|
|||
"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
|
||||
"github.com/rcourtman/pulse-go-rewrite/internal/mock"
|
||||
"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
|
||||
"github.com/rcourtman/pulse-go-rewrite/internal/websocket"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
|
|
@ -140,6 +141,52 @@ func (m *Monitor) handleAlertResolved(alertID string) {
|
|||
}
|
||||
}
|
||||
|
||||
func (m *Monitor) handleAlertEscalated(hub *websocket.Hub, alert *alerts.Alert, level int) {
|
||||
if alert == nil || m.alertManager == nil {
|
||||
return
|
||||
}
|
||||
|
||||
log.Info().
|
||||
Str("alertID", alert.ID).
|
||||
Int("level", level).
|
||||
Msg("Alert escalated")
|
||||
|
||||
config := m.alertManager.GetConfig()
|
||||
if level <= 0 || level > len(config.Schedule.Escalation.Levels) {
|
||||
return
|
||||
}
|
||||
|
||||
if m.alertManager.ShouldSuppressNotification(alert) {
|
||||
log.Info().
|
||||
Str("alertID", alert.ID).
|
||||
Int("level", level).
|
||||
Msg("Escalated notification suppressed during quiet hours")
|
||||
m.broadcastEscalatedAlert(hub, alert)
|
||||
return
|
||||
}
|
||||
|
||||
if m.notificationMgr != nil {
|
||||
escalationLevel := config.Schedule.Escalation.Levels[level-1]
|
||||
switch escalationLevel.Notify {
|
||||
case "email":
|
||||
if emailConfig := m.notificationMgr.GetEmailConfig(); emailConfig.Enabled {
|
||||
m.notificationMgr.SendAlert(alert)
|
||||
}
|
||||
case "webhook":
|
||||
for _, webhook := range m.notificationMgr.GetWebhooks() {
|
||||
if webhook.Enabled {
|
||||
m.notificationMgr.SendAlert(alert)
|
||||
break
|
||||
}
|
||||
}
|
||||
case "all":
|
||||
m.notificationMgr.SendAlert(alert)
|
||||
}
|
||||
}
|
||||
|
||||
m.broadcastEscalatedAlert(hub, alert)
|
||||
}
|
||||
|
||||
func (m *Monitor) handleAlertAcknowledged(alert *alerts.Alert, user string) {
|
||||
if m.incidentStore == nil || alert == nil {
|
||||
if alert == nil {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue