diff --git a/docs/release-control/v6/internal/subsystems/alerts.md b/docs/release-control/v6/internal/subsystems/alerts.md index 04c4f42bb..0a124fe74 100644 --- a/docs/release-control/v6/internal/subsystems/alerts.md +++ b/docs/release-control/v6/internal/subsystems/alerts.md @@ -82,6 +82,12 @@ runtime must migrate the active alert, history entry, acknowledgment record, suppression/rate-limit/flapping tracking, and guest per-disk metric identity to the current canonical state instead of reopening a duplicate alert or resolving only the stale node-scoped identity. +That same guest-alert owner also has to retire per-disk guest alerts when the +guest stops, disk alerting is disabled, or the reported disk set changes. +Canonical guest disk identity is only valid while the guest still exposes that +disk resource under the current thresholds, so runtime cleanup must remove +stale `guestID-disk-*` state instead of leaving orphaned per-disk incidents in +active alerts, resolved history, or later UI projections. That same alerts runtime also owns instance-scoped node display-name resolution. Raw node names are not globally unique across configured infrastructure instances, so cached node display names must key on instance + @@ -211,6 +217,11 @@ keep schedules such as `00:00` to `23:59` active through the full final minute instead of expiring at `23:59:00`. Alert quiet-hours proofs should control time through the alert manager clock hook instead of depending on wall clock execution at whatever second the test runner happens to hit. +Quiet-hours suppression also applies to alert delivery lifecycle, not only the +initial raised notification. Resolved notifications must not fan out when the +alert was never notified or was already acknowledged, and monitoring-driven +escalation delivery must consult the same quiet-hours suppression path while +still letting canonical escalation state reach websocket consumers. That schedule surface now also follows the same shell/runtime split as the other feature tabs: `frontend-modern/src/features/alerts/tabs/ScheduleTab.tsx` stays the render shell, while @@ -231,6 +242,21 @@ presentation now also route through `frontend-modern/src/utils/alertIncidentPresentation.ts` instead of remaining duplicated inline across the alerts page and overview timeline surfaces. +Poll-driven connectivity recovery is also part of canonical alert truth. +Resources that clear an offline alert from later healthy polls must require +repeated healthy confirmations before resolving that alert instead of clearing +on the first recovered sample; otherwise transient poll recovery reopens the +same regression as false "back online" notifications and missing downtime +signal. Nodes, PBS, and PMG require three healthy confirmations before +resolution, while storage requires two. + +Host-agent threshold ownership now follows the linked resource model. +Explicit agent overrides still win, but when no host-agent override exists the +alerts runtime must inherit linked node or guest overrides for that agent so +metric and connectivity behavior match the logical machine the agent augments. +Persisted host alerts must carry enough linked-resource metadata for +reevaluation after threshold changes to honor that same inheritance rule. + Alert resource tables, grouped node headers, and alert override reconstruction now route resource-backed names through the shared policy-aware alerts helper so governed resources do not fall back to raw names when the thresholds editor diff --git a/docs/release-control/v6/internal/subsystems/monitoring.md b/docs/release-control/v6/internal/subsystems/monitoring.md index 7994099db..01ce53d98 100644 --- a/docs/release-control/v6/internal/subsystems/monitoring.md +++ b/docs/release-control/v6/internal/subsystems/monitoring.md @@ -95,6 +95,12 @@ This subsystem now sits under the dedicated core monitoring runtime lane so discovery, metrics-history correctness, and platform-specific runtime coverage can be governed as first-class product work instead of staying diluted inside architecture coherence. +That same monitoring boundary also owns the escalation callback bridge into the +alerts delivery layer. Monitor-owned escalation handling may still publish +canonical escalation state to websocket consumers, but notification fan-out +must defer quiet-hours and resolved-notification suppression policy to the +alerts manager instead of bypassing that shared routing contract when monitor +plumbs escalations outward. That same monitoring owner now also governs monitored-system usage readiness for commercial boundaries. A non-nil unified read-state is not sufficient when provider-owned supplemental inventories such as TrueNAS or VMware are still diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go index 368ec4d66..6f1b5359d 100644 --- a/internal/alerts/alerts.go +++ b/internal/alerts/alerts.go @@ -45,10 +45,12 @@ const ( // Cleanup intervals const ( - StaleTrackingThreshold = 24 * time.Hour - RateLimitCleanupWindow = 1 * time.Hour - alertsDirPerm = 0o700 - alertsFilePerm = 0o600 + StaleTrackingThreshold = 24 * time.Hour + RateLimitCleanupWindow = 1 * time.Hour + alertsDirPerm = 0o700 + alertsFilePerm = 0o600 + offlineRecoveryConfirmationsDefault = 3 + offlineRecoveryConfirmationsStorage = 2 ) func normalizePoweredOffSeverity(level AlertLevel) AlertLevel { @@ -555,13 +557,14 @@ type Manager struct { // Time threshold tracking pendingAlerts map[string]time.Time // Track when thresholds were first exceeded // Offline confirmation tracking - nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy) - offlineConfirmations map[string]int // Track consecutive offline counts for all resources - dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts - dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers - dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection - dockerLastExitCode map[string]int // Track last exit code for OOM detection - dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay + nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy) + offlineConfirmations map[string]int // Track consecutive offline counts for all resources + offlineRecoveryConfirmations map[string]int // Track consecutive healthy confirmations before clearing poll-driven offline alerts + dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts + dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers + dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection + dockerLastExitCode map[string]int // Track last exit code for OOM detection + dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay // Stable identity tracking prevents update-delay resets when host IDs churn. dockerUpdateFirstSeenByIdentity map[string]time.Time // PMG quarantine growth tracking @@ -642,6 +645,7 @@ func NewManagerWithDataDir(dataDir string) *Manager { pendingAlerts: make(map[string]time.Time), nodeOfflineCount: make(map[string]int), offlineConfirmations: make(map[string]int), + offlineRecoveryConfirmations: make(map[string]int), dockerOfflineCount: make(map[string]int), dockerStateConfirm: make(map[string]int), dockerRestartTracking: make(map[string]*dockerRestartRecord), @@ -2046,10 +2050,7 @@ func (m *Manager) reevaluateActiveAlertsLocked() { alertsToResolve = append(alertsToResolve, alertID) continue } - // Overrides are keyed by raw host ID (without the "agent:" prefix - // that hostResourceID adds to the resource ID used in alert IDs). - rawHostID := stripHostResourcePrefix(resourceID) - thresholds := m.resolveResourceThresholds("agent", rawHostID) + thresholds := m.resolveHostAlertThresholdsNoLock(alert, resourceID) if thresholds.Disabled { alertsToResolve = append(alertsToResolve, alertID) continue @@ -2507,6 +2508,28 @@ func (m *Manager) shouldSuppressNotification(alert *Alert) (bool, string) { return false, "" } +// ShouldSuppressNotification checks if a notification should be suppressed +// during quiet hours. This is used by paths that bypass dispatchAlert, such as escalation. +func (m *Manager) ShouldSuppressNotification(alert *Alert) bool { + if alert == nil { + return false + } + + m.mu.RLock() + defer m.mu.RUnlock() + + suppressed, reason := m.shouldSuppressNotification(alert) + if suppressed { + log.Debug(). + Str("alertID", alert.ID). + Str("type", alert.Type). + Str("level", string(alert.Level)). + Str("quietHoursRule", reason). + Msg("Notification suppressed during quiet hours") + } + return suppressed +} + // ShouldSuppressResolvedNotification checks if a recovery notification should be suppressed // during quiet hours. Recovery notifications follow the same quiet hours rules as their // corresponding alerts - if the original alert would have been suppressed, so is the recovery. @@ -2518,6 +2541,22 @@ func (m *Manager) ShouldSuppressResolvedNotification(alert *Alert) bool { m.mu.RLock() defer m.mu.RUnlock() + if alert.Acknowledged { + log.Debug(). + Str("alertID", alert.ID). + Str("type", alert.Type). + Msg("Recovery notification suppressed for acknowledged alert") + return true + } + + if alert.LastNotified == nil { + log.Debug(). + Str("alertID", alert.ID). + Str("type", alert.Type). + Msg("Recovery notification suppressed because firing notification was never sent") + return true + } + suppressed, reason := m.shouldSuppressNotification(alert) if suppressed { log.Debug(). @@ -2698,23 +2737,7 @@ func (m *Manager) CheckGuest(guest any, instanceName string) { m.clearGuestPoweredOffAlert(guestID, name) } - // Clear all resource metric alerts (cpu, memory, disk, etc.) for non-running guests - m.mu.Lock() - alertsCleared := 0 - for storageKey, alert := range m.activeAlerts { - alertID := effectiveAlertID(alert, storageKey) - // Only clear resource metric alerts, not powered-off alerts - if alert.ResourceID == guestID && alert.Type != "powered-off" { - m.clearAlertNoLock(alertID) - alertsCleared++ - log.Debug(). - Str("alertID", alertID). - Str("guest", name). - Str("status", status). - Msg("Cleared metric alert for non-running guest") - } - } - m.mu.Unlock() + alertsCleared := m.clearGuestMetricAlerts(guestID) if alertsCleared > 0 { log.Debug(). @@ -2722,6 +2745,7 @@ func (m *Manager) CheckGuest(guest any, instanceName string) { Str("status", status). Int("alertsCleared", alertsCleared). Msg("Cleared metric alerts for non-running guest") + m.saveActiveAlertsAsync("guest-not-running") } return } @@ -2744,18 +2768,13 @@ func (m *Manager) CheckGuest(guest any, instanceName string) { // If alerts are disabled for this guest, clear any existing alerts and return if thresholds.Disabled { - m.mu.Lock() - for storageKey, alert := range m.activeAlerts { - alertID := effectiveAlertID(alert, storageKey) - if alert.ResourceID == guestID { - m.clearAlertNoLock(alertID) - log.Info(). - Str("alertID", alertID). - Str("guest", name). - Msg("Cleared alert - guest has alerts disabled") - } + if alertsCleared := m.clearGuestMetricAlerts(guestID); alertsCleared > 0 { + log.Info(). + Str("guest", name). + Int("alertsCleared", alertsCleared). + Msg("Cleared guest metric alerts because alerts are disabled") + m.saveActiveAlertsAsync("guest-alerts-disabled") } - m.mu.Unlock() return } @@ -2789,7 +2808,8 @@ func (m *Manager) CheckGuest(guest any, instanceName string) { }, thresholds, evalOpts) if thresholds.Disk != nil && thresholds.Disk.Trigger > 0 && len(disks) > 0 { - seenDisks := make(map[string]struct{}) + seenDiskKeys := make(map[string]struct{}) + seenDiskResources := make(map[string]struct{}) for idx, disk := range disks { if disk.Total <= 0 { continue @@ -2816,12 +2836,13 @@ func (m *Manager) CheckGuest(guest any, instanceName string) { } // Avoid duplicate checks if two disks resolve to the same key - if _, exists := seenDisks[sanitizedKey]; exists { + if _, exists := seenDiskKeys[sanitizedKey]; exists { continue } - seenDisks[sanitizedKey] = struct{}{} + seenDiskKeys[sanitizedKey] = struct{}{} perDiskResourceID := fmt.Sprintf("%s-disk-%s", guestID, sanitizedKey) + seenDiskResources[perDiskResourceID] = struct{}{} message := fmt.Sprintf("%s disk (%s) at %.1f%%", guestType, label, disk.Usage) log.Debug(). @@ -2867,6 +2888,11 @@ func (m *Manager) CheckGuest(guest any, instanceName string) { MonitorOnly: monitorOnly, }) } + if cleared := m.cleanupGuestDiskAlerts(guestID, seenDiskResources); cleared > 0 { + m.saveActiveAlertsAsync("guest-disk-set-changed") + } + } else if cleared := m.cleanupGuestDiskAlerts(guestID, nil); cleared > 0 { + m.saveActiveAlertsAsync("guest-disk-alerts-cleared") } } @@ -3131,6 +3157,64 @@ func hostInstanceName(host models.Host) string { return "Agent" } +// resolveHostThresholdsNoLock resolves the effective thresholds for a host agent. +// Explicit host-agent overrides win. Otherwise, linked node/guest overrides are +// inherited so the host agent follows the logical resource it augments. +// Callers must hold m.mu when reading config through this helper. +func (m *Manager) resolveHostThresholdsNoLock(hostID, linkedNodeID, linkedVMID, linkedContainerID string) ThresholdConfig { + base := m.defaultThresholdsForResourceType("agent") + + if hostID = strings.TrimSpace(hostID); hostID != "" { + if override, exists := m.config.Overrides[hostID]; exists { + return m.applyThresholdOverride(base, override) + } + } + + if linkedNodeID = strings.TrimSpace(linkedNodeID); linkedNodeID != "" { + if override, exists := m.config.Overrides[linkedNodeID]; exists { + return m.applyThresholdOverride(base, override) + } + } + + if linkedVMID = strings.TrimSpace(linkedVMID); linkedVMID != "" { + if override, exists := lookupGuestOverride(m.config.Overrides, nil, linkedVMID); exists { + return m.applyThresholdOverride(base, override) + } + } + + if linkedContainerID = strings.TrimSpace(linkedContainerID); linkedContainerID != "" { + if override, exists := lookupGuestOverride(m.config.Overrides, nil, linkedContainerID); exists { + return m.applyThresholdOverride(base, override) + } + } + + return base +} + +// resolveHostAlertThresholdsNoLock resolves thresholds for persisted host-agent alerts. +// Alert metadata carries the link context needed to inherit node/guest overrides. +// Callers must hold m.mu when reading config through this helper. +func (m *Manager) resolveHostAlertThresholdsNoLock(alert *Alert, resourceID string) ThresholdConfig { + hostID := stripHostResourcePrefix(resourceID) + if idx := strings.Index(hostID, "/"); idx >= 0 { + hostID = hostID[:idx] + } + + linkedNodeID := "" + linkedVMID := "" + linkedContainerID := "" + if alert != nil { + if metadataHostID := metadataStringValue(alert.Metadata, "hostId"); metadataHostID != "" { + hostID = metadataHostID + } + linkedNodeID = metadataStringValue(alert.Metadata, "linkedNodeId") + linkedVMID = metadataStringValue(alert.Metadata, "linkedVmId") + linkedContainerID = metadataStringValue(alert.Metadata, "linkedContainerId") + } + + return m.resolveHostThresholdsNoLock(hostID, linkedNodeID, linkedVMID, linkedContainerID) +} + func sanitizeHostComponent(value string) string { value = strings.TrimSpace(strings.ToLower(value)) if value == "" { @@ -3227,7 +3311,7 @@ func (m *Manager) CheckHost(host models.Host) { m.mu.RLock() alertsEnabled := m.config.Enabled disableAllAgents := m.config.DisableAllAgents - thresholds := m.resolveResourceThresholds("agent", host.ID) + thresholds := m.resolveHostThresholdsNoLock(host.ID, host.LinkedNodeID, host.LinkedVMID, host.LinkedContainerID) m.mu.RUnlock() if !alertsEnabled { @@ -3267,6 +3351,15 @@ func (m *Manager) CheckHost(host models.Host) { "agentVersion": host.AgentVersion, "architecture": host.Architecture, } + if linkedNodeID := strings.TrimSpace(host.LinkedNodeID); linkedNodeID != "" { + baseMetadata["linkedNodeId"] = linkedNodeID + } + if linkedVMID := strings.TrimSpace(host.LinkedVMID); linkedVMID != "" { + baseMetadata["linkedVmId"] = linkedVMID + } + if linkedContainerID := strings.TrimSpace(host.LinkedContainerID); linkedContainerID != "" { + baseMetadata["linkedContainerId"] = linkedContainerID + } if len(host.Tags) > 0 { baseMetadata["tags"] = append([]string(nil), host.Tags...) } @@ -3590,7 +3683,7 @@ func (m *Manager) HandleHostOffline(host models.Host) { return } disableHostsOffline := m.config.DisableAllAgentsOffline - thresholds := m.resolveResourceThresholds("agent", host.ID) + thresholds := m.resolveHostThresholdsNoLock(host.ID, host.LinkedNodeID, host.LinkedVMID, host.LinkedContainerID) m.mu.RUnlock() resourceKey := hostResourceID(host.ID) @@ -3644,13 +3737,16 @@ func (m *Manager) HandleHostOffline(host models.Host) { Instance: instanceName, Message: fmt.Sprintf("Host '%s' is offline", resourceName), Metadata: map[string]interface{}{ - "resourceType": "agent", - "hostId": host.ID, - "hostname": host.Hostname, - "displayName": host.DisplayName, - "platform": host.Platform, - "osName": host.OSName, - "osVersion": host.OSVersion, + "resourceType": "agent", + "hostId": host.ID, + "hostname": host.Hostname, + "displayName": host.DisplayName, + "platform": host.Platform, + "osName": host.OSName, + "osVersion": host.OSVersion, + "linkedNodeId": strings.TrimSpace(host.LinkedNodeID), + "linkedVmId": strings.TrimSpace(host.LinkedVMID), + "linkedContainerId": strings.TrimSpace(host.LinkedContainerID), }, AddToRecent: true, AddToHistory: true, @@ -3760,6 +3856,72 @@ func (m *Manager) clearHostDiskAlerts(hostID string) { } } +func (m *Manager) clearGuestMetricAlerts(guestID string, metrics ...string) int { + if guestID == "" { + return 0 + } + + allowedMetrics := make(map[string]struct{}, len(metrics)) + for _, metric := range metrics { + metric = strings.TrimSpace(metric) + if metric == "" { + continue + } + allowedMetrics[metric] = struct{}{} + } + + perDiskPrefix := fmt.Sprintf("%s-disk-", guestID) + + m.mu.Lock() + defer m.mu.Unlock() + + cleared := 0 + for storageKey, alert := range m.activeAlerts { + if alert == nil || alert.Type == "powered-off" { + continue + } + if alert.ResourceID != guestID && !strings.HasPrefix(alert.ResourceID, perDiskPrefix) { + continue + } + if len(allowedMetrics) > 0 { + if _, ok := allowedMetrics[alert.Type]; !ok { + continue + } + } + m.clearAlertNoLock(storageKey) + cleared++ + } + + return cleared +} + +func (m *Manager) cleanupGuestDiskAlerts(guestID string, seen map[string]struct{}) int { + if guestID == "" { + return 0 + } + + prefix := fmt.Sprintf("%s-disk-", guestID) + + m.mu.Lock() + defer m.mu.Unlock() + + cleared := 0 + for storageKey, alert := range m.activeAlerts { + if alert == nil || !strings.HasPrefix(alert.ResourceID, prefix) { + continue + } + if seen != nil { + if _, exists := seen[alert.ResourceID]; exists { + continue + } + } + m.clearAlertNoLock(storageKey) + cleared++ + } + + return cleared +} + func (m *Manager) cleanupHostDiskAlerts(host models.Host, seen map[string]struct{}) { if host.ID == "" { return @@ -5502,9 +5664,19 @@ func (m *Manager) clearDockerContainerUpdateTracking(resourceID, trackingKey str m.mu.Unlock() } +func (m *Manager) touchDockerContainerUpdateAlert(alertID string) { + m.mu.Lock() + defer m.mu.Unlock() + + if alert, exists := m.getActiveAlertNoLock(alertID); exists && alert != nil { + alert.LastSeen = time.Now() + } +} + // checkDockerContainerImageUpdate checks if an image update has been pending for too long func (m *Manager) checkDockerContainerImageUpdate(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) { alertID := fmt.Sprintf("docker-container-update-%s", resourceID) + canonicalAlertID := buildCanonicalStateID(resourceID, resourceID+"-image-update") updateTrackingKey := dockerUpdateTrackingKey(host, container) // Check if update detection is enabled @@ -5514,30 +5686,30 @@ func (m *Manager) checkDockerContainerImageUpdate(host models.DockerHost, contai // Negative value means disabled if delayHours < 0 { - m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update")) + m.clearAlert(canonicalAlertID) m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey) return } // Check if this container has an update status reported if container.UpdateStatus == nil { - // No update status - clear any tracking and alerts - m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update")) - m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey) + // Missing update status means the condition is unknown, not resolved. + // Preserve any active alert and first-seen tracking until we see an affirmative clear. + m.touchDockerContainerUpdateAlert(canonicalAlertID) return } // Check for errors in update detection (don't alert on errors) if container.UpdateStatus.Error != "" { - // Update check failed - clear alert but keep tracking - m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update")) + // A failed update check cannot confirm the pending update has been resolved. + m.touchDockerContainerUpdateAlert(canonicalAlertID) return } // Check if an update is available if !container.UpdateStatus.UpdateAvailable { // No update available - clear tracking and alert - m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update")) + m.clearAlert(canonicalAlertID) m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey) return } @@ -7495,8 +7667,10 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) { m.unregisterActiveAlertAliasNoLock(key, alert) } if exists { + delete(m.offlineRecoveryConfirmations, key) delete(m.activeAlerts, key) } + delete(m.offlineRecoveryConfirmations, alertID) // NOTE: Don't delete ackState here - preserve it so if the same alert // reappears (e.g., powered-off VM during backup), the acknowledgement // is restored via preserveAlertState. ackState is cleaned up in Cleanup(). @@ -7506,6 +7680,27 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) { } } +func (m *Manager) confirmOfflineRecoveryNoLock(alertID string, required int) (int, bool) { + alertID = strings.TrimSpace(alertID) + if alertID == "" { + return 0, false + } + + if required <= 1 { + delete(m.offlineRecoveryConfirmations, alertID) + return required, true + } + + m.offlineRecoveryConfirmations[alertID]++ + confirmations := m.offlineRecoveryConfirmations[alertID] + if confirmations < required { + return confirmations, false + } + + delete(m.offlineRecoveryConfirmations, alertID) + return confirmations, true +} + // GetActiveAlerts returns all active alerts func (m *Manager) GetActiveAlerts() []Alert { if m == nil { @@ -7817,8 +8012,9 @@ func (m *Manager) OnAlertHistory(cb AlertCallback) { } } -// clearResourceOfflineAlert removes an offline alert when a resource comes back online. -func (m *Manager) clearResourceOfflineAlert(alertPrefix, resourceID, resourceName, host, resourceKind string) { +// clearResourceOfflineAlert removes an offline alert when a poll-driven resource +// stays healthy for enough consecutive polls to confirm recovery. +func (m *Manager) clearResourceOfflineAlert(resourceID, resourceName, host, resourceKind string, requiredRecoveryCount int) { alertID := canonicalConnectivityStateID(resourceID) m.mu.Lock() @@ -7836,6 +8032,17 @@ func (m *Manager) clearResourceOfflineAlert(alertPrefix, resourceID, resourceNam // Check if offline alert exists alert, exists := m.getActiveAlertNoLock(alertID) if !exists { + delete(m.offlineRecoveryConfirmations, alertID) + return + } + + recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount) + if !confirmed { + log.Debug(). + Str(strings.ToLower(resourceKind), resourceName). + Int("confirmations", recoveryCount). + Int("required", requiredRecoveryCount). + Msg(resourceKind + " appears back online, waiting for recovery confirmation") return } @@ -7864,6 +8071,10 @@ func (m *Manager) clearResourceOfflineAlert(alertPrefix, resourceID, resourceNam func (m *Manager) checkNodeOffline(node models.Node) { alertID := fmt.Sprintf("node-offline-%s", node.ID) + m.mu.Lock() + delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(node.ID)) + m.mu.Unlock() + thresholds := m.resolveResourceThresholds("node", node.ID) spec, err := buildCanonicalConnectivitySpec(node.ID, node.Name, unifiedresources.ResourceType("node"), AlertLevelCritical, 3, thresholds.Disabled || thresholds.DisableConnectivity) if err != nil { @@ -7918,6 +8129,17 @@ func (m *Manager) clearNodeOfflineAlert(node models.Node) { // Check if offline alert exists alert, exists := m.getActiveAlertNoLock(alertID) if !exists { + delete(m.offlineRecoveryConfirmations, alertID) + return + } + + recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, offlineRecoveryConfirmationsDefault) + if !confirmed { + log.Debug(). + Str("node", node.Name). + Int("confirmations", recoveryCount). + Int("required", offlineRecoveryConfirmationsDefault). + Msg("Node appears back online, waiting for recovery confirmation") return } @@ -7944,6 +8166,10 @@ func (m *Manager) clearNodeOfflineAlert(node models.Node) { // checkPBSOffline creates an alert for offline PBS instances func (m *Manager) checkPBSOffline(pbs models.PBSInstance) { + m.mu.Lock() + delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(pbs.ID)) + m.mu.Unlock() + thresholds := m.resolveResourceThresholds("pbs", pbs.ID) spec, err := buildCanonicalConnectivitySpec(pbs.ID, pbs.Name, unifiedresources.ResourceTypePBS, AlertLevelCritical, 3, thresholds.Disabled || thresholds.DisableConnectivity) if err != nil { @@ -7979,11 +8205,15 @@ func (m *Manager) checkPBSOffline(pbs models.PBSInstance) { // clearPBSOfflineAlert removes offline alert when PBS comes back online func (m *Manager) clearPBSOfflineAlert(pbs models.PBSInstance) { - m.clearResourceOfflineAlert("pbs-offline", pbs.ID, pbs.Name, pbs.Host, "PBS") + m.clearResourceOfflineAlert(pbs.ID, pbs.Name, pbs.Host, "PBS", offlineRecoveryConfirmationsDefault) } // checkPMGOffline creates an alert for offline PMG instances func (m *Manager) checkPMGOffline(pmg models.PMGInstance) { + m.mu.Lock() + delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(pmg.ID)) + m.mu.Unlock() + m.mu.RLock() override, hasOverride := m.config.Overrides[pmg.ID] m.mu.RUnlock() @@ -8022,7 +8252,7 @@ func (m *Manager) checkPMGOffline(pmg models.PMGInstance) { // clearPMGOfflineAlert removes offline alert when PMG comes back online func (m *Manager) clearPMGOfflineAlert(pmg models.PMGInstance) { - m.clearResourceOfflineAlert("pmg-offline", pmg.ID, pmg.Name, pmg.Host, "PMG") + m.clearResourceOfflineAlert(pmg.ID, pmg.Name, pmg.Host, "PMG", offlineRecoveryConfirmationsDefault) } // checkPMGQueueDepths checks PMG mail queue depths and creates alerts @@ -8936,6 +9166,10 @@ func (m *Manager) checkAnomalyMetric(pmg models.PMGInstance, tracker *pmgAnomaly func (m *Manager) checkStorageOffline(storage models.Storage) { alertID := fmt.Sprintf("storage-offline-%s", storage.ID) + m.mu.Lock() + delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(storage.ID)) + m.mu.Unlock() + thresholds := m.resolveResourceThresholds("storage", storage.ID) spec, err := buildCanonicalConnectivitySpec(storage.ID, storage.Name, unifiedresources.ResourceTypeStorage, AlertLevelWarning, 2, thresholds.Disabled || thresholds.DisableConnectivity) if err != nil { @@ -8970,7 +9204,7 @@ func (m *Manager) checkStorageOffline(storage models.Storage) { // clearStorageOfflineAlert removes offline alert when storage comes back online func (m *Manager) clearStorageOfflineAlert(storage models.Storage) { - m.clearResourceOfflineAlert("storage-offline", storage.ID, storage.Name, storage.Node, "Storage") + m.clearResourceOfflineAlert(storage.ID, storage.Name, storage.Node, "Storage", offlineRecoveryConfirmationsStorage) } // checkGuestPoweredOff creates an alert for powered-off guests @@ -10095,6 +10329,7 @@ func (m *Manager) ClearActiveAlerts() { m.alertRateLimit = make(map[string][]time.Time) m.nodeOfflineCount = make(map[string]int) m.offlineConfirmations = make(map[string]int) + m.offlineRecoveryConfirmations = make(map[string]int) m.dockerOfflineCount = make(map[string]int) m.dockerStateConfirm = make(map[string]int) m.dockerRestartTracking = make(map[string]*dockerRestartRecord) @@ -10215,6 +10450,13 @@ func (m *Manager) cleanupStaleMaps() { } } + for alertID := range m.offlineRecoveryConfirmations { + if !m.hasActiveAlertNoLock(alertID) { + delete(m.offlineRecoveryConfirmations, alertID) + cleaned++ + } + } + // Clean up node offline counts (legacy) for nodeID := range m.nodeOfflineCount { hasRelatedAlert := false diff --git a/internal/alerts/alerts_test.go b/internal/alerts/alerts_test.go index 8e2d99e94..e01d9501b 100644 --- a/internal/alerts/alerts_test.go +++ b/internal/alerts/alerts_test.go @@ -5097,6 +5097,18 @@ func TestClearStorageOfflineAlert(t *testing.T) { resolvedCh <- id }) + m.clearStorageOfflineAlert(storage) + m.mu.RLock() + _, existsAfterFirst := testLookupActiveAlert(t, m, alertID) + recoveryCountAfterFirst := m.offlineRecoveryConfirmations[canonicalConnectivityStateID(storage.ID)] + m.mu.RUnlock() + if !existsAfterFirst { + t.Fatal("expected storage alert to remain active until recovery confirmations are satisfied") + } + if recoveryCountAfterFirst != 1 { + t.Fatalf("expected first storage recovery confirmation to be tracked, got %d", recoveryCountAfterFirst) + } + m.clearStorageOfflineAlert(storage) m.mu.RLock() @@ -6787,6 +6799,28 @@ func TestHandleHostOffline(t *testing.T) { } }) + t.Run("linked node override DisableConnectivity clears alert and returns", func(t *testing.T) { + m := newTestManager(t) + m.config.Enabled = true + m.config.Overrides = map[string]ThresholdConfig{ + "cluster-node1": {DisableConnectivity: true}, + } + + alertID := canonicalConnectivityStateID(hostResourceID("host1")) + m.activeAlerts[alertID] = &Alert{ID: alertID, Type: "host-offline"} + m.offlineConfirmations[hostResourceID("host1")] = 5 + + host := models.Host{ID: "host1", Hostname: "test-host", LinkedNodeID: "cluster-node1"} + m.HandleHostOffline(host) + + if testHasActiveAlert(t, m, alertID) { + t.Error("expected alert to be cleared") + } + if _, exists := m.offlineConfirmations[hostResourceID("host1")]; exists { + t.Error("expected offlineConfirmations to be cleared") + } + }) + t.Run("override Disabled clears alert and returns", func(t *testing.T) { // t.Parallel() m := newTestManager(t) @@ -8118,6 +8152,30 @@ func TestClearNodeOfflineAlert(t *testing.T) { m.mu.Unlock() node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"} + m.clearNodeOfflineAlert(node) + m.mu.RLock() + _, existsAfterFirst := testLookupActiveAlert(t, m, state) + recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state] + m.mu.RUnlock() + if !existsAfterFirst { + t.Fatal("expected node alert to remain active until recovery confirmations are satisfied") + } + if recoveryCountAfterFirst != 1 { + t.Fatalf("expected first node recovery confirmation to be tracked, got %d", recoveryCountAfterFirst) + } + + m.clearNodeOfflineAlert(node) + m.mu.RLock() + _, existsAfterSecond := testLookupActiveAlert(t, m, state) + recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state] + m.mu.RUnlock() + if !existsAfterSecond { + t.Fatal("expected node alert to remain active after second recovery confirmation") + } + if recoveryCountAfterSecond != 2 { + t.Fatalf("expected second node recovery confirmation to be tracked, got %d", recoveryCountAfterSecond) + } + m.clearNodeOfflineAlert(node) m.mu.RLock() @@ -8174,7 +8232,10 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) { m.mu.Unlock() }, clearFn: func(m *Manager) { - m.clearNodeOfflineAlert(models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}) + node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"} + m.clearNodeOfflineAlert(node) + m.clearNodeOfflineAlert(node) + m.clearNodeOfflineAlert(node) }, }, { @@ -8189,7 +8250,10 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) { m.mu.Unlock() }, clearFn: func(m *Manager) { - m.clearPBSOfflineAlert(models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "host1"}) + pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "host1"} + m.clearPBSOfflineAlert(pbs) + m.clearPBSOfflineAlert(pbs) + m.clearPBSOfflineAlert(pbs) }, }, { @@ -8204,7 +8268,10 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) { m.mu.Unlock() }, clearFn: func(m *Manager) { - m.clearPMGOfflineAlert(models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "host1"}) + pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "host1"} + m.clearPMGOfflineAlert(pmg) + m.clearPMGOfflineAlert(pmg) + m.clearPMGOfflineAlert(pmg) }, }, { @@ -8219,7 +8286,9 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) { m.mu.Unlock() }, clearFn: func(m *Manager) { - m.clearStorageOfflineAlert(models.Storage{ID: "stor1", Name: "Storage 1", Node: "node1"}) + storage := models.Storage{ID: "stor1", Name: "Storage 1", Node: "node1"} + m.clearStorageOfflineAlert(storage) + m.clearStorageOfflineAlert(storage) }, }, { @@ -8319,6 +8388,30 @@ func TestClearPBSOfflineAlert(t *testing.T) { m.mu.Unlock() pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "pbs.local"} + m.clearPBSOfflineAlert(pbs) + m.mu.RLock() + _, existsAfterFirst := testLookupActiveAlert(t, m, "pbs-offline-pbs1") + recoveryCountAfterFirst := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pbs1")] + m.mu.RUnlock() + if !existsAfterFirst { + t.Fatal("expected PBS alert to remain active until recovery confirmations are satisfied") + } + if recoveryCountAfterFirst != 1 { + t.Fatalf("expected first PBS recovery confirmation to be tracked, got %d", recoveryCountAfterFirst) + } + + m.clearPBSOfflineAlert(pbs) + m.mu.RLock() + _, existsAfterSecond := testLookupActiveAlert(t, m, "pbs-offline-pbs1") + recoveryCountAfterSecond := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pbs1")] + m.mu.RUnlock() + if !existsAfterSecond { + t.Fatal("expected PBS alert to remain active after second recovery confirmation") + } + if recoveryCountAfterSecond != 2 { + t.Fatalf("expected second PBS recovery confirmation to be tracked, got %d", recoveryCountAfterSecond) + } + m.clearPBSOfflineAlert(pbs) m.mu.RLock() @@ -8402,6 +8495,30 @@ func TestClearPMGOfflineAlert(t *testing.T) { m.mu.Unlock() pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "pmg.local"} + m.clearPMGOfflineAlert(pmg) + m.mu.RLock() + _, existsAfterFirst := testLookupActiveAlert(t, m, "pmg-offline-pmg1") + recoveryCountAfterFirst := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pmg1")] + m.mu.RUnlock() + if !existsAfterFirst { + t.Fatal("expected PMG alert to remain active until recovery confirmations are satisfied") + } + if recoveryCountAfterFirst != 1 { + t.Fatalf("expected first PMG recovery confirmation to be tracked, got %d", recoveryCountAfterFirst) + } + + m.clearPMGOfflineAlert(pmg) + m.mu.RLock() + _, existsAfterSecond := testLookupActiveAlert(t, m, "pmg-offline-pmg1") + recoveryCountAfterSecond := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pmg1")] + m.mu.RUnlock() + if !existsAfterSecond { + t.Fatal("expected PMG alert to remain active after second recovery confirmation") + } + if recoveryCountAfterSecond != 2 { + t.Fatalf("expected second PMG recovery confirmation to be tracked, got %d", recoveryCountAfterSecond) + } + m.clearPMGOfflineAlert(pmg) m.mu.RLock() @@ -13217,6 +13334,7 @@ func TestCheckNode(t *testing.T) { t.Run("online node clears offline alert", func(t *testing.T) { // t.Parallel() m := newTestManager(t) + state := canonicalConnectivityStateID("node1") // Pre-create offline alert m.mu.Lock() @@ -13235,11 +13353,36 @@ func TestCheckNode(t *testing.T) { Status: "online", ConnectionHealth: "connected", } + m.CheckNode(node) + m.mu.RLock() + _, alertExistsAfterFirst := testLookupActiveAlert(t, m, "node-offline-node1") + recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state] + m.mu.RUnlock() + if !alertExistsAfterFirst { + t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied") + } + if recoveryCountAfterFirst != 1 { + t.Fatalf("expected first recovery confirmation to be tracked, got %d", recoveryCountAfterFirst) + } + + m.CheckNode(node) + m.mu.RLock() + _, alertExistsAfterSecond := testLookupActiveAlert(t, m, "node-offline-node1") + recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state] + m.mu.RUnlock() + if !alertExistsAfterSecond { + t.Fatal("expected offline alert to remain active after second recovery confirmation") + } + if recoveryCountAfterSecond != 2 { + t.Fatalf("expected second recovery confirmation to be tracked, got %d", recoveryCountAfterSecond) + } + m.CheckNode(node) m.mu.RLock() _, alertExists := testLookupActiveAlert(t, m, "node-offline-node1") _, countExists := m.nodeOfflineCount["node1"] + _, recoveryExists := m.offlineRecoveryConfirmations[state] m.mu.RUnlock() if alertExists { @@ -13248,6 +13391,9 @@ func TestCheckNode(t *testing.T) { if countExists { t.Error("expected offline count to be cleared") } + if recoveryExists { + t.Error("expected offline recovery confirmations to be cleared") + } }) t.Run("online node triggers metric checks", func(t *testing.T) { @@ -14907,6 +15053,125 @@ func TestCheckHostComprehensive(t *testing.T) { t.Error("expected tags in metadata") } }) + + t.Run("inherits linked node overrides for host agent metrics", func(t *testing.T) { + m := newTestManager(t) + m.ClearActiveAlerts() + + m.mu.Lock() + m.config.TimeThresholds = map[string]int{} + m.config.AgentDefaults = ThresholdConfig{ + Memory: &HysteresisThreshold{Trigger: 85.0, Clear: 80.0}, + } + m.config.Overrides = map[string]ThresholdConfig{ + "cluster-node1": { + Memory: &HysteresisThreshold{Trigger: 97.0, Clear: 92.0}, + }, + } + m.mu.Unlock() + + host := models.Host{ + ID: "host-node1", + DisplayName: "node1", + Hostname: "node1", + LinkedNodeID: "cluster-node1", + Memory: models.Memory{ + Usage: 90.6, + Total: 1024, + Used: 928, + Free: 96, + }, + Status: "online", + LastSeen: time.Now(), + } + + m.CheckHost(host) + + m.mu.RLock() + _, exists := m.activeAlerts[canonicalMetricStateID(hostResourceID(host.ID), "memory")] + m.mu.RUnlock() + + if exists { + t.Fatal("expected linked node override to suppress host-agent memory alert") + } + }) + + t.Run("inherits linked guest overrides for host agent metrics", func(t *testing.T) { + m := newTestManager(t) + m.ClearActiveAlerts() + + m.mu.Lock() + m.config.TimeThresholds = map[string]int{} + m.config.AgentDefaults = ThresholdConfig{ + CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0}, + } + m.config.Overrides = map[string]ThresholdConfig{ + "guest:Main:101": { + CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0}, + }, + } + m.mu.Unlock() + + host := models.Host{ + ID: "host-vm101", + DisplayName: "vm101", + Hostname: "vm101.local", + LinkedVMID: "Main:node3:101", + CPUUsage: 97.5, + Status: "online", + LastSeen: time.Now(), + } + + m.CheckHost(host) + + m.mu.RLock() + _, exists := m.activeAlerts[canonicalMetricStateID(hostResourceID(host.ID), "cpu")] + m.mu.RUnlock() + + if exists { + t.Fatal("expected linked guest override to suppress host-agent cpu alert") + } + }) + + t.Run("prefers explicit host overrides over linked resource overrides", func(t *testing.T) { + m := newTestManager(t) + m.ClearActiveAlerts() + + m.mu.Lock() + m.config.TimeThresholds = map[string]int{} + m.config.AgentDefaults = ThresholdConfig{ + CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0}, + } + m.config.Overrides = map[string]ThresholdConfig{ + "guest:Main:101": { + CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0}, + }, + "host-vm101": { + CPU: &HysteresisThreshold{Trigger: 90.0, Clear: 85.0}, + }, + } + m.mu.Unlock() + + host := models.Host{ + ID: "host-vm101", + DisplayName: "vm101", + Hostname: "vm101.local", + LinkedVMID: "Main:node3:101", + CPUUsage: 97.5, + Status: "online", + LastSeen: time.Now(), + } + + m.CheckHost(host) + + m.mu.RLock() + alert := m.activeAlerts[canonicalMetricStateID(hostResourceID(host.ID), "cpu")] + m.mu.RUnlock() + + if alert == nil { + t.Fatal("expected explicit host override to take precedence and trigger alert") + } + }) } func TestCheckPBSComprehensive(t *testing.T) { @@ -15332,6 +15597,7 @@ func TestCheckPBSComprehensive(t *testing.T) { t.Run("clears offline alert when back online", func(t *testing.T) { // t.Parallel() m := newTestManager(t) + state := canonicalConnectivityStateID("pbs1") m.mu.Lock() m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1", Type: "connectivity"} @@ -15345,11 +15611,36 @@ func TestCheckPBSComprehensive(t *testing.T) { ConnectionHealth: "healthy", } + m.CheckPBS(pbs) + m.mu.RLock() + _, offlineExistsAfterFirst := testLookupActiveAlert(t, m, "pbs-offline-pbs1") + recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state] + m.mu.RUnlock() + if !offlineExistsAfterFirst { + t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied") + } + if recoveryCountAfterFirst != 1 { + t.Fatalf("expected first PBS recovery confirmation to be tracked, got %d", recoveryCountAfterFirst) + } + + m.CheckPBS(pbs) + m.mu.RLock() + _, offlineExistsAfterSecond := testLookupActiveAlert(t, m, "pbs-offline-pbs1") + recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state] + m.mu.RUnlock() + if !offlineExistsAfterSecond { + t.Fatal("expected offline alert to remain active after second recovery confirmation") + } + if recoveryCountAfterSecond != 2 { + t.Fatalf("expected second PBS recovery confirmation to be tracked, got %d", recoveryCountAfterSecond) + } + m.CheckPBS(pbs) m.mu.RLock() _, offlineExists := testLookupActiveAlert(t, m, "pbs-offline-pbs1") _, confirmExists := m.offlineConfirmations["pbs1"] + _, recoveryExists := m.offlineRecoveryConfirmations[state] m.mu.RUnlock() if offlineExists { @@ -15358,6 +15649,9 @@ func TestCheckPBSComprehensive(t *testing.T) { if confirmExists { t.Error("expected offline confirmation to be cleared") } + if recoveryExists { + t.Error("expected offline recovery confirmations to be cleared") + } }) } @@ -15664,6 +15958,7 @@ func TestCheckPMGComprehensive(t *testing.T) { t.Run("clears offline alert when back online", func(t *testing.T) { // t.Parallel() m := newTestManager(t) + state := canonicalConnectivityStateID("pmg1") m.mu.Lock() m.activeAlerts["pmg-offline-pmg1"] = &Alert{ID: "pmg-offline-pmg1", Type: "connectivity"} @@ -15677,11 +15972,36 @@ func TestCheckPMGComprehensive(t *testing.T) { ConnectionHealth: "healthy", } + m.CheckPMG(pmg) + m.mu.RLock() + _, offlineExistsAfterFirst := testLookupActiveAlert(t, m, "pmg-offline-pmg1") + recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state] + m.mu.RUnlock() + if !offlineExistsAfterFirst { + t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied") + } + if recoveryCountAfterFirst != 1 { + t.Fatalf("expected first PMG recovery confirmation to be tracked, got %d", recoveryCountAfterFirst) + } + + m.CheckPMG(pmg) + m.mu.RLock() + _, offlineExistsAfterSecond := testLookupActiveAlert(t, m, "pmg-offline-pmg1") + recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state] + m.mu.RUnlock() + if !offlineExistsAfterSecond { + t.Fatal("expected offline alert to remain active after second recovery confirmation") + } + if recoveryCountAfterSecond != 2 { + t.Fatalf("expected second PMG recovery confirmation to be tracked, got %d", recoveryCountAfterSecond) + } + m.CheckPMG(pmg) m.mu.RLock() _, offlineExists := testLookupActiveAlert(t, m, "pmg-offline-pmg1") _, confirmExists := m.offlineConfirmations["pmg1"] + _, recoveryExists := m.offlineRecoveryConfirmations[state] m.mu.RUnlock() if offlineExists { @@ -15690,6 +16010,9 @@ func TestCheckPMGComprehensive(t *testing.T) { if confirmExists { t.Error("expected offline confirmation to be cleared") } + if recoveryExists { + t.Error("expected offline recovery confirmations to be cleared") + } }) t.Run("skips metrics when PMG is offline", func(t *testing.T) { @@ -16008,6 +16331,7 @@ func TestCheckStorageComprehensive(t *testing.T) { t.Run("clears offline alert when back online", func(t *testing.T) { // t.Parallel() m := newTestManager(t) + state := canonicalConnectivityStateID("storage1") m.mu.Lock() m.activeAlerts["storage-offline-storage1"] = &Alert{ID: "storage-offline-storage1", Type: "connectivity"} @@ -16020,11 +16344,24 @@ func TestCheckStorageComprehensive(t *testing.T) { Status: "active", } + m.CheckStorage(storage) + m.mu.RLock() + _, offlineExistsAfterFirst := testLookupActiveAlert(t, m, "storage-offline-storage1") + recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state] + m.mu.RUnlock() + if !offlineExistsAfterFirst { + t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied") + } + if recoveryCountAfterFirst != 1 { + t.Fatalf("expected first storage recovery confirmation to be tracked, got %d", recoveryCountAfterFirst) + } + m.CheckStorage(storage) m.mu.RLock() _, offlineExists := testLookupActiveAlert(t, m, "storage-offline-storage1") _, confirmExists := m.offlineConfirmations["storage1"] + _, recoveryExists := m.offlineRecoveryConfirmations[state] m.mu.RUnlock() if offlineExists { @@ -16033,6 +16370,9 @@ func TestCheckStorageComprehensive(t *testing.T) { if confirmExists { t.Error("expected offline confirmation to be cleared") } + if recoveryExists { + t.Error("expected offline recovery confirmations to be cleared") + } }) t.Run("skips usage check when usage is zero", func(t *testing.T) { diff --git a/internal/alerts/quiet_hours_test.go b/internal/alerts/quiet_hours_test.go index 5e54a4dc9..0fd5a3f66 100644 --- a/internal/alerts/quiet_hours_test.go +++ b/internal/alerts/quiet_hours_test.go @@ -79,6 +79,40 @@ func TestShouldSuppressNotificationQuietHours(t *testing.T) { t.Fatalf("expected storage alert suppression, got suppressed=%t reason=%q", suppressed, reason) } }) + + t.Run("public suppression helper mirrors quiet hours policy", func(t *testing.T) { + m := newManagerWithQuietHoursSuppress(QuietHoursSuppression{Offline: true}) + alert := &Alert{ID: "offline-public", Type: "connectivity", Level: AlertLevelCritical} + if !m.ShouldSuppressNotification(alert) { + t.Fatal("expected public quiet-hours suppression helper to return true") + } + }) + + t.Run("resolved notifications are suppressed for acknowledged alerts", func(t *testing.T) { + m := newManagerWithQuietHoursSuppress(QuietHoursSuppression{}) + alert := &Alert{ + ID: "resolved-ack", + Type: "cpu", + Level: AlertLevelCritical, + Acknowledged: true, + LastNotified: ptrTime(time.Now().Add(-time.Minute)), + } + if !m.ShouldSuppressResolvedNotification(alert) { + t.Fatal("expected recovery notification to be suppressed for acknowledged alert") + } + }) + + t.Run("resolved notifications are suppressed when the firing alert was never notified", func(t *testing.T) { + m := newManagerWithQuietHoursSuppress(QuietHoursSuppression{}) + alert := &Alert{ + ID: "resolved-unnotified", + Type: "cpu", + Level: AlertLevelCritical, + } + if !m.ShouldSuppressResolvedNotification(alert) { + t.Fatal("expected recovery notification to be suppressed when LastNotified is nil") + } + }) } func TestIsInQuietHours(t *testing.T) { diff --git a/internal/alerts/threshold_update_test.go b/internal/alerts/threshold_update_test.go index 2228422ab..93ba64c43 100644 --- a/internal/alerts/threshold_update_test.go +++ b/internal/alerts/threshold_update_test.go @@ -156,6 +156,65 @@ func TestReevaluateActiveAlertsWithOverride(t *testing.T) { } } +func TestReevaluateActiveAlertsWithLinkedHostOverride(t *testing.T) { + manager := NewManager() + + manager.mu.Lock() + manager.activeAlerts = make(map[string]*Alert) + manager.mu.Unlock() + + initialConfig := AlertConfig{ + Enabled: true, + AgentDefaults: ThresholdConfig{ + Memory: &HysteresisThreshold{Trigger: 85, Clear: 80}, + }, + Overrides: make(map[string]ThresholdConfig), + } + manager.UpdateConfig(initialConfig) + + resourceID := hostResourceID("host-node1") + alertID := canonicalMetricStateID(resourceID, "memory") + alert := &Alert{ + ID: alertID, + Type: "memory", + Level: AlertLevelWarning, + ResourceID: resourceID, + ResourceName: "node1", + Node: "node1", + Instance: "linux", + Message: "Host memory at 90.6%", + Value: 90.6, + Threshold: 85.0, + StartTime: time.Now().Add(-5 * time.Minute), + LastSeen: time.Now(), + Metadata: map[string]interface{}{ + "resourceType": "agent", + "hostId": "host-node1", + "linkedNodeId": "cluster-node1", + }, + } + + manager.mu.Lock() + manager.activeAlerts[alertID] = alert + manager.mu.Unlock() + + updatedConfig := initialConfig + updatedConfig.Overrides["cluster-node1"] = ThresholdConfig{ + Memory: &HysteresisThreshold{Trigger: 97, Clear: 92}, + } + manager.UpdateConfig(updatedConfig) + + time.Sleep(100 * time.Millisecond) + + manager.mu.RLock() + _, alertStillActive := manager.activeAlerts[alertID] + manager.mu.RUnlock() + + if alertStillActive { + t.Errorf("expected linked host alert to be resolved after linked node override increase") + } +} + func TestReevaluateActiveAlertsUsesStableClusterGuestOverrideAcrossNodeMove(t *testing.T) { manager := NewManager() diff --git a/internal/alerts/unified_eval_test.go b/internal/alerts/unified_eval_test.go index 3e1e8214c..ca0bc6260 100644 --- a/internal/alerts/unified_eval_test.go +++ b/internal/alerts/unified_eval_test.go @@ -369,6 +369,114 @@ func TestCheckGuestPerDiskAnnotatesCanonicalSpecMetadata(t *testing.T) { } } +func TestCheckGuestPerDiskCleansUpRemovedDiskAlerts(t *testing.T) { + m := newTestManager(t) + configureUnifiedEvalManager(t, m, unifiedEvalBaseConfig()) + + guestID := BuildGuestKey("pve1", "node1", 101) + baseGuest := models.VM{ + ID: guestID, + VMID: 101, + Name: "app01", + Node: "node1", + Instance: "pve1", + Status: "running", + CPU: 0.20, + Memory: models.Memory{Usage: 40}, + Disk: models.Disk{Usage: 40}, + } + + withDisk := baseGuest + withDisk.Disks = []models.Disk{ + { + Mountpoint: "/", + Device: "scsi0", + Usage: 95, + Total: 100, + Used: 95, + Free: 5, + }, + } + m.CheckGuest(withDisk, "pve1") + + originalAlertID := canonicalMetricStateID(guestID+"-disk-scsi0", "disk") + if activeAlert(t, m, originalAlertID) == nil { + t.Fatalf("expected guest disk alert %q", originalAlertID) + } + + changedDisk := baseGuest + changedDisk.Disks = []models.Disk{ + { + Mountpoint: "/data", + Device: "scsi1", + Usage: 96, + Total: 100, + Used: 96, + Free: 4, + }, + } + m.CheckGuest(changedDisk, "pve1") + + newAlertID := canonicalMetricStateID(guestID+"-disk-data-scsi1", "disk") + m.mu.RLock() + _, oldExists := testLookupActiveAlert(t, m, originalAlertID) + _, newExists := testLookupActiveAlert(t, m, newAlertID) + m.mu.RUnlock() + + if oldExists { + t.Fatalf("expected stale guest disk alert %q to be cleared", originalAlertID) + } + if !newExists { + t.Fatalf("expected replacement guest disk alert %q", newAlertID) + } +} + +func TestCheckGuestClearsPerDiskAlertsWhenGuestStops(t *testing.T) { + m := newTestManager(t) + configureUnifiedEvalManager(t, m, unifiedEvalBaseConfig()) + + guestID := BuildGuestKey("pve1", "node1", 101) + guest := models.VM{ + ID: guestID, + VMID: 101, + Name: "app01", + Node: "node1", + Instance: "pve1", + Status: "running", + CPU: 0.20, + Memory: models.Memory{Usage: 40}, + Disk: models.Disk{Usage: 40}, + Disks: []models.Disk{ + { + Mountpoint: "/", + Device: "scsi0", + Usage: 95, + Total: 100, + Used: 95, + Free: 5, + }, + }, + } + + m.CheckGuest(guest, "pve1") + + alertID := canonicalMetricStateID(guestID+"-disk-scsi0", "disk") + if activeAlert(t, m, alertID) == nil { + t.Fatalf("expected guest disk alert %q", alertID) + } + + guest.Status = "stopped" + guest.Disks = nil + m.CheckGuest(guest, "pve1") + + m.mu.RLock() + _, exists := testLookupActiveAlert(t, m, alertID) + m.mu.RUnlock() + if exists { + t.Fatalf("expected guest disk alert %q to clear when guest stops", alertID) + } +} + func TestCheckNodeTemperatureAnnotatesCanonicalSpecMetadata(t *testing.T) { m := newTestManager(t) configureUnifiedEvalManager(t, m, unifiedEvalBaseConfig()) diff --git a/internal/alerts/update_alerts_test.go b/internal/alerts/update_alerts_test.go index 16d9fde21..3ddbd3772 100644 --- a/internal/alerts/update_alerts_test.go +++ b/internal/alerts/update_alerts_test.go @@ -221,6 +221,122 @@ func TestCheckDockerContainerImageUpdate(t *testing.T) { } }) + t.Run("missing update status preserves existing alert and tracking", func(t *testing.T) { + testResourceID := "docker:" + hostID + "/container-unknown-status" + container := models.DockerContainer{ + ID: "container-unknown-status", + Name: "unknown-status-container", + Image: "nginx:latest", + UpdateStatus: &models.DockerContainerUpdateStatus{ + UpdateAvailable: true, + CurrentDigest: "sha256:old", + LatestDigest: "sha256:new", + LastChecked: time.Now(), + }, + } + trackingKey := dockerUpdateTrackingKey(host, container) + firstSeen := time.Now().Add(-25 * time.Hour) + + m.mu.Lock() + m.dockerUpdateFirstSeen[testResourceID] = firstSeen + m.dockerUpdateFirstSeenByIdentity[trackingKey] = firstSeen + m.mu.Unlock() + + m.checkDockerContainerImageUpdate(host, container, testResourceID, "unknown-status-container", instanceName, nodeName) + + canonicalAlertID := buildCanonicalStateID(testResourceID, testResourceID+"-image-update") + m.mu.RLock() + alert := testRequireActiveAlert(t, m, canonicalAlertID) + if alert == nil { + m.mu.RUnlock() + t.Fatalf("expected update alert %q to be active", canonicalAlertID) + } + beforeLastSeen := alert.LastSeen + m.mu.RUnlock() + + time.Sleep(10 * time.Millisecond) + container.UpdateStatus = nil + m.checkDockerContainerImageUpdate(host, container, testResourceID, "unknown-status-container", instanceName, nodeName) + + m.mu.RLock() + alert = testRequireActiveAlert(t, m, canonicalAlertID) + if alert == nil { + m.mu.RUnlock() + t.Fatalf("expected update alert %q to remain active", canonicalAlertID) + } + _, tracked := m.dockerUpdateFirstSeen[testResourceID] + _, trackedByIdentity := m.dockerUpdateFirstSeenByIdentity[trackingKey] + afterLastSeen := alert.LastSeen + m.mu.RUnlock() + + if !tracked || !trackedByIdentity { + t.Fatal("expected update tracking to remain when update status is temporarily unavailable") + } + if !afterLastSeen.After(beforeLastSeen) { + t.Fatalf("expected LastSeen to refresh when update status is missing, before=%s after=%s", beforeLastSeen, afterLastSeen) + } + }) + + t.Run("update detection errors preserve existing alert and tracking", func(t *testing.T) { + testResourceID := "docker:" + hostID + "/container-error-status" + container := models.DockerContainer{ + ID: "container-error-status", + Name: "error-status-container", + Image: "nginx:latest", + UpdateStatus: &models.DockerContainerUpdateStatus{ + UpdateAvailable: true, + CurrentDigest: "sha256:old", + LatestDigest: "sha256:new", + LastChecked: time.Now(), + }, + } + trackingKey := dockerUpdateTrackingKey(host, container) + firstSeen := time.Now().Add(-25 * time.Hour) + + m.mu.Lock() + m.dockerUpdateFirstSeen[testResourceID] = firstSeen + m.dockerUpdateFirstSeenByIdentity[trackingKey] = firstSeen + m.mu.Unlock() + + m.checkDockerContainerImageUpdate(host, container, testResourceID, "error-status-container", instanceName, nodeName) + + canonicalAlertID := buildCanonicalStateID(testResourceID, testResourceID+"-image-update") + m.mu.RLock() + alert := testRequireActiveAlert(t, m, canonicalAlertID) + if alert == nil { + m.mu.RUnlock() + t.Fatalf("expected update alert %q to be active", canonicalAlertID) + } + beforeLastSeen := alert.LastSeen + m.mu.RUnlock() + + time.Sleep(10 * time.Millisecond) + container.UpdateStatus = &models.DockerContainerUpdateStatus{ + UpdateAvailable: true, + Error: "rate limited", + LastChecked: time.Now(), + } + m.checkDockerContainerImageUpdate(host, container, testResourceID, "error-status-container", instanceName, nodeName) + + m.mu.RLock() + alert = testRequireActiveAlert(t, m, canonicalAlertID) + if alert == nil { + m.mu.RUnlock() + t.Fatalf("expected update alert %q to remain active after check error", canonicalAlertID) + } + _, tracked := m.dockerUpdateFirstSeen[testResourceID] + _, trackedByIdentity := m.dockerUpdateFirstSeenByIdentity[trackingKey] + afterLastSeen := alert.LastSeen + m.mu.RUnlock() + + if !tracked || !trackedByIdentity { + t.Fatal("expected update tracking to remain after update check error") + } + if !afterLastSeen.After(beforeLastSeen) { + t.Fatalf("expected LastSeen to refresh after update check error, before=%s after=%s", beforeLastSeen, afterLastSeen) + } + }) + t.Run("disabled by negative delay hours", func(t *testing.T) { // Create manager with disabled update alerts m2 := NewManager() diff --git a/internal/monitoring/canonical_guardrails_test.go b/internal/monitoring/canonical_guardrails_test.go index f321f2a69..78e032ded 100644 --- a/internal/monitoring/canonical_guardrails_test.go +++ b/internal/monitoring/canonical_guardrails_test.go @@ -91,6 +91,33 @@ func TestGetStateRefreshesLiveAlertSnapshots(t *testing.T) { } } +func TestEscalationDeliveryDefersToCanonicalAlertSuppression(t *testing.T) { + requiredSnippets := map[string][]string{ + "monitor.go": { + "m.alertManager.SetEscalateCallback(func(alert *alerts.Alert, level int) {", + "m.handleAlertEscalated(wsHub, alert, level)", + }, + "monitor_alerts.go": { + "func (m *Monitor) handleAlertEscalated(hub *websocket.Hub, alert *alerts.Alert, level int) {", + "if m.alertManager.ShouldSuppressNotification(alert) {", + "m.broadcastEscalatedAlert(hub, alert)", + }, + } + + for file, snippets := range requiredSnippets { + data, err := os.ReadFile(file) + if err != nil { + t.Fatalf("failed to read %s: %v", file, err) + } + source := string(data) + for _, snippet := range snippets { + if !strings.Contains(source, snippet) { + t.Fatalf("%s must contain %q", file, snippet) + } + } + } +} + func TestMonitoredSystemUsageReadinessGuardrailsRemainCanonical(t *testing.T) { requiredSnippets := map[string][]string{ "monitor.go": { diff --git a/internal/monitoring/monitor.go b/internal/monitoring/monitor.go index 1410f3f29..fe25ed582 100644 --- a/internal/monitoring/monitor.go +++ b/internal/monitoring/monitor.go @@ -1772,41 +1772,7 @@ func (m *Monitor) Start(ctx context.Context, wsHub *websocket.Hub) { m.handleAlertUnacknowledged(alert, user) }) m.alertManager.SetEscalateCallback(func(alert *alerts.Alert, level int) { - log.Info(). - Str("alertID", alert.ID). - Int("level", level). - Msg("Alert escalated - sending notifications") - - // Get escalation config - config := m.alertManager.GetConfig() - if level <= 0 || level > len(config.Schedule.Escalation.Levels) { - return - } - - escalationLevel := config.Schedule.Escalation.Levels[level-1] - - // Send notifications based on escalation level - switch escalationLevel.Notify { - case "email": - // Only send email - if emailConfig := m.notificationMgr.GetEmailConfig(); emailConfig.Enabled { - m.notificationMgr.SendAlert(alert) - } - case "webhook": - // Only send webhooks - for _, webhook := range m.notificationMgr.GetWebhooks() { - if webhook.Enabled { - m.notificationMgr.SendAlert(alert) - break - } - } - case "all": - // Send all notifications - m.notificationMgr.SendAlert(alert) - } - - // Update WebSocket with escalation - m.broadcastEscalatedAlert(wsHub, alert) + m.handleAlertEscalated(wsHub, alert, level) }) // Create separate tickers for polling and broadcasting using the configured cadence diff --git a/internal/monitoring/monitor_alert_handling_test.go b/internal/monitoring/monitor_alert_handling_test.go index 1df74f3d7..73a80a65c 100644 --- a/internal/monitoring/monitor_alert_handling_test.go +++ b/internal/monitoring/monitor_alert_handling_test.go @@ -1,6 +1,8 @@ package monitoring import ( + "net/http" + "net/http/httptest" "sync/atomic" "testing" "time" @@ -276,3 +278,125 @@ func TestMonitor_HandleAlertResolved_NoQuietHoursSendsNotification(t *testing.T) // Should not crash, and notification should be dispatched (not suppressed) m.handleAlertResolved(alertID) } + +func TestMonitor_HandleAlertEscalated_QuietHoursSuppressesNotification(t *testing.T) { + t.Setenv("PULSE_DATA_DIR", t.TempDir()) + + requests := make(chan struct{}, 1) + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + requests <- struct{}{} + w.WriteHeader(http.StatusOK) + })) + defer server.Close() + + notifMgr := notifications.NewNotificationManager("https://pulse.local") + defer notifMgr.Stop() + notifMgr.SetGroupingWindow(0) + notifMgr.SetCooldown(0) + if err := notifMgr.UpdateAllowedPrivateCIDRs("127.0.0.1/32"); err != nil { + t.Fatalf("UpdateAllowedPrivateCIDRs: %v", err) + } + notifMgr.AddWebhook(notifications.WebhookConfig{ + ID: "quiet-hours-hook", + Name: "quiet-hours", + URL: server.URL, + Enabled: true, + }) + + mgr := alerts.NewManager() + cfg := mgr.GetConfig() + cfg.Schedule.Escalation.Levels = []alerts.EscalationLevel{{After: 1, Notify: "webhook"}} + cfg.Schedule.QuietHours = alerts.QuietHours{ + Enabled: true, + Start: "00:00", + End: "23:59", + Timezone: "UTC", + Days: map[string]bool{ + "monday": true, "tuesday": true, "wednesday": true, + "thursday": true, "friday": true, "saturday": true, "sunday": true, + }, + Suppress: alerts.QuietHoursSuppression{Offline: true}, + } + mgr.UpdateConfig(cfg) + + m := &Monitor{ + notificationMgr: notifMgr, + alertManager: mgr, + } + + alert := &alerts.Alert{ + ID: "escalated-offline", + Type: "connectivity", + Level: alerts.AlertLevelCritical, + ResourceID: "node/pve-1", + ResourceName: "pve-1", + Node: "pve-1", + Instance: "pve", + Message: "Node offline", + StartTime: time.Now(), + } + + m.handleAlertEscalated(nil, alert, 1) + + select { + case <-requests: + t.Fatal("expected quiet hours to suppress escalated notification delivery") + case <-time.After(500 * time.Millisecond): + } +} + +func TestMonitor_HandleAlertEscalated_SendsNotificationWhenNotSuppressed(t *testing.T) { + t.Setenv("PULSE_DATA_DIR", t.TempDir()) + + requests := make(chan struct{}, 1) + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + requests <- struct{}{} + w.WriteHeader(http.StatusOK) + })) + defer server.Close() + + notifMgr := notifications.NewNotificationManager("https://pulse.local") + defer notifMgr.Stop() + notifMgr.SetGroupingWindow(0) + notifMgr.SetCooldown(0) + if err := notifMgr.UpdateAllowedPrivateCIDRs("127.0.0.1/32"); err != nil { + t.Fatalf("UpdateAllowedPrivateCIDRs: %v", err) + } + notifMgr.AddWebhook(notifications.WebhookConfig{ + ID: "normal-hook", + Name: "normal", + URL: server.URL, + Enabled: true, + }) + + mgr := alerts.NewManager() + cfg := mgr.GetConfig() + cfg.Schedule.Escalation.Levels = []alerts.EscalationLevel{{After: 1, Notify: "webhook"}} + cfg.Schedule.QuietHours.Enabled = false + mgr.UpdateConfig(cfg) + + m := &Monitor{ + notificationMgr: notifMgr, + alertManager: mgr, + } + + alert := &alerts.Alert{ + ID: "escalated-normal", + Type: "connectivity", + Level: alerts.AlertLevelCritical, + ResourceID: "node/pve-1", + ResourceName: "pve-1", + Node: "pve-1", + Instance: "pve", + Message: "Node offline", + StartTime: time.Now(), + } + + m.handleAlertEscalated(nil, alert, 1) + + select { + case <-requests: + case <-time.After(2 * time.Second): + t.Fatal("expected escalated notification delivery") + } +} diff --git a/internal/monitoring/monitor_alerts.go b/internal/monitoring/monitor_alerts.go index c738fd066..5cb37d5df 100644 --- a/internal/monitoring/monitor_alerts.go +++ b/internal/monitoring/monitor_alerts.go @@ -8,6 +8,7 @@ import ( "github.com/rcourtman/pulse-go-rewrite/internal/alerts" "github.com/rcourtman/pulse-go-rewrite/internal/mock" "github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources" + "github.com/rcourtman/pulse-go-rewrite/internal/websocket" "github.com/rs/zerolog/log" ) @@ -140,6 +141,52 @@ func (m *Monitor) handleAlertResolved(alertID string) { } } +func (m *Monitor) handleAlertEscalated(hub *websocket.Hub, alert *alerts.Alert, level int) { + if alert == nil || m.alertManager == nil { + return + } + + log.Info(). + Str("alertID", alert.ID). + Int("level", level). + Msg("Alert escalated") + + config := m.alertManager.GetConfig() + if level <= 0 || level > len(config.Schedule.Escalation.Levels) { + return + } + + if m.alertManager.ShouldSuppressNotification(alert) { + log.Info(). + Str("alertID", alert.ID). + Int("level", level). + Msg("Escalated notification suppressed during quiet hours") + m.broadcastEscalatedAlert(hub, alert) + return + } + + if m.notificationMgr != nil { + escalationLevel := config.Schedule.Escalation.Levels[level-1] + switch escalationLevel.Notify { + case "email": + if emailConfig := m.notificationMgr.GetEmailConfig(); emailConfig.Enabled { + m.notificationMgr.SendAlert(alert) + } + case "webhook": + for _, webhook := range m.notificationMgr.GetWebhooks() { + if webhook.Enabled { + m.notificationMgr.SendAlert(alert) + break + } + } + case "all": + m.notificationMgr.SendAlert(alert) + } + } + + m.broadcastEscalatedAlert(hub, alert) +} + func (m *Monitor) handleAlertAcknowledged(alert *alerts.Alert, user string) { if m.incidentStore == nil || alert == nil { if alert == nil {