Port v5 alert lifecycle and linked agent overrides

2026-05-22 03:02:35 +00:00 · 2026-04-15 12:18:57 +01:00 · 2026-04-15 12:18:57 +01:00 · b73dab2b64
commit b73dab2b64
parent f94601e26f
12 changed files with 1201 additions and 106 deletions
--- a/docs/release-control/v6/internal/subsystems/alerts.md
+++ b/docs/release-control/v6/internal/subsystems/alerts.md
@ -82,6 +82,12 @@ runtime must migrate the active alert, history entry, acknowledgment record,
 suppression/rate-limit/flapping tracking, and guest per-disk metric identity
 to the current canonical state instead of reopening a duplicate alert or
 resolving only the stale node-scoped identity.
+That same guest-alert owner also has to retire per-disk guest alerts when the
+guest stops, disk alerting is disabled, or the reported disk set changes.
+Canonical guest disk identity is only valid while the guest still exposes that
+disk resource under the current thresholds, so runtime cleanup must remove
+stale `guestID-disk-*` state instead of leaving orphaned per-disk incidents in
+active alerts, resolved history, or later UI projections.
 That same alerts runtime also owns instance-scoped node display-name
 resolution. Raw node names are not globally unique across configured
 infrastructure instances, so cached node display names must key on instance +
@ -211,6 +217,11 @@ keep schedules such as `00:00` to `23:59` active through the full final
 minute instead of expiring at `23:59:00`. Alert quiet-hours proofs should
 control time through the alert manager clock hook instead of depending on wall
 clock execution at whatever second the test runner happens to hit.
+Quiet-hours suppression also applies to alert delivery lifecycle, not only the
+initial raised notification. Resolved notifications must not fan out when the
+alert was never notified or was already acknowledged, and monitoring-driven
+escalation delivery must consult the same quiet-hours suppression path while
+still letting canonical escalation state reach websocket consumers.
 That schedule surface now also follows the same shell/runtime split as the
 other feature tabs: `frontend-modern/src/features/alerts/tabs/ScheduleTab.tsx`
 stays the render shell, while
@ -231,6 +242,21 @@ presentation now also route through
 `frontend-modern/src/utils/alertIncidentPresentation.ts` instead of remaining
 duplicated inline across the alerts page and overview timeline surfaces.

+Poll-driven connectivity recovery is also part of canonical alert truth.
+Resources that clear an offline alert from later healthy polls must require
+repeated healthy confirmations before resolving that alert instead of clearing
+on the first recovered sample; otherwise transient poll recovery reopens the
+same regression as false "back online" notifications and missing downtime
+signal. Nodes, PBS, and PMG require three healthy confirmations before
+resolution, while storage requires two.
+
+Host-agent threshold ownership now follows the linked resource model.
+Explicit agent overrides still win, but when no host-agent override exists the
+alerts runtime must inherit linked node or guest overrides for that agent so
+metric and connectivity behavior match the logical machine the agent augments.
+Persisted host alerts must carry enough linked-resource metadata for
+reevaluation after threshold changes to honor that same inheritance rule.
+
 Alert resource tables, grouped node headers, and alert override reconstruction
 now route resource-backed names through the shared policy-aware alerts helper
 so governed resources do not fall back to raw names when the thresholds editor
--- a/docs/release-control/v6/internal/subsystems/monitoring.md
+++ b/docs/release-control/v6/internal/subsystems/monitoring.md
@ -95,6 +95,12 @@ This subsystem now sits under the dedicated core monitoring runtime lane so
 discovery, metrics-history correctness, and platform-specific runtime coverage
 can be governed as first-class product work instead of staying diluted inside
 architecture coherence.
+That same monitoring boundary also owns the escalation callback bridge into the
+alerts delivery layer. Monitor-owned escalation handling may still publish
+canonical escalation state to websocket consumers, but notification fan-out
+must defer quiet-hours and resolved-notification suppression policy to the
+alerts manager instead of bypassing that shared routing contract when monitor
+plumbs escalations outward.
 That same monitoring owner now also governs monitored-system usage readiness
 for commercial boundaries. A non-nil unified read-state is not sufficient when
 provider-owned supplemental inventories such as TrueNAS or VMware are still
--- a/internal/alerts/alerts.go
+++ b/internal/alerts/alerts.go
@ -45,10 +45,12 @@ const (

 // Cleanup intervals
 const (
-	StaleTrackingThreshold = 24 * time.Hour
-	RateLimitCleanupWindow = 1 * time.Hour
-	alertsDirPerm          = 0o700
-	alertsFilePerm         = 0o600
+	StaleTrackingThreshold              = 24 * time.Hour
+	RateLimitCleanupWindow              = 1 * time.Hour
+	alertsDirPerm                       = 0o700
+	alertsFilePerm                      = 0o600
+	offlineRecoveryConfirmationsDefault = 3
+	offlineRecoveryConfirmationsStorage = 2
 )

 func normalizePoweredOffSeverity(level AlertLevel) AlertLevel {
@ -555,13 +557,14 @@ type Manager struct {
 	// Time threshold tracking
 	pendingAlerts map[string]time.Time // Track when thresholds were first exceeded
 	// Offline confirmation tracking
-	nodeOfflineCount      map[string]int                  // Track consecutive offline counts for nodes (legacy)
-	offlineConfirmations  map[string]int                  // Track consecutive offline counts for all resources
-	dockerOfflineCount    map[string]int                  // Track consecutive offline counts for Docker hosts
-	dockerStateConfirm    map[string]int                  // Track consecutive state confirmations for Docker containers
-	dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
-	dockerLastExitCode    map[string]int                  // Track last exit code for OOM detection
-	dockerUpdateFirstSeen map[string]time.Time            // Track when image updates were first detected for alert delay
+	nodeOfflineCount             map[string]int                  // Track consecutive offline counts for nodes (legacy)
+	offlineConfirmations         map[string]int                  // Track consecutive offline counts for all resources
+	offlineRecoveryConfirmations map[string]int                  // Track consecutive healthy confirmations before clearing poll-driven offline alerts
+	dockerOfflineCount           map[string]int                  // Track consecutive offline counts for Docker hosts
+	dockerStateConfirm           map[string]int                  // Track consecutive state confirmations for Docker containers
+	dockerRestartTracking        map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
+	dockerLastExitCode           map[string]int                  // Track last exit code for OOM detection
+	dockerUpdateFirstSeen        map[string]time.Time            // Track when image updates were first detected for alert delay
 	// Stable identity tracking prevents update-delay resets when host IDs churn.
 	dockerUpdateFirstSeenByIdentity map[string]time.Time
 	// PMG quarantine growth tracking
@ -642,6 +645,7 @@ func NewManagerWithDataDir(dataDir string) *Manager {
 		pendingAlerts:                   make(map[string]time.Time),
 		nodeOfflineCount:                make(map[string]int),
 		offlineConfirmations:            make(map[string]int),
+		offlineRecoveryConfirmations:    make(map[string]int),
 		dockerOfflineCount:              make(map[string]int),
 		dockerStateConfirm:              make(map[string]int),
 		dockerRestartTracking:           make(map[string]*dockerRestartRecord),
@ -2046,10 +2050,7 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
 				alertsToResolve = append(alertsToResolve, alertID)
 				continue
 			}
-			// Overrides are keyed by raw host ID (without the "agent:" prefix
-			// that hostResourceID adds to the resource ID used in alert IDs).
-			rawHostID := stripHostResourcePrefix(resourceID)
-			thresholds := m.resolveResourceThresholds("agent", rawHostID)
+			thresholds := m.resolveHostAlertThresholdsNoLock(alert, resourceID)
 			if thresholds.Disabled {
 				alertsToResolve = append(alertsToResolve, alertID)
 				continue
@ -2507,6 +2508,28 @@ func (m *Manager) shouldSuppressNotification(alert *Alert) (bool, string) {
 	return false, ""
 }

+// ShouldSuppressNotification checks if a notification should be suppressed
+// during quiet hours. This is used by paths that bypass dispatchAlert, such as escalation.
+func (m *Manager) ShouldSuppressNotification(alert *Alert) bool {
+	if alert == nil {
+		return false
+	}
+
+	m.mu.RLock()
+	defer m.mu.RUnlock()
+
+	suppressed, reason := m.shouldSuppressNotification(alert)
+	if suppressed {
+		log.Debug().
+			Str("alertID", alert.ID).
+			Str("type", alert.Type).
+			Str("level", string(alert.Level)).
+			Str("quietHoursRule", reason).
+			Msg("Notification suppressed during quiet hours")
+	}
+	return suppressed
+}
+
 // ShouldSuppressResolvedNotification checks if a recovery notification should be suppressed
 // during quiet hours. Recovery notifications follow the same quiet hours rules as their
 // corresponding alerts - if the original alert would have been suppressed, so is the recovery.
@ -2518,6 +2541,22 @@ func (m *Manager) ShouldSuppressResolvedNotification(alert *Alert) bool {
 	m.mu.RLock()
 	defer m.mu.RUnlock()

+	if alert.Acknowledged {
+		log.Debug().
+			Str("alertID", alert.ID).
+			Str("type", alert.Type).
+			Msg("Recovery notification suppressed for acknowledged alert")
+		return true
+	}
+
+	if alert.LastNotified == nil {
+		log.Debug().
+			Str("alertID", alert.ID).
+			Str("type", alert.Type).
+			Msg("Recovery notification suppressed because firing notification was never sent")
+		return true
+	}
+
 	suppressed, reason := m.shouldSuppressNotification(alert)
 	if suppressed {
 		log.Debug().
@ -2698,23 +2737,7 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
 			m.clearGuestPoweredOffAlert(guestID, name)
 		}

-		// Clear all resource metric alerts (cpu, memory, disk, etc.) for non-running guests
-		m.mu.Lock()
-		alertsCleared := 0
-		for storageKey, alert := range m.activeAlerts {
-			alertID := effectiveAlertID(alert, storageKey)
-			// Only clear resource metric alerts, not powered-off alerts
-			if alert.ResourceID == guestID && alert.Type != "powered-off" {
-				m.clearAlertNoLock(alertID)
-				alertsCleared++
-				log.Debug().
-					Str("alertID", alertID).
-					Str("guest", name).
-					Str("status", status).
-					Msg("Cleared metric alert for non-running guest")
-			}
-		}
-		m.mu.Unlock()
+		alertsCleared := m.clearGuestMetricAlerts(guestID)

 		if alertsCleared > 0 {
 			log.Debug().
@ -2722,6 +2745,7 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
 				Str("status", status).
 				Int("alertsCleared", alertsCleared).
 				Msg("Cleared metric alerts for non-running guest")
+			m.saveActiveAlertsAsync("guest-not-running")
 		}
 		return
 	}
@ -2744,18 +2768,13 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {

 	// If alerts are disabled for this guest, clear any existing alerts and return
 	if thresholds.Disabled {
-		m.mu.Lock()
-		for storageKey, alert := range m.activeAlerts {
-			alertID := effectiveAlertID(alert, storageKey)
-			if alert.ResourceID == guestID {
-				m.clearAlertNoLock(alertID)
-				log.Info().
-					Str("alertID", alertID).
-					Str("guest", name).
-					Msg("Cleared alert - guest has alerts disabled")
-			}
+		if alertsCleared := m.clearGuestMetricAlerts(guestID); alertsCleared > 0 {
+			log.Info().
+				Str("guest", name).
+				Int("alertsCleared", alertsCleared).
+				Msg("Cleared guest metric alerts because alerts are disabled")
+			m.saveActiveAlertsAsync("guest-alerts-disabled")
 		}
-		m.mu.Unlock()
 		return
 	}

@ -2789,7 +2808,8 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
 	}, thresholds, evalOpts)

 	if thresholds.Disk != nil && thresholds.Disk.Trigger > 0 && len(disks) > 0 {
-		seenDisks := make(map[string]struct{})
+		seenDiskKeys := make(map[string]struct{})
+		seenDiskResources := make(map[string]struct{})
 		for idx, disk := range disks {
 			if disk.Total <= 0 {
 				continue
@ -2816,12 +2836,13 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
 			}

 			// Avoid duplicate checks if two disks resolve to the same key
-			if _, exists := seenDisks[sanitizedKey]; exists {
+			if _, exists := seenDiskKeys[sanitizedKey]; exists {
 				continue
 			}
-			seenDisks[sanitizedKey] = struct{}{}
+			seenDiskKeys[sanitizedKey] = struct{}{}

 			perDiskResourceID := fmt.Sprintf("%s-disk-%s", guestID, sanitizedKey)
+			seenDiskResources[perDiskResourceID] = struct{}{}
 			message := fmt.Sprintf("%s disk (%s) at %.1f%%", guestType, label, disk.Usage)

 			log.Debug().
@ -2867,6 +2888,11 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
 				MonitorOnly: monitorOnly,
 			})
 		}
+		if cleared := m.cleanupGuestDiskAlerts(guestID, seenDiskResources); cleared > 0 {
+			m.saveActiveAlertsAsync("guest-disk-set-changed")
+		}
+	} else if cleared := m.cleanupGuestDiskAlerts(guestID, nil); cleared > 0 {
+		m.saveActiveAlertsAsync("guest-disk-alerts-cleared")
 	}
 }

@ -3131,6 +3157,64 @@ func hostInstanceName(host models.Host) string {
 	return "Agent"
 }

+// resolveHostThresholdsNoLock resolves the effective thresholds for a host agent.
+// Explicit host-agent overrides win. Otherwise, linked node/guest overrides are
+// inherited so the host agent follows the logical resource it augments.
+// Callers must hold m.mu when reading config through this helper.
+func (m *Manager) resolveHostThresholdsNoLock(hostID, linkedNodeID, linkedVMID, linkedContainerID string) ThresholdConfig {
+	base := m.defaultThresholdsForResourceType("agent")
+
+	if hostID = strings.TrimSpace(hostID); hostID != "" {
+		if override, exists := m.config.Overrides[hostID]; exists {
+			return m.applyThresholdOverride(base, override)
+		}
+	}
+
+	if linkedNodeID = strings.TrimSpace(linkedNodeID); linkedNodeID != "" {
+		if override, exists := m.config.Overrides[linkedNodeID]; exists {
+			return m.applyThresholdOverride(base, override)
+		}
+	}
+
+	if linkedVMID = strings.TrimSpace(linkedVMID); linkedVMID != "" {
+		if override, exists := lookupGuestOverride(m.config.Overrides, nil, linkedVMID); exists {
+			return m.applyThresholdOverride(base, override)
+		}
+	}
+
+	if linkedContainerID = strings.TrimSpace(linkedContainerID); linkedContainerID != "" {
+		if override, exists := lookupGuestOverride(m.config.Overrides, nil, linkedContainerID); exists {
+			return m.applyThresholdOverride(base, override)
+		}
+	}
+
+	return base
+}
+
+// resolveHostAlertThresholdsNoLock resolves thresholds for persisted host-agent alerts.
+// Alert metadata carries the link context needed to inherit node/guest overrides.
+// Callers must hold m.mu when reading config through this helper.
+func (m *Manager) resolveHostAlertThresholdsNoLock(alert *Alert, resourceID string) ThresholdConfig {
+	hostID := stripHostResourcePrefix(resourceID)
+	if idx := strings.Index(hostID, "/"); idx >= 0 {
+		hostID = hostID[:idx]
+	}
+
+	linkedNodeID := ""
+	linkedVMID := ""
+	linkedContainerID := ""
+	if alert != nil {
+		if metadataHostID := metadataStringValue(alert.Metadata, "hostId"); metadataHostID != "" {
+			hostID = metadataHostID
+		}
+		linkedNodeID = metadataStringValue(alert.Metadata, "linkedNodeId")
+		linkedVMID = metadataStringValue(alert.Metadata, "linkedVmId")
+		linkedContainerID = metadataStringValue(alert.Metadata, "linkedContainerId")
+	}
+
+	return m.resolveHostThresholdsNoLock(hostID, linkedNodeID, linkedVMID, linkedContainerID)
+}
+
 func sanitizeHostComponent(value string) string {
 	value = strings.TrimSpace(strings.ToLower(value))
 	if value == "" {
@ -3227,7 +3311,7 @@ func (m *Manager) CheckHost(host models.Host) {
 	m.mu.RLock()
 	alertsEnabled := m.config.Enabled
 	disableAllAgents := m.config.DisableAllAgents
-	thresholds := m.resolveResourceThresholds("agent", host.ID)
+	thresholds := m.resolveHostThresholdsNoLock(host.ID, host.LinkedNodeID, host.LinkedVMID, host.LinkedContainerID)
 	m.mu.RUnlock()

 	if !alertsEnabled {
@ -3267,6 +3351,15 @@ func (m *Manager) CheckHost(host models.Host) {
 		"agentVersion": host.AgentVersion,
 		"architecture": host.Architecture,
 	}
+	if linkedNodeID := strings.TrimSpace(host.LinkedNodeID); linkedNodeID != "" {
+		baseMetadata["linkedNodeId"] = linkedNodeID
+	}
+	if linkedVMID := strings.TrimSpace(host.LinkedVMID); linkedVMID != "" {
+		baseMetadata["linkedVmId"] = linkedVMID
+	}
+	if linkedContainerID := strings.TrimSpace(host.LinkedContainerID); linkedContainerID != "" {
+		baseMetadata["linkedContainerId"] = linkedContainerID
+	}
 	if len(host.Tags) > 0 {
 		baseMetadata["tags"] = append([]string(nil), host.Tags...)
 	}
@ -3590,7 +3683,7 @@ func (m *Manager) HandleHostOffline(host models.Host) {
 		return
 	}
 	disableHostsOffline := m.config.DisableAllAgentsOffline
-	thresholds := m.resolveResourceThresholds("agent", host.ID)
+	thresholds := m.resolveHostThresholdsNoLock(host.ID, host.LinkedNodeID, host.LinkedVMID, host.LinkedContainerID)
 	m.mu.RUnlock()

 	resourceKey := hostResourceID(host.ID)
@ -3644,13 +3737,16 @@ func (m *Manager) HandleHostOffline(host models.Host) {
 		Instance:     instanceName,
 		Message:      fmt.Sprintf("Host '%s' is offline", resourceName),
 		Metadata: map[string]interface{}{
-			"resourceType": "agent",
-			"hostId":       host.ID,
-			"hostname":     host.Hostname,
-			"displayName":  host.DisplayName,
-			"platform":     host.Platform,
-			"osName":       host.OSName,
-			"osVersion":    host.OSVersion,
+			"resourceType":      "agent",
+			"hostId":            host.ID,
+			"hostname":          host.Hostname,
+			"displayName":       host.DisplayName,
+			"platform":          host.Platform,
+			"osName":            host.OSName,
+			"osVersion":         host.OSVersion,
+			"linkedNodeId":      strings.TrimSpace(host.LinkedNodeID),
+			"linkedVmId":        strings.TrimSpace(host.LinkedVMID),
+			"linkedContainerId": strings.TrimSpace(host.LinkedContainerID),
 		},
 		AddToRecent:   true,
 		AddToHistory:  true,
@ -3760,6 +3856,72 @@ func (m *Manager) clearHostDiskAlerts(hostID string) {
 	}
 }

+func (m *Manager) clearGuestMetricAlerts(guestID string, metrics ...string) int {
+	if guestID == "" {
+		return 0
+	}
+
+	allowedMetrics := make(map[string]struct{}, len(metrics))
+	for _, metric := range metrics {
+		metric = strings.TrimSpace(metric)
+		if metric == "" {
+			continue
+		}
+		allowedMetrics[metric] = struct{}{}
+	}
+
+	perDiskPrefix := fmt.Sprintf("%s-disk-", guestID)
+
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	cleared := 0
+	for storageKey, alert := range m.activeAlerts {
+		if alert == nil || alert.Type == "powered-off" {
+			continue
+		}
+		if alert.ResourceID != guestID && !strings.HasPrefix(alert.ResourceID, perDiskPrefix) {
+			continue
+		}
+		if len(allowedMetrics) > 0 {
+			if _, ok := allowedMetrics[alert.Type]; !ok {
+				continue
+			}
+		}
+		m.clearAlertNoLock(storageKey)
+		cleared++
+	}
+
+	return cleared
+}
+
+func (m *Manager) cleanupGuestDiskAlerts(guestID string, seen map[string]struct{}) int {
+	if guestID == "" {
+		return 0
+	}
+
+	prefix := fmt.Sprintf("%s-disk-", guestID)
+
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	cleared := 0
+	for storageKey, alert := range m.activeAlerts {
+		if alert == nil || !strings.HasPrefix(alert.ResourceID, prefix) {
+			continue
+		}
+		if seen != nil {
+			if _, exists := seen[alert.ResourceID]; exists {
+				continue
+			}
+		}
+		m.clearAlertNoLock(storageKey)
+		cleared++
+	}
+
+	return cleared
+}
+
 func (m *Manager) cleanupHostDiskAlerts(host models.Host, seen map[string]struct{}) {
 	if host.ID == "" {
 		return
@ -5502,9 +5664,19 @@ func (m *Manager) clearDockerContainerUpdateTracking(resourceID, trackingKey str
 	m.mu.Unlock()
 }

+func (m *Manager) touchDockerContainerUpdateAlert(alertID string) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	if alert, exists := m.getActiveAlertNoLock(alertID); exists && alert != nil {
+		alert.LastSeen = time.Now()
+	}
+}
+
 // checkDockerContainerImageUpdate checks if an image update has been pending for too long
 func (m *Manager) checkDockerContainerImageUpdate(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
 	alertID := fmt.Sprintf("docker-container-update-%s", resourceID)
+	canonicalAlertID := buildCanonicalStateID(resourceID, resourceID+"-image-update")
 	updateTrackingKey := dockerUpdateTrackingKey(host, container)

 	// Check if update detection is enabled
@ -5514,30 +5686,30 @@ func (m *Manager) checkDockerContainerImageUpdate(host models.DockerHost, contai

 	// Negative value means disabled
 	if delayHours < 0 {
-		m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update"))
+		m.clearAlert(canonicalAlertID)
 		m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
 		return
 	}

 	// Check if this container has an update status reported
 	if container.UpdateStatus == nil {
-		// No update status - clear any tracking and alerts
-		m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update"))
-		m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
+		// Missing update status means the condition is unknown, not resolved.
+		// Preserve any active alert and first-seen tracking until we see an affirmative clear.
+		m.touchDockerContainerUpdateAlert(canonicalAlertID)
 		return
 	}

 	// Check for errors in update detection (don't alert on errors)
 	if container.UpdateStatus.Error != "" {
-		// Update check failed - clear alert but keep tracking
-		m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update"))
+		// A failed update check cannot confirm the pending update has been resolved.
+		m.touchDockerContainerUpdateAlert(canonicalAlertID)
 		return
 	}

 	// Check if an update is available
 	if !container.UpdateStatus.UpdateAvailable {
 		// No update available - clear tracking and alert
-		m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update"))
+		m.clearAlert(canonicalAlertID)
 		m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
 		return
 	}
@ -7495,8 +7667,10 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
 		m.unregisterActiveAlertAliasNoLock(key, alert)
 	}
 	if exists {
+		delete(m.offlineRecoveryConfirmations, key)
 		delete(m.activeAlerts, key)
 	}
+	delete(m.offlineRecoveryConfirmations, alertID)
 	// NOTE: Don't delete ackState here - preserve it so if the same alert
 	// reappears (e.g., powered-off VM during backup), the acknowledgement
 	// is restored via preserveAlertState. ackState is cleaned up in Cleanup().
@ -7506,6 +7680,27 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
 	}
 }

+func (m *Manager) confirmOfflineRecoveryNoLock(alertID string, required int) (int, bool) {
+	alertID = strings.TrimSpace(alertID)
+	if alertID == "" {
+		return 0, false
+	}
+
+	if required <= 1 {
+		delete(m.offlineRecoveryConfirmations, alertID)
+		return required, true
+	}
+
+	m.offlineRecoveryConfirmations[alertID]++
+	confirmations := m.offlineRecoveryConfirmations[alertID]
+	if confirmations < required {
+		return confirmations, false
+	}
+
+	delete(m.offlineRecoveryConfirmations, alertID)
+	return confirmations, true
+}
+
 // GetActiveAlerts returns all active alerts
 func (m *Manager) GetActiveAlerts() []Alert {
 	if m == nil {
@ -7817,8 +8012,9 @@ func (m *Manager) OnAlertHistory(cb AlertCallback) {
 	}
 }

-// clearResourceOfflineAlert removes an offline alert when a resource comes back online.
-func (m *Manager) clearResourceOfflineAlert(alertPrefix, resourceID, resourceName, host, resourceKind string) {
+// clearResourceOfflineAlert removes an offline alert when a poll-driven resource
+// stays healthy for enough consecutive polls to confirm recovery.
+func (m *Manager) clearResourceOfflineAlert(resourceID, resourceName, host, resourceKind string, requiredRecoveryCount int) {
 	alertID := canonicalConnectivityStateID(resourceID)

 	m.mu.Lock()
@ -7836,6 +8032,17 @@ func (m *Manager) clearResourceOfflineAlert(alertPrefix, resourceID, resourceNam
 	// Check if offline alert exists
 	alert, exists := m.getActiveAlertNoLock(alertID)
 	if !exists {
+		delete(m.offlineRecoveryConfirmations, alertID)
+		return
+	}
+
+	recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
+	if !confirmed {
+		log.Debug().
+			Str(strings.ToLower(resourceKind), resourceName).
+			Int("confirmations", recoveryCount).
+			Int("required", requiredRecoveryCount).
+			Msg(resourceKind + " appears back online, waiting for recovery confirmation")
 		return
 	}

@ -7864,6 +8071,10 @@ func (m *Manager) clearResourceOfflineAlert(alertPrefix, resourceID, resourceNam
 func (m *Manager) checkNodeOffline(node models.Node) {
 	alertID := fmt.Sprintf("node-offline-%s", node.ID)

+	m.mu.Lock()
+	delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(node.ID))
+	m.mu.Unlock()
+
 	thresholds := m.resolveResourceThresholds("node", node.ID)
 	spec, err := buildCanonicalConnectivitySpec(node.ID, node.Name, unifiedresources.ResourceType("node"), AlertLevelCritical, 3, thresholds.Disabled || thresholds.DisableConnectivity)
 	if err != nil {
@ -7918,6 +8129,17 @@ func (m *Manager) clearNodeOfflineAlert(node models.Node) {
 	// Check if offline alert exists
 	alert, exists := m.getActiveAlertNoLock(alertID)
 	if !exists {
+		delete(m.offlineRecoveryConfirmations, alertID)
+		return
+	}
+
+	recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, offlineRecoveryConfirmationsDefault)
+	if !confirmed {
+		log.Debug().
+			Str("node", node.Name).
+			Int("confirmations", recoveryCount).
+			Int("required", offlineRecoveryConfirmationsDefault).
+			Msg("Node appears back online, waiting for recovery confirmation")
 		return
 	}

@ -7944,6 +8166,10 @@ func (m *Manager) clearNodeOfflineAlert(node models.Node) {

 // checkPBSOffline creates an alert for offline PBS instances
 func (m *Manager) checkPBSOffline(pbs models.PBSInstance) {
+	m.mu.Lock()
+	delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(pbs.ID))
+	m.mu.Unlock()
+
 	thresholds := m.resolveResourceThresholds("pbs", pbs.ID)
 	spec, err := buildCanonicalConnectivitySpec(pbs.ID, pbs.Name, unifiedresources.ResourceTypePBS, AlertLevelCritical, 3, thresholds.Disabled || thresholds.DisableConnectivity)
 	if err != nil {
@ -7979,11 +8205,15 @@ func (m *Manager) checkPBSOffline(pbs models.PBSInstance) {

 // clearPBSOfflineAlert removes offline alert when PBS comes back online
 func (m *Manager) clearPBSOfflineAlert(pbs models.PBSInstance) {
-	m.clearResourceOfflineAlert("pbs-offline", pbs.ID, pbs.Name, pbs.Host, "PBS")
+	m.clearResourceOfflineAlert(pbs.ID, pbs.Name, pbs.Host, "PBS", offlineRecoveryConfirmationsDefault)
 }

 // checkPMGOffline creates an alert for offline PMG instances
 func (m *Manager) checkPMGOffline(pmg models.PMGInstance) {
+	m.mu.Lock()
+	delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(pmg.ID))
+	m.mu.Unlock()
+
 	m.mu.RLock()
 	override, hasOverride := m.config.Overrides[pmg.ID]
 	m.mu.RUnlock()
@ -8022,7 +8252,7 @@ func (m *Manager) checkPMGOffline(pmg models.PMGInstance) {

 // clearPMGOfflineAlert removes offline alert when PMG comes back online
 func (m *Manager) clearPMGOfflineAlert(pmg models.PMGInstance) {
-	m.clearResourceOfflineAlert("pmg-offline", pmg.ID, pmg.Name, pmg.Host, "PMG")
+	m.clearResourceOfflineAlert(pmg.ID, pmg.Name, pmg.Host, "PMG", offlineRecoveryConfirmationsDefault)
 }

 // checkPMGQueueDepths checks PMG mail queue depths and creates alerts
@ -8936,6 +9166,10 @@ func (m *Manager) checkAnomalyMetric(pmg models.PMGInstance, tracker *pmgAnomaly
 func (m *Manager) checkStorageOffline(storage models.Storage) {
 	alertID := fmt.Sprintf("storage-offline-%s", storage.ID)

+	m.mu.Lock()
+	delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(storage.ID))
+	m.mu.Unlock()
+
 	thresholds := m.resolveResourceThresholds("storage", storage.ID)
 	spec, err := buildCanonicalConnectivitySpec(storage.ID, storage.Name, unifiedresources.ResourceTypeStorage, AlertLevelWarning, 2, thresholds.Disabled || thresholds.DisableConnectivity)
 	if err != nil {
@ -8970,7 +9204,7 @@ func (m *Manager) checkStorageOffline(storage models.Storage) {

 // clearStorageOfflineAlert removes offline alert when storage comes back online
 func (m *Manager) clearStorageOfflineAlert(storage models.Storage) {
-	m.clearResourceOfflineAlert("storage-offline", storage.ID, storage.Name, storage.Node, "Storage")
+	m.clearResourceOfflineAlert(storage.ID, storage.Name, storage.Node, "Storage", offlineRecoveryConfirmationsStorage)
 }

 // checkGuestPoweredOff creates an alert for powered-off guests
@ -10095,6 +10329,7 @@ func (m *Manager) ClearActiveAlerts() {
 	m.alertRateLimit = make(map[string][]time.Time)
 	m.nodeOfflineCount = make(map[string]int)
 	m.offlineConfirmations = make(map[string]int)
+	m.offlineRecoveryConfirmations = make(map[string]int)
 	m.dockerOfflineCount = make(map[string]int)
 	m.dockerStateConfirm = make(map[string]int)
 	m.dockerRestartTracking = make(map[string]*dockerRestartRecord)
@ -10215,6 +10450,13 @@ func (m *Manager) cleanupStaleMaps() {
 		}
 	}

+	for alertID := range m.offlineRecoveryConfirmations {
+		if !m.hasActiveAlertNoLock(alertID) {
+			delete(m.offlineRecoveryConfirmations, alertID)
+			cleaned++
+		}
+	}
+
 	// Clean up node offline counts (legacy)
 	for nodeID := range m.nodeOfflineCount {
 		hasRelatedAlert := false
--- a/internal/alerts/alerts_test.go
+++ b/internal/alerts/alerts_test.go
@ -5097,6 +5097,18 @@ func TestClearStorageOfflineAlert(t *testing.T) {
 			resolvedCh <- id
 		})

+		m.clearStorageOfflineAlert(storage)
+		m.mu.RLock()
+		_, existsAfterFirst := testLookupActiveAlert(t, m, alertID)
+		recoveryCountAfterFirst := m.offlineRecoveryConfirmations[canonicalConnectivityStateID(storage.ID)]
+		m.mu.RUnlock()
+		if !existsAfterFirst {
+			t.Fatal("expected storage alert to remain active until recovery confirmations are satisfied")
+		}
+		if recoveryCountAfterFirst != 1 {
+			t.Fatalf("expected first storage recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
+		}
+
 		m.clearStorageOfflineAlert(storage)

 		m.mu.RLock()
@ -6787,6 +6799,28 @@ func TestHandleHostOffline(t *testing.T) {
 		}
 	})

+	t.Run("linked node override DisableConnectivity clears alert and returns", func(t *testing.T) {
+		m := newTestManager(t)
+		m.config.Enabled = true
+		m.config.Overrides = map[string]ThresholdConfig{
+			"cluster-node1": {DisableConnectivity: true},
+		}
+
+		alertID := canonicalConnectivityStateID(hostResourceID("host1"))
+		m.activeAlerts[alertID] = &Alert{ID: alertID, Type: "host-offline"}
+		m.offlineConfirmations[hostResourceID("host1")] = 5
+
+		host := models.Host{ID: "host1", Hostname: "test-host", LinkedNodeID: "cluster-node1"}
+		m.HandleHostOffline(host)
+
+		if testHasActiveAlert(t, m, alertID) {
+			t.Error("expected alert to be cleared")
+		}
+		if _, exists := m.offlineConfirmations[hostResourceID("host1")]; exists {
+			t.Error("expected offlineConfirmations to be cleared")
+		}
+	})
+
 	t.Run("override Disabled clears alert and returns", func(t *testing.T) {
 		// t.Parallel()
 		m := newTestManager(t)
@ -8118,6 +8152,30 @@ func TestClearNodeOfflineAlert(t *testing.T) {
 		m.mu.Unlock()

 		node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}
+		m.clearNodeOfflineAlert(node)
+		m.mu.RLock()
+		_, existsAfterFirst := testLookupActiveAlert(t, m, state)
+		recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
+		m.mu.RUnlock()
+		if !existsAfterFirst {
+			t.Fatal("expected node alert to remain active until recovery confirmations are satisfied")
+		}
+		if recoveryCountAfterFirst != 1 {
+			t.Fatalf("expected first node recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
+		}
+
+		m.clearNodeOfflineAlert(node)
+		m.mu.RLock()
+		_, existsAfterSecond := testLookupActiveAlert(t, m, state)
+		recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state]
+		m.mu.RUnlock()
+		if !existsAfterSecond {
+			t.Fatal("expected node alert to remain active after second recovery confirmation")
+		}
+		if recoveryCountAfterSecond != 2 {
+			t.Fatalf("expected second node recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
+		}
+
 		m.clearNodeOfflineAlert(node)

 		m.mu.RLock()
@ -8174,7 +8232,10 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
 				m.mu.Unlock()
 			},
 			clearFn: func(m *Manager) {
-				m.clearNodeOfflineAlert(models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"})
+				node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}
+				m.clearNodeOfflineAlert(node)
+				m.clearNodeOfflineAlert(node)
+				m.clearNodeOfflineAlert(node)
 			},
 		},
 		{
@ -8189,7 +8250,10 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
 				m.mu.Unlock()
 			},
 			clearFn: func(m *Manager) {
-				m.clearPBSOfflineAlert(models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "host1"})
+				pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "host1"}
+				m.clearPBSOfflineAlert(pbs)
+				m.clearPBSOfflineAlert(pbs)
+				m.clearPBSOfflineAlert(pbs)
 			},
 		},
 		{
@ -8204,7 +8268,10 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
 				m.mu.Unlock()
 			},
 			clearFn: func(m *Manager) {
-				m.clearPMGOfflineAlert(models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "host1"})
+				pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "host1"}
+				m.clearPMGOfflineAlert(pmg)
+				m.clearPMGOfflineAlert(pmg)
+				m.clearPMGOfflineAlert(pmg)
 			},
 		},
 		{
@ -8219,7 +8286,9 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
 				m.mu.Unlock()
 			},
 			clearFn: func(m *Manager) {
-				m.clearStorageOfflineAlert(models.Storage{ID: "stor1", Name: "Storage 1", Node: "node1"})
+				storage := models.Storage{ID: "stor1", Name: "Storage 1", Node: "node1"}
+				m.clearStorageOfflineAlert(storage)
+				m.clearStorageOfflineAlert(storage)
 			},
 		},
 		{
@ -8319,6 +8388,30 @@ func TestClearPBSOfflineAlert(t *testing.T) {
 		m.mu.Unlock()

 		pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "pbs.local"}
+		m.clearPBSOfflineAlert(pbs)
+		m.mu.RLock()
+		_, existsAfterFirst := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
+		recoveryCountAfterFirst := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pbs1")]
+		m.mu.RUnlock()
+		if !existsAfterFirst {
+			t.Fatal("expected PBS alert to remain active until recovery confirmations are satisfied")
+		}
+		if recoveryCountAfterFirst != 1 {
+			t.Fatalf("expected first PBS recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
+		}
+
+		m.clearPBSOfflineAlert(pbs)
+		m.mu.RLock()
+		_, existsAfterSecond := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
+		recoveryCountAfterSecond := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pbs1")]
+		m.mu.RUnlock()
+		if !existsAfterSecond {
+			t.Fatal("expected PBS alert to remain active after second recovery confirmation")
+		}
+		if recoveryCountAfterSecond != 2 {
+			t.Fatalf("expected second PBS recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
+		}
+
 		m.clearPBSOfflineAlert(pbs)

 		m.mu.RLock()
@ -8402,6 +8495,30 @@ func TestClearPMGOfflineAlert(t *testing.T) {
 		m.mu.Unlock()

 		pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "pmg.local"}
+		m.clearPMGOfflineAlert(pmg)
+		m.mu.RLock()
+		_, existsAfterFirst := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
+		recoveryCountAfterFirst := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pmg1")]
+		m.mu.RUnlock()
+		if !existsAfterFirst {
+			t.Fatal("expected PMG alert to remain active until recovery confirmations are satisfied")
+		}
+		if recoveryCountAfterFirst != 1 {
+			t.Fatalf("expected first PMG recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
+		}
+
+		m.clearPMGOfflineAlert(pmg)
+		m.mu.RLock()
+		_, existsAfterSecond := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
+		recoveryCountAfterSecond := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pmg1")]
+		m.mu.RUnlock()
+		if !existsAfterSecond {
+			t.Fatal("expected PMG alert to remain active after second recovery confirmation")
+		}
+		if recoveryCountAfterSecond != 2 {
+			t.Fatalf("expected second PMG recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
+		}
+
 		m.clearPMGOfflineAlert(pmg)

 		m.mu.RLock()
@ -13217,6 +13334,7 @@ func TestCheckNode(t *testing.T) {
 	t.Run("online node clears offline alert", func(t *testing.T) {
 		// t.Parallel()
 		m := newTestManager(t)
+		state := canonicalConnectivityStateID("node1")

 		// Pre-create offline alert
 		m.mu.Lock()
@ -13235,11 +13353,36 @@ func TestCheckNode(t *testing.T) {
 			Status:           "online",
 			ConnectionHealth: "connected",
 		}
+		m.CheckNode(node)
+		m.mu.RLock()
+		_, alertExistsAfterFirst := testLookupActiveAlert(t, m, "node-offline-node1")
+		recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
+		m.mu.RUnlock()
+		if !alertExistsAfterFirst {
+			t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied")
+		}
+		if recoveryCountAfterFirst != 1 {
+			t.Fatalf("expected first recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
+		}
+
+		m.CheckNode(node)
+		m.mu.RLock()
+		_, alertExistsAfterSecond := testLookupActiveAlert(t, m, "node-offline-node1")
+		recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state]
+		m.mu.RUnlock()
+		if !alertExistsAfterSecond {
+			t.Fatal("expected offline alert to remain active after second recovery confirmation")
+		}
+		if recoveryCountAfterSecond != 2 {
+			t.Fatalf("expected second recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
+		}
+
 		m.CheckNode(node)

 		m.mu.RLock()
 		_, alertExists := testLookupActiveAlert(t, m, "node-offline-node1")
 		_, countExists := m.nodeOfflineCount["node1"]
+		_, recoveryExists := m.offlineRecoveryConfirmations[state]
 		m.mu.RUnlock()

 		if alertExists {
@ -13248,6 +13391,9 @@ func TestCheckNode(t *testing.T) {
 		if countExists {
 			t.Error("expected offline count to be cleared")
 		}
+		if recoveryExists {
+			t.Error("expected offline recovery confirmations to be cleared")
+		}
 	})

 	t.Run("online node triggers metric checks", func(t *testing.T) {
@ -14907,6 +15053,125 @@ func TestCheckHostComprehensive(t *testing.T) {
 			t.Error("expected tags in metadata")
 		}
 	})
+
+	t.Run("inherits linked node overrides for host agent metrics", func(t *testing.T) {
+		m := newTestManager(t)
+		m.ClearActiveAlerts()
+
+		m.mu.Lock()
+		m.config.TimeThresholds = map[string]int{}
+		m.config.AgentDefaults = ThresholdConfig{
+			Memory: &HysteresisThreshold{Trigger: 85.0, Clear: 80.0},
+		}
+		m.config.Overrides = map[string]ThresholdConfig{
+			"cluster-node1": {
+				Memory: &HysteresisThreshold{Trigger: 97.0, Clear: 92.0},
+			},
+		}
+		m.mu.Unlock()
+
+		host := models.Host{
+			ID:           "host-node1",
+			DisplayName:  "node1",
+			Hostname:     "node1",
+			LinkedNodeID: "cluster-node1",
+			Memory: models.Memory{
+				Usage: 90.6,
+				Total: 1024,
+				Used:  928,
+				Free:  96,
+			},
+			Status:   "online",
+			LastSeen: time.Now(),
+		}
+
+		m.CheckHost(host)
+
+		m.mu.RLock()
+		_, exists := m.activeAlerts[canonicalMetricStateID(hostResourceID(host.ID), "memory")]
+		m.mu.RUnlock()
+
+		if exists {
+			t.Fatal("expected linked node override to suppress host-agent memory alert")
+		}
+	})
+
+	t.Run("inherits linked guest overrides for host agent metrics", func(t *testing.T) {
+		m := newTestManager(t)
+		m.ClearActiveAlerts()
+
+		m.mu.Lock()
+		m.config.TimeThresholds = map[string]int{}
+		m.config.AgentDefaults = ThresholdConfig{
+			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0},
+		}
+		m.config.Overrides = map[string]ThresholdConfig{
+			"guest:Main:101": {
+				CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0},
+			},
+		}
+		m.mu.Unlock()
+
+		host := models.Host{
+			ID:          "host-vm101",
+			DisplayName: "vm101",
+			Hostname:    "vm101.local",
+			LinkedVMID:  "Main:node3:101",
+			CPUUsage:    97.5,
+			Status:      "online",
+			LastSeen:    time.Now(),
+		}
+
+		m.CheckHost(host)
+
+		m.mu.RLock()
+		_, exists := m.activeAlerts[canonicalMetricStateID(hostResourceID(host.ID), "cpu")]
+		m.mu.RUnlock()
+
+		if exists {
+			t.Fatal("expected linked guest override to suppress host-agent cpu alert")
+		}
+	})
+
+	t.Run("prefers explicit host overrides over linked resource overrides", func(t *testing.T) {
+		m := newTestManager(t)
+		m.ClearActiveAlerts()
+
+		m.mu.Lock()
+		m.config.TimeThresholds = map[string]int{}
+		m.config.AgentDefaults = ThresholdConfig{
+			CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0},
+		}
+		m.config.Overrides = map[string]ThresholdConfig{
+			"guest:Main:101": {
+				CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0},
+			},
+			"host-vm101": {
+				CPU: &HysteresisThreshold{Trigger: 90.0, Clear: 85.0},
+			},
+		}
+		m.mu.Unlock()
+
+		host := models.Host{
+			ID:          "host-vm101",
+			DisplayName: "vm101",
+			Hostname:    "vm101.local",
+			LinkedVMID:  "Main:node3:101",
+			CPUUsage:    97.5,
+			Status:      "online",
+			LastSeen:    time.Now(),
+		}
+
+		m.CheckHost(host)
+
+		m.mu.RLock()
+		alert := m.activeAlerts[canonicalMetricStateID(hostResourceID(host.ID), "cpu")]
+		m.mu.RUnlock()
+
+		if alert == nil {
+			t.Fatal("expected explicit host override to take precedence and trigger alert")
+		}
+	})
 }

 func TestCheckPBSComprehensive(t *testing.T) {
@ -15332,6 +15597,7 @@ func TestCheckPBSComprehensive(t *testing.T) {
 	t.Run("clears offline alert when back online", func(t *testing.T) {
 		// t.Parallel()
 		m := newTestManager(t)
+		state := canonicalConnectivityStateID("pbs1")

 		m.mu.Lock()
 		m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1", Type: "connectivity"}
@ -15345,11 +15611,36 @@ func TestCheckPBSComprehensive(t *testing.T) {
 			ConnectionHealth: "healthy",
 		}

+		m.CheckPBS(pbs)
+		m.mu.RLock()
+		_, offlineExistsAfterFirst := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
+		recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
+		m.mu.RUnlock()
+		if !offlineExistsAfterFirst {
+			t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied")
+		}
+		if recoveryCountAfterFirst != 1 {
+			t.Fatalf("expected first PBS recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
+		}
+
+		m.CheckPBS(pbs)
+		m.mu.RLock()
+		_, offlineExistsAfterSecond := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
+		recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state]
+		m.mu.RUnlock()
+		if !offlineExistsAfterSecond {
+			t.Fatal("expected offline alert to remain active after second recovery confirmation")
+		}
+		if recoveryCountAfterSecond != 2 {
+			t.Fatalf("expected second PBS recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
+		}
+
 		m.CheckPBS(pbs)

 		m.mu.RLock()
 		_, offlineExists := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
 		_, confirmExists := m.offlineConfirmations["pbs1"]
+		_, recoveryExists := m.offlineRecoveryConfirmations[state]
 		m.mu.RUnlock()

 		if offlineExists {
@ -15358,6 +15649,9 @@ func TestCheckPBSComprehensive(t *testing.T) {
 		if confirmExists {
 			t.Error("expected offline confirmation to be cleared")
 		}
+		if recoveryExists {
+			t.Error("expected offline recovery confirmations to be cleared")
+		}
 	})
 }

@ -15664,6 +15958,7 @@ func TestCheckPMGComprehensive(t *testing.T) {
 	t.Run("clears offline alert when back online", func(t *testing.T) {
 		// t.Parallel()
 		m := newTestManager(t)
+		state := canonicalConnectivityStateID("pmg1")

 		m.mu.Lock()
 		m.activeAlerts["pmg-offline-pmg1"] = &Alert{ID: "pmg-offline-pmg1", Type: "connectivity"}
@ -15677,11 +15972,36 @@ func TestCheckPMGComprehensive(t *testing.T) {
 			ConnectionHealth: "healthy",
 		}

+		m.CheckPMG(pmg)
+		m.mu.RLock()
+		_, offlineExistsAfterFirst := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
+		recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
+		m.mu.RUnlock()
+		if !offlineExistsAfterFirst {
+			t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied")
+		}
+		if recoveryCountAfterFirst != 1 {
+			t.Fatalf("expected first PMG recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
+		}
+
+		m.CheckPMG(pmg)
+		m.mu.RLock()
+		_, offlineExistsAfterSecond := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
+		recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state]
+		m.mu.RUnlock()
+		if !offlineExistsAfterSecond {
+			t.Fatal("expected offline alert to remain active after second recovery confirmation")
+		}
+		if recoveryCountAfterSecond != 2 {
+			t.Fatalf("expected second PMG recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
+		}
+
 		m.CheckPMG(pmg)

 		m.mu.RLock()
 		_, offlineExists := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
 		_, confirmExists := m.offlineConfirmations["pmg1"]
+		_, recoveryExists := m.offlineRecoveryConfirmations[state]
 		m.mu.RUnlock()

 		if offlineExists {
@ -15690,6 +16010,9 @@ func TestCheckPMGComprehensive(t *testing.T) {
 		if confirmExists {
 			t.Error("expected offline confirmation to be cleared")
 		}
+		if recoveryExists {
+			t.Error("expected offline recovery confirmations to be cleared")
+		}
 	})

 	t.Run("skips metrics when PMG is offline", func(t *testing.T) {
@ -16008,6 +16331,7 @@ func TestCheckStorageComprehensive(t *testing.T) {
 	t.Run("clears offline alert when back online", func(t *testing.T) {
 		// t.Parallel()
 		m := newTestManager(t)
+		state := canonicalConnectivityStateID("storage1")

 		m.mu.Lock()
 		m.activeAlerts["storage-offline-storage1"] = &Alert{ID: "storage-offline-storage1", Type: "connectivity"}
@ -16020,11 +16344,24 @@ func TestCheckStorageComprehensive(t *testing.T) {
 			Status: "active",
 		}

+		m.CheckStorage(storage)
+		m.mu.RLock()
+		_, offlineExistsAfterFirst := testLookupActiveAlert(t, m, "storage-offline-storage1")
+		recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
+		m.mu.RUnlock()
+		if !offlineExistsAfterFirst {
+			t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied")
+		}
+		if recoveryCountAfterFirst != 1 {
+			t.Fatalf("expected first storage recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
+		}
+
 		m.CheckStorage(storage)

 		m.mu.RLock()
 		_, offlineExists := testLookupActiveAlert(t, m, "storage-offline-storage1")
 		_, confirmExists := m.offlineConfirmations["storage1"]
+		_, recoveryExists := m.offlineRecoveryConfirmations[state]
 		m.mu.RUnlock()

 		if offlineExists {
@ -16033,6 +16370,9 @@ func TestCheckStorageComprehensive(t *testing.T) {
 		if confirmExists {
 			t.Error("expected offline confirmation to be cleared")
 		}
+		if recoveryExists {
+			t.Error("expected offline recovery confirmations to be cleared")
+		}
 	})

 	t.Run("skips usage check when usage is zero", func(t *testing.T) {
--- a/internal/alerts/quiet_hours_test.go
+++ b/internal/alerts/quiet_hours_test.go
@ -79,6 +79,40 @@ func TestShouldSuppressNotificationQuietHours(t *testing.T) {
 			t.Fatalf("expected storage alert suppression, got suppressed=%t reason=%q", suppressed, reason)
 		}
 	})
+
+	t.Run("public suppression helper mirrors quiet hours policy", func(t *testing.T) {
+		m := newManagerWithQuietHoursSuppress(QuietHoursSuppression{Offline: true})
+		alert := &Alert{ID: "offline-public", Type: "connectivity", Level: AlertLevelCritical}
+		if !m.ShouldSuppressNotification(alert) {
+			t.Fatal("expected public quiet-hours suppression helper to return true")
+		}
+	})
+
+	t.Run("resolved notifications are suppressed for acknowledged alerts", func(t *testing.T) {
+		m := newManagerWithQuietHoursSuppress(QuietHoursSuppression{})
+		alert := &Alert{
+			ID:           "resolved-ack",
+			Type:         "cpu",
+			Level:        AlertLevelCritical,
+			Acknowledged: true,
+			LastNotified: ptrTime(time.Now().Add(-time.Minute)),
+		}
+		if !m.ShouldSuppressResolvedNotification(alert) {
+			t.Fatal("expected recovery notification to be suppressed for acknowledged alert")
+		}
+	})
+
+	t.Run("resolved notifications are suppressed when the firing alert was never notified", func(t *testing.T) {
+		m := newManagerWithQuietHoursSuppress(QuietHoursSuppression{})
+		alert := &Alert{
+			ID:    "resolved-unnotified",
+			Type:  "cpu",
+			Level: AlertLevelCritical,
+		}
+		if !m.ShouldSuppressResolvedNotification(alert) {
+			t.Fatal("expected recovery notification to be suppressed when LastNotified is nil")
+		}
+	})
 }

 func TestIsInQuietHours(t *testing.T) {
--- a/internal/alerts/threshold_update_test.go
+++ b/internal/alerts/threshold_update_test.go
@ -156,6 +156,65 @@ func TestReevaluateActiveAlertsWithOverride(t *testing.T) {
 	}
 }

+func TestReevaluateActiveAlertsWithLinkedHostOverride(t *testing.T) {
+	manager := NewManager()
+
+	manager.mu.Lock()
+	manager.activeAlerts = make(map[string]*Alert)
+	manager.mu.Unlock()
+
+	initialConfig := AlertConfig{
+		Enabled: true,
+		AgentDefaults: ThresholdConfig{
+			Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
+		},
+		Overrides: make(map[string]ThresholdConfig),
+	}
+	manager.UpdateConfig(initialConfig)
+
+	resourceID := hostResourceID("host-node1")
+	alertID := canonicalMetricStateID(resourceID, "memory")
+	alert := &Alert{
+		ID:           alertID,
+		Type:         "memory",
+		Level:        AlertLevelWarning,
+		ResourceID:   resourceID,
+		ResourceName: "node1",
+		Node:         "node1",
+		Instance:     "linux",
+		Message:      "Host memory at 90.6%",
+		Value:        90.6,
+		Threshold:    85.0,
+		StartTime:    time.Now().Add(-5 * time.Minute),
+		LastSeen:     time.Now(),
+		Metadata: map[string]interface{}{
+			"resourceType": "agent",
+			"hostId":       "host-node1",
+			"linkedNodeId": "cluster-node1",
+		},
+	}
+
+	manager.mu.Lock()
+	manager.activeAlerts[alertID] = alert
+	manager.mu.Unlock()
+
+	updatedConfig := initialConfig
+	updatedConfig.Overrides["cluster-node1"] = ThresholdConfig{
+		Memory: &HysteresisThreshold{Trigger: 97, Clear: 92},
+	}
+	manager.UpdateConfig(updatedConfig)
+
+	time.Sleep(100 * time.Millisecond)
+
+	manager.mu.RLock()
+	_, alertStillActive := manager.activeAlerts[alertID]
+	manager.mu.RUnlock()
+
+	if alertStillActive {
+		t.Errorf("expected linked host alert to be resolved after linked node override increase")
+	}
+}
+
 func TestReevaluateActiveAlertsUsesStableClusterGuestOverrideAcrossNodeMove(t *testing.T) {
 	manager := NewManager()

--- a/internal/alerts/unified_eval_test.go
+++ b/internal/alerts/unified_eval_test.go
@ -369,6 +369,114 @@ func TestCheckGuestPerDiskAnnotatesCanonicalSpecMetadata(t *testing.T) {
 	}
 }

+func TestCheckGuestPerDiskCleansUpRemovedDiskAlerts(t *testing.T) {
+	m := newTestManager(t)
+	configureUnifiedEvalManager(t, m, unifiedEvalBaseConfig())
+
+	guestID := BuildGuestKey("pve1", "node1", 101)
+	baseGuest := models.VM{
+		ID:       guestID,
+		VMID:     101,
+		Name:     "app01",
+		Node:     "node1",
+		Instance: "pve1",
+		Status:   "running",
+		CPU:      0.20,
+		Memory:   models.Memory{Usage: 40},
+		Disk:     models.Disk{Usage: 40},
+	}
+
+	withDisk := baseGuest
+	withDisk.Disks = []models.Disk{
+		{
+			Mountpoint: "/",
+			Device:     "scsi0",
+			Usage:      95,
+			Total:      100,
+			Used:       95,
+			Free:       5,
+		},
+	}
+	m.CheckGuest(withDisk, "pve1")
+
+	originalAlertID := canonicalMetricStateID(guestID+"-disk-scsi0", "disk")
+	if activeAlert(t, m, originalAlertID) == nil {
+		t.Fatalf("expected guest disk alert %q", originalAlertID)
+	}
+
+	changedDisk := baseGuest
+	changedDisk.Disks = []models.Disk{
+		{
+			Mountpoint: "/data",
+			Device:     "scsi1",
+			Usage:      96,
+			Total:      100,
+			Used:       96,
+			Free:       4,
+		},
+	}
+	m.CheckGuest(changedDisk, "pve1")
+
+	newAlertID := canonicalMetricStateID(guestID+"-disk-data-scsi1", "disk")
+	m.mu.RLock()
+	_, oldExists := testLookupActiveAlert(t, m, originalAlertID)
+	_, newExists := testLookupActiveAlert(t, m, newAlertID)
+	m.mu.RUnlock()
+
+	if oldExists {
+		t.Fatalf("expected stale guest disk alert %q to be cleared", originalAlertID)
+	}
+	if !newExists {
+		t.Fatalf("expected replacement guest disk alert %q", newAlertID)
+	}
+}
+
+func TestCheckGuestClearsPerDiskAlertsWhenGuestStops(t *testing.T) {
+	m := newTestManager(t)
+	configureUnifiedEvalManager(t, m, unifiedEvalBaseConfig())
+
+	guestID := BuildGuestKey("pve1", "node1", 101)
+	guest := models.VM{
+		ID:       guestID,
+		VMID:     101,
+		Name:     "app01",
+		Node:     "node1",
+		Instance: "pve1",
+		Status:   "running",
+		CPU:      0.20,
+		Memory:   models.Memory{Usage: 40},
+		Disk:     models.Disk{Usage: 40},
+		Disks: []models.Disk{
+			{
+				Mountpoint: "/",
+				Device:     "scsi0",
+				Usage:      95,
+				Total:      100,
+				Used:       95,
+				Free:       5,
+			},
+		},
+	}
+
+	m.CheckGuest(guest, "pve1")
+
+	alertID := canonicalMetricStateID(guestID+"-disk-scsi0", "disk")
+	if activeAlert(t, m, alertID) == nil {
+		t.Fatalf("expected guest disk alert %q", alertID)
+	}
+
+	guest.Status = "stopped"
+	guest.Disks = nil
+	m.CheckGuest(guest, "pve1")
+
+	m.mu.RLock()
+	_, exists := testLookupActiveAlert(t, m, alertID)
+	m.mu.RUnlock()
+	if exists {
+		t.Fatalf("expected guest disk alert %q to clear when guest stops", alertID)
+	}
+}
+
 func TestCheckNodeTemperatureAnnotatesCanonicalSpecMetadata(t *testing.T) {
 	m := newTestManager(t)
 	configureUnifiedEvalManager(t, m, unifiedEvalBaseConfig())
--- a/internal/alerts/update_alerts_test.go
+++ b/internal/alerts/update_alerts_test.go
@ -221,6 +221,122 @@ func TestCheckDockerContainerImageUpdate(t *testing.T) {
 		}
 	})

+	t.Run("missing update status preserves existing alert and tracking", func(t *testing.T) {
+		testResourceID := "docker:" + hostID + "/container-unknown-status"
+		container := models.DockerContainer{
+			ID:    "container-unknown-status",
+			Name:  "unknown-status-container",
+			Image: "nginx:latest",
+			UpdateStatus: &models.DockerContainerUpdateStatus{
+				UpdateAvailable: true,
+				CurrentDigest:   "sha256:old",
+				LatestDigest:    "sha256:new",
+				LastChecked:     time.Now(),
+			},
+		}
+		trackingKey := dockerUpdateTrackingKey(host, container)
+		firstSeen := time.Now().Add(-25 * time.Hour)
+
+		m.mu.Lock()
+		m.dockerUpdateFirstSeen[testResourceID] = firstSeen
+		m.dockerUpdateFirstSeenByIdentity[trackingKey] = firstSeen
+		m.mu.Unlock()
+
+		m.checkDockerContainerImageUpdate(host, container, testResourceID, "unknown-status-container", instanceName, nodeName)
+
+		canonicalAlertID := buildCanonicalStateID(testResourceID, testResourceID+"-image-update")
+		m.mu.RLock()
+		alert := testRequireActiveAlert(t, m, canonicalAlertID)
+		if alert == nil {
+			m.mu.RUnlock()
+			t.Fatalf("expected update alert %q to be active", canonicalAlertID)
+		}
+		beforeLastSeen := alert.LastSeen
+		m.mu.RUnlock()
+
+		time.Sleep(10 * time.Millisecond)
+		container.UpdateStatus = nil
+		m.checkDockerContainerImageUpdate(host, container, testResourceID, "unknown-status-container", instanceName, nodeName)
+
+		m.mu.RLock()
+		alert = testRequireActiveAlert(t, m, canonicalAlertID)
+		if alert == nil {
+			m.mu.RUnlock()
+			t.Fatalf("expected update alert %q to remain active", canonicalAlertID)
+		}
+		_, tracked := m.dockerUpdateFirstSeen[testResourceID]
+		_, trackedByIdentity := m.dockerUpdateFirstSeenByIdentity[trackingKey]
+		afterLastSeen := alert.LastSeen
+		m.mu.RUnlock()
+
+		if !tracked || !trackedByIdentity {
+			t.Fatal("expected update tracking to remain when update status is temporarily unavailable")
+		}
+		if !afterLastSeen.After(beforeLastSeen) {
+			t.Fatalf("expected LastSeen to refresh when update status is missing, before=%s after=%s", beforeLastSeen, afterLastSeen)
+		}
+	})
+
+	t.Run("update detection errors preserve existing alert and tracking", func(t *testing.T) {
+		testResourceID := "docker:" + hostID + "/container-error-status"
+		container := models.DockerContainer{
+			ID:    "container-error-status",
+			Name:  "error-status-container",
+			Image: "nginx:latest",
+			UpdateStatus: &models.DockerContainerUpdateStatus{
+				UpdateAvailable: true,
+				CurrentDigest:   "sha256:old",
+				LatestDigest:    "sha256:new",
+				LastChecked:     time.Now(),
+			},
+		}
+		trackingKey := dockerUpdateTrackingKey(host, container)
+		firstSeen := time.Now().Add(-25 * time.Hour)
+
+		m.mu.Lock()
+		m.dockerUpdateFirstSeen[testResourceID] = firstSeen
+		m.dockerUpdateFirstSeenByIdentity[trackingKey] = firstSeen
+		m.mu.Unlock()
+
+		m.checkDockerContainerImageUpdate(host, container, testResourceID, "error-status-container", instanceName, nodeName)
+
+		canonicalAlertID := buildCanonicalStateID(testResourceID, testResourceID+"-image-update")
+		m.mu.RLock()
+		alert := testRequireActiveAlert(t, m, canonicalAlertID)
+		if alert == nil {
+			m.mu.RUnlock()
+			t.Fatalf("expected update alert %q to be active", canonicalAlertID)
+		}
+		beforeLastSeen := alert.LastSeen
+		m.mu.RUnlock()
+
+		time.Sleep(10 * time.Millisecond)
+		container.UpdateStatus = &models.DockerContainerUpdateStatus{
+			UpdateAvailable: true,
+			Error:           "rate limited",
+			LastChecked:     time.Now(),
+		}
+		m.checkDockerContainerImageUpdate(host, container, testResourceID, "error-status-container", instanceName, nodeName)
+
+		m.mu.RLock()
+		alert = testRequireActiveAlert(t, m, canonicalAlertID)
+		if alert == nil {
+			m.mu.RUnlock()
+			t.Fatalf("expected update alert %q to remain active after check error", canonicalAlertID)
+		}
+		_, tracked := m.dockerUpdateFirstSeen[testResourceID]
+		_, trackedByIdentity := m.dockerUpdateFirstSeenByIdentity[trackingKey]
+		afterLastSeen := alert.LastSeen
+		m.mu.RUnlock()
+
+		if !tracked || !trackedByIdentity {
+			t.Fatal("expected update tracking to remain after update check error")
+		}
+		if !afterLastSeen.After(beforeLastSeen) {
+			t.Fatalf("expected LastSeen to refresh after update check error, before=%s after=%s", beforeLastSeen, afterLastSeen)
+		}
+	})
+
 	t.Run("disabled by negative delay hours", func(t *testing.T) {
 		// Create manager with disabled update alerts
 		m2 := NewManager()
--- a/internal/monitoring/canonical_guardrails_test.go
+++ b/internal/monitoring/canonical_guardrails_test.go
@ -91,6 +91,33 @@ func TestGetStateRefreshesLiveAlertSnapshots(t *testing.T) {
 	}
 }

+func TestEscalationDeliveryDefersToCanonicalAlertSuppression(t *testing.T) {
+	requiredSnippets := map[string][]string{
+		"monitor.go": {
+			"m.alertManager.SetEscalateCallback(func(alert *alerts.Alert, level int) {",
+			"m.handleAlertEscalated(wsHub, alert, level)",
+		},
+		"monitor_alerts.go": {
+			"func (m *Monitor) handleAlertEscalated(hub *websocket.Hub, alert *alerts.Alert, level int) {",
+			"if m.alertManager.ShouldSuppressNotification(alert) {",
+			"m.broadcastEscalatedAlert(hub, alert)",
+		},
+	}
+
+	for file, snippets := range requiredSnippets {
+		data, err := os.ReadFile(file)
+		if err != nil {
+			t.Fatalf("failed to read %s: %v", file, err)
+		}
+		source := string(data)
+		for _, snippet := range snippets {
+			if !strings.Contains(source, snippet) {
+				t.Fatalf("%s must contain %q", file, snippet)
+			}
+		}
+	}
+}
+
 func TestMonitoredSystemUsageReadinessGuardrailsRemainCanonical(t *testing.T) {
 	requiredSnippets := map[string][]string{
 		"monitor.go": {
--- a/internal/monitoring/monitor.go
+++ b/internal/monitoring/monitor.go
@ -1772,41 +1772,7 @@ func (m *Monitor) Start(ctx context.Context, wsHub *websocket.Hub) {
 		m.handleAlertUnacknowledged(alert, user)
 	})
 	m.alertManager.SetEscalateCallback(func(alert *alerts.Alert, level int) {
-		log.Info().
-			Str("alertID", alert.ID).
-			Int("level", level).
-			Msg("Alert escalated - sending notifications")
-
-		// Get escalation config
-		config := m.alertManager.GetConfig()
-		if level <= 0 || level > len(config.Schedule.Escalation.Levels) {
-			return
-		}
-
-		escalationLevel := config.Schedule.Escalation.Levels[level-1]
-
-		// Send notifications based on escalation level
-		switch escalationLevel.Notify {
-		case "email":
-			// Only send email
-			if emailConfig := m.notificationMgr.GetEmailConfig(); emailConfig.Enabled {
-				m.notificationMgr.SendAlert(alert)
-			}
-		case "webhook":
-			// Only send webhooks
-			for _, webhook := range m.notificationMgr.GetWebhooks() {
-				if webhook.Enabled {
-					m.notificationMgr.SendAlert(alert)
-					break
-				}
-			}
-		case "all":
-			// Send all notifications
-			m.notificationMgr.SendAlert(alert)
-		}
-
-		// Update WebSocket with escalation
-		m.broadcastEscalatedAlert(wsHub, alert)
+		m.handleAlertEscalated(wsHub, alert, level)
 	})

 	// Create separate tickers for polling and broadcasting using the configured cadence
--- a/internal/monitoring/monitor_alert_handling_test.go
+++ b/internal/monitoring/monitor_alert_handling_test.go
@ -1,6 +1,8 @@
 package monitoring

 import (
+	"net/http"
+	"net/http/httptest"
 	"sync/atomic"
 	"testing"
 	"time"
@ -276,3 +278,125 @@ func TestMonitor_HandleAlertResolved_NoQuietHoursSendsNotification(t *testing.T)
 	// Should not crash, and notification should be dispatched (not suppressed)
 	m.handleAlertResolved(alertID)
 }
+
+func TestMonitor_HandleAlertEscalated_QuietHoursSuppressesNotification(t *testing.T) {
+	t.Setenv("PULSE_DATA_DIR", t.TempDir())
+
+	requests := make(chan struct{}, 1)
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		requests <- struct{}{}
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer server.Close()
+
+	notifMgr := notifications.NewNotificationManager("https://pulse.local")
+	defer notifMgr.Stop()
+	notifMgr.SetGroupingWindow(0)
+	notifMgr.SetCooldown(0)
+	if err := notifMgr.UpdateAllowedPrivateCIDRs("127.0.0.1/32"); err != nil {
+		t.Fatalf("UpdateAllowedPrivateCIDRs: %v", err)
+	}
+	notifMgr.AddWebhook(notifications.WebhookConfig{
+		ID:      "quiet-hours-hook",
+		Name:    "quiet-hours",
+		URL:     server.URL,
+		Enabled: true,
+	})
+
+	mgr := alerts.NewManager()
+	cfg := mgr.GetConfig()
+	cfg.Schedule.Escalation.Levels = []alerts.EscalationLevel{{After: 1, Notify: "webhook"}}
+	cfg.Schedule.QuietHours = alerts.QuietHours{
+		Enabled:  true,
+		Start:    "00:00",
+		End:      "23:59",
+		Timezone: "UTC",
+		Days: map[string]bool{
+			"monday": true, "tuesday": true, "wednesday": true,
+			"thursday": true, "friday": true, "saturday": true, "sunday": true,
+		},
+		Suppress: alerts.QuietHoursSuppression{Offline: true},
+	}
+	mgr.UpdateConfig(cfg)
+
+	m := &Monitor{
+		notificationMgr: notifMgr,
+		alertManager:    mgr,
+	}
+
+	alert := &alerts.Alert{
+		ID:           "escalated-offline",
+		Type:         "connectivity",
+		Level:        alerts.AlertLevelCritical,
+		ResourceID:   "node/pve-1",
+		ResourceName: "pve-1",
+		Node:         "pve-1",
+		Instance:     "pve",
+		Message:      "Node offline",
+		StartTime:    time.Now(),
+	}
+
+	m.handleAlertEscalated(nil, alert, 1)
+
+	select {
+	case <-requests:
+		t.Fatal("expected quiet hours to suppress escalated notification delivery")
+	case <-time.After(500 * time.Millisecond):
+	}
+}
+
+func TestMonitor_HandleAlertEscalated_SendsNotificationWhenNotSuppressed(t *testing.T) {
+	t.Setenv("PULSE_DATA_DIR", t.TempDir())
+
+	requests := make(chan struct{}, 1)
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		requests <- struct{}{}
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer server.Close()
+
+	notifMgr := notifications.NewNotificationManager("https://pulse.local")
+	defer notifMgr.Stop()
+	notifMgr.SetGroupingWindow(0)
+	notifMgr.SetCooldown(0)
+	if err := notifMgr.UpdateAllowedPrivateCIDRs("127.0.0.1/32"); err != nil {
+		t.Fatalf("UpdateAllowedPrivateCIDRs: %v", err)
+	}
+	notifMgr.AddWebhook(notifications.WebhookConfig{
+		ID:      "normal-hook",
+		Name:    "normal",
+		URL:     server.URL,
+		Enabled: true,
+	})
+
+	mgr := alerts.NewManager()
+	cfg := mgr.GetConfig()
+	cfg.Schedule.Escalation.Levels = []alerts.EscalationLevel{{After: 1, Notify: "webhook"}}
+	cfg.Schedule.QuietHours.Enabled = false
+	mgr.UpdateConfig(cfg)
+
+	m := &Monitor{
+		notificationMgr: notifMgr,
+		alertManager:    mgr,
+	}
+
+	alert := &alerts.Alert{
+		ID:           "escalated-normal",
+		Type:         "connectivity",
+		Level:        alerts.AlertLevelCritical,
+		ResourceID:   "node/pve-1",
+		ResourceName: "pve-1",
+		Node:         "pve-1",
+		Instance:     "pve",
+		Message:      "Node offline",
+		StartTime:    time.Now(),
+	}
+
+	m.handleAlertEscalated(nil, alert, 1)
+
+	select {
+	case <-requests:
+	case <-time.After(2 * time.Second):
+		t.Fatal("expected escalated notification delivery")
+	}
+}
--- a/internal/monitoring/monitor_alerts.go
+++ b/internal/monitoring/monitor_alerts.go
@ -8,6 +8,7 @@ import (
 	"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
 	"github.com/rcourtman/pulse-go-rewrite/internal/mock"
 	"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
+	"github.com/rcourtman/pulse-go-rewrite/internal/websocket"
 	"github.com/rs/zerolog/log"
 )

@ -140,6 +141,52 @@ func (m *Monitor) handleAlertResolved(alertID string) {
 	}
 }

+func (m *Monitor) handleAlertEscalated(hub *websocket.Hub, alert *alerts.Alert, level int) {
+	if alert == nil || m.alertManager == nil {
+		return
+	}
+
+	log.Info().
+		Str("alertID", alert.ID).
+		Int("level", level).
+		Msg("Alert escalated")
+
+	config := m.alertManager.GetConfig()
+	if level <= 0 || level > len(config.Schedule.Escalation.Levels) {
+		return
+	}
+
+	if m.alertManager.ShouldSuppressNotification(alert) {
+		log.Info().
+			Str("alertID", alert.ID).
+			Int("level", level).
+			Msg("Escalated notification suppressed during quiet hours")
+		m.broadcastEscalatedAlert(hub, alert)
+		return
+	}
+
+	if m.notificationMgr != nil {
+		escalationLevel := config.Schedule.Escalation.Levels[level-1]
+		switch escalationLevel.Notify {
+		case "email":
+			if emailConfig := m.notificationMgr.GetEmailConfig(); emailConfig.Enabled {
+				m.notificationMgr.SendAlert(alert)
+			}
+		case "webhook":
+			for _, webhook := range m.notificationMgr.GetWebhooks() {
+				if webhook.Enabled {
+					m.notificationMgr.SendAlert(alert)
+					break
+				}
+			}
+		case "all":
+			m.notificationMgr.SendAlert(alert)
+		}
+	}
+
+	m.broadcastEscalatedAlert(hub, alert)
+}
+
 func (m *Monitor) handleAlertAcknowledged(alert *alerts.Alert, user string) {
 	if m.incidentStore == nil || alert == nil {
 		if alert == nil {