Port v5 alert lifecycle and linked agent overrides

This commit is contained in:
rcourtman 2026-04-15 12:18:57 +01:00
parent f94601e26f
commit b73dab2b64
12 changed files with 1201 additions and 106 deletions

View file

@ -82,6 +82,12 @@ runtime must migrate the active alert, history entry, acknowledgment record,
suppression/rate-limit/flapping tracking, and guest per-disk metric identity
to the current canonical state instead of reopening a duplicate alert or
resolving only the stale node-scoped identity.
That same guest-alert owner also has to retire per-disk guest alerts when the
guest stops, disk alerting is disabled, or the reported disk set changes.
Canonical guest disk identity is only valid while the guest still exposes that
disk resource under the current thresholds, so runtime cleanup must remove
stale `guestID-disk-*` state instead of leaving orphaned per-disk incidents in
active alerts, resolved history, or later UI projections.
That same alerts runtime also owns instance-scoped node display-name
resolution. Raw node names are not globally unique across configured
infrastructure instances, so cached node display names must key on instance +
@ -211,6 +217,11 @@ keep schedules such as `00:00` to `23:59` active through the full final
minute instead of expiring at `23:59:00`. Alert quiet-hours proofs should
control time through the alert manager clock hook instead of depending on wall
clock execution at whatever second the test runner happens to hit.
Quiet-hours suppression also applies to alert delivery lifecycle, not only the
initial raised notification. Resolved notifications must not fan out when the
alert was never notified or was already acknowledged, and monitoring-driven
escalation delivery must consult the same quiet-hours suppression path while
still letting canonical escalation state reach websocket consumers.
That schedule surface now also follows the same shell/runtime split as the
other feature tabs: `frontend-modern/src/features/alerts/tabs/ScheduleTab.tsx`
stays the render shell, while
@ -231,6 +242,21 @@ presentation now also route through
`frontend-modern/src/utils/alertIncidentPresentation.ts` instead of remaining
duplicated inline across the alerts page and overview timeline surfaces.
Poll-driven connectivity recovery is also part of canonical alert truth.
Resources that clear an offline alert from later healthy polls must require
repeated healthy confirmations before resolving that alert instead of clearing
on the first recovered sample; otherwise transient poll recovery reopens the
same regression as false "back online" notifications and missing downtime
signal. Nodes, PBS, and PMG require three healthy confirmations before
resolution, while storage requires two.
Host-agent threshold ownership now follows the linked resource model.
Explicit agent overrides still win, but when no host-agent override exists the
alerts runtime must inherit linked node or guest overrides for that agent so
metric and connectivity behavior match the logical machine the agent augments.
Persisted host alerts must carry enough linked-resource metadata for
reevaluation after threshold changes to honor that same inheritance rule.
Alert resource tables, grouped node headers, and alert override reconstruction
now route resource-backed names through the shared policy-aware alerts helper
so governed resources do not fall back to raw names when the thresholds editor

View file

@ -95,6 +95,12 @@ This subsystem now sits under the dedicated core monitoring runtime lane so
discovery, metrics-history correctness, and platform-specific runtime coverage
can be governed as first-class product work instead of staying diluted inside
architecture coherence.
That same monitoring boundary also owns the escalation callback bridge into the
alerts delivery layer. Monitor-owned escalation handling may still publish
canonical escalation state to websocket consumers, but notification fan-out
must defer quiet-hours and resolved-notification suppression policy to the
alerts manager instead of bypassing that shared routing contract when monitor
plumbs escalations outward.
That same monitoring owner now also governs monitored-system usage readiness
for commercial boundaries. A non-nil unified read-state is not sufficient when
provider-owned supplemental inventories such as TrueNAS or VMware are still

View file

@ -45,10 +45,12 @@ const (
// Cleanup intervals
const (
StaleTrackingThreshold = 24 * time.Hour
RateLimitCleanupWindow = 1 * time.Hour
alertsDirPerm = 0o700
alertsFilePerm = 0o600
StaleTrackingThreshold = 24 * time.Hour
RateLimitCleanupWindow = 1 * time.Hour
alertsDirPerm = 0o700
alertsFilePerm = 0o600
offlineRecoveryConfirmationsDefault = 3
offlineRecoveryConfirmationsStorage = 2
)
func normalizePoweredOffSeverity(level AlertLevel) AlertLevel {
@ -555,13 +557,14 @@ type Manager struct {
// Time threshold tracking
pendingAlerts map[string]time.Time // Track when thresholds were first exceeded
// Offline confirmation tracking
nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy)
offlineConfirmations map[string]int // Track consecutive offline counts for all resources
dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts
dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers
dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
dockerLastExitCode map[string]int // Track last exit code for OOM detection
dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay
nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy)
offlineConfirmations map[string]int // Track consecutive offline counts for all resources
offlineRecoveryConfirmations map[string]int // Track consecutive healthy confirmations before clearing poll-driven offline alerts
dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts
dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers
dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
dockerLastExitCode map[string]int // Track last exit code for OOM detection
dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay
// Stable identity tracking prevents update-delay resets when host IDs churn.
dockerUpdateFirstSeenByIdentity map[string]time.Time
// PMG quarantine growth tracking
@ -642,6 +645,7 @@ func NewManagerWithDataDir(dataDir string) *Manager {
pendingAlerts: make(map[string]time.Time),
nodeOfflineCount: make(map[string]int),
offlineConfirmations: make(map[string]int),
offlineRecoveryConfirmations: make(map[string]int),
dockerOfflineCount: make(map[string]int),
dockerStateConfirm: make(map[string]int),
dockerRestartTracking: make(map[string]*dockerRestartRecord),
@ -2046,10 +2050,7 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
// Overrides are keyed by raw host ID (without the "agent:" prefix
// that hostResourceID adds to the resource ID used in alert IDs).
rawHostID := stripHostResourcePrefix(resourceID)
thresholds := m.resolveResourceThresholds("agent", rawHostID)
thresholds := m.resolveHostAlertThresholdsNoLock(alert, resourceID)
if thresholds.Disabled {
alertsToResolve = append(alertsToResolve, alertID)
continue
@ -2507,6 +2508,28 @@ func (m *Manager) shouldSuppressNotification(alert *Alert) (bool, string) {
return false, ""
}
// ShouldSuppressNotification checks if a notification should be suppressed
// during quiet hours. This is used by paths that bypass dispatchAlert, such as escalation.
func (m *Manager) ShouldSuppressNotification(alert *Alert) bool {
if alert == nil {
return false
}
m.mu.RLock()
defer m.mu.RUnlock()
suppressed, reason := m.shouldSuppressNotification(alert)
if suppressed {
log.Debug().
Str("alertID", alert.ID).
Str("type", alert.Type).
Str("level", string(alert.Level)).
Str("quietHoursRule", reason).
Msg("Notification suppressed during quiet hours")
}
return suppressed
}
// ShouldSuppressResolvedNotification checks if a recovery notification should be suppressed
// during quiet hours. Recovery notifications follow the same quiet hours rules as their
// corresponding alerts - if the original alert would have been suppressed, so is the recovery.
@ -2518,6 +2541,22 @@ func (m *Manager) ShouldSuppressResolvedNotification(alert *Alert) bool {
m.mu.RLock()
defer m.mu.RUnlock()
if alert.Acknowledged {
log.Debug().
Str("alertID", alert.ID).
Str("type", alert.Type).
Msg("Recovery notification suppressed for acknowledged alert")
return true
}
if alert.LastNotified == nil {
log.Debug().
Str("alertID", alert.ID).
Str("type", alert.Type).
Msg("Recovery notification suppressed because firing notification was never sent")
return true
}
suppressed, reason := m.shouldSuppressNotification(alert)
if suppressed {
log.Debug().
@ -2698,23 +2737,7 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
m.clearGuestPoweredOffAlert(guestID, name)
}
// Clear all resource metric alerts (cpu, memory, disk, etc.) for non-running guests
m.mu.Lock()
alertsCleared := 0
for storageKey, alert := range m.activeAlerts {
alertID := effectiveAlertID(alert, storageKey)
// Only clear resource metric alerts, not powered-off alerts
if alert.ResourceID == guestID && alert.Type != "powered-off" {
m.clearAlertNoLock(alertID)
alertsCleared++
log.Debug().
Str("alertID", alertID).
Str("guest", name).
Str("status", status).
Msg("Cleared metric alert for non-running guest")
}
}
m.mu.Unlock()
alertsCleared := m.clearGuestMetricAlerts(guestID)
if alertsCleared > 0 {
log.Debug().
@ -2722,6 +2745,7 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
Str("status", status).
Int("alertsCleared", alertsCleared).
Msg("Cleared metric alerts for non-running guest")
m.saveActiveAlertsAsync("guest-not-running")
}
return
}
@ -2744,18 +2768,13 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
// If alerts are disabled for this guest, clear any existing alerts and return
if thresholds.Disabled {
m.mu.Lock()
for storageKey, alert := range m.activeAlerts {
alertID := effectiveAlertID(alert, storageKey)
if alert.ResourceID == guestID {
m.clearAlertNoLock(alertID)
log.Info().
Str("alertID", alertID).
Str("guest", name).
Msg("Cleared alert - guest has alerts disabled")
}
if alertsCleared := m.clearGuestMetricAlerts(guestID); alertsCleared > 0 {
log.Info().
Str("guest", name).
Int("alertsCleared", alertsCleared).
Msg("Cleared guest metric alerts because alerts are disabled")
m.saveActiveAlertsAsync("guest-alerts-disabled")
}
m.mu.Unlock()
return
}
@ -2789,7 +2808,8 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
}, thresholds, evalOpts)
if thresholds.Disk != nil && thresholds.Disk.Trigger > 0 && len(disks) > 0 {
seenDisks := make(map[string]struct{})
seenDiskKeys := make(map[string]struct{})
seenDiskResources := make(map[string]struct{})
for idx, disk := range disks {
if disk.Total <= 0 {
continue
@ -2816,12 +2836,13 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
}
// Avoid duplicate checks if two disks resolve to the same key
if _, exists := seenDisks[sanitizedKey]; exists {
if _, exists := seenDiskKeys[sanitizedKey]; exists {
continue
}
seenDisks[sanitizedKey] = struct{}{}
seenDiskKeys[sanitizedKey] = struct{}{}
perDiskResourceID := fmt.Sprintf("%s-disk-%s", guestID, sanitizedKey)
seenDiskResources[perDiskResourceID] = struct{}{}
message := fmt.Sprintf("%s disk (%s) at %.1f%%", guestType, label, disk.Usage)
log.Debug().
@ -2867,6 +2888,11 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
MonitorOnly: monitorOnly,
})
}
if cleared := m.cleanupGuestDiskAlerts(guestID, seenDiskResources); cleared > 0 {
m.saveActiveAlertsAsync("guest-disk-set-changed")
}
} else if cleared := m.cleanupGuestDiskAlerts(guestID, nil); cleared > 0 {
m.saveActiveAlertsAsync("guest-disk-alerts-cleared")
}
}
@ -3131,6 +3157,64 @@ func hostInstanceName(host models.Host) string {
return "Agent"
}
// resolveHostThresholdsNoLock resolves the effective thresholds for a host agent.
// Explicit host-agent overrides win. Otherwise, linked node/guest overrides are
// inherited so the host agent follows the logical resource it augments.
// Callers must hold m.mu when reading config through this helper.
func (m *Manager) resolveHostThresholdsNoLock(hostID, linkedNodeID, linkedVMID, linkedContainerID string) ThresholdConfig {
base := m.defaultThresholdsForResourceType("agent")
if hostID = strings.TrimSpace(hostID); hostID != "" {
if override, exists := m.config.Overrides[hostID]; exists {
return m.applyThresholdOverride(base, override)
}
}
if linkedNodeID = strings.TrimSpace(linkedNodeID); linkedNodeID != "" {
if override, exists := m.config.Overrides[linkedNodeID]; exists {
return m.applyThresholdOverride(base, override)
}
}
if linkedVMID = strings.TrimSpace(linkedVMID); linkedVMID != "" {
if override, exists := lookupGuestOverride(m.config.Overrides, nil, linkedVMID); exists {
return m.applyThresholdOverride(base, override)
}
}
if linkedContainerID = strings.TrimSpace(linkedContainerID); linkedContainerID != "" {
if override, exists := lookupGuestOverride(m.config.Overrides, nil, linkedContainerID); exists {
return m.applyThresholdOverride(base, override)
}
}
return base
}
// resolveHostAlertThresholdsNoLock resolves thresholds for persisted host-agent alerts.
// Alert metadata carries the link context needed to inherit node/guest overrides.
// Callers must hold m.mu when reading config through this helper.
func (m *Manager) resolveHostAlertThresholdsNoLock(alert *Alert, resourceID string) ThresholdConfig {
hostID := stripHostResourcePrefix(resourceID)
if idx := strings.Index(hostID, "/"); idx >= 0 {
hostID = hostID[:idx]
}
linkedNodeID := ""
linkedVMID := ""
linkedContainerID := ""
if alert != nil {
if metadataHostID := metadataStringValue(alert.Metadata, "hostId"); metadataHostID != "" {
hostID = metadataHostID
}
linkedNodeID = metadataStringValue(alert.Metadata, "linkedNodeId")
linkedVMID = metadataStringValue(alert.Metadata, "linkedVmId")
linkedContainerID = metadataStringValue(alert.Metadata, "linkedContainerId")
}
return m.resolveHostThresholdsNoLock(hostID, linkedNodeID, linkedVMID, linkedContainerID)
}
func sanitizeHostComponent(value string) string {
value = strings.TrimSpace(strings.ToLower(value))
if value == "" {
@ -3227,7 +3311,7 @@ func (m *Manager) CheckHost(host models.Host) {
m.mu.RLock()
alertsEnabled := m.config.Enabled
disableAllAgents := m.config.DisableAllAgents
thresholds := m.resolveResourceThresholds("agent", host.ID)
thresholds := m.resolveHostThresholdsNoLock(host.ID, host.LinkedNodeID, host.LinkedVMID, host.LinkedContainerID)
m.mu.RUnlock()
if !alertsEnabled {
@ -3267,6 +3351,15 @@ func (m *Manager) CheckHost(host models.Host) {
"agentVersion": host.AgentVersion,
"architecture": host.Architecture,
}
if linkedNodeID := strings.TrimSpace(host.LinkedNodeID); linkedNodeID != "" {
baseMetadata["linkedNodeId"] = linkedNodeID
}
if linkedVMID := strings.TrimSpace(host.LinkedVMID); linkedVMID != "" {
baseMetadata["linkedVmId"] = linkedVMID
}
if linkedContainerID := strings.TrimSpace(host.LinkedContainerID); linkedContainerID != "" {
baseMetadata["linkedContainerId"] = linkedContainerID
}
if len(host.Tags) > 0 {
baseMetadata["tags"] = append([]string(nil), host.Tags...)
}
@ -3590,7 +3683,7 @@ func (m *Manager) HandleHostOffline(host models.Host) {
return
}
disableHostsOffline := m.config.DisableAllAgentsOffline
thresholds := m.resolveResourceThresholds("agent", host.ID)
thresholds := m.resolveHostThresholdsNoLock(host.ID, host.LinkedNodeID, host.LinkedVMID, host.LinkedContainerID)
m.mu.RUnlock()
resourceKey := hostResourceID(host.ID)
@ -3644,13 +3737,16 @@ func (m *Manager) HandleHostOffline(host models.Host) {
Instance: instanceName,
Message: fmt.Sprintf("Host '%s' is offline", resourceName),
Metadata: map[string]interface{}{
"resourceType": "agent",
"hostId": host.ID,
"hostname": host.Hostname,
"displayName": host.DisplayName,
"platform": host.Platform,
"osName": host.OSName,
"osVersion": host.OSVersion,
"resourceType": "agent",
"hostId": host.ID,
"hostname": host.Hostname,
"displayName": host.DisplayName,
"platform": host.Platform,
"osName": host.OSName,
"osVersion": host.OSVersion,
"linkedNodeId": strings.TrimSpace(host.LinkedNodeID),
"linkedVmId": strings.TrimSpace(host.LinkedVMID),
"linkedContainerId": strings.TrimSpace(host.LinkedContainerID),
},
AddToRecent: true,
AddToHistory: true,
@ -3760,6 +3856,72 @@ func (m *Manager) clearHostDiskAlerts(hostID string) {
}
}
func (m *Manager) clearGuestMetricAlerts(guestID string, metrics ...string) int {
if guestID == "" {
return 0
}
allowedMetrics := make(map[string]struct{}, len(metrics))
for _, metric := range metrics {
metric = strings.TrimSpace(metric)
if metric == "" {
continue
}
allowedMetrics[metric] = struct{}{}
}
perDiskPrefix := fmt.Sprintf("%s-disk-", guestID)
m.mu.Lock()
defer m.mu.Unlock()
cleared := 0
for storageKey, alert := range m.activeAlerts {
if alert == nil || alert.Type == "powered-off" {
continue
}
if alert.ResourceID != guestID && !strings.HasPrefix(alert.ResourceID, perDiskPrefix) {
continue
}
if len(allowedMetrics) > 0 {
if _, ok := allowedMetrics[alert.Type]; !ok {
continue
}
}
m.clearAlertNoLock(storageKey)
cleared++
}
return cleared
}
func (m *Manager) cleanupGuestDiskAlerts(guestID string, seen map[string]struct{}) int {
if guestID == "" {
return 0
}
prefix := fmt.Sprintf("%s-disk-", guestID)
m.mu.Lock()
defer m.mu.Unlock()
cleared := 0
for storageKey, alert := range m.activeAlerts {
if alert == nil || !strings.HasPrefix(alert.ResourceID, prefix) {
continue
}
if seen != nil {
if _, exists := seen[alert.ResourceID]; exists {
continue
}
}
m.clearAlertNoLock(storageKey)
cleared++
}
return cleared
}
func (m *Manager) cleanupHostDiskAlerts(host models.Host, seen map[string]struct{}) {
if host.ID == "" {
return
@ -5502,9 +5664,19 @@ func (m *Manager) clearDockerContainerUpdateTracking(resourceID, trackingKey str
m.mu.Unlock()
}
func (m *Manager) touchDockerContainerUpdateAlert(alertID string) {
m.mu.Lock()
defer m.mu.Unlock()
if alert, exists := m.getActiveAlertNoLock(alertID); exists && alert != nil {
alert.LastSeen = time.Now()
}
}
// checkDockerContainerImageUpdate checks if an image update has been pending for too long
func (m *Manager) checkDockerContainerImageUpdate(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
alertID := fmt.Sprintf("docker-container-update-%s", resourceID)
canonicalAlertID := buildCanonicalStateID(resourceID, resourceID+"-image-update")
updateTrackingKey := dockerUpdateTrackingKey(host, container)
// Check if update detection is enabled
@ -5514,30 +5686,30 @@ func (m *Manager) checkDockerContainerImageUpdate(host models.DockerHost, contai
// Negative value means disabled
if delayHours < 0 {
m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update"))
m.clearAlert(canonicalAlertID)
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
return
}
// Check if this container has an update status reported
if container.UpdateStatus == nil {
// No update status - clear any tracking and alerts
m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update"))
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
// Missing update status means the condition is unknown, not resolved.
// Preserve any active alert and first-seen tracking until we see an affirmative clear.
m.touchDockerContainerUpdateAlert(canonicalAlertID)
return
}
// Check for errors in update detection (don't alert on errors)
if container.UpdateStatus.Error != "" {
// Update check failed - clear alert but keep tracking
m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update"))
// A failed update check cannot confirm the pending update has been resolved.
m.touchDockerContainerUpdateAlert(canonicalAlertID)
return
}
// Check if an update is available
if !container.UpdateStatus.UpdateAvailable {
// No update available - clear tracking and alert
m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-image-update"))
m.clearAlert(canonicalAlertID)
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
return
}
@ -7495,8 +7667,10 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
m.unregisterActiveAlertAliasNoLock(key, alert)
}
if exists {
delete(m.offlineRecoveryConfirmations, key)
delete(m.activeAlerts, key)
}
delete(m.offlineRecoveryConfirmations, alertID)
// NOTE: Don't delete ackState here - preserve it so if the same alert
// reappears (e.g., powered-off VM during backup), the acknowledgement
// is restored via preserveAlertState. ackState is cleaned up in Cleanup().
@ -7506,6 +7680,27 @@ func (m *Manager) removeActiveAlertNoLock(alertID string) {
}
}
func (m *Manager) confirmOfflineRecoveryNoLock(alertID string, required int) (int, bool) {
alertID = strings.TrimSpace(alertID)
if alertID == "" {
return 0, false
}
if required <= 1 {
delete(m.offlineRecoveryConfirmations, alertID)
return required, true
}
m.offlineRecoveryConfirmations[alertID]++
confirmations := m.offlineRecoveryConfirmations[alertID]
if confirmations < required {
return confirmations, false
}
delete(m.offlineRecoveryConfirmations, alertID)
return confirmations, true
}
// GetActiveAlerts returns all active alerts
func (m *Manager) GetActiveAlerts() []Alert {
if m == nil {
@ -7817,8 +8012,9 @@ func (m *Manager) OnAlertHistory(cb AlertCallback) {
}
}
// clearResourceOfflineAlert removes an offline alert when a resource comes back online.
func (m *Manager) clearResourceOfflineAlert(alertPrefix, resourceID, resourceName, host, resourceKind string) {
// clearResourceOfflineAlert removes an offline alert when a poll-driven resource
// stays healthy for enough consecutive polls to confirm recovery.
func (m *Manager) clearResourceOfflineAlert(resourceID, resourceName, host, resourceKind string, requiredRecoveryCount int) {
alertID := canonicalConnectivityStateID(resourceID)
m.mu.Lock()
@ -7836,6 +8032,17 @@ func (m *Manager) clearResourceOfflineAlert(alertPrefix, resourceID, resourceNam
// Check if offline alert exists
alert, exists := m.getActiveAlertNoLock(alertID)
if !exists {
delete(m.offlineRecoveryConfirmations, alertID)
return
}
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, requiredRecoveryCount)
if !confirmed {
log.Debug().
Str(strings.ToLower(resourceKind), resourceName).
Int("confirmations", recoveryCount).
Int("required", requiredRecoveryCount).
Msg(resourceKind + " appears back online, waiting for recovery confirmation")
return
}
@ -7864,6 +8071,10 @@ func (m *Manager) clearResourceOfflineAlert(alertPrefix, resourceID, resourceNam
func (m *Manager) checkNodeOffline(node models.Node) {
alertID := fmt.Sprintf("node-offline-%s", node.ID)
m.mu.Lock()
delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(node.ID))
m.mu.Unlock()
thresholds := m.resolveResourceThresholds("node", node.ID)
spec, err := buildCanonicalConnectivitySpec(node.ID, node.Name, unifiedresources.ResourceType("node"), AlertLevelCritical, 3, thresholds.Disabled || thresholds.DisableConnectivity)
if err != nil {
@ -7918,6 +8129,17 @@ func (m *Manager) clearNodeOfflineAlert(node models.Node) {
// Check if offline alert exists
alert, exists := m.getActiveAlertNoLock(alertID)
if !exists {
delete(m.offlineRecoveryConfirmations, alertID)
return
}
recoveryCount, confirmed := m.confirmOfflineRecoveryNoLock(alertID, offlineRecoveryConfirmationsDefault)
if !confirmed {
log.Debug().
Str("node", node.Name).
Int("confirmations", recoveryCount).
Int("required", offlineRecoveryConfirmationsDefault).
Msg("Node appears back online, waiting for recovery confirmation")
return
}
@ -7944,6 +8166,10 @@ func (m *Manager) clearNodeOfflineAlert(node models.Node) {
// checkPBSOffline creates an alert for offline PBS instances
func (m *Manager) checkPBSOffline(pbs models.PBSInstance) {
m.mu.Lock()
delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(pbs.ID))
m.mu.Unlock()
thresholds := m.resolveResourceThresholds("pbs", pbs.ID)
spec, err := buildCanonicalConnectivitySpec(pbs.ID, pbs.Name, unifiedresources.ResourceTypePBS, AlertLevelCritical, 3, thresholds.Disabled || thresholds.DisableConnectivity)
if err != nil {
@ -7979,11 +8205,15 @@ func (m *Manager) checkPBSOffline(pbs models.PBSInstance) {
// clearPBSOfflineAlert removes offline alert when PBS comes back online
func (m *Manager) clearPBSOfflineAlert(pbs models.PBSInstance) {
m.clearResourceOfflineAlert("pbs-offline", pbs.ID, pbs.Name, pbs.Host, "PBS")
m.clearResourceOfflineAlert(pbs.ID, pbs.Name, pbs.Host, "PBS", offlineRecoveryConfirmationsDefault)
}
// checkPMGOffline creates an alert for offline PMG instances
func (m *Manager) checkPMGOffline(pmg models.PMGInstance) {
m.mu.Lock()
delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(pmg.ID))
m.mu.Unlock()
m.mu.RLock()
override, hasOverride := m.config.Overrides[pmg.ID]
m.mu.RUnlock()
@ -8022,7 +8252,7 @@ func (m *Manager) checkPMGOffline(pmg models.PMGInstance) {
// clearPMGOfflineAlert removes offline alert when PMG comes back online
func (m *Manager) clearPMGOfflineAlert(pmg models.PMGInstance) {
m.clearResourceOfflineAlert("pmg-offline", pmg.ID, pmg.Name, pmg.Host, "PMG")
m.clearResourceOfflineAlert(pmg.ID, pmg.Name, pmg.Host, "PMG", offlineRecoveryConfirmationsDefault)
}
// checkPMGQueueDepths checks PMG mail queue depths and creates alerts
@ -8936,6 +9166,10 @@ func (m *Manager) checkAnomalyMetric(pmg models.PMGInstance, tracker *pmgAnomaly
func (m *Manager) checkStorageOffline(storage models.Storage) {
alertID := fmt.Sprintf("storage-offline-%s", storage.ID)
m.mu.Lock()
delete(m.offlineRecoveryConfirmations, canonicalConnectivityStateID(storage.ID))
m.mu.Unlock()
thresholds := m.resolveResourceThresholds("storage", storage.ID)
spec, err := buildCanonicalConnectivitySpec(storage.ID, storage.Name, unifiedresources.ResourceTypeStorage, AlertLevelWarning, 2, thresholds.Disabled || thresholds.DisableConnectivity)
if err != nil {
@ -8970,7 +9204,7 @@ func (m *Manager) checkStorageOffline(storage models.Storage) {
// clearStorageOfflineAlert removes offline alert when storage comes back online
func (m *Manager) clearStorageOfflineAlert(storage models.Storage) {
m.clearResourceOfflineAlert("storage-offline", storage.ID, storage.Name, storage.Node, "Storage")
m.clearResourceOfflineAlert(storage.ID, storage.Name, storage.Node, "Storage", offlineRecoveryConfirmationsStorage)
}
// checkGuestPoweredOff creates an alert for powered-off guests
@ -10095,6 +10329,7 @@ func (m *Manager) ClearActiveAlerts() {
m.alertRateLimit = make(map[string][]time.Time)
m.nodeOfflineCount = make(map[string]int)
m.offlineConfirmations = make(map[string]int)
m.offlineRecoveryConfirmations = make(map[string]int)
m.dockerOfflineCount = make(map[string]int)
m.dockerStateConfirm = make(map[string]int)
m.dockerRestartTracking = make(map[string]*dockerRestartRecord)
@ -10215,6 +10450,13 @@ func (m *Manager) cleanupStaleMaps() {
}
}
for alertID := range m.offlineRecoveryConfirmations {
if !m.hasActiveAlertNoLock(alertID) {
delete(m.offlineRecoveryConfirmations, alertID)
cleaned++
}
}
// Clean up node offline counts (legacy)
for nodeID := range m.nodeOfflineCount {
hasRelatedAlert := false

View file

@ -5097,6 +5097,18 @@ func TestClearStorageOfflineAlert(t *testing.T) {
resolvedCh <- id
})
m.clearStorageOfflineAlert(storage)
m.mu.RLock()
_, existsAfterFirst := testLookupActiveAlert(t, m, alertID)
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[canonicalConnectivityStateID(storage.ID)]
m.mu.RUnlock()
if !existsAfterFirst {
t.Fatal("expected storage alert to remain active until recovery confirmations are satisfied")
}
if recoveryCountAfterFirst != 1 {
t.Fatalf("expected first storage recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
}
m.clearStorageOfflineAlert(storage)
m.mu.RLock()
@ -6787,6 +6799,28 @@ func TestHandleHostOffline(t *testing.T) {
}
})
t.Run("linked node override DisableConnectivity clears alert and returns", func(t *testing.T) {
m := newTestManager(t)
m.config.Enabled = true
m.config.Overrides = map[string]ThresholdConfig{
"cluster-node1": {DisableConnectivity: true},
}
alertID := canonicalConnectivityStateID(hostResourceID("host1"))
m.activeAlerts[alertID] = &Alert{ID: alertID, Type: "host-offline"}
m.offlineConfirmations[hostResourceID("host1")] = 5
host := models.Host{ID: "host1", Hostname: "test-host", LinkedNodeID: "cluster-node1"}
m.HandleHostOffline(host)
if testHasActiveAlert(t, m, alertID) {
t.Error("expected alert to be cleared")
}
if _, exists := m.offlineConfirmations[hostResourceID("host1")]; exists {
t.Error("expected offlineConfirmations to be cleared")
}
})
t.Run("override Disabled clears alert and returns", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
@ -8118,6 +8152,30 @@ func TestClearNodeOfflineAlert(t *testing.T) {
m.mu.Unlock()
node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}
m.clearNodeOfflineAlert(node)
m.mu.RLock()
_, existsAfterFirst := testLookupActiveAlert(t, m, state)
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
m.mu.RUnlock()
if !existsAfterFirst {
t.Fatal("expected node alert to remain active until recovery confirmations are satisfied")
}
if recoveryCountAfterFirst != 1 {
t.Fatalf("expected first node recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
}
m.clearNodeOfflineAlert(node)
m.mu.RLock()
_, existsAfterSecond := testLookupActiveAlert(t, m, state)
recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state]
m.mu.RUnlock()
if !existsAfterSecond {
t.Fatal("expected node alert to remain active after second recovery confirmation")
}
if recoveryCountAfterSecond != 2 {
t.Fatalf("expected second node recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
}
m.clearNodeOfflineAlert(node)
m.mu.RLock()
@ -8174,7 +8232,10 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
m.mu.Unlock()
},
clearFn: func(m *Manager) {
m.clearNodeOfflineAlert(models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"})
node := models.Node{ID: "node1", Name: "Node 1", Instance: "pve1"}
m.clearNodeOfflineAlert(node)
m.clearNodeOfflineAlert(node)
m.clearNodeOfflineAlert(node)
},
},
{
@ -8189,7 +8250,10 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
m.mu.Unlock()
},
clearFn: func(m *Manager) {
m.clearPBSOfflineAlert(models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "host1"})
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "host1"}
m.clearPBSOfflineAlert(pbs)
m.clearPBSOfflineAlert(pbs)
m.clearPBSOfflineAlert(pbs)
},
},
{
@ -8204,7 +8268,10 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
m.mu.Unlock()
},
clearFn: func(m *Manager) {
m.clearPMGOfflineAlert(models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "host1"})
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "host1"}
m.clearPMGOfflineAlert(pmg)
m.clearPMGOfflineAlert(pmg)
m.clearPMGOfflineAlert(pmg)
},
},
{
@ -8219,7 +8286,9 @@ func TestClearOfflineAlertNoDeadlock(t *testing.T) {
m.mu.Unlock()
},
clearFn: func(m *Manager) {
m.clearStorageOfflineAlert(models.Storage{ID: "stor1", Name: "Storage 1", Node: "node1"})
storage := models.Storage{ID: "stor1", Name: "Storage 1", Node: "node1"}
m.clearStorageOfflineAlert(storage)
m.clearStorageOfflineAlert(storage)
},
},
{
@ -8319,6 +8388,30 @@ func TestClearPBSOfflineAlert(t *testing.T) {
m.mu.Unlock()
pbs := models.PBSInstance{ID: "pbs1", Name: "PBS 1", Host: "pbs.local"}
m.clearPBSOfflineAlert(pbs)
m.mu.RLock()
_, existsAfterFirst := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pbs1")]
m.mu.RUnlock()
if !existsAfterFirst {
t.Fatal("expected PBS alert to remain active until recovery confirmations are satisfied")
}
if recoveryCountAfterFirst != 1 {
t.Fatalf("expected first PBS recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
}
m.clearPBSOfflineAlert(pbs)
m.mu.RLock()
_, existsAfterSecond := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
recoveryCountAfterSecond := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pbs1")]
m.mu.RUnlock()
if !existsAfterSecond {
t.Fatal("expected PBS alert to remain active after second recovery confirmation")
}
if recoveryCountAfterSecond != 2 {
t.Fatalf("expected second PBS recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
}
m.clearPBSOfflineAlert(pbs)
m.mu.RLock()
@ -8402,6 +8495,30 @@ func TestClearPMGOfflineAlert(t *testing.T) {
m.mu.Unlock()
pmg := models.PMGInstance{ID: "pmg1", Name: "PMG 1", Host: "pmg.local"}
m.clearPMGOfflineAlert(pmg)
m.mu.RLock()
_, existsAfterFirst := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pmg1")]
m.mu.RUnlock()
if !existsAfterFirst {
t.Fatal("expected PMG alert to remain active until recovery confirmations are satisfied")
}
if recoveryCountAfterFirst != 1 {
t.Fatalf("expected first PMG recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
}
m.clearPMGOfflineAlert(pmg)
m.mu.RLock()
_, existsAfterSecond := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
recoveryCountAfterSecond := m.offlineRecoveryConfirmations[canonicalConnectivityStateID("pmg1")]
m.mu.RUnlock()
if !existsAfterSecond {
t.Fatal("expected PMG alert to remain active after second recovery confirmation")
}
if recoveryCountAfterSecond != 2 {
t.Fatalf("expected second PMG recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
}
m.clearPMGOfflineAlert(pmg)
m.mu.RLock()
@ -13217,6 +13334,7 @@ func TestCheckNode(t *testing.T) {
t.Run("online node clears offline alert", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
state := canonicalConnectivityStateID("node1")
// Pre-create offline alert
m.mu.Lock()
@ -13235,11 +13353,36 @@ func TestCheckNode(t *testing.T) {
Status: "online",
ConnectionHealth: "connected",
}
m.CheckNode(node)
m.mu.RLock()
_, alertExistsAfterFirst := testLookupActiveAlert(t, m, "node-offline-node1")
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
m.mu.RUnlock()
if !alertExistsAfterFirst {
t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied")
}
if recoveryCountAfterFirst != 1 {
t.Fatalf("expected first recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
}
m.CheckNode(node)
m.mu.RLock()
_, alertExistsAfterSecond := testLookupActiveAlert(t, m, "node-offline-node1")
recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state]
m.mu.RUnlock()
if !alertExistsAfterSecond {
t.Fatal("expected offline alert to remain active after second recovery confirmation")
}
if recoveryCountAfterSecond != 2 {
t.Fatalf("expected second recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
}
m.CheckNode(node)
m.mu.RLock()
_, alertExists := testLookupActiveAlert(t, m, "node-offline-node1")
_, countExists := m.nodeOfflineCount["node1"]
_, recoveryExists := m.offlineRecoveryConfirmations[state]
m.mu.RUnlock()
if alertExists {
@ -13248,6 +13391,9 @@ func TestCheckNode(t *testing.T) {
if countExists {
t.Error("expected offline count to be cleared")
}
if recoveryExists {
t.Error("expected offline recovery confirmations to be cleared")
}
})
t.Run("online node triggers metric checks", func(t *testing.T) {
@ -14907,6 +15053,125 @@ func TestCheckHostComprehensive(t *testing.T) {
t.Error("expected tags in metadata")
}
})
t.Run("inherits linked node overrides for host agent metrics", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.config.TimeThresholds = map[string]int{}
m.config.AgentDefaults = ThresholdConfig{
Memory: &HysteresisThreshold{Trigger: 85.0, Clear: 80.0},
}
m.config.Overrides = map[string]ThresholdConfig{
"cluster-node1": {
Memory: &HysteresisThreshold{Trigger: 97.0, Clear: 92.0},
},
}
m.mu.Unlock()
host := models.Host{
ID: "host-node1",
DisplayName: "node1",
Hostname: "node1",
LinkedNodeID: "cluster-node1",
Memory: models.Memory{
Usage: 90.6,
Total: 1024,
Used: 928,
Free: 96,
},
Status: "online",
LastSeen: time.Now(),
}
m.CheckHost(host)
m.mu.RLock()
_, exists := m.activeAlerts[canonicalMetricStateID(hostResourceID(host.ID), "memory")]
m.mu.RUnlock()
if exists {
t.Fatal("expected linked node override to suppress host-agent memory alert")
}
})
t.Run("inherits linked guest overrides for host agent metrics", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.config.TimeThresholds = map[string]int{}
m.config.AgentDefaults = ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0},
}
m.config.Overrides = map[string]ThresholdConfig{
"guest:Main:101": {
CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0},
},
}
m.mu.Unlock()
host := models.Host{
ID: "host-vm101",
DisplayName: "vm101",
Hostname: "vm101.local",
LinkedVMID: "Main:node3:101",
CPUUsage: 97.5,
Status: "online",
LastSeen: time.Now(),
}
m.CheckHost(host)
m.mu.RLock()
_, exists := m.activeAlerts[canonicalMetricStateID(hostResourceID(host.ID), "cpu")]
m.mu.RUnlock()
if exists {
t.Fatal("expected linked guest override to suppress host-agent cpu alert")
}
})
t.Run("prefers explicit host overrides over linked resource overrides", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.config.TimeThresholds = map[string]int{}
m.config.AgentDefaults = ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0},
}
m.config.Overrides = map[string]ThresholdConfig{
"guest:Main:101": {
CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0},
},
"host-vm101": {
CPU: &HysteresisThreshold{Trigger: 90.0, Clear: 85.0},
},
}
m.mu.Unlock()
host := models.Host{
ID: "host-vm101",
DisplayName: "vm101",
Hostname: "vm101.local",
LinkedVMID: "Main:node3:101",
CPUUsage: 97.5,
Status: "online",
LastSeen: time.Now(),
}
m.CheckHost(host)
m.mu.RLock()
alert := m.activeAlerts[canonicalMetricStateID(hostResourceID(host.ID), "cpu")]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected explicit host override to take precedence and trigger alert")
}
})
}
func TestCheckPBSComprehensive(t *testing.T) {
@ -15332,6 +15597,7 @@ func TestCheckPBSComprehensive(t *testing.T) {
t.Run("clears offline alert when back online", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
state := canonicalConnectivityStateID("pbs1")
m.mu.Lock()
m.activeAlerts["pbs-offline-pbs1"] = &Alert{ID: "pbs-offline-pbs1", Type: "connectivity"}
@ -15345,11 +15611,36 @@ func TestCheckPBSComprehensive(t *testing.T) {
ConnectionHealth: "healthy",
}
m.CheckPBS(pbs)
m.mu.RLock()
_, offlineExistsAfterFirst := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
m.mu.RUnlock()
if !offlineExistsAfterFirst {
t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied")
}
if recoveryCountAfterFirst != 1 {
t.Fatalf("expected first PBS recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
}
m.CheckPBS(pbs)
m.mu.RLock()
_, offlineExistsAfterSecond := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state]
m.mu.RUnlock()
if !offlineExistsAfterSecond {
t.Fatal("expected offline alert to remain active after second recovery confirmation")
}
if recoveryCountAfterSecond != 2 {
t.Fatalf("expected second PBS recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
}
m.CheckPBS(pbs)
m.mu.RLock()
_, offlineExists := testLookupActiveAlert(t, m, "pbs-offline-pbs1")
_, confirmExists := m.offlineConfirmations["pbs1"]
_, recoveryExists := m.offlineRecoveryConfirmations[state]
m.mu.RUnlock()
if offlineExists {
@ -15358,6 +15649,9 @@ func TestCheckPBSComprehensive(t *testing.T) {
if confirmExists {
t.Error("expected offline confirmation to be cleared")
}
if recoveryExists {
t.Error("expected offline recovery confirmations to be cleared")
}
})
}
@ -15664,6 +15958,7 @@ func TestCheckPMGComprehensive(t *testing.T) {
t.Run("clears offline alert when back online", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
state := canonicalConnectivityStateID("pmg1")
m.mu.Lock()
m.activeAlerts["pmg-offline-pmg1"] = &Alert{ID: "pmg-offline-pmg1", Type: "connectivity"}
@ -15677,11 +15972,36 @@ func TestCheckPMGComprehensive(t *testing.T) {
ConnectionHealth: "healthy",
}
m.CheckPMG(pmg)
m.mu.RLock()
_, offlineExistsAfterFirst := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
m.mu.RUnlock()
if !offlineExistsAfterFirst {
t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied")
}
if recoveryCountAfterFirst != 1 {
t.Fatalf("expected first PMG recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
}
m.CheckPMG(pmg)
m.mu.RLock()
_, offlineExistsAfterSecond := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
recoveryCountAfterSecond := m.offlineRecoveryConfirmations[state]
m.mu.RUnlock()
if !offlineExistsAfterSecond {
t.Fatal("expected offline alert to remain active after second recovery confirmation")
}
if recoveryCountAfterSecond != 2 {
t.Fatalf("expected second PMG recovery confirmation to be tracked, got %d", recoveryCountAfterSecond)
}
m.CheckPMG(pmg)
m.mu.RLock()
_, offlineExists := testLookupActiveAlert(t, m, "pmg-offline-pmg1")
_, confirmExists := m.offlineConfirmations["pmg1"]
_, recoveryExists := m.offlineRecoveryConfirmations[state]
m.mu.RUnlock()
if offlineExists {
@ -15690,6 +16010,9 @@ func TestCheckPMGComprehensive(t *testing.T) {
if confirmExists {
t.Error("expected offline confirmation to be cleared")
}
if recoveryExists {
t.Error("expected offline recovery confirmations to be cleared")
}
})
t.Run("skips metrics when PMG is offline", func(t *testing.T) {
@ -16008,6 +16331,7 @@ func TestCheckStorageComprehensive(t *testing.T) {
t.Run("clears offline alert when back online", func(t *testing.T) {
// t.Parallel()
m := newTestManager(t)
state := canonicalConnectivityStateID("storage1")
m.mu.Lock()
m.activeAlerts["storage-offline-storage1"] = &Alert{ID: "storage-offline-storage1", Type: "connectivity"}
@ -16020,11 +16344,24 @@ func TestCheckStorageComprehensive(t *testing.T) {
Status: "active",
}
m.CheckStorage(storage)
m.mu.RLock()
_, offlineExistsAfterFirst := testLookupActiveAlert(t, m, "storage-offline-storage1")
recoveryCountAfterFirst := m.offlineRecoveryConfirmations[state]
m.mu.RUnlock()
if !offlineExistsAfterFirst {
t.Fatal("expected offline alert to remain active until recovery confirmations are satisfied")
}
if recoveryCountAfterFirst != 1 {
t.Fatalf("expected first storage recovery confirmation to be tracked, got %d", recoveryCountAfterFirst)
}
m.CheckStorage(storage)
m.mu.RLock()
_, offlineExists := testLookupActiveAlert(t, m, "storage-offline-storage1")
_, confirmExists := m.offlineConfirmations["storage1"]
_, recoveryExists := m.offlineRecoveryConfirmations[state]
m.mu.RUnlock()
if offlineExists {
@ -16033,6 +16370,9 @@ func TestCheckStorageComprehensive(t *testing.T) {
if confirmExists {
t.Error("expected offline confirmation to be cleared")
}
if recoveryExists {
t.Error("expected offline recovery confirmations to be cleared")
}
})
t.Run("skips usage check when usage is zero", func(t *testing.T) {

View file

@ -79,6 +79,40 @@ func TestShouldSuppressNotificationQuietHours(t *testing.T) {
t.Fatalf("expected storage alert suppression, got suppressed=%t reason=%q", suppressed, reason)
}
})
t.Run("public suppression helper mirrors quiet hours policy", func(t *testing.T) {
m := newManagerWithQuietHoursSuppress(QuietHoursSuppression{Offline: true})
alert := &Alert{ID: "offline-public", Type: "connectivity", Level: AlertLevelCritical}
if !m.ShouldSuppressNotification(alert) {
t.Fatal("expected public quiet-hours suppression helper to return true")
}
})
t.Run("resolved notifications are suppressed for acknowledged alerts", func(t *testing.T) {
m := newManagerWithQuietHoursSuppress(QuietHoursSuppression{})
alert := &Alert{
ID: "resolved-ack",
Type: "cpu",
Level: AlertLevelCritical,
Acknowledged: true,
LastNotified: ptrTime(time.Now().Add(-time.Minute)),
}
if !m.ShouldSuppressResolvedNotification(alert) {
t.Fatal("expected recovery notification to be suppressed for acknowledged alert")
}
})
t.Run("resolved notifications are suppressed when the firing alert was never notified", func(t *testing.T) {
m := newManagerWithQuietHoursSuppress(QuietHoursSuppression{})
alert := &Alert{
ID: "resolved-unnotified",
Type: "cpu",
Level: AlertLevelCritical,
}
if !m.ShouldSuppressResolvedNotification(alert) {
t.Fatal("expected recovery notification to be suppressed when LastNotified is nil")
}
})
}
func TestIsInQuietHours(t *testing.T) {

View file

@ -156,6 +156,65 @@ func TestReevaluateActiveAlertsWithOverride(t *testing.T) {
}
}
func TestReevaluateActiveAlertsWithLinkedHostOverride(t *testing.T) {
manager := NewManager()
manager.mu.Lock()
manager.activeAlerts = make(map[string]*Alert)
manager.mu.Unlock()
initialConfig := AlertConfig{
Enabled: true,
AgentDefaults: ThresholdConfig{
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
},
Overrides: make(map[string]ThresholdConfig),
}
manager.UpdateConfig(initialConfig)
resourceID := hostResourceID("host-node1")
alertID := canonicalMetricStateID(resourceID, "memory")
alert := &Alert{
ID: alertID,
Type: "memory",
Level: AlertLevelWarning,
ResourceID: resourceID,
ResourceName: "node1",
Node: "node1",
Instance: "linux",
Message: "Host memory at 90.6%",
Value: 90.6,
Threshold: 85.0,
StartTime: time.Now().Add(-5 * time.Minute),
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"resourceType": "agent",
"hostId": "host-node1",
"linkedNodeId": "cluster-node1",
},
}
manager.mu.Lock()
manager.activeAlerts[alertID] = alert
manager.mu.Unlock()
updatedConfig := initialConfig
updatedConfig.Overrides["cluster-node1"] = ThresholdConfig{
Memory: &HysteresisThreshold{Trigger: 97, Clear: 92},
}
manager.UpdateConfig(updatedConfig)
time.Sleep(100 * time.Millisecond)
manager.mu.RLock()
_, alertStillActive := manager.activeAlerts[alertID]
manager.mu.RUnlock()
if alertStillActive {
t.Errorf("expected linked host alert to be resolved after linked node override increase")
}
}
func TestReevaluateActiveAlertsUsesStableClusterGuestOverrideAcrossNodeMove(t *testing.T) {
manager := NewManager()

View file

@ -369,6 +369,114 @@ func TestCheckGuestPerDiskAnnotatesCanonicalSpecMetadata(t *testing.T) {
}
}
func TestCheckGuestPerDiskCleansUpRemovedDiskAlerts(t *testing.T) {
m := newTestManager(t)
configureUnifiedEvalManager(t, m, unifiedEvalBaseConfig())
guestID := BuildGuestKey("pve1", "node1", 101)
baseGuest := models.VM{
ID: guestID,
VMID: 101,
Name: "app01",
Node: "node1",
Instance: "pve1",
Status: "running",
CPU: 0.20,
Memory: models.Memory{Usage: 40},
Disk: models.Disk{Usage: 40},
}
withDisk := baseGuest
withDisk.Disks = []models.Disk{
{
Mountpoint: "/",
Device: "scsi0",
Usage: 95,
Total: 100,
Used: 95,
Free: 5,
},
}
m.CheckGuest(withDisk, "pve1")
originalAlertID := canonicalMetricStateID(guestID+"-disk-scsi0", "disk")
if activeAlert(t, m, originalAlertID) == nil {
t.Fatalf("expected guest disk alert %q", originalAlertID)
}
changedDisk := baseGuest
changedDisk.Disks = []models.Disk{
{
Mountpoint: "/data",
Device: "scsi1",
Usage: 96,
Total: 100,
Used: 96,
Free: 4,
},
}
m.CheckGuest(changedDisk, "pve1")
newAlertID := canonicalMetricStateID(guestID+"-disk-data-scsi1", "disk")
m.mu.RLock()
_, oldExists := testLookupActiveAlert(t, m, originalAlertID)
_, newExists := testLookupActiveAlert(t, m, newAlertID)
m.mu.RUnlock()
if oldExists {
t.Fatalf("expected stale guest disk alert %q to be cleared", originalAlertID)
}
if !newExists {
t.Fatalf("expected replacement guest disk alert %q", newAlertID)
}
}
func TestCheckGuestClearsPerDiskAlertsWhenGuestStops(t *testing.T) {
m := newTestManager(t)
configureUnifiedEvalManager(t, m, unifiedEvalBaseConfig())
guestID := BuildGuestKey("pve1", "node1", 101)
guest := models.VM{
ID: guestID,
VMID: 101,
Name: "app01",
Node: "node1",
Instance: "pve1",
Status: "running",
CPU: 0.20,
Memory: models.Memory{Usage: 40},
Disk: models.Disk{Usage: 40},
Disks: []models.Disk{
{
Mountpoint: "/",
Device: "scsi0",
Usage: 95,
Total: 100,
Used: 95,
Free: 5,
},
},
}
m.CheckGuest(guest, "pve1")
alertID := canonicalMetricStateID(guestID+"-disk-scsi0", "disk")
if activeAlert(t, m, alertID) == nil {
t.Fatalf("expected guest disk alert %q", alertID)
}
guest.Status = "stopped"
guest.Disks = nil
m.CheckGuest(guest, "pve1")
m.mu.RLock()
_, exists := testLookupActiveAlert(t, m, alertID)
m.mu.RUnlock()
if exists {
t.Fatalf("expected guest disk alert %q to clear when guest stops", alertID)
}
}
func TestCheckNodeTemperatureAnnotatesCanonicalSpecMetadata(t *testing.T) {
m := newTestManager(t)
configureUnifiedEvalManager(t, m, unifiedEvalBaseConfig())

View file

@ -221,6 +221,122 @@ func TestCheckDockerContainerImageUpdate(t *testing.T) {
}
})
t.Run("missing update status preserves existing alert and tracking", func(t *testing.T) {
testResourceID := "docker:" + hostID + "/container-unknown-status"
container := models.DockerContainer{
ID: "container-unknown-status",
Name: "unknown-status-container",
Image: "nginx:latest",
UpdateStatus: &models.DockerContainerUpdateStatus{
UpdateAvailable: true,
CurrentDigest: "sha256:old",
LatestDigest: "sha256:new",
LastChecked: time.Now(),
},
}
trackingKey := dockerUpdateTrackingKey(host, container)
firstSeen := time.Now().Add(-25 * time.Hour)
m.mu.Lock()
m.dockerUpdateFirstSeen[testResourceID] = firstSeen
m.dockerUpdateFirstSeenByIdentity[trackingKey] = firstSeen
m.mu.Unlock()
m.checkDockerContainerImageUpdate(host, container, testResourceID, "unknown-status-container", instanceName, nodeName)
canonicalAlertID := buildCanonicalStateID(testResourceID, testResourceID+"-image-update")
m.mu.RLock()
alert := testRequireActiveAlert(t, m, canonicalAlertID)
if alert == nil {
m.mu.RUnlock()
t.Fatalf("expected update alert %q to be active", canonicalAlertID)
}
beforeLastSeen := alert.LastSeen
m.mu.RUnlock()
time.Sleep(10 * time.Millisecond)
container.UpdateStatus = nil
m.checkDockerContainerImageUpdate(host, container, testResourceID, "unknown-status-container", instanceName, nodeName)
m.mu.RLock()
alert = testRequireActiveAlert(t, m, canonicalAlertID)
if alert == nil {
m.mu.RUnlock()
t.Fatalf("expected update alert %q to remain active", canonicalAlertID)
}
_, tracked := m.dockerUpdateFirstSeen[testResourceID]
_, trackedByIdentity := m.dockerUpdateFirstSeenByIdentity[trackingKey]
afterLastSeen := alert.LastSeen
m.mu.RUnlock()
if !tracked || !trackedByIdentity {
t.Fatal("expected update tracking to remain when update status is temporarily unavailable")
}
if !afterLastSeen.After(beforeLastSeen) {
t.Fatalf("expected LastSeen to refresh when update status is missing, before=%s after=%s", beforeLastSeen, afterLastSeen)
}
})
t.Run("update detection errors preserve existing alert and tracking", func(t *testing.T) {
testResourceID := "docker:" + hostID + "/container-error-status"
container := models.DockerContainer{
ID: "container-error-status",
Name: "error-status-container",
Image: "nginx:latest",
UpdateStatus: &models.DockerContainerUpdateStatus{
UpdateAvailable: true,
CurrentDigest: "sha256:old",
LatestDigest: "sha256:new",
LastChecked: time.Now(),
},
}
trackingKey := dockerUpdateTrackingKey(host, container)
firstSeen := time.Now().Add(-25 * time.Hour)
m.mu.Lock()
m.dockerUpdateFirstSeen[testResourceID] = firstSeen
m.dockerUpdateFirstSeenByIdentity[trackingKey] = firstSeen
m.mu.Unlock()
m.checkDockerContainerImageUpdate(host, container, testResourceID, "error-status-container", instanceName, nodeName)
canonicalAlertID := buildCanonicalStateID(testResourceID, testResourceID+"-image-update")
m.mu.RLock()
alert := testRequireActiveAlert(t, m, canonicalAlertID)
if alert == nil {
m.mu.RUnlock()
t.Fatalf("expected update alert %q to be active", canonicalAlertID)
}
beforeLastSeen := alert.LastSeen
m.mu.RUnlock()
time.Sleep(10 * time.Millisecond)
container.UpdateStatus = &models.DockerContainerUpdateStatus{
UpdateAvailable: true,
Error: "rate limited",
LastChecked: time.Now(),
}
m.checkDockerContainerImageUpdate(host, container, testResourceID, "error-status-container", instanceName, nodeName)
m.mu.RLock()
alert = testRequireActiveAlert(t, m, canonicalAlertID)
if alert == nil {
m.mu.RUnlock()
t.Fatalf("expected update alert %q to remain active after check error", canonicalAlertID)
}
_, tracked := m.dockerUpdateFirstSeen[testResourceID]
_, trackedByIdentity := m.dockerUpdateFirstSeenByIdentity[trackingKey]
afterLastSeen := alert.LastSeen
m.mu.RUnlock()
if !tracked || !trackedByIdentity {
t.Fatal("expected update tracking to remain after update check error")
}
if !afterLastSeen.After(beforeLastSeen) {
t.Fatalf("expected LastSeen to refresh after update check error, before=%s after=%s", beforeLastSeen, afterLastSeen)
}
})
t.Run("disabled by negative delay hours", func(t *testing.T) {
// Create manager with disabled update alerts
m2 := NewManager()

View file

@ -91,6 +91,33 @@ func TestGetStateRefreshesLiveAlertSnapshots(t *testing.T) {
}
}
func TestEscalationDeliveryDefersToCanonicalAlertSuppression(t *testing.T) {
requiredSnippets := map[string][]string{
"monitor.go": {
"m.alertManager.SetEscalateCallback(func(alert *alerts.Alert, level int) {",
"m.handleAlertEscalated(wsHub, alert, level)",
},
"monitor_alerts.go": {
"func (m *Monitor) handleAlertEscalated(hub *websocket.Hub, alert *alerts.Alert, level int) {",
"if m.alertManager.ShouldSuppressNotification(alert) {",
"m.broadcastEscalatedAlert(hub, alert)",
},
}
for file, snippets := range requiredSnippets {
data, err := os.ReadFile(file)
if err != nil {
t.Fatalf("failed to read %s: %v", file, err)
}
source := string(data)
for _, snippet := range snippets {
if !strings.Contains(source, snippet) {
t.Fatalf("%s must contain %q", file, snippet)
}
}
}
}
func TestMonitoredSystemUsageReadinessGuardrailsRemainCanonical(t *testing.T) {
requiredSnippets := map[string][]string{
"monitor.go": {

View file

@ -1772,41 +1772,7 @@ func (m *Monitor) Start(ctx context.Context, wsHub *websocket.Hub) {
m.handleAlertUnacknowledged(alert, user)
})
m.alertManager.SetEscalateCallback(func(alert *alerts.Alert, level int) {
log.Info().
Str("alertID", alert.ID).
Int("level", level).
Msg("Alert escalated - sending notifications")
// Get escalation config
config := m.alertManager.GetConfig()
if level <= 0 || level > len(config.Schedule.Escalation.Levels) {
return
}
escalationLevel := config.Schedule.Escalation.Levels[level-1]
// Send notifications based on escalation level
switch escalationLevel.Notify {
case "email":
// Only send email
if emailConfig := m.notificationMgr.GetEmailConfig(); emailConfig.Enabled {
m.notificationMgr.SendAlert(alert)
}
case "webhook":
// Only send webhooks
for _, webhook := range m.notificationMgr.GetWebhooks() {
if webhook.Enabled {
m.notificationMgr.SendAlert(alert)
break
}
}
case "all":
// Send all notifications
m.notificationMgr.SendAlert(alert)
}
// Update WebSocket with escalation
m.broadcastEscalatedAlert(wsHub, alert)
m.handleAlertEscalated(wsHub, alert, level)
})
// Create separate tickers for polling and broadcasting using the configured cadence

View file

@ -1,6 +1,8 @@
package monitoring
import (
"net/http"
"net/http/httptest"
"sync/atomic"
"testing"
"time"
@ -276,3 +278,125 @@ func TestMonitor_HandleAlertResolved_NoQuietHoursSendsNotification(t *testing.T)
// Should not crash, and notification should be dispatched (not suppressed)
m.handleAlertResolved(alertID)
}
func TestMonitor_HandleAlertEscalated_QuietHoursSuppressesNotification(t *testing.T) {
t.Setenv("PULSE_DATA_DIR", t.TempDir())
requests := make(chan struct{}, 1)
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requests <- struct{}{}
w.WriteHeader(http.StatusOK)
}))
defer server.Close()
notifMgr := notifications.NewNotificationManager("https://pulse.local")
defer notifMgr.Stop()
notifMgr.SetGroupingWindow(0)
notifMgr.SetCooldown(0)
if err := notifMgr.UpdateAllowedPrivateCIDRs("127.0.0.1/32"); err != nil {
t.Fatalf("UpdateAllowedPrivateCIDRs: %v", err)
}
notifMgr.AddWebhook(notifications.WebhookConfig{
ID: "quiet-hours-hook",
Name: "quiet-hours",
URL: server.URL,
Enabled: true,
})
mgr := alerts.NewManager()
cfg := mgr.GetConfig()
cfg.Schedule.Escalation.Levels = []alerts.EscalationLevel{{After: 1, Notify: "webhook"}}
cfg.Schedule.QuietHours = alerts.QuietHours{
Enabled: true,
Start: "00:00",
End: "23:59",
Timezone: "UTC",
Days: map[string]bool{
"monday": true, "tuesday": true, "wednesday": true,
"thursday": true, "friday": true, "saturday": true, "sunday": true,
},
Suppress: alerts.QuietHoursSuppression{Offline: true},
}
mgr.UpdateConfig(cfg)
m := &Monitor{
notificationMgr: notifMgr,
alertManager: mgr,
}
alert := &alerts.Alert{
ID: "escalated-offline",
Type: "connectivity",
Level: alerts.AlertLevelCritical,
ResourceID: "node/pve-1",
ResourceName: "pve-1",
Node: "pve-1",
Instance: "pve",
Message: "Node offline",
StartTime: time.Now(),
}
m.handleAlertEscalated(nil, alert, 1)
select {
case <-requests:
t.Fatal("expected quiet hours to suppress escalated notification delivery")
case <-time.After(500 * time.Millisecond):
}
}
func TestMonitor_HandleAlertEscalated_SendsNotificationWhenNotSuppressed(t *testing.T) {
t.Setenv("PULSE_DATA_DIR", t.TempDir())
requests := make(chan struct{}, 1)
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requests <- struct{}{}
w.WriteHeader(http.StatusOK)
}))
defer server.Close()
notifMgr := notifications.NewNotificationManager("https://pulse.local")
defer notifMgr.Stop()
notifMgr.SetGroupingWindow(0)
notifMgr.SetCooldown(0)
if err := notifMgr.UpdateAllowedPrivateCIDRs("127.0.0.1/32"); err != nil {
t.Fatalf("UpdateAllowedPrivateCIDRs: %v", err)
}
notifMgr.AddWebhook(notifications.WebhookConfig{
ID: "normal-hook",
Name: "normal",
URL: server.URL,
Enabled: true,
})
mgr := alerts.NewManager()
cfg := mgr.GetConfig()
cfg.Schedule.Escalation.Levels = []alerts.EscalationLevel{{After: 1, Notify: "webhook"}}
cfg.Schedule.QuietHours.Enabled = false
mgr.UpdateConfig(cfg)
m := &Monitor{
notificationMgr: notifMgr,
alertManager: mgr,
}
alert := &alerts.Alert{
ID: "escalated-normal",
Type: "connectivity",
Level: alerts.AlertLevelCritical,
ResourceID: "node/pve-1",
ResourceName: "pve-1",
Node: "pve-1",
Instance: "pve",
Message: "Node offline",
StartTime: time.Now(),
}
m.handleAlertEscalated(nil, alert, 1)
select {
case <-requests:
case <-time.After(2 * time.Second):
t.Fatal("expected escalated notification delivery")
}
}

View file

@ -8,6 +8,7 @@ import (
"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
"github.com/rcourtman/pulse-go-rewrite/internal/mock"
"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
"github.com/rcourtman/pulse-go-rewrite/internal/websocket"
"github.com/rs/zerolog/log"
)
@ -140,6 +141,52 @@ func (m *Monitor) handleAlertResolved(alertID string) {
}
}
func (m *Monitor) handleAlertEscalated(hub *websocket.Hub, alert *alerts.Alert, level int) {
if alert == nil || m.alertManager == nil {
return
}
log.Info().
Str("alertID", alert.ID).
Int("level", level).
Msg("Alert escalated")
config := m.alertManager.GetConfig()
if level <= 0 || level > len(config.Schedule.Escalation.Levels) {
return
}
if m.alertManager.ShouldSuppressNotification(alert) {
log.Info().
Str("alertID", alert.ID).
Int("level", level).
Msg("Escalated notification suppressed during quiet hours")
m.broadcastEscalatedAlert(hub, alert)
return
}
if m.notificationMgr != nil {
escalationLevel := config.Schedule.Escalation.Levels[level-1]
switch escalationLevel.Notify {
case "email":
if emailConfig := m.notificationMgr.GetEmailConfig(); emailConfig.Enabled {
m.notificationMgr.SendAlert(alert)
}
case "webhook":
for _, webhook := range m.notificationMgr.GetWebhooks() {
if webhook.Enabled {
m.notificationMgr.SendAlert(alert)
break
}
}
case "all":
m.notificationMgr.SendAlert(alert)
}
}
m.broadcastEscalatedAlert(hub, alert)
}
func (m *Monitor) handleAlertAcknowledged(alert *alerts.Alert, user string) {
if m.incidentStore == nil || alert == nil {
if alert == nil {