// Pulse/internal/alerts/alerts.go

package alerts
import (
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"sync"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/internal/utils"
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
"github.com/rs/zerolog/log"
)
// AlertLevel represents the severity of an alert
type AlertLevel string
const (
AlertLevelWarning AlertLevel = "warning"
AlertLevelCritical AlertLevel = "critical"
)
// ActivationState represents the alert notification activation state
type ActivationState string
const (
ActivationPending ActivationState = "pending_review"
ActivationActive ActivationState = "active"
ActivationSnoozed ActivationState = "snoozed"
)
// Default thresholds and configuration values
const (
// Default threshold values
DefaultCPUTrigger = 80.0
DefaultCPUClear = 75.0
DefaultMemoryTrigger = 85.0
DefaultMemoryClear = 80.0
DefaultDiskTrigger = 90.0
DefaultDiskClear = 85.0
DefaultStorageTrigger = 85.0
DefaultStorageClear = 80.0
DefaultTempTrigger = 80.0
DefaultTempClear = 75.0
// Time thresholds
DefaultDelaySeconds = 5
DefaultSuppressionWindow = 5 // minutes
// Alert management
DefaultMinimumDelta = 2.0 // minimum % change to trigger new alert
DefaultHysteresisMargin = 5.0 // % margin between trigger and clear
DefaultObservationWindow = 24 // hours
// Flapping detection
DefaultFlappingWindow = 300 // seconds (5 minutes)
DefaultFlappingThreshold = 5 // state changes to trigger flapping
DefaultFlappingCooldown = 15 // minutes
// Confirmation counts for transient state detection
RequiredOfflineConfirmations = 3
RequiredStateConfirmations = 2
// Cleanup intervals
StaleTrackingThreshold = 24 * time.Hour
RateLimitCleanupWindow = 1 * time.Hour
)
func normalizePoweredOffSeverity(level AlertLevel) AlertLevel {
switch strings.ToLower(string(level)) {
case string(AlertLevelCritical):
return AlertLevelCritical
default:
return AlertLevelWarning
}
}
// Alert represents an active alert
type Alert struct {
ID string `json:"id"`
Type string `json:"type"` // cpu, memory, disk, etc.
Level AlertLevel `json:"level"`
ResourceID string `json:"resourceId"` // guest or node ID
ResourceName string `json:"resourceName"`
Node string `json:"node"`
NodeDisplayName string `json:"nodeDisplayName,omitempty"`
Instance string `json:"instance"`
Message string `json:"message"`
Value float64 `json:"value"`
Threshold float64 `json:"threshold"`
StartTime time.Time `json:"startTime"`
LastSeen time.Time `json:"lastSeen"`
Acknowledged bool `json:"acknowledged"`
AckTime *time.Time `json:"ackTime,omitempty"`
AckUser string `json:"ackUser,omitempty"`
Metadata map[string]interface{} `json:"metadata,omitempty"`
// Notification tracking
LastNotified *time.Time `json:"lastNotified,omitempty"` // Last time notification was sent
// Escalation tracking
LastEscalation int `json:"lastEscalation,omitempty"` // Last escalation level notified
EscalationTimes []time.Time `json:"escalationTimes,omitempty"` // Times when escalations were sent
}
// Clone returns a deep copy of the alert so it can be safely shared across goroutines.
func (a *Alert) Clone() *Alert {
if a == nil {
return nil
}
clone := *a
if a.AckTime != nil {
t := *a.AckTime
clone.AckTime = &t
}
if a.LastNotified != nil {
t := *a.LastNotified
clone.LastNotified = &t
}
if len(a.EscalationTimes) > 0 {
clone.EscalationTimes = append([]time.Time(nil), a.EscalationTimes...)
}
if a.Metadata != nil {
clone.Metadata = cloneMetadata(a.Metadata)
}
return &clone
}
func cloneMetadata(src map[string]interface{}) map[string]interface{} {
if src == nil {
return nil
}
dst := make(map[string]interface{}, len(src))
for k, v := range src {
dst[k] = cloneMetadataValue(v)
}
return dst
}
func cloneMetadataValue(val interface{}) interface{} {
switch v := val.(type) {
case map[string]interface{}:
return cloneMetadata(v)
case map[string]string:
m := make(map[string]interface{}, len(v))
for key, value := range v {
m[key] = value
}
return m
case []interface{}:
arr := make([]interface{}, len(v))
for i, elem := range v {
arr[i] = cloneMetadataValue(elem)
}
return arr
case []string:
arr := make([]string, len(v))
copy(arr, v)
return arr
case []int:
arr := make([]int, len(v))
copy(arr, v)
return arr
case []float64:
arr := make([]float64, len(v))
copy(arr, v)
return arr
default:
return v
}
}
// ResolvedAlert represents a recently resolved alert
type ResolvedAlert struct {
*Alert
ResolvedTime time.Time `json:"resolvedTime"`
}
// HysteresisThreshold represents a threshold with hysteresis
type HysteresisThreshold struct {
Trigger float64 `json:"trigger"` // Threshold to trigger alert
Clear float64 `json:"clear"` // Threshold to clear alert
}
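
// Example (illustrative): with Trigger=80 and Clear=75, an alert is raised once the
// metric reaches the trigger level and is only resolved after it falls back to the
// clear level. The gap between the two values is what keeps a metric hovering around
// a single threshold from repeatedly firing and clearing the same alert.
//
//	cpu := HysteresisThreshold{Trigger: 80, Clear: 75}
//	// raise while value >= cpu.Trigger, resolve once value <= cpu.Clear
//	// (the exact comparisons live in the threshold checks elsewhere in this package)
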
// ThresholdConfig represents threshold configuration
type ThresholdConfig struct {
Disabled bool `json:"disabled,omitempty"` // Completely disable alerts for this resource
DisableConnectivity bool `json:"disableConnectivity,omitempty"` // Disable node offline/connectivity/powered-off alerts
PoweredOffSeverity AlertLevel `json:"poweredOffSeverity,omitempty"` // Severity for powered-off alerts
CPU *HysteresisThreshold `json:"cpu,omitempty"`
Memory *HysteresisThreshold `json:"memory,omitempty"`
Disk *HysteresisThreshold `json:"disk,omitempty"`
DiskRead *HysteresisThreshold `json:"diskRead,omitempty"`
DiskWrite *HysteresisThreshold `json:"diskWrite,omitempty"`
NetworkIn *HysteresisThreshold `json:"networkIn,omitempty"`
NetworkOut *HysteresisThreshold `json:"networkOut,omitempty"`
Usage *HysteresisThreshold `json:"usage,omitempty"` // For storage devices
Temperature *HysteresisThreshold `json:"temperature,omitempty"` // For node CPU temperature
DiskTemperature *HysteresisThreshold `json:"diskTemperature,omitempty"` // For host SMART temperatures
Backup *BackupAlertConfig `json:"backup,omitempty"`
Snapshot *SnapshotAlertConfig `json:"snapshot,omitempty"`
Note *string `json:"note,omitempty"`
// Legacy thresholds for backwards compatibility
CPULegacy *float64 `json:"cpuLegacy,omitempty"`
MemoryLegacy *float64 `json:"memoryLegacy,omitempty"`
DiskLegacy *float64 `json:"diskLegacy,omitempty"`
DiskReadLegacy *float64 `json:"diskReadLegacy,omitempty"`
DiskWriteLegacy *float64 `json:"diskWriteLegacy,omitempty"`
NetworkInLegacy *float64 `json:"networkInLegacy,omitempty"`
NetworkOutLegacy *float64 `json:"networkOutLegacy,omitempty"`
}
// QuietHours represents quiet hours configuration
type QuietHours struct {
Enabled bool `json:"enabled"`
Start string `json:"start"` // 24-hour format "HH:MM"
End string `json:"end"` // 24-hour format "HH:MM"
Timezone string `json:"timezone"`
Days map[string]bool `json:"days"` // monday, tuesday, etc.
Suppress QuietHoursSuppression `json:"suppress"`
}
// QuietHoursSuppression controls which alert categories are silenced during quiet hours.
type QuietHoursSuppression struct {
Performance bool `json:"performance"`
Storage bool `json:"storage"`
Offline bool `json:"offline"`
}
// EscalationLevel represents an escalation rule
type EscalationLevel struct {
After int `json:"after"` // minutes after initial alert
Notify string `json:"notify"` // "email", "webhook", or "all"
}
// EscalationConfig represents alert escalation configuration
type EscalationConfig struct {
Enabled bool `json:"enabled"`
Levels []EscalationLevel `json:"levels"`
}
// GroupingConfig represents alert grouping configuration
type GroupingConfig struct {
Enabled bool `json:"enabled"`
Window int `json:"window"` // seconds
ByNode bool `json:"byNode"` // Group alerts by node
ByGuest bool `json:"byGuest"` // Group alerts by guest type
}
// ScheduleConfig represents alerting schedule configuration
type ScheduleConfig struct {
QuietHours QuietHours `json:"quietHours"`
Cooldown int `json:"cooldown"` // minutes
GroupingWindow int `json:"groupingWindow,omitempty"` // Deprecated: use Grouping.Window instead. Will be auto-migrated on config update.
MaxAlertsHour int `json:"maxAlertsHour"` // max alerts per hour per resource
NotifyOnResolve bool `json:"notifyOnResolve"` // Send notification when alert clears
Escalation EscalationConfig `json:"escalation"`
Grouping GroupingConfig `json:"grouping"`
}
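
// Example (illustrative): a ScheduleConfig that silences performance alerts overnight
// on weekdays and notifies when alerts resolve. The values below are arbitrary; the
// shipped defaults are set in NewManagerWithDataDir.
//
//	schedule := ScheduleConfig{
//		QuietHours: QuietHours{
//			Enabled:  true,
//			Start:    "22:00",
//			End:      "08:00",
//			Timezone: "Europe/London",
//			Days:     map[string]bool{"monday": true, "tuesday": true, "wednesday": true, "thursday": true, "friday": true},
//			Suppress: QuietHoursSuppression{Performance: true},
//		},
//		Cooldown:        5,
//		MaxAlertsHour:   10,
//		NotifyOnResolve: true,
//	}
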
// FilterCondition represents a single filter condition
type FilterCondition struct {
Type string `json:"type"` // "metric", "text", or "raw"
Field string `json:"field,omitempty"`
Operator string `json:"operator,omitempty"`
Value interface{} `json:"value,omitempty"`
RawText string `json:"rawText,omitempty"`
}
// FilterStack represents a collection of filters with logical operator
type FilterStack struct {
Filters []FilterCondition `json:"filters"`
LogicalOperator string `json:"logicalOperator"` // "AND" or "OR"
}
// CustomAlertRule represents a custom alert rule with filter conditions
type CustomAlertRule struct {
ID string `json:"id"`
Name string `json:"name"`
Description string `json:"description,omitempty"`
FilterConditions FilterStack `json:"filterConditions"`
Thresholds ThresholdConfig `json:"thresholds"`
Priority int `json:"priority"`
Enabled bool `json:"enabled"`
Notifications struct {
Email *struct {
Enabled bool `json:"enabled"`
Recipients []string `json:"recipients"`
} `json:"email,omitempty"`
Webhook *struct {
Enabled bool `json:"enabled"`
URL string `json:"url"`
} `json:"webhook,omitempty"`
} `json:"notifications"`
CreatedAt time.Time `json:"createdAt"`
UpdatedAt time.Time `json:"updatedAt"`
}
// DockerThresholdConfig represents Docker-specific alert thresholds
type DockerThresholdConfig struct {
CPU HysteresisThreshold `json:"cpu"` // CPU usage % threshold (default: 80%)
Memory HysteresisThreshold `json:"memory"` // Memory usage % threshold (default: 85%)
Disk HysteresisThreshold `json:"disk"` // Writable layer usage % threshold (default: 85%)
RestartCount int `json:"restartCount"` // Number of restarts to trigger alert (default: 3)
RestartWindow int `json:"restartWindow"` // Time window in seconds for restart loop detection (default: 300 = 5min)
MemoryWarnPct int `json:"memoryWarnPct"` // Memory limit % to trigger warning (default: 90)
MemoryCriticalPct int `json:"memoryCriticalPct"` // Memory limit % to trigger critical (default: 95)
ServiceWarnGapPct int `json:"serviceWarnGapPercent"` // % of desired tasks missing to trigger warning (default: 10)
ServiceCritGapPct int `json:"serviceCriticalGapPercent"` // % of desired tasks missing to trigger critical (default: 50)
StateDisableConnectivity bool `json:"stateDisableConnectivity,omitempty"` // Disable container offline/state alerts globally
StatePoweredOffSeverity AlertLevel `json:"statePoweredOffSeverity,omitempty"` // Default severity for container state/offline alerts
UpdateAlertDelayHours int `json:"updateAlertDelayHours,omitempty"` // Hours to wait before alerting on available image updates (default: 24, -1 = disabled)
}
// PMGThresholdConfig represents Proxmox Mail Gateway-specific alert thresholds
type PMGThresholdConfig struct {
QueueTotalWarning int `json:"queueTotalWarning"` // Total queue depth warning threshold (default: 500)
QueueTotalCritical int `json:"queueTotalCritical"` // Total queue depth critical threshold (default: 1000)
OldestMessageWarnMins int `json:"oldestMessageWarnMins"` // Oldest queued message age warning in minutes (default: 30)
OldestMessageCritMins int `json:"oldestMessageCritMins"` // Oldest queued message age critical in minutes (default: 60)
DeferredQueueWarn int `json:"deferredQueueWarn"` // Deferred queue depth warning (default: 200)
DeferredQueueCritical int `json:"deferredQueueCritical"` // Deferred queue depth critical (default: 500)
HoldQueueWarn int `json:"holdQueueWarn"` // Hold queue depth warning (default: 100)
HoldQueueCritical int `json:"holdQueueCritical"` // Hold queue depth critical (default: 300)
QuarantineSpamWarn int `json:"quarantineSpamWarn"` // Spam quarantine absolute warning (default: 2000)
QuarantineSpamCritical int `json:"quarantineSpamCritical"` // Spam quarantine absolute critical (default: 5000)
QuarantineVirusWarn int `json:"quarantineVirusWarn"` // Virus quarantine absolute warning (default: 2000)
QuarantineVirusCritical int `json:"quarantineVirusCritical"` // Virus quarantine absolute critical (default: 5000)
QuarantineGrowthWarnPct int `json:"quarantineGrowthWarnPct"` // Growth % to trigger warning (default: 25)
QuarantineGrowthWarnMin int `json:"quarantineGrowthWarnMin"` // Minimum message growth for warning (default: 250)
QuarantineGrowthCritPct int `json:"quarantineGrowthCritPct"` // Growth % to trigger critical (default: 50)
QuarantineGrowthCritMin int `json:"quarantineGrowthCritMin"` // Minimum message growth for critical (default: 500)
}
// SnapshotAlertConfig represents snapshot age alert configuration
type SnapshotAlertConfig struct {
Enabled bool `json:"enabled"`
WarningDays int `json:"warningDays"`
CriticalDays int `json:"criticalDays"`
WarningSizeGiB float64 `json:"warningSizeGiB,omitempty"`
CriticalSizeGiB float64 `json:"criticalSizeGiB,omitempty"`
}
// BackupAlertConfig represents backup age alert configuration
type BackupAlertConfig struct {
Enabled bool `json:"enabled"`
WarningDays int `json:"warningDays"`
CriticalDays int `json:"criticalDays"`
// Indicator thresholds for the dashboard (separate from alert thresholds)
FreshHours int `json:"freshHours"` // Backups newer than this show as green (default: 24)
StaleHours int `json:"staleHours"` // Backups older than FreshHours but newer than this show as amber (default: 72)
// Global backup alert filters
AlertOrphaned *bool `json:"alertOrphaned,omitempty"` // Alert on backups that do not match a known guest (default: true)
IgnoreVMIDs []string `json:"ignoreVMIDs,omitempty"` // Skip alerts for matching VMIDs (supports prefix*)
}
// GuestLookup describes a guest identity used for snapshot/backup evaluations.
type GuestLookup struct {
ResourceID string
Name string
Instance string
Node string
Type string
VMID int
}
// AlertConfig represents the complete alert configuration
type AlertConfig struct {
Enabled bool `json:"enabled"`
ActivationState ActivationState `json:"activationState,omitempty"`
ObservationWindowHours int `json:"observationWindowHours,omitempty"`
ActivationTime *time.Time `json:"activationTime,omitempty"`
GuestDefaults ThresholdConfig `json:"guestDefaults"`
NodeDefaults ThresholdConfig `json:"nodeDefaults"`
HostDefaults ThresholdConfig `json:"hostDefaults"`
StorageDefault HysteresisThreshold `json:"storageDefault"`
DockerDefaults DockerThresholdConfig `json:"dockerDefaults"`
DockerIgnoredContainerPrefixes []string `json:"dockerIgnoredContainerPrefixes,omitempty"`
IgnoredGuestPrefixes []string `json:"ignoredGuestPrefixes,omitempty"`
GuestTagWhitelist []string `json:"guestTagWhitelist,omitempty"`
GuestTagBlacklist []string `json:"guestTagBlacklist,omitempty"`
PMGDefaults PMGThresholdConfig `json:"pmgDefaults"`
PBSDefaults ThresholdConfig `json:"pbsDefaults"`
SnapshotDefaults SnapshotAlertConfig `json:"snapshotDefaults"`
BackupDefaults BackupAlertConfig `json:"backupDefaults"`
Overrides map[string]ThresholdConfig `json:"overrides"` // keyed by resource ID
CustomRules []CustomAlertRule `json:"customRules,omitempty"`
Schedule ScheduleConfig `json:"schedule"`
// Global disable flags per resource type
DisableAllNodes bool `json:"disableAllNodes"` // Disable all alerts for Proxmox nodes
DisableAllGuests bool `json:"disableAllGuests"` // Disable all alerts for VMs/containers
DisableAllHosts bool `json:"disableAllHosts"` // Disable all alerts for Pulse host agents
DisableAllStorage bool `json:"disableAllStorage"` // Disable all alerts for storage
DisableAllPBS bool `json:"disableAllPBS"` // Disable all alerts for PBS servers
DisableAllPMG bool `json:"disableAllPMG"` // Disable all alerts for PMG instances
DisableAllDockerHosts bool `json:"disableAllDockerHosts"` // Disable all alerts for Docker hosts
DisableAllDockerContainers bool `json:"disableAllDockerContainers"` // Disable all alerts for Docker containers
DisableAllDockerServices bool `json:"disableAllDockerServices"` // Disable all alerts for Docker services
DisableAllNodesOffline bool `json:"disableAllNodesOffline"` // Disable node offline/connectivity alerts globally
DisableAllGuestsOffline bool `json:"disableAllGuestsOffline"` // Disable guest powered-off alerts globally
DisableAllHostsOffline bool `json:"disableAllHostsOffline"` // Disable host agent offline alerts globally
DisableAllPBSOffline bool `json:"disableAllPBSOffline"` // Disable PBS offline alerts globally
DisableAllPMGOffline bool `json:"disableAllPMGOffline"` // Disable PMG offline alerts globally
DisableAllDockerHostsOffline bool `json:"disableAllDockerHostsOffline"` // Disable Docker host offline alerts globally
// New configuration options
MinimumDelta float64 `json:"minimumDelta"` // Minimum % change to trigger new alert
SuppressionWindow int `json:"suppressionWindow"` // Minutes to suppress duplicate alerts
HysteresisMargin float64 `json:"hysteresisMargin"` // Default margin for legacy thresholds
TimeThreshold int `json:"timeThreshold"` // Legacy: Seconds that threshold must be exceeded before triggering
TimeThresholds map[string]int `json:"timeThresholds"` // Per-type delays: guest, node, storage, pbs
MetricTimeThresholds map[string]map[string]int `json:"metricTimeThresholds"` // Optional per-metric delays keyed by resource type
// Alert TTL and auto-cleanup
MaxAlertAgeDays int `json:"maxAlertAgeDays"` // Maximum age for alerts before auto-cleanup (0 = disabled)
MaxAcknowledgedAgeDays int `json:"maxAcknowledgedAgeDays"` // Maximum age for acknowledged alerts (0 = disabled)
AutoAcknowledgeAfterHours int `json:"autoAcknowledgeAfterHours"` // Auto-acknowledge alerts after X hours (0 = disabled)
// Flapping detection
FlappingEnabled bool `json:"flappingEnabled"` // Enable flapping detection
FlappingWindowSeconds int `json:"flappingWindowSeconds"` // Time window for counting state changes
FlappingThreshold int `json:"flappingThreshold"` // Number of state changes to trigger flapping
FlappingCooldownMinutes int `json:"flappingCooldownMinutes"` // Cooldown period after flapping detected
}
// pmgQuarantineSnapshot stores quarantine counts at a point in time for growth detection
type pmgQuarantineSnapshot struct {
Spam int
Virus int
Timestamp time.Time
}
// pmgMailMetricSample stores a single hourly mail count sample
type pmgMailMetricSample struct {
SpamIn float64
SpamOut float64
VirusIn float64
VirusOut float64
Timestamp time.Time
}
// pmgBaselineCache stores calculated baseline values for a metric
type pmgBaselineCache struct {
TrimmedMean float64
Median float64
LastUpdated time.Time
}
// pmgAnomalyTracker tracks history and baselines for anomaly detection
type pmgAnomalyTracker struct {
Samples []pmgMailMetricSample // Ring buffer (max 48 samples)
Baselines map[string]pmgBaselineCache // Cached baselines per metric (spamIn, spamOut, virusIn, virusOut)
LastSampleTime time.Time // Timestamp of most recent sample
SampleCount int // Total samples collected (for warmup check)
}
// Manager handles alert monitoring and state
//
// Lock Ordering Documentation:
// The Manager uses two mutexes to prevent deadlocks:
// 1. m.mu (primary lock) - protects most manager state
// 2. m.resolvedMutex - protects only recentlyResolved map
//
// Lock Ordering Rules:
// - NEVER hold m.mu when acquiring resolvedMutex
// - ALWAYS release m.mu before acquiring resolvedMutex
// - resolvedMutex can be held independently without m.mu
// - When both locks are needed, acquire m.mu first, then release it before acquiring resolvedMutex
//
// This ordering prevents deadlock scenarios where different goroutines acquire locks in different orders.
// Metric hooks for integrating with Prometheus
var (
recordAlertFired func(*Alert)
recordAlertResolved func(*Alert)
recordAlertSuppressed func(string)
recordAlertAcknowledged func()
)
// SetMetricHooks registers callbacks for recording alert metrics.
// - fired: called when an alert is dispatched (in dispatchAlert)
// - resolved: called when an alert is cleared (in clearAlertNoLock)
// - suppressed: called when an alert is suppressed due to flapping
// - acknowledged: called when an alert is acknowledged
func SetMetricHooks(fired func(*Alert), resolved func(*Alert), suppressed func(string), acknowledged func()) {
recordAlertFired = fired
recordAlertResolved = resolved
recordAlertSuppressed = suppressed
recordAlertAcknowledged = acknowledged
}
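
// Example (illustrative): wiring the hooks to Prometheus counters. The metric names
// below are hypothetical; any backend with the same shape works.
//
//	firedTotal := prometheus.NewCounterVec(
//		prometheus.CounterOpts{Name: "pulse_alerts_fired_total"}, []string{"type", "level"})
//	suppressedTotal := prometheus.NewCounterVec(
//		prometheus.CounterOpts{Name: "pulse_alerts_suppressed_total"}, []string{"reason"})
//	alerts.SetMetricHooks(
//		func(a *alerts.Alert) { firedTotal.WithLabelValues(a.Type, string(a.Level)).Inc() },
//		func(a *alerts.Alert) { /* resolved */ },
//		func(reason string) { suppressedTotal.WithLabelValues(reason).Inc() },
//		func() { /* acknowledged */ },
//	)
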
type Manager struct {
mu sync.RWMutex
config AlertConfig
activeAlerts map[string]*Alert
historyManager *HistoryManager
onAlert func(alert *Alert)
onResolved func(alertID string)
onAcknowledged func(alert *Alert, user string)
onUnacknowledged func(alert *Alert, user string)
onEscalate func(alert *Alert, level int)
onAlertForAI func(alert *Alert) // AI analysis callback - bypasses notification suppression
escalationStop chan struct{}
alertRateLimit map[string][]time.Time // Track alert times for rate limiting
// New fields for deduplication and suppression
recentAlerts map[string]*Alert // Track recent alerts for deduplication
suppressedUntil map[string]time.Time // Track suppression windows
// Recently resolved alerts (kept for 5 minutes)
recentlyResolved map[string]*ResolvedAlert
resolvedMutex sync.RWMutex // Secondary lock - see Lock Ordering Documentation above
// Time threshold tracking
pendingAlerts map[string]time.Time // Track when thresholds were first exceeded
// Offline confirmation tracking
nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy)
offlineConfirmations map[string]int // Track consecutive offline counts for all resources
dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts
dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers
dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
dockerLastExitCode map[string]int // Track last exit code for OOM detection
dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay
// Stable identity tracking prevents update-delay resets when host IDs churn.
dockerUpdateFirstSeenByIdentity map[string]time.Time
// PMG quarantine growth tracking
pmgQuarantineHistory map[string][]pmgQuarantineSnapshot // Track quarantine snapshots for growth detection
// PMG anomaly detection tracking
pmgAnomalyTrackers map[string]*pmgAnomalyTracker // Track mail metrics for anomaly detection per PMG instance
// Persistent acknowledgement state so quick alert rebuilds keep user acknowledgements
ackState map[string]ackRecord
// Flapping detection tracking
flappingHistory map[string][]time.Time // Track state change times for flapping detection
flappingActive map[string]bool // Track which alerts are currently in flapping state
// Cleanup control
cleanupStop chan struct{} // Signal to stop cleanup goroutine
// Host agent deduplication: track hostnames of active host agents
// When a host agent is running on a Proxmox node, we prefer the host agent
// alerts and suppress the node alerts to avoid duplicate monitoring.
hostAgentHostnames map[string]struct{} // Normalized hostnames (lowercase)
// Node display name caches. Proxmox nodes can share the same raw node name
// across multiple configured instances, so keep instance-scoped entries in
// addition to the legacy raw-name cache used by instance-less resources.
nodeDisplayNames map[string]string
instanceNodeDisplayNames map[string]string
// License checking for Pro-only alert features
hasProFeature func(feature string) bool
// Cached timezone for quiet hours
quietHoursLoc *time.Location
}
type ackRecord struct {
acknowledged bool
user string
time time.Time // When the alert was acknowledged
inactiveAt time.Time // When the alert was removed (zero if still active)
}
type dockerRestartRecord struct {
count int
lastCount int
times []time.Time // Track restart times for loop detection
lastChecked time.Time
}
// NewManager creates a new alert manager using the global data directory.
// For multi-tenant deployments, use NewManagerWithDataDir instead.
func NewManager() *Manager {
return NewManagerWithDataDir(utils.GetDataDir())
}
// NewManagerWithDataDir creates a new alert manager with a custom data directory.
// This enables tenant-scoped alert persistence in multi-tenant deployments.
func NewManagerWithDataDir(dataDir string) *Manager {
alertsDir := filepath.Join(dataDir, "alerts")
alertOrphaned := true
m := &Manager{
activeAlerts: make(map[string]*Alert),
historyManager: NewHistoryManager(alertsDir),
escalationStop: make(chan struct{}),
alertRateLimit: make(map[string][]time.Time),
recentAlerts: make(map[string]*Alert),
suppressedUntil: make(map[string]time.Time),
recentlyResolved: make(map[string]*ResolvedAlert),
pendingAlerts: make(map[string]time.Time),
nodeOfflineCount: make(map[string]int),
offlineConfirmations: make(map[string]int),
dockerOfflineCount: make(map[string]int),
dockerStateConfirm: make(map[string]int),
dockerRestartTracking: make(map[string]*dockerRestartRecord),
dockerLastExitCode: make(map[string]int),
dockerUpdateFirstSeen: make(map[string]time.Time),
dockerUpdateFirstSeenByIdentity: make(map[string]time.Time),
pmgQuarantineHistory: make(map[string][]pmgQuarantineSnapshot),
pmgAnomalyTrackers: make(map[string]*pmgAnomalyTracker),
ackState: make(map[string]ackRecord),
flappingHistory: make(map[string][]time.Time),
flappingActive: make(map[string]bool),
cleanupStop: make(chan struct{}),
hostAgentHostnames: make(map[string]struct{}),
nodeDisplayNames: make(map[string]string),
instanceNodeDisplayNames: make(map[string]string),
config: AlertConfig{
Enabled: true,
ActivationState: ActivationPending,
ObservationWindowHours: 24,
GuestDefaults: ThresholdConfig{
PoweredOffSeverity: AlertLevelWarning,
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
Disk: &HysteresisThreshold{Trigger: 90, Clear: 85},
DiskRead: &HysteresisThreshold{Trigger: 0, Clear: 0}, // Off by default
DiskWrite: &HysteresisThreshold{Trigger: 0, Clear: 0}, // Off by default
NetworkIn: &HysteresisThreshold{Trigger: 0, Clear: 0}, // Off by default
NetworkOut: &HysteresisThreshold{Trigger: 0, Clear: 0}, // Off by default
},
NodeDefaults: ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
Disk: &HysteresisThreshold{Trigger: 90, Clear: 85},
Temperature: &HysteresisThreshold{Trigger: 80, Clear: 75}, // Warning at 80°C, clear at 75°C
},
HostDefaults: ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
Disk: &HysteresisThreshold{Trigger: 90, Clear: 85},
DiskTemperature: &HysteresisThreshold{Trigger: 55, Clear: 50},
},
DockerDefaults: DockerThresholdConfig{
CPU: HysteresisThreshold{Trigger: 80, Clear: 75},
Memory: HysteresisThreshold{Trigger: 85, Clear: 80},
Disk: HysteresisThreshold{Trigger: 85, Clear: 80},
RestartCount: 3,
RestartWindow: 300, // 5 minutes
MemoryWarnPct: 90,
MemoryCriticalPct: 95,
StatePoweredOffSeverity: AlertLevelWarning,
},
PMGDefaults: PMGThresholdConfig{
QueueTotalWarning: 500, // Warning at 500 total queued messages
QueueTotalCritical: 1000, // Critical at 1000 total queued messages
OldestMessageWarnMins: 30, // Warning if oldest message is 30+ minutes old
OldestMessageCritMins: 60, // Critical if oldest message is 60+ minutes old
DeferredQueueWarn: 200, // Warning at 200 deferred messages
DeferredQueueCritical: 500, // Critical at 500 deferred messages
HoldQueueWarn: 100, // Warning at 100 held messages
HoldQueueCritical: 300, // Critical at 300 held messages
QuarantineSpamWarn: 2000, // Warning at 2000 spam quarantined
QuarantineSpamCritical: 5000, // Critical at 5000 spam quarantined
QuarantineVirusWarn: 2000, // Warning at 2000 virus quarantined
QuarantineVirusCritical: 5000, // Critical at 5000 virus quarantined
QuarantineGrowthWarnPct: 25, // Warning if growth ≥25%
QuarantineGrowthWarnMin: 250, // AND ≥250 messages
QuarantineGrowthCritPct: 50, // Critical if growth ≥50%
QuarantineGrowthCritMin: 500, // AND ≥500 messages
},
SnapshotDefaults: SnapshotAlertConfig{
Enabled: false,
WarningDays: 30,
CriticalDays: 45,
WarningSizeGiB: 0,
CriticalSizeGiB: 0,
},
BackupDefaults: BackupAlertConfig{
Enabled: false,
WarningDays: 7,
CriticalDays: 14,
FreshHours: 24,
StaleHours: 72,
AlertOrphaned: &alertOrphaned,
IgnoreVMIDs: []string{},
},
PBSDefaults: ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
},
StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
MinimumDelta: 2.0, // 2% minimum change
SuppressionWindow: 5, // 5 minutes
HysteresisMargin: 5.0, // 5% default margin
TimeThreshold: 5,
TimeThresholds: map[string]int{
"guest": 5,
"node": 5,
"storage": 5,
"pbs": 5,
},
Overrides: make(map[string]ThresholdConfig),
Schedule: ScheduleConfig{
QuietHours: QuietHours{
Enabled: false, // OFF - users should opt-in to quiet hours
Start: "22:00",
End: "08:00",
Timezone: "America/New_York",
Days: map[string]bool{
"monday": true,
"tuesday": true,
"wednesday": true,
"thursday": true,
"friday": true,
"saturday": false,
"sunday": false,
},
Suppress: QuietHoursSuppression{},
},
Cooldown: 5, // ON - 5 minutes prevents spam
MaxAlertsHour: 10, // ON - 10 alerts/hour prevents flooding
// Note: GroupingWindow is deprecated - use Grouping.Window instead
NotifyOnResolve: true,
Escalation: EscalationConfig{
Enabled: false, // OFF - requires user configuration
Levels: []EscalationLevel{
{After: 15, Notify: "email"},
{After: 30, Notify: "webhook"},
{After: 60, Notify: "all"},
},
},
Grouping: GroupingConfig{
Enabled: true, // ON - reduces notification noise
Window: 30, // 30 second window for grouping
ByNode: true, // Group by node for mass node issues
ByGuest: false, // Don't group by guest by default
},
},
// Alert TTL defaults
MaxAlertAgeDays: 7, // Auto-cleanup alerts older than 7 days
MaxAcknowledgedAgeDays: 1, // Auto-cleanup acknowledged alerts older than 1 day
AutoAcknowledgeAfterHours: 24, // Auto-acknowledge alerts after 24 hours
// Flapping detection defaults
FlappingEnabled: true, // Enable flapping detection
FlappingWindowSeconds: 300, // 5 minute window
FlappingThreshold: 5, // 5 state changes triggers flapping
FlappingCooldownMinutes: 15, // 15 minute cooldown
},
}
// Load saved active alerts
if err := m.LoadActiveAlerts(); err != nil {
log.Error().Err(err).Msg("Failed to load active alerts")
}
// Start escalation checker
go m.escalationChecker()
// Start periodic save of active alerts
go m.periodicSaveAlerts()
// Start periodic cleanup of stale tracking map entries
go m.trackingMapCleanup()
return m
}
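
// Example (illustrative): creating a tenant-scoped manager and wiring notification
// callbacks before feeding it state. The data directory path and callback bodies are
// placeholders.
//
//	mgr := alerts.NewManagerWithDataDir("/var/lib/pulse/tenants/acme")
//	mgr.SetAlertCallback(func(a *alerts.Alert) { /* dispatch notification */ })
//	mgr.SetResolvedCallback(func(alertID string) { /* notify resolution */ })
//	// UpdateConfig can then be called with a loaded AlertConfig; missing or invalid
//	// fields are filled in by the normalize* helpers further down in this file.
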
// SetLicenseChecker sets the function used to check Pro license features.
// This enables gating Pro-only alert features like update alerts.
func (m *Manager) SetLicenseChecker(checker func(feature string) bool) {
m.mu.Lock()
defer m.mu.Unlock()
m.hasProFeature = checker
}
// addRecentlyResolvedUnlocked records a resolved alert assuming the caller does not hold m.mu.
func (m *Manager) addRecentlyResolvedUnlocked(alertID string, resolved *ResolvedAlert) {
m.resolvedMutex.Lock()
m.recentlyResolved[alertID] = resolved
m.resolvedMutex.Unlock()
}
// addRecentlyResolvedWithPrimaryLock records a resolved alert while preserving the caller's
// ownership of m.mu. Callers must hold m.mu before invoking this helper.
func (m *Manager) addRecentlyResolvedWithPrimaryLock(alertID string, resolved *ResolvedAlert) {
m.mu.Unlock()
m.addRecentlyResolvedUnlocked(alertID, resolved)
m.mu.Lock()
}
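
// Example (illustrative): the lock-ordering rule in practice. A caller that already
// holds m.mu lets this helper drop and reacquire it instead of taking resolvedMutex
// while m.mu is held:
//
//	m.mu.Lock()
//	// ... decide that an alert has resolved, update activeAlerts ...
//	m.addRecentlyResolvedWithPrimaryLock(alertID, resolved) // releases m.mu, records, relocks
//	// m.mu is held again at this point
//	m.mu.Unlock()
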
// SetAlertCallback sets the callback for new alerts
func (m *Manager) SetAlertCallback(cb func(alert *Alert)) {
m.mu.Lock()
defer m.mu.Unlock()
m.onAlert = cb
}
// SetAlertForAICallback sets a callback for AI analysis when alerts are created.
// Unlike SetAlertCallback, this callback is invoked unconditionally - it bypasses
// activation state, quiet hours, and other notification suppression checks.
// This allows AI to analyze alerts even when the user hasn't finished setup.
func (m *Manager) SetAlertForAICallback(cb func(alert *Alert)) {
m.mu.Lock()
defer m.mu.Unlock()
m.onAlertForAI = cb
log.Info().Msg("Alert-for-AI callback registered (bypasses notification suppression)")
}
// SetResolvedCallback sets the callback for resolved alerts
func (m *Manager) SetResolvedCallback(cb func(alertID string)) {
m.mu.Lock()
defer m.mu.Unlock()
m.onResolved = cb
}
// SetAcknowledgedCallback sets the callback for acknowledged alerts.
func (m *Manager) SetAcknowledgedCallback(cb func(alert *Alert, user string)) {
m.mu.Lock()
defer m.mu.Unlock()
m.onAcknowledged = cb
}
// SetUnacknowledgedCallback sets the callback for unacknowledged alerts.
func (m *Manager) SetUnacknowledgedCallback(cb func(alert *Alert, user string)) {
m.mu.Lock()
defer m.mu.Unlock()
m.onUnacknowledged = cb
}
// SetEscalateCallback sets the callback for escalated alerts
func (m *Manager) SetEscalateCallback(cb func(alert *Alert, level int)) {
m.mu.Lock()
defer m.mu.Unlock()
m.onEscalate = cb
}
// safeCallResolvedCallback invokes onResolved with panic recovery
func (m *Manager) safeCallResolvedCallback(alertID string, async bool) {
if m.onResolved == nil {
return
}
callbackFunc := func() {
defer func() {
if r := recover(); r != nil {
log.Error().
Interface("panic", r).
Str("alertID", alertID).
Msg("Panic in onResolved callback")
}
}()
m.onResolved(alertID)
}
if async {
go callbackFunc()
} else {
callbackFunc()
}
}
// safeCallAcknowledgedCallback invokes onAcknowledged with panic recovery and alert cloning.
func (m *Manager) safeCallAcknowledgedCallback(alert *Alert, user string) {
if m.onAcknowledged == nil || alert == nil {
return
}
alertCopy := alert.Clone()
go func(a *Alert, u string) {
defer func() {
if r := recover(); r != nil {
log.Error().
Interface("panic", r).
Str("alertID", a.ID).
Msg("Panic in onAcknowledged callback")
}
}()
m.onAcknowledged(a, u)
}(alertCopy, user)
}
// safeCallUnacknowledgedCallback invokes onUnacknowledged with panic recovery and alert cloning.
func (m *Manager) safeCallUnacknowledgedCallback(alert *Alert, user string) {
if m.onUnacknowledged == nil || alert == nil {
return
}
alertCopy := alert.Clone()
go func(a *Alert, u string) {
defer func() {
if r := recover(); r != nil {
log.Error().
Interface("panic", r).
Str("alertID", a.ID).
Msg("Panic in onUnacknowledged callback")
}
}()
m.onUnacknowledged(a, u)
}(alertCopy, user)
}
// safeCallEscalateCallback invokes onEscalate with panic recovery and alert cloning
func (m *Manager) safeCallEscalateCallback(alert *Alert, level int) {
if m.onEscalate == nil {
return
}
// Clone alert to prevent concurrent modification
alertCopy := alert.Clone()
go func(a *Alert, lvl int) {
defer func() {
if r := recover(); r != nil {
log.Error().
Interface("panic", r).
Str("alertID", a.ID).
Int("level", lvl).
Msg("Panic in onEscalate callback")
}
}()
m.onEscalate(a, lvl)
}(alertCopy, level)
}
// checkFlappingLocked detects alert flapping and returns true if alert should be suppressed.
// It modifies flappingHistory, flappingActive, and suppressedUntil maps.
// IMPORTANT: Caller MUST hold m.mu before calling this function.
func (m *Manager) checkFlappingLocked(alertID string) bool {
if !m.config.FlappingEnabled {
return false
}
now := time.Now()
windowDuration := time.Duration(m.config.FlappingWindowSeconds) * time.Second
// Record this state change
m.flappingHistory[alertID] = append(m.flappingHistory[alertID], now)
// Remove state changes outside the window
history := m.flappingHistory[alertID]
validHistory := []time.Time{}
for _, t := range history {
if now.Sub(t) <= windowDuration {
validHistory = append(validHistory, t)
}
}
// Limit to max 10 entries to prevent unbounded growth
const maxFlappingHistory = 10
if len(validHistory) > maxFlappingHistory {
validHistory = validHistory[len(validHistory)-maxFlappingHistory:]
}
m.flappingHistory[alertID] = validHistory
// Check if we've exceeded the threshold
if len(validHistory) >= m.config.FlappingThreshold {
// Mark as flapping
if !m.flappingActive[alertID] {
log.Warn().
Str("alertID", alertID).
Int("stateChanges", len(validHistory)).
Int("threshold", m.config.FlappingThreshold).
Int("windowSeconds", m.config.FlappingWindowSeconds).
Msg("Flapping detected - suppressing alert")
m.flappingActive[alertID] = true
// Set cooldown period
cooldownDuration := time.Duration(m.config.FlappingCooldownMinutes) * time.Minute
m.suppressedUntil[alertID] = now.Add(cooldownDuration)
// Record suppression metric
if recordAlertSuppressed != nil {
recordAlertSuppressed("flapping")
}
}
return true
}
return false
}
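
// Worked example (illustrative): with the defaults (FlappingWindowSeconds=300,
// FlappingThreshold=5, FlappingCooldownMinutes=15), an alert that records five state
// changes within any 5-minute window is marked as flapping, suppressed for the next
// 15 minutes via suppressedUntil, and reported through the "flapping" suppression metric.
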
func (m *Manager) dispatchAlert(alert *Alert, async bool) bool {
if m.onAlert == nil || alert == nil {
return false
}
// Don't dispatch notifications for acknowledged alerts
if alert.Acknowledged {
log.Debug().
Str("alertID", alert.ID).
Str("ackUser", alert.AckUser).
Msg("Alert notification suppressed - already acknowledged")
return false
}
// Check for flapping (caller must hold m.mu)
if m.checkFlappingLocked(alert.ID) {
log.Debug().
Str("alertID", alert.ID).
Msg("Alert suppressed due to flapping")
return false
}
// Check activation state - only dispatch notifications if active
if m.config.ActivationState != ActivationActive {
log.Debug().
Str("alertID", alert.ID).
Str("activationState", string(m.config.ActivationState)).
Msg("Alert notification suppressed - not activated")
return false
}
if suppressed, reason := m.shouldSuppressNotification(alert); suppressed {
log.Debug().
Str("alertID", alert.ID).
Str("type", alert.Type).
Str("level", string(alert.Level)).
Str("quietHoursRule", reason).
Msg("Alert notification suppressed during quiet hours")
return false
}
if isMonitorOnlyAlert(alert) {
log.Info().
Str("alertID", alert.ID).
Str("resource", alert.ResourceName).
Bool("monitorOnly", true).
Msg("Monitor-only alert detected, skipping alert dispatch")
return false
}
// Record metric for fired alert
if recordAlertFired != nil {
recordAlertFired(alert)
}
alertCopy := alert.Clone()
if async {
go func(a *Alert) {
defer func() {
if r := recover(); r != nil {
log.Error().
Interface("panic", r).
Str("alertID", a.ID).
Str("type", a.Type).
Msg("Panic in onAlert callback")
}
}()
m.onAlert(a)
}(alertCopy)
} else {
// Synchronous calls also need panic recovery to prevent service crash
func() {
defer func() {
if r := recover(); r != nil {
log.Error().
Interface("panic", r).
Str("alertID", alertCopy.ID).
Str("type", alertCopy.Type).
Msg("Panic in onAlert callback (synchronous)")
}
}()
m.onAlert(alertCopy)
}()
}
return true
}
func isMonitorOnlyAlert(alert *Alert) bool {
if alert == nil || alert.Metadata == nil {
return false
}
if value, ok := alert.Metadata["monitorOnly"]; ok {
switch v := value.(type) {
case bool:
return v
case string:
return strings.EqualFold(v, "true")
}
}
return false
}
// ensureValidHysteresis ensures clear < trigger for hysteresis thresholds
func ensureValidHysteresis(threshold *HysteresisThreshold, metricName string) {
if threshold == nil {
return
}
// Disabled thresholds don't need hysteresis validation
if threshold.Trigger <= 0 {
return
}
if threshold.Clear >= threshold.Trigger {
log.Warn().
Str("metric", metricName).
Float64("trigger", threshold.Trigger).
Float64("clear", threshold.Clear).
Msg("Invalid hysteresis: clear >= trigger, auto-fixing")
// Auto-fix: set clear to 5 points below the trigger
threshold.Clear = threshold.Trigger - 5
if threshold.Clear < 0 {
threshold.Clear = 0
}
}
}
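
// Worked example (illustrative): a threshold saved as {Trigger: 80, Clear: 85} could
// never clear because the clear level sits above the trigger; ensureValidHysteresis
// rewrites it to {Trigger: 80, Clear: 75} (trigger minus 5, floored at 0).
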
// UpdateConfig updates the alert configuration
func (m *Manager) UpdateConfig(config AlertConfig) {
m.mu.Lock()
defer m.mu.Unlock()
// Preserve activation state/time when clients update the config without including it.
// This avoids unintentionally resetting alerts to pending review when saving thresholds.
if config.ActivationState == "" && m.config.ActivationState != "" {
config.ActivationState = m.config.ActivationState
if config.ActivationTime == nil && m.config.ActivationTime != nil {
config.ActivationTime = m.config.ActivationTime
}
}
// Normalize all config sections
normalizeStorageDefaults(&config)
normalizeDockerDefaults(&config)
normalizePMGDefaults(&config)
normalizeSnapshotDefaults(&config)
normalizeBackupDefaults(&config)
normalizeNodeDefaults(&config)
normalizeHostDefaults(&config)
normalizeGeneralSettings(&config)
normalizeTimeThresholds(&config)
config.GuestDefaults.PoweredOffSeverity = normalizePoweredOffSeverity(config.GuestDefaults.PoweredOffSeverity)
config.NodeDefaults.PoweredOffSeverity = normalizePoweredOffSeverity(config.NodeDefaults.PoweredOffSeverity)
config.DockerIgnoredContainerPrefixes = NormalizeDockerIgnoredPrefixes(config.DockerIgnoredContainerPrefixes)
// Migration logic for activation state (backward compatibility)
m.migrateActivationState(&config)
// Validate hysteresis thresholds to prevent stuck alerts
validateHysteresisThresholds(&config)
// Validate timezone if quiet hours are enabled
validateQuietHoursTimezone(&config)
m.config = config
normalizeOverrides(m.config.Overrides)
// Update cached quiet hours location
if m.config.Schedule.QuietHours.Enabled && m.config.Schedule.QuietHours.Timezone != "" {
loc, err := time.LoadLocation(m.config.Schedule.QuietHours.Timezone)
if err == nil {
m.quietHoursLoc = loc
} else {
m.quietHoursLoc = time.Local
}
} else {
m.quietHoursLoc = time.Local
}
if !m.config.SnapshotDefaults.Enabled {
m.clearSnapshotAlertsForInstanceLocked("")
}
if !m.config.BackupDefaults.Enabled {
m.clearBackupAlertsLocked()
}
m.applyGlobalOfflineSettingsLocked()
log.Info().
Bool("enabled", config.Enabled).
Interface("guestDefaults", config.GuestDefaults).
Msg("Alert configuration updated")
// Re-evaluate active alerts against new thresholds
m.reevaluateActiveAlertsLocked()
}
// normalizeStorageDefaults ensures storage default thresholds are set
// Trigger=0 is allowed and means "disable storage alerting"
func normalizeStorageDefaults(config *AlertConfig) {
if config.StorageDefault.Trigger < 0 {
config.StorageDefault.Trigger = 85
config.StorageDefault.Clear = 80
} else if config.StorageDefault.Trigger == 0 {
// Trigger=0 means disabled, set Clear=0 too
config.StorageDefault.Clear = 0
} else if config.StorageDefault.Clear <= 0 {
config.StorageDefault.Clear = config.StorageDefault.Trigger - 5
if config.StorageDefault.Clear < 0 {
config.StorageDefault.Clear = 0
}
}
}
// normalizeDockerThreshold normalizes a single Docker threshold
func normalizeDockerThreshold(th HysteresisThreshold, defaultTrigger float64, metricName string) HysteresisThreshold {
normalized := th
// Negative triggers are treated as unset and replaced with defaults.
if normalized.Trigger < 0 {
normalized.Trigger = defaultTrigger
}
// Explicit disable: keep trigger at 0 and clamp a negative clear value up to 0.
if normalized.Trigger == 0 {
if normalized.Clear < 0 {
normalized.Clear = 0
}
return normalized
}
if normalized.Clear <= 0 {
normalized.Clear = normalized.Trigger - 5
if normalized.Clear < 0 {
normalized.Clear = 0
}
}
ensureValidHysteresis(&normalized, metricName)
return normalized
}
// normalizeDockerDefaults ensures Docker default thresholds are set
func normalizeDockerDefaults(config *AlertConfig) {
config.DockerDefaults.CPU = normalizeDockerThreshold(config.DockerDefaults.CPU, 80, "docker.cpu")
config.DockerDefaults.Memory = normalizeDockerThreshold(config.DockerDefaults.Memory, 85, "docker.memory")
config.DockerDefaults.Disk = normalizeDockerThreshold(config.DockerDefaults.Disk, 85, "docker.disk")
if config.DockerDefaults.RestartCount <= 0 {
config.DockerDefaults.RestartCount = 3
}
if config.DockerDefaults.RestartWindow <= 0 {
config.DockerDefaults.RestartWindow = 300 // 5 minutes
}
if config.DockerDefaults.MemoryWarnPct <= 0 {
config.DockerDefaults.MemoryWarnPct = 90
}
if config.DockerDefaults.MemoryCriticalPct <= 0 {
config.DockerDefaults.MemoryCriticalPct = 95
}
if config.DockerDefaults.ServiceWarnGapPct <= 0 {
config.DockerDefaults.ServiceWarnGapPct = 10
}
if config.DockerDefaults.ServiceCritGapPct <= 0 {
config.DockerDefaults.ServiceCritGapPct = 50
}
if config.DockerDefaults.ServiceCritGapPct > 0 &&
config.DockerDefaults.ServiceCritGapPct < config.DockerDefaults.ServiceWarnGapPct {
log.Warn().
Int("warnGapPercent", config.DockerDefaults.ServiceWarnGapPct).
Int("criticalGapPercent", config.DockerDefaults.ServiceCritGapPct).
Msg("Adjusting Docker service critical gap to match warning gap")
config.DockerDefaults.ServiceCritGapPct = config.DockerDefaults.ServiceWarnGapPct
}
if config.DockerDefaults.StatePoweredOffSeverity == "" {
config.DockerDefaults.StatePoweredOffSeverity = AlertLevelWarning
}
config.DockerDefaults.StatePoweredOffSeverity = normalizePoweredOffSeverity(config.DockerDefaults.StatePoweredOffSeverity)
// Default to 24 hours delay for update alerts; set to -1 to explicitly disable
if config.DockerDefaults.UpdateAlertDelayHours == 0 {
config.DockerDefaults.UpdateAlertDelayHours = 24
}
}
// normalizePMGDefaults ensures PMG (Proxmox Mail Gateway) defaults are set
func normalizePMGDefaults(config *AlertConfig) {
if config.PMGDefaults.QueueTotalWarning <= 0 {
config.PMGDefaults.QueueTotalWarning = 500
}
if config.PMGDefaults.QueueTotalCritical <= 0 {
config.PMGDefaults.QueueTotalCritical = 1000
}
if config.PMGDefaults.OldestMessageWarnMins <= 0 {
config.PMGDefaults.OldestMessageWarnMins = 30
}
if config.PMGDefaults.OldestMessageCritMins <= 0 {
config.PMGDefaults.OldestMessageCritMins = 60
}
if config.PMGDefaults.DeferredQueueWarn <= 0 {
config.PMGDefaults.DeferredQueueWarn = 200
}
if config.PMGDefaults.DeferredQueueCritical <= 0 {
config.PMGDefaults.DeferredQueueCritical = 500
}
if config.PMGDefaults.HoldQueueWarn <= 0 {
config.PMGDefaults.HoldQueueWarn = 100
}
if config.PMGDefaults.HoldQueueCritical <= 0 {
config.PMGDefaults.HoldQueueCritical = 300
}
if config.PMGDefaults.QuarantineSpamWarn <= 0 {
config.PMGDefaults.QuarantineSpamWarn = 2000
}
if config.PMGDefaults.QuarantineSpamCritical <= 0 {
config.PMGDefaults.QuarantineSpamCritical = 5000
}
if config.PMGDefaults.QuarantineVirusWarn <= 0 {
config.PMGDefaults.QuarantineVirusWarn = 2000
}
if config.PMGDefaults.QuarantineVirusCritical <= 0 {
config.PMGDefaults.QuarantineVirusCritical = 5000
}
if config.PMGDefaults.QuarantineGrowthWarnPct <= 0 {
config.PMGDefaults.QuarantineGrowthWarnPct = 25
}
if config.PMGDefaults.QuarantineGrowthWarnMin <= 0 {
config.PMGDefaults.QuarantineGrowthWarnMin = 250
}
if config.PMGDefaults.QuarantineGrowthCritPct <= 0 {
config.PMGDefaults.QuarantineGrowthCritPct = 50
}
if config.PMGDefaults.QuarantineGrowthCritMin <= 0 {
config.PMGDefaults.QuarantineGrowthCritMin = 500
}
}
// normalizeSnapshotDefaults ensures snapshot alert thresholds are valid
func normalizeSnapshotDefaults(config *AlertConfig) {
if config.SnapshotDefaults.WarningDays < 0 {
config.SnapshotDefaults.WarningDays = 0
}
if config.SnapshotDefaults.CriticalDays < 0 {
config.SnapshotDefaults.CriticalDays = 0
}
if config.SnapshotDefaults.CriticalDays > 0 && config.SnapshotDefaults.WarningDays > config.SnapshotDefaults.CriticalDays {
config.SnapshotDefaults.WarningDays = config.SnapshotDefaults.CriticalDays
}
if config.SnapshotDefaults.CriticalDays == 0 && config.SnapshotDefaults.WarningDays > 0 {
config.SnapshotDefaults.CriticalDays = config.SnapshotDefaults.WarningDays
}
if config.SnapshotDefaults.WarningSizeGiB < 0 {
config.SnapshotDefaults.WarningSizeGiB = 0
}
if config.SnapshotDefaults.CriticalSizeGiB < 0 {
config.SnapshotDefaults.CriticalSizeGiB = 0
}
if config.SnapshotDefaults.CriticalSizeGiB > 0 && config.SnapshotDefaults.WarningSizeGiB > config.SnapshotDefaults.CriticalSizeGiB {
config.SnapshotDefaults.WarningSizeGiB = config.SnapshotDefaults.CriticalSizeGiB
}
if config.SnapshotDefaults.CriticalSizeGiB == 0 && config.SnapshotDefaults.WarningSizeGiB > 0 {
config.SnapshotDefaults.CriticalSizeGiB = config.SnapshotDefaults.WarningSizeGiB
}
}
// normalizeBackupDefaults ensures backup alert thresholds are valid
func normalizeBackupDefaults(config *AlertConfig) {
if config.BackupDefaults.WarningDays < 0 {
config.BackupDefaults.WarningDays = 0
}
if config.BackupDefaults.CriticalDays < 0 {
config.BackupDefaults.CriticalDays = 0
}
if config.BackupDefaults.CriticalDays > 0 && config.BackupDefaults.WarningDays > config.BackupDefaults.CriticalDays {
config.BackupDefaults.WarningDays = config.BackupDefaults.CriticalDays
}
if config.BackupDefaults.AlertOrphaned == nil {
alertOrphaned := true
config.BackupDefaults.AlertOrphaned = &alertOrphaned
}
if len(config.BackupDefaults.IgnoreVMIDs) > 0 {
seen := make(map[string]struct{}, len(config.BackupDefaults.IgnoreVMIDs))
normalized := make([]string, 0, len(config.BackupDefaults.IgnoreVMIDs))
for _, entry := range config.BackupDefaults.IgnoreVMIDs {
value := strings.TrimSpace(entry)
if value == "" {
continue
}
if _, exists := seen[value]; exists {
continue
}
seen[value] = struct{}{}
normalized = append(normalized, value)
}
config.BackupDefaults.IgnoreVMIDs = normalized
}
}
func backupIgnoreVMID(vmid string, ignoreList []string) bool {
if vmid == "" || len(ignoreList) == 0 {
return false
}
for _, entry := range ignoreList {
value := strings.TrimSpace(entry)
if value == "" {
continue
}
if strings.HasSuffix(value, "*") {
prefix := strings.TrimSuffix(value, "*")
if prefix != "" && strings.HasPrefix(vmid, prefix) {
return true
}
continue
}
if vmid == value {
return true
}
}
return false
}
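
// Worked example (illustrative): with IgnoreVMIDs = []string{"100", "2*"}, backups for
// VMID "100" are skipped by exact match and any VMID starting with "2" (e.g. "200",
// "2001") is skipped by the prefix entry. A bare "*" entry is ignored because the
// prefix before the wildcard would be empty.
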
// normalizeNodeDefaults ensures node threshold defaults exist
// Trigger=0 is allowed for Temperature and means "disable temperature alerting"
func normalizeNodeDefaults(config *AlertConfig) {
// Ensure temperature defaults exist for nodes so high temps alert out of the box
if config.NodeDefaults.Temperature == nil || config.NodeDefaults.Temperature.Trigger < 0 {
config.NodeDefaults.Temperature = &HysteresisThreshold{Trigger: 80, Clear: 75}
} else if config.NodeDefaults.Temperature.Trigger == 0 {
// Trigger=0 means disabled, set Clear=0 too
config.NodeDefaults.Temperature.Clear = 0
} else if config.NodeDefaults.Temperature.Clear <= 0 {
config.NodeDefaults.Temperature.Clear = config.NodeDefaults.Temperature.Trigger - 5
if config.NodeDefaults.Temperature.Clear <= 0 {
config.NodeDefaults.Temperature.Clear = 75
}
}
}
// normalizeHostDefaults ensures host agent threshold defaults exist
// Trigger=0 is allowed and means "disable alerting for this metric"
func normalizeHostDefaults(config *AlertConfig) {
if config.HostDefaults.CPU == nil || config.HostDefaults.CPU.Trigger < 0 {
config.HostDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 75}
} else if config.HostDefaults.CPU.Trigger == 0 {
// Trigger=0 means disabled, set Clear=0 too
config.HostDefaults.CPU.Clear = 0
} else if config.HostDefaults.CPU.Clear <= 0 {
config.HostDefaults.CPU.Clear = config.HostDefaults.CPU.Trigger - 5
if config.HostDefaults.CPU.Clear <= 0 {
config.HostDefaults.CPU.Clear = 75
}
}
if config.HostDefaults.Memory == nil || config.HostDefaults.Memory.Trigger < 0 {
config.HostDefaults.Memory = &HysteresisThreshold{Trigger: 85, Clear: 80}
} else if config.HostDefaults.Memory.Trigger == 0 {
// Trigger=0 means disabled, set Clear=0 too
config.HostDefaults.Memory.Clear = 0
} else if config.HostDefaults.Memory.Clear <= 0 {
config.HostDefaults.Memory.Clear = config.HostDefaults.Memory.Trigger - 5
if config.HostDefaults.Memory.Clear <= 0 {
config.HostDefaults.Memory.Clear = 80
}
}
if config.HostDefaults.Disk == nil || config.HostDefaults.Disk.Trigger < 0 {
config.HostDefaults.Disk = &HysteresisThreshold{Trigger: 90, Clear: 85}
} else if config.HostDefaults.Disk.Trigger == 0 {
// Trigger=0 means disabled, set Clear=0 too
config.HostDefaults.Disk.Clear = 0
} else if config.HostDefaults.Disk.Clear <= 0 {
config.HostDefaults.Disk.Clear = config.HostDefaults.Disk.Trigger - 5
if config.HostDefaults.Disk.Clear <= 0 {
config.HostDefaults.Disk.Clear = 85
}
}
if config.HostDefaults.DiskTemperature == nil || config.HostDefaults.DiskTemperature.Trigger < 0 {
config.HostDefaults.DiskTemperature = &HysteresisThreshold{Trigger: 55, Clear: 50}
} else if config.HostDefaults.DiskTemperature.Trigger == 0 {
config.HostDefaults.DiskTemperature.Clear = 0
} else if config.HostDefaults.DiskTemperature.Clear <= 0 {
config.HostDefaults.DiskTemperature.Clear = config.HostDefaults.DiskTemperature.Trigger - 5
if config.HostDefaults.DiskTemperature.Clear <= 0 {
config.HostDefaults.DiskTemperature.Clear = 50
}
}
ensureValidHysteresis(config.HostDefaults.DiskTemperature, "host.diskTemperature")
}
// normalizeGeneralSettings ensures general alert settings have valid values
func normalizeGeneralSettings(config *AlertConfig) {
if config.MinimumDelta <= 0 {
config.MinimumDelta = 2.0
}
if config.SuppressionWindow <= 0 {
config.SuppressionWindow = 5
}
if config.HysteresisMargin <= 0 {
config.HysteresisMargin = 5.0
}
if config.ObservationWindowHours <= 0 {
config.ObservationWindowHours = 24
}
}
// normalizeTimeThresholds ensures time threshold settings are valid
func normalizeTimeThresholds(config *AlertConfig) {
config.MetricTimeThresholds = normalizeMetricTimeThresholds(config.MetricTimeThresholds)
const defaultDelaySeconds = 5
if config.TimeThreshold <= 0 {
config.TimeThreshold = defaultDelaySeconds
}
if config.TimeThresholds == nil {
config.TimeThresholds = make(map[string]int)
}
ensureDelay := func(key string) {
delay, ok := config.TimeThresholds[key]
if !ok || delay < 0 {
config.TimeThresholds[key] = defaultDelaySeconds
}
}
ensureDelay("guest")
ensureDelay("node")
ensureDelay("storage")
ensureDelay("pbs")
ensureDelay("host")
if delay, ok := config.TimeThresholds["all"]; ok && delay < 0 {
config.TimeThresholds["all"] = defaultDelaySeconds
}
}
// migrateActivationState handles backward compatibility for activation state
func (m *Manager) migrateActivationState(config *AlertConfig) {
if config.ActivationState == "" {
// Determine if this is an existing installation or new
// Existing installations have active alerts already
isExistingInstall := len(m.activeAlerts) > 0 || len(config.Overrides) > 0
if isExistingInstall {
// Existing install: auto-activate to preserve behavior
config.ActivationState = ActivationActive
now := time.Now()
config.ActivationTime = &now
log.Info().Msg("Migrating existing installation to active alert state")
} else {
// New install: start in pending review
config.ActivationState = ActivationPending
log.Info().Msg("New installation: alerts pending activation")
}
}
}
// validateHysteresisThresholds ensures hysteresis thresholds won't cause stuck alerts
func validateHysteresisThresholds(config *AlertConfig) {
ensureValidHysteresis(config.GuestDefaults.CPU, "guest.cpu")
ensureValidHysteresis(config.GuestDefaults.Memory, "guest.memory")
ensureValidHysteresis(config.GuestDefaults.Disk, "guest.disk")
ensureValidHysteresis(config.NodeDefaults.CPU, "node.cpu")
ensureValidHysteresis(config.NodeDefaults.Memory, "node.memory")
ensureValidHysteresis(config.NodeDefaults.Temperature, "node.temperature")
ensureValidHysteresis(&config.StorageDefault, "storage")
}
// validateQuietHoursTimezone validates the timezone for quiet hours
func validateQuietHoursTimezone(config *AlertConfig) {
if config.Schedule.QuietHours.Enabled && config.Schedule.QuietHours.Timezone != "" {
_, err := time.LoadLocation(config.Schedule.QuietHours.Timezone)
if err != nil {
log.Error().
Err(err).
Str("timezone", config.Schedule.QuietHours.Timezone).
Msg("Invalid timezone in quiet hours config, disabling quiet hours")
// Disable quiet hours rather than silently using wrong timezone
config.Schedule.QuietHours.Enabled = false
}
}
}
// normalizeOverrides normalizes all threshold overrides
func normalizeOverrides(overrides map[string]ThresholdConfig) {
for id, override := range overrides {
override.PoweredOffSeverity = normalizePoweredOffSeverity(override.PoweredOffSeverity)
if override.Usage != nil {
override.Usage = ensureHysteresisThreshold(override.Usage)
}
overrides[id] = override
}
}
// normalizeMetricTimeThresholds cleans resource/metric keys and drops invalid delay overrides.
func normalizeMetricTimeThresholds(input map[string]map[string]int) map[string]map[string]int {
if len(input) == 0 {
return nil
}
normalized := make(map[string]map[string]int)
for rawType, metrics := range input {
typeKey := strings.ToLower(strings.TrimSpace(rawType))
if typeKey == "" || len(metrics) == 0 {
continue
}
for rawMetric, delay := range metrics {
metricKey := strings.ToLower(strings.TrimSpace(rawMetric))
if metricKey == "" || delay < 0 {
continue
}
if _, exists := normalized[typeKey]; !exists {
normalized[typeKey] = make(map[string]int)
}
normalized[typeKey][metricKey] = delay
}
}
if len(normalized) == 0 {
return nil
}
return normalized
}
// NormalizeMetricTimeThresholds exposes normalization for other packages (e.g., config persistence).
func NormalizeMetricTimeThresholds(input map[string]map[string]int) map[string]map[string]int {
return normalizeMetricTimeThresholds(input)
}
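
// Example (illustrative): MetricTimeThresholds lets a single metric wait longer than
// its resource type's default delay before alerting. Resource-type and metric keys
// are trimmed and lower-cased, and negative delays are dropped:
//
//	cfg.MetricTimeThresholds = map[string]map[string]int{
//		"guest": {"cpu": 30},    // guest CPU must stay over threshold for 30s
//		"node":  {"memory": 10}, // node memory uses a 10s delay
//	}
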
// NormalizeDockerIgnoredPrefixes trims, deduplicates, and lowercases comparison keys for ignored Docker containers.
// Returned values retain the user's original casing for display but guarantee uniqueness when compared case-insensitively.
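//
// Illustrative example (hypothetical prefixes):
//
//	NormalizeDockerIgnoredPrefixes([]string{" Traefik ", "traefik", "", "db"})
//	// returns []string{"Traefik", "db"}: whitespace trimmed, the first casing kept,
//	// case-insensitive duplicates and empty entries dropped.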
func NormalizeDockerIgnoredPrefixes(prefixes []string) []string {
if len(prefixes) == 0 {
return nil
}
seen := make(map[string]struct{}, len(prefixes))
normalized := make([]string, 0, len(prefixes))
for _, prefix := range prefixes {
trimmed := strings.TrimSpace(prefix)
if trimmed == "" {
continue
}
key := strings.ToLower(trimmed)
if _, exists := seen[key]; exists {
continue
}
seen[key] = struct{}{}
normalized = append(normalized, trimmed)
}
if len(normalized) == 0 {
return nil
}
return normalized
}
// applyGlobalOfflineSettingsLocked clears tracking and active alerts for globally disabled offline detectors.
// Caller must hold m.mu.
func (m *Manager) applyGlobalOfflineSettingsLocked() {
if m.config.DisableAllNodesOffline {
var nodeAlerts []string
for alertID := range m.activeAlerts {
if strings.HasPrefix(alertID, "node-offline-") {
nodeAlerts = append(nodeAlerts, alertID)
}
}
for _, alertID := range nodeAlerts {
m.clearAlertNoLock(alertID)
}
m.nodeOfflineCount = make(map[string]int)
}
if m.config.DisableAllPBSOffline {
var pbsAlerts []string
for alertID, alert := range m.activeAlerts {
if strings.HasPrefix(alertID, "pbs-offline-") {
pbsAlerts = append(pbsAlerts, alertID)
delete(m.offlineConfirmations, alert.ResourceID)
}
}
for _, alertID := range pbsAlerts {
m.clearAlertNoLock(alertID)
}
}
if m.config.DisableAllGuestsOffline {
var guestAlerts []string
for alertID, alert := range m.activeAlerts {
if strings.HasPrefix(alertID, "guest-powered-off-") {
guestAlerts = append(guestAlerts, alertID)
delete(m.offlineConfirmations, alert.ResourceID)
}
}
for _, alertID := range guestAlerts {
m.clearAlertNoLock(alertID)
}
}
if m.config.DisableAllDockerHostsOffline {
var hostAlerts []string
for alertID := range m.activeAlerts {
if strings.HasPrefix(alertID, "docker-host-offline-") {
hostAlerts = append(hostAlerts, alertID)
}
}
for _, alertID := range hostAlerts {
m.clearAlertNoLock(alertID)
}
m.dockerOfflineCount = make(map[string]int)
}
if m.config.DisableAllDockerContainers {
var containerAlerts []string
for alertID := range m.activeAlerts {
if strings.HasPrefix(alertID, "docker-container-") {
containerAlerts = append(containerAlerts, alertID)
}
}
for _, alertID := range containerAlerts {
m.clearAlertNoLock(alertID)
}
m.dockerStateConfirm = make(map[string]int)
m.dockerRestartTracking = make(map[string]*dockerRestartRecord)
m.dockerLastExitCode = make(map[string]int)
m.dockerUpdateFirstSeen = make(map[string]time.Time)
m.dockerUpdateFirstSeenByIdentity = make(map[string]time.Time)
}
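// A negative UpdateAlertDelayHours is treated here as "container update alerts disabled":
// drop any existing update alerts and reset first-seen tracking. The branch above already
// covers the case where all container alerts are disabled outright.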
if m.config.DockerDefaults.UpdateAlertDelayHours < 0 && !m.config.DisableAllDockerContainers {
m.clearDockerContainerUpdateAlertsLocked()
m.dockerUpdateFirstSeen = make(map[string]time.Time)
m.dockerUpdateFirstSeenByIdentity = make(map[string]time.Time)
}
if m.config.DisableAllDockerServices {
var serviceAlerts []string
for alertID := range m.activeAlerts {
if strings.HasPrefix(alertID, "docker-service-") {
serviceAlerts = append(serviceAlerts, alertID)
}
}
for _, alertID := range serviceAlerts {
m.clearAlertNoLock(alertID)
}
}
}
// reevaluateActiveAlertsLocked re-evaluates all active alerts against the current configuration
// This should only be called with m.mu already locked
func (m *Manager) reevaluateActiveAlertsLocked() {
if len(m.activeAlerts) == 0 {
return
}
// Track alerts that should be resolved
alertsToResolve := make([]string, 0)
for alertID, alert := range m.activeAlerts {
resourceTypeMeta := ""
if alert.Metadata != nil {
if metaType, ok := alert.Metadata["resourceType"].(string); ok {
resourceTypeMeta = strings.ToLower(metaType)
}
}
if alert.Type == "docker-container-update" || strings.HasPrefix(alertID, "docker-container-update-") {
if m.shouldResolveDockerContainerUpdateAlertLocked(alert) {
alertsToResolve = append(alertsToResolve, alertID)
}
continue
}
// Parse the alert ID to extract resource ID and metric type
// Alert ID format: {resourceID}-{metricType}
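// e.g. an (illustrative) ID "pve1-qemu-105-memory" splits into resourceID "pve1-qemu-105"
// and metricType "memory": only the segment after the last hyphen is treated as the metric.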
parts := strings.Split(alertID, "-")
if len(parts) < 2 {
continue
}
metricType := parts[len(parts)-1]
resourceID := strings.Join(parts[:len(parts)-1], "-")
// Get the appropriate threshold based on resource type and ID
var threshold *HysteresisThreshold
// Check for PMG alerts by Type
if alert.Type == "queue-depth" || alert.Type == "queue-deferred" || alert.Type == "queue-hold" || alert.Type == "message-age" {
// This is a PMG alert
if m.config.DisableAllPMG {
alertsToResolve = append(alertsToResolve, alertID)
}
// PMG queue thresholds are not hysteresis-based and are not re-evaluated here;
// CheckPMG re-checks them on the next poll.
continue
}
// Check for Host alerts by resourceType
if resourceTypeMeta == "host" {
if m.config.DisableAllHosts {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
thresholds := m.config.HostDefaults
// Overrides are keyed by raw host ID (without the "host:" prefix
// that hostResourceID adds to the resource ID used in alert IDs).
rawHostID := strings.TrimPrefix(resourceID, "host:")
if override, exists := m.config.Overrides[rawHostID]; exists {
if override.Disabled {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
thresholds = m.applyThresholdOverride(thresholds, override)
}
threshold = getThresholdForMetric(thresholds, metricType)
}
if alert.Type == "docker-host-offline" ||
strings.HasPrefix(alertID, "docker-container-health-") ||
strings.HasPrefix(alertID, "docker-container-state-") ||
strings.HasPrefix(alertID, "docker-container-restart-loop-") ||
strings.HasPrefix(alertID, "docker-container-oom-") ||
strings.HasPrefix(alertID, "docker-container-memory-limit-") {
// Non-metric Docker alerts are not governed by thresholds
continue
}
if resourceTypeMeta == "dockerhost" {
// Check if all Docker host alerts are disabled
if m.config.DisableAllDockerHosts {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
// No threshold evaluation for Docker hosts (connectivity handled separately)
continue
}
if resourceTypeMeta == "docker container" {
// Check if all Docker container alerts are disabled
if m.config.DisableAllDockerContainers {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
containerName := strings.ToLower(strings.TrimSpace(alert.ResourceName))
containerID := ""
if alert.Metadata != nil {
if val, ok := alert.Metadata["containerId"].(string); ok {
containerID = strings.ToLower(strings.TrimSpace(val))
}
if val, ok := alert.Metadata["containerName"].(string); ok && containerName == "" {
containerName = strings.ToLower(strings.TrimSpace(val))
}
}
if matchesDockerIgnoredPrefix(containerName, containerID, m.config.DockerIgnoredContainerPrefixes) {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
thresholds := ThresholdConfig{
CPU: cloneThreshold(&m.config.DockerDefaults.CPU),
Memory: cloneThreshold(&m.config.DockerDefaults.Memory),
Disk: cloneThreshold(&m.config.DockerDefaults.Disk),
}
if override, exists := m.config.Overrides[resourceID]; exists {
if override.Disabled {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
thresholds = m.applyThresholdOverride(thresholds, override)
}
threshold = getThresholdForMetric(thresholds, metricType)
}
// Determine the resource type from the alert's metadata or instance
// We need to check what kind of resource this is
if threshold == nil && !strings.Contains(resourceID, ":") && (alert.Instance == "Node" || alert.Instance == alert.Node) {
// This is a node alert
// Check if all node alerts are disabled
if m.config.DisableAllNodes {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
thresholds := m.config.NodeDefaults
if override, exists := m.config.Overrides[resourceID]; exists {
thresholds = m.applyThresholdOverride(thresholds, override)
}
threshold = getThresholdForMetric(thresholds, metricType)
} else if threshold == nil && (resourceTypeMeta == "storage" || alert.Instance == "Storage" || strings.Contains(alert.ResourceID, ":storage/")) {
// This is a storage alert
// Check if all storage alerts are disabled
if m.config.DisableAllStorage {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
if override, exists := m.config.Overrides[resourceID]; exists && override.Usage != nil {
threshold = override.Usage
} else {
threshold = &m.config.StorageDefault
}
} else if threshold == nil && alert.Instance == "PBS" {
// This is a PBS alert
// Check if all PBS alerts are disabled
if m.config.DisableAllPBS {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
thresholds := m.config.PBSDefaults
if override, exists := m.config.Overrides[resourceID]; exists {
if override.CPU != nil && metricType == "cpu" {
threshold = ensureHysteresisThreshold(override.CPU)
} else if override.Memory != nil && metricType == "memory" {
threshold = ensureHysteresisThreshold(override.Memory)
}
}
if threshold == nil {
threshold = getThresholdForMetric(thresholds, metricType)
}
}
if threshold == nil {
// This is a guest (qemu/lxc) alert
// Check if all guest alerts are disabled
if m.config.DisableAllGuests {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
// We need to evaluate custom rules, but we don't have the guest object here.
// For now, we'll mark these alerts for re-evaluation by the monitor.
// The next poll cycle will properly evaluate them with custom rules.
// Check if there's an override for this specific guest
if override, exists := m.config.Overrides[resourceID]; exists {
if override.Disabled {
// Alert is now disabled for this resource, resolve it
alertsToResolve = append(alertsToResolve, alertID)
continue
}
threshold = getThresholdForMetricFromConfig(override, metricType)
}
// If no override or override doesn't have this metric, use defaults
// Note: This doesn't consider custom rules - those will be evaluated
// on the next poll cycle when we have the full guest object
if threshold == nil {
threshold = getThresholdForMetric(m.config.GuestDefaults, metricType)
}
}
// If no threshold found or threshold is disabled (trigger <= 0), resolve the alert
if threshold == nil || threshold.Trigger <= 0 {
alertsToResolve = append(alertsToResolve, alertID)
continue
}
// Check if current value is now below the clear threshold
clearThreshold := threshold.Clear
if clearThreshold <= 0 {
clearThreshold = threshold.Trigger
}
if alert.Value <= clearThreshold {
// Alert should be resolved due to new threshold
alertsToResolve = append(alertsToResolve, alertID)
log.Info().
Str("alertID", alertID).
Float64("value", alert.Value).
Float64("oldThreshold", alert.Threshold).
Float64("newClearThreshold", clearThreshold).
Msg("Resolving alert due to threshold change")
} else if alert.Value < threshold.Trigger {
// Value is between clear and trigger thresholds after config change
// Resolve it to prevent confusion
alertsToResolve = append(alertsToResolve, alertID)
log.Info().
Str("alertID", alertID).
Float64("value", alert.Value).
Float64("newTrigger", threshold.Trigger).
Float64("newClear", clearThreshold).
Msg("Resolving alert - value now below trigger threshold after config change")
}
}
// Resolve all alerts that should be cleared
for _, alertID := range alertsToResolve {
if alert, exists := m.activeAlerts[alertID]; exists {
resolvedAlert := &ResolvedAlert{
Alert: alert,
ResolvedTime: time.Now(),
}
// Remove any pending notification tracking for this alert since it's no longer valid.
if _, isPending := m.pendingAlerts[alertID]; isPending {
delete(m.pendingAlerts, alertID)
log.Debug().
Str("alertID", alertID).
Msg("Cleared pending alert after configuration update")
}
// Remove from active alerts
m.removeActiveAlertNoLock(alertID)
// Add to recently resolved while respecting lock ordering
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
log.Info().
Str("alertID", alertID).
Msg("Alert auto-resolved after configuration change")
m.safeCallResolvedCallback(alertID, true)
}
}
// Save updated active alerts if any were resolved
if len(alertsToResolve) > 0 {
go func() {
defer func() {
if r := recover(); r != nil {
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (config update)")
}
}()
if err := m.SaveActiveAlerts(); err != nil {
log.Error().Err(err).Msg("Failed to save active alerts after config update")
}
}()
}
}
// ReevaluateGuestAlert reevaluates a specific guest's alerts with full threshold resolution including custom rules
// This should be called by the monitor with the current guest state
func (m *Manager) ReevaluateGuestAlert(guest interface{}, guestID string) {
m.mu.Lock()
defer m.mu.Unlock()
// Get the correct thresholds for this guest (includes custom rules evaluation)
thresholds := m.getGuestThresholds(guest, guestID)
// Check all metric types for this guest
metricTypes := []string{"cpu", "memory", "disk", "diskRead", "diskWrite", "networkIn", "networkOut"}
for _, metricType := range metricTypes {
alertID := fmt.Sprintf("%s-%s", guestID, metricType)
alert, exists := m.activeAlerts[alertID]
if !exists {
continue
}
// Get the threshold for this metric
var threshold *HysteresisThreshold
switch metricType {
case "cpu":
threshold = thresholds.CPU
case "memory":
threshold = thresholds.Memory
case "disk":
threshold = thresholds.Disk
case "diskRead":
threshold = thresholds.DiskRead
case "diskWrite":
threshold = thresholds.DiskWrite
case "networkIn":
threshold = thresholds.NetworkIn
case "networkOut":
threshold = thresholds.NetworkOut
}
// If threshold is disabled or doesn't exist, clear the alert
if threshold == nil || threshold.Trigger <= 0 {
m.clearAlertNoLock(alertID)
// Also clear any pending alert for this metric
if _, isPending := m.pendingAlerts[alertID]; isPending {
delete(m.pendingAlerts, alertID)
log.Debug().
Str("alertID", alertID).
Msg("Cleared pending alert - threshold disabled")
}
log.Info().
Str("alertID", alertID).
Str("metric", metricType).
Msg("Cleared alert - threshold disabled")
continue
}
// Check if alert should be cleared based on new threshold
clearThreshold := threshold.Clear
if clearThreshold <= 0 {
clearThreshold = threshold.Trigger
}
if alert.Value <= clearThreshold || alert.Value < threshold.Trigger {
m.clearAlertNoLock(alertID)
log.Info().
Str("alertID", alertID).
Str("metric", metricType).
Float64("value", alert.Value).
Float64("trigger", threshold.Trigger).
Float64("clear", clearThreshold).
Msg("Cleared alert - value now below threshold after config change")
}
}
}
// getThresholdForMetric returns the threshold for a specific metric type from a ThresholdConfig
func getThresholdForMetric(config ThresholdConfig, metricType string) *HysteresisThreshold {
switch metricType {
case "cpu":
return config.CPU
case "memory":
return config.Memory
case "disk":
return config.Disk
case "diskRead":
return config.DiskRead
case "diskWrite":
return config.DiskWrite
case "networkIn":
return config.NetworkIn
case "networkOut":
return config.NetworkOut
case "temperature":
return config.Temperature
case "usage":
return config.Usage
default:
return nil
}
}
// getThresholdForMetricFromConfig returns the threshold for a specific metric type from a ThresholdConfig
// ensuring hysteresis is properly set
func getThresholdForMetricFromConfig(config ThresholdConfig, metricType string) *HysteresisThreshold {
var threshold *HysteresisThreshold
switch metricType {
case "cpu":
if config.CPU != nil {
threshold = ensureHysteresisThreshold(config.CPU)
}
case "memory":
if config.Memory != nil {
threshold = ensureHysteresisThreshold(config.Memory)
}
case "disk":
if config.Disk != nil {
threshold = ensureHysteresisThreshold(config.Disk)
}
case "diskRead":
if config.DiskRead != nil {
threshold = ensureHysteresisThreshold(config.DiskRead)
}
case "diskWrite":
if config.DiskWrite != nil {
threshold = ensureHysteresisThreshold(config.DiskWrite)
}
case "networkIn":
if config.NetworkIn != nil {
threshold = ensureHysteresisThreshold(config.NetworkIn)
}
case "networkOut":
if config.NetworkOut != nil {
threshold = ensureHysteresisThreshold(config.NetworkOut)
}
case "temperature":
if config.Temperature != nil {
threshold = ensureHysteresisThreshold(config.Temperature)
}
case "usage":
if config.Usage != nil {
threshold = ensureHysteresisThreshold(config.Usage)
}
}
return threshold
}
// isInQuietHours checks if the current time is within quiet hours
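// Overnight windows are supported: with Start "22:00" and End "08:00", times after 22:00
// or before 08:00 on an enabled day count as quiet hours.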
func (m *Manager) isInQuietHours() bool {
if !m.config.Schedule.QuietHours.Enabled {
return false
}
// Use cached location if available
loc := m.quietHoursLoc
if loc == nil {
// Fallback to loading if not cached yet (shouldn't happen with UpdateConfig).
// Callers may only hold a read lock, so we deliberately do not write the cache here.
var err error
loc, err = time.LoadLocation(m.config.Schedule.QuietHours.Timezone)
if err != nil {
log.Warn().Err(err).Str("timezone", m.config.Schedule.QuietHours.Timezone).Msg("Failed to load timezone, using local time")
loc = time.Local
}
}
now := time.Now().In(loc)
dayName := strings.ToLower(now.Format("Monday"))
// Check if today is enabled for quiet hours
if enabled, ok := m.config.Schedule.QuietHours.Days[dayName]; !ok || !enabled {
return false
}
// Parse start and end times
startTime, err := time.ParseInLocation("15:04", m.config.Schedule.QuietHours.Start, loc)
if err != nil {
log.Warn().Err(err).Str("start", m.config.Schedule.QuietHours.Start).Msg("Failed to parse quiet hours start time")
return false
}
endTime, err := time.ParseInLocation("15:04", m.config.Schedule.QuietHours.End, loc)
if err != nil {
log.Warn().Err(err).Str("end", m.config.Schedule.QuietHours.End).Msg("Failed to parse quiet hours end time")
return false
}
// Set to today's date
startTime = time.Date(now.Year(), now.Month(), now.Day(), startTime.Hour(), startTime.Minute(), 0, 0, loc)
endTime = time.Date(now.Year(), now.Month(), now.Day(), endTime.Hour(), endTime.Minute(), 0, 0, loc)
// Handle overnight quiet hours (e.g., 22:00 to 08:00)
if endTime.Before(startTime) {
// If we're past the start time or before the end time
if now.After(startTime) || now.Before(endTime) {
return true
}
} else {
// Normal case (e.g., 08:00 to 17:00)
if now.After(startTime) && now.Before(endTime) {
return true
}
}
return false
}
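// quietHoursCategoryForAlert maps an alert type to its quiet-hours suppression bucket:
// "performance", "storage", or "offline". Unknown types return "" and are never suppressed
// by a category rule (only by the non-critical rule in shouldSuppressNotification).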
func quietHoursCategoryForAlert(alert *Alert) string {
if alert == nil {
return ""
}
switch alert.Type {
case "cpu", "memory", "disk", "diskRead", "diskWrite", "networkIn", "networkOut", "temperature":
return "performance"
case "queue-depth", "queue-deferred", "queue-hold", "message-age",
"docker-container-health", "docker-container-restart-loop",
"docker-container-oom-kill", "docker-container-memory-limit":
return "performance"
case "usage", "disk-health", "disk-wearout", "zfs-pool-state", "zfs-pool-errors", "zfs-device":
return "storage"
case "connectivity", "offline", "powered-off", "docker-host-offline":
return "offline"
}
if strings.HasPrefix(alert.Type, "docker-container-") {
if alert.Type == "docker-container-state" {
return "offline"
}
return "performance"
}
return ""
}
func (m *Manager) shouldSuppressNotification(alert *Alert) (bool, string) {
if alert == nil {
return false, ""
}
if !m.isInQuietHours() {
return false, ""
}
if alert.Level != AlertLevelCritical {
return true, "non-critical"
}
category := quietHoursCategoryForAlert(alert)
switch category {
case "performance":
if m.config.Schedule.QuietHours.Suppress.Performance {
return true, category
}
case "storage":
if m.config.Schedule.QuietHours.Suppress.Storage {
return true, category
}
case "offline":
if m.config.Schedule.QuietHours.Suppress.Offline {
return true, category
}
}
return false, ""
}
// ShouldSuppressResolvedNotification checks if a recovery notification should be suppressed
// during quiet hours. Recovery notifications follow the same quiet hours rules as their
// corresponding alerts - if the original alert would have been suppressed, so is the recovery.
func (m *Manager) ShouldSuppressResolvedNotification(alert *Alert) bool {
if alert == nil {
return false
}
m.mu.RLock()
defer m.mu.RUnlock()
suppressed, reason := m.shouldSuppressNotification(alert)
if suppressed {
log.Debug().
Str("alertID", alert.ID).
Str("type", alert.Type).
Str("level", string(alert.Level)).
Str("quietHoursRule", reason).
Msg("Recovery notification suppressed during quiet hours")
}
return suppressed
}
// shouldNotifyAfterCooldown checks if enough time has passed since the last notification
// Returns true if notification should be sent, false if still in cooldown period
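// For example, with Schedule.Cooldown set to 30 (minutes), an alert last notified 10 minutes
// ago returns false, while one last notified 45 minutes ago (or never notified) returns true.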
func (m *Manager) shouldNotifyAfterCooldown(alert *Alert) bool {
// If cooldown is 0 or negative, always allow notifications
if m.config.Schedule.Cooldown <= 0 {
return true
}
// If this is the first notification, allow it
if alert.LastNotified == nil {
return true
}
// Check if enough time has passed
cooldownDuration := time.Duration(m.config.Schedule.Cooldown) * time.Minute
timeSinceLastNotification := time.Since(*alert.LastNotified)
return timeSinceLastNotification >= cooldownDuration
}
// GetConfig returns the current alert configuration
func (m *Manager) GetConfig() AlertConfig {
m.mu.RLock()
defer m.mu.RUnlock()
return m.config
}
// CheckGuest checks a guest (VM or container) against thresholds
func (m *Manager) CheckGuest(guest interface{}, instanceName string) {
m.mu.RLock()
enabled := m.config.Enabled
disableAllGuests := m.config.DisableAllGuests
disableAllGuestsOffline := m.config.DisableAllGuestsOffline
ignoredGuestPrefixes := m.config.IgnoredGuestPrefixes
guestTagWhitelist := m.config.GuestTagWhitelist
guestTagBlacklist := m.config.GuestTagBlacklist
m.mu.RUnlock()
if !enabled {
log.Debug().Msg("CheckGuest: alerts disabled globally")
return
}
if disableAllGuests {
log.Debug().Msg("CheckGuest: all guest alerts disabled")
return
}
var guestID, name, node, guestType, status string
var cpu, memUsage, diskUsage float64
var diskRead, diskWrite, netIn, netOut int64
var disks []models.Disk
var tags []string
// Extract data based on guest type
switch g := guest.(type) {
case models.VM:
guestID = g.ID
name = g.Name
node = g.Node
status = g.Status
guestType = "VM"
cpu = g.CPU * 100 // Convert to percentage
memUsage = g.Memory.Usage
diskUsage = g.Disk.Usage
diskRead = g.DiskRead
diskWrite = g.DiskWrite
netIn = g.NetworkIn
netOut = g.NetworkOut
disks = g.Disks
if len(g.Tags) > 0 {
tags = append(tags, g.Tags...)
}
// Debug logging for high memory VMs
if memUsage > 85 {
log.Debug().
Str("vm", name).
Float64("memUsage", memUsage).
Str("status", status).
Msg("VM with high memory detected in CheckGuest")
}
case models.Container:
guestID = g.ID
name = g.Name
node = g.Node
status = g.Status
guestType = "Container"
cpu = g.CPU * 100 // Convert to percentage
memUsage = g.Memory.Usage
diskUsage = g.Disk.Usage
diskRead = g.DiskRead
diskWrite = g.DiskWrite
netIn = g.NetworkIn
netOut = g.NetworkOut
disks = g.Disks
if len(g.Tags) > 0 {
tags = append(tags, g.Tags...)
}
default:
log.Debug().
Str("type", fmt.Sprintf("%T", guest)).
Msg("CheckGuest: unsupported guest type")
return
}
// Check ignored prefixes
for _, prefix := range ignoredGuestPrefixes {
if prefix != "" && strings.HasPrefix(name, prefix) {
if cleared := m.suppressGuestAlerts(guestID); cleared {
m.saveActiveAlertsAsync("ignored-prefix")
}
return
}
}
settings := parsePulseTags(tags)
if settings.Suppress {
if cleared := m.suppressGuestAlerts(guestID); cleared {
m.saveActiveAlertsAsync("pulse-no-alerts")
}
log.Debug().
Str("guestID", guestID).
Msg("Pulse no-alerts tag active; suppressing guest alerts")
return
}
// Custom Tag Filtering
if len(guestTagBlacklist) > 0 || len(guestTagWhitelist) > 0 {
// Normalize tags once for checking
normalizedTags := make(map[string]bool)
for _, tag := range tags {
normalizedTags[strings.ToLower(strings.TrimSpace(tag))] = true
}
// Check Blacklist
for _, block := range guestTagBlacklist {
if normalizedTags[strings.ToLower(strings.TrimSpace(block))] {
if cleared := m.suppressGuestAlerts(guestID); cleared {
m.saveActiveAlertsAsync("tag-blacklist")
}
log.Debug().Str("guestID", guestID).Msg("Guest suppressed by tag blacklist")
return
}
}
// Check Whitelist
if len(guestTagWhitelist) > 0 {
found := false
for _, allow := range guestTagWhitelist {
if normalizedTags[strings.ToLower(strings.TrimSpace(allow))] {
found = true
break
}
}
if !found {
if cleared := m.suppressGuestAlerts(guestID); cleared {
m.saveActiveAlertsAsync("tag-whitelist")
}
log.Debug().Str("guestID", guestID).Msg("Guest suppressed by tag whitelist (required tag not found)")
return
}
}
}
monitorOnly := settings.MonitorOnly
if monitorOnly || m.guestHasMonitorOnlyAlerts(guestID) {
log.Debug().
Str("guest", name).
Bool("monitorOnly", monitorOnly).
Msg("Pulse monitor-only status applied")
}
// Handle non-running guests
// Proxmox VM states: running, stopped, paused, suspended
if status != "running" {
// Check for powered-off state and generate alert if configured
if status == "stopped" {
if disableAllGuestsOffline {
// Clear any pending powered-off tracking and alerts when globally disabled
m.mu.Lock()
delete(m.offlineConfirmations, guestID)
m.mu.Unlock()
m.clearAlert(fmt.Sprintf("guest-powered-off-%s", guestID))
} else {
m.checkGuestPoweredOff(guestID, name, node, instanceName, guestType, monitorOnly)
}
} else {
// For paused/suspended, clear powered-off alert
m.clearGuestPoweredOffAlert(guestID, name)
}
// Clear all resource metric alerts (cpu, memory, disk, etc.) for non-running guests
m.mu.Lock()
alertsCleared := 0
for alertID, alert := range m.activeAlerts {
// Only clear resource metric alerts, not powered-off alerts
if alert.ResourceID == guestID && alert.Type != "powered-off" {
m.clearAlertNoLock(alertID)
alertsCleared++
log.Debug().
Str("alertID", alertID).
Str("guest", name).
Str("status", status).
Msg("Cleared metric alert for non-running guest")
}
}
m.mu.Unlock()
if alertsCleared > 0 {
log.Debug().
Str("guest", name).
Str("status", status).
Int("alertsCleared", alertsCleared).
Msg("Cleared metric alerts for non-running guest")
}
return
}
// If guest is running, clear any powered-off alert
m.clearGuestPoweredOffAlert(guestID, name)
// Get thresholds (check custom rules, then overrides, then defaults)
m.mu.RLock()
thresholds := m.getGuestThresholds(guest, guestID)
m.mu.RUnlock()
if settings.Relaxed {
thresholds = applyRelaxedGuestThresholds(thresholds)
log.Info().
Str("guest", name).
Float64("trigger", thresholds.CPU.Trigger).
Msg("Applied relaxed thresholds for pulse-relaxed tag")
}
// If alerts are disabled for this guest, clear any existing alerts and return
if thresholds.Disabled {
m.mu.Lock()
for alertID, alert := range m.activeAlerts {
if alert.ResourceID == guestID {
m.clearAlertNoLock(alertID)
log.Info().
Str("alertID", alertID).
Str("guest", name).
Msg("Cleared alert - guest has alerts disabled")
}
}
m.mu.Unlock()
return
}
// Check each metric
log.Debug().
Str("guest", name).
Float64("cpu", cpu).
Float64("memory", memUsage).
Float64("disk", diskUsage).
Interface("thresholds", thresholds).
Msg("Checking guest thresholds")
// Check thresholds (checkMetric will skip if threshold is nil or <= 0)
cpuOpts := &metricOptions{MonitorOnly: monitorOnly}
memOpts := &metricOptions{MonitorOnly: monitorOnly}
diskOpts := &metricOptions{MonitorOnly: monitorOnly}
if !monitorOnly {
cpuOpts = nil
memOpts = nil
diskOpts = nil
}
m.checkMetric(guestID, name, node, instanceName, guestType, "cpu", cpu, thresholds.CPU, cpuOpts)
m.checkMetric(guestID, name, node, instanceName, guestType, "memory", memUsage, thresholds.Memory, memOpts)
m.checkMetric(guestID, name, node, instanceName, guestType, "disk", diskUsage, thresholds.Disk, diskOpts)
if thresholds.Disk != nil && thresholds.Disk.Trigger > 0 && len(disks) > 0 {
seenDisks := make(map[string]struct{})
for idx, disk := range disks {
if disk.Total <= 0 {
continue
}
if disk.Usage < 0 {
continue
}
label := strings.TrimSpace(disk.Mountpoint)
if label == "" {
label = strings.TrimSpace(disk.Device)
}
if label == "" {
label = fmt.Sprintf("Disk %d", idx+1)
}
keySource := label
if disk.Device != "" && !strings.EqualFold(disk.Device, label) {
keySource = fmt.Sprintf("%s-%s", label, disk.Device)
}
sanitizedKey := sanitizeAlertKey(keySource)
if sanitizedKey == "" {
sanitizedKey = fmt.Sprintf("disk-%d", idx+1)
}
// Avoid duplicate checks if two disks resolve to the same key
if _, exists := seenDisks[sanitizedKey]; exists {
continue
}
seenDisks[sanitizedKey] = struct{}{}
perDiskResourceID := fmt.Sprintf("%s-disk-%s", guestID, sanitizedKey)
message := fmt.Sprintf("%s disk (%s) at %.1f%%", guestType, label, disk.Usage)
log.Debug().
Str("guest", name).
Str("node", node).
Str("instance", instanceName).
Str("diskLabel", label).
Float64("usage", disk.Usage).
Msg("Evaluating individual disk for alert thresholds")
metadata := map[string]interface{}{
"mountpoint": disk.Mountpoint,
"device": disk.Device,
"diskType": disk.Type,
"totalBytes": disk.Total,
"usedBytes": disk.Used,
"freeBytes": disk.Free,
"diskIndex": idx,
"label": label,
}
m.checkMetric(perDiskResourceID, name, node, instanceName, guestType, "disk", disk.Usage, thresholds.Disk, &metricOptions{
Metadata: metadata,
Message: message,
MonitorOnly: monitorOnly,
})
}
}
// Check I/O metrics (convert bytes/s to MB/s).
// checkMetric is called unconditionally: when a threshold is nil or disabled (Trigger <= 0),
// it clears any existing alert for that metric instead of raising a new one.
{
readOpts := &metricOptions{MonitorOnly: monitorOnly}
if !monitorOnly {
readOpts = nil
}
m.checkMetric(guestID, name, node, instanceName, guestType, "diskRead", float64(diskRead)/1024/1024, thresholds.DiskRead, readOpts)
}
{
writeOpts := &metricOptions{MonitorOnly: monitorOnly}
if !monitorOnly {
writeOpts = nil
}
m.checkMetric(guestID, name, node, instanceName, guestType, "diskWrite", float64(diskWrite)/1024/1024, thresholds.DiskWrite, writeOpts)
}
{
netInOpts := &metricOptions{MonitorOnly: monitorOnly}
if !monitorOnly {
netInOpts = nil
}
m.checkMetric(guestID, name, node, instanceName, guestType, "networkIn", float64(netIn)/1024/1024, thresholds.NetworkIn, netInOpts)
}
{
netOutOpts := &metricOptions{MonitorOnly: monitorOnly}
if !monitorOnly {
netOutOpts = nil
}
m.checkMetric(guestID, name, node, instanceName, guestType, "networkOut", float64(netOut)/1024/1024, thresholds.NetworkOut, netOutOpts)
}
}
// CheckNode checks a node against thresholds
func (m *Manager) CheckNode(node models.Node) {
// Cache display name so all alerts (including guest alerts on this node) can resolve it.
m.UpdateNodeDisplayName(node.Instance, node.Name, node.DisplayName)
m.mu.RLock()
if !m.config.Enabled {
m.mu.RUnlock()
return
}
if m.config.DisableAllNodes {
m.mu.RUnlock()
// Clear any existing node alerts when all node alerts are disabled
m.mu.Lock()
// Clear offline tracking
delete(m.nodeOfflineCount, node.ID)
// Clear all possible node alert types
alertTypes := []string{"cpu", "memory", "disk", "temperature"}
for _, alertType := range alertTypes {
alertID := fmt.Sprintf("%s-%s", node.ID, alertType)
if _, exists := m.activeAlerts[alertID]; exists {
m.clearAlertNoLock(alertID)
log.Info().
Str("alertID", alertID).
Str("node", node.Name).
Msg("Cleared node alert - all node alerts disabled")
}
}
// Clear offline alert
offlineAlertID := fmt.Sprintf("node-offline-%s", node.ID)
if _, exists := m.activeAlerts[offlineAlertID]; exists {
m.clearAlertNoLock(offlineAlertID)
log.Info().
Str("alertID", offlineAlertID).
Str("node", node.Name).
Msg("Cleared offline alert - all node alerts disabled")
}
m.mu.Unlock()
return
}
disableNodesOffline := m.config.DisableAllNodesOffline
thresholds := m.config.NodeDefaults
if override, exists := m.config.Overrides[node.ID]; exists {
thresholds = m.applyThresholdOverride(thresholds, override)
}
m.mu.RUnlock()
if disableNodesOffline {
// Clear tracking and any existing offline alerts when globally disabled
m.mu.Lock()
delete(m.nodeOfflineCount, node.ID)
m.mu.Unlock()
m.clearAlert(fmt.Sprintf("node-offline-%s", node.ID))
} else {
// CRITICAL: Check if node is offline first
if node.Status == "offline" || node.ConnectionHealth == "error" || node.ConnectionHealth == "failed" {
m.checkNodeOffline(node)
// Clear resource alerts if node is offline/unreachable.
// This prevents stale alerts from persisting when we can't get new data.
metrics := []string{"cpu", "memory", "disk", "temperature"}
for _, metric := range metrics {
m.clearAlert(fmt.Sprintf("%s-%s", node.ID, metric))
}
} else {
// Clear any existing offline alert if node is back online
m.clearNodeOfflineAlert(node)
// Check each metric (only if node is online and reachable)
// Check for host agent deduplication: if a host agent is running on this node,
// prefer the host agent alerts and skip node metric alerts to avoid duplicates.
if m.hasHostAgentForNode(node.Name) {
log.Debug().
Str("node", node.Name).
Msg("Skipping node metric alerts - host agent is monitoring this machine")
} else {
m.checkMetric(node.ID, node.Name, node.Name, node.Instance, "Node", "cpu", node.CPU*100, thresholds.CPU, nil)
m.checkMetric(node.ID, node.Name, node.Name, node.Instance, "Node", "memory", node.Memory.Usage, thresholds.Memory, nil)
m.checkMetric(node.ID, node.Name, node.Name, node.Instance, "Node", "disk", node.Disk.Usage, thresholds.Disk, nil)
// Check temperature if available
// We pass the check unconditionally so that if the threshold triggers are disabled (set to 0),
// any existing alerts will be properly cleared.
var temp float64
if node.Temperature != nil && node.Temperature.Available {
// Use CPU package temp if available, otherwise use max core temp
temp = node.Temperature.CPUPackage
if temp == 0 {
temp = node.Temperature.CPUMax
}
}
m.checkMetric(node.ID, node.Name, node.Name, node.Instance, "Node", "temperature", temp, thresholds.Temperature, nil)
}
}
}
}
// RegisterHostAgentHostname registers a host agent hostname for deduplication.
// When a host agent is actively monitoring a machine, we prefer its alerts
// over Proxmox node alerts to avoid duplicate monitoring of the same machine.
func (m *Manager) RegisterHostAgentHostname(hostname string) {
normalized := strings.ToLower(strings.TrimSpace(hostname))
if normalized == "" {
return
}
m.mu.Lock()
m.hostAgentHostnames[normalized] = struct{}{}
m.mu.Unlock()
log.Debug().
Str("hostname", hostname).
Msg("Registered host agent hostname for deduplication")
}
// UnregisterHostAgentHostname removes a host agent hostname from deduplication tracking.
func (m *Manager) UnregisterHostAgentHostname(hostname string) {
normalized := strings.ToLower(strings.TrimSpace(hostname))
if normalized == "" {
return
}
m.mu.Lock()
delete(m.hostAgentHostnames, normalized)
m.mu.Unlock()
log.Debug().
Str("hostname", hostname).
Msg("Unregistered host agent hostname from deduplication")
}
// hasHostAgentForNode checks if a host agent is monitoring a machine with the same
// hostname as the given Proxmox node. If so, we should suppress node alerts to
// avoid duplicate alerting.
func (m *Manager) hasHostAgentForNode(nodeName string) bool {
normalized := strings.ToLower(strings.TrimSpace(nodeName))
if normalized == "" {
return false
}
m.mu.RLock()
_, exists := m.hostAgentHostnames[normalized]
m.mu.RUnlock()
return exists
}
func nodeDisplayNameCacheKey(instance, name string) string {
return strings.TrimSpace(instance) + "\x00" + strings.TrimSpace(name)
}
// UpdateNodeDisplayName caches the display name for a node/host so alerts
// can resolve it without needing the full model object.
func (m *Manager) UpdateNodeDisplayName(instance, name, displayName string) {
instance = strings.TrimSpace(instance)
name = strings.TrimSpace(name)
if name == "" {
return
}
displayName = strings.TrimSpace(displayName)
m.mu.Lock()
if instance != "" {
key := nodeDisplayNameCacheKey(instance, name)
if displayName != "" && displayName != name {
m.instanceNodeDisplayNames[key] = displayName
} else {
delete(m.instanceNodeDisplayNames, key)
}
} else {
if displayName != "" && displayName != name {
m.nodeDisplayNames[name] = displayName
} else {
delete(m.nodeDisplayNames, name)
}
}
m.mu.Unlock()
}
// resolveNodeDisplayName returns the cached display name for a node, or empty
// string if none is set. Caller must hold m.mu (read or write).
func (m *Manager) resolveNodeDisplayName(instance, node string) string {
if instance = strings.TrimSpace(instance); instance != "" {
if displayName, ok := m.instanceNodeDisplayNames[nodeDisplayNameCacheKey(instance, node)]; ok {
return displayName
}
}
return m.nodeDisplayNames[node]
}
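// hostResourceID namespaces a host agent ID so it cannot collide with node or guest IDs,
// e.g. hostResourceID("abc123") == "host:abc123"; blank input yields "host:unknown".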
func hostResourceID(hostID string) string {
trimmed := strings.TrimSpace(hostID)
if trimmed == "" {
return "host:unknown"
}
return fmt.Sprintf("host:%s", trimmed)
}
func hostDisplayName(host models.Host) string {
if name := strings.TrimSpace(host.DisplayName); name != "" {
return name
}
if name := strings.TrimSpace(host.Hostname); name != "" {
return name
}
if host.ID != "" {
return host.ID
}
return "Host"
}
func hostInstanceName(host models.Host) string {
if platform := strings.TrimSpace(host.Platform); platform != "" {
return platform
}
if osName := strings.TrimSpace(host.OSName); osName != "" {
return osName
}
return "Host Agent"
}
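// sanitizeHostComponent lowercases a value and reduces it to runs of [a-z0-9] separated by
// single hyphens, e.g. (illustratively) "/dev/sda1 (Data)" becomes "dev-sda1-data";
// blank or fully stripped input becomes "unknown".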
func sanitizeHostComponent(value string) string {
value = strings.TrimSpace(strings.ToLower(value))
if value == "" {
return "unknown"
}
var builder strings.Builder
lastHyphen := false
for _, r := range value {
switch {
case r >= 'a' && r <= 'z':
builder.WriteRune(r)
lastHyphen = false
case r >= '0' && r <= '9':
builder.WriteRune(r)
lastHyphen = false
default:
if !lastHyphen {
builder.WriteRune('-')
lastHyphen = true
}
}
}
sanitized := strings.Trim(builder.String(), "-")
if sanitized == "" {
return "unknown"
}
return sanitized
}
// sanitizeRAIDDevice sanitizes RAID device names for use in resource IDs.
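// For example, "/dev/md0" and "md0" both sanitize to "md0".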
func sanitizeRAIDDevice(device string) string {
// Remove /dev/ prefix if present
device = strings.TrimPrefix(device, "/dev/")
return sanitizeHostComponent(device)
}
func hostMatchesVendorHint(host models.Host, hints ...string) bool {
fields := []string{
host.Platform,
host.OSName,
host.OSVersion,
host.DisplayName,
host.Hostname,
}
for _, field := range fields {
value := strings.ToLower(strings.TrimSpace(field))
if value == "" {
continue
}
for _, hint := range hints {
if strings.Contains(value, hint) {
return true
}
}
}
return false
}
func isSynologyLikeHost(host models.Host) bool {
return hostMatchesVendorHint(host, "synology", "dsm")
}
func isQNAPLikeHost(host models.Host) bool {
return hostMatchesVendorHint(host, "qnap", "qts", "quts")
}
func shouldSuppressHostRAIDArray(host models.Host, array models.HostRAIDArray) bool {
deviceLower := strings.ToLower(strings.TrimSpace(strings.TrimPrefix(array.Device, "/dev/")))
switch {
case deviceLower == "":
return false
case isSynologyLikeHost(host):
return deviceLower == "md0" || deviceLower == "md1"
case isQNAPLikeHost(host):
return deviceLower == "md9" || deviceLower == "md13"
default:
return false
}
}
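// hostDiskResourceID derives a stable per-disk resource ID plus display name from the disk's
// mountpoint (falling back to its device), e.g. (illustratively) a host with ID "abc" named "nas"
// and mountpoint "/volume1" yields ("host:abc/disk:volume1", "nas (/volume1)").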
func hostDiskResourceID(host models.Host, disk models.Disk) (string, string) {
label := strings.TrimSpace(disk.Mountpoint)
if label == "" {
label = strings.TrimSpace(disk.Device)
}
if label == "" {
label = "disk"
}
resourceID := fmt.Sprintf("%s/disk:%s", hostResourceID(host.ID), sanitizeHostComponent(label))
resourceName := fmt.Sprintf("%s (%s)", hostDisplayName(host), label)
return resourceID, resourceName
}
// CheckHost evaluates host agent telemetry for alerts.
func (m *Manager) CheckHost(host models.Host) {
if host.ID == "" {
return
}
// Register this host agent hostname for deduplication with Proxmox nodes.
// This prevents duplicate alerts when both a Node and Host agent monitor the same machine.
if host.Hostname != "" {
m.RegisterHostAgentHostname(host.Hostname)
}
// Cache display name so host alerts show the user-configured name.
m.UpdateNodeDisplayName("", host.Hostname, host.DisplayName)
// Fresh telemetry marks the host as online and clears offline tracking.
m.HandleHostOnline(host)
m.mu.RLock()
alertsEnabled := m.config.Enabled
disableAllHosts := m.config.DisableAllHosts
thresholds := m.config.HostDefaults
override, hasOverride := m.config.Overrides[host.ID]
m.mu.RUnlock()
if !alertsEnabled {
return
}
if disableAllHosts {
// Clear any existing host alerts when all host alerts are disabled
m.clearHostMetricAlerts(host.ID)
m.clearHostDiskAlerts(host.ID)
m.clearHostRAIDAlerts(host.ID)
return
}
if hasOverride {
thresholds = m.applyThresholdOverride(thresholds, override)
if thresholds.Disabled {
m.clearHostMetricAlerts(host.ID)
m.clearHostDiskAlerts(host.ID)
m.clearHostRAIDAlerts(host.ID)
return
}
}
resourceID := hostResourceID(host.ID)
resourceName := hostDisplayName(host)
nodeName := strings.TrimSpace(host.Hostname)
instanceName := hostInstanceName(host)
baseMetadata := map[string]interface{}{
"resourceType": "Host",
"hostId": host.ID,
"hostname": host.Hostname,
"displayName": host.DisplayName,
"platform": host.Platform,
"osName": host.OSName,
"osVersion": host.OSVersion,
"agentVersion": host.AgentVersion,
"architecture": host.Architecture,
}
if len(host.Tags) > 0 {
baseMetadata["tags"] = append([]string(nil), host.Tags...)
}
if thresholds.CPU != nil {
cpuMetadata := cloneMetadata(baseMetadata)
cpuMetadata["metric"] = "cpu"
cpuMetadata["cpuUsagePercent"] = host.CPUUsage
if host.CPUCount > 0 {
cpuMetadata["cpuCount"] = host.CPUCount
}
m.checkMetric(resourceID, resourceName, nodeName, instanceName, "Host", "cpu", host.CPUUsage, thresholds.CPU, &metricOptions{Metadata: cpuMetadata})
} else {
m.clearHostMetricAlerts(host.ID, "cpu")
}
if thresholds.Memory != nil {
memMetadata := cloneMetadata(baseMetadata)
memMetadata["metric"] = "memory"
memMetadata["memoryUsagePercent"] = host.Memory.Usage
if host.Memory.Total > 0 {
memMetadata["memoryTotalBytes"] = host.Memory.Total
memMetadata["memoryUsedBytes"] = host.Memory.Used
memMetadata["memoryFreeBytes"] = host.Memory.Free
}
m.checkMetric(resourceID, resourceName, nodeName, instanceName, "Host", "memory", host.Memory.Usage, thresholds.Memory, &metricOptions{Metadata: memMetadata})
} else {
m.clearHostMetricAlerts(host.ID, "memory")
}
if thresholds.DiskTemperature != nil && thresholds.DiskTemperature.Trigger > 0 {
if len(host.Sensors.SMART) > 0 {
for _, disk := range host.Sensors.SMART {
if disk.Temperature > 0 && !disk.Standby {
// Use a disk-specific resource ID of the form hostID/disk_temp:device
tempResourceID := fmt.Sprintf("%s/disk_temp:%s", hostResourceID(host.ID), sanitizeHostComponent(disk.Device))
tempResourceName := fmt.Sprintf("%s (%s Temp)", resourceName, disk.Device)
diskTempMetadata := cloneMetadata(baseMetadata)
diskTempMetadata["metric"] = "diskTemperature"
diskTempMetadata["device"] = disk.Device
diskTempMetadata["temperature"] = disk.Temperature
diskTempMetadata["model"] = disk.Model
m.checkMetric(tempResourceID, tempResourceName, nodeName, disk.Device, "Host", "diskTemperature", float64(disk.Temperature), thresholds.DiskTemperature, &metricOptions{Metadata: diskTempMetadata})
}
}
}
} else {
// We do not track per-device temperature alerts separately, so we cannot enumerate and clear
// them here. checkMetric auto-resolves an alert once its value drops; clearing existing alerts
// when the feature is disabled remains a known limitation of this simple implementation.
}
seenDisks := make(map[string]struct{}, len(host.Disks))
for _, disk := range host.Disks {
diskResourceID, diskName := hostDiskResourceID(host, disk)
seenDisks[diskResourceID] = struct{}{}
// Check for disk-specific override
m.mu.RLock()
diskOverride, hasDiskOverride := m.config.Overrides[diskResourceID]
m.mu.RUnlock()
// Determine the effective disk threshold
var effectiveDiskThreshold *HysteresisThreshold
if hasDiskOverride {
// If disk is disabled via override, skip alerting
if diskOverride.Disabled {
m.clearAlert(fmt.Sprintf("host-%s-disk-%s", host.ID, sanitizeHostComponent(disk.Mountpoint)))
continue
}
// Use disk-specific threshold if set
if diskOverride.Disk != nil {
effectiveDiskThreshold = ensureHysteresisThreshold(diskOverride.Disk)
} else if diskOverride.DiskLegacy != nil {
effectiveDiskThreshold = m.convertLegacyThreshold(diskOverride.DiskLegacy)
}
}
// Fall back to host-level threshold
if effectiveDiskThreshold == nil {
effectiveDiskThreshold = thresholds.Disk
}
// Skip if no threshold configured (nil)
// We DO NOT skip if Trigger <= 0 because we need to call checkMetric to clear any existing alerts.
if effectiveDiskThreshold == nil {
continue
}
diskMetadata := cloneMetadata(baseMetadata)
diskMetadata["metric"] = "disk"
diskMetadata["mountpoint"] = disk.Mountpoint
diskMetadata["device"] = disk.Device
diskMetadata["diskType"] = disk.Type
diskMetadata["diskUsagePercent"] = disk.Usage
if disk.Total > 0 {
diskMetadata["diskTotalBytes"] = disk.Total
diskMetadata["diskUsedBytes"] = disk.Used
diskMetadata["diskFreeBytes"] = disk.Free
}
m.checkMetric(diskResourceID, diskName, nodeName, instanceName, "Host Disk", "disk", disk.Usage, effectiveDiskThreshold, &metricOptions{Metadata: diskMetadata})
}
// Clear all disk alerts if host-level disk alerting is completely disabled and no disk-specific overrides
if thresholds.Disk == nil || thresholds.Disk.Trigger <= 0 {
// Only clear alerts for disks that don't have their own overrides
m.mu.RLock()
var disksToClear []string
for _, disk := range host.Disks {
diskResourceID, _ := hostDiskResourceID(host, disk)
if _, hasDiskOverride := m.config.Overrides[diskResourceID]; !hasDiskOverride {
disksToClear = append(disksToClear, fmt.Sprintf("%s-disk", diskResourceID))
}
}
m.mu.RUnlock()
for _, alertID := range disksToClear {
m.clearAlert(alertID)
}
}
m.cleanupHostDiskAlerts(host, seenDisks)
// Check RAID arrays for degraded or failed state
if len(host.RAID) > 0 {
for _, array := range host.RAID {
// Skip vendor-managed system arrays that are not user-facing storage pools.
// Synology uses md0/md1, while QNAP uses md9/md13 for internal OS volumes.
if shouldSuppressHostRAIDArray(host, array) {
// Still clear any existing alerts for these devices
alertID := fmt.Sprintf("host-%s-raid-%s", host.ID, sanitizeRAIDDevice(array.Device))
m.clearAlert(alertID)
continue
}
raidResourceID := fmt.Sprintf("host-%s-raid-%s", host.ID, sanitizeRAIDDevice(array.Device))
raidName := fmt.Sprintf("%s - %s (%s)", resourceName, array.Device, array.Level)
raidMetadata := cloneMetadata(baseMetadata)
raidMetadata["metric"] = "raid"
raidMetadata["raidDevice"] = array.Device
raidMetadata["raidLevel"] = array.Level
raidMetadata["raidState"] = array.State
raidMetadata["raidTotalDevices"] = array.TotalDevices
raidMetadata["raidActiveDevices"] = array.ActiveDevices
raidMetadata["raidFailedDevices"] = array.FailedDevices
raidMetadata["raidSpareDevices"] = array.SpareDevices
if array.UUID != "" {
raidMetadata["raidUUID"] = array.UUID
}
if array.RebuildPercent > 0 {
raidMetadata["raidRebuildPercent"] = array.RebuildPercent
}
// Check for degraded or failed arrays
stateLower := strings.ToLower(array.State)
isDegraded := strings.Contains(stateLower, "degraded") || array.FailedDevices > 0
// A "check" state indicates data scrubbing (e.g., DSM scheduled scrub), not a rebuild.
// Only treat as rebuilding if state indicates actual recovery, not routine maintenance.
isChecking := strings.Contains(stateLower, "check")
isRebuilding := !isChecking && (strings.Contains(stateLower, "recover") ||
strings.Contains(stateLower, "resync") ||
(array.RebuildPercent > 0 && !strings.Contains(stateLower, "clean")))
alertID := fmt.Sprintf("host-%s-raid-%s", host.ID, sanitizeRAIDDevice(array.Device))
if isDegraded {
// Critical alert for degraded arrays
msg := fmt.Sprintf("RAID array %s is degraded", array.Device)
if array.FailedDevices > 0 {
msg = fmt.Sprintf("RAID array %s has %d failed device(s)", array.Device, array.FailedDevices)
}
m.mu.Lock()
if _, exists := m.activeAlerts[alertID]; !exists {
alert := &Alert{
ID: alertID,
Type: "raid",
Level: AlertLevelCritical,
ResourceID: raidResourceID,
ResourceName: raidName,
Node: nodeName,
NodeDisplayName: m.resolveNodeDisplayName(instanceName, nodeName),
Instance: instanceName,
Message: msg,
Value: float64(array.FailedDevices),
Threshold: 0,
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: raidMetadata,
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
m.dispatchAlert(alert, false)
m.mu.Unlock()
log.Error().
Str("host", resourceName).
Str("hostID", host.ID).
Str("raidDevice", array.Device).
Str("raidLevel", array.Level).
Int("failedDevices", array.FailedDevices).
Msg("CRITICAL: RAID array degraded")
} else {
m.mu.Unlock()
}
} else if isRebuilding {
// Warning alert for rebuilding arrays
msg := fmt.Sprintf("RAID array %s is rebuilding", array.Device)
if array.RebuildPercent > 0 {
msg = fmt.Sprintf("RAID array %s is rebuilding (%.1f%% complete)", array.Device, array.RebuildPercent)
}
m.mu.Lock()
if _, exists := m.activeAlerts[alertID]; !exists {
alert := &Alert{
ID: alertID,
Type: "raid",
Level: AlertLevelWarning,
ResourceID: raidResourceID,
ResourceName: raidName,
Node: nodeName,
NodeDisplayName: m.resolveNodeDisplayName(instanceName, nodeName),
Instance: instanceName,
Message: msg,
Value: array.RebuildPercent,
Threshold: 100,
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: raidMetadata,
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
m.dispatchAlert(alert, false)
m.mu.Unlock()
log.Warn().
Str("host", resourceName).
Str("hostID", host.ID).
Str("raidDevice", array.Device).
Str("raidLevel", array.Level).
Float64("rebuildPercent", array.RebuildPercent).
Msg("WARNING: RAID array rebuilding")
} else {
m.mu.Unlock()
}
} else {
// Array is healthy, clear any existing alerts
m.clearAlert(alertID)
}
}
}
}
// HandleHostOnline clears offline tracking and alerts for a host agent.
func (m *Manager) HandleHostOnline(host models.Host) {
if host.ID == "" {
return
}
alertID := fmt.Sprintf("host-offline-%s", host.ID)
resourceKey := hostResourceID(host.ID)
m.mu.Lock()
delete(m.offlineConfirmations, resourceKey)
_, exists := m.activeAlerts[alertID]
m.mu.Unlock()
if exists {
m.clearAlert(alertID)
}
}
// HandleHostRemoved clears alerts and tracking when a host agent is deleted.
func (m *Manager) HandleHostRemoved(host models.Host) {
if host.ID == "" {
return
}
// Unregister the host agent hostname since it's being removed.
if host.Hostname != "" {
m.UnregisterHostAgentHostname(host.Hostname)
}
m.HandleHostOnline(host)
m.clearHostMetricAlerts(host.ID)
m.clearHostDiskAlerts(host.ID)
m.clearHostRAIDAlerts(host.ID)
}
// HandleHostOffline raises an alert when a host agent stops reporting.
func (m *Manager) HandleHostOffline(host models.Host) {
if host.ID == "" {
return
}
// Unregister the host agent hostname since it's no longer actively monitoring.
// This allows node alerts to resume if a Proxmox node with the same hostname exists.
if host.Hostname != "" {
m.UnregisterHostAgentHostname(host.Hostname)
}
m.mu.RLock()
if !m.config.Enabled {
m.mu.RUnlock()
return
}
disableHostsOffline := m.config.DisableAllHostsOffline
m.mu.RUnlock()
alertID := fmt.Sprintf("host-offline-%s", host.ID)
resourceKey := hostResourceID(host.ID)
resourceName := hostDisplayName(host)
nodeName := strings.TrimSpace(host.Hostname)
instanceName := hostInstanceName(host)
if disableHostsOffline {
m.mu.Lock()
delete(m.offlineConfirmations, resourceKey)
m.mu.Unlock()
m.clearAlert(alertID)
return
}
var disableConnectivity bool
m.mu.RLock()
if override, exists := m.config.Overrides[host.ID]; exists {
disableConnectivity = override.DisableConnectivity || override.Disabled
}
m.mu.RUnlock()
if disableConnectivity {
m.clearAlert(alertID)
m.mu.Lock()
delete(m.offlineConfirmations, resourceKey)
m.mu.Unlock()
return
}
m.mu.Lock()
if alert, exists := m.activeAlerts[alertID]; exists && alert != nil {
alert.LastSeen = time.Now()
m.activeAlerts[alertID] = alert
m.mu.Unlock()
return
}
m.offlineConfirmations[resourceKey]++
const requiredConfirmations = RequiredOfflineConfirmations
if confirmations := m.offlineConfirmations[resourceKey]; confirmations < requiredConfirmations {
m.mu.Unlock()
log.Debug().
Str("host", resourceName).
Str("hostID", host.ID).
Int("confirmations", confirmations).
Int("required", requiredConfirmations).
Msg("Host agent appears offline, awaiting confirmation")
return
}
// Host is confirmed offline. Clear all resource metrics (CPU/Memory/Disk/RAID)
// before raising the offline alert, to avoid stale alerts persisting.
{
// Basic metrics
metricTypes := []string{"cpu", "memory"}
for _, mt := range metricTypes {
m.clearAlertNoLock(fmt.Sprintf("%s-%s", resourceKey, mt))
}
// Disks and RAID
// Note: Disks use ResourceID prefix, RAID uses AlertID prefix
diskResourcePrefix := fmt.Sprintf("%s/disk:", resourceKey)
raidAlertPrefix := fmt.Sprintf("host-%s-raid-", host.ID)
// Collect alert IDs first, then clear (avoids modifying map during iteration)
var alertsToClear []string
for alertID, a := range m.activeAlerts {
if a == nil {
continue
}
if strings.HasPrefix(a.ResourceID, diskResourcePrefix) {
alertsToClear = append(alertsToClear, alertID)
} else if strings.HasPrefix(alertID, raidAlertPrefix) {
alertsToClear = append(alertsToClear, alertID)
}
}
for _, alertID := range alertsToClear {
m.clearAlertNoLock(alertID)
}
}
alert := &Alert{
ID: alertID,
Type: "host-offline",
Level: AlertLevelCritical,
ResourceID: resourceKey,
ResourceName: resourceName,
Node: nodeName,
Instance: instanceName,
Message: fmt.Sprintf("Host '%s' is offline", resourceName),
Value: 0,
Threshold: 0,
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"resourceType": "Host",
"hostId": host.ID,
"hostname": host.Hostname,
"displayName": host.DisplayName,
"platform": host.Platform,
"osName": host.OSName,
"osVersion": host.OSVersion,
},
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
if !m.checkRateLimit(alertID) {
m.mu.Unlock()
log.Debug().
Str("alertID", alertID).
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
Msg("Host offline alert suppressed due to rate limit")
return
}
m.dispatchAlert(alert, false)
m.mu.Unlock()
log.Error().
Str("host", resourceName).
Str("hostID", host.ID).
Str("hostname", host.Hostname).
Msg("CRITICAL: Host agent is offline")
}
func (m *Manager) clearHostMetricAlerts(hostID string, metrics ...string) {
if hostID == "" {
return
}
resourceID := hostResourceID(hostID)
if len(metrics) == 0 {
metrics = []string{"cpu", "memory"}
}
for _, metric := range metrics {
m.clearAlert(fmt.Sprintf("%s-%s", resourceID, metric))
}
}
func (m *Manager) clearHostDiskAlerts(hostID string) {
if hostID == "" {
return
}
prefix := fmt.Sprintf("%s/disk:", hostResourceID(hostID))
m.mu.Lock()
defer m.mu.Unlock()
for alertID, alert := range m.activeAlerts {
if alert == nil {
continue
}
if !strings.HasPrefix(alert.ResourceID, prefix) {
continue
}
m.clearAlertNoLock(alertID)
}
}
func (m *Manager) cleanupHostDiskAlerts(host models.Host, seen map[string]struct{}) {
if host.ID == "" {
return
}
prefix := fmt.Sprintf("%s/disk:", hostResourceID(host.ID))
m.mu.Lock()
defer m.mu.Unlock()
for alertID, alert := range m.activeAlerts {
if alert == nil {
continue
}
if !strings.HasPrefix(alert.ResourceID, prefix) {
continue
}
if _, exists := seen[alert.ResourceID]; exists {
continue
}
m.clearAlertNoLock(alertID)
}
}
func (m *Manager) clearHostRAIDAlerts(hostID string) {
if hostID == "" {
return
}
prefix := fmt.Sprintf("host-%s-raid-", hostID)
m.mu.Lock()
defer m.mu.Unlock()
for alertID := range m.activeAlerts {
if strings.HasPrefix(alertID, prefix) {
m.clearAlertNoLock(alertID)
}
}
}
// CheckPBS checks PBS instance metrics against thresholds
func (m *Manager) CheckPBS(pbs models.PBSInstance) {
m.mu.RLock()
if !m.config.Enabled {
m.mu.RUnlock()
return
}
if m.config.DisableAllPBS {
m.mu.RUnlock()
// Clear any existing PBS alerts when all PBS alerts are disabled
m.mu.Lock()
// Reset offline confirmation tracking
delete(m.offlineConfirmations, pbs.ID)
// Clear CPU alert
cpuAlertID := fmt.Sprintf("%s-cpu", pbs.ID)
if _, exists := m.activeAlerts[cpuAlertID]; exists {
m.clearAlertNoLock(cpuAlertID)
log.Info().
Str("alertID", cpuAlertID).
Str("pbs", pbs.Name).
Msg("Cleared CPU alert - all PBS alerts disabled")
}
// Clear Memory alert
memAlertID := fmt.Sprintf("%s-memory", pbs.ID)
if _, exists := m.activeAlerts[memAlertID]; exists {
m.clearAlertNoLock(memAlertID)
log.Info().
Str("alertID", memAlertID).
Str("pbs", pbs.Name).
Msg("Cleared Memory alert - all PBS alerts disabled")
}
// Clear offline alert
offlineAlertID := fmt.Sprintf("pbs-offline-%s", pbs.ID)
if _, exists := m.activeAlerts[offlineAlertID]; exists {
m.clearAlertNoLock(offlineAlertID)
log.Info().
Str("alertID", offlineAlertID).
Str("pbs", pbs.Name).
Msg("Cleared offline alert - all PBS alerts disabled")
}
m.mu.Unlock()
return
}
// Check if there's an override for this PBS instance
override, hasOverride := m.config.Overrides[pbs.ID]
// Use PBS defaults (CPU, Memory)
cpuThreshold := m.config.PBSDefaults.CPU
memoryThreshold := m.config.PBSDefaults.Memory
disablePBSOffline := m.config.DisableAllPBSOffline
m.mu.RUnlock()
// Check override disable BEFORE offline detection to prevent spurious notifications
if hasOverride && override.Disabled {
m.mu.Lock()
// Reset offline confirmation tracking
delete(m.offlineConfirmations, pbs.ID)
// Clear CPU alert
cpuAlertID := fmt.Sprintf("%s-cpu", pbs.ID)
if _, exists := m.activeAlerts[cpuAlertID]; exists {
m.clearAlertNoLock(cpuAlertID)
log.Debug().
Str("alertID", cpuAlertID).
Str("pbs", pbs.Name).
Msg("Cleared CPU alert - PBS has alerts disabled")
}
// Clear Memory alert
memAlertID := fmt.Sprintf("%s-memory", pbs.ID)
if _, exists := m.activeAlerts[memAlertID]; exists {
m.clearAlertNoLock(memAlertID)
log.Debug().
Str("alertID", memAlertID).
Str("pbs", pbs.Name).
Msg("Cleared Memory alert - PBS has alerts disabled")
}
// Clear offline alert
offlineAlertID := fmt.Sprintf("pbs-offline-%s", pbs.ID)
if _, exists := m.activeAlerts[offlineAlertID]; exists {
m.clearAlertNoLock(offlineAlertID)
log.Debug().
Str("alertID", offlineAlertID).
Str("pbs", pbs.Name).
Msg("Cleared offline alert - PBS has alerts disabled")
}
m.mu.Unlock()
return
}
if disablePBSOffline {
// Clear tracking and any existing offline alerts when globally disabled
m.mu.Lock()
delete(m.offlineConfirmations, pbs.ID)
m.mu.Unlock()
m.clearAlert(fmt.Sprintf("pbs-offline-%s", pbs.ID))
} else {
// Check if PBS is offline first (similar to nodes)
if pbs.Status == "offline" || pbs.ConnectionHealth == "error" || pbs.ConnectionHealth == "unhealthy" {
m.checkPBSOffline(pbs)
} else {
// Clear any existing offline alert if PBS is back online
m.clearPBSOfflineAlert(pbs)
}
}
// Check if there are custom thresholds for this PBS instance
if hasOverride {
if override.CPU != nil {
cpuThreshold = override.CPU
}
if override.Memory != nil {
memoryThreshold = override.Memory
}
}
// Check metrics only if PBS is online - checkMetric will skip if threshold is nil or <= 0
if pbs.Status != "offline" {
// PBS CPU is already a percentage
m.checkMetric(pbs.ID, pbs.Name, pbs.Host, pbs.Name, "PBS", "cpu", pbs.CPU, cpuThreshold, nil)
// PBS Memory is already a percentage
m.checkMetric(pbs.ID, pbs.Name, pbs.Host, pbs.Name, "PBS", "memory", pbs.Memory, memoryThreshold, nil)
}
}
// CheckPMG checks a Proxmox Mail Gateway instance against thresholds
func (m *Manager) CheckPMG(pmg models.PMGInstance) {
m.mu.RLock()
if !m.config.Enabled {
m.mu.RUnlock()
return
}
if m.config.DisableAllPMG {
m.mu.RUnlock()
// Clear any existing PMG alerts when all PMG alerts are disabled
m.mu.Lock()
// Reset offline confirmation tracking
delete(m.offlineConfirmations, pmg.ID)
// Clear all possible PMG alert types
alertTypes := []string{"queue-total", "queue-deferred", "queue-hold", "oldest-message"}
for _, alertType := range alertTypes {
alertID := fmt.Sprintf("%s-%s", pmg.ID, alertType)
if _, exists := m.activeAlerts[alertID]; exists {
m.clearAlertNoLock(alertID)
log.Info().
Str("alertID", alertID).
Str("pmg", pmg.Name).
Msg("Cleared PMG alert - all PMG alerts disabled")
}
}
// Clear offline alert
offlineAlertID := fmt.Sprintf("pmg-offline-%s", pmg.ID)
if _, exists := m.activeAlerts[offlineAlertID]; exists {
m.clearAlertNoLock(offlineAlertID)
log.Info().
Str("alertID", offlineAlertID).
Str("pmg", pmg.Name).
Msg("Cleared offline alert - all PMG alerts disabled")
}
m.mu.Unlock()
return
}
// Check if there's an override for this PMG instance
override, hasOverride := m.config.Overrides[pmg.ID]
disablePMGOffline := m.config.DisableAllPMGOffline
pmgDefaults := m.config.PMGDefaults
m.mu.RUnlock()
// Check override disable BEFORE offline detection to prevent spurious notifications
if hasOverride && override.Disabled {
m.mu.Lock()
// Reset offline confirmation tracking
delete(m.offlineConfirmations, pmg.ID)
// Clear all possible PMG alert types
alertTypes := []string{"queue-total", "queue-deferred", "queue-hold", "oldest-message"}
for _, alertType := range alertTypes {
alertID := fmt.Sprintf("%s-%s", pmg.ID, alertType)
if _, exists := m.activeAlerts[alertID]; exists {
m.clearAlertNoLock(alertID)
log.Debug().
Str("alertID", alertID).
Str("pmg", pmg.Name).
Msg("Cleared PMG alert - PMG has alerts disabled")
}
}
// Clear offline alert
offlineAlertID := fmt.Sprintf("pmg-offline-%s", pmg.ID)
if _, exists := m.activeAlerts[offlineAlertID]; exists {
m.clearAlertNoLock(offlineAlertID)
log.Debug().
Str("alertID", offlineAlertID).
Str("pmg", pmg.Name).
Msg("Cleared offline alert - PMG has alerts disabled")
}
m.mu.Unlock()
return
}
// Handle offline detection
if disablePMGOffline {
// Clear tracking and any existing offline alerts when globally disabled
m.mu.Lock()
delete(m.offlineConfirmations, pmg.ID)
m.mu.Unlock()
m.clearAlert(fmt.Sprintf("pmg-offline-%s", pmg.ID))
} else {
// Check if PMG is offline (similar to PBS/nodes)
if pmg.Status == "offline" || pmg.ConnectionHealth == "error" || pmg.ConnectionHealth == "unhealthy" {
m.checkPMGOffline(pmg)
} else {
// Clear any existing offline alert if PMG is back online
m.clearPMGOfflineAlert(pmg)
}
}
// Check metrics only if PMG is online
if pmg.Status != "offline" {
// Check queue depths across all nodes
m.checkPMGQueueDepths(pmg, pmgDefaults)
// Check oldest message age across all nodes
m.checkPMGOldestMessage(pmg, pmgDefaults)
// Check quarantine backlog and growth
m.checkPMGQuarantineBacklog(pmg, pmgDefaults)
// Check spam/virus rate anomalies
m.checkPMGAnomalies(pmg, pmgDefaults)
// Check per-node queue health
m.checkPMGNodeQueues(pmg, pmgDefaults)
}
}
// dockerInstanceName returns the logical instance name used for Docker alerts.
func dockerInstanceName(host models.DockerHost) string {
name := strings.TrimSpace(host.DisplayName)
if name == "" {
name = strings.TrimSpace(host.Hostname)
}
if name == "" {
return "Docker"
}
return fmt.Sprintf("Docker:%s", name)
}
// dockerContainerDisplayName normalizes the container name for alert readability.
func dockerContainerDisplayName(container models.DockerContainer) string {
name := strings.TrimSpace(container.Name)
if strings.HasPrefix(name, "/") {
name = strings.TrimLeft(name, "/")
}
if name == "" {
id := strings.TrimSpace(container.ID)
if len(id) > 12 {
id = id[:12]
}
return id
}
return name
}
// dockerResourceID builds a stable identifier for Docker container alerts.
func dockerResourceID(hostID, containerID string) string {
hostID = strings.TrimSpace(hostID)
containerID = strings.TrimSpace(containerID)
if containerID == "" {
if hostID == "" {
return "docker:unknown"
}
return fmt.Sprintf("docker:%s", hostID)
}
if hostID == "" {
return fmt.Sprintf("docker:container/%s", containerID)
}
return fmt.Sprintf("docker:%s/%s", hostID, containerID)
}
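// normalizeDockerUpdateTrackingPart lowercases and trims an identity component used in update-tracking keys.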
func normalizeDockerUpdateTrackingPart(part string) string {
return strings.ToLower(strings.TrimSpace(part))
}
// dockerUpdateTrackingHostKey builds a stable host identity for Docker update timing.
func dockerUpdateTrackingHostKey(host models.DockerHost) string {
switch {
case normalizeDockerUpdateTrackingPart(host.AgentID) != "":
return "agent:" + normalizeDockerUpdateTrackingPart(host.AgentID)
case normalizeDockerUpdateTrackingPart(host.TokenID) != "":
return "token:" + normalizeDockerUpdateTrackingPart(host.TokenID)
case normalizeDockerUpdateTrackingPart(host.MachineID) != "":
return "machine:" + normalizeDockerUpdateTrackingPart(host.MachineID)
case normalizeDockerUpdateTrackingPart(host.Hostname) != "":
return "hostname:" + normalizeDockerUpdateTrackingPart(host.Hostname)
case normalizeDockerUpdateTrackingPart(host.ID) != "":
return "id:" + normalizeDockerUpdateTrackingPart(host.ID)
case normalizeDockerUpdateTrackingPart(host.DisplayName) != "":
return "name:" + normalizeDockerUpdateTrackingPart(host.DisplayName)
default:
return "unknown-host"
}
}
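// dockerUpdateTrackingContainerKey builds a stable container identity for Docker update timing.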
func dockerUpdateTrackingContainerKey(container models.DockerContainer) string {
if id := normalizeDockerUpdateTrackingPart(container.ID); id != "" {
return "id:" + id
}
name := normalizeDockerUpdateTrackingPart(container.Name)
name = strings.TrimPrefix(name, "/")
if name != "" {
return "name:" + name
}
if image := normalizeDockerUpdateTrackingPart(container.Image); image != "" {
return "image:" + image
}
return "unknown-container"
}
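// dockerUpdateTrackingKey combines host and container identities into a single update-tracking key.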
func dockerUpdateTrackingKey(host models.DockerHost, container models.DockerContainer) string {
return fmt.Sprintf("docker-update:%s/%s", dockerUpdateTrackingHostKey(host), dockerUpdateTrackingContainerKey(container))
}
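// dockerUpdateTrackingHostPrefix returns the update-tracking key prefix shared by all containers on a host.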
func dockerUpdateTrackingHostPrefix(host models.DockerHost) string {
return fmt.Sprintf("docker-update:%s/", dockerUpdateTrackingHostKey(host))
}
// dockerServiceDisplayName normalizes the service name for alert readability.
func dockerServiceDisplayName(service models.DockerService) string {
name := strings.TrimSpace(service.Name)
if name != "" {
return name
}
id := strings.TrimSpace(service.ID)
if len(id) > 12 {
id = id[:12]
}
if id == "" {
return "service"
}
return id
}
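// dockerServiceResourceID builds a stable identifier for Docker Swarm service alerts,
// falling back to a slug of the service name when no service ID is available.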
func dockerServiceResourceID(hostID, serviceID, serviceName string) string {
hostID = strings.TrimSpace(hostID)
id := strings.TrimSpace(serviceID)
if id == "" {
name := strings.TrimSpace(serviceName)
if name == "" {
name = "service"
}
builder := strings.Builder{}
for _, r := range strings.ToLower(name) {
switch {
case r >= 'a' && r <= 'z':
builder.WriteRune(r)
case r >= '0' && r <= '9':
builder.WriteRune(r)
case r == '-', r == '_':
builder.WriteRune(r)
case r == ' ' || r == '/' || r == '\\' || r == ':' || r == '.':
builder.WriteRune('-')
}
}
id = strings.Trim(builder.String(), "-_")
if id == "" {
id = "service"
}
if len(id) > 32 {
id = id[:32]
}
}
if hostID == "" {
return fmt.Sprintf("docker-service:%s", id)
}
return fmt.Sprintf("docker:%s/service/%s", hostID, id)
}
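// matchesDockerIgnoredPrefix reports whether a container name or ID starts with any configured ignored prefix.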
func matchesDockerIgnoredPrefix(name, id string, prefixes []string) bool {
if len(prefixes) == 0 {
return false
}
name = strings.ToLower(strings.TrimSpace(name))
id = strings.ToLower(strings.TrimSpace(id))
for _, raw := range prefixes {
prefix := strings.ToLower(strings.TrimSpace(raw))
if prefix == "" {
continue
}
if name != "" && strings.HasPrefix(name, prefix) {
return true
}
if id != "" && strings.HasPrefix(id, prefix) {
return true
}
}
return false
}
// CheckDockerHost evaluates Docker host telemetry and container metrics for alerts.
func (m *Manager) CheckDockerHost(host models.DockerHost) {
if host.ID == "" {
return
}
// Fresh telemetry marks the host as online and clears any offline alert.
m.HandleDockerHostOnline(host)
m.mu.RLock()
alertsEnabled := m.config.Enabled
disableAllHosts := m.config.DisableAllDockerHosts
ignoredPrefixes := append([]string(nil), m.config.DockerIgnoredContainerPrefixes...)
m.mu.RUnlock()
if !alertsEnabled {
return
}
if disableAllHosts {
return
}
seen := make(map[string]struct{}, len(host.Containers)+len(host.Services))
seenUpdateTracking := make(map[string]struct{}, len(host.Containers))
for _, container := range host.Containers {
containerName := dockerContainerDisplayName(container)
resourceID := dockerResourceID(host.ID, container.ID)
updateTrackingKey := dockerUpdateTrackingKey(host, container)
if matchesDockerIgnoredPrefix(containerName, container.ID, ignoredPrefixes) {
log.Debug().
Str("container", containerName).
Str("host", host.DisplayName).
Msg("Skipping Docker container alert evaluation due to ignored prefix")
m.clearDockerContainerStateAlert(resourceID)
m.clearDockerContainerHealthAlert(resourceID)
m.clearDockerContainerMetricAlerts(resourceID)
m.clearAlert(fmt.Sprintf("docker-container-restart-loop-%s", resourceID))
m.clearAlert(fmt.Sprintf("docker-container-oom-%s", resourceID))
m.clearAlert(fmt.Sprintf("docker-container-memory-limit-%s", resourceID))
m.mu.Lock()
delete(m.dockerRestartTracking, resourceID)
delete(m.dockerLastExitCode, resourceID)
m.mu.Unlock()
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
continue
}
seen[resourceID] = struct{}{}
seenUpdateTracking[updateTrackingKey] = struct{}{}
m.evaluateDockerContainer(host, container, resourceID)
}
for _, service := range host.Services {
resourceID := dockerServiceResourceID(host.ID, service.ID, service.Name)
seen[resourceID] = struct{}{}
m.evaluateDockerService(host, service, resourceID)
}
m.cleanupDockerContainerAlertsWithTracking(host, seen, seenUpdateTracking)
}
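// evaluateDockerContainer runs state, health, metric, and Docker-specific checks for a single container.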
func (m *Manager) evaluateDockerContainer(host models.DockerHost, container models.DockerContainer, resourceID string) {
m.mu.RLock()
disableAllContainers := m.config.DisableAllDockerContainers
m.mu.RUnlock()
if disableAllContainers {
return
}
containerName := dockerContainerDisplayName(container)
nodeName := strings.TrimSpace(host.Hostname)
instanceName := dockerInstanceName(host)
resourceType := "Docker Container"
m.mu.RLock()
overrideConfig, hasOverride := m.config.Overrides[resourceID]
m.mu.RUnlock()
if hasOverride && overrideConfig.Disabled {
// Alerts disabled via override; clear any existing alerts and skip evaluation.
m.clearDockerContainerStateAlert(resourceID)
m.clearDockerContainerHealthAlert(resourceID)
m.clearDockerContainerMetricAlerts(resourceID)
m.clearAlert(fmt.Sprintf("docker-container-update-%s", resourceID))
m.clearDockerContainerUpdateTracking(resourceID, dockerUpdateTrackingKey(host, container))
return
}
state := strings.ToLower(strings.TrimSpace(container.State))
if state == "" {
state = strings.ToLower(strings.TrimSpace(container.Status))
}
if state != "running" {
m.checkDockerContainerState(host, container, resourceID, containerName, instanceName, nodeName)
m.clearDockerContainerMetricAlerts(resourceID, "cpu", "memory", "disk")
} else {
m.clearDockerContainerStateAlert(resourceID)
// Use Docker-specific defaults for containers; copy the defaults under RLock so the
// pointers reference a stable snapshot instead of racing concurrent config updates
m.mu.RLock()
dockerDefaults := m.config.DockerDefaults
m.mu.RUnlock()
thresholds := ThresholdConfig{
CPU: &dockerDefaults.CPU,
Memory: &dockerDefaults.Memory,
Disk: &dockerDefaults.Disk,
}
if hasOverride {
thresholds = m.applyThresholdOverride(thresholds, overrideConfig)
}
if thresholds.CPU != nil {
cpuMetadata := map[string]interface{}{
"resourceType": resourceType,
"hostId": host.ID,
"hostName": host.DisplayName,
"hostHostname": host.Hostname,
"containerId": container.ID,
"containerName": containerName,
"image": container.Image,
"state": container.State,
"status": container.Status,
"restartCount": container.RestartCount,
"metric": "cpu",
"cpuPercent": container.CPUPercent,
}
m.checkMetric(resourceID, containerName, nodeName, instanceName, resourceType, "cpu", container.CPUPercent, thresholds.CPU, &metricOptions{Metadata: cpuMetadata})
}
if thresholds.Memory != nil {
memMetadata := map[string]interface{}{
"resourceType": resourceType,
"hostId": host.ID,
"hostName": host.DisplayName,
"hostHostname": host.Hostname,
"containerId": container.ID,
"containerName": containerName,
"image": container.Image,
"state": container.State,
"status": container.Status,
"restartCount": container.RestartCount,
"metric": "memory",
"memoryPercent": container.MemoryPercent,
"memoryUsageBytes": container.MemoryUsage,
}
if container.MemoryLimit > 0 {
memMetadata["memoryLimitBytes"] = container.MemoryLimit
}
m.checkMetric(resourceID, containerName, nodeName, instanceName, resourceType, "memory", container.MemoryPercent, thresholds.Memory, &metricOptions{Metadata: memMetadata})
}
if thresholds.Disk != nil {
totalBytes := container.RootFilesystemBytes
usedBytes := container.WritableLayerBytes
if totalBytes > 0 && usedBytes >= 0 {
diskPercent := (float64(usedBytes) / float64(totalBytes)) * 100
diskMetadata := map[string]interface{}{
"resourceType": resourceType,
"hostId": host.ID,
"hostName": host.DisplayName,
"hostHostname": host.Hostname,
"containerId": container.ID,
"containerName": containerName,
"image": container.Image,
"state": container.State,
"status": container.Status,
"restartCount": container.RestartCount,
"metric": "disk",
"diskPercent": diskPercent,
"writableLayerBytes": usedBytes,
"rootFilesystemBytes": totalBytes,
"mountCount": len(container.Mounts),
}
if container.BlockIO != nil {
diskMetadata["blockIoReadBytes"] = container.BlockIO.ReadBytes
diskMetadata["blockIoWriteBytes"] = container.BlockIO.WriteBytes
}
m.checkMetric(resourceID, containerName, nodeName, instanceName, resourceType, "disk", diskPercent, thresholds.Disk, &metricOptions{Metadata: diskMetadata})
} else {
m.clearDockerContainerMetricAlerts(resourceID, "disk")
}
}
}
m.checkDockerContainerHealth(host, container, resourceID, containerName, instanceName, nodeName)
// Docker-specific checks
m.checkDockerContainerRestartLoop(host, container, resourceID, containerName, instanceName, nodeName)
m.checkDockerContainerOOMKill(host, container, resourceID, containerName, instanceName, nodeName)
m.checkDockerContainerMemoryLimit(host, container, resourceID, containerName, instanceName, nodeName)
m.checkDockerContainerImageUpdate(host, container, resourceID, containerName, instanceName, nodeName)
}
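// evaluateDockerService raises or clears Swarm service alerts based on missing task counts and update status.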
func (m *Manager) evaluateDockerService(host models.DockerHost, service models.DockerService, resourceID string) {
m.mu.RLock()
disableAllServices := m.config.DisableAllDockerServices
warnPct := m.config.DockerDefaults.ServiceWarnGapPct
critPct := m.config.DockerDefaults.ServiceCritGapPct
overrideConfig, hasOverride := m.config.Overrides[resourceID]
m.mu.RUnlock()
if disableAllServices {
m.clearDockerServiceAlert(resourceID)
return
}
if hasOverride && overrideConfig.Disabled {
m.clearDockerServiceAlert(resourceID)
return
}
desired := service.DesiredTasks
running := service.RunningTasks
if desired <= 0 {
m.clearDockerServiceAlert(resourceID)
return
}
missing := desired - running
if missing < 0 {
missing = 0
}
percentMissing := 0.0
if desired > 0 {
percentMissing = (float64(missing) / float64(desired)) * 100.0
}
severity := AlertLevel("")
thresholdValue := 0.0
if critPct > 0 && percentMissing >= float64(critPct) {
severity = AlertLevelCritical
thresholdValue = float64(critPct)
} else if warnPct > 0 && percentMissing >= float64(warnPct) {
severity = AlertLevelWarning
thresholdValue = float64(warnPct)
}
updateState := ""
updateMessage := ""
if service.UpdateStatus != nil {
updateState = strings.ToLower(strings.TrimSpace(service.UpdateStatus.State))
updateMessage = strings.TrimSpace(service.UpdateStatus.Message)
if severity == "" {
switch updateState {
case "paused", "rollback_started", "rollback_paused":
severity = AlertLevelWarning
case "rollback_failed":
severity = AlertLevelCritical
}
}
}
if severity == "" {
m.clearDockerServiceAlert(resourceID)
return
}
serviceName := dockerServiceDisplayName(service)
instanceName := dockerInstanceName(host)
nodeName := strings.TrimSpace(host.Hostname)
message := ""
if missing > 0 {
message = fmt.Sprintf("Docker service '%s' is running %d of %d desired tasks", serviceName, service.RunningTasks, service.DesiredTasks)
} else if updateState != "" {
message = fmt.Sprintf("Docker service '%s' update state: %s", serviceName, service.UpdateStatus.State)
} else {
message = fmt.Sprintf("Docker service '%s' triggered a Swarm alert", serviceName)
}
if updateMessage != "" {
message = fmt.Sprintf("%s (%s)", message, updateMessage)
}
metadata := map[string]interface{}{
"resourceType": "Docker Service",
"hostId": host.ID,
"hostName": host.DisplayName,
"hostHostname": host.Hostname,
"serviceId": service.ID,
"serviceName": service.Name,
"stack": service.Stack,
"mode": service.Mode,
"desiredTasks": service.DesiredTasks,
"runningTasks": service.RunningTasks,
"completedTasks": service.CompletedTasks,
"missingTasks": missing,
"percentMissing": percentMissing,
}
if updateState != "" {
metadata["updateState"] = service.UpdateStatus.State
}
if updateMessage != "" {
metadata["updateMessage"] = updateMessage
}
if service.UpdateStatus != nil && service.UpdateStatus.CompletedAt != nil && !service.UpdateStatus.CompletedAt.IsZero() {
metadata["updateCompletedAt"] = service.UpdateStatus.CompletedAt.UTC()
}
alertID := fmt.Sprintf("docker-service-health-%s", resourceID)
alert := &Alert{
ID: alertID,
Type: "docker-service-health",
Level: severity,
ResourceID: resourceID,
ResourceName: serviceName,
Node: nodeName,
Instance: instanceName,
Message: message,
Value: percentMissing,
Threshold: thresholdValue,
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: metadata,
}
m.mu.Lock()
if existing, exists := m.activeAlerts[alertID]; exists && existing != nil {
escalatedToCritical := existing.Level != AlertLevelCritical && alert.Level == AlertLevelCritical
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
if escalatedToCritical {
m.historyManager.AddAlert(*alert)
if m.checkRateLimit(alertID) {
m.dispatchAlert(alert, true)
log.Warn().
Str("service", serviceName).
Str("host", host.DisplayName).
Float64("percentMissing", percentMissing).
Str("fromLevel", string(existing.Level)).
Str("toLevel", string(alert.Level)).
Msg("Docker service alert escalated")
} else {
log.Debug().
Str("alertID", alertID).
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
Msg("Docker service escalation notification suppressed due to rate limit")
}
}
m.mu.Unlock()
return
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
if !m.checkRateLimit(alertID) {
m.mu.Unlock()
log.Debug().
Str("alertID", alertID).
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
Msg("Docker service alert notification suppressed due to rate limit")
return
}
m.dispatchAlert(alert, true)
m.mu.Unlock()
log.Warn().
Str("service", serviceName).
Str("host", host.DisplayName).
Float64("percentMissing", percentMissing).
Msg("Docker service alert raised")
}
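// clearDockerServiceAlert clears the health alert for a Docker Swarm service.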
func (m *Manager) clearDockerServiceAlert(resourceID string) {
alertID := fmt.Sprintf("docker-service-health-%s", resourceID)
m.clearAlert(alertID)
}
// HandleDockerHostOnline clears offline tracking and alerts for a Docker host.
func (m *Manager) HandleDockerHostOnline(host models.DockerHost) {
if host.ID == "" {
return
}
alertID := fmt.Sprintf("docker-host-offline-%s", host.ID)
m.mu.Lock()
delete(m.dockerOfflineCount, host.ID)
_, exists := m.activeAlerts[alertID]
m.mu.Unlock()
if exists {
m.clearAlert(alertID)
}
}
// HandleDockerHostRemoved clears all alerts and tracking when a Docker host is deleted.
func (m *Manager) HandleDockerHostRemoved(host models.DockerHost) {
if host.ID == "" {
return
}
// Reuse the online handler to clear offline alerts and tracking.
m.HandleDockerHostOnline(host)
// Drop any container alerts and host-scoped tracking entries.
m.clearDockerHostContainerAlerts(host)
}
// HandleDockerHostOffline raises an alert when a Docker host stops reporting.
func (m *Manager) HandleDockerHostOffline(host models.DockerHost) {
if host.ID == "" {
return
}
m.mu.RLock()
if !m.config.Enabled {
m.mu.RUnlock()
return
}
disableDockerHostsOffline := m.config.DisableAllDockerHostsOffline
m.mu.RUnlock()
alertID := fmt.Sprintf("docker-host-offline-%s", host.ID)
resourceID := fmt.Sprintf("docker:%s", strings.TrimSpace(host.ID))
instanceName := dockerInstanceName(host)
nodeName := strings.TrimSpace(host.Hostname)
if disableDockerHostsOffline {
m.mu.Lock()
delete(m.dockerOfflineCount, host.ID)
m.mu.Unlock()
m.clearAlert(alertID)
return
}
var disableConnectivity bool
m.mu.RLock()
if override, exists := m.config.Overrides[host.ID]; exists {
disableConnectivity = override.DisableConnectivity
}
m.mu.RUnlock()
if disableConnectivity {
m.clearAlert(alertID)
m.mu.Lock()
delete(m.dockerOfflineCount, host.ID)
m.mu.Unlock()
return
}
m.mu.Lock()
if alert, exists := m.activeAlerts[alertID]; exists && alert != nil {
alert.LastSeen = time.Now()
m.activeAlerts[alertID] = alert
m.mu.Unlock()
return
}
m.dockerOfflineCount[host.ID]++
confirmations := m.dockerOfflineCount[host.ID]
const requiredConfirmations = 3
if confirmations < requiredConfirmations {
m.mu.Unlock()
log.Debug().
Str("dockerHost", host.DisplayName).
Str("hostID", host.ID).
Int("confirmations", confirmations).
Int("required", requiredConfirmations).
Msg("Docker host appears offline, awaiting confirmation")
return
}
alert := &Alert{
ID: alertID,
Type: "docker-host-offline",
Level: AlertLevelCritical,
ResourceID: resourceID,
ResourceName: host.DisplayName,
Node: nodeName,
Instance: instanceName,
Message: fmt.Sprintf("Docker host '%s' is offline", host.DisplayName),
Value: 0,
Threshold: 0,
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"resourceType": "DockerHost",
"hostId": host.ID,
"hostname": host.Hostname,
"agentId": host.AgentID,
"displayName": host.DisplayName,
},
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
// Trigger AI analysis callback even when notifications are rate limited
if m.onAlertForAI != nil {
alertCopy := alert.Clone()
go func(a *Alert) {
defer func() {
if r := recover(); r != nil {
log.Error().Interface("panic", r).Str("alertID", a.ID).Msg("Panic in AI alert callback")
}
}()
m.onAlertForAI(a)
}(alertCopy)
}
if !m.checkRateLimit(alertID) {
m.mu.Unlock()
log.Debug().
Str("alertID", alertID).
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
Msg("Docker host offline alert suppressed due to rate limit")
return
}
m.dispatchAlert(alert, false)
m.mu.Unlock()
log.Error().
Str("dockerHost", host.DisplayName).
Str("hostID", host.ID).
Str("hostname", host.Hostname).
Msg("CRITICAL: Docker host is offline")
m.clearDockerHostContainerAlerts(host)
}
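// checkDockerContainerState raises an alert when a container is not running, after the required confirmations.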
func (m *Manager) checkDockerContainerState(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
alertID := fmt.Sprintf("docker-container-state-%s", resourceID)
stateKey := resourceID
m.mu.RLock()
override, hasOverride := m.config.Overrides[resourceID]
defaultDisable := m.config.DockerDefaults.StateDisableConnectivity
defaultSeverity := normalizePoweredOffSeverity(m.config.DockerDefaults.StatePoweredOffSeverity)
m.mu.RUnlock()
disableConnectivity := defaultDisable
severity := defaultSeverity
if hasOverride {
if defaultDisable && !override.DisableConnectivity {
disableConnectivity = false
} else if override.DisableConnectivity {
disableConnectivity = true
}
if override.PoweredOffSeverity != "" {
severity = normalizePoweredOffSeverity(override.PoweredOffSeverity)
}
}
if disableConnectivity {
m.clearDockerContainerStateAlert(resourceID)
return
}
m.mu.Lock()
if alert, exists := m.activeAlerts[alertID]; exists && alert != nil {
alert.LastSeen = time.Now()
alert.Level = severity
if alert.Metadata == nil {
alert.Metadata = make(map[string]interface{})
}
alert.Metadata["state"] = container.State
alert.Metadata["status"] = container.Status
m.activeAlerts[alertID] = alert
m.mu.Unlock()
return
}
m.dockerStateConfirm[stateKey]++
confirmations := m.dockerStateConfirm[stateKey]
const requiredConfirmations = 2
if confirmations < requiredConfirmations {
m.mu.Unlock()
log.Debug().
Str("container", containerName).
Str("host", host.DisplayName).
Str("state", container.State).
Int("confirmations", confirmations).
Int("required", requiredConfirmations).
Msg("Docker container state change detected, awaiting confirmation")
return
}
message := fmt.Sprintf("Docker container '%s' is %s", containerName, strings.TrimSpace(container.Status))
alert := &Alert{
ID: alertID,
Type: "docker-container-state",
Level: severity,
ResourceID: resourceID,
ResourceName: containerName,
Node: nodeName,
Instance: instanceName,
Message: message,
Value: 0,
Threshold: 0,
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"resourceType": "Docker Container",
"hostId": host.ID,
"hostName": host.DisplayName,
"hostHostname": host.Hostname,
"containerId": container.ID,
"containerName": containerName,
"image": container.Image,
"state": container.State,
"status": container.Status,
},
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
m.dispatchAlert(alert, true)
m.mu.Unlock()
log.Warn().
Str("container", containerName).
Str("host", host.DisplayName).
Str("state", container.State).
Msg("Docker container state alert raised")
}
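// clearDockerContainerStateAlert clears the state alert and confirmation tracking for a container.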
func (m *Manager) clearDockerContainerStateAlert(resourceID string) {
alertID := fmt.Sprintf("docker-container-state-%s", resourceID)
m.mu.Lock()
delete(m.dockerStateConfirm, resourceID)
m.mu.Unlock()
m.clearAlert(alertID)
}
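// checkDockerContainerHealth raises an alert when a container's health check reports a degraded state.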
func (m *Manager) checkDockerContainerHealth(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
health := strings.ToLower(strings.TrimSpace(container.Health))
if health == "" || health == "none" || health == "healthy" || health == "starting" {
m.clearDockerContainerHealthAlert(resourceID)
return
}
level := AlertLevelWarning
if health == "unhealthy" {
level = AlertLevelCritical
}
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
alert := &Alert{
ID: alertID,
Type: "docker-container-health",
Level: level,
ResourceID: resourceID,
ResourceName: containerName,
Node: nodeName,
Instance: instanceName,
Message: fmt.Sprintf("Docker container '%s' health is %s", containerName, container.Health),
Value: 0,
Threshold: 0,
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"resourceType": "Docker Container",
"hostId": host.ID,
"hostName": host.DisplayName,
"hostHostname": host.Hostname,
"containerId": container.ID,
"containerName": containerName,
"image": container.Image,
"state": container.State,
"status": container.Status,
"health": container.Health,
},
}
m.mu.Lock()
if existing, exists := m.activeAlerts[alertID]; exists && existing != nil {
alert.StartTime = existing.StartTime
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
m.dispatchAlert(alert, false)
m.mu.Unlock()
log.Warn().
Str("container", containerName).
Str("host", host.DisplayName).
Str("health", container.Health).
Msg("Docker container health alert raised")
}
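// clearDockerContainerHealthAlert clears the health alert for a container.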
func (m *Manager) clearDockerContainerHealthAlert(resourceID string) {
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
m.clearAlert(alertID)
}
// checkDockerContainerRestartLoop detects containers stuck in a restart loop
func (m *Manager) checkDockerContainerRestartLoop(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
now := time.Now()
// Get config values with defaults; read under RLock to avoid racing config updates
m.mu.RLock()
restartThreshold := m.config.DockerDefaults.RestartCount
timeWindow := m.config.DockerDefaults.RestartWindow
m.mu.RUnlock()
if restartThreshold == 0 {
restartThreshold = 3 // Default: 3 restarts
}
if timeWindow == 0 {
timeWindow = 300 // Default: 5 minutes (300 seconds)
}
m.mu.Lock()
record, exists := m.dockerRestartTracking[resourceID]
if !exists {
record = &dockerRestartRecord{
count: container.RestartCount,
lastCount: container.RestartCount,
times: []time.Time{},
lastChecked: now,
}
m.dockerRestartTracking[resourceID] = record
m.mu.Unlock()
return
}
// If restart count increased, track it
if container.RestartCount > record.lastCount {
newRestarts := container.RestartCount - record.lastCount
for i := 0; i < newRestarts; i++ {
record.times = append(record.times, now)
}
record.lastCount = container.RestartCount
}
// Clean up old restart times outside the window
cutoff := now.Add(-time.Duration(timeWindow) * time.Second)
var recentRestarts []time.Time
for _, t := range record.times {
if t.After(cutoff) {
recentRestarts = append(recentRestarts, t)
}
}
record.times = recentRestarts
record.lastChecked = now
recentCount := len(record.times)
m.mu.Unlock()
// Check if we have a restart loop
if recentCount > restartThreshold {
level := AlertLevelCritical
alert := &Alert{
ID: alertID,
Type: "docker-container-restart-loop",
Level: level,
ResourceID: resourceID,
ResourceName: containerName,
Node: nodeName,
Instance: instanceName,
Message: fmt.Sprintf("Docker container '%s' has restarted %d times in the last %d minutes (restart loop detected)", containerName, recentCount, timeWindow/60),
StartTime: now,
LastSeen: now,
Metadata: map[string]interface{}{
"hostId": host.ID,
"hostName": host.DisplayName,
"containerId": container.ID,
"containerName": containerName,
"image": container.Image,
"state": container.State,
"status": container.Status,
"restartCount": container.RestartCount,
"recentRestarts": recentCount,
},
}
m.mu.Lock()
if existing, exists := m.activeAlerts[alertID]; exists && existing != nil {
alert.StartTime = existing.StartTime
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
m.dispatchAlert(alert, false)
m.mu.Unlock()
log.Warn().
Str("container", containerName).
Str("host", host.DisplayName).
Int("restarts", recentCount).
Msg("Docker container restart loop detected")
} else {
// Clear alert if restart loop has stopped
m.clearAlert(alertID)
}
}
// checkDockerContainerOOMKill detects when a container was killed due to out of memory
func (m *Manager) checkDockerContainerOOMKill(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
// Exit code 137 means the container was killed by SIGKILL, often due to OOM
// Only alert if the container exited (not running) with exit code 137
state := strings.ToLower(strings.TrimSpace(container.State))
if (state == "exited" || state == "dead") && container.ExitCode == 137 {
m.mu.Lock()
lastExitCode, tracked := m.dockerLastExitCode[resourceID]
// Only alert if this is a new OOM kill (exit code changed to 137)
if !tracked || lastExitCode != 137 {
m.dockerLastExitCode[resourceID] = 137
m.mu.Unlock()
level := AlertLevelCritical
alert := &Alert{
ID: alertID,
Type: "docker-container-oom-kill",
Level: level,
ResourceID: resourceID,
ResourceName: containerName,
Node: nodeName,
Instance: instanceName,
Message: fmt.Sprintf("Docker container '%s' was killed due to out of memory (OOM)", containerName),
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"hostId": host.ID,
"hostName": host.DisplayName,
"containerId": container.ID,
"containerName": containerName,
"image": container.Image,
"state": container.State,
"status": container.Status,
"exitCode": container.ExitCode,
"memoryUsageBytes": container.MemoryUsage,
"memoryLimitBytes": container.MemoryLimit,
},
}
m.mu.Lock()
if existing, exists := m.activeAlerts[alertID]; exists && existing != nil {
alert.StartTime = existing.StartTime
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
m.dispatchAlert(alert, false)
m.mu.Unlock()
log.Error().
Str("container", containerName).
Str("host", host.DisplayName).
Int64("memoryUsage", container.MemoryUsage).
Int64("memoryLimit", container.MemoryLimit).
Msg("Docker container OOM killed")
} else {
m.mu.Unlock()
}
} else {
// Update last exit code if it changed
if container.ExitCode != 0 {
m.mu.Lock()
m.dockerLastExitCode[resourceID] = container.ExitCode
m.mu.Unlock()
}
// Clear OOM alert if container is running or exited with different code
m.clearAlert(alertID)
}
}
// checkDockerContainerMemoryLimit alerts when container approaches its memory limit
func (m *Manager) checkDockerContainerMemoryLimit(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
// Only check if container is running and has a memory limit
state := strings.ToLower(strings.TrimSpace(container.State))
if state != "running" || container.MemoryLimit <= 0 {
return
}
alertID := fmt.Sprintf("docker-container-memory-limit-%s", resourceID)
// Get config values with defaults; read under RLock to avoid racing config updates
m.mu.RLock()
warnThreshold := float64(m.config.DockerDefaults.MemoryWarnPct)
criticalThreshold := float64(m.config.DockerDefaults.MemoryCriticalPct)
m.mu.RUnlock()
if warnThreshold == 0 {
warnThreshold = 90.0 // Default: 90%
}
if criticalThreshold == 0 {
criticalThreshold = 95.0 // Default: 95%
}
// Calculate percentage of limit used
limitPercent := (float64(container.MemoryUsage) / float64(container.MemoryLimit)) * 100
if limitPercent >= warnThreshold {
level := AlertLevelWarning
if limitPercent >= criticalThreshold {
level = AlertLevelCritical
}
alert := &Alert{
ID: alertID,
Type: "docker-container-memory-limit",
Level: level,
ResourceID: resourceID,
ResourceName: containerName,
Node: nodeName,
Instance: instanceName,
Message: fmt.Sprintf("Docker container '%s' is using %.1f%% of its memory limit (%d MB / %d MB)", containerName, limitPercent, container.MemoryUsage/(1024*1024), container.MemoryLimit/(1024*1024)),
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"hostId": host.ID,
"hostName": host.DisplayName,
"containerId": container.ID,
"containerName": containerName,
"image": container.Image,
"memoryUsageBytes": container.MemoryUsage,
"memoryLimitBytes": container.MemoryLimit,
"limitPercent": limitPercent,
},
}
m.mu.Lock()
if existing, exists := m.activeAlerts[alertID]; exists && existing != nil {
alert.StartTime = existing.StartTime
existing.LastSeen = time.Now()
existing.Level = level
existing.Message = alert.Message
existing.Metadata = alert.Metadata
m.mu.Unlock()
return
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
m.dispatchAlert(alert, false)
m.mu.Unlock()
log.Warn().
Str("container", containerName).
Str("host", host.DisplayName).
Float64("limitPercent", limitPercent).
Msg("Docker container approaching memory limit")
} else {
// Clear alert if below warning threshold minus 5% (hysteresis)
clearThreshold := warnThreshold - 5
if limitPercent < clearThreshold {
m.clearAlert(alertID)
}
}
}
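// clearDockerContainerMetricAlerts clears the given metric alerts (defaulting to cpu, memory, and disk) for a container.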
func (m *Manager) clearDockerContainerMetricAlerts(resourceID string, metrics ...string) {
if len(metrics) == 0 {
metrics = []string{"cpu", "memory", "disk"}
}
for _, metric := range metrics {
alertID := fmt.Sprintf("%s-%s", resourceID, metric)
m.clearAlert(alertID)
}
}
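// clearDockerContainerUpdateTracking drops update first-seen tracking for a container by resource ID and identity key.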
func (m *Manager) clearDockerContainerUpdateTracking(resourceID, trackingKey string) {
m.mu.Lock()
delete(m.dockerUpdateFirstSeen, resourceID)
if trackingKey != "" {
delete(m.dockerUpdateFirstSeenByIdentity, trackingKey)
}
m.mu.Unlock()
}
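// dockerUpdateTrackingKeyFromAlert reconstructs the update-tracking key from an alert's metadata,
// returning an empty string when no identity can be derived.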
func dockerUpdateTrackingKeyFromAlert(alert *Alert) string {
if alert == nil || alert.Metadata == nil {
return ""
}
// metaString formats a metadata value, treating missing or nil entries as empty
// strings instead of fmt.Sprint's "<nil>".
metaString := func(key string) string {
value, ok := alert.Metadata[key]
if !ok || value == nil {
return ""
}
return strings.TrimSpace(fmt.Sprint(value))
}
host := models.DockerHost{
ID: metaString("hostId"),
DisplayName: metaString("hostName"),
Hostname: metaString("hostHostname"),
}
container := models.DockerContainer{
ID: metaString("containerId"),
Name: metaString("containerName"),
Image: metaString("image"),
}
if host.ID == "" && host.DisplayName == "" && host.Hostname == "" &&
container.ID == "" && container.Name == "" && container.Image == "" {
return ""
}
return dockerUpdateTrackingKey(host, container)
}
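// clearDockerContainerUpdateStateLocked drops update tracking for an alert; callers must hold m.mu.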
func (m *Manager) clearDockerContainerUpdateStateLocked(alert *Alert) {
if alert == nil {
return
}
if alert.ResourceID != "" {
delete(m.dockerUpdateFirstSeen, alert.ResourceID)
}
if trackingKey := dockerUpdateTrackingKeyFromAlert(alert); trackingKey != "" {
delete(m.dockerUpdateFirstSeenByIdentity, trackingKey)
}
}
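// clearDockerContainerUpdateAlertsLocked clears every pending image-update alert; callers must hold m.mu.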
func (m *Manager) clearDockerContainerUpdateAlertsLocked() {
toClear := make([]string, 0)
for alertID, alert := range m.activeAlerts {
if alert == nil {
continue
}
if alert.Type != "docker-container-update" && !strings.HasPrefix(alertID, "docker-container-update-") {
continue
}
m.clearDockerContainerUpdateStateLocked(alert)
toClear = append(toClear, alertID)
}
for _, alertID := range toClear {
m.clearAlertNoLock(alertID)
}
}
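// shouldResolveDockerContainerUpdateAlertLocked reports whether an image-update alert should be resolved
// because update alerts are disabled or the container is ignored; callers must hold m.mu.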
func (m *Manager) shouldResolveDockerContainerUpdateAlertLocked(alert *Alert) bool {
if alert == nil {
return false
}
if m.config.DisableAllDockerContainers || m.config.DockerDefaults.UpdateAlertDelayHours < 0 {
m.clearDockerContainerUpdateStateLocked(alert)
return true
}
if override, exists := m.config.Overrides[alert.ResourceID]; exists && override.Disabled {
m.clearDockerContainerUpdateStateLocked(alert)
return true
}
containerName := strings.TrimSpace(alert.ResourceName)
containerID := ""
if alert.Metadata != nil {
if value, ok := alert.Metadata["containerName"].(string); ok && containerName == "" {
containerName = value
}
if value, ok := alert.Metadata["containerId"].(string); ok {
containerID = value
}
}
if matchesDockerIgnoredPrefix(containerName, containerID, m.config.DockerIgnoredContainerPrefixes) {
m.clearDockerContainerUpdateStateLocked(alert)
return true
}
return false
}
// checkDockerContainerImageUpdate checks if an image update has been pending for too long
func (m *Manager) checkDockerContainerImageUpdate(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
alertID := fmt.Sprintf("docker-container-update-%s", resourceID)
updateTrackingKey := dockerUpdateTrackingKey(host, container)
// Check if update detection is enabled
m.mu.RLock()
delayHours := m.config.DockerDefaults.UpdateAlertDelayHours
m.mu.RUnlock()
// Negative value means disabled
if delayHours < 0 {
m.clearAlert(alertID)
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
return
}
// Check if this container has an update status reported
if container.UpdateStatus == nil {
// No update status - clear any tracking and alerts
m.clearAlert(alertID)
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
return
}
// Check for errors in update detection (don't alert on errors)
if container.UpdateStatus.Error != "" {
// Update check failed - clear alert but keep tracking
m.clearAlert(alertID)
return
}
// Check if an update is available
if !container.UpdateStatus.UpdateAvailable {
// No update available - clear tracking and alert
m.clearAlert(alertID)
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
return
}
// Update is available - track when we first saw it
m.mu.Lock()
firstSeen, exists := m.dockerUpdateFirstSeenByIdentity[updateTrackingKey]
if !exists {
firstSeen, exists = m.dockerUpdateFirstSeen[resourceID]
}
if !exists {
firstSeen = time.Now()
}
m.dockerUpdateFirstSeen[resourceID] = firstSeen
m.dockerUpdateFirstSeenByIdentity[updateTrackingKey] = firstSeen
m.mu.Unlock()
// Check if we've exceeded the delay threshold
pendingDuration := time.Since(firstSeen)
threshold := time.Duration(delayHours) * time.Hour
if pendingDuration < threshold {
// Not yet time to alert
log.Debug().
Str("container", containerName).
Str("host", host.DisplayName).
Str("image", container.Image).
Dur("pending", pendingDuration).
Dur("threshold", threshold).
Msg("Container update pending but below alert threshold")
return
}
// Create or update the alert
pendingHours := int(pendingDuration.Hours())
message := fmt.Sprintf("Docker container '%s' has an image update available for %d hours", containerName, pendingHours)
alert := &Alert{
ID: alertID,
Type: "docker-container-update",
Level: AlertLevelWarning,
ResourceID: resourceID,
ResourceName: containerName,
Node: nodeName,
Instance: instanceName,
Message: message,
StartTime: firstSeen,
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"resourceType": "Docker Container",
"hostId": host.ID,
"hostName": host.DisplayName,
"hostHostname": host.Hostname,
"containerId": container.ID,
"containerName": containerName,
"image": container.Image,
"currentDigest": container.UpdateStatus.CurrentDigest,
"latestDigest": container.UpdateStatus.LatestDigest,
"lastChecked": container.UpdateStatus.LastChecked,
"firstSeen": firstSeen,
"pendingHours": pendingHours,
"thresholdHours": delayHours,
},
}
m.mu.Lock()
if existing, ok := m.activeAlerts[alertID]; ok && existing != nil {
// Update existing alert
existing.LastSeen = time.Now()
existing.Message = message
existing.Metadata = alert.Metadata
m.mu.Unlock()
return
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
m.dispatchAlert(alert, false)
m.mu.Unlock()
log.Warn().
Str("container", containerName).
Str("host", host.DisplayName).
Str("image", container.Image).
Int("pendingHours", pendingHours).
Msg("Docker container has pending image update")
}
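// cleanupDockerContainerAlerts clears alerts for containers and services that no longer exist on the host.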
func (m *Manager) cleanupDockerContainerAlerts(host models.DockerHost, seen map[string]struct{}) {
m.cleanupDockerContainerAlertsWithTracking(host, seen, nil)
}
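// cleanupDockerContainerAlertsWithTracking clears alerts and update tracking for containers
// that were not seen in the latest host report.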
func (m *Manager) cleanupDockerContainerAlertsWithTracking(host models.DockerHost, seen map[string]struct{}, seenUpdateTracking map[string]struct{}) {
prefix := fmt.Sprintf("docker:%s/", strings.TrimSpace(host.ID))
updateTrackingPrefix := dockerUpdateTrackingHostPrefix(host)
m.mu.Lock()
toClear := make([]string, 0)
for alertID, alert := range m.activeAlerts {
if alert == nil || !strings.HasPrefix(alert.ResourceID, prefix) {
continue
}
if _, exists := seen[alert.ResourceID]; exists {
continue
}
toClear = append(toClear, alertID)
}
for resourceID := range m.dockerStateConfirm {
if strings.HasPrefix(resourceID, prefix) {
if _, exists := seen[resourceID]; !exists {
delete(m.dockerStateConfirm, resourceID)
}
}
}
// Cleanup update tracking for removed containers
for resourceID := range m.dockerUpdateFirstSeen {
if strings.HasPrefix(resourceID, prefix) {
if _, exists := seen[resourceID]; !exists {
delete(m.dockerUpdateFirstSeen, resourceID)
}
}
}
if seenUpdateTracking != nil {
for trackingKey := range m.dockerUpdateFirstSeenByIdentity {
if !strings.HasPrefix(trackingKey, updateTrackingPrefix) {
continue
}
if _, exists := seenUpdateTracking[trackingKey]; !exists {
delete(m.dockerUpdateFirstSeenByIdentity, trackingKey)
}
}
}
m.mu.Unlock()
for _, alertID := range toClear {
m.clearAlert(alertID)
}
}
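// clearDockerHostContainerAlerts clears all container and service alerts plus tracking state scoped to the given Docker host.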
func (m *Manager) clearDockerHostContainerAlerts(host models.DockerHost) {
prefix := fmt.Sprintf("docker:%s/", strings.TrimSpace(host.ID))
updateTrackingPrefix := dockerUpdateTrackingHostPrefix(host)
m.mu.Lock()
toClear := make([]string, 0)
for alertID, alert := range m.activeAlerts {
if alert != nil && strings.HasPrefix(alert.ResourceID, prefix) {
toClear = append(toClear, alertID)
}
}
for resourceID := range m.dockerStateConfirm {
if strings.HasPrefix(resourceID, prefix) {
delete(m.dockerStateConfirm, resourceID)
}
}
for resourceID := range m.dockerRestartTracking {
if strings.HasPrefix(resourceID, prefix) {
delete(m.dockerRestartTracking, resourceID)
}
}
for resourceID := range m.dockerLastExitCode {
if strings.HasPrefix(resourceID, prefix) {
delete(m.dockerLastExitCode, resourceID)
}
}
for resourceID := range m.dockerUpdateFirstSeen {
if strings.HasPrefix(resourceID, prefix) {
delete(m.dockerUpdateFirstSeen, resourceID)
}
}
for trackingKey := range m.dockerUpdateFirstSeenByIdentity {
if strings.HasPrefix(trackingKey, updateTrackingPrefix) {
delete(m.dockerUpdateFirstSeenByIdentity, trackingKey)
}
}
m.mu.Unlock()
for _, alertID := range toClear {
m.clearAlert(alertID)
}
}
// CheckStorage checks storage against thresholds
func (m *Manager) CheckStorage(storage models.Storage) {
m.mu.RLock()
if !m.config.Enabled {
m.mu.RUnlock()
return
}
if m.config.DisableAllStorage {
m.mu.RUnlock()
// Clear any existing storage alerts when all storage alerts are disabled
m.mu.Lock()
usageAlertID := fmt.Sprintf("%s-usage", storage.ID)
if _, exists := m.activeAlerts[usageAlertID]; exists {
m.clearAlertNoLock(usageAlertID)
log.Info().
Str("alertID", usageAlertID).
Str("storage", storage.Name).
Msg("Cleared usage alert - all storage alerts disabled")
}
offlineAlertID := fmt.Sprintf("storage-offline-%s", storage.ID)
if _, exists := m.activeAlerts[offlineAlertID]; exists {
m.clearAlertNoLock(offlineAlertID)
log.Info().
Str("alertID", offlineAlertID).
Str("storage", storage.Name).
Msg("Cleared offline alert - all storage alerts disabled")
}
m.mu.Unlock()
return
}
// Check if there's an override for this storage device. Shared storage used
// to be keyed per reporting node before #1049 switched it to a stable
// cluster-wide ID, so we still honor legacy per-node override keys.
override, hasOverride, _ := findStorageOverride(m.config.Overrides, storage)
threshold := m.config.StorageDefault
// Apply override if it exists for usage threshold
if hasOverride && override.Usage != nil {
threshold = *override.Usage
}
m.mu.RUnlock()
// Check if storage is truly offline/unavailable (not just inactive from other nodes)
// Note: In a cluster, local storage from other nodes shows as inactive which is normal
if storage.Status == "offline" || storage.Status == "unavailable" {
m.checkStorageOffline(storage)
} else {
// Clear any existing offline alert if storage is back online
m.clearStorageOfflineAlert(storage)
}
// If alerts are disabled for this storage device, clear any existing alerts and return
if hasOverride && override.Disabled {
m.mu.Lock()
// Clear usage alert
usageAlertID := fmt.Sprintf("%s-usage", storage.ID)
if _, exists := m.activeAlerts[usageAlertID]; exists {
m.clearAlertNoLock(usageAlertID)
log.Info().
Str("alertID", usageAlertID).
Str("storage", storage.Name).
Msg("Cleared usage alert - storage has alerts disabled")
}
// Clear offline alert
offlineAlertID := fmt.Sprintf("storage-offline-%s", storage.ID)
if _, exists := m.activeAlerts[offlineAlertID]; exists {
m.clearAlertNoLock(offlineAlertID)
log.Info().
Str("alertID", offlineAlertID).
Str("storage", storage.Name).
Msg("Cleared offline alert - storage has alerts disabled")
}
m.mu.Unlock()
return
}
// Check usage if storage has valid data (even if not currently active on this node)
// In clusters, storage may show as inactive on nodes where it's not currently mounted
// but we still want to alert on high usage
log.Debug().
Str("storage", storage.Name).
Str("id", storage.ID).
Float64("usage", storage.Usage).
Str("status", storage.Status).
Float64("trigger", threshold.Trigger).
Float64("clear", threshold.Clear).
Bool("hasOverride", hasOverride).
Msg("Checking storage thresholds")
// Check usage if storage is online - checkMetric will skip if threshold is nil or <= 0
if storage.Status != "offline" && storage.Status != "unavailable" && storage.Usage > 0 {
m.checkMetric(storage.ID, storage.Name, storage.Node, storage.Instance, "Storage", "usage", storage.Usage, &threshold, nil)
}
// Check ZFS pool status if this is ZFS storage
if storage.ZFSPool != nil {
m.checkZFSPoolHealth(storage)
}
}
// BuildGuestKey constructs a unique key for a guest from instance, node, and VMID.
// Uses the canonical format: instance:node:vmid
// This matches the format used by makeGuestID in the monitoring package.
func BuildGuestKey(instance, node string, vmid int) string {
instance = strings.TrimSpace(instance)
node = strings.TrimSpace(node)
if instance == "" {
instance = node
}
return fmt.Sprintf("%s:%s:%d", instance, node, vmid)
}
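// storageOverrideLookupKeys returns the override keys to try for a storage device,
// including legacy per-node keys for shared storage.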
func storageOverrideLookupKeys(storage models.Storage) []string {
keys := make([]string, 0, 1+len(storage.NodeIDs)+len(storage.Nodes))
seen := make(map[string]struct{})
addKey := func(key string) {
key = strings.TrimSpace(key)
if key == "" {
return
}
if _, exists := seen[key]; exists {
return
}
seen[key] = struct{}{}
keys = append(keys, key)
}
addKey(storage.ID)
if !storage.Shared {
return keys
}
name := strings.TrimSpace(storage.Name)
if name == "" {
return keys
}
for _, nodeID := range storage.NodeIDs {
nodeID = strings.TrimSpace(nodeID)
if nodeID == "" {
continue
}
addKey(fmt.Sprintf("%s-%s", nodeID, name))
}
instance := strings.TrimSpace(storage.Instance)
for _, node := range storage.Nodes {
node = strings.TrimSpace(node)
if node == "" || strings.EqualFold(node, "cluster") {
continue
}
prefix := node
if instance != "" && instance != node {
prefix = fmt.Sprintf("%s-%s", instance, node)
}
addKey(fmt.Sprintf("%s-%s", prefix, name))
}
return keys
}
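// findStorageOverride returns the first matching threshold override for a storage device and the key it was found under.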
func findStorageOverride(overrides map[string]ThresholdConfig, storage models.Storage) (ThresholdConfig, bool, string) {
for _, key := range storageOverrideLookupKeys(storage) {
override, exists := overrides[key]
if exists {
return override, true, key
}
}
return ThresholdConfig{}, false, ""
}
// CheckSnapshotsForInstance evaluates guest snapshots for age-based alerts.
func (m *Manager) CheckSnapshotsForInstance(instanceName string, snapshots []models.GuestSnapshot, guestNames map[string]string) {
m.mu.RLock()
enabled := m.config.Enabled
snapshotCfg := m.config.SnapshotDefaults
m.mu.RUnlock()
if !enabled {
return
}
if !snapshotCfg.Enabled {
m.clearSnapshotAlertsForInstance(instanceName)
return
}
now := time.Now()
validAlerts := make(map[string]struct{})
for _, snapshot := range snapshots {
if instanceName != "" && snapshot.Instance != "" && snapshot.Instance != instanceName {
continue
}
if snapshot.Time.IsZero() {
continue
}
ageHours := now.Sub(snapshot.Time).Hours()
if ageHours < 0 {
continue
}
ageDays := ageHours / 24
const gib = 1024.0 * 1024 * 1024
sizeGiB := 0.0
if snapshot.SizeBytes > 0 {
sizeGiB = float64(snapshot.SizeBytes) / gib
}
// Determine thresholds for this snapshot
resourceID := fmt.Sprintf("%s:%s:%d", snapshot.Instance, snapshot.Node, snapshot.VMID)
m.mu.RLock()
gh := m.getGuestThresholds(nil, resourceID)
m.mu.RUnlock()
if gh.Disabled {
continue
}
currentSnapshotCfg := snapshotCfg
if gh.Snapshot != nil {
currentSnapshotCfg = *gh.Snapshot
}
if !currentSnapshotCfg.Enabled {
continue
}
var ageLevel AlertLevel
var ageThreshold int
var sizeLevel AlertLevel
var sizeThreshold float64
var triggeredStats []string
if currentSnapshotCfg.CriticalDays > 0 && ageDays >= float64(currentSnapshotCfg.CriticalDays) {
ageLevel = AlertLevelCritical
ageThreshold = currentSnapshotCfg.CriticalDays
triggeredStats = append(triggeredStats, "age")
} else if currentSnapshotCfg.WarningDays > 0 && ageDays >= float64(currentSnapshotCfg.WarningDays) {
ageLevel = AlertLevelWarning
ageThreshold = currentSnapshotCfg.WarningDays
triggeredStats = append(triggeredStats, "age")
}
if snapshot.SizeBytes > 0 {
if currentSnapshotCfg.CriticalSizeGiB > 0 && sizeGiB >= currentSnapshotCfg.CriticalSizeGiB {
sizeLevel = AlertLevelCritical
sizeThreshold = currentSnapshotCfg.CriticalSizeGiB
triggeredStats = append(triggeredStats, "size")
} else if currentSnapshotCfg.WarningSizeGiB > 0 && sizeGiB >= currentSnapshotCfg.WarningSizeGiB {
sizeLevel = AlertLevelWarning
sizeThreshold = currentSnapshotCfg.WarningSizeGiB
triggeredStats = append(triggeredStats, "size")
}
}
if ageLevel == "" && sizeLevel == "" {
continue
}
var level AlertLevel
switch {
case ageLevel == AlertLevelCritical || sizeLevel == AlertLevelCritical:
level = AlertLevelCritical
case ageLevel == AlertLevelWarning || sizeLevel == AlertLevelWarning:
level = AlertLevelWarning
default:
continue
}
useSizePrimary := false
if sizeLevel == AlertLevelCritical && ageLevel != AlertLevelCritical {
useSizePrimary = true
} else if sizeLevel != "" && ageLevel == "" {
useSizePrimary = true
}
alertID := fmt.Sprintf("snapshot-age-%s", snapshot.ID)
validAlerts[alertID] = struct{}{}
guestKey := BuildGuestKey(snapshot.Instance, snapshot.Node, snapshot.VMID)
guestName := strings.TrimSpace(guestNames[guestKey])
guestType := "VM"
if strings.EqualFold(snapshot.Type, "lxc") {
guestType = "Container"
}
if guestName == "" {
switch guestType {
case "Container":
guestName = fmt.Sprintf("CT %d", snapshot.VMID)
default:
guestName = fmt.Sprintf("VM %d", snapshot.VMID)
}
}
snapshotName := strings.TrimSpace(snapshot.Name)
if snapshotName == "" {
snapshotName = "(unnamed)"
}
ageDaysRounded := math.Round(ageDays*10) / 10
sizeGiBRounded := math.Round(sizeGiB*10) / 10
reasons := make([]string, 0, 2)
if ageLevel != "" {
reasons = append(reasons, fmt.Sprintf("%.1f days old (threshold %d days)", ageDaysRounded, ageThreshold))
}
if sizeLevel != "" {
reasons = append(reasons, fmt.Sprintf("%.1f GiB (threshold %.1f GiB)", sizeGiBRounded, sizeThreshold))
}
reasonText := strings.Join(reasons, " and ")
message := fmt.Sprintf(
"%s snapshot '%s' for %s is %s on %s",
guestType,
snapshotName,
guestName,
reasonText,
snapshot.Node,
)
alertValue := ageDays
alertThreshold := float64(ageThreshold)
thresholdTime := now
if useSizePrimary {
alertValue = sizeGiB
alertThreshold = sizeThreshold
} else if ageThreshold > 0 {
thresholdTime = snapshot.Time.Add(time.Duration(ageThreshold) * 24 * time.Hour)
if thresholdTime.After(now) {
thresholdTime = now
}
}
metadata := map[string]interface{}{
"snapshotName": snapshot.Name,
"snapshotCreatedAt": snapshot.Time,
"snapshotAgeDays": ageDays,
"snapshotAgeHours": ageHours,
"snapshotSizeBytes": snapshot.SizeBytes,
"snapshotSizeGiB": sizeGiB,
"guestName": guestName,
"guestType": guestType,
"guestInstance": snapshot.Instance,
"guestNode": snapshot.Node,
"guestVmid": snapshot.VMID,
"triggeredMetrics": triggeredStats,
"primaryMetric": "age",
}
if useSizePrimary {
metadata["primaryMetric"] = "size"
}
if ageLevel != "" {
metadata["thresholdDays"] = ageThreshold
}
if sizeLevel != "" {
metadata["thresholdSizeGiB"] = sizeThreshold
}
resourceName := fmt.Sprintf("%s snapshot '%s'", guestName, snapshotName)
m.mu.Lock()
if existing, exists := m.activeAlerts[alertID]; exists {
existing.LastSeen = now
existing.Level = level
existing.Value = alertValue
existing.Threshold = alertThreshold
existing.Message = message
existing.ResourceName = resourceName
if existing.Metadata == nil {
existing.Metadata = make(map[string]interface{})
}
for k, v := range metadata {
existing.Metadata[k] = v
}
m.mu.Unlock()
continue
}
alert := &Alert{
ID: alertID,
Type: "snapshot-age",
Level: level,
ResourceID: snapshot.ID,
ResourceName: resourceName,
Node: snapshot.Node,
Instance: snapshot.Instance,
Message: message,
Value: alertValue,
Threshold: alertThreshold,
StartTime: thresholdTime,
LastSeen: now,
Metadata: metadata,
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
go func() {
defer func() {
if r := recover(); r != nil {
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (snapshot)")
}
}()
if err := m.SaveActiveAlerts(); err != nil {
log.Error().Err(err).Msg("Failed to save active alerts after snapshot alert creation")
}
}()
if !m.checkRateLimit(alertID) {
m.mu.Unlock()
log.Debug().
Str("alertID", alertID).
Str("guest", guestName).
Msg("Snapshot alert suppressed due to rate limit")
continue
}
if m.onAlert != nil {
nowCopy := now
alert.LastNotified = &nowCopy
if m.dispatchAlert(alert, true) {
log.Info().
Str("alertID", alertID).
Str("guest", guestName).
Msg("Snapshot age alert dispatched")
} else {
alert.LastNotified = nil
}
} else {
log.Warn().
Str("alertID", alertID).
Msg("Snapshot age alert created but no onAlert callback set")
}
m.mu.Unlock()
}
m.mu.Lock()
for alertID, alert := range m.activeAlerts {
if alert == nil || alert.Type != "snapshot-age" {
continue
}
if instanceName != "" && alert.Instance != instanceName {
continue
}
if _, ok := validAlerts[alertID]; ok {
continue
}
m.clearAlertNoLock(alertID)
}
m.mu.Unlock()
}
// CheckBackups evaluates storage, PBS, and PMG backups for age-based and orphaned-backup alerts.
func (m *Manager) CheckBackups(
storageBackups []models.StorageBackup,
pbsBackups []models.PBSBackup,
pmgBackups []models.PMGBackup,
guestsByKey map[string]GuestLookup,
guestsByVMID map[string][]GuestLookup,
templateInventoryReady map[string]bool,
) {
m.mu.RLock()
enabled := m.config.Enabled
backupCfg := m.config.BackupDefaults
m.mu.RUnlock()
if backupCfg.AlertOrphaned == nil {
alertOrphaned := true
backupCfg.AlertOrphaned = &alertOrphaned
}
if !enabled || !backupCfg.Enabled {
m.clearBackupAlerts()
return
}
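// With no age thresholds configured, there is nothing to evaluate unless
// orphaned-backup alerts are enabled.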
if backupCfg.WarningDays <= 0 && backupCfg.CriticalDays <= 0 {
if backupCfg.AlertOrphaned == nil || !*backupCfg.AlertOrphaned {
m.clearBackupAlerts()
return
}
}
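// backupRecord tracks the most recent backup observed for a guest (or PMG host),
// plus enough context to build alert IDs and messages.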
type backupRecord struct {
key string
vmid string
lookup GuestLookup
fallbackName string
instance string
node string
source string
storage string
datastore string
backupType string
filename string
lastTime time.Time
}
records := make(map[string]*backupRecord)
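// updateRecord keeps only the most recent backup per key.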
updateRecord := func(key string, candidate backupRecord) {
if key == "" {
return
}
if existing, ok := records[key]; ok {
if candidate.lastTime.After(existing.lastTime) {
*existing = candidate
}
return
}
record := candidate
records[key] = &record
}
now := time.Now()
for _, backup := range storageBackups {
if backup.Time.IsZero() {
continue
}
key := BuildGuestKey(backup.Instance, backup.Node, backup.VMID)
vmid := ""
if backup.VMID > 0 {
vmid = strconv.Itoa(backup.VMID)
}
info := guestsByKey[key]
displayName := info.Name
if displayName == "" {
displayName = fmt.Sprintf("%s-%d", sanitizeAlertKey(backup.Node), backup.VMID)
}
updateRecord(key, backupRecord{
key: key,
vmid: vmid,
lookup: info,
fallbackName: displayName,
instance: backup.Instance,
node: backup.Node,
source: "PVE storage",
storage: backup.Storage,
backupType: backup.Type,
lastTime: backup.Time,
})
}
for _, backup := range pbsBackups {
if backup.BackupTime.IsZero() {
continue
}
if backup.VMID == "0" {
// Skip host configuration backups (VMID 0) from age alerts
continue
}
vmid := backup.VMID
guests, exists := guestsByVMID[backup.VMID]
var info GuestLookup
var key string
var displayName string
var instance string
var node string
if exists && len(guests) > 0 {
// If we have exactly one match, use it directly
// If we have multiple matches, try to disambiguate using the PBS namespace
if len(guests) == 1 {
info = guests[0]
} else if backup.Namespace != "" {
// Try to match namespace to instance name
for _, g := range guests {
if namespaceMatchesInstance(backup.Namespace, g.Instance) {
info = g
break
}
}
// If no namespace match found, info stays zero-value.
// The VMID is ambiguous across instances so we must not guess.
}
// else: multiple guests, no namespace — info stays zero-value (ambiguous)
if info.Instance != "" && info.Node != "" {
key = BuildGuestKey(info.Instance, info.Node, info.VMID)
displayName = info.Name
instance = info.Instance
node = info.Node
} else {
key = fmt.Sprintf("pbs:%s:%s:%s", backup.Instance, backup.BackupType, backup.VMID)
displayName = fmt.Sprintf("VMID %s", backup.VMID)
instance = fmt.Sprintf("PBS:%s", backup.Instance)
node = "Unknown"
}
} else {
key = fmt.Sprintf("pbs:%s:%s:%s", backup.Instance, backup.BackupType, backup.VMID)
displayName = fmt.Sprintf("VMID %s", backup.VMID)
instance = fmt.Sprintf("PBS:%s", backup.Instance)
node = "Unknown"
}
updateRecord(key, backupRecord{
key: key,
vmid: vmid,
lookup: info,
fallbackName: displayName,
instance: instance,
node: node,
source: "PBS",
datastore: backup.Datastore,
backupType: backup.BackupType,
lastTime: backup.BackupTime,
})
}
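// PMG backups are host-level configuration backups with no VMID, so key them
// by instance plus node name (falling back to the filename when the node is unknown).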
for _, backup := range pmgBackups {
if backup.BackupTime.IsZero() {
continue
}
instanceLabel := strings.TrimSpace(backup.Instance)
if instanceLabel == "" {
instanceLabel = "PMG"
}
nodeName := strings.TrimSpace(backup.Node)
keyComponent := nodeName
if keyComponent == "" {
keyComponent = strings.TrimSpace(backup.Filename)
}
if keyComponent == "" {
keyComponent = "unknown"
}
displayName := nodeName
if displayName == "" {
displayName = instanceLabel
}
if displayName == "" {
displayName = "PMG gateway"
} else {
displayName = fmt.Sprintf("PMG %s", displayName)
}
instanceField := fmt.Sprintf("PMG:%s", instanceLabel)
key := fmt.Sprintf("pmg:%s:%s", instanceLabel, keyComponent)
updateRecord(key, backupRecord{
key: key,
fallbackName: displayName,
instance: instanceField,
node: nodeName,
source: "PMG",
backupType: "pmg",
filename: backup.Filename,
lastTime: backup.BackupTime,
})
}
if len(records) == 0 {
m.clearBackupAlerts()
return
}
// Build a set of instances whose inventory is safe to use for orphan detection.
// When the monitor provides a template-inventory readiness map, prefer that
// signal because backup polling can race ahead of template discovery even when
// live guests already exist on the instance. Fall back to the legacy "has at
// least one live guest" heuristic for direct callers/tests that do not pass it.
instancesReadyForOrphanDetection := make(map[string]bool)
if templateInventoryReady != nil {
for instance, ready := range templateInventoryReady {
if ready {
instancesReadyForOrphanDetection[instance] = true
}
}
} else {
for _, guests := range guestsByVMID {
for _, g := range guests {
if g.ResourceID != "" && g.Instance != "" {
instancesReadyForOrphanDetection[g.Instance] = true
}
}
}
}
validAlerts := make(map[string]struct{})
for key, record := range records {
age := now.Sub(record.lastTime)
if age < 0 {
continue
}
ageDays := age.Hours() / 24
if ageDays < 0 {
continue
}
ageDaysRounded := math.Round(ageDays*10) / 10
// Determine thresholds for this backup
currentBackupCfg := backupCfg
if record.lookup.ResourceID != "" {
m.mu.RLock()
gh := m.getGuestThresholds(nil, record.lookup.ResourceID)
m.mu.RUnlock()
if gh.Disabled {
continue
}
if gh.Backup != nil {
currentBackupCfg = *gh.Backup
}
}
currentBackupCfg.AlertOrphaned = backupCfg.AlertOrphaned
currentBackupCfg.IgnoreVMIDs = backupCfg.IgnoreVMIDs
if backupIgnoreVMID(record.vmid, currentBackupCfg.IgnoreVMIDs) {
continue
}
// Determine whether we have enough inventory to safely run orphan
// detection for this backup. For PVE storage backups the instance
// guard is strict: only check when that specific PVE instance has
// completed a template-aware inventory poll. For PBS/PMG backups
// (which span instances) it's enough that any instance is ready.
inventoryReady := false
if record.source == "PVE storage" {
inventoryReady = instancesReadyForOrphanDetection[record.instance]
} else {
inventoryReady = len(instancesReadyForOrphanDetection) > 0
}
if record.vmid != "" && record.lookup.ResourceID == "" && inventoryReady {
// Backup has a VMID but no matching live guest in its lookup.
//
// Check whether the VMID exists anywhere in live inventory.
// If it does, the backup is ambiguous (VMID collision) but not orphaned.
// Entries with empty ResourceID are persisted metadata for deleted guests
// and do not count as live inventory.
// For PVE storage backups, only match guests from the same instance —
// a live VMID on instance B does not mean instance A's backup isn't orphaned.
existsInInventory := false
if guests, ok := guestsByVMID[record.vmid]; ok {
for _, g := range guests {
if g.ResourceID == "" {
continue
}
if record.source == "PVE storage" && g.Instance != record.instance {
continue
}
existsInInventory = true
break
}
}
if !existsInInventory {
if g, ok := guestsByKey[record.key]; ok && g.ResourceID != "" {
existsInInventory = true
}
}
if !existsInInventory {
if currentBackupCfg.AlertOrphaned != nil && !*currentBackupCfg.AlertOrphaned {
continue
}
// Create a backup-orphaned alert immediately — no age threshold required.
alertKey := sanitizeAlertKey(key)
alertID := fmt.Sprintf("backup-orphaned-%s", alertKey)
validAlerts[alertID] = struct{}{}
displayName := record.fallbackName
if displayName == "" {
displayName = "Unknown guest"
}
node := record.node
if node == "" {
node = record.lookup.Node
}
instance := record.instance
if instance == "" {
instance = record.lookup.Instance
}
var sourceLabel string
switch record.source {
case "PBS":
sourceLabel = fmt.Sprintf("PBS datastore %s on %s", record.datastore, strings.TrimPrefix(instance, "PBS:"))
case "PMG":
if node != "" {
sourceLabel = fmt.Sprintf("PMG node %s", node)
} else {
sourceLabel = "PMG"
}
default:
sourceLabel = fmt.Sprintf("storage %s on %s", record.storage, node)
}
message := fmt.Sprintf(
"Orphaned backup: %s (VMID %s) via %s — guest no longer exists in inventory",
displayName,
record.vmid,
sourceLabel,
)
metadata := map[string]interface{}{
"source": record.source,
"lastBackupTime": record.lastTime,
"ageDays": ageDays,
"orphaned": true,
"vmid": record.vmid,
}
if record.storage != "" {
metadata["storage"] = record.storage
}
if record.datastore != "" {
metadata["datastore"] = record.datastore
}
if record.backupType != "" {
metadata["backupType"] = record.backupType
}
if record.filename != "" {
metadata["filename"] = record.filename
}
m.mu.Lock()
if existing, exists := m.activeAlerts[alertID]; exists && existing != nil {
existing.LastSeen = now
existing.Level = AlertLevelWarning
existing.Value = ageDays
existing.Threshold = 0
existing.Message = message
if existing.Metadata == nil {
existing.Metadata = make(map[string]interface{})
}
for k, v := range metadata {
existing.Metadata[k] = v
}
m.mu.Unlock()
continue
}
alert := &Alert{
ID: alertID,
Type: "backup-orphaned",
Level: AlertLevelWarning,
ResourceID: alertKey,
ResourceName: fmt.Sprintf("%s backup", displayName),
Node: node,
Instance: instance,
Message: message,
Value: ageDays,
Threshold: 0,
StartTime: now,
LastSeen: now,
Metadata: metadata,
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
go func() {
defer func() {
if r := recover(); r != nil {
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (backup-orphaned)")
}
}()
if err := m.SaveActiveAlerts(); err != nil {
log.Error().Err(err).Msg("Failed to save active alerts after backup-orphaned alert creation")
}
}()
if !m.checkRateLimit(alertID) {
m.mu.Unlock()
log.Debug().
Str("alertID", alertID).
Str("resource", displayName).
Msg("Backup orphaned alert suppressed due to rate limit")
continue
}
if m.onAlert != nil {
notified := now
alert.LastNotified = &notified
if m.dispatchAlert(alert, true) {
log.Info().
Str("alertID", alertID).
Str("resource", displayName).
Msg("Backup orphaned alert dispatched")
} else {
alert.LastNotified = nil
}
}
m.mu.Unlock()
continue
}
}
if !currentBackupCfg.Enabled {
continue
}
var level AlertLevel
var threshold int
switch {
case currentBackupCfg.CriticalDays > 0 && ageDays >= float64(currentBackupCfg.CriticalDays):
level = AlertLevelCritical
threshold = currentBackupCfg.CriticalDays
case currentBackupCfg.WarningDays > 0 && ageDays >= float64(currentBackupCfg.WarningDays):
level = AlertLevelWarning
threshold = currentBackupCfg.WarningDays
default:
continue
}
alertKey := sanitizeAlertKey(key)
alertID := fmt.Sprintf("backup-age-%s", alertKey)
validAlerts[alertID] = struct{}{}
displayName := record.lookup.Name
if displayName == "" {
displayName = record.fallbackName
}
if displayName == "" {
displayName = "Unknown guest"
}
node := record.node
if node == "" {
node = record.lookup.Node
}
instance := record.instance
if instance == "" {
instance = record.lookup.Instance
}
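// Backdate the alert start to when the backup crossed the age threshold, capped at now.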
thresholdTime := record.lastTime.Add(time.Duration(threshold) * 24 * time.Hour)
if thresholdTime.After(now) {
thresholdTime = now
}
var sourceLabel string
switch record.source {
case "PBS":
sourceLabel = fmt.Sprintf("PBS datastore %s on %s", record.datastore, strings.TrimPrefix(instance, "PBS:"))
case "PMG":
if node != "" {
sourceLabel = fmt.Sprintf("PMG node %s", node)
} else {
sourceLabel = "PMG"
}
default:
sourceLabel = fmt.Sprintf("storage %s on %s", record.storage, node)
}
message := fmt.Sprintf(
"%s backup via %s is %.1f days old (threshold: %d days)",
displayName,
sourceLabel,
ageDaysRounded,
threshold,
)
metadata := map[string]interface{}{
"source": record.source,
"lastBackupTime": record.lastTime,
"ageDays": ageDays,
"thresholdDays": threshold,
}
if record.storage != "" {
metadata["storage"] = record.storage
}
if record.datastore != "" {
metadata["datastore"] = record.datastore
}
if record.backupType != "" {
metadata["backupType"] = record.backupType
}
if record.filename != "" {
metadata["filename"] = record.filename
}
m.mu.Lock()
if existing, exists := m.activeAlerts[alertID]; exists {
existing.LastSeen = now
existing.Level = level
existing.Value = ageDays
existing.Threshold = float64(threshold)
existing.Message = message
if existing.Metadata == nil {
existing.Metadata = make(map[string]interface{})
}
for k, v := range metadata {
existing.Metadata[k] = v
}
m.mu.Unlock()
continue
}
alert := &Alert{
ID: alertID,
Type: "backup-age",
Level: level,
ResourceID: alertKey,
ResourceName: fmt.Sprintf("%s backup", displayName),
Node: node,
Instance: instance,
Message: message,
Value: ageDays,
Threshold: float64(threshold),
StartTime: thresholdTime,
LastSeen: now,
Metadata: metadata,
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
go func() {
defer func() {
if r := recover(); r != nil {
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (backup)")
}
}()
if err := m.SaveActiveAlerts(); err != nil {
log.Error().Err(err).Msg("Failed to save active alerts after backup alert creation")
}
}()
if !m.checkRateLimit(alertID) {
m.mu.Unlock()
log.Debug().
Str("alertID", alertID).
Str("resource", displayName).
Msg("Backup alert suppressed due to rate limit")
continue
}
if m.onAlert != nil {
notified := now
alert.LastNotified = &notified
if m.dispatchAlert(alert, true) {
log.Info().
Str("alertID", alertID).
Str("resource", displayName).
Msg("Backup age alert dispatched")
} else {
alert.LastNotified = nil
}
}
m.mu.Unlock()
}
m.mu.Lock()
for alertID, alert := range m.activeAlerts {
if alert == nil || (alert.Type != "backup-age" && alert.Type != "backup-orphaned") {
continue
}
if _, ok := validAlerts[alertID]; ok {
continue
}
// When no instances have inventory ready for orphan detection, preserve existing orphan
// alerts rather than clearing them — we can't confirm they're resolved.
if len(instancesReadyForOrphanDetection) == 0 && alert.Type == "backup-orphaned" {
continue
}
m.clearAlertNoLock(alertID)
}
m.mu.Unlock()
}
// checkZFSPoolHealth checks a ZFS pool for degraded/faulted state, read/write/checksum errors, and per-device issues
func (m *Manager) checkZFSPoolHealth(storage models.Storage) {
pool := storage.ZFSPool
if pool == nil {
return
}
// Check pool state (DEGRADED, FAULTED, etc.)
stateAlertID := fmt.Sprintf("zfs-pool-state-%s", storage.ID)
if pool.State != "ONLINE" {
level := AlertLevelWarning
if pool.State == "FAULTED" || pool.State == "UNAVAIL" {
level = AlertLevelCritical
}
m.mu.Lock()
if _, exists := m.activeAlerts[stateAlertID]; !exists {
alert := &Alert{
ID: stateAlertID,
Type: "zfs-pool-state",
Level: level,
ResourceID: storage.ID,
ResourceName: fmt.Sprintf("%s (%s)", storage.Name, pool.Name),
Node: storage.Node,
Instance: storage.Instance,
Message: fmt.Sprintf("ZFS pool '%s' is %s", pool.Name, pool.State),
Value: 0,
Threshold: 0,
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"pool_name": pool.Name,
"pool_state": pool.State,
},
}
m.preserveAlertState(stateAlertID, alert)
m.activeAlerts[stateAlertID] = alert
m.recentAlerts[stateAlertID] = alert
m.historyManager.AddAlert(*alert)
m.dispatchAlert(alert, false)
log.Warn().
Str("pool", pool.Name).
Str("state", pool.State).
Str("node", storage.Node).
Msg("ZFS pool is not healthy")
}
m.mu.Unlock()
} else {
// Clear state alert if pool is back online
m.clearAlert(stateAlertID)
}
// Check for read/write/checksum errors
totalErrors := pool.ReadErrors + pool.WriteErrors + pool.ChecksumErrors
errorsAlertID := fmt.Sprintf("zfs-pool-errors-%s", storage.ID)
if totalErrors > 0 {
m.mu.Lock()
existingAlert, exists := m.activeAlerts[errorsAlertID]
// Only create new alert or update if error count increased
if !exists || float64(totalErrors) > existingAlert.Value {
alert := &Alert{
ID: errorsAlertID,
Type: "zfs-pool-errors",
Level: AlertLevelWarning,
ResourceID: storage.ID,
ResourceName: fmt.Sprintf("%s (%s)", storage.Name, pool.Name),
Node: storage.Node,
Instance: storage.Instance,
Message: fmt.Sprintf("ZFS pool '%s' has errors: %d read, %d write, %d checksum",
pool.Name, pool.ReadErrors, pool.WriteErrors, pool.ChecksumErrors),
Value: float64(totalErrors),
Threshold: 0,
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"pool_name": pool.Name,
"read_errors": pool.ReadErrors,
"write_errors": pool.WriteErrors,
"checksum_errors": pool.ChecksumErrors,
},
}
if exists {
// Preserve original start time when updating
alert.StartTime = existingAlert.StartTime
}
m.preserveAlertState(errorsAlertID, alert)
m.activeAlerts[errorsAlertID] = alert
m.recentAlerts[errorsAlertID] = alert
m.historyManager.AddAlert(*alert)
m.dispatchAlert(alert, false)
log.Error().
Str("pool", pool.Name).
Int64("read_errors", pool.ReadErrors).
Int64("write_errors", pool.WriteErrors).
Int64("checksum_errors", pool.ChecksumErrors).
Str("node", storage.Node).
Msg("ZFS pool has I/O errors")
}
m.mu.Unlock()
} else {
m.clearAlert(errorsAlertID)
}
// Check individual devices for errors
m.mu.Lock()
for _, device := range pool.Devices {
alertID := fmt.Sprintf("zfs-device-%s-%s", storage.ID, device.Name)
// Skip SPARE devices unless they have actual errors
if (device.State != "ONLINE" && device.State != "SPARE") || device.ReadErrors > 0 || device.WriteErrors > 0 || device.ChecksumErrors > 0 {
if _, exists := m.activeAlerts[alertID]; !exists {
level := AlertLevelWarning
if device.State == "FAULTED" || device.State == "UNAVAIL" {
level = AlertLevelCritical
}
message := fmt.Sprintf("ZFS device '%s' in pool '%s'", device.Name, pool.Name)
if device.State != "ONLINE" {
message += fmt.Sprintf(" is %s", device.State)
}
if device.ReadErrors > 0 || device.WriteErrors > 0 || device.ChecksumErrors > 0 {
message += fmt.Sprintf(" has errors: %d read, %d write, %d checksum",
device.ReadErrors, device.WriteErrors, device.ChecksumErrors)
}
alert := &Alert{
ID: alertID,
Type: "zfs-device",
Level: level,
ResourceID: storage.ID,
ResourceName: fmt.Sprintf("%s (%s/%s)", storage.Name, pool.Name, device.Name),
Node: storage.Node,
Instance: storage.Instance,
Message: message,
Value: float64(device.ReadErrors + device.WriteErrors + device.ChecksumErrors),
Threshold: 0,
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"pool_name": pool.Name,
"device_name": device.Name,
"device_state": device.State,
"read_errors": device.ReadErrors,
"write_errors": device.WriteErrors,
"checksum_errors": device.ChecksumErrors,
},
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
m.dispatchAlert(alert, false)
log.Warn().
Str("pool", pool.Name).
Str("device", device.Name).
Str("state", device.State).
Int64("errors", device.ReadErrors+device.WriteErrors+device.ChecksumErrors).
Str("node", storage.Node).
Msg("ZFS device has issues")
}
} else {
// Clear device alert if it's back to normal
m.clearAlertNoLock(alertID)
}
}
m.mu.Unlock()
}
// clearAlert removes an alert if it exists
func (m *Manager) clearAlert(alertID string) {
m.mu.Lock()
alert, exists := m.activeAlerts[alertID]
if exists {
m.removeActiveAlertNoLock(alertID)
}
m.mu.Unlock()
if !exists {
return
}
resolvedAlert := &ResolvedAlert{
Alert: alert,
ResolvedTime: time.Now(),
}
m.addRecentlyResolvedUnlocked(alertID, resolvedAlert)
m.safeCallResolvedCallback(alertID, false)
log.Info().
Str("alertID", alertID).
Msg("Alert cleared")
}
// getTimeThreshold determines the delay to apply for a metric/resource combination.
func (m *Manager) getTimeThreshold(_ string, resourceType, metricType string) int {
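// Resolution order: metric-specific delay for this resource type; then, when no
// resource-type base delay exists, the global ("all") metric-specific delay;
// finally the resource-type or global base delay.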
if delay, ok := m.getMetricTimeThreshold(resourceType, metricType); ok {
return delay
}
base, hasTypeSpecific := m.getBaseTimeThreshold(resourceType)
if !hasTypeSpecific {
if delay, ok := m.getGlobalMetricTimeThreshold(metricType); ok {
return delay
}
}
return base
}
// getMetricTimeThreshold returns a metric-specific delay if configured at the resource-type level.
func (m *Manager) getMetricTimeThreshold(resourceType, metricType string) (int, bool) {
if len(m.config.MetricTimeThresholds) == 0 {
return 0, false
}
metricKey := strings.ToLower(strings.TrimSpace(metricType))
if metricKey == "" {
return 0, false
}
for _, typeKey := range canonicalResourceTypeKeys(resourceType) {
perType, ok := m.config.MetricTimeThresholds[typeKey]
if !ok || len(perType) == 0 {
continue
}
if delay, ok := perType[metricKey]; ok {
return delay, true
}
if delay, ok := perType["default"]; ok {
return delay, true
}
if delay, ok := perType["_default"]; ok {
return delay, true
}
if delay, ok := perType["*"]; ok {
return delay, true
}
}
return 0, false
}
// getBaseTimeThreshold returns the resource-type level delay.
func (m *Manager) getBaseTimeThreshold(resourceType string) (int, bool) {
if m.config.TimeThresholds != nil {
for _, key := range canonicalResourceTypeKeys(resourceType) {
if delay, ok := m.config.TimeThresholds[key]; ok {
return delay, true
}
}
if delay, ok := m.config.TimeThresholds["all"]; ok {
return delay, false
}
}
return m.config.TimeThreshold, false
}
func (m *Manager) getGlobalMetricTimeThreshold(metricType string) (int, bool) {
if len(m.config.MetricTimeThresholds) == 0 {
return 0, false
}
perType, ok := m.config.MetricTimeThresholds["all"]
if !ok || len(perType) == 0 {
return 0, false
}
metricKey := strings.ToLower(strings.TrimSpace(metricType))
if metricKey == "" {
return 0, false
}
if delay, ok := perType[metricKey]; ok {
return delay, true
}
if delay, ok := perType["default"]; ok {
return delay, true
}
if delay, ok := perType["_default"]; ok {
return delay, true
}
if delay, ok := perType["*"]; ok {
return delay, true
}
return 0, false
}
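// canonicalResourceTypeKeys maps a raw resource type to the ordered list of
// configuration keys to try; e.g. "docker host" falls back to "docker" and
// then "node", while VM/CT variants all map to "guest".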
func canonicalResourceTypeKeys(resourceType string) []string {
typeKey := strings.ToLower(strings.TrimSpace(resourceType))
addUnique := func(slice []string, value string) []string {
if value == "" {
return slice
}
for _, existing := range slice {
if existing == value {
return slice
}
}
return append(slice, value)
}
var keys []string
switch typeKey {
case "guest", "qemu", "vm", "ct", "container", "lxc":
keys = addUnique(keys, "guest")
case "docker", "docker container", "dockercontainer":
keys = addUnique(keys, "docker")
keys = addUnique(keys, "guest")
case "docker host", "dockerhost":
keys = addUnique(keys, "dockerhost")
keys = addUnique(keys, "docker")
keys = addUnique(keys, "node")
case "node":
keys = addUnique(keys, "node")
case "pbs", "pbs server", "pbsserver":
keys = addUnique(keys, "pbs")
keys = addUnique(keys, "node")
case "storage":
keys = addUnique(keys, "storage")
default:
keys = addUnique(keys, typeKey)
}
return keys
}
// metricOptions carries optional metadata and a custom message for a metric check.
type metricOptions struct {
Metadata map[string]interface{}
Message string
// MonitorOnly suppresses external notifications while still tracking the alert.
MonitorOnly bool
}
// checkMetric checks a single metric against its threshold with hysteresis.
func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resourceType, metricType string, value float64, threshold *HysteresisThreshold, opts *metricOptions) {
if threshold == nil || threshold.Trigger <= 0 {
alertID := fmt.Sprintf("%s-%s", resourceID, metricType)
m.clearAlert(alertID)
return
}
log.Debug().
Str("resource", resourceName).
Str("metric", metricType).
Float64("value", value).
Float64("trigger", threshold.Trigger).
Float64("clear", threshold.Clear).
Bool("exceeds", value >= threshold.Trigger).
Msg("Checking metric threshold")
alertID := fmt.Sprintf("%s-%s", resourceID, metricType)
m.mu.Lock()
defer m.mu.Unlock()
existingAlert, exists := m.activeAlerts[alertID]
monitorOnly := opts != nil && opts.MonitorOnly
// Check for suppression
if suppressUntil, suppressed := m.suppressedUntil[alertID]; suppressed && time.Now().Before(suppressUntil) {
log.Debug().
Str("alertID", alertID).
Time("suppressedUntil", suppressUntil).
Msg("Alert suppressed")
return
}
if value >= threshold.Trigger {
// Threshold exceeded
if !exists {
alertStartTime := time.Now()
// Determine the appropriate time threshold based on resource/metric type
timeThreshold := m.getTimeThreshold(resourceID, resourceType, metricType)
// Check if we have a time threshold configured
if timeThreshold > 0 {
// Check if this threshold was already pending
if pendingTime, isPending := m.pendingAlerts[alertID]; isPending {
// Check if enough time has passed
if time.Since(pendingTime) >= time.Duration(timeThreshold)*time.Second {
// Time threshold met, proceed with alert
delete(m.pendingAlerts, alertID)
if !pendingTime.IsZero() {
alertStartTime = pendingTime
}
log.Debug().
Str("alertID", alertID).
Int("timeThreshold", timeThreshold).
Dur("elapsed", time.Since(pendingTime)).
Msg("Time threshold met, triggering alert")
} else {
// Still waiting for time threshold
log.Debug().
Str("alertID", alertID).
Int("timeThreshold", timeThreshold).
Dur("elapsed", time.Since(pendingTime)).
Msg("Threshold exceeded but waiting for time threshold")
return
}
} else {
// First time exceeding threshold, start tracking
m.pendingAlerts[alertID] = alertStartTime
log.Debug().
Str("alertID", alertID).
Int("timeThreshold", timeThreshold).
Msg("Threshold exceeded, starting time threshold tracking")
return
}
}
// Check for recent similar alert to prevent spam
if recent, hasRecent := m.recentAlerts[alertID]; hasRecent {
// Check minimum delta
if m.config.MinimumDelta > 0 &&
time.Since(recent.StartTime) < time.Duration(m.config.SuppressionWindow)*time.Minute &&
abs(recent.Value-value) < m.config.MinimumDelta {
log.Debug().
Str("alertID", alertID).
Float64("recentValue", recent.Value).
Float64("currentValue", value).
Float64("delta", abs(recent.Value-value)).
Float64("minimumDelta", m.config.MinimumDelta).
Msg("Alert suppressed due to minimum delta")
// Set suppression window
m.suppressedUntil[alertID] = time.Now().Add(time.Duration(m.config.SuppressionWindow) * time.Minute)
return
}
}
// New alert
message := ""
var unit string
if opts != nil && opts.Message != "" {
message = opts.Message
} else {
switch metricType {
case "usage":
message = fmt.Sprintf("%s at %.1f%%", resourceType, value)
case "diskRead", "diskWrite", "networkIn", "networkOut":
message = fmt.Sprintf("%s %s at %.1f MB/s", resourceType, metricType, value)
unit = "MB/s"
case "temperature", "disk_temperature", "diskTemperature":
message = fmt.Sprintf("%s %s at %.1f°C", resourceType, metricType, value)
unit = "°C"
default:
message = fmt.Sprintf("%s %s at %.1f%%", resourceType, metricType, value)
}
}
alertMetadata := map[string]interface{}{
"resourceType": resourceType,
"clearThreshold": threshold.Clear,
}
if unit != "" {
alertMetadata["unit"] = unit
}
if opts != nil && opts.Metadata != nil {
for k, v := range opts.Metadata {
alertMetadata[k] = v
}
}
alertMetadata["monitorOnly"] = monitorOnly
alert := &Alert{
ID: alertID,
Type: metricType,
Level: AlertLevelWarning,
ResourceID: resourceID,
ResourceName: resourceName,
Node: node,
NodeDisplayName: m.resolveNodeDisplayName(instance, node),
Instance: instance,
Message: message,
Value: value,
Threshold: threshold.Trigger,
StartTime: alertStartTime,
LastSeen: time.Now(),
Metadata: alertMetadata,
}
// Set level based on how much over threshold
if value >= threshold.Trigger+10 {
alert.Level = AlertLevelCritical
}
log.Debug().
Str("alertID", alertID).
Time("alertStartTime", alertStartTime).
Time("now", time.Now()).
Dur("initialDuration", time.Since(alertStartTime)).
Msg("Creating new alert with start time")
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
// Save active alerts after adding new one
go func() {
defer func() {
if r := recover(); r != nil {
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine")
}
}()
if err := m.SaveActiveAlerts(); err != nil {
log.Error().Err(err).Msg("Failed to save active alerts after creation")
}
}()
log.Warn().
Str("alertID", alertID).
Str("resource", resourceName).
Str("metric", metricType).
Float64("value", value).
Float64("trigger", threshold.Trigger).
Float64("clear", threshold.Clear).
Int("activeAlerts", len(m.activeAlerts)).
Msg("Alert triggered")
// Trigger AI analysis callback unconditionally (bypasses notification suppression)
if m.onAlertForAI != nil {
alertCopy := alert.Clone()
go func(a *Alert) {
defer func() {
if r := recover(); r != nil {
log.Error().Interface("panic", r).Str("alertID", a.ID).Msg("Panic in AI alert callback")
}
}()
m.onAlertForAI(a)
}(alertCopy)
}
// Check rate limit (but don't remove alert from tracking)
if !m.checkRateLimit(alertID) {
log.Debug().
Str("alertID", alertID).
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
Msg("Alert notification suppressed due to rate limit")
// Don't delete the alert, just suppress notifications
return
}
// Notify callback (may be suppressed by quiet hours)
if m.onAlert != nil {
now := time.Now()
alert.LastNotified = &now
if m.dispatchAlert(alert, true) {
log.Info().Str("alertID", alertID).Msg("Calling onAlert callback")
} else {
alert.LastNotified = nil
}
} else {
log.Warn().Msg("No onAlert callback set!")
}
} else {
// Update existing alert
existingAlert.LastSeen = time.Now()
existingAlert.Value = value
// Keep display name current (handles upgrades and renames).
if dn := m.resolveNodeDisplayName(existingAlert.Instance, existingAlert.Node); dn != "" {
existingAlert.NodeDisplayName = dn
}
if existingAlert.Metadata == nil {
existingAlert.Metadata = map[string]interface{}{}
}
existingAlert.Metadata["resourceType"] = resourceType
existingAlert.Metadata["clearThreshold"] = threshold.Clear
existingAlert.Metadata["monitorOnly"] = monitorOnly
if opts != nil {
if opts.Message != "" {
existingAlert.Message = opts.Message
}
if opts.Metadata != nil {
for k, v := range opts.Metadata {
existingAlert.Metadata[k] = v
}
}
}
// Update level if needed
oldLevel := existingAlert.Level
if value >= threshold.Trigger+10 {
existingAlert.Level = AlertLevelCritical
} else {
existingAlert.Level = AlertLevelWarning
}
// Check if we should re-notify based on cooldown period
// Never re-notify acknowledged alerts (user has already seen it)
shouldRenotify := false
if existingAlert.Acknowledged {
log.Debug().
Str("alertID", alertID).
Msg("Alert is acknowledged, skipping re-notification")
} else if m.shouldNotifyAfterCooldown(existingAlert) {
shouldRenotify = true
log.Debug().
Str("alertID", alertID).
Dur("cooldown", time.Duration(m.config.Schedule.Cooldown)*time.Minute).
Msg("Cooldown period has passed, will re-notify")
} else if oldLevel != existingAlert.Level && existingAlert.Level == AlertLevelCritical {
// Always re-notify if alert escalated to critical
shouldRenotify = true
log.Debug().
Str("alertID", alertID).
Msg("Alert escalated to critical, will re-notify despite cooldown")
}
// Send re-notification if appropriate (may be suppressed by quiet hours)
if shouldRenotify && m.onAlert != nil {
now := time.Now()
existingAlert.LastNotified = &now
// Dispatch asynchronously so callback I/O cannot block alert evaluation.
if m.dispatchAlert(existingAlert, true) {
log.Info().
Str("alertID", alertID).
Str("level", string(existingAlert.Level)).
Msg("Re-notifying for existing alert")
} else {
existingAlert.LastNotified = nil
}
}
}
} else {
// Value is below trigger threshold
// Clear any pending alert for this metric
if _, isPending := m.pendingAlerts[alertID]; isPending {
delete(m.pendingAlerts, alertID)
log.Debug().
Str("alertID", alertID).
Msg("Value dropped below threshold, clearing pending alert")
}
if exists {
// Use hysteresis for resolution - only resolve if below clear threshold
clearThreshold := threshold.Clear
if clearThreshold <= 0 {
clearThreshold = threshold.Trigger // Fallback to trigger if clear not set
}
if value <= clearThreshold {
// Threshold cleared with hysteresis - auto resolve
resolvedAlert := &ResolvedAlert{
Alert: existingAlert,
ResolvedTime: time.Now(),
}
// Remove from active alerts
m.removeActiveAlertNoLock(alertID)
// Save active alerts after resolution
go func() {
defer func() {
if r := recover(); r != nil {
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (resolution)")
}
}()
if err := m.SaveActiveAlerts(); err != nil {
log.Error().Err(err).Msg("Failed to save active alerts after resolution")
}
}()
// Add to recently resolved while preventing lock-order inversions
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
log.Info().
Str("alertID", alertID).
Msg("Added alert to recently resolved")
log.Info().
Str("resource", resourceName).
Str("metric", metricType).
Float64("value", value).
Float64("clearThreshold", clearThreshold).
Bool("wasAcknowledged", existingAlert.Acknowledged).
Msg("Alert resolved with hysteresis")
if m.onResolved != nil {
go m.onResolved(alertID)
}
}
}
}
}
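// sanitizeAlertKey normalizes a label for use in alert IDs: it lowercases the
// input, keeps letters, digits, and dots, collapses every other run of
// characters into a single '-', and trims leading/trailing separators.
// For example, "local-zfs (Node A)" becomes "local-zfs-node-a". A bare "/"
// maps to "root", and an empty result falls back to "disk".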
func sanitizeAlertKey(label string) string {
trimmed := strings.TrimSpace(label)
if trimmed == "" {
return ""
}
if trimmed == "/" {
return "root"
}
trimmed = strings.Trim(trimmed, "/\\ ")
if trimmed == "" {
trimmed = "root"
}
lower := strings.ToLower(trimmed)
var builder strings.Builder
builder.Grow(len(lower))
prevDash := false
for _, r := range lower {
if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') {
builder.WriteRune(r)
prevDash = false
continue
}
if r == '.' {
builder.WriteRune(r)
prevDash = false
continue
}
if !prevDash {
builder.WriteRune('-')
prevDash = true
}
}
sanitized := strings.Trim(builder.String(), "-.")
if sanitized == "" {
sanitized = "disk"
}
return sanitized
}
// abs returns the absolute value of a float64
func abs(x float64) float64 {
if x < 0 {
return -x
}
return x
}
// namespaceMatchesInstance checks if a PBS namespace likely corresponds to a PVE instance.
// This helps disambiguate backups when multiple PVE instances have VMs with the same VMID.
// Examples: namespace "pve1" matches instance "pve1", namespace "nat" matches instance "pve-nat"
func namespaceMatchesInstance(namespace, instance string) bool {
if namespace == "" || instance == "" {
return false
}
// Normalize both strings: lowercase and keep only alphanumeric
normalize := func(s string) string {
var b strings.Builder
for _, r := range strings.ToLower(s) {
if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') {
b.WriteRune(r)
}
}
return b.String()
}
ns := normalize(namespace)
inst := normalize(instance)
if ns == "" || inst == "" {
return false
}
// Exact match after normalization
if ns == inst {
return true
}
// Check if namespace is a suffix of instance
// e.g., namespace "nat" matches instance "pvenat" (normalized from "pve-nat")
// This is more precise than substring matching because:
// - "nat" should match "pve-nat" but not "natpve"
// - "pve" should match "pve" but not "pve-nat" (handled by exact match above)
if strings.HasSuffix(inst, ns) {
return true
}
// Check if instance is a suffix of namespace (reverse case)
// e.g., namespace "pvebackups" could match instance "pve"
if strings.HasSuffix(ns, inst) {
return true
}
return false
}
// AcknowledgeAlert acknowledges an alert
func (m *Manager) AcknowledgeAlert(alertID, user string) error {
m.mu.Lock()
alert, exists := m.activeAlerts[alertID]
if !exists {
m.mu.Unlock()
return fmt.Errorf("alert not found: %s", alertID)
}
alert.Acknowledged = true
now := time.Now()
alert.AckTime = &now
alert.AckUser = user
// Write the modified alert back to the map
m.activeAlerts[alertID] = alert
m.ackState[alertID] = ackRecord{
acknowledged: true,
user: user,
time: now,
}
alertCopy := alert.Clone()
m.mu.Unlock()
log.Debug().
Str("alertID", alertID).
Str("user", user).
Time("ackTime", now).
Msg("Alert acknowledgment recorded")
m.safeCallAcknowledgedCallback(alertCopy, user)
return nil
}
// UnacknowledgeAlert removes the acknowledged status from an alert
func (m *Manager) UnacknowledgeAlert(alertID string) error {
m.mu.Lock()
alert, exists := m.activeAlerts[alertID]
if !exists {
m.mu.Unlock()
return fmt.Errorf("alert not found: %s", alertID)
}
alert.Acknowledged = false
alert.AckTime = nil
alert.AckUser = ""
// Write the modified alert back to the map
m.activeAlerts[alertID] = alert
delete(m.ackState, alertID)
alertCopy := alert.Clone()
m.mu.Unlock()
log.Info().
Str("alertID", alertID).
Msg("Alert unacknowledged")
m.safeCallUnacknowledgedCallback(alertCopy, "")
return nil
}
// preserveAlertState copies acknowledgement and escalation metadata from an existing alert
// into a freshly constructed alert before it replaces the existing entry in the map. This
// prevents UI state from regressing when alerts are rebuilt during polling.
func (m *Manager) preserveAlertState(alertID string, updated *Alert) {
if updated == nil {
return
}
// Auto-resolve node display name if not already set.
if updated.NodeDisplayName == "" && updated.Node != "" {
updated.NodeDisplayName = m.resolveNodeDisplayName(updated.Instance, updated.Node)
}
existing, exists := m.activeAlerts[alertID]
if exists && existing != nil {
// Preserve the original start time so duration calculations are correct
updated.StartTime = existing.StartTime
if existing.LastNotified != nil {
t := *existing.LastNotified
updated.LastNotified = &t
} else {
updated.LastNotified = nil
}
updated.Acknowledged = existing.Acknowledged
updated.AckUser = existing.AckUser
if existing.AckTime != nil {
t := *existing.AckTime
updated.AckTime = &t
} else {
updated.AckTime = nil
}
updated.LastEscalation = existing.LastEscalation
if len(existing.EscalationTimes) > 0 {
updated.EscalationTimes = append([]time.Time(nil), existing.EscalationTimes...)
} else {
updated.EscalationTimes = nil
}
log.Debug().
Str("alertID", alertID).
Time("originalStartTime", existing.StartTime).
Dur("currentDuration", time.Since(existing.StartTime)).
Msg("Preserving alert state including StartTime")
return
}
// Fall back to previously recorded acknowledgement state for this alert ID (e.g., flapping alerts)
if record, ok := m.ackState[alertID]; ok && record.acknowledged {
updated.Acknowledged = true
updated.AckUser = record.user
t := record.time
updated.AckTime = &t
}
}
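// removeActiveAlertNoLock deletes an alert from the active map, records its final
// LastSeen in history, and marks its acknowledgement record inactive so the
// cleanup TTL starts from removal time. Callers must hold m.mu.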
func (m *Manager) removeActiveAlertNoLock(alertID string) {
// Before deleting, update the history entry with the alert's final LastSeen
// timestamp so the stored duration reflects how long the alert was actually active.
if alert, exists := m.activeAlerts[alertID]; exists && alert != nil {
m.historyManager.UpdateAlertLastSeen(alertID, alert.LastSeen)
}
delete(m.activeAlerts, alertID)
// NOTE: Don't delete ackState here - preserve it so if the same alert
// reappears (e.g., powered-off VM during backup), the acknowledgement
// is restored via preserveAlertState. ackState is cleaned up in Cleanup().
// Update inactiveAt so the cleanup TTL is measured from removal time, not ack time.
if record, exists := m.ackState[alertID]; exists {
record.inactiveAt = time.Now()
m.ackState[alertID] = record
}
}
// GetActiveAlerts returns all active alerts
func (m *Manager) GetActiveAlerts() []Alert {
m.mu.RLock()
defer m.mu.RUnlock()
alerts := make([]Alert, 0, len(m.activeAlerts))
for _, alert := range m.activeAlerts {
a := *alert
// Ensure display name is current (handles upgrades, renames, and
// alerts created before the cache was populated).
if dn := m.resolveNodeDisplayName(a.Instance, a.Node); dn != "" {
a.NodeDisplayName = dn
}
alerts = append(alerts, a)
}
// Sort to ensure stable ordering across poll cycles (map iteration is random)
sort.Slice(alerts, func(i, j int) bool {
if alerts[i].Node != alerts[j].Node {
return alerts[i].Node < alerts[j].Node
}
return alerts[i].ID < alerts[j].ID
})
return alerts
}
// NotifyExistingAlert re-dispatches a notification for an existing active alert
// Used when activation state changes from pending to active
func (m *Manager) NotifyExistingAlert(alertID string) {
m.mu.Lock()
defer m.mu.Unlock()
alert, exists := m.activeAlerts[alertID]
if !exists {
return
}
// Dispatch notification for existing alert while holding lock
// dispatchAlert expects caller to hold m.mu for checkFlapping safety
m.dispatchAlert(alert, true)
}
// GetRecentlyResolved returns recently resolved alerts
func (m *Manager) GetRecentlyResolved() []models.ResolvedAlert {
m.resolvedMutex.RLock()
defer m.resolvedMutex.RUnlock()
resolved := make([]models.ResolvedAlert, 0, len(m.recentlyResolved))
for _, alert := range m.recentlyResolved {
resolved = append(resolved, models.ResolvedAlert{
Alert: models.Alert{
ID: alert.ID,
Type: alert.Type,
Level: string(alert.Level),
ResourceID: alert.ResourceID,
ResourceName: alert.ResourceName,
Node: alert.Node,
Instance: alert.Instance,
Message: alert.Message,
Value: alert.Value,
Threshold: alert.Threshold,
StartTime: alert.StartTime,
Acknowledged: alert.Acknowledged,
},
ResolvedTime: alert.ResolvedTime,
})
}
return resolved
}
// GetResolvedAlert returns a copy of a recently resolved alert by ID.
func (m *Manager) GetResolvedAlert(alertID string) *ResolvedAlert {
m.resolvedMutex.RLock()
defer m.resolvedMutex.RUnlock()
resolved, ok := m.recentlyResolved[alertID]
if !ok || resolved == nil || resolved.Alert == nil {
return nil
}
return &ResolvedAlert{
Alert: resolved.Alert.Clone(),
ResolvedTime: resolved.ResolvedTime,
}
}
// GetAlertHistory returns alert history
func (m *Manager) GetAlertHistory(limit int) []Alert {
return m.historyManager.GetAllHistory(limit)
}
// GetAlertHistorySince returns alert history entries created after the provided time.
func (m *Manager) GetAlertHistorySince(since time.Time, limit int) []Alert {
if since.IsZero() {
return m.GetAlertHistory(limit)
}
return m.historyManager.GetHistory(since, limit)
}
// ClearAlertHistory clears all alert history
func (m *Manager) ClearAlertHistory() error {
return m.historyManager.ClearAllHistory()
}
// OnAlertHistory registers a callback to be called when alerts are added to history.
// This enables external systems like pattern detection to track alerts.
func (m *Manager) OnAlertHistory(cb AlertCallback) {
if m.historyManager != nil {
m.historyManager.OnAlert(cb)
}
}
// checkNodeOffline creates an alert for offline nodes after confirmation
func (m *Manager) checkNodeOffline(node models.Node) {
alertID := fmt.Sprintf("node-offline-%s", node.ID)
m.mu.Lock()
defer m.mu.Unlock()
// Check if node connectivity alerts are disabled
if override, exists := m.config.Overrides[node.ID]; exists && override.DisableConnectivity {
// Node connectivity alerts are disabled, clear any existing alert and return
if _, alertExists := m.activeAlerts[alertID]; alertExists {
m.clearAlertNoLock(alertID)
log.Debug().
Str("node", node.Name).
Msg("Node offline alert cleared (connectivity alerts disabled)")
}
delete(m.nodeOfflineCount, node.ID)
return
}
// Check if alert already exists
if _, exists := m.activeAlerts[alertID]; exists {
// Alert already exists, just update last seen time
m.activeAlerts[alertID].LastSeen = time.Now()
return
}
// Increment offline count
m.nodeOfflineCount[node.ID]++
offlineCount := m.nodeOfflineCount[node.ID]
log.Debug().
Str("node", node.Name).
Str("instance", node.Instance).
Int("offlineCount", offlineCount).
Msg("Node offline detection count")
// Require 3 consecutive offline polls (~15 seconds) before alerting
// This prevents false positives from transient cluster communication issues
const requiredOfflineCount = 3
if offlineCount < requiredOfflineCount {
log.Info().
Str("node", node.Name).
Int("count", offlineCount).
Int("required", requiredOfflineCount).
Msg("Node appears offline, waiting for confirmation")
return
}
// Create new offline alert after confirmation
alert := &Alert{
ID: alertID,
Type: "connectivity",
Level: AlertLevelCritical, // Node offline is always critical
ResourceID: node.ID,
ResourceName: node.Name,
Node: node.Name,
NodeDisplayName: m.resolveNodeDisplayName(node.Instance, node.Name),
Instance: node.Instance,
Message: fmt.Sprintf("Node '%s' is offline", node.Name),
Value: 0, // Not applicable for offline status
Threshold: 0, // Not applicable for offline status
StartTime: time.Now(),
Acknowledged: false,
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
// Add to history
m.historyManager.AddAlert(*alert)
// Send notification after confirmation
if !m.checkRateLimit(alertID) {
log.Debug().
Str("alertID", alertID).
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
Msg("Node offline alert suppressed due to rate limit")
return
}
m.dispatchAlert(alert, false)
// Log the critical event
log.Error().
Str("node", node.Name).
Str("instance", node.Instance).
Str("status", node.Status).
Str("connectionHealth", node.ConnectionHealth).
Int("confirmedAfter", requiredOfflineCount).
Msg("CRITICAL: Node is offline (confirmed)")
}
// clearNodeOfflineAlert removes offline alert when node comes back online
func (m *Manager) clearNodeOfflineAlert(node models.Node) {
alertID := fmt.Sprintf("node-offline-%s", node.ID)
m.mu.Lock()
defer m.mu.Unlock()
// Reset offline count when node comes back online
if m.nodeOfflineCount[node.ID] > 0 {
log.Debug().
Str("node", node.Name).
Int("previousCount", m.nodeOfflineCount[node.ID]).
Msg("Node back online, resetting offline count")
delete(m.nodeOfflineCount, node.ID)
}
// Check if offline alert exists
alert, exists := m.activeAlerts[alertID]
if !exists {
return
}
// Remove from active alerts
m.removeActiveAlertNoLock(alertID)
resolvedAlert := &ResolvedAlert{
Alert: alert,
ResolvedTime: time.Now(),
}
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
// Send recovery notification (async to avoid blocking alert cleanup while holding m.mu.Lock)
m.safeCallResolvedCallback(alertID, true)
// Log recovery
log.Info().
Str("node", node.Name).
Str("instance", node.Instance).
Dur("downtime", time.Since(alert.StartTime)).
Msg("Node is back online")
}
// checkPBSOffline creates an alert for offline PBS instances
func (m *Manager) checkPBSOffline(pbs models.PBSInstance) {
alertID := fmt.Sprintf("pbs-offline-%s", pbs.ID)
m.mu.Lock()
defer m.mu.Unlock()
// Check if PBS offline alerts are disabled via disableConnectivity flag
if override, exists := m.config.Overrides[pbs.ID]; exists && (override.Disabled || override.DisableConnectivity) {
// PBS connectivity alerts are disabled, clear any existing alert and return
if _, alertExists := m.activeAlerts[alertID]; alertExists {
m.clearAlertNoLock(alertID)
log.Debug().
Str("pbs", pbs.Name).
Msg("PBS offline alert cleared (connectivity alerts disabled)")
}
return
}
// Track confirmation count for this PBS
m.offlineConfirmations[pbs.ID]++
// Require 3 consecutive offline polls (~15 seconds) before alerting
if m.offlineConfirmations[pbs.ID] < 3 {
log.Debug().
Str("pbs", pbs.Name).
Int("confirmations", m.offlineConfirmations[pbs.ID]).
Msg("PBS offline detected, waiting for confirmation")
return
}
// Check if alert already exists
if _, exists := m.activeAlerts[alertID]; exists {
// Update last seen time
m.activeAlerts[alertID].LastSeen = time.Now()
return
}
// Create new offline alert after confirmation
alert := &Alert{
ID: alertID,
Type: "offline",
Level: AlertLevelCritical,
ResourceID: pbs.ID,
ResourceName: pbs.Name,
Node: pbs.Host,
Instance: pbs.Name,
Message: fmt.Sprintf("PBS instance %s is offline", pbs.Name),
Value: 0,
Threshold: 0,
StartTime: time.Now(),
LastSeen: time.Now(),
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
// Log and notify
log.Error().
Str("pbs", pbs.Name).
Str("host", pbs.Host).
Int("confirmations", m.offlineConfirmations[pbs.ID]).
Msg("PBS instance is offline")
if !m.checkRateLimit(alertID) {
log.Debug().
Str("alertID", alertID).
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
Msg("PBS offline alert suppressed due to rate limit")
return
}
m.dispatchAlert(alert, true)
}
// clearPBSOfflineAlert removes offline alert when PBS comes back online
func (m *Manager) clearPBSOfflineAlert(pbs models.PBSInstance) {
alertID := fmt.Sprintf("pbs-offline-%s", pbs.ID)
m.mu.Lock()
defer m.mu.Unlock()
// Reset offline confirmation count
if count, exists := m.offlineConfirmations[pbs.ID]; exists && count > 0 {
log.Debug().
Str("pbs", pbs.Name).
Int("previousCount", count).
Msg("PBS is online, resetting offline confirmation count")
delete(m.offlineConfirmations, pbs.ID)
}
// Check if offline alert exists
alert, exists := m.activeAlerts[alertID]
if !exists {
return
}
// Remove from active alerts
m.removeActiveAlertNoLock(alertID)
resolvedAlert := &ResolvedAlert{
Alert: alert,
ResolvedTime: time.Now(),
}
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
// Send recovery notification (async to avoid blocking alert cleanup while holding m.mu.Lock)
m.safeCallResolvedCallback(alertID, true)
// Log recovery
log.Info().
Str("pbs", pbs.Name).
Str("host", pbs.Host).
Dur("downtime", time.Since(alert.StartTime)).
Msg("PBS instance is back online")
}
// checkPMGOffline creates an alert for offline PMG instances
func (m *Manager) checkPMGOffline(pmg models.PMGInstance) {
alertID := fmt.Sprintf("pmg-offline-%s", pmg.ID)
m.mu.Lock()
defer m.mu.Unlock()
// Check if PMG offline alerts are disabled via disableConnectivity flag
if override, exists := m.config.Overrides[pmg.ID]; exists && (override.Disabled || override.DisableConnectivity) {
// PMG connectivity alerts are disabled, clear any existing alert and return
if _, alertExists := m.activeAlerts[alertID]; alertExists {
m.clearAlertNoLock(alertID)
log.Debug().
Str("pmg", pmg.Name).
Msg("PMG offline alert cleared (connectivity alerts disabled)")
}
return
}
// Track confirmation count for this PMG
m.offlineConfirmations[pmg.ID]++
// Require 3 consecutive offline polls (~15 seconds) before alerting
if m.offlineConfirmations[pmg.ID] < 3 {
log.Debug().
Str("pmg", pmg.Name).
Int("confirmations", m.offlineConfirmations[pmg.ID]).
Msg("PMG offline detected, waiting for confirmation")
return
}
// Check if alert already exists
if _, exists := m.activeAlerts[alertID]; exists {
// Update last seen time
m.activeAlerts[alertID].LastSeen = time.Now()
return
}
// Create new offline alert after confirmation
alert := &Alert{
ID: alertID,
Type: "offline",
Level: AlertLevelCritical,
ResourceID: pmg.ID,
ResourceName: pmg.Name,
Node: pmg.Host,
Instance: pmg.Name,
Message: fmt.Sprintf("PMG instance %s is offline", pmg.Name),
Value: 0,
Threshold: 0,
StartTime: time.Now(),
LastSeen: time.Now(),
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
// Log and notify
log.Error().
Str("pmg", pmg.Name).
Str("host", pmg.Host).
Int("confirmations", m.offlineConfirmations[pmg.ID]).
Msg("PMG instance is offline")
if !m.checkRateLimit(alertID) {
log.Debug().
Str("alertID", alertID).
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
Msg("PMG offline alert suppressed due to rate limit")
return
}
m.dispatchAlert(alert, true)
}
// clearPMGOfflineAlert removes offline alert when PMG comes back online
func (m *Manager) clearPMGOfflineAlert(pmg models.PMGInstance) {
alertID := fmt.Sprintf("pmg-offline-%s", pmg.ID)
m.mu.Lock()
defer m.mu.Unlock()
// Reset offline confirmation count
if count, exists := m.offlineConfirmations[pmg.ID]; exists && count > 0 {
log.Debug().
Str("pmg", pmg.Name).
Int("previousCount", count).
Msg("PMG is online, resetting offline confirmation count")
delete(m.offlineConfirmations, pmg.ID)
}
// Check if offline alert exists
alert, exists := m.activeAlerts[alertID]
if !exists {
return
}
// Remove from active alerts
m.removeActiveAlertNoLock(alertID)
resolvedAlert := &ResolvedAlert{
Alert: alert,
ResolvedTime: time.Now(),
}
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
// Send recovery notification (async to avoid blocking alert cleanup while holding m.mu.Lock)
m.safeCallResolvedCallback(alertID, true)
// Log recovery
log.Info().
Str("pmg", pmg.Name).
Str("host", pmg.Host).
Dur("downtime", time.Since(alert.StartTime)).
Msg("PMG instance is back online")
}
// checkPMGQueueDepths checks PMG mail queue depths and creates alerts
// Evaluates all queue types (total, deferred, hold) independently
func (m *Manager) checkPMGQueueDepths(pmg models.PMGInstance, defaults PMGThresholdConfig) {
// Aggregate queue totals across all nodes
var totalQueue, totalDeferred, totalHold int
for _, node := range pmg.Nodes {
if node.QueueStatus != nil {
totalQueue += node.QueueStatus.Total
totalDeferred += node.QueueStatus.Deferred
totalHold += node.QueueStatus.Hold
}
}
// Check total queue depth
if defaults.QueueTotalWarning > 0 || defaults.QueueTotalCritical > 0 {
alertID := fmt.Sprintf("%s-queue-total", pmg.ID)
var level AlertLevel
var threshold int
var shouldAlert bool
if defaults.QueueTotalCritical > 0 && totalQueue >= defaults.QueueTotalCritical {
level = AlertLevelCritical
threshold = defaults.QueueTotalCritical
shouldAlert = true
} else if defaults.QueueTotalWarning > 0 && totalQueue >= defaults.QueueTotalWarning {
level = AlertLevelWarning
threshold = defaults.QueueTotalWarning
shouldAlert = true
}
if !shouldAlert {
m.clearAlert(alertID)
} else {
m.mu.Lock()
if alert, exists := m.activeAlerts[alertID]; exists {
alert.LastSeen = time.Now()
alert.Value = float64(totalQueue)
alert.Threshold = float64(threshold)
alert.Level = level
} else {
alert := &Alert{
ID: alertID,
Type: "queue-depth",
Level: level,
ResourceID: pmg.ID,
ResourceName: pmg.Name,
Node: pmg.Host,
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, pmg.Host),
Instance: pmg.Name,
Message: fmt.Sprintf("PMG %s has %d total messages in queue (threshold: %d)", pmg.Name, totalQueue, threshold),
Value: float64(totalQueue),
Threshold: float64(threshold),
StartTime: time.Now(),
LastSeen: time.Now(),
}
m.activeAlerts[alertID] = alert
m.dispatchAlert(alert, true)
log.Warn().
Str("pmg", pmg.Name).
Int("total_queue", totalQueue).
Int("threshold", threshold).
Str("level", string(level)).
Msg("PMG total queue depth alert triggered")
}
m.mu.Unlock()
}
}
// Check deferred queue depth
if defaults.DeferredQueueWarn > 0 || defaults.DeferredQueueCritical > 0 {
alertID := fmt.Sprintf("%s-queue-deferred", pmg.ID)
var level AlertLevel
var threshold int
var shouldAlert bool
if defaults.DeferredQueueCritical > 0 && totalDeferred >= defaults.DeferredQueueCritical {
level = AlertLevelCritical
threshold = defaults.DeferredQueueCritical
shouldAlert = true
} else if defaults.DeferredQueueWarn > 0 && totalDeferred >= defaults.DeferredQueueWarn {
level = AlertLevelWarning
threshold = defaults.DeferredQueueWarn
shouldAlert = true
}
if !shouldAlert {
m.clearAlert(alertID)
} else {
m.mu.Lock()
if alert, exists := m.activeAlerts[alertID]; exists {
alert.LastSeen = time.Now()
alert.Value = float64(totalDeferred)
alert.Threshold = float64(threshold)
alert.Level = level
} else {
alert := &Alert{
ID: alertID,
Type: "queue-deferred",
Level: level,
ResourceID: pmg.ID,
ResourceName: pmg.Name,
Node: pmg.Host,
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, pmg.Host),
Instance: pmg.Name,
Message: fmt.Sprintf("PMG %s has %d deferred messages (threshold: %d)", pmg.Name, totalDeferred, threshold),
Value: float64(totalDeferred),
Threshold: float64(threshold),
StartTime: time.Now(),
LastSeen: time.Now(),
}
m.activeAlerts[alertID] = alert
m.dispatchAlert(alert, true)
log.Warn().
Str("pmg", pmg.Name).
Int("deferred_queue", totalDeferred).
Int("threshold", threshold).
Str("level", string(level)).
Msg("PMG deferred queue depth alert triggered")
}
m.mu.Unlock()
}
}
// Check hold queue depth
if defaults.HoldQueueWarn > 0 || defaults.HoldQueueCritical > 0 {
alertID := fmt.Sprintf("%s-queue-hold", pmg.ID)
var level AlertLevel
var threshold int
var shouldAlert bool
if defaults.HoldQueueCritical > 0 && totalHold >= defaults.HoldQueueCritical {
level = AlertLevelCritical
threshold = defaults.HoldQueueCritical
shouldAlert = true
} else if defaults.HoldQueueWarn > 0 && totalHold >= defaults.HoldQueueWarn {
level = AlertLevelWarning
threshold = defaults.HoldQueueWarn
shouldAlert = true
}
if !shouldAlert {
m.clearAlert(alertID)
} else {
m.mu.Lock()
if alert, exists := m.activeAlerts[alertID]; exists {
alert.LastSeen = time.Now()
alert.Value = float64(totalHold)
alert.Threshold = float64(threshold)
alert.Level = level
} else {
alert := &Alert{
ID: alertID,
Type: "queue-hold",
Level: level,
ResourceID: pmg.ID,
ResourceName: pmg.Name,
Node: pmg.Host,
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, pmg.Host),
Instance: pmg.Name,
Message: fmt.Sprintf("PMG %s has %d held messages (threshold: %d)", pmg.Name, totalHold, threshold),
Value: float64(totalHold),
Threshold: float64(threshold),
StartTime: time.Now(),
LastSeen: time.Now(),
}
m.activeAlerts[alertID] = alert
m.dispatchAlert(alert, true)
log.Warn().
Str("pmg", pmg.Name).
Int("hold_queue", totalHold).
Int("threshold", threshold).
Str("level", string(level)).
Msg("PMG hold queue depth alert triggered")
}
m.mu.Unlock()
}
}
}
// checkPMGOldestMessage checks oldest queued message age and creates alerts
func (m *Manager) checkPMGOldestMessage(pmg models.PMGInstance, defaults PMGThresholdConfig) {
if defaults.OldestMessageWarnMins <= 0 && defaults.OldestMessageCritMins <= 0 {
return
}
// Find the oldest message age across all nodes
var oldestAge int64 // in seconds
for _, node := range pmg.Nodes {
if node.QueueStatus != nil && node.QueueStatus.OldestAge > oldestAge {
oldestAge = node.QueueStatus.OldestAge
}
}
if oldestAge == 0 {
// No messages in queue, clear any existing alert
m.clearAlert(fmt.Sprintf("%s-oldest-message", pmg.ID))
return
}
alertID := fmt.Sprintf("%s-oldest-message", pmg.ID)
oldestMinutes := oldestAge / 60
var level AlertLevel
var threshold int64
if defaults.OldestMessageCritMins > 0 && oldestMinutes >= int64(defaults.OldestMessageCritMins) {
level = AlertLevelCritical
threshold = int64(defaults.OldestMessageCritMins)
} else if defaults.OldestMessageWarnMins > 0 && oldestMinutes >= int64(defaults.OldestMessageWarnMins) {
level = AlertLevelWarning
threshold = int64(defaults.OldestMessageWarnMins)
} else {
// Oldest message is below thresholds, clear any existing alert
m.clearAlert(alertID)
return
}
m.mu.Lock()
defer m.mu.Unlock()
// Check if alert already exists
if alert, exists := m.activeAlerts[alertID]; exists {
// Update existing alert
alert.LastSeen = time.Now()
alert.Value = float64(oldestMinutes)
alert.Threshold = float64(threshold)
alert.Level = level
return
}
// Create new alert
alert := &Alert{
ID: alertID,
Type: "message-age",
Level: level,
ResourceID: pmg.ID,
ResourceName: pmg.Name,
Node: pmg.Host,
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, pmg.Host),
Instance: pmg.Name,
Message: fmt.Sprintf("PMG %s has messages queued for %d minutes (threshold: %d minutes)", pmg.Name, oldestMinutes, threshold),
Value: float64(oldestMinutes),
Threshold: float64(threshold),
StartTime: time.Now(),
LastSeen: time.Now(),
}
m.activeAlerts[alertID] = alert
m.dispatchAlert(alert, true)
log.Warn().
Str("pmg", pmg.Name).
Int64("oldest_minutes", oldestMinutes).
Int64("threshold", threshold).
Str("level", string(level)).
Msg("PMG oldest message age alert triggered")
}
// checkPMGNodeQueues checks individual PMG node queue health
// Uses scaled thresholds (60% warn, 80% crit) and outlier detection
func (m *Manager) checkPMGNodeQueues(pmg models.PMGInstance, defaults PMGThresholdConfig) {
if len(pmg.Nodes) == 0 {
return
}
// Calculate median queue values across nodes for outlier detection
nodeQueueTotals := make([]int, 0, len(pmg.Nodes))
nodeQueueDeferred := make([]int, 0, len(pmg.Nodes))
nodeQueueHold := make([]int, 0, len(pmg.Nodes))
for _, node := range pmg.Nodes {
if node.QueueStatus != nil {
nodeQueueTotals = append(nodeQueueTotals, node.QueueStatus.Total)
nodeQueueDeferred = append(nodeQueueDeferred, node.QueueStatus.Deferred)
nodeQueueHold = append(nodeQueueHold, node.QueueStatus.Hold)
}
}
medianTotal := calculateMedianInt(nodeQueueTotals)
medianDeferred := calculateMedianInt(nodeQueueDeferred)
medianHold := calculateMedianInt(nodeQueueHold)
// Scaled thresholds: 60% for warning, 80% for critical (computed once, used for all nodes)
scaledQueueWarn := scaleThreshold(defaults.QueueTotalWarning, 0.6)
scaledQueueCrit := scaleThreshold(defaults.QueueTotalCritical, 0.8)
scaledDeferredWarn := scaleThreshold(defaults.DeferredQueueWarn, 0.6)
scaledDeferredCrit := scaleThreshold(defaults.DeferredQueueCritical, 0.8)
scaledHoldWarn := scaleThreshold(defaults.HoldQueueWarn, 0.6)
scaledHoldCrit := scaleThreshold(defaults.HoldQueueCritical, 0.8)
scaledAgeWarn := scaleThreshold(defaults.OldestMessageWarnMins, 0.6)
scaledAgeCrit := scaleThreshold(defaults.OldestMessageCritMins, 0.8)
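// Illustrative example: with cluster-level thresholds of 100 (warn) and 200 (crit),
// each node alerts at ceil(100*0.6)=60 messages (warning) and ceil(200*0.8)=160 (critical).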
// Check each node
for _, node := range pmg.Nodes {
if node.QueueStatus == nil {
continue
}
// Check total queue depth; thresholds are evaluated for every node, outlier status only annotates the message
if scaledQueueWarn > 0 || scaledQueueCrit > 0 {
total := node.QueueStatus.Total
alertID := fmt.Sprintf("%s-%s-queue-total", pmg.ID, node.Name)
var level AlertLevel
var threshold int
if scaledQueueCrit > 0 && total >= scaledQueueCrit {
level = AlertLevelCritical
threshold = scaledQueueCrit
} else if scaledQueueWarn > 0 && total >= scaledQueueWarn {
level = AlertLevelWarning
threshold = scaledQueueWarn
} else {
m.clearAlert(alertID)
continue
}
// Add outlier indicator to message if applicable
isOutlier := isQueueOutlier(total, medianTotal)
outlierNote := ""
if isOutlier {
outlierNote = ", outlier"
}
m.createOrUpdateNodeAlert(alertID, pmg, node.Name, "queue-total", level, float64(total), float64(threshold),
fmt.Sprintf("PMG node %s on %s has %d total messages in queue (threshold: %d%s)",
node.Name, pmg.Name, total, threshold, outlierNote))
}
// Check deferred queue depth against scaled thresholds
if scaledDeferredWarn > 0 || scaledDeferredCrit > 0 {
deferred := node.QueueStatus.Deferred
alertID := fmt.Sprintf("%s-%s-queue-deferred", pmg.ID, node.Name)
var level AlertLevel
var threshold int
if scaledDeferredCrit > 0 && deferred >= scaledDeferredCrit {
level = AlertLevelCritical
threshold = scaledDeferredCrit
} else if scaledDeferredWarn > 0 && deferred >= scaledDeferredWarn {
level = AlertLevelWarning
threshold = scaledDeferredWarn
} else {
m.clearAlert(alertID)
continue
}
// Add outlier indicator to message if applicable
isOutlier := isQueueOutlier(deferred, medianDeferred)
outlierNote := ""
if isOutlier {
outlierNote = ", outlier"
}
m.createOrUpdateNodeAlert(alertID, pmg, node.Name, "queue-deferred", level, float64(deferred), float64(threshold),
fmt.Sprintf("PMG node %s on %s has %d deferred messages (threshold: %d%s)",
node.Name, pmg.Name, deferred, threshold, outlierNote))
}
// Check hold queue depth against scaled thresholds
if scaledHoldWarn > 0 || scaledHoldCrit > 0 {
hold := node.QueueStatus.Hold
alertID := fmt.Sprintf("%s-%s-queue-hold", pmg.ID, node.Name)
var level AlertLevel
var threshold int
if scaledHoldCrit > 0 && hold >= scaledHoldCrit {
level = AlertLevelCritical
threshold = scaledHoldCrit
} else if scaledHoldWarn > 0 && hold >= scaledHoldWarn {
level = AlertLevelWarning
threshold = scaledHoldWarn
} else {
m.clearAlert(alertID)
continue
}
// Add outlier indicator to message if applicable
isOutlier := isQueueOutlier(hold, medianHold)
outlierNote := ""
if isOutlier {
outlierNote = ", outlier"
}
m.createOrUpdateNodeAlert(alertID, pmg, node.Name, "queue-hold", level, float64(hold), float64(threshold),
fmt.Sprintf("PMG node %s on %s has %d held messages (threshold: %d%s)",
node.Name, pmg.Name, hold, threshold, outlierNote))
}
// Check oldest message age per node
if scaledAgeWarn > 0 || scaledAgeCrit > 0 {
oldestAge := node.QueueStatus.OldestAge
if oldestAge > 0 {
oldestMinutes := oldestAge / 60
alertID := fmt.Sprintf("%s-%s-oldest-message", pmg.ID, node.Name)
var level AlertLevel
var threshold int64
if scaledAgeCrit > 0 && oldestMinutes >= int64(scaledAgeCrit) {
level = AlertLevelCritical
threshold = int64(scaledAgeCrit)
} else if scaledAgeWarn > 0 && oldestMinutes >= int64(scaledAgeWarn) {
level = AlertLevelWarning
threshold = int64(scaledAgeWarn)
} else {
m.clearAlert(alertID)
continue
}
m.createOrUpdateNodeAlert(alertID, pmg, node.Name, "message-age", level, float64(oldestMinutes), float64(threshold),
fmt.Sprintf("PMG node %s on %s has messages queued for %d minutes (threshold: %d min, node-specific)",
node.Name, pmg.Name, oldestMinutes, threshold))
}
}
}
}
// isQueueOutlier determines if a node's queue value is a significant outlier
// Returns true if value is >40% above the median across all nodes
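// For example, with a median of 100 messages across nodes, a node holding 150
// messages is 50% above the median and is flagged as an outlier, while a node
// holding 130 (30% above) is not.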
func isQueueOutlier(value, median int) bool {
if median == 0 {
return value > 0
}
percentAboveMedian := float64(value-median) / float64(median) * 100
return percentAboveMedian > 40
}
// scaleThreshold applies a scaling factor to a threshold and ensures minimum value of 1
// Uses ceiling to avoid truncation issues with small thresholds
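// For example, scaleThreshold(5, 0.6) returns 3, and scaleThreshold(1, 0.6)
// returns 1 rather than truncating down to zero.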
func scaleThreshold(threshold int, scaleFactor float64) int {
if threshold <= 0 {
return 0
}
scaled := int(math.Ceil(float64(threshold) * scaleFactor))
if scaled < 1 {
return 1
}
return scaled
}
// calculateMedianInt calculates median of integer slice
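// For example, [5, 1, 3] yields 3, and [4, 1, 3, 2] yields (2+3)/2 = 2 using
// integer division.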
func calculateMedianInt(values []int) int {
if len(values) == 0 {
return 0
}
// Copy and sort
sorted := make([]int, len(values))
copy(sorted, values)
sort.Ints(sorted)
mid := len(sorted) / 2
if len(sorted)%2 == 0 {
return (sorted[mid-1] + sorted[mid]) / 2
}
return sorted[mid]
}
// createOrUpdateNodeAlert creates or updates a per-node alert
func (m *Manager) createOrUpdateNodeAlert(alertID string, pmg models.PMGInstance, nodeName, alertType string, level AlertLevel, value, threshold float64, message string) {
m.mu.Lock()
defer m.mu.Unlock()
// Check if alert already exists
if alert, exists := m.activeAlerts[alertID]; exists {
alert.LastSeen = time.Now()
alert.Value = value
alert.Threshold = threshold
alert.Level = level
alert.Message = message
return
}
// Create new alert
alert := &Alert{
ID: alertID,
Type: alertType,
Level: level,
ResourceID: pmg.ID,
ResourceName: pmg.Name,
Node: nodeName,
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, nodeName),
Instance: pmg.Name,
Message: message,
Value: value,
Threshold: threshold,
StartTime: time.Now(),
LastSeen: time.Now(),
}
m.activeAlerts[alertID] = alert
m.dispatchAlert(alert, true)
log.Warn().
Str("pmg", pmg.Name).
Str("node", nodeName).
Str("type", alertType).
Float64("value", value).
Float64("threshold", threshold).
Str("level", string(level)).
Msg("PMG per-node alert triggered")
}
// checkPMGQuarantineBacklog checks quarantine backlog and growth rates
func (m *Manager) checkPMGQuarantineBacklog(pmg models.PMGInstance, defaults PMGThresholdConfig) {
if pmg.Quarantine == nil {
m.clearAlert(fmt.Sprintf("%s-quarantine-spam", pmg.ID))
m.clearAlert(fmt.Sprintf("%s-quarantine-virus", pmg.ID))
return
}
now := time.Now()
currentSpam := pmg.Quarantine.Spam
currentVirus := pmg.Quarantine.Virus
// Store current snapshot
m.mu.Lock()
snapshot := pmgQuarantineSnapshot{
Spam: currentSpam,
Virus: currentVirus,
Timestamp: now,
}
// Get or create history for this PMG instance
history := m.pmgQuarantineHistory[pmg.ID]
history = append(history, snapshot)
// Clean old snapshots (keep last 3 hours)
cutoff := now.Add(-3 * time.Hour)
validSnapshots := make([]pmgQuarantineSnapshot, 0, len(history))
for _, snap := range history {
if snap.Timestamp.After(cutoff) {
validSnapshots = append(validSnapshots, snap)
}
}
// Limit to max 48 samples to prevent unbounded growth
const maxQuarantineSnapshots = 48
if len(validSnapshots) > maxQuarantineSnapshots {
validSnapshots = validSnapshots[len(validSnapshots)-maxQuarantineSnapshots:]
}
m.pmgQuarantineHistory[pmg.ID] = validSnapshots
m.mu.Unlock()
// Find snapshot from ~2 hours ago (within ±15 min tolerance)
var twoHoursAgo *pmgQuarantineSnapshot
targetTime := now.Add(-2 * time.Hour)
minDiff := 15 * time.Minute
for i := range validSnapshots {
snap := &validSnapshots[i]
diff := snap.Timestamp.Sub(targetTime)
if diff < 0 {
diff = -diff
}
if diff < minDiff {
minDiff = diff
twoHoursAgo = snap
}
}
// Check spam quarantine
m.checkQuarantineMetric(pmg, "spam", currentSpam, twoHoursAgo, defaults)
// Check virus quarantine
m.checkQuarantineMetric(pmg, "virus", currentVirus, twoHoursAgo, defaults)
}
// checkQuarantineMetric checks a single quarantine metric (spam or virus)
func (m *Manager) checkQuarantineMetric(pmg models.PMGInstance, metricType string, current int, twoHoursAgo *pmgQuarantineSnapshot, defaults PMGThresholdConfig) {
alertID := fmt.Sprintf("%s-quarantine-%s", pmg.ID, metricType)
var absoluteWarn, absoluteCrit int
var previousCount int
// Get thresholds and previous count based on metric type
if metricType == "spam" {
absoluteWarn = defaults.QuarantineSpamWarn
absoluteCrit = defaults.QuarantineSpamCritical
if twoHoursAgo != nil {
previousCount = twoHoursAgo.Spam
}
} else { // virus
absoluteWarn = defaults.QuarantineVirusWarn
absoluteCrit = defaults.QuarantineVirusCritical
if twoHoursAgo != nil {
previousCount = twoHoursAgo.Virus
}
}
var level AlertLevel
var message string
var threshold int
var alertTriggered bool
// Check absolute thresholds first
if absoluteCrit > 0 && current >= absoluteCrit {
level = AlertLevelCritical
threshold = absoluteCrit
message = fmt.Sprintf("PMG %s has %d %s messages in quarantine (threshold: %d)", pmg.Name, current, metricType, threshold)
alertTriggered = true
} else if absoluteWarn > 0 && current >= absoluteWarn {
level = AlertLevelWarning
threshold = absoluteWarn
message = fmt.Sprintf("PMG %s has %d %s messages in quarantine (threshold: %d)", pmg.Name, current, metricType, threshold)
alertTriggered = true
}
// Check growth thresholds if we have historical data
if twoHoursAgo != nil && previousCount > 0 {
growth := current - previousCount
growthPct := (float64(growth) / float64(previousCount)) * 100
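// e.g. a previous count of 1,000 and a current count of 1,600 gives growth=600 and
// growthPct=60, which would satisfy critical growth thresholds of 50% and 500 messages.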
// Critical growth: ≥50% AND ≥500 messages
if defaults.QuarantineGrowthCritPct > 0 && defaults.QuarantineGrowthCritMin > 0 {
if growthPct >= float64(defaults.QuarantineGrowthCritPct) && growth >= defaults.QuarantineGrowthCritMin {
if level != AlertLevelCritical { // Only override if not already critical from absolute
level = AlertLevelCritical
threshold = previousCount + defaults.QuarantineGrowthCritMin
message = fmt.Sprintf("PMG %s %s quarantine growing rapidly: +%d messages (+%.1f%%) in 2 hours", pmg.Name, metricType, growth, growthPct)
alertTriggered = true
}
}
}
// Warning growth: ≥25% AND ≥250 messages (if not already critical)
if level != AlertLevelCritical && defaults.QuarantineGrowthWarnPct > 0 && defaults.QuarantineGrowthWarnMin > 0 {
if growthPct >= float64(defaults.QuarantineGrowthWarnPct) && growth >= defaults.QuarantineGrowthWarnMin {
level = AlertLevelWarning
threshold = previousCount + defaults.QuarantineGrowthWarnMin
message = fmt.Sprintf("PMG %s %s quarantine growing: +%d messages (+%.1f%%) in 2 hours", pmg.Name, metricType, growth, growthPct)
alertTriggered = true
}
}
}
// Clear alert if no thresholds exceeded
if !alertTriggered {
m.clearAlert(alertID)
return
}
m.mu.Lock()
defer m.mu.Unlock()
// Check if alert already exists
if alert, exists := m.activeAlerts[alertID]; exists {
// Update existing alert
alert.LastSeen = time.Now()
alert.Value = float64(current)
alert.Threshold = float64(threshold)
alert.Level = level
alert.Message = message
return
}
// Create new alert
alert := &Alert{
ID: alertID,
Type: fmt.Sprintf("quarantine-%s", metricType),
Level: level,
ResourceID: pmg.ID,
ResourceName: pmg.Name,
Node: pmg.Host,
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, pmg.Host),
Instance: pmg.Name,
Message: message,
Value: float64(current),
Threshold: float64(threshold),
StartTime: time.Now(),
LastSeen: time.Now(),
}
m.activeAlerts[alertID] = alert
m.dispatchAlert(alert, true)
log.Warn().
Str("pmg", pmg.Name).
Str("type", metricType).
Int("current", current).
Int("threshold", threshold).
Str("level", string(level)).
Msg("PMG quarantine backlog alert triggered")
}
// calculateTrimmedBaseline computes a robust baseline from historical samples
// using a trimmed mean, falling back to the median when the two diverge sharply
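// For example, given 24 hourly samples that hover around 20 messages with two
// spikes of 500, the trimmed mean discards the extremes and stays near 20,
// whereas a plain mean would be pulled above 50.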
func calculateTrimmedBaseline(samples []float64) (baseline float64, trustworthy bool) {
sampleCount := len(samples)
// Need at least 12 samples for trustworthy baseline (warmup period)
if sampleCount < 12 {
return 0, false
}
// For full 24-sample baseline, use trimmed mean
if sampleCount >= 24 {
// Create a copy for sorting
sorted := make([]float64, len(samples))
copy(sorted, samples)
// Sort samples ascending
sort.Float64s(sorted)
// Calculate median
var median float64
mid := len(sorted) / 2
if len(sorted)%2 == 0 {
median = (sorted[mid-1] + sorted[mid]) / 2
} else {
median = sorted[mid]
}
// Calculate trimmed mean: drop the top and bottom 2 samples and average the rest
if len(sorted) >= 24 {
trimmed := sorted[2 : len(sorted)-2]
sum := 0.0
for _, val := range trimmed {
sum += val
}
trimmedMean := sum / float64(len(trimmed))
// Fallback rule: if trimmed mean differs from median by >40%, use median
diff := trimmedMean - median
if diff < 0 {
diff = -diff
}
percentDiff := (diff / median) * 100
if percentDiff > 40 {
return median, true
}
return trimmedMean, true
}
}
// For 12-23 samples, use simple mean (not enough for trimming)
sum := 0.0
for _, val := range samples {
sum += val
}
return sum / float64(len(samples)), true
}
// checkPMGAnomalies detects spam/virus rate anomalies using trimmed baseline
func (m *Manager) checkPMGAnomalies(pmg models.PMGInstance, _ PMGThresholdConfig) {
// Need mail count data
if len(pmg.MailCount) == 0 {
return
}
// Get the latest hourly sample (most recent)
latest := pmg.MailCount[len(pmg.MailCount)-1]
now := time.Now()
// Get or create anomaly tracker for this PMG instance
m.mu.Lock()
tracker := m.pmgAnomalyTrackers[pmg.ID]
if tracker == nil {
tracker = &pmgAnomalyTracker{
Samples: make([]pmgMailMetricSample, 0, 48),
Baselines: make(map[string]pmgBaselineCache),
}
m.pmgAnomalyTrackers[pmg.ID] = tracker
}
// Create sample from latest mail count
sample := pmgMailMetricSample{
SpamIn: latest.SpamIn,
SpamOut: latest.SpamOut,
VirusIn: latest.VirusIn,
VirusOut: latest.VirusOut,
Timestamp: latest.Timestamp,
}
// Check for duplicate timestamp (already processed this sample)
if !tracker.LastSampleTime.IsZero() && !sample.Timestamp.After(tracker.LastSampleTime) {
m.mu.Unlock()
return
}
// Check for timestamp gaps (>90 min indicates data discontinuity)
if !tracker.LastSampleTime.IsZero() {
gap := sample.Timestamp.Sub(tracker.LastSampleTime)
if gap > 90*time.Minute {
// Discard old samples - data gap detected
log.Debug().
Str("pmg", pmg.Name).
Dur("gap", gap).
Msg("PMG mail count data gap detected, resetting anomaly history")
tracker.Samples = make([]pmgMailMetricSample, 0, 48)
tracker.SampleCount = 0
}
}
// Add sample to ring buffer
tracker.Samples = append(tracker.Samples, sample)
tracker.SampleCount++
tracker.LastSampleTime = sample.Timestamp
// Maintain ring buffer size (keep last 48)
if len(tracker.Samples) > 48 {
tracker.Samples = tracker.Samples[len(tracker.Samples)-48:]
}
sampleCount := len(tracker.Samples)
m.mu.Unlock()
// Need at least 12 samples for baseline warmup
if sampleCount < 12 {
log.Debug().
Str("pmg", pmg.Name).
Int("samples", sampleCount).
Msg("PMG anomaly detection warming up (need 12 samples)")
return
}
// Calculate baselines and check each metric
metrics := []struct {
name string
current float64
extractor func(pmgMailMetricSample) float64
}{
{"spamIn", sample.SpamIn, func(s pmgMailMetricSample) float64 { return s.SpamIn }},
{"spamOut", sample.SpamOut, func(s pmgMailMetricSample) float64 { return s.SpamOut }},
{"virusIn", sample.VirusIn, func(s pmgMailMetricSample) float64 { return s.VirusIn }},
{"virusOut", sample.VirusOut, func(s pmgMailMetricSample) float64 { return s.VirusOut }},
}
for _, metric := range metrics {
m.checkAnomalyMetric(pmg, tracker, metric.name, metric.current, metric.extractor, now)
}
}
// checkAnomalyMetric checks a single spam/virus metric for anomalies
func (m *Manager) checkAnomalyMetric(pmg models.PMGInstance, tracker *pmgAnomalyTracker, metricName string, current float64, extractor func(pmgMailMetricSample) float64, now time.Time) {
// Extract historical values for this metric (excluding current sample)
m.mu.RLock()
samples := tracker.Samples
m.mu.RUnlock()
if len(samples) < 2 {
return
}
// Get previous 24 samples (or all available if fewer than 25 total)
startIdx := 0
if len(samples) > 25 {
startIdx = len(samples) - 25
}
historicalSamples := samples[startIdx : len(samples)-1] // Exclude current (last) sample
// Extract metric values
values := make([]float64, 0, len(historicalSamples))
for _, s := range historicalSamples {
values = append(values, extractor(s))
}
// Calculate baseline
baseline, trustworthy := calculateTrimmedBaseline(values)
if !trustworthy {
return
}
// Handle zero baseline edge case
if baseline == 0 && current > 0 {
baseline = 1.0 // Treat as 1 for ratio math
}
// Determine warning and critical thresholds
var warnRatio, critRatio float64
var warnDelta, critDelta float64
if baseline < 40 {
// Quiet site: use minimum absolute deltas
warnRatio = 0
critRatio = 0
warnDelta = baseline + 60
critDelta = baseline + 120
} else {
// Normal site: use ratio + absolute delta
warnRatio = 1.8
critRatio = 2.5
warnDelta = baseline + 150
critDelta = baseline + 300
}
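// e.g. a baseline of 200 spam/hour alerts at warning once the current hour reaches
// both 1.8x the baseline (360) and baseline+150 (350); a quiet site with a baseline
// of 10 alerts at warning from 70 messages and at critical from 130.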
alertID := fmt.Sprintf("%s-anomaly-%s", pmg.ID, metricName)
pendingKey := fmt.Sprintf("pmg-anomaly-%s-%s", pmg.ID, metricName)
var level AlertLevel
var triggered bool
var ratio float64
if baseline > 0 {
ratio = current / baseline
}
// Check critical threshold
if critRatio > 0 && ratio >= critRatio && current >= critDelta {
level = AlertLevelCritical
triggered = true
} else if warnRatio > 0 && ratio >= warnRatio && current >= warnDelta {
level = AlertLevelWarning
triggered = true
} else if baseline < 40 {
// Quiet site absolute check
if current >= critDelta {
level = AlertLevelCritical
triggered = true
} else if current >= warnDelta {
level = AlertLevelWarning
triggered = true
}
}
// Two-sample confirmation using pendingAlerts
if triggered {
m.mu.Lock()
firstSeen, pending := m.pendingAlerts[pendingKey]
if !pending {
// First sample above threshold - mark as pending
m.pendingAlerts[pendingKey] = now
m.mu.Unlock()
log.Debug().
Str("pmg", pmg.Name).
Str("metric", metricName).
Float64("current", current).
Float64("baseline", baseline).
Msg("PMG anomaly pending confirmation (first sample)")
return
}
m.mu.Unlock()
// Second consecutive sample above threshold - issue alert
log.Debug().
Str("pmg", pmg.Name).
Str("metric", metricName).
Float64("current", current).
Float64("baseline", baseline).
Dur("pending", now.Sub(firstSeen)).
Msg("PMG anomaly confirmed (second sample)")
m.mu.Lock()
delete(m.pendingAlerts, pendingKey) // Clear pending
// Check if alert already exists
if alert, exists := m.activeAlerts[alertID]; exists {
alert.LastSeen = now
alert.Value = current
alert.Threshold = baseline
alert.Level = level
m.mu.Unlock()
return
}
// Create new alert
message := fmt.Sprintf("PMG %s anomaly detected: %s is %.1f messages/hour (%.1fx baseline of %.1f)",
pmg.Name, metricName, current, ratio, baseline)
alert := &Alert{
ID: alertID,
Type: fmt.Sprintf("anomaly-%s", metricName),
Level: level,
ResourceID: pmg.ID,
ResourceName: pmg.Name,
Node: pmg.Host,
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, pmg.Host),
Instance: pmg.Name,
Message: message,
Value: current,
Threshold: baseline,
StartTime: now,
LastSeen: now,
}
m.activeAlerts[alertID] = alert
m.mu.Unlock()
m.dispatchAlert(alert, true)
log.Warn().
Str("pmg", pmg.Name).
Str("metric", metricName).
Float64("current", current).
Float64("baseline", baseline).
Float64("ratio", ratio).
Str("level", string(level)).
Msg("PMG anomaly alert triggered")
} else {
// Below threshold - clear pending and alert
m.mu.Lock()
delete(m.pendingAlerts, pendingKey)
m.mu.Unlock()
m.clearAlert(alertID)
}
}
// checkStorageOffline creates an alert for offline/unavailable storage
func (m *Manager) checkStorageOffline(storage models.Storage) {
alertID := fmt.Sprintf("storage-offline-%s", storage.ID)
m.mu.Lock()
defer m.mu.Unlock()
// Check if storage offline alerts are disabled
if override, exists, _ := findStorageOverride(m.config.Overrides, storage); exists && override.Disabled {
// Storage alerts are disabled, clear any existing alert and return
if _, alertExists := m.activeAlerts[alertID]; alertExists {
m.clearAlertNoLock(alertID)
log.Debug().
Str("storage", storage.Name).
Msg("Storage offline alert cleared (alerts disabled)")
}
return
}
// Track confirmation count for this storage
m.offlineConfirmations[storage.ID]++
// Require 2 consecutive offline polls (~10 seconds) before alerting for storage
// (fewer confirmations than nodes require, since storage status can be more transient)
if m.offlineConfirmations[storage.ID] < 2 {
log.Debug().
Str("storage", storage.Name).
Int("confirmations", m.offlineConfirmations[storage.ID]).
Msg("Storage offline detected, waiting for confirmation")
return
}
// Check if alert already exists
if _, exists := m.activeAlerts[alertID]; exists {
// Update last seen time
m.activeAlerts[alertID].LastSeen = time.Now()
return
}
// Create new offline alert after confirmation
alert := &Alert{
ID: alertID,
Type: "offline",
Level: AlertLevelWarning, // Storage offline is Warning, not Critical
ResourceID: storage.ID,
ResourceName: storage.Name,
Node: storage.Node,
Instance: storage.Instance,
Message: fmt.Sprintf("Storage %s on node %s is unavailable", storage.Name, storage.Node),
Value: 0,
Threshold: 0,
StartTime: time.Now(),
LastSeen: time.Now(),
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
// Log and notify
log.Warn().
Str("storage", storage.Name).
Str("node", storage.Node).
Int("confirmations", m.offlineConfirmations[storage.ID]).
Msg("Storage is offline/unavailable")
if !m.checkRateLimit(alertID) {
log.Debug().
Str("alertID", alertID).
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
Msg("Storage offline alert suppressed due to rate limit")
return
}
m.dispatchAlert(alert, true)
}
// clearStorageOfflineAlert removes offline alert when storage comes back online
func (m *Manager) clearStorageOfflineAlert(storage models.Storage) {
alertID := fmt.Sprintf("storage-offline-%s", storage.ID)
m.mu.Lock()
defer m.mu.Unlock()
// Reset offline confirmation count
if count, exists := m.offlineConfirmations[storage.ID]; exists && count > 0 {
log.Debug().
Str("storage", storage.Name).
Int("previousCount", count).
Msg("Storage is online, resetting offline confirmation count")
delete(m.offlineConfirmations, storage.ID)
}
// Check if offline alert exists
alert, exists := m.activeAlerts[alertID]
if !exists {
return
}
// Remove from active alerts
m.removeActiveAlertNoLock(alertID)
resolvedAlert := &ResolvedAlert{
Alert: alert,
ResolvedTime: time.Now(),
}
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
// Send recovery notification (async to avoid blocking alert cleanup while holding m.mu.Lock)
m.safeCallResolvedCallback(alertID, true)
// Log recovery
log.Info().
Str("storage", storage.Name).
Str("node", storage.Node).
Dur("downtime", time.Since(alert.StartTime)).
Msg("Storage is back online")
}
// checkGuestPoweredOff creates an alert for powered-off guests
func (m *Manager) checkGuestPoweredOff(guestID, name, node, instanceName, guestType string, monitorOnly bool) {
alertID := fmt.Sprintf("guest-powered-off-%s", guestID)
m.mu.Lock()
defer m.mu.Unlock()
// Get thresholds to check if powered-off alerts are disabled
var thresholds ThresholdConfig
if override, exists := m.config.Overrides[guestID]; exists {
thresholds = override
} else {
thresholds = m.config.GuestDefaults
}
severity := normalizePoweredOffSeverity(thresholds.PoweredOffSeverity)
// Check if powered-off alerts are disabled for this guest
if thresholds.Disabled || thresholds.DisableConnectivity {
// Powered-off alerts are disabled, clear any existing alert and return
if _, alertExists := m.activeAlerts[alertID]; alertExists {
m.clearAlertNoLock(alertID)
log.Debug().
Str("guest", name).
Msg("Guest powered-off alert cleared (alerts disabled)")
}
delete(m.offlineConfirmations, guestID)
return
}
// Check if alert already exists
if alert, exists := m.activeAlerts[alertID]; exists {
// Alert already exists, just update LastSeen
alert.LastSeen = time.Now()
alert.Level = severity
if alert.Metadata == nil {
alert.Metadata = map[string]interface{}{}
}
alert.Metadata["monitorOnly"] = monitorOnly
return
}
// Increment confirmation count
m.offlineConfirmations[guestID]++
confirmCount := m.offlineConfirmations[guestID]
log.Debug().
Str("guest", name).
Str("type", guestType).
Int("confirmations", confirmCount).
Msg("Guest powered-off detected")
// Require 2 consecutive powered-off polls (~10 seconds) before alerting
// This prevents false positives from transient states
const requiredConfirmations = 2
if confirmCount < requiredConfirmations {
log.Debug().
Str("guest", name).
Int("count", confirmCount).
Int("required", requiredConfirmations).
Msg("Guest appears powered-off, waiting for confirmation")
return
}
// Create new powered-off alert after confirmation
alert := &Alert{
ID: alertID,
Type: "powered-off",
Level: severity,
ResourceID: guestID,
ResourceName: name,
Node: node,
Instance: instanceName,
Message: fmt.Sprintf("%s '%s' is powered off", guestType, name),
Value: 0, // Not applicable for powered-off status
Threshold: 0, // Not applicable for powered-off status
StartTime: time.Now(),
LastSeen: time.Now(),
Acknowledged: false,
Metadata: map[string]interface{}{
"monitorOnly": monitorOnly,
},
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
// Add to history
m.historyManager.AddAlert(*alert)
// Send notification after confirmation
m.dispatchAlert(alert, false)
// Log the event
log.Warn().
Str("guest", name).
Str("type", guestType).
Str("node", node).
Str("instance", instanceName).
Int("confirmedAfter", requiredConfirmations).
Msg("Guest is powered off (confirmed)")
}
// clearGuestPoweredOffAlert removes powered-off alert when guest starts running
func (m *Manager) clearGuestPoweredOffAlert(guestID, name string) {
alertID := fmt.Sprintf("guest-powered-off-%s", guestID)
m.mu.Lock()
defer m.mu.Unlock()
// Reset confirmation count when guest comes back online
if count, exists := m.offlineConfirmations[guestID]; exists && count > 0 {
log.Debug().
Str("guest", name).
Int("previousCount", count).
Msg("Guest is running, resetting powered-off confirmation count")
delete(m.offlineConfirmations, guestID)
}
// Check if powered-off alert exists
alert, exists := m.activeAlerts[alertID]
if !exists {
return
}
// Remove from active alerts
m.removeActiveAlertNoLock(alertID)
downtime := time.Since(alert.StartTime)
resolvedAlert := &ResolvedAlert{
Alert: alert,
ResolvedTime: time.Now(),
}
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
// Send recovery notification (async to avoid blocking alert cleanup while holding m.mu.Lock)
m.safeCallResolvedCallback(alertID, true)
// Log recovery
log.Info().
Str("guest", name).
Dur("downtime", downtime).
Msg("Guest is now running")
}
// ClearAlert removes an alert from active alerts (but keeps in history)
func (m *Manager) ClearAlert(alertID string) bool {
m.mu.Lock()
if _, exists := m.activeAlerts[alertID]; !exists {
m.mu.Unlock()
return false
}
m.clearAlertNoLock(alertID)
delete(m.recentAlerts, alertID)
delete(m.pendingAlerts, alertID)
delete(m.suppressedUntil, alertID)
delete(m.alertRateLimit, alertID)
m.mu.Unlock()
m.saveActiveAlertsAsync("manual-clear")
return true
}
// Cleanup removes old acknowledged alerts and cleans up tracking maps
func (m *Manager) Cleanup(maxAge time.Duration) {
m.mu.Lock()
now := time.Now()
var autoAcked []*Alert
lastSeenTooOld := func(alert *Alert, cutoff time.Duration) bool {
if alert == nil {
return true
}
lastSeen := alert.LastSeen
if lastSeen.IsZero() {
lastSeen = alert.StartTime
}
return now.Sub(lastSeen) > cutoff
}
// Auto-acknowledge old alerts if configured
if m.config.AutoAcknowledgeAfterHours > 0 {
autoAckThreshold := time.Duration(m.config.AutoAcknowledgeAfterHours) * time.Hour
for id, alert := range m.activeAlerts {
if !alert.Acknowledged && now.Sub(alert.StartTime) > autoAckThreshold {
log.Info().
Str("alertID", id).
Dur("age", now.Sub(alert.StartTime)).
Msg("Auto-acknowledging old alert")
alert.Acknowledged = true
ackTime := now
alert.AckTime = &ackTime
alert.AckUser = "system-auto"
autoAcked = append(autoAcked, alert.Clone())
if recordAlertAcknowledged != nil {
recordAlertAcknowledged()
}
}
}
}
// Clean up acknowledged alerts based on TTL
if m.config.MaxAcknowledgedAgeDays > 0 {
acknowledgedTTL := time.Duration(m.config.MaxAcknowledgedAgeDays) * 24 * time.Hour
for id, alert := range m.activeAlerts {
if alert.Acknowledged && alert.AckTime != nil &&
now.Sub(*alert.AckTime) > acknowledgedTTL &&
lastSeenTooOld(alert, acknowledgedTTL) {
log.Info().
Str("alertID", id).
Dur("age", now.Sub(*alert.AckTime)).
Msg("Cleaning up old acknowledged alert (TTL)")
m.removeActiveAlertNoLock(id)
}
}
}
// Clean up old unacknowledged alerts based on TTL
if m.config.MaxAlertAgeDays > 0 {
alertTTL := time.Duration(m.config.MaxAlertAgeDays) * 24 * time.Hour
for id, alert := range m.activeAlerts {
if !alert.Acknowledged && now.Sub(alert.StartTime) > alertTTL {
log.Info().
Str("alertID", id).
Dur("age", now.Sub(alert.StartTime)).
Msg("Cleaning up old unacknowledged alert (TTL)")
m.removeActiveAlertNoLock(id)
}
}
}
// Original cleanup for acknowledged alerts (fallback if TTL not configured)
for id, alert := range m.activeAlerts {
if alert.Acknowledged && alert.AckTime != nil &&
now.Sub(*alert.AckTime) > maxAge &&
lastSeenTooOld(alert, maxAge) {
m.removeActiveAlertNoLock(id)
}
}
// Clean up stale ackState entries for alerts that no longer exist
// Keep ackState for 1 hour after the alert was removed (not from ack time)
// to handle transient alert clears (e.g., backups of powered-off VMs)
ackStateTTL := 1 * time.Hour
for id, record := range m.ackState {
if _, alertExists := m.activeAlerts[id]; !alertExists {
// Use inactiveAt (when alert was removed) for TTL, not ack time
checkTime := record.inactiveAt
if checkTime.IsZero() {
// Fallback for legacy entries without inactiveAt
checkTime = record.time
}
if now.Sub(checkTime) > ackStateTTL {
delete(m.ackState, id)
}
}
}
// Clean up recent alerts older than suppression window
suppressionWindow := time.Duration(m.config.SuppressionWindow) * time.Minute
if suppressionWindow == 0 {
suppressionWindow = 5 * time.Minute // Default
}
for id, alert := range m.recentAlerts {
if now.Sub(alert.StartTime) > suppressionWindow {
delete(m.recentAlerts, id)
}
}
// Clean up expired suppressions
for id, suppressUntil := range m.suppressedUntil {
if now.After(suppressUntil) {
delete(m.suppressedUntil, id)
}
}
// Clean up old rate limit entries (older than 1 hour)
cutoff := now.Add(-1 * time.Hour)
for alertID, times := range m.alertRateLimit {
var recentTimes []time.Time
for _, t := range times {
if t.After(cutoff) {
recentTimes = append(recentTimes, t)
}
}
if len(recentTimes) == 0 {
// No recent alerts, remove the entry entirely
delete(m.alertRateLimit, alertID)
} else {
// Update with only recent times
m.alertRateLimit[alertID] = recentTimes
}
}
// Clean up old recently resolved alerts (older than 5 minutes)
fiveMinutesAgo := now.Add(-5 * time.Minute)
m.resolvedMutex.Lock()
for alertID, resolved := range m.recentlyResolved {
if resolved.ResolvedTime.Before(fiveMinutesAgo) {
delete(m.recentlyResolved, alertID)
}
}
m.resolvedMutex.Unlock()
// Clean up stale pending alerts (older than max time threshold window)
// This prevents memory leak from deleted resources that never triggered alerts
maxPendingAge := 10 * time.Minute // Longest time threshold + safety buffer
for id, pendingTime := range m.pendingAlerts {
if now.Sub(pendingTime) > maxPendingAge {
delete(m.pendingAlerts, id)
log.Debug().
Str("resourceID", id).
Dur("age", now.Sub(pendingTime)).
Msg("Cleaned up stale pending alert entry")
}
}
// Clean up flapping history for resolved/inactive alerts
flappingCleanupAge := 1 * time.Hour
for alertID := range m.flappingHistory {
// If alert is no longer active and flapping cooldown has expired
if _, exists := m.activeAlerts[alertID]; !exists {
if suppressUntil, suppressed := m.suppressedUntil[alertID]; !suppressed || now.After(suppressUntil.Add(flappingCleanupAge)) {
delete(m.flappingHistory, alertID)
delete(m.flappingActive, alertID)
log.Debug().
Str("alertID", alertID).
Msg("Cleaned up flapping history for inactive alert")
}
}
}
// Clean up old Docker restart tracking (containers not seen in 24h)
// Prevents memory leak from ephemeral containers in CI/CD environments
for resourceID, record := range m.dockerRestartTracking {
if now.Sub(record.lastChecked) > 24*time.Hour {
delete(m.dockerRestartTracking, resourceID)
log.Debug().
Str("resourceID", resourceID).
Msg("Cleaned up stale Docker restart tracking entry")
}
}
// Clean up stale PMG anomaly trackers (no samples in 24h)
// Prevents memory leak from decommissioned or transient PMG instances
staleTrackerAge := 24 * time.Hour
for pmgID, tracker := range m.pmgAnomalyTrackers {
if tracker != nil && !tracker.LastSampleTime.IsZero() {
if now.Sub(tracker.LastSampleTime) > staleTrackerAge {
delete(m.pmgAnomalyTrackers, pmgID)
log.Debug().
Str("pmgID", pmgID).
Time("lastSampleTime", tracker.LastSampleTime).
Msg("Cleaned up stale PMG anomaly tracker")
}
}
}
// Clean up stale PMG quarantine history (no recent snapshots in 7 days)
// Prevents memory leak from deleted PMG instances
staleHistoryAge := 7 * 24 * time.Hour
for pmgID, snapshots := range m.pmgQuarantineHistory {
// If no snapshots remain or last snapshot is very old
if len(snapshots) == 0 {
delete(m.pmgQuarantineHistory, pmgID)
log.Debug().
Str("pmgID", pmgID).
Msg("Cleaned up empty PMG quarantine history")
continue
}
lastSnapshot := snapshots[len(snapshots)-1]
if now.Sub(lastSnapshot.Timestamp) > staleHistoryAge {
delete(m.pmgQuarantineHistory, pmgID)
log.Debug().
Str("pmgID", pmgID).
Time("lastSnapshot", lastSnapshot.Timestamp).
Msg("Cleaned up stale PMG quarantine history")
}
}
m.mu.Unlock()
for _, alert := range autoAcked {
m.safeCallAcknowledgedCallback(alert, "system-auto")
}
}
// convertLegacyThreshold converts a legacy float64 threshold to HysteresisThreshold
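// For example, a legacy CPU threshold of 90 becomes {Trigger: 90, Clear: 85}
// with the default 5% hysteresis margin.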
func (m *Manager) convertLegacyThreshold(legacy *float64) *HysteresisThreshold {
if legacy == nil || *legacy <= 0 {
return nil
}
margin := m.config.HysteresisMargin
if margin <= 0 {
margin = 5.0 // Default 5% margin
}
return &HysteresisThreshold{
Trigger: *legacy,
Clear: *legacy - margin,
}
}
func cloneThreshold(threshold *HysteresisThreshold) *HysteresisThreshold {
if threshold == nil {
return nil
}
clone := *threshold
return &clone
}
func cloneStringPtr(value *string) *string {
if value == nil {
return nil
}
v := *value
return &v
}
func cloneThresholdConfig(cfg ThresholdConfig) ThresholdConfig {
clone := cfg
clone.CPU = cloneThreshold(cfg.CPU)
clone.Memory = cloneThreshold(cfg.Memory)
clone.Disk = cloneThreshold(cfg.Disk)
clone.DiskRead = cloneThreshold(cfg.DiskRead)
clone.DiskWrite = cloneThreshold(cfg.DiskWrite)
clone.NetworkIn = cloneThreshold(cfg.NetworkIn)
clone.NetworkOut = cloneThreshold(cfg.NetworkOut)
clone.Temperature = cloneThreshold(cfg.Temperature)
clone.DiskTemperature = cloneThreshold(cfg.DiskTemperature)
clone.Usage = cloneThreshold(cfg.Usage)
clone.Note = cloneStringPtr(cfg.Note)
return clone
}
func (m *Manager) applyThresholdOverride(base ThresholdConfig, override ThresholdConfig) ThresholdConfig {
result := base
if override.Disabled {
result.Disabled = true
}
if override.DisableConnectivity {
result.DisableConnectivity = true
}
if override.CPU != nil {
result.CPU = ensureHysteresisThreshold(cloneThreshold(override.CPU))
} else if override.CPULegacy != nil {
result.CPU = m.convertLegacyThreshold(override.CPULegacy)
}
if override.Memory != nil {
result.Memory = ensureHysteresisThreshold(cloneThreshold(override.Memory))
} else if override.MemoryLegacy != nil {
result.Memory = m.convertLegacyThreshold(override.MemoryLegacy)
}
if override.Disk != nil {
result.Disk = ensureHysteresisThreshold(cloneThreshold(override.Disk))
} else if override.DiskLegacy != nil {
result.Disk = m.convertLegacyThreshold(override.DiskLegacy)
}
if override.DiskRead != nil {
result.DiskRead = ensureHysteresisThreshold(cloneThreshold(override.DiskRead))
} else if override.DiskReadLegacy != nil {
result.DiskRead = m.convertLegacyThreshold(override.DiskReadLegacy)
}
if override.DiskWrite != nil {
result.DiskWrite = ensureHysteresisThreshold(cloneThreshold(override.DiskWrite))
} else if override.DiskWriteLegacy != nil {
result.DiskWrite = m.convertLegacyThreshold(override.DiskWriteLegacy)
}
if override.NetworkIn != nil {
result.NetworkIn = ensureHysteresisThreshold(cloneThreshold(override.NetworkIn))
} else if override.NetworkInLegacy != nil {
result.NetworkIn = m.convertLegacyThreshold(override.NetworkInLegacy)
}
if override.NetworkOut != nil {
result.NetworkOut = ensureHysteresisThreshold(cloneThreshold(override.NetworkOut))
} else if override.NetworkOutLegacy != nil {
result.NetworkOut = m.convertLegacyThreshold(override.NetworkOutLegacy)
}
if override.Temperature != nil {
result.Temperature = ensureHysteresisThreshold(cloneThreshold(override.Temperature))
}
if override.DiskTemperature != nil {
result.DiskTemperature = ensureHysteresisThreshold(cloneThreshold(override.DiskTemperature))
}
if override.Usage != nil {
result.Usage = ensureHysteresisThreshold(cloneThreshold(override.Usage))
}
if override.Note != nil {
note := strings.TrimSpace(*override.Note)
if note == "" {
result.Note = nil
} else {
noteCopy := note
result.Note = &noteCopy
}
}
return result
}
// ensureHysteresisThreshold ensures a threshold has hysteresis configured
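// For example, an override specifying only Trigger: 80 is normalized to
// {Trigger: 80, Clear: 75}.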
func ensureHysteresisThreshold(threshold *HysteresisThreshold) *HysteresisThreshold {
if threshold == nil {
return nil
}
if threshold.Clear <= 0 {
threshold.Clear = threshold.Trigger - 5.0 // Default 5% margin
}
return threshold
}
type pulseTagSettings struct {
Suppress bool
MonitorOnly bool
Relaxed bool
}
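// parsePulseTags extracts Pulse-specific behavior flags from a guest's tag list.
// Matching is case-insensitive, e.g. a guest tagged "Pulse-No-Alerts" yields
// pulseTagSettings{Suppress: true}.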
func parsePulseTags(tags []string) pulseTagSettings {
settings := pulseTagSettings{}
for _, raw := range tags {
tag := strings.TrimSpace(strings.ToLower(raw))
switch tag {
case "pulse-no-alerts":
settings.Suppress = true
case "pulse-monitor-only":
settings.MonitorOnly = true
case "pulse-relaxed":
settings.Relaxed = true
}
}
return settings
}
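// applyRelaxedGuestThresholds returns a copy of cfg with CPU, memory, and disk
// triggers raised to at least 95%, 92%, and 95% respectively. For example, a CPU
// threshold of 80% trigger / 75% clear becomes a 95% trigger with the existing
// 75% clear preserved, since that clear value is already below the new trigger.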
func applyRelaxedGuestThresholds(cfg ThresholdConfig) ThresholdConfig {
relaxed := cloneThresholdConfig(cfg)
adjust := func(th **HysteresisThreshold, minTrigger float64) {
if *th == nil {
*th = &HysteresisThreshold{Trigger: minTrigger, Clear: minTrigger - 5}
return
}
ensureHysteresisThreshold(*th)
if (*th).Trigger < minTrigger {
(*th).Trigger = minTrigger
}
if (*th).Clear >= (*th).Trigger {
(*th).Clear = (*th).Trigger - 5
}
if (*th).Clear < 0 {
(*th).Clear = 0
}
}
adjust(&relaxed.CPU, 95)
adjust(&relaxed.Memory, 92)
adjust(&relaxed.Disk, 95)
return relaxed
}
func (m *Manager) suppressGuestAlerts(guestID string) bool {
m.mu.Lock()
defer m.mu.Unlock()
cleared := false
for alertID, alert := range m.activeAlerts {
if alert == nil {
continue
}
if alert.ResourceID == guestID || strings.HasPrefix(alert.ResourceID, guestID+"/") || strings.HasPrefix(alertID, guestID) {
m.clearAlertNoLock(alertID)
delete(m.recentAlerts, alertID)
delete(m.pendingAlerts, alertID)
delete(m.suppressedUntil, alertID)
delete(m.alertRateLimit, alertID)
cleared = true
}
}
for key := range m.pendingAlerts {
if strings.HasPrefix(key, guestID) {
delete(m.pendingAlerts, key)
}
}
for key := range m.recentAlerts {
if strings.HasPrefix(key, guestID) {
delete(m.recentAlerts, key)
}
}
for key := range m.suppressedUntil {
if strings.HasPrefix(key, guestID) {
delete(m.suppressedUntil, key)
}
}
for key := range m.alertRateLimit {
if strings.HasPrefix(key, guestID) {
delete(m.alertRateLimit, key)
}
}
delete(m.offlineConfirmations, guestID)
return cleared
}
func (m *Manager) guestHasMonitorOnlyAlerts(guestID string) bool {
m.mu.RLock()
defer m.mu.RUnlock()
for _, alert := range m.activeAlerts {
if alert == nil {
continue
}
if alert.ResourceID != guestID {
continue
}
if isMonitorOnlyAlert(alert) {
return true
}
}
return false
}
// checkRateLimit reports whether another notification is allowed for this alert
// within the hourly limit, recording the attempt when it is. Callers must hold m.mu,
// since this mutates alertRateLimit without locking.
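// For example, with MaxAlertsHour set to 3, the first three notifications for a
// given alert ID within a rolling hour are allowed and the fourth is suppressed.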
func (m *Manager) checkRateLimit(alertID string) bool {
if m.config.Schedule.MaxAlertsHour <= 0 {
return true // No rate limit
}
now := time.Now()
cutoff := now.Add(-1 * time.Hour)
// Clean old entries and count recent alerts
var recentAlerts []time.Time
if times, exists := m.alertRateLimit[alertID]; exists {
for _, t := range times {
if t.After(cutoff) {
recentAlerts = append(recentAlerts, t)
}
}
}
// Check if we've hit the limit
if len(recentAlerts) >= m.config.Schedule.MaxAlertsHour {
return false
}
// Add current time
recentAlerts = append(recentAlerts, now)
m.alertRateLimit[alertID] = recentAlerts
return true
}
// escalationChecker runs periodically to check for alerts that need escalation and cleanup
func (m *Manager) escalationChecker() {
ticker := time.NewTicker(1 * time.Minute)
cleanupTicker := time.NewTicker(10 * time.Minute) // Run cleanup every 10 minutes
defer ticker.Stop()
defer cleanupTicker.Stop()
for {
select {
case <-ticker.C:
m.checkEscalations()
case <-cleanupTicker.C:
m.Cleanup(24 * time.Hour) // Clean up acknowledged alerts older than 24 hours
case <-m.escalationStop:
return
}
}
}
// checkEscalations checks all active alerts for escalation
func (m *Manager) checkEscalations() {
m.mu.Lock()
defer m.mu.Unlock()
// Respect global alert and activation controls before escalating.
// Escalations should never bypass a user disabling alerts.
if !m.config.Enabled || m.config.ActivationState != ActivationActive {
return
}
if !m.config.Schedule.Escalation.Enabled {
return
}
now := time.Now()
for _, alert := range m.activeAlerts {
// Skip acknowledged alerts
if alert.Acknowledged {
continue
}
// Check each escalation level
for i, level := range m.config.Schedule.Escalation.Levels {
// Skip if we've already escalated to this level
if alert.LastEscalation >= i+1 {
continue
}
// Check if it's time to escalate
escalateTime := alert.StartTime.Add(time.Duration(level.After) * time.Minute)
if now.After(escalateTime) {
// Update alert escalation state
alert.LastEscalation = i + 1
alert.EscalationTimes = append(alert.EscalationTimes, now)
log.Info().
Str("alertID", alert.ID).
Int("level", i+1).
Str("notify", level.Notify).
Msg("Alert escalated")
// Trigger escalation callback
m.safeCallEscalateCallback(alert, i+1)
}
}
}
}
// Stop stops the alert manager and saves history
func (m *Manager) Stop() {
close(m.escalationStop)
close(m.cleanupStop)
m.historyManager.Stop()
// Give background goroutines time to exit cleanly
time.Sleep(100 * time.Millisecond)
// Save active alerts before stopping
if err := m.SaveActiveAlerts(); err != nil {
log.Error().Err(err).Msg("Failed to save active alerts on stop")
}
}
// SaveActiveAlerts persists active alerts to disk
func (m *Manager) SaveActiveAlerts() error {
m.mu.RLock()
defer m.mu.RUnlock()
// Create directory if it doesn't exist
alertsDir := filepath.Join(utils.GetDataDir(), "alerts")
if err := os.MkdirAll(alertsDir, 0755); err != nil {
return fmt.Errorf("failed to create alerts directory: %w", err)
}
// Convert map to slice for JSON encoding
alerts := make([]*Alert, 0, len(m.activeAlerts))
for _, alert := range m.activeAlerts {
alerts = append(alerts, alert)
}
data, err := json.Marshal(alerts)
if err != nil {
return fmt.Errorf("failed to marshal active alerts: %w", err)
}
// Write to temporary file first, then rename (atomic operation)
// Use a unique temp file to avoid race conditions between concurrent saves (e.g., periodic vs shutdown)
tmpFile, err := os.CreateTemp(alertsDir, "active-alerts-*.json.tmp")
if err != nil {
return fmt.Errorf("failed to create temp file: %w", err)
}
tmpName := tmpFile.Name()
// Ensure cleanup of temp file in case of failure
defer os.Remove(tmpName)
if _, err := tmpFile.Write(data); err != nil {
tmpFile.Close()
return fmt.Errorf("failed to write active alerts: %w", err)
}
if err := tmpFile.Close(); err != nil {
return fmt.Errorf("failed to close temp file: %w", err)
}
finalFile := filepath.Join(alertsDir, "active-alerts.json")
if err := os.Rename(tmpName, finalFile); err != nil {
return fmt.Errorf("failed to rename active alerts file: %w", err)
}
log.Debug().Int("count", len(alerts)).Msg("Saved active alerts to disk")
return nil
}
func (m *Manager) saveActiveAlertsAsync(context string) {
go func() {
defer func() {
if r := recover(); r != nil {
log.Error().
Interface("panic", r).
Str("context", context).
Msg("Panic in SaveActiveAlerts goroutine")
}
}()
if err := m.SaveActiveAlerts(); err != nil {
log.Error().
Err(err).
Str("context", context).
Msg("Failed to save active alerts")
}
}()
}
// LoadActiveAlerts restores active alerts from disk
func (m *Manager) LoadActiveAlerts() error {
m.mu.Lock()
defer m.mu.Unlock()
alertsFile := filepath.Join(utils.GetDataDir(), "alerts", "active-alerts.json")
data, err := os.ReadFile(alertsFile)
if err != nil {
if os.IsNotExist(err) {
log.Info().Msg("No active alerts file found, starting fresh")
return nil
}
return fmt.Errorf("failed to read active alerts: %w", err)
}
var alerts []*Alert
if err := json.Unmarshal(data, &alerts); err != nil {
return fmt.Errorf("failed to unmarshal active alerts: %w", err)
}
// Restore alerts to the map with deduplication
now := time.Now()
restoredCount := 0
duplicateCount := 0
seen := make(map[string]bool)
for _, alert := range alerts {
// Migrate legacy guest alert IDs to the canonical guest format.
// Check if this is a guest-related alert by looking at common alert types
isGuestAlert := strings.Contains(alert.Type, "cpu") || strings.Contains(alert.Type, "memory") ||
strings.Contains(alert.Type, "disk") || strings.Contains(alert.Type, "network") ||
alert.Type == "guest-offline"
if isGuestAlert {
parts := strings.Split(alert.ResourceID, "-")
if alert.Node != "" && len(parts) >= 2 {
var newResourceID string
oldResourceID := alert.ResourceID
// Try to extract VMID (should be last part)
vmidStr := parts[len(parts)-1]
if vmid, err := strconv.Atoi(vmidStr); err == nil {
newResourceID = BuildGuestKey(alert.Instance, alert.Node, vmid)
if newResourceID != "" && newResourceID != oldResourceID {
log.Info().
Str("oldID", oldResourceID).
Str("newID", newResourceID).
Str("alertType", alert.Type).
Msg("Migrating active alert from legacy guest ID format")
// Update resource ID
alert.ResourceID = newResourceID
// Update alert ID (usually contains resource ID)
alert.ID = strings.Replace(alert.ID, oldResourceID, newResourceID, 1)
}
}
}
}
// Skip duplicates
if seen[alert.ID] {
duplicateCount++
log.Warn().Str("alertID", alert.ID).Msg("Skipping duplicate alert during restore")
continue
}
seen[alert.ID] = true
// Skip very old alerts (older than 24 hours)
if now.Sub(alert.StartTime) > 24*time.Hour {
log.Debug().Str("alertID", alert.ID).Msg("Skipping old alert during restore")
continue
}
// Skip acknowledged alerts older than 1 hour from activeAlerts,
// but still preserve the ackState so if the same alert reappears
// (e.g., backup-age alerts) it won't retrigger notifications.
if alert.Acknowledged && alert.AckTime != nil && now.Sub(*alert.AckTime) > time.Hour {
log.Debug().Str("alertID", alert.ID).Msg("Skipping old acknowledged alert from activeAlerts but preserving ackState")
ackTime := alert.StartTime
if alert.AckTime != nil {
ackTime = *alert.AckTime
}
m.ackState[alert.ID] = ackRecord{
acknowledged: true,
user: alert.AckUser,
time: ackTime,
}
continue
}
m.activeAlerts[alert.ID] = alert
if alert.Acknowledged {
ackTime := alert.StartTime
if alert.AckTime != nil {
ackTime = *alert.AckTime
}
m.ackState[alert.ID] = ackRecord{
acknowledged: true,
user: alert.AckUser,
time: ackTime,
}
}
restoredCount++
// For critical alerts that are still active after restart, send notifications
// This ensures users are notified about ongoing critical issues even after service restarts
// Only notify for alerts that started recently (within last 2 hours) to avoid spam
if alert.Level == AlertLevelCritical && now.Sub(alert.StartTime) < 2*time.Hour {
// Use a goroutine and add a small delay to avoid notification spam on startup
alertCopy := alert.Clone()
go func(a *Alert) {
// Wait for system to stabilize or cancellation
select {
case <-time.After(10 * time.Second):
log.Info().
Str("alertID", a.ID).
Str("resource", a.ResourceName).
Msg("Attempting to send notification for restored critical alert")
// Acquire lock before calling dispatchAlert (it accesses maps)
m.mu.Lock()
m.dispatchAlert(a, false) // Use dispatchAlert to respect activation state and quiet hours
m.mu.Unlock()
case <-m.escalationStop:
log.Debug().
Str("alertID", a.ID).
Msg("Cancelled startup notification due to shutdown")
return
}
}(alertCopy)
}
}
log.Info().
Int("restored", restoredCount).
Int("total", len(alerts)).
Int("duplicates", duplicateCount).
Msg("Restored active alerts from disk")
return nil
}
// CleanupAlertsForNodes removes alerts for nodes that no longer exist
func (m *Manager) CleanupAlertsForNodes(existingNodes map[string]bool) {
m.mu.Lock()
defer m.mu.Unlock()
log.Debug().
Int("totalAlerts", len(m.activeAlerts)).
Int("existingNodes", len(existingNodes)).
Interface("nodes", existingNodes).
Msg("Starting alert cleanup for non-existent nodes")
removedCount := 0
for alertID, alert := range m.activeAlerts {
if alert == nil {
continue
}
// Skip alerts that are not tied to Proxmox nodes. Docker and PBS resources use
// synthetic node identifiers that won't appear in the Proxmox node list, so we
// must preserve their alerts here.
if strings.HasPrefix(alertID, "docker-") || strings.HasPrefix(alert.ResourceID, "docker:") {
continue
}
if strings.HasPrefix(alertID, "pbs-") || alert.Type == "pbs-offline" {
continue
}
// Use the Node field from the alert itself, which is more reliable
node := alert.Node
// If we couldn't get a node or the node doesn't exist, remove the alert
if node == "" || !existingNodes[node] {
m.removeActiveAlertNoLock(alertID)
removedCount++
log.Debug().Str("alertID", alertID).Str("node", node).Msg("Removed alert for non-existent node")
}
}
if removedCount > 0 {
log.Debug().Int("removed", removedCount).Int("remaining", len(m.activeAlerts)).Msg("Cleaned up alerts for non-existent nodes")
// Save the cleaned up state
go func() {
defer func() {
if r := recover(); r != nil {
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (cleanup)")
}
}()
if err := m.SaveActiveAlerts(); err != nil {
log.Error().Err(err).Msg("Failed to save alerts after cleanup")
}
}()
} else {
log.Info().Msg("No alerts needed cleanup")
}
}
// ClearActiveAlerts removes all active and pending alerts, resetting the manager state.
func (m *Manager) ClearActiveAlerts() {
m.mu.Lock()
if len(m.activeAlerts) == 0 && len(m.pendingAlerts) == 0 {
m.mu.Unlock()
return
}
m.activeAlerts = make(map[string]*Alert)
m.pendingAlerts = make(map[string]time.Time)
m.recentAlerts = make(map[string]*Alert)
m.suppressedUntil = make(map[string]time.Time)
m.alertRateLimit = make(map[string][]time.Time)
m.nodeOfflineCount = make(map[string]int)
m.offlineConfirmations = make(map[string]int)
m.dockerOfflineCount = make(map[string]int)
m.dockerStateConfirm = make(map[string]int)
m.dockerRestartTracking = make(map[string]*dockerRestartRecord)
m.dockerLastExitCode = make(map[string]int)
m.dockerUpdateFirstSeen = make(map[string]time.Time)
m.dockerUpdateFirstSeenByIdentity = make(map[string]time.Time)
m.ackState = make(map[string]ackRecord)
m.mu.Unlock()
m.resolvedMutex.Lock()
m.recentlyResolved = make(map[string]*ResolvedAlert)
m.resolvedMutex.Unlock()
log.Info().Msg("Cleared all active and pending alerts")
go func() {
defer func() {
if r := recover(); r != nil {
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (clear)")
}
}()
if err := m.SaveActiveAlerts(); err != nil {
log.Error().Err(err).Msg("Failed to persist cleared alerts")
}
}()
}
// periodicSaveAlerts saves active alerts to disk periodically
func (m *Manager) periodicSaveAlerts() {
ticker := time.NewTicker(1 * time.Minute)
defer ticker.Stop()
for {
select {
case <-ticker.C:
if err := m.SaveActiveAlerts(); err != nil {
log.Error().Err(err).Msg("Failed to save active alerts during periodic save")
}
case <-m.escalationStop:
return
}
}
}
// trackingMapCleanup periodically cleans up stale entries from tracking maps
// to prevent unbounded memory growth from deleted/decommissioned resources.
func (m *Manager) trackingMapCleanup() {
// Run cleanup every hour
ticker := time.NewTicker(1 * time.Hour)
defer ticker.Stop()
for {
select {
case <-ticker.C:
m.cleanupStaleMaps()
case <-m.cleanupStop:
return
}
}
}
// cleanupStaleMaps removes stale entries from tracking maps.
// Entries are considered stale if they haven't been updated in 24 hours
// and don't correspond to any active alert.
func (m *Manager) cleanupStaleMaps() {
m.mu.Lock()
defer m.mu.Unlock()
now := time.Now()
staleThreshold := StaleTrackingThreshold
cleaned := 0
// Clean up flapping history for resources without active alerts
for alertID, history := range m.flappingHistory {
if _, hasAlert := m.activeAlerts[alertID]; !hasAlert {
// Check if history is stale (last entry older than threshold)
if len(history) == 0 || now.Sub(history[len(history)-1]) > staleThreshold {
delete(m.flappingHistory, alertID)
delete(m.flappingActive, alertID)
cleaned++
}
}
}
// Clean up suppressedUntil entries that have expired
for alertID, suppressUntil := range m.suppressedUntil {
if now.After(suppressUntil) {
delete(m.suppressedUntil, alertID)
cleaned++
}
}
// Clean up pending alerts older than threshold without active alerts
for alertID, pendingTime := range m.pendingAlerts {
if _, hasAlert := m.activeAlerts[alertID]; !hasAlert {
if now.Sub(pendingTime) > staleThreshold {
delete(m.pendingAlerts, alertID)
cleaned++
}
}
}
// Clean up offline confirmation counts for resources without active alerts
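	// Alert IDs embed the resource ID, so a substring match is used to detect a related alert.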
for resourceID := range m.offlineConfirmations {
hasRelatedAlert := false
for alertID := range m.activeAlerts {
if strings.Contains(alertID, resourceID) {
hasRelatedAlert = true
break
}
}
if !hasRelatedAlert {
delete(m.offlineConfirmations, resourceID)
cleaned++
}
}
// Clean up node offline counts (legacy)
for nodeID := range m.nodeOfflineCount {
hasRelatedAlert := false
for alertID := range m.activeAlerts {
if strings.Contains(alertID, nodeID) {
hasRelatedAlert = true
break
}
}
if !hasRelatedAlert {
delete(m.nodeOfflineCount, nodeID)
cleaned++
}
}
// Clean up Docker tracking maps
for containerID := range m.dockerStateConfirm {
hasRelatedAlert := false
for alertID := range m.activeAlerts {
if strings.Contains(alertID, containerID) {
hasRelatedAlert = true
break
}
}
if !hasRelatedAlert {
delete(m.dockerStateConfirm, containerID)
cleaned++
}
}
for hostID := range m.dockerOfflineCount {
hasRelatedAlert := false
for alertID := range m.activeAlerts {
if strings.Contains(alertID, hostID) {
hasRelatedAlert = true
break
}
}
if !hasRelatedAlert {
delete(m.dockerOfflineCount, hostID)
cleaned++
}
}
// Clean up Docker restart tracking for stale containers
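	// The last-exit-code entry shares the container key, so it is dropped alongside the restart record.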
for containerID, record := range m.dockerRestartTracking {
if record != nil && now.Sub(record.lastChecked) > staleThreshold {
delete(m.dockerRestartTracking, containerID)
delete(m.dockerLastExitCode, containerID)
cleaned++
}
}
// Clean up Docker update tracking for stale entries
for containerID, firstSeen := range m.dockerUpdateFirstSeen {
if now.Sub(firstSeen) > staleThreshold {
delete(m.dockerUpdateFirstSeen, containerID)
cleaned++
}
}
for containerID, firstSeen := range m.dockerUpdateFirstSeenByIdentity {
if now.Sub(firstSeen) > staleThreshold {
delete(m.dockerUpdateFirstSeenByIdentity, containerID)
cleaned++
}
}
// Clean up rate limit entries older than 1 hour
rateLimitThreshold := RateLimitCleanupWindow
for resourceID, times := range m.alertRateLimit {
// Filter to keep only recent entries
var recent []time.Time
for _, t := range times {
if now.Sub(t) < rateLimitThreshold {
recent = append(recent, t)
}
}
if len(recent) == 0 {
delete(m.alertRateLimit, resourceID)
cleaned++
} else if len(recent) < len(times) {
m.alertRateLimit[resourceID] = recent
}
}
// Clean up recent alerts older than suppression window
suppressWindow := time.Duration(m.config.SuppressionWindow) * time.Minute
if suppressWindow <= 0 {
suppressWindow = 5 * time.Minute
}
for alertID, alert := range m.recentAlerts {
if now.Sub(alert.LastSeen) > suppressWindow {
delete(m.recentAlerts, alertID)
cleaned++
}
}
// Clean up ackState for alerts that no longer exist and are older than threshold
for alertID, record := range m.ackState {
if _, hasAlert := m.activeAlerts[alertID]; !hasAlert {
// Use inactiveAt (when alert was removed) for TTL, not ack time
checkTime := record.inactiveAt
if checkTime.IsZero() {
checkTime = record.time
}
if now.Sub(checkTime) > staleThreshold {
delete(m.ackState, alertID)
cleaned++
}
}
}
// Auto-resolve stale alerts - alerts where the resource hasn't been polled in 24 hours.
// This handles cases where a resource (e.g., Docker container, storage) stops being
// monitored but its alert remains active. Without this, alerts would persist indefinitely.
staleAlerts := make([]string, 0)
for alertID, alert := range m.activeAlerts {
if alert != nil && now.Sub(alert.LastSeen) > staleThreshold {
staleAlerts = append(staleAlerts, alertID)
}
}
staleResolved := 0
for _, alertID := range staleAlerts {
alert := m.activeAlerts[alertID]
log.Info().
Str("alertID", alertID).
Str("resourceName", alert.ResourceName).
Time("lastSeen", alert.LastSeen).
Dur("staleFor", now.Sub(alert.LastSeen)).
Msg("Auto-resolving stale alert - resource no longer being monitored")
m.clearAlertNoLock(alertID)
cleaned++
staleResolved++
}
// Persist changes if we resolved any stale alerts
if staleResolved > 0 {
go func() {
defer func() {
if r := recover(); r != nil {
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (stale cleanup)")
}
}()
if err := m.SaveActiveAlerts(); err != nil {
log.Error().Err(err).Msg("Failed to save active alerts after stale cleanup")
}
}()
log.Info().
Int("count", staleResolved).
Msg("Auto-resolved stale alerts")
}
if cleaned > 0 {
log.Debug().
Int("entriesCleaned", cleaned).
Msg("Cleaned stale entries from alert tracking maps")
}
}
// hasKnownFirmwareBug checks if a disk model is known to have firmware bugs that cause
// false health status reports. These drives may report FAILED or other error states
// due to firmware issues (e.g., incorrect temperature thresholds) even when the drive
// is actually healthy. This prevents false alerts while still monitoring wearout.
//
// Related to GitHub issue #547: Samsung 980/990 SSDs report false health failures
func hasKnownFirmwareBug(model string) bool {
normalizedModel := strings.ToUpper(strings.TrimSpace(model))
// Samsung 980/990 series drives have known firmware bugs causing false health reports
// These drives report incorrect health status due to temperature threshold bugs
// even when functioning normally. Users should update firmware to latest version.
knownProblematicModels := []string{
"SAMSUNG SSD 980",
"SAMSUNG 980",
"SAMSUNG SSD 990",
"SAMSUNG 990",
}
for _, problematic := range knownProblematicModels {
if strings.Contains(normalizedModel, problematic) {
return true
}
}
return false
}
// CheckDiskHealth checks disk health and creates alerts if needed
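// Two independent checks run here: SMART health status (skipped for drives with known firmware bugs)
// and SSD wearout.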
func (m *Manager) CheckDiskHealth(instance, node string, disk proxmox.Disk) {
// Create unique alert ID for this disk
alertID := fmt.Sprintf("disk-health-%s-%s-%s", instance, node, disk.DevPath)
m.mu.Lock()
defer m.mu.Unlock()
// Check if disk health is not PASSED
normalizedHealth := strings.ToUpper(strings.TrimSpace(disk.Health))
healthCheckNeeded := normalizedHealth != "" && normalizedHealth != "UNKNOWN" && normalizedHealth != "PASSED" && normalizedHealth != "OK"
// Skip health alerts for drives with known firmware bugs that cause false reports
// These drives may report FAILED status due to firmware issues even when healthy
// We still monitor wearout below, which is more reliable for these drives
if healthCheckNeeded && hasKnownFirmwareBug(disk.Model) {
log.Debug().
Str("node", node).
Str("disk", disk.DevPath).
Str("model", disk.Model).
Str("health", disk.Health).
Msg("Skipping health alert for drive with known firmware bug - health status unreliable")
// Clear any existing health alert since we now recognize this is a false positive
m.clearAlertNoLock(alertID)
healthCheckNeeded = false // Skip to wearout check
}
if healthCheckNeeded {
// Check if alert already exists
if _, exists := m.activeAlerts[alertID]; !exists {
// Create new health alert
alert := &Alert{
ID: alertID,
Type: "disk-health",
Level: AlertLevelCritical,
ResourceID: fmt.Sprintf("%s-%s", node, disk.DevPath),
ResourceName: fmt.Sprintf("%s (%s)", disk.Model, disk.DevPath),
Node: node,
Instance: instance,
Message: fmt.Sprintf("Disk health check failed: %s", disk.Health),
Value: 0, // Not applicable for health status
Threshold: 0,
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"disk_path": disk.DevPath,
"disk_model": disk.Model,
"disk_serial": disk.Serial,
"disk_type": disk.Type,
"disk_health": disk.Health,
"disk_size": disk.Size,
},
}
m.preserveAlertState(alertID, alert)
m.activeAlerts[alertID] = alert
m.recentAlerts[alertID] = alert
m.historyManager.AddAlert(*alert)
m.dispatchAlert(alert, false)
log.Error().
Str("node", node).
Str("disk", disk.DevPath).
Str("model", disk.Model).
Str("health", disk.Health).
Msg("Disk health alert created")
}
} else {
// Disk is healthy, clear alert if it exists
m.clearAlertNoLock(alertID)
}
// Check for low wearout (SSD life remaining)
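	// A wearout value of zero is treated as unknown: it neither raises nor clears the wearout alert.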
if disk.Wearout > 0 && disk.Wearout < 10 {
wearoutAlertID := fmt.Sprintf("disk-wearout-%s-%s-%s", instance, node, disk.DevPath)
message := fmt.Sprintf("SSD has less than 10%% life remaining (%d%% wearout)", disk.Wearout)
resourceID := fmt.Sprintf("%s-%s", node, disk.DevPath)
resourceName := fmt.Sprintf("%s (%s)", disk.Model, disk.DevPath)
if existing, exists := m.activeAlerts[wearoutAlertID]; exists {
// Refresh details so legacy alerts pick up updated wording and metadata
existing.LastSeen = time.Now()
existing.Value = float64(disk.Wearout)
existing.Message = message
existing.ResourceID = resourceID
existing.ResourceName = resourceName
existing.Node = node
			existing.Instance = instance
			existing.NodeDisplayName = m.resolveNodeDisplayName(instance, node)
if existing.Metadata == nil {
existing.Metadata = map[string]interface{}{}
}
existing.Metadata["disk_path"] = disk.DevPath
existing.Metadata["disk_model"] = disk.Model
existing.Metadata["disk_serial"] = disk.Serial
existing.Metadata["disk_type"] = disk.Type
existing.Metadata["disk_wearout"] = disk.Wearout
delete(existing.Metadata, "disk_wearout_used")
} else {
// Create wearout alert
alert := &Alert{
ID: wearoutAlertID,
Type: "disk-wearout",
Level: AlertLevelWarning,
ResourceID: resourceID,
ResourceName: resourceName,
Node: node,
Instance: instance,
Message: message,
Value: float64(disk.Wearout),
Threshold: 10.0,
StartTime: time.Now(),
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"disk_path": disk.DevPath,
"disk_model": disk.Model,
"disk_serial": disk.Serial,
"disk_type": disk.Type,
"disk_wearout": disk.Wearout,
},
}
m.preserveAlertState(wearoutAlertID, alert)
m.activeAlerts[wearoutAlertID] = alert
m.recentAlerts[wearoutAlertID] = alert
m.historyManager.AddAlert(*alert)
m.dispatchAlert(alert, false)
log.Warn().
Str("node", node).
Str("disk", disk.DevPath).
Str("model", disk.Model).
Int("wearout", disk.Wearout).
Msg("Disk wearout alert created")
}
} else if disk.Wearout >= 10 {
// Wearout is acceptable, clear alert if it exists
wearoutAlertID := fmt.Sprintf("disk-wearout-%s-%s-%s", instance, node, disk.DevPath)
m.clearAlertNoLock(wearoutAlertID)
}
}
// clearAlertNoLock clears an alert without locking (must be called with lock held)
func (m *Manager) clearAlertNoLock(alertID string) {
alert, exists := m.activeAlerts[alertID]
if !exists {
return
}
// Record metric for resolved alert
if recordAlertResolved != nil {
recordAlertResolved(alert)
}
m.removeActiveAlertNoLock(alertID)
resolvedAlert := &ResolvedAlert{
Alert: alert,
ResolvedTime: time.Now(),
}
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
	m.safeCallResolvedCallback(alertID, true) // run asynchronously so the callback cannot deadlock against m.mu, which is held here
log.Info().
Str("alertID", alertID).
Msg("Alert cleared")
}
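// clearSnapshotAlertsForInstance removes all snapshot-age alerts for the given instance
// (or for every instance when instance is empty), acquiring the manager lock.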
func (m *Manager) clearSnapshotAlertsForInstance(instance string) {
m.mu.Lock()
m.clearSnapshotAlertsForInstanceLocked(instance)
m.mu.Unlock()
}
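// clearSnapshotAlertsForInstanceLocked does the same work as clearSnapshotAlertsForInstance;
// callers must already hold m.mu.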
func (m *Manager) clearSnapshotAlertsForInstanceLocked(instance string) {
for alertID, alert := range m.activeAlerts {
if alert == nil || alert.Type != "snapshot-age" {
continue
}
if instance != "" && alert.Instance != instance {
continue
}
m.clearAlertNoLock(alertID)
}
}
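// clearBackupAlerts removes all backup-age and backup-orphaned alerts, acquiring the manager lock.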
func (m *Manager) clearBackupAlerts() {
m.mu.Lock()
m.clearBackupAlertsLocked()
m.mu.Unlock()
}
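// clearBackupAlertsLocked does the same work as clearBackupAlerts; callers must already hold m.mu.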
func (m *Manager) clearBackupAlertsLocked() {
for alertID, alert := range m.activeAlerts {
if alert == nil || (alert.Type != "backup-age" && alert.Type != "backup-orphaned") {
continue
}
m.clearAlertNoLock(alertID)
}
}