// Mirror of https://github.com/rcourtman/Pulse.git
// Synced 2026-04-28 11:30:15 +00:00 (10415 lines, 318 KiB, Go).

package alerts
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/models"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/utils"
|
|
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// AlertLevel represents the severity of an alert.
type AlertLevel string

// Supported alert severities.
const (
	// AlertLevelWarning is the lower of the two severities.
	AlertLevelWarning AlertLevel = "warning"
	// AlertLevelCritical is the higher of the two severities.
	AlertLevelCritical AlertLevel = "critical"
)
|
|
|
|
// ActivationState represents the alert notification activation state.
type ActivationState string

// Supported activation states.
const (
	// ActivationPending is the "pending_review" state.
	ActivationPending ActivationState = "pending_review"
	// ActivationActive is the "active" state.
	ActivationActive ActivationState = "active"
	// ActivationSnoozed is the "snoozed" state.
	ActivationSnoozed ActivationState = "snoozed"
)
|
|
|
|
// Default thresholds and configuration values.
const (
	// Default threshold values. Each metric has a trigger value and a lower
	// clear value.
	DefaultCPUTrigger     = 80.0
	DefaultCPUClear       = 75.0
	DefaultMemoryTrigger  = 85.0
	DefaultMemoryClear    = 80.0
	DefaultDiskTrigger    = 90.0
	DefaultDiskClear      = 85.0
	DefaultStorageTrigger = 85.0
	DefaultStorageClear   = 80.0
	DefaultTempTrigger    = 80.0
	DefaultTempClear      = 75.0

	// Time thresholds
	DefaultDelaySeconds      = 5
	DefaultSuppressionWindow = 5 // minutes

	// Alert management
	DefaultMinimumDelta      = 2.0 // minimum % change to trigger new alert
	DefaultHysteresisMargin  = 5.0 // % margin between trigger and clear
	DefaultObservationWindow = 24  // hours

	// Flapping detection
	DefaultFlappingWindow    = 300 // seconds (5 minutes)
	DefaultFlappingThreshold = 5   // state changes to trigger flapping
	DefaultFlappingCooldown  = 15  // minutes

	// Confirmation counts for transient state detection
	RequiredOfflineConfirmations = 3
	RequiredStateConfirmations   = 2

	// Cleanup intervals
	StaleTrackingThreshold = 24 * time.Hour
	RateLimitCleanupWindow = 1 * time.Hour
)
|
|
|
|
func normalizePoweredOffSeverity(level AlertLevel) AlertLevel {
|
|
switch strings.ToLower(string(level)) {
|
|
case string(AlertLevelCritical):
|
|
return AlertLevelCritical
|
|
default:
|
|
return AlertLevelWarning
|
|
}
|
|
}
|
|
|
|
// Alert represents an active alert
|
|
type Alert struct {
|
|
ID string `json:"id"`
|
|
Type string `json:"type"` // cpu, memory, disk, etc.
|
|
Level AlertLevel `json:"level"`
|
|
ResourceID string `json:"resourceId"` // guest or node ID
|
|
ResourceName string `json:"resourceName"`
|
|
Node string `json:"node"`
|
|
NodeDisplayName string `json:"nodeDisplayName,omitempty"`
|
|
Instance string `json:"instance"`
|
|
Message string `json:"message"`
|
|
Value float64 `json:"value"`
|
|
Threshold float64 `json:"threshold"`
|
|
StartTime time.Time `json:"startTime"`
|
|
LastSeen time.Time `json:"lastSeen"`
|
|
Acknowledged bool `json:"acknowledged"`
|
|
AckTime *time.Time `json:"ackTime,omitempty"`
|
|
AckUser string `json:"ackUser,omitempty"`
|
|
Metadata map[string]interface{} `json:"metadata,omitempty"`
|
|
// Notification tracking
|
|
LastNotified *time.Time `json:"lastNotified,omitempty"` // Last time notification was sent
|
|
// Escalation tracking
|
|
LastEscalation int `json:"lastEscalation,omitempty"` // Last escalation level notified
|
|
EscalationTimes []time.Time `json:"escalationTimes,omitempty"` // Times when escalations were sent
|
|
}
|
|
|
|
// Clone returns a deep copy of the alert so it can be safely shared across goroutines.
|
|
func (a *Alert) Clone() *Alert {
|
|
if a == nil {
|
|
return nil
|
|
}
|
|
|
|
clone := *a
|
|
|
|
if a.AckTime != nil {
|
|
t := *a.AckTime
|
|
clone.AckTime = &t
|
|
}
|
|
|
|
if a.LastNotified != nil {
|
|
t := *a.LastNotified
|
|
clone.LastNotified = &t
|
|
}
|
|
|
|
if len(a.EscalationTimes) > 0 {
|
|
clone.EscalationTimes = append([]time.Time(nil), a.EscalationTimes...)
|
|
}
|
|
|
|
if a.Metadata != nil {
|
|
clone.Metadata = cloneMetadata(a.Metadata)
|
|
}
|
|
|
|
return &clone
|
|
}
|
|
|
|
// cloneMetadata returns a copy of src in which every value has been passed
// through cloneMetadataValue. A nil map yields nil; an empty map yields a new
// empty map.
func cloneMetadata(src map[string]interface{}) map[string]interface{} {
	if src == nil {
		return nil
	}

	out := make(map[string]interface{}, len(src))
	for key, value := range src {
		out[key] = cloneMetadataValue(value)
	}
	return out
}

// cloneMetadataValue copies a single metadata value.
//
// Maps of type map[string]interface{} and slices of type []interface{} are
// copied recursively. []string, []int and []float64 are copied element-wise
// into a new slice of the same type. Any other value (scalars, and container
// types not listed here) is returned unchanged, so such containers remain
// shared with the source.
func cloneMetadataValue(val interface{}) interface{} {
	switch typed := val.(type) {
	case map[string]interface{}:
		return cloneMetadata(typed)
	case map[string]string:
		// NOTE(review): the copy is returned as map[string]interface{}, not
		// map[string]string, so the dynamic type changes on clone. Preserved
		// as-is; confirm callers rely on (or at least tolerate) this.
		converted := make(map[string]interface{}, len(typed))
		for key, value := range typed {
			converted[key] = value
		}
		return converted
	case []interface{}:
		items := make([]interface{}, len(typed))
		for idx, item := range typed {
			items[idx] = cloneMetadataValue(item)
		}
		return items
	case []string:
		items := make([]string, len(typed))
		copy(items, typed)
		return items
	case []int:
		items := make([]int, len(typed))
		copy(items, typed)
		return items
	case []float64:
		items := make([]float64, len(typed))
		copy(items, typed)
		return items
	default:
		return typed
	}
}
|
|
|
|
// ResolvedAlert represents a recently resolved alert
|
|
type ResolvedAlert struct {
|
|
*Alert
|
|
ResolvedTime time.Time `json:"resolvedTime"`
|
|
}
|
|
|
|
// HysteresisThreshold represents a threshold with hysteresis: separate values
// for raising and for clearing an alert.
type HysteresisThreshold struct {
	Trigger float64 `json:"trigger"` // Threshold to trigger alert
	Clear   float64 `json:"clear"`   // Threshold to clear alert
}
|
|
|
|
// ThresholdConfig represents threshold configuration
|
|
type ThresholdConfig struct {
|
|
Disabled bool `json:"disabled,omitempty"` // Completely disable alerts for this guest
|
|
DisableConnectivity bool `json:"disableConnectivity,omitempty"` // Disable node offline/connectivity/powered-off alerts
|
|
PoweredOffSeverity AlertLevel `json:"poweredOffSeverity,omitempty"` // Severity for powered-off alerts
|
|
CPU *HysteresisThreshold `json:"cpu,omitempty"`
|
|
Memory *HysteresisThreshold `json:"memory,omitempty"`
|
|
Disk *HysteresisThreshold `json:"disk,omitempty"`
|
|
DiskRead *HysteresisThreshold `json:"diskRead,omitempty"`
|
|
DiskWrite *HysteresisThreshold `json:"diskWrite,omitempty"`
|
|
NetworkIn *HysteresisThreshold `json:"networkIn,omitempty"`
|
|
NetworkOut *HysteresisThreshold `json:"networkOut,omitempty"`
|
|
Usage *HysteresisThreshold `json:"usage,omitempty"` // For storage devices
|
|
Temperature *HysteresisThreshold `json:"temperature,omitempty"` // For node CPU temperature
|
|
DiskTemperature *HysteresisThreshold `json:"diskTemperature,omitempty"` // For host SMART temperatures
|
|
Backup *BackupAlertConfig `json:"backup,omitempty"`
|
|
Snapshot *SnapshotAlertConfig `json:"snapshot,omitempty"`
|
|
Note *string `json:"note,omitempty"`
|
|
// Legacy thresholds for backwards compatibility
|
|
CPULegacy *float64 `json:"cpuLegacy,omitempty"`
|
|
MemoryLegacy *float64 `json:"memoryLegacy,omitempty"`
|
|
DiskLegacy *float64 `json:"diskLegacy,omitempty"`
|
|
DiskReadLegacy *float64 `json:"diskReadLegacy,omitempty"`
|
|
DiskWriteLegacy *float64 `json:"diskWriteLegacy,omitempty"`
|
|
NetworkInLegacy *float64 `json:"networkInLegacy,omitempty"`
|
|
NetworkOutLegacy *float64 `json:"networkOutLegacy,omitempty"`
|
|
}
|
|
|
|
// QuietHours represents quiet hours configuration
|
|
type QuietHours struct {
|
|
Enabled bool `json:"enabled"`
|
|
Start string `json:"start"` // 24-hour format "HH:MM"
|
|
End string `json:"end"` // 24-hour format "HH:MM"
|
|
Timezone string `json:"timezone"`
|
|
Days map[string]bool `json:"days"` // monday, tuesday, etc.
|
|
Suppress QuietHoursSuppression `json:"suppress"`
|
|
}
|
|
|
|
// QuietHoursSuppression controls which alert categories are silenced during
// quiet hours.
type QuietHoursSuppression struct {
	Performance bool `json:"performance"`
	Storage     bool `json:"storage"`
	Offline     bool `json:"offline"`
}
|
|
|
|
// EscalationLevel represents an escalation rule: how long after the initial
// alert it applies and which channel it notifies.
type EscalationLevel struct {
	After  int    `json:"after"`  // minutes after initial alert
	Notify string `json:"notify"` // "email", "webhook", or "all"
}
|
|
|
|
// EscalationConfig represents alert escalation configuration
|
|
type EscalationConfig struct {
|
|
Enabled bool `json:"enabled"`
|
|
Levels []EscalationLevel `json:"levels"`
|
|
}
|
|
|
|
// GroupingConfig represents alert grouping configuration.
type GroupingConfig struct {
	Enabled bool `json:"enabled"`
	Window  int  `json:"window"`  // seconds
	ByNode  bool `json:"byNode"`  // Group alerts by node
	ByGuest bool `json:"byGuest"` // Group alerts by guest type
}
|
|
|
|
// ScheduleConfig represents alerting schedule configuration
|
|
type ScheduleConfig struct {
|
|
QuietHours QuietHours `json:"quietHours"`
|
|
Cooldown int `json:"cooldown"` // minutes
|
|
GroupingWindow int `json:"groupingWindow,omitempty"` // Deprecated: use Grouping.Window instead. Will be auto-migrated on config update.
|
|
MaxAlertsHour int `json:"maxAlertsHour"` // max alerts per hour per resource
|
|
NotifyOnResolve bool `json:"notifyOnResolve"` // Send notification when alert clears
|
|
Escalation EscalationConfig `json:"escalation"`
|
|
Grouping GroupingConfig `json:"grouping"`
|
|
}
|
|
|
|
// FilterCondition represents a single filter condition. Every field except
// Type is omitted from JSON when empty.
type FilterCondition struct {
	Type     string      `json:"type"` // "metric", "text", or "raw"
	Field    string      `json:"field,omitempty"`
	Operator string      `json:"operator,omitempty"`
	Value    interface{} `json:"value,omitempty"`
	RawText  string      `json:"rawText,omitempty"`
}
|
|
|
|
// FilterStack represents a collection of filters with logical operator
|
|
type FilterStack struct {
|
|
Filters []FilterCondition `json:"filters"`
|
|
LogicalOperator string `json:"logicalOperator"` // "AND" or "OR"
|
|
}
|
|
|
|
// CustomAlertRule represents a custom alert rule with filter conditions
|
|
type CustomAlertRule struct {
|
|
ID string `json:"id"`
|
|
Name string `json:"name"`
|
|
Description string `json:"description,omitempty"`
|
|
FilterConditions FilterStack `json:"filterConditions"`
|
|
Thresholds ThresholdConfig `json:"thresholds"`
|
|
Priority int `json:"priority"`
|
|
Enabled bool `json:"enabled"`
|
|
Notifications struct {
|
|
Email *struct {
|
|
Enabled bool `json:"enabled"`
|
|
Recipients []string `json:"recipients"`
|
|
} `json:"email,omitempty"`
|
|
Webhook *struct {
|
|
Enabled bool `json:"enabled"`
|
|
URL string `json:"url"`
|
|
} `json:"webhook,omitempty"`
|
|
} `json:"notifications"`
|
|
CreatedAt time.Time `json:"createdAt"`
|
|
UpdatedAt time.Time `json:"updatedAt"`
|
|
}
|
|
|
|
// DockerThresholdConfig represents Docker-specific alert thresholds
|
|
type DockerThresholdConfig struct {
|
|
CPU HysteresisThreshold `json:"cpu"` // CPU usage % threshold (default: 80%)
|
|
Memory HysteresisThreshold `json:"memory"` // Memory usage % threshold (default: 85%)
|
|
Disk HysteresisThreshold `json:"disk"` // Writable layer usage % threshold (default: 85%)
|
|
RestartCount int `json:"restartCount"` // Number of restarts to trigger alert (default: 3)
|
|
RestartWindow int `json:"restartWindow"` // Time window in seconds for restart loop detection (default: 300 = 5min)
|
|
MemoryWarnPct int `json:"memoryWarnPct"` // Memory limit % to trigger warning (default: 90)
|
|
MemoryCriticalPct int `json:"memoryCriticalPct"` // Memory limit % to trigger critical (default: 95)
|
|
ServiceWarnGapPct int `json:"serviceWarnGapPercent"` // % of desired tasks missing to trigger warning (default: 10)
|
|
ServiceCritGapPct int `json:"serviceCriticalGapPercent"` // % of desired tasks missing to trigger critical (default: 50)
|
|
StateDisableConnectivity bool `json:"stateDisableConnectivity,omitempty"` // Disable container offline/state alerts globally
|
|
StatePoweredOffSeverity AlertLevel `json:"statePoweredOffSeverity,omitempty"` // Default severity for container state/offline alerts
|
|
UpdateAlertDelayHours int `json:"updateAlertDelayHours,omitempty"` // Hours to wait before alerting on available image updates (default: 24, -1 = disabled)
|
|
}
|
|
|
|
// PMGThresholdConfig represents Proxmox Mail Gateway-specific alert
// thresholds. Each check has a warning and a critical value.
type PMGThresholdConfig struct {
	QueueTotalWarning       int `json:"queueTotalWarning"`       // Total queue depth warning threshold (default: 500)
	QueueTotalCritical      int `json:"queueTotalCritical"`      // Total queue depth critical threshold (default: 1000)
	OldestMessageWarnMins   int `json:"oldestMessageWarnMins"`   // Oldest queued message age warning in minutes (default: 30)
	OldestMessageCritMins   int `json:"oldestMessageCritMins"`   // Oldest queued message age critical in minutes (default: 60)
	DeferredQueueWarn       int `json:"deferredQueueWarn"`       // Deferred queue depth warning (default: 200)
	DeferredQueueCritical   int `json:"deferredQueueCritical"`   // Deferred queue depth critical (default: 500)
	HoldQueueWarn           int `json:"holdQueueWarn"`           // Hold queue depth warning (default: 100)
	HoldQueueCritical       int `json:"holdQueueCritical"`       // Hold queue depth critical (default: 300)
	QuarantineSpamWarn      int `json:"quarantineSpamWarn"`      // Spam quarantine absolute warning (default: 2000)
	QuarantineSpamCritical  int `json:"quarantineSpamCritical"`  // Spam quarantine absolute critical (default: 5000)
	QuarantineVirusWarn     int `json:"quarantineVirusWarn"`     // Virus quarantine absolute warning (default: 2000)
	QuarantineVirusCritical int `json:"quarantineVirusCritical"` // Virus quarantine absolute critical (default: 5000)
	QuarantineGrowthWarnPct int `json:"quarantineGrowthWarnPct"` // Growth % to trigger warning (default: 25)
	QuarantineGrowthWarnMin int `json:"quarantineGrowthWarnMin"` // Minimum message growth for warning (default: 250)
	QuarantineGrowthCritPct int `json:"quarantineGrowthCritPct"` // Growth % to trigger critical (default: 50)
	QuarantineGrowthCritMin int `json:"quarantineGrowthCritMin"` // Minimum message growth for critical (default: 500)
}
|
|
|
|
// SnapshotAlertConfig represents snapshot age alert configuration. The size
// thresholds are omitted from JSON when zero.
type SnapshotAlertConfig struct {
	Enabled         bool    `json:"enabled"`
	WarningDays     int     `json:"warningDays"`
	CriticalDays    int     `json:"criticalDays"`
	WarningSizeGiB  float64 `json:"warningSizeGiB,omitempty"`
	CriticalSizeGiB float64 `json:"criticalSizeGiB,omitempty"`
}
|
|
|
|
// BackupAlertConfig represents backup age alert configuration.
type BackupAlertConfig struct {
	Enabled      bool `json:"enabled"`
	WarningDays  int  `json:"warningDays"`
	CriticalDays int  `json:"criticalDays"`
	// Indicator thresholds for the dashboard (separate from alert thresholds)
	FreshHours int `json:"freshHours"` // Backups newer than this show as green (default: 24)
	StaleHours int `json:"staleHours"` // Backups older than FreshHours but newer than this show as amber (default: 72)
	// Global backup alert filters
	AlertOrphaned *bool    `json:"alertOrphaned,omitempty"` // Alert on backups that do not match a known guest (default: true)
	IgnoreVMIDs   []string `json:"ignoreVMIDs,omitempty"`   // Skip alerts for matching VMIDs (supports prefix*)
}
|
|
|
|
// GuestLookup describes a guest identity used for snapshot/backup evaluations.
type GuestLookup struct {
	ResourceID string
	Name       string
	Instance   string
	Node       string
	Type       string
	VMID       int
}
|
|
|
|
// AlertConfig represents the complete alert configuration
|
|
type AlertConfig struct {
|
|
Enabled bool `json:"enabled"`
|
|
ActivationState ActivationState `json:"activationState,omitempty"`
|
|
ObservationWindowHours int `json:"observationWindowHours,omitempty"`
|
|
ActivationTime *time.Time `json:"activationTime,omitempty"`
|
|
GuestDefaults ThresholdConfig `json:"guestDefaults"`
|
|
NodeDefaults ThresholdConfig `json:"nodeDefaults"`
|
|
HostDefaults ThresholdConfig `json:"hostDefaults"`
|
|
StorageDefault HysteresisThreshold `json:"storageDefault"`
|
|
DockerDefaults DockerThresholdConfig `json:"dockerDefaults"`
|
|
DockerIgnoredContainerPrefixes []string `json:"dockerIgnoredContainerPrefixes,omitempty"`
|
|
IgnoredGuestPrefixes []string `json:"ignoredGuestPrefixes,omitempty"`
|
|
GuestTagWhitelist []string `json:"guestTagWhitelist,omitempty"`
|
|
GuestTagBlacklist []string `json:"guestTagBlacklist,omitempty"`
|
|
PMGDefaults PMGThresholdConfig `json:"pmgDefaults"`
|
|
PBSDefaults ThresholdConfig `json:"pbsDefaults"`
|
|
SnapshotDefaults SnapshotAlertConfig `json:"snapshotDefaults"`
|
|
BackupDefaults BackupAlertConfig `json:"backupDefaults"`
|
|
Overrides map[string]ThresholdConfig `json:"overrides"` // keyed by resource ID
|
|
CustomRules []CustomAlertRule `json:"customRules,omitempty"`
|
|
Schedule ScheduleConfig `json:"schedule"`
|
|
// Global disable flags per resource type
|
|
DisableAllNodes bool `json:"disableAllNodes"` // Disable all alerts for Proxmox nodes
|
|
DisableAllGuests bool `json:"disableAllGuests"` // Disable all alerts for VMs/containers
|
|
DisableAllHosts bool `json:"disableAllHosts"` // Disable all alerts for Pulse host agents
|
|
DisableAllStorage bool `json:"disableAllStorage"` // Disable all alerts for storage
|
|
DisableAllPBS bool `json:"disableAllPBS"` // Disable all alerts for PBS servers
|
|
DisableAllPMG bool `json:"disableAllPMG"` // Disable all alerts for PMG instances
|
|
DisableAllDockerHosts bool `json:"disableAllDockerHosts"` // Disable all alerts for Docker hosts
|
|
DisableAllDockerContainers bool `json:"disableAllDockerContainers"` // Disable all alerts for Docker containers
|
|
DisableAllDockerServices bool `json:"disableAllDockerServices"` // Disable all alerts for Docker services
|
|
DisableAllNodesOffline bool `json:"disableAllNodesOffline"` // Disable node offline/connectivity alerts globally
|
|
DisableAllGuestsOffline bool `json:"disableAllGuestsOffline"` // Disable guest powered-off alerts globally
|
|
DisableAllHostsOffline bool `json:"disableAllHostsOffline"` // Disable host agent offline alerts globally
|
|
DisableAllPBSOffline bool `json:"disableAllPBSOffline"` // Disable PBS offline alerts globally
|
|
DisableAllPMGOffline bool `json:"disableAllPMGOffline"` // Disable PMG offline alerts globally
|
|
DisableAllDockerHostsOffline bool `json:"disableAllDockerHostsOffline"` // Disable Docker host offline alerts globally
|
|
// New configuration options
|
|
MinimumDelta float64 `json:"minimumDelta"` // Minimum % change to trigger new alert
|
|
SuppressionWindow int `json:"suppressionWindow"` // Minutes to suppress duplicate alerts
|
|
HysteresisMargin float64 `json:"hysteresisMargin"` // Default margin for legacy thresholds
|
|
TimeThreshold int `json:"timeThreshold"` // Legacy: Seconds that threshold must be exceeded before triggering
|
|
TimeThresholds map[string]int `json:"timeThresholds"` // Per-type delays: guest, node, storage, pbs
|
|
MetricTimeThresholds map[string]map[string]int `json:"metricTimeThresholds"` // Optional per-metric delays keyed by resource type
|
|
// Alert TTL and auto-cleanup
|
|
MaxAlertAgeDays int `json:"maxAlertAgeDays"` // Maximum age for alerts before auto-cleanup (0 = disabled)
|
|
MaxAcknowledgedAgeDays int `json:"maxAcknowledgedAgeDays"` // Maximum age for acknowledged alerts (0 = disabled)
|
|
AutoAcknowledgeAfterHours int `json:"autoAcknowledgeAfterHours"` // Auto-acknowledge alerts after X hours (0 = disabled)
|
|
// Flapping detection
|
|
FlappingEnabled bool `json:"flappingEnabled"` // Enable flapping detection
|
|
FlappingWindowSeconds int `json:"flappingWindowSeconds"` // Time window for counting state changes
|
|
FlappingThreshold int `json:"flappingThreshold"` // Number of state changes to trigger flapping
|
|
FlappingCooldownMinutes int `json:"flappingCooldownMinutes"` // Cooldown period after flapping detected
|
|
}
|
|
|
|
// pmgQuarantineSnapshot stores quarantine counts at a point in time for
// growth detection.
type pmgQuarantineSnapshot struct {
	Spam      int
	Virus     int
	Timestamp time.Time
}
|
|
|
|
// pmgMailMetricSample stores a single hourly mail count sample.
type pmgMailMetricSample struct {
	SpamIn    float64
	SpamOut   float64
	VirusIn   float64
	VirusOut  float64
	Timestamp time.Time
}
|
|
|
|
// pmgBaselineCache stores calculated baseline values for a metric, together
// with the time they were last updated.
type pmgBaselineCache struct {
	TrimmedMean float64
	Median      float64
	LastUpdated time.Time
}
|
|
|
|
// pmgAnomalyTracker tracks history and baselines for anomaly detection
|
|
type pmgAnomalyTracker struct {
|
|
Samples []pmgMailMetricSample // Ring buffer (max 48 samples)
|
|
Baselines map[string]pmgBaselineCache // Cached baselines per metric (spamIn, spamOut, virusIn, virusOut)
|
|
LastSampleTime time.Time // Timestamp of most recent sample
|
|
SampleCount int // Total samples collected (for warmup check)
|
|
}
|
|
|
|
// Manager handles alert monitoring and state.
//
// Lock ordering documentation:
// The Manager uses two mutexes. The ordering rules below prevent deadlocks
// between them.
//  1. m.mu (primary lock) - protects most manager state
//  2. m.resolvedMutex - protects only the recentlyResolved map
//
// Lock ordering rules:
//   - NEVER hold m.mu when acquiring resolvedMutex
//   - ALWAYS release m.mu before acquiring resolvedMutex
//   - resolvedMutex can be held independently without m.mu
//   - When both locks are needed, acquire m.mu first, then release it before
//     acquiring resolvedMutex
//
// This ordering prevents deadlock scenarios where different goroutines acquire
// locks in different orders.
|
|
|
|
// Metric hooks for integrating with Prometheus
|
|
var (
|
|
recordAlertFired func(*Alert)
|
|
recordAlertResolved func(*Alert)
|
|
recordAlertSuppressed func(string)
|
|
recordAlertAcknowledged func()
|
|
)
|
|
|
|
// SetMetricHooks registers callbacks for recording alert metrics.
|
|
// - fired: called when an alert is dispatched (in dispatchAlert)
|
|
// - resolved: called when an alert is cleared (in clearAlertNoLock)
|
|
// - suppressed: called when an alert is suppressed due to flapping
|
|
// - acknowledged: called when an alert is acknowledged
|
|
func SetMetricHooks(fired func(*Alert), resolved func(*Alert), suppressed func(string), acknowledged func()) {
|
|
recordAlertFired = fired
|
|
recordAlertResolved = resolved
|
|
recordAlertSuppressed = suppressed
|
|
recordAlertAcknowledged = acknowledged
|
|
}
|
|
|
|
type Manager struct {
|
|
mu sync.RWMutex
|
|
config AlertConfig
|
|
activeAlerts map[string]*Alert
|
|
historyManager *HistoryManager
|
|
onAlert func(alert *Alert)
|
|
onResolved func(alertID string)
|
|
onAcknowledged func(alert *Alert, user string)
|
|
onUnacknowledged func(alert *Alert, user string)
|
|
onEscalate func(alert *Alert, level int)
|
|
onAlertForAI func(alert *Alert) // AI analysis callback - bypasses notification suppression
|
|
escalationStop chan struct{}
|
|
alertRateLimit map[string][]time.Time // Track alert times for rate limiting
|
|
// New fields for deduplication and suppression
|
|
recentAlerts map[string]*Alert // Track recent alerts for deduplication
|
|
suppressedUntil map[string]time.Time // Track suppression windows
|
|
// Recently resolved alerts (kept for 5 minutes)
|
|
recentlyResolved map[string]*ResolvedAlert
|
|
resolvedMutex sync.RWMutex // Secondary lock - see Lock Ordering Documentation above
|
|
// Time threshold tracking
|
|
pendingAlerts map[string]time.Time // Track when thresholds were first exceeded
|
|
// Offline confirmation tracking
|
|
nodeOfflineCount map[string]int // Track consecutive offline counts for nodes (legacy)
|
|
offlineConfirmations map[string]int // Track consecutive offline counts for all resources
|
|
dockerOfflineCount map[string]int // Track consecutive offline counts for Docker hosts
|
|
dockerStateConfirm map[string]int // Track consecutive state confirmations for Docker containers
|
|
dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
|
|
dockerLastExitCode map[string]int // Track last exit code for OOM detection
|
|
dockerUpdateFirstSeen map[string]time.Time // Track when image updates were first detected for alert delay
|
|
// Stable identity tracking prevents update-delay resets when host IDs churn.
|
|
dockerUpdateFirstSeenByIdentity map[string]time.Time
|
|
// PMG quarantine growth tracking
|
|
pmgQuarantineHistory map[string][]pmgQuarantineSnapshot // Track quarantine snapshots for growth detection
|
|
// PMG anomaly detection tracking
|
|
pmgAnomalyTrackers map[string]*pmgAnomalyTracker // Track mail metrics for anomaly detection per PMG instance
|
|
// Persistent acknowledgement state so quick alert rebuilds keep user acknowledgements
|
|
ackState map[string]ackRecord
|
|
// Flapping detection tracking
|
|
flappingHistory map[string][]time.Time // Track state change times for flapping detection
|
|
flappingActive map[string]bool // Track which alerts are currently in flapping state
|
|
// Cleanup control
|
|
cleanupStop chan struct{} // Signal to stop cleanup goroutine
|
|
// Host agent deduplication: track hostnames of active host agents
|
|
// When a host agent is running on a Proxmox node, we prefer the host agent
|
|
// alerts and suppress the node alerts to avoid duplicate monitoring.
|
|
hostAgentHostnames map[string]struct{} // Normalized hostnames (lowercase)
|
|
// Node display name caches. Proxmox nodes can share the same raw node name
|
|
// across multiple configured instances, so keep instance-scoped entries in
|
|
// addition to the legacy raw-name cache used by instance-less resources.
|
|
nodeDisplayNames map[string]string
|
|
instanceNodeDisplayNames map[string]string
|
|
// License checking for Pro-only alert features
|
|
hasProFeature func(feature string) bool
|
|
|
|
// Cached timezone for quiet hours
|
|
quietHoursLoc *time.Location
|
|
}
|
|
|
|
// ackRecord is the acknowledgement state stored per entry in
// Manager.ackState.
type ackRecord struct {
	acknowledged bool
	user         string
	time         time.Time // When the alert was acknowledged
	inactiveAt   time.Time // When the alert was removed (zero if still active)
}
|
|
|
|
// dockerRestartRecord is the restart-tracking state stored per entry in
// Manager.dockerRestartTracking.
type dockerRestartRecord struct {
	count       int
	lastCount   int
	times       []time.Time // Track restart times for loop detection
	lastChecked time.Time
}
|
|
|
|
// NewManager creates a new alert manager using the global data directory.
|
|
// For multi-tenant deployments, use NewManagerWithDataDir instead.
|
|
func NewManager() *Manager {
|
|
return NewManagerWithDataDir(utils.GetDataDir())
|
|
}
|
|
|
|
// NewManagerWithDataDir creates a new alert manager with a custom data directory.
|
|
// This enables tenant-scoped alert persistence in multi-tenant deployments.
|
|
func NewManagerWithDataDir(dataDir string) *Manager {
|
|
alertsDir := filepath.Join(dataDir, "alerts")
|
|
alertOrphaned := true
|
|
m := &Manager{
|
|
activeAlerts: make(map[string]*Alert),
|
|
historyManager: NewHistoryManager(alertsDir),
|
|
escalationStop: make(chan struct{}),
|
|
alertRateLimit: make(map[string][]time.Time),
|
|
recentAlerts: make(map[string]*Alert),
|
|
suppressedUntil: make(map[string]time.Time),
|
|
recentlyResolved: make(map[string]*ResolvedAlert),
|
|
pendingAlerts: make(map[string]time.Time),
|
|
nodeOfflineCount: make(map[string]int),
|
|
offlineConfirmations: make(map[string]int),
|
|
dockerOfflineCount: make(map[string]int),
|
|
dockerStateConfirm: make(map[string]int),
|
|
dockerRestartTracking: make(map[string]*dockerRestartRecord),
|
|
dockerLastExitCode: make(map[string]int),
|
|
dockerUpdateFirstSeen: make(map[string]time.Time),
|
|
dockerUpdateFirstSeenByIdentity: make(map[string]time.Time),
|
|
pmgQuarantineHistory: make(map[string][]pmgQuarantineSnapshot),
|
|
pmgAnomalyTrackers: make(map[string]*pmgAnomalyTracker),
|
|
ackState: make(map[string]ackRecord),
|
|
flappingHistory: make(map[string][]time.Time),
|
|
flappingActive: make(map[string]bool),
|
|
cleanupStop: make(chan struct{}),
|
|
hostAgentHostnames: make(map[string]struct{}),
|
|
nodeDisplayNames: make(map[string]string),
|
|
instanceNodeDisplayNames: make(map[string]string),
|
|
config: AlertConfig{
|
|
Enabled: true,
|
|
ActivationState: ActivationPending,
|
|
ObservationWindowHours: 24,
|
|
GuestDefaults: ThresholdConfig{
|
|
PoweredOffSeverity: AlertLevelWarning,
|
|
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
|
|
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
|
|
Disk: &HysteresisThreshold{Trigger: 90, Clear: 85},
|
|
DiskRead: &HysteresisThreshold{Trigger: 0, Clear: 0}, // Off by default
|
|
DiskWrite: &HysteresisThreshold{Trigger: 0, Clear: 0}, // Off by default
|
|
NetworkIn: &HysteresisThreshold{Trigger: 0, Clear: 0}, // Off by default
|
|
NetworkOut: &HysteresisThreshold{Trigger: 0, Clear: 0}, // Off by default
|
|
},
|
|
NodeDefaults: ThresholdConfig{
|
|
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
|
|
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
|
|
Disk: &HysteresisThreshold{Trigger: 90, Clear: 85},
|
|
Temperature: &HysteresisThreshold{Trigger: 80, Clear: 75}, // Warning at 80°C, clear at 75°C
|
|
},
|
|
HostDefaults: ThresholdConfig{
|
|
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
|
|
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
|
|
Disk: &HysteresisThreshold{Trigger: 90, Clear: 85},
|
|
DiskTemperature: &HysteresisThreshold{Trigger: 55, Clear: 50},
|
|
},
|
|
DockerDefaults: DockerThresholdConfig{
|
|
CPU: HysteresisThreshold{Trigger: 80, Clear: 75},
|
|
Memory: HysteresisThreshold{Trigger: 85, Clear: 80},
|
|
Disk: HysteresisThreshold{Trigger: 85, Clear: 80},
|
|
RestartCount: 3,
|
|
RestartWindow: 300, // 5 minutes
|
|
MemoryWarnPct: 90,
|
|
MemoryCriticalPct: 95,
|
|
StatePoweredOffSeverity: AlertLevelWarning,
|
|
},
|
|
PMGDefaults: PMGThresholdConfig{
|
|
QueueTotalWarning: 500, // Warning at 500 total queued messages
|
|
QueueTotalCritical: 1000, // Critical at 1000 total queued messages
|
|
OldestMessageWarnMins: 30, // Warning if oldest message is 30+ minutes old
|
|
OldestMessageCritMins: 60, // Critical if oldest message is 60+ minutes old
|
|
DeferredQueueWarn: 200, // Warning at 200 deferred messages
|
|
DeferredQueueCritical: 500, // Critical at 500 deferred messages
|
|
HoldQueueWarn: 100, // Warning at 100 held messages
|
|
HoldQueueCritical: 300, // Critical at 300 held messages
|
|
QuarantineSpamWarn: 2000, // Warning at 2000 spam quarantined
|
|
QuarantineSpamCritical: 5000, // Critical at 5000 spam quarantined
|
|
QuarantineVirusWarn: 2000, // Warning at 2000 virus quarantined
|
|
QuarantineVirusCritical: 5000, // Critical at 5000 virus quarantined
|
|
QuarantineGrowthWarnPct: 25, // Warning if growth ≥25%
|
|
QuarantineGrowthWarnMin: 250, // AND ≥250 messages
|
|
QuarantineGrowthCritPct: 50, // Critical if growth ≥50%
|
|
QuarantineGrowthCritMin: 500, // AND ≥500 messages
|
|
},
|
|
SnapshotDefaults: SnapshotAlertConfig{
|
|
Enabled: false,
|
|
WarningDays: 30,
|
|
CriticalDays: 45,
|
|
WarningSizeGiB: 0,
|
|
CriticalSizeGiB: 0,
|
|
},
|
|
BackupDefaults: BackupAlertConfig{
|
|
Enabled: false,
|
|
WarningDays: 7,
|
|
CriticalDays: 14,
|
|
FreshHours: 24,
|
|
StaleHours: 72,
|
|
AlertOrphaned: &alertOrphaned,
|
|
IgnoreVMIDs: []string{},
|
|
},
|
|
PBSDefaults: ThresholdConfig{
|
|
CPU: &HysteresisThreshold{Trigger: 80, Clear: 75},
|
|
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
|
|
},
|
|
StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
|
|
MinimumDelta: 2.0, // 2% minimum change
|
|
SuppressionWindow: 5, // 5 minutes
|
|
HysteresisMargin: 5.0, // 5% default margin
|
|
TimeThreshold: 5,
|
|
TimeThresholds: map[string]int{
|
|
"guest": 5,
|
|
"node": 5,
|
|
"storage": 5,
|
|
"pbs": 5,
|
|
},
|
|
Overrides: make(map[string]ThresholdConfig),
|
|
Schedule: ScheduleConfig{
|
|
QuietHours: QuietHours{
|
|
Enabled: false, // OFF - users should opt-in to quiet hours
|
|
Start: "22:00",
|
|
End: "08:00",
|
|
Timezone: "America/New_York",
|
|
Days: map[string]bool{
|
|
"monday": true,
|
|
"tuesday": true,
|
|
"wednesday": true,
|
|
"thursday": true,
|
|
"friday": true,
|
|
"saturday": false,
|
|
"sunday": false,
|
|
},
|
|
Suppress: QuietHoursSuppression{},
|
|
},
|
|
Cooldown: 5, // ON - 5 minutes prevents spam
|
|
MaxAlertsHour: 10, // ON - 10 alerts/hour prevents flooding
|
|
// Note: GroupingWindow is deprecated - use Grouping.Window instead
|
|
NotifyOnResolve: true,
|
|
Escalation: EscalationConfig{
|
|
Enabled: false, // OFF - requires user configuration
|
|
Levels: []EscalationLevel{
|
|
{After: 15, Notify: "email"},
|
|
{After: 30, Notify: "webhook"},
|
|
{After: 60, Notify: "all"},
|
|
},
|
|
},
|
|
Grouping: GroupingConfig{
|
|
Enabled: true, // ON - reduces notification noise
|
|
Window: 30, // 30 second window for grouping
|
|
ByNode: true, // Group by node for mass node issues
|
|
ByGuest: false, // Don't group by guest by default
|
|
},
|
|
},
|
|
// Alert TTL defaults
|
|
MaxAlertAgeDays: 7, // Auto-cleanup alerts older than 7 days
|
|
MaxAcknowledgedAgeDays: 1, // Auto-cleanup acknowledged alerts older than 1 day
|
|
AutoAcknowledgeAfterHours: 24, // Auto-acknowledge alerts after 24 hours
|
|
// Flapping detection defaults
|
|
FlappingEnabled: true, // Enable flapping detection
|
|
FlappingWindowSeconds: 300, // 5 minute window
|
|
FlappingThreshold: 5, // 5 state changes triggers flapping
|
|
FlappingCooldownMinutes: 15, // 15 minute cooldown
|
|
},
|
|
}
|
|
|
|
// Load saved active alerts
|
|
if err := m.LoadActiveAlerts(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to load active alerts")
|
|
}
|
|
|
|
// Start escalation checker
|
|
go m.escalationChecker()
|
|
|
|
// Start periodic save of active alerts
|
|
go m.periodicSaveAlerts()
|
|
|
|
// Start periodic cleanup of stale tracking map entries
|
|
go m.trackingMapCleanup()
|
|
|
|
return m
|
|
}
|
|
|
|
// SetLicenseChecker sets the function used to check Pro license features.
|
|
// This enables gating Pro-only alert features like update alerts.
|
|
func (m *Manager) SetLicenseChecker(checker func(feature string) bool) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.hasProFeature = checker
|
|
}
|
|
|
|
// addRecentlyResolvedUnlocked records a resolved alert assuming the caller does not hold m.mu.
|
|
func (m *Manager) addRecentlyResolvedUnlocked(alertID string, resolved *ResolvedAlert) {
|
|
m.resolvedMutex.Lock()
|
|
m.recentlyResolved[alertID] = resolved
|
|
m.resolvedMutex.Unlock()
|
|
}
|
|
|
|
// addRecentlyResolvedWithPrimaryLock records a resolved alert while preserving the caller's
|
|
// ownership of m.mu. Callers must hold m.mu before invoking this helper.
|
|
func (m *Manager) addRecentlyResolvedWithPrimaryLock(alertID string, resolved *ResolvedAlert) {
|
|
m.mu.Unlock()
|
|
m.addRecentlyResolvedUnlocked(alertID, resolved)
|
|
m.mu.Lock()
|
|
}
|
|
|
|
// SetAlertCallback sets the callback for new alerts
|
|
func (m *Manager) SetAlertCallback(cb func(alert *Alert)) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.onAlert = cb
|
|
}
|
|
|
|
// SetAlertForAICallback sets a callback for AI analysis when alerts are created.
|
|
// Unlike SetAlertCallback, this callback is invoked unconditionally - it bypasses
|
|
// activation state, quiet hours, and other notification suppression checks.
|
|
// This allows AI to analyze alerts even when the user hasn't finished setup.
|
|
func (m *Manager) SetAlertForAICallback(cb func(alert *Alert)) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.onAlertForAI = cb
|
|
log.Info().Msg("Alert-for-AI callback registered (bypasses notification suppression)")
|
|
}
|
|
|
|
// SetResolvedCallback sets the callback for resolved alerts
|
|
func (m *Manager) SetResolvedCallback(cb func(alertID string)) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.onResolved = cb
|
|
}
|
|
|
|
// SetAcknowledgedCallback sets the callback for acknowledged alerts.
|
|
func (m *Manager) SetAcknowledgedCallback(cb func(alert *Alert, user string)) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.onAcknowledged = cb
|
|
}
|
|
|
|
// SetUnacknowledgedCallback sets the callback for unacknowledged alerts.
|
|
func (m *Manager) SetUnacknowledgedCallback(cb func(alert *Alert, user string)) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.onUnacknowledged = cb
|
|
}
|
|
|
|
// SetEscalateCallback sets the callback for escalated alerts
|
|
func (m *Manager) SetEscalateCallback(cb func(alert *Alert, level int)) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.onEscalate = cb
|
|
}
|
|
|
|
// safeCallResolvedCallback invokes onResolved with panic recovery
|
|
func (m *Manager) safeCallResolvedCallback(alertID string, async bool) {
|
|
if m.onResolved == nil {
|
|
return
|
|
}
|
|
|
|
callbackFunc := func() {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().
|
|
Interface("panic", r).
|
|
Str("alertID", alertID).
|
|
Msg("Panic in onResolved callback")
|
|
}
|
|
}()
|
|
m.onResolved(alertID)
|
|
}
|
|
|
|
if async {
|
|
go callbackFunc()
|
|
} else {
|
|
callbackFunc()
|
|
}
|
|
}
|
|
|
|
// safeCallAcknowledgedCallback invokes onAcknowledged with panic recovery and alert cloning.
|
|
func (m *Manager) safeCallAcknowledgedCallback(alert *Alert, user string) {
|
|
if m.onAcknowledged == nil || alert == nil {
|
|
return
|
|
}
|
|
|
|
alertCopy := alert.Clone()
|
|
go func(a *Alert, u string) {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().
|
|
Interface("panic", r).
|
|
Str("alertID", a.ID).
|
|
Msg("Panic in onAcknowledged callback")
|
|
}
|
|
}()
|
|
m.onAcknowledged(a, u)
|
|
}(alertCopy, user)
|
|
}
|
|
|
|
// safeCallUnacknowledgedCallback invokes onUnacknowledged with panic recovery and alert cloning.
|
|
func (m *Manager) safeCallUnacknowledgedCallback(alert *Alert, user string) {
|
|
if m.onUnacknowledged == nil || alert == nil {
|
|
return
|
|
}
|
|
|
|
alertCopy := alert.Clone()
|
|
go func(a *Alert, u string) {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().
|
|
Interface("panic", r).
|
|
Str("alertID", a.ID).
|
|
Msg("Panic in onUnacknowledged callback")
|
|
}
|
|
}()
|
|
m.onUnacknowledged(a, u)
|
|
}(alertCopy, user)
|
|
}
|
|
|
|
// safeCallEscalateCallback invokes onEscalate with panic recovery and alert cloning
|
|
func (m *Manager) safeCallEscalateCallback(alert *Alert, level int) {
|
|
if m.onEscalate == nil {
|
|
return
|
|
}
|
|
|
|
// Clone alert to prevent concurrent modification
|
|
alertCopy := alert.Clone()
|
|
go func(a *Alert, lvl int) {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().
|
|
Interface("panic", r).
|
|
Str("alertID", a.ID).
|
|
Int("level", lvl).
|
|
Msg("Panic in onEscalate callback")
|
|
}
|
|
}()
|
|
m.onEscalate(a, lvl)
|
|
}(alertCopy, level)
|
|
}
|
|
|
|
// checkFlappingLocked detects alert flapping and returns true if alert should be suppressed.
|
|
// It modifies flappingHistory, flappingActive, and suppressedUntil maps.
|
|
// IMPORTANT: Caller MUST hold m.mu before calling this function.
|
|
func (m *Manager) checkFlappingLocked(alertID string) bool {
|
|
if !m.config.FlappingEnabled {
|
|
return false
|
|
}
|
|
|
|
now := time.Now()
|
|
windowDuration := time.Duration(m.config.FlappingWindowSeconds) * time.Second
|
|
|
|
// Record this state change
|
|
m.flappingHistory[alertID] = append(m.flappingHistory[alertID], now)
|
|
|
|
// Remove state changes outside the window
|
|
history := m.flappingHistory[alertID]
|
|
validHistory := []time.Time{}
|
|
for _, t := range history {
|
|
if now.Sub(t) <= windowDuration {
|
|
validHistory = append(validHistory, t)
|
|
}
|
|
}
|
|
// Limit to max 10 entries to prevent unbounded growth
|
|
const maxFlappingHistory = 10
|
|
if len(validHistory) > maxFlappingHistory {
|
|
validHistory = validHistory[len(validHistory)-maxFlappingHistory:]
|
|
}
|
|
m.flappingHistory[alertID] = validHistory
|
|
|
|
// Check if we've exceeded the threshold
|
|
if len(validHistory) >= m.config.FlappingThreshold {
|
|
// Mark as flapping
|
|
if !m.flappingActive[alertID] {
|
|
log.Warn().
|
|
Str("alertID", alertID).
|
|
Int("stateChanges", len(validHistory)).
|
|
Int("threshold", m.config.FlappingThreshold).
|
|
Int("windowSeconds", m.config.FlappingWindowSeconds).
|
|
Msg("Flapping detected - suppressing alert")
|
|
|
|
m.flappingActive[alertID] = true
|
|
|
|
// Set cooldown period
|
|
cooldownDuration := time.Duration(m.config.FlappingCooldownMinutes) * time.Minute
|
|
m.suppressedUntil[alertID] = now.Add(cooldownDuration)
|
|
|
|
// Record suppression metric
|
|
if recordAlertSuppressed != nil {
|
|
recordAlertSuppressed("flapping")
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
func (m *Manager) dispatchAlert(alert *Alert, async bool) bool {
|
|
if m.onAlert == nil || alert == nil {
|
|
return false
|
|
}
|
|
|
|
// Don't dispatch notifications for acknowledged alerts
|
|
if alert.Acknowledged {
|
|
log.Debug().
|
|
Str("alertID", alert.ID).
|
|
Str("ackUser", alert.AckUser).
|
|
Msg("Alert notification suppressed - already acknowledged")
|
|
return false
|
|
}
|
|
|
|
// Check for flapping (caller must hold m.mu)
|
|
if m.checkFlappingLocked(alert.ID) {
|
|
log.Debug().
|
|
Str("alertID", alert.ID).
|
|
Msg("Alert suppressed due to flapping")
|
|
return false
|
|
}
|
|
|
|
// Check activation state - only dispatch notifications if active
|
|
if m.config.ActivationState != ActivationActive {
|
|
log.Debug().
|
|
Str("alertID", alert.ID).
|
|
Str("activationState", string(m.config.ActivationState)).
|
|
Msg("Alert notification suppressed - not activated")
|
|
return false
|
|
}
|
|
|
|
if suppressed, reason := m.shouldSuppressNotification(alert); suppressed {
|
|
log.Debug().
|
|
Str("alertID", alert.ID).
|
|
Str("type", alert.Type).
|
|
Str("level", string(alert.Level)).
|
|
Str("quietHoursRule", reason).
|
|
Msg("Alert notification suppressed during quiet hours")
|
|
return false
|
|
}
|
|
|
|
if isMonitorOnlyAlert(alert) {
|
|
log.Info().
|
|
Str("alertID", alert.ID).
|
|
Str("resource", alert.ResourceName).
|
|
Bool("monitorOnly", true).
|
|
Msg("Monitor-only alert detected, skipping alert dispatch")
|
|
return false
|
|
}
|
|
|
|
// Record metric for fired alert
|
|
if recordAlertFired != nil {
|
|
recordAlertFired(alert)
|
|
}
|
|
|
|
alertCopy := alert.Clone()
|
|
if async {
|
|
go func(a *Alert) {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().
|
|
Interface("panic", r).
|
|
Str("alertID", a.ID).
|
|
Str("type", a.Type).
|
|
Msg("Panic in onAlert callback")
|
|
}
|
|
}()
|
|
m.onAlert(a)
|
|
}(alertCopy)
|
|
} else {
|
|
// Synchronous calls also need panic recovery to prevent service crash
|
|
func() {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().
|
|
Interface("panic", r).
|
|
Str("alertID", alertCopy.ID).
|
|
Str("type", alertCopy.Type).
|
|
Msg("Panic in onAlert callback (synchronous)")
|
|
}
|
|
}()
|
|
m.onAlert(alertCopy)
|
|
}()
|
|
}
|
|
return true
|
|
}
|
|
|
|
func isMonitorOnlyAlert(alert *Alert) bool {
|
|
if alert == nil || alert.Metadata == nil {
|
|
return false
|
|
}
|
|
|
|
if value, ok := alert.Metadata["monitorOnly"]; ok {
|
|
switch v := value.(type) {
|
|
case bool:
|
|
return v
|
|
case string:
|
|
return strings.EqualFold(v, "true")
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// ensureValidHysteresis ensures clear < trigger for hysteresis thresholds
|
|
func ensureValidHysteresis(threshold *HysteresisThreshold, metricName string) {
|
|
if threshold == nil {
|
|
return
|
|
}
|
|
// Disabled thresholds don't need hysteresis validation
|
|
if threshold.Trigger <= 0 {
|
|
return
|
|
}
|
|
if threshold.Clear >= threshold.Trigger {
|
|
log.Warn().
|
|
Str("metric", metricName).
|
|
Float64("trigger", threshold.Trigger).
|
|
Float64("clear", threshold.Clear).
|
|
Msg("Invalid hysteresis: clear >= trigger, auto-fixing")
|
|
// Auto-fix: set clear to 5% below trigger
|
|
threshold.Clear = threshold.Trigger - 5
|
|
if threshold.Clear < 0 {
|
|
threshold.Clear = 0
|
|
}
|
|
}
|
|
}
|
|
|
|
// UpdateConfig updates the alert configuration
|
|
func (m *Manager) UpdateConfig(config AlertConfig) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Preserve activation state/time when clients update the config without including it.
|
|
// This avoids unintentionally resetting alerts to pending review when saving thresholds.
|
|
if config.ActivationState == "" && m.config.ActivationState != "" {
|
|
config.ActivationState = m.config.ActivationState
|
|
if config.ActivationTime == nil && m.config.ActivationTime != nil {
|
|
config.ActivationTime = m.config.ActivationTime
|
|
}
|
|
}
|
|
|
|
// Normalize all config sections
|
|
normalizeStorageDefaults(&config)
|
|
normalizeDockerDefaults(&config)
|
|
normalizePMGDefaults(&config)
|
|
normalizeSnapshotDefaults(&config)
|
|
normalizeBackupDefaults(&config)
|
|
normalizeNodeDefaults(&config)
|
|
normalizeHostDefaults(&config)
|
|
normalizeGeneralSettings(&config)
|
|
normalizeTimeThresholds(&config)
|
|
|
|
config.GuestDefaults.PoweredOffSeverity = normalizePoweredOffSeverity(config.GuestDefaults.PoweredOffSeverity)
|
|
config.NodeDefaults.PoweredOffSeverity = normalizePoweredOffSeverity(config.NodeDefaults.PoweredOffSeverity)
|
|
config.DockerIgnoredContainerPrefixes = NormalizeDockerIgnoredPrefixes(config.DockerIgnoredContainerPrefixes)
|
|
|
|
// Migration logic for activation state (backward compatibility)
|
|
m.migrateActivationState(&config)
|
|
|
|
// Validate hysteresis thresholds to prevent stuck alerts
|
|
validateHysteresisThresholds(&config)
|
|
|
|
// Validate timezone if quiet hours are enabled
|
|
validateQuietHoursTimezone(&config)
|
|
|
|
m.config = config
|
|
normalizeOverrides(m.config.Overrides)
|
|
|
|
// Update cached quiet hours location
|
|
if m.config.Schedule.QuietHours.Enabled && m.config.Schedule.QuietHours.Timezone != "" {
|
|
loc, err := time.LoadLocation(m.config.Schedule.QuietHours.Timezone)
|
|
if err == nil {
|
|
m.quietHoursLoc = loc
|
|
} else {
|
|
m.quietHoursLoc = time.Local
|
|
}
|
|
} else {
|
|
m.quietHoursLoc = time.Local
|
|
}
|
|
|
|
if !m.config.SnapshotDefaults.Enabled {
|
|
m.clearSnapshotAlertsForInstanceLocked("")
|
|
}
|
|
if !m.config.BackupDefaults.Enabled {
|
|
m.clearBackupAlertsLocked()
|
|
}
|
|
|
|
m.applyGlobalOfflineSettingsLocked()
|
|
|
|
log.Info().
|
|
Bool("enabled", config.Enabled).
|
|
Interface("guestDefaults", config.GuestDefaults).
|
|
Msg("Alert configuration updated")
|
|
|
|
// Re-evaluate active alerts against new thresholds
|
|
m.reevaluateActiveAlertsLocked()
|
|
}
|
|
|
|
// normalizeStorageDefaults ensures storage default thresholds are set
|
|
// Trigger=0 is allowed and means "disable storage alerting"
|
|
func normalizeStorageDefaults(config *AlertConfig) {
|
|
if config.StorageDefault.Trigger < 0 {
|
|
config.StorageDefault.Trigger = 85
|
|
config.StorageDefault.Clear = 80
|
|
} else if config.StorageDefault.Trigger == 0 {
|
|
// Trigger=0 means disabled, set Clear=0 too
|
|
config.StorageDefault.Clear = 0
|
|
} else if config.StorageDefault.Clear <= 0 {
|
|
config.StorageDefault.Clear = config.StorageDefault.Trigger - 5
|
|
if config.StorageDefault.Clear < 0 {
|
|
config.StorageDefault.Clear = 0
|
|
}
|
|
}
|
|
}
|
|
|
|
// normalizeDockerThreshold normalizes a single Docker threshold
|
|
func normalizeDockerThreshold(th HysteresisThreshold, defaultTrigger float64, metricName string) HysteresisThreshold {
|
|
normalized := th
|
|
|
|
// Negative triggers are treated as unset and replaced with defaults.
|
|
if normalized.Trigger < 0 {
|
|
normalized.Trigger = defaultTrigger
|
|
}
|
|
|
|
// Explicit disable: keep trigger at 0 and clamp clear to 0.
|
|
if normalized.Trigger == 0 {
|
|
if normalized.Clear < 0 {
|
|
normalized.Clear = 0
|
|
}
|
|
return normalized
|
|
}
|
|
|
|
if normalized.Clear <= 0 {
|
|
normalized.Clear = normalized.Trigger - 5
|
|
if normalized.Clear < 0 {
|
|
normalized.Clear = 0
|
|
}
|
|
}
|
|
|
|
ensureValidHysteresis(&normalized, metricName)
|
|
return normalized
|
|
}
|
|
|
|
// normalizeDockerDefaults ensures Docker default thresholds are set
|
|
func normalizeDockerDefaults(config *AlertConfig) {
|
|
config.DockerDefaults.CPU = normalizeDockerThreshold(config.DockerDefaults.CPU, 80, "docker.cpu")
|
|
config.DockerDefaults.Memory = normalizeDockerThreshold(config.DockerDefaults.Memory, 85, "docker.memory")
|
|
config.DockerDefaults.Disk = normalizeDockerThreshold(config.DockerDefaults.Disk, 85, "docker.disk")
|
|
|
|
if config.DockerDefaults.RestartCount <= 0 {
|
|
config.DockerDefaults.RestartCount = 3
|
|
}
|
|
if config.DockerDefaults.RestartWindow <= 0 {
|
|
config.DockerDefaults.RestartWindow = 300 // 5 minutes
|
|
}
|
|
if config.DockerDefaults.MemoryWarnPct <= 0 {
|
|
config.DockerDefaults.MemoryWarnPct = 90
|
|
}
|
|
if config.DockerDefaults.MemoryCriticalPct <= 0 {
|
|
config.DockerDefaults.MemoryCriticalPct = 95
|
|
}
|
|
if config.DockerDefaults.ServiceWarnGapPct <= 0 {
|
|
config.DockerDefaults.ServiceWarnGapPct = 10
|
|
}
|
|
if config.DockerDefaults.ServiceCritGapPct <= 0 {
|
|
config.DockerDefaults.ServiceCritGapPct = 50
|
|
}
|
|
if config.DockerDefaults.ServiceCritGapPct > 0 &&
|
|
config.DockerDefaults.ServiceCritGapPct < config.DockerDefaults.ServiceWarnGapPct {
|
|
log.Warn().
|
|
Int("warnGapPercent", config.DockerDefaults.ServiceWarnGapPct).
|
|
Int("criticalGapPercent", config.DockerDefaults.ServiceCritGapPct).
|
|
Msg("Adjusting Docker service critical gap to match warning gap")
|
|
config.DockerDefaults.ServiceCritGapPct = config.DockerDefaults.ServiceWarnGapPct
|
|
}
|
|
if config.DockerDefaults.StatePoweredOffSeverity == "" {
|
|
config.DockerDefaults.StatePoweredOffSeverity = AlertLevelWarning
|
|
}
|
|
config.DockerDefaults.StatePoweredOffSeverity = normalizePoweredOffSeverity(config.DockerDefaults.StatePoweredOffSeverity)
|
|
// Default to 24 hours delay for update alerts; set to -1 to explicitly disable
|
|
if config.DockerDefaults.UpdateAlertDelayHours == 0 {
|
|
config.DockerDefaults.UpdateAlertDelayHours = 24
|
|
}
|
|
}
|
|
|
|
// normalizePMGDefaults ensures PMG (Proxmox Mail Gateway) defaults are set
|
|
func normalizePMGDefaults(config *AlertConfig) {
|
|
if config.PMGDefaults.QueueTotalWarning <= 0 {
|
|
config.PMGDefaults.QueueTotalWarning = 500
|
|
}
|
|
if config.PMGDefaults.QueueTotalCritical <= 0 {
|
|
config.PMGDefaults.QueueTotalCritical = 1000
|
|
}
|
|
if config.PMGDefaults.OldestMessageWarnMins <= 0 {
|
|
config.PMGDefaults.OldestMessageWarnMins = 30
|
|
}
|
|
if config.PMGDefaults.OldestMessageCritMins <= 0 {
|
|
config.PMGDefaults.OldestMessageCritMins = 60
|
|
}
|
|
if config.PMGDefaults.DeferredQueueWarn <= 0 {
|
|
config.PMGDefaults.DeferredQueueWarn = 200
|
|
}
|
|
if config.PMGDefaults.DeferredQueueCritical <= 0 {
|
|
config.PMGDefaults.DeferredQueueCritical = 500
|
|
}
|
|
if config.PMGDefaults.HoldQueueWarn <= 0 {
|
|
config.PMGDefaults.HoldQueueWarn = 100
|
|
}
|
|
if config.PMGDefaults.HoldQueueCritical <= 0 {
|
|
config.PMGDefaults.HoldQueueCritical = 300
|
|
}
|
|
if config.PMGDefaults.QuarantineSpamWarn <= 0 {
|
|
config.PMGDefaults.QuarantineSpamWarn = 2000
|
|
}
|
|
if config.PMGDefaults.QuarantineSpamCritical <= 0 {
|
|
config.PMGDefaults.QuarantineSpamCritical = 5000
|
|
}
|
|
if config.PMGDefaults.QuarantineVirusWarn <= 0 {
|
|
config.PMGDefaults.QuarantineVirusWarn = 2000
|
|
}
|
|
if config.PMGDefaults.QuarantineVirusCritical <= 0 {
|
|
config.PMGDefaults.QuarantineVirusCritical = 5000
|
|
}
|
|
if config.PMGDefaults.QuarantineGrowthWarnPct <= 0 {
|
|
config.PMGDefaults.QuarantineGrowthWarnPct = 25
|
|
}
|
|
if config.PMGDefaults.QuarantineGrowthWarnMin <= 0 {
|
|
config.PMGDefaults.QuarantineGrowthWarnMin = 250
|
|
}
|
|
if config.PMGDefaults.QuarantineGrowthCritPct <= 0 {
|
|
config.PMGDefaults.QuarantineGrowthCritPct = 50
|
|
}
|
|
if config.PMGDefaults.QuarantineGrowthCritMin <= 0 {
|
|
config.PMGDefaults.QuarantineGrowthCritMin = 500
|
|
}
|
|
}
|
|
|
|
// normalizeSnapshotDefaults ensures snapshot alert thresholds are valid
|
|
func normalizeSnapshotDefaults(config *AlertConfig) {
|
|
if config.SnapshotDefaults.WarningDays < 0 {
|
|
config.SnapshotDefaults.WarningDays = 0
|
|
}
|
|
if config.SnapshotDefaults.CriticalDays < 0 {
|
|
config.SnapshotDefaults.CriticalDays = 0
|
|
}
|
|
if config.SnapshotDefaults.CriticalDays > 0 && config.SnapshotDefaults.WarningDays > config.SnapshotDefaults.CriticalDays {
|
|
config.SnapshotDefaults.WarningDays = config.SnapshotDefaults.CriticalDays
|
|
}
|
|
if config.SnapshotDefaults.CriticalDays == 0 && config.SnapshotDefaults.WarningDays > 0 {
|
|
config.SnapshotDefaults.CriticalDays = config.SnapshotDefaults.WarningDays
|
|
}
|
|
if config.SnapshotDefaults.WarningSizeGiB < 0 {
|
|
config.SnapshotDefaults.WarningSizeGiB = 0
|
|
}
|
|
if config.SnapshotDefaults.CriticalSizeGiB < 0 {
|
|
config.SnapshotDefaults.CriticalSizeGiB = 0
|
|
}
|
|
if config.SnapshotDefaults.CriticalSizeGiB > 0 && config.SnapshotDefaults.WarningSizeGiB > config.SnapshotDefaults.CriticalSizeGiB {
|
|
config.SnapshotDefaults.WarningSizeGiB = config.SnapshotDefaults.CriticalSizeGiB
|
|
}
|
|
if config.SnapshotDefaults.CriticalSizeGiB == 0 && config.SnapshotDefaults.WarningSizeGiB > 0 {
|
|
config.SnapshotDefaults.CriticalSizeGiB = config.SnapshotDefaults.WarningSizeGiB
|
|
}
|
|
}
|
|
|
|
// normalizeBackupDefaults ensures backup alert thresholds are valid
|
|
func normalizeBackupDefaults(config *AlertConfig) {
|
|
if config.BackupDefaults.WarningDays < 0 {
|
|
config.BackupDefaults.WarningDays = 0
|
|
}
|
|
if config.BackupDefaults.CriticalDays < 0 {
|
|
config.BackupDefaults.CriticalDays = 0
|
|
}
|
|
if config.BackupDefaults.CriticalDays > 0 && config.BackupDefaults.WarningDays > config.BackupDefaults.CriticalDays {
|
|
config.BackupDefaults.WarningDays = config.BackupDefaults.CriticalDays
|
|
}
|
|
if config.BackupDefaults.AlertOrphaned == nil {
|
|
alertOrphaned := true
|
|
config.BackupDefaults.AlertOrphaned = &alertOrphaned
|
|
}
|
|
if len(config.BackupDefaults.IgnoreVMIDs) > 0 {
|
|
seen := make(map[string]struct{}, len(config.BackupDefaults.IgnoreVMIDs))
|
|
normalized := make([]string, 0, len(config.BackupDefaults.IgnoreVMIDs))
|
|
for _, entry := range config.BackupDefaults.IgnoreVMIDs {
|
|
value := strings.TrimSpace(entry)
|
|
if value == "" {
|
|
continue
|
|
}
|
|
if _, exists := seen[value]; exists {
|
|
continue
|
|
}
|
|
seen[value] = struct{}{}
|
|
normalized = append(normalized, value)
|
|
}
|
|
config.BackupDefaults.IgnoreVMIDs = normalized
|
|
}
|
|
}
|
|
|
|
// backupIgnoreVMID reports whether vmid matches any entry of ignoreList.
//
// Entries are trimmed of surrounding whitespace and blank entries are skipped.
// An entry ending in "*" is a prefix pattern: it matches when vmid starts with
// the text before that final "*"; a pattern with nothing before the "*" never
// matches. Any other entry must equal vmid exactly. An empty vmid or an empty
// list yields false.
func backupIgnoreVMID(vmid string, ignoreList []string) bool {
	if vmid == "" || len(ignoreList) == 0 {
		return false
	}

	for i := range ignoreList {
		pattern := strings.TrimSpace(ignoreList[i])
		switch {
		case pattern == "":
			// Blank entry: nothing to compare.
		case strings.HasSuffix(pattern, "*"):
			stem := pattern[:len(pattern)-1]
			if stem != "" && strings.HasPrefix(vmid, stem) {
				return true
			}
		case pattern == vmid:
			return true
		}
	}
	return false
}
|
|
|
|
// normalizeNodeDefaults ensures node threshold defaults exist
|
|
// Trigger=0 is allowed for Temperature and means "disable temperature alerting"
|
|
func normalizeNodeDefaults(config *AlertConfig) {
|
|
// Ensure temperature defaults exist for nodes so high temps alert out of the box
|
|
if config.NodeDefaults.Temperature == nil || config.NodeDefaults.Temperature.Trigger < 0 {
|
|
config.NodeDefaults.Temperature = &HysteresisThreshold{Trigger: 80, Clear: 75}
|
|
} else if config.NodeDefaults.Temperature.Trigger == 0 {
|
|
// Trigger=0 means disabled, set Clear=0 too
|
|
config.NodeDefaults.Temperature.Clear = 0
|
|
} else if config.NodeDefaults.Temperature.Clear <= 0 {
|
|
config.NodeDefaults.Temperature.Clear = config.NodeDefaults.Temperature.Trigger - 5
|
|
if config.NodeDefaults.Temperature.Clear <= 0 {
|
|
config.NodeDefaults.Temperature.Clear = 75
|
|
}
|
|
}
|
|
}
|
|
|
|
// normalizeHostDefaults ensures host agent threshold defaults exist
|
|
// Trigger=0 is allowed and means "disable alerting for this metric"
|
|
func normalizeHostDefaults(config *AlertConfig) {
|
|
if config.HostDefaults.CPU == nil || config.HostDefaults.CPU.Trigger < 0 {
|
|
config.HostDefaults.CPU = &HysteresisThreshold{Trigger: 80, Clear: 75}
|
|
} else if config.HostDefaults.CPU.Trigger == 0 {
|
|
// Trigger=0 means disabled, set Clear=0 too
|
|
config.HostDefaults.CPU.Clear = 0
|
|
} else if config.HostDefaults.CPU.Clear <= 0 {
|
|
config.HostDefaults.CPU.Clear = config.HostDefaults.CPU.Trigger - 5
|
|
if config.HostDefaults.CPU.Clear <= 0 {
|
|
config.HostDefaults.CPU.Clear = 75
|
|
}
|
|
}
|
|
if config.HostDefaults.Memory == nil || config.HostDefaults.Memory.Trigger < 0 {
|
|
config.HostDefaults.Memory = &HysteresisThreshold{Trigger: 85, Clear: 80}
|
|
} else if config.HostDefaults.Memory.Trigger == 0 {
|
|
// Trigger=0 means disabled, set Clear=0 too
|
|
config.HostDefaults.Memory.Clear = 0
|
|
} else if config.HostDefaults.Memory.Clear <= 0 {
|
|
config.HostDefaults.Memory.Clear = config.HostDefaults.Memory.Trigger - 5
|
|
if config.HostDefaults.Memory.Clear <= 0 {
|
|
config.HostDefaults.Memory.Clear = 80
|
|
}
|
|
}
|
|
if config.HostDefaults.Disk == nil || config.HostDefaults.Disk.Trigger < 0 {
|
|
config.HostDefaults.Disk = &HysteresisThreshold{Trigger: 90, Clear: 85}
|
|
} else if config.HostDefaults.Disk.Trigger == 0 {
|
|
// Trigger=0 means disabled, set Clear=0 too
|
|
config.HostDefaults.Disk.Clear = 0
|
|
} else if config.HostDefaults.Disk.Clear <= 0 {
|
|
config.HostDefaults.Disk.Clear = config.HostDefaults.Disk.Trigger - 5
|
|
if config.HostDefaults.Disk.Clear <= 0 {
|
|
config.HostDefaults.Disk.Clear = 85
|
|
}
|
|
}
|
|
|
|
if config.HostDefaults.DiskTemperature == nil || config.HostDefaults.DiskTemperature.Trigger < 0 {
|
|
config.HostDefaults.DiskTemperature = &HysteresisThreshold{Trigger: 55, Clear: 50}
|
|
} else if config.HostDefaults.DiskTemperature.Trigger == 0 {
|
|
config.HostDefaults.DiskTemperature.Clear = 0
|
|
} else if config.HostDefaults.DiskTemperature.Clear <= 0 {
|
|
config.HostDefaults.DiskTemperature.Clear = config.HostDefaults.DiskTemperature.Trigger - 5
|
|
if config.HostDefaults.DiskTemperature.Clear <= 0 {
|
|
config.HostDefaults.DiskTemperature.Clear = 50
|
|
}
|
|
}
|
|
ensureValidHysteresis(config.HostDefaults.DiskTemperature, "host.diskTemperature")
|
|
}
|
|
|
|
// normalizeGeneralSettings ensures general alert settings have valid values
|
|
func normalizeGeneralSettings(config *AlertConfig) {
|
|
if config.MinimumDelta <= 0 {
|
|
config.MinimumDelta = 2.0
|
|
}
|
|
if config.SuppressionWindow <= 0 {
|
|
config.SuppressionWindow = 5
|
|
}
|
|
if config.HysteresisMargin <= 0 {
|
|
config.HysteresisMargin = 5.0
|
|
}
|
|
if config.ObservationWindowHours <= 0 {
|
|
config.ObservationWindowHours = 24
|
|
}
|
|
}
|
|
|
|
// normalizeTimeThresholds ensures time threshold settings are valid
|
|
func normalizeTimeThresholds(config *AlertConfig) {
|
|
config.MetricTimeThresholds = normalizeMetricTimeThresholds(config.MetricTimeThresholds)
|
|
|
|
const defaultDelaySeconds = 5
|
|
if config.TimeThreshold <= 0 {
|
|
config.TimeThreshold = defaultDelaySeconds
|
|
}
|
|
if config.TimeThresholds == nil {
|
|
config.TimeThresholds = make(map[string]int)
|
|
}
|
|
ensureDelay := func(key string) {
|
|
delay, ok := config.TimeThresholds[key]
|
|
if !ok || delay < 0 {
|
|
config.TimeThresholds[key] = defaultDelaySeconds
|
|
}
|
|
}
|
|
ensureDelay("guest")
|
|
ensureDelay("node")
|
|
ensureDelay("storage")
|
|
ensureDelay("pbs")
|
|
ensureDelay("host")
|
|
if delay, ok := config.TimeThresholds["all"]; ok && delay < 0 {
|
|
config.TimeThresholds["all"] = defaultDelaySeconds
|
|
}
|
|
}
|
|
|
|
// migrateActivationState handles backward compatibility for activation state
|
|
func (m *Manager) migrateActivationState(config *AlertConfig) {
|
|
if config.ActivationState == "" {
|
|
// Determine if this is an existing installation or new
|
|
// Existing installations have active alerts already
|
|
isExistingInstall := len(m.activeAlerts) > 0 || len(config.Overrides) > 0
|
|
if isExistingInstall {
|
|
// Existing install: auto-activate to preserve behavior
|
|
config.ActivationState = ActivationActive
|
|
now := time.Now()
|
|
config.ActivationTime = &now
|
|
log.Info().Msg("Migrating existing installation to active alert state")
|
|
} else {
|
|
// New install: start in pending review
|
|
config.ActivationState = ActivationPending
|
|
log.Info().Msg("New installation: alerts pending activation")
|
|
}
|
|
}
|
|
}
|
|
|
|
// validateHysteresisThresholds ensures hysteresis thresholds won't cause stuck alerts
|
|
func validateHysteresisThresholds(config *AlertConfig) {
|
|
ensureValidHysteresis(config.GuestDefaults.CPU, "guest.cpu")
|
|
ensureValidHysteresis(config.GuestDefaults.Memory, "guest.memory")
|
|
ensureValidHysteresis(config.GuestDefaults.Disk, "guest.disk")
|
|
ensureValidHysteresis(config.NodeDefaults.CPU, "node.cpu")
|
|
ensureValidHysteresis(config.NodeDefaults.Memory, "node.memory")
|
|
ensureValidHysteresis(config.NodeDefaults.Temperature, "node.temperature")
|
|
ensureValidHysteresis(&config.StorageDefault, "storage")
|
|
}
|
|
|
|
// validateQuietHoursTimezone validates the timezone for quiet hours
|
|
func validateQuietHoursTimezone(config *AlertConfig) {
|
|
if config.Schedule.QuietHours.Enabled && config.Schedule.QuietHours.Timezone != "" {
|
|
_, err := time.LoadLocation(config.Schedule.QuietHours.Timezone)
|
|
if err != nil {
|
|
log.Error().
|
|
Err(err).
|
|
Str("timezone", config.Schedule.QuietHours.Timezone).
|
|
Msg("Invalid timezone in quiet hours config, disabling quiet hours")
|
|
// Disable quiet hours rather than silently using wrong timezone
|
|
config.Schedule.QuietHours.Enabled = false
|
|
}
|
|
}
|
|
}
|
|
|
|
// normalizeOverrides normalizes all threshold overrides
|
|
func normalizeOverrides(overrides map[string]ThresholdConfig) {
|
|
for id, override := range overrides {
|
|
override.PoweredOffSeverity = normalizePoweredOffSeverity(override.PoweredOffSeverity)
|
|
if override.Usage != nil {
|
|
override.Usage = ensureHysteresisThreshold(override.Usage)
|
|
}
|
|
overrides[id] = override
|
|
}
|
|
}
|
|
|
|
// normalizeMetricTimeThresholds returns a cleaned copy of input, a map of
// resource type -> metric -> delay.
//
// Resource-type and metric keys are trimmed and lowercased. Entries are
// dropped when the cleaned key is empty or the delay is negative (a delay of 0
// is kept); resource types left without metrics are omitted. The result is nil
// when input is empty or nothing survives. input itself is not modified.
//
// When several raw keys collapse onto the same cleaned key (for example "CPU"
// and "cpu"), they are applied in ascending byte order of the raw key, so the
// last one wins. Processing in sorted order keeps the result deterministic
// instead of depending on Go's randomized map iteration.
func normalizeMetricTimeThresholds(input map[string]map[string]int) map[string]map[string]int {
	if len(input) == 0 {
		return nil
	}

	rawTypes := make([]string, 0, len(input))
	for rawType := range input {
		rawTypes = append(rawTypes, rawType)
	}
	sort.Strings(rawTypes)

	normalized := make(map[string]map[string]int, len(input))
	for _, rawType := range rawTypes {
		metrics := input[rawType]
		typeKey := strings.ToLower(strings.TrimSpace(rawType))
		if typeKey == "" || len(metrics) == 0 {
			continue
		}

		rawMetrics := make([]string, 0, len(metrics))
		for rawMetric := range metrics {
			rawMetrics = append(rawMetrics, rawMetric)
		}
		sort.Strings(rawMetrics)

		for _, rawMetric := range rawMetrics {
			delay := metrics[rawMetric]
			metricKey := strings.ToLower(strings.TrimSpace(rawMetric))
			if metricKey == "" || delay < 0 {
				continue
			}
			// Create the inner map lazily so types with no valid metric
			// never appear in the result.
			bucket, exists := normalized[typeKey]
			if !exists {
				bucket = make(map[string]int)
				normalized[typeKey] = bucket
			}
			bucket[metricKey] = delay
		}
	}

	if len(normalized) == 0 {
		return nil
	}
	return normalized
}
|
|
|
|
// NormalizeMetricTimeThresholds exposes normalization for other packages (e.g., config persistence).
|
|
func NormalizeMetricTimeThresholds(input map[string]map[string]int) map[string]map[string]int {
|
|
return normalizeMetricTimeThresholds(input)
|
|
}
|
|
|
|
// NormalizeDockerIgnoredPrefixes cleans a list of ignored Docker container
// prefixes. Each entry is trimmed, blank entries are dropped, and duplicates
// are removed using a case-insensitive comparison. The first occurrence wins
// and keeps the user's original casing; input order is preserved. It returns
// nil when prefixes is empty or nothing survives.
func NormalizeDockerIgnoredPrefixes(prefixes []string) []string {
	if len(prefixes) == 0 {
		return nil
	}

	var (
		unique = make([]string, 0, len(prefixes))
		folded = make(map[string]struct{}, len(prefixes))
	)
	for i := range prefixes {
		candidate := strings.TrimSpace(prefixes[i])
		if candidate == "" {
			continue
		}
		// Compare on the lowercased form, but keep the original spelling.
		lowered := strings.ToLower(candidate)
		if _, dup := folded[lowered]; !dup {
			folded[lowered] = struct{}{}
			unique = append(unique, candidate)
		}
	}

	if len(unique) > 0 {
		return unique
	}
	return nil
}
|
|
|
|
// applyGlobalOfflineSettingsLocked clears tracking and active alerts for globally disabled offline detectors.
|
|
// Caller must hold m.mu.
|
|
func (m *Manager) applyGlobalOfflineSettingsLocked() {
|
|
if m.config.DisableAllNodesOffline {
|
|
var nodeAlerts []string
|
|
for alertID := range m.activeAlerts {
|
|
if strings.HasPrefix(alertID, "node-offline-") {
|
|
nodeAlerts = append(nodeAlerts, alertID)
|
|
}
|
|
}
|
|
for _, alertID := range nodeAlerts {
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
m.nodeOfflineCount = make(map[string]int)
|
|
}
|
|
|
|
if m.config.DisableAllPBSOffline {
|
|
var pbsAlerts []string
|
|
for alertID, alert := range m.activeAlerts {
|
|
if strings.HasPrefix(alertID, "pbs-offline-") {
|
|
pbsAlerts = append(pbsAlerts, alertID)
|
|
delete(m.offlineConfirmations, alert.ResourceID)
|
|
}
|
|
}
|
|
for _, alertID := range pbsAlerts {
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
}
|
|
|
|
if m.config.DisableAllGuestsOffline {
|
|
var guestAlerts []string
|
|
for alertID, alert := range m.activeAlerts {
|
|
if strings.HasPrefix(alertID, "guest-powered-off-") {
|
|
guestAlerts = append(guestAlerts, alertID)
|
|
delete(m.offlineConfirmations, alert.ResourceID)
|
|
}
|
|
}
|
|
for _, alertID := range guestAlerts {
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
}
|
|
|
|
if m.config.DisableAllDockerHostsOffline {
|
|
var hostAlerts []string
|
|
for alertID := range m.activeAlerts {
|
|
if strings.HasPrefix(alertID, "docker-host-offline-") {
|
|
hostAlerts = append(hostAlerts, alertID)
|
|
}
|
|
}
|
|
for _, alertID := range hostAlerts {
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
m.dockerOfflineCount = make(map[string]int)
|
|
}
|
|
|
|
if m.config.DisableAllDockerContainers {
|
|
var containerAlerts []string
|
|
for alertID := range m.activeAlerts {
|
|
if strings.HasPrefix(alertID, "docker-container-") {
|
|
containerAlerts = append(containerAlerts, alertID)
|
|
}
|
|
}
|
|
for _, alertID := range containerAlerts {
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
m.dockerStateConfirm = make(map[string]int)
|
|
m.dockerRestartTracking = make(map[string]*dockerRestartRecord)
|
|
m.dockerLastExitCode = make(map[string]int)
|
|
m.dockerUpdateFirstSeen = make(map[string]time.Time)
|
|
m.dockerUpdateFirstSeenByIdentity = make(map[string]time.Time)
|
|
}
|
|
if m.config.DockerDefaults.UpdateAlertDelayHours < 0 && !m.config.DisableAllDockerContainers {
|
|
m.clearDockerContainerUpdateAlertsLocked()
|
|
m.dockerUpdateFirstSeen = make(map[string]time.Time)
|
|
m.dockerUpdateFirstSeenByIdentity = make(map[string]time.Time)
|
|
}
|
|
if m.config.DisableAllDockerServices {
|
|
var serviceAlerts []string
|
|
for alertID := range m.activeAlerts {
|
|
if strings.HasPrefix(alertID, "docker-service-") {
|
|
serviceAlerts = append(serviceAlerts, alertID)
|
|
}
|
|
}
|
|
for _, alertID := range serviceAlerts {
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
}
|
|
}
|
|
|
|
// reevaluateActiveAlertsLocked re-evaluates all active alerts against the current configuration
|
|
// This should only be called with m.mu already locked
|
|
func (m *Manager) reevaluateActiveAlertsLocked() {
|
|
if len(m.activeAlerts) == 0 {
|
|
return
|
|
}
|
|
|
|
// Track alerts that should be resolved
|
|
alertsToResolve := make([]string, 0)
|
|
|
|
for alertID, alert := range m.activeAlerts {
|
|
resourceTypeMeta := ""
|
|
if alert.Metadata != nil {
|
|
if metaType, ok := alert.Metadata["resourceType"].(string); ok {
|
|
resourceTypeMeta = strings.ToLower(metaType)
|
|
}
|
|
}
|
|
|
|
if alert.Type == "docker-container-update" || strings.HasPrefix(alertID, "docker-container-update-") {
|
|
if m.shouldResolveDockerContainerUpdateAlertLocked(alert) {
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
}
|
|
continue
|
|
}
|
|
|
|
// Parse the alert ID to extract resource ID and metric type
|
|
// Alert ID format: {resourceID}-{metricType}
|
|
parts := strings.Split(alertID, "-")
|
|
if len(parts) < 2 {
|
|
continue
|
|
}
|
|
|
|
metricType := parts[len(parts)-1]
|
|
resourceID := strings.Join(parts[:len(parts)-1], "-")
|
|
|
|
// Get the appropriate threshold based on resource type and ID
|
|
var threshold *HysteresisThreshold
|
|
|
|
// Check for PMG alerts by Type
|
|
if alert.Type == "queue-depth" || alert.Type == "queue-deferred" || alert.Type == "queue-hold" || alert.Type == "message-age" {
|
|
// This is a PMG alert
|
|
if m.config.DisableAllPMG {
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
continue
|
|
}
|
|
}
|
|
|
|
// Check for Host alerts by resourceType
|
|
if resourceTypeMeta == "host" {
|
|
if m.config.DisableAllHosts {
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
continue
|
|
}
|
|
thresholds := m.config.HostDefaults
|
|
// Overrides are keyed by raw host ID (without the "host:" prefix
|
|
// that hostResourceID adds to the resource ID used in alert IDs).
|
|
rawHostID := strings.TrimPrefix(resourceID, "host:")
|
|
if override, exists := m.config.Overrides[rawHostID]; exists {
|
|
if override.Disabled {
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
continue
|
|
}
|
|
thresholds = m.applyThresholdOverride(thresholds, override)
|
|
}
|
|
threshold = getThresholdForMetric(thresholds, metricType)
|
|
}
|
|
|
|
if alert.Type == "docker-host-offline" ||
|
|
strings.HasPrefix(alertID, "docker-container-health-") ||
|
|
strings.HasPrefix(alertID, "docker-container-state-") ||
|
|
strings.HasPrefix(alertID, "docker-container-restart-loop-") ||
|
|
strings.HasPrefix(alertID, "docker-container-oom-") ||
|
|
strings.HasPrefix(alertID, "docker-container-memory-limit-") {
|
|
// Non-metric Docker alerts are not governed by thresholds
|
|
continue
|
|
}
|
|
|
|
if resourceTypeMeta == "dockerhost" {
|
|
// Check if all Docker host alerts are disabled
|
|
if m.config.DisableAllDockerHosts {
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
continue
|
|
}
|
|
// No threshold evaluation for Docker hosts (connectivity handled separately)
|
|
continue
|
|
}
|
|
if resourceTypeMeta == "docker container" {
|
|
// Check if all Docker container alerts are disabled
|
|
if m.config.DisableAllDockerContainers {
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
continue
|
|
}
|
|
containerName := strings.ToLower(strings.TrimSpace(alert.ResourceName))
|
|
containerID := ""
|
|
if alert.Metadata != nil {
|
|
if val, ok := alert.Metadata["containerId"].(string); ok {
|
|
containerID = strings.ToLower(strings.TrimSpace(val))
|
|
}
|
|
if val, ok := alert.Metadata["containerName"].(string); ok && containerName == "" {
|
|
containerName = strings.ToLower(strings.TrimSpace(val))
|
|
}
|
|
}
|
|
if matchesDockerIgnoredPrefix(containerName, containerID, m.config.DockerIgnoredContainerPrefixes) {
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
continue
|
|
}
|
|
thresholds := ThresholdConfig{
|
|
CPU: cloneThreshold(&m.config.DockerDefaults.CPU),
|
|
Memory: cloneThreshold(&m.config.DockerDefaults.Memory),
|
|
Disk: cloneThreshold(&m.config.DockerDefaults.Disk),
|
|
}
|
|
if override, exists := m.config.Overrides[resourceID]; exists {
|
|
if override.Disabled {
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
continue
|
|
}
|
|
thresholds = m.applyThresholdOverride(thresholds, override)
|
|
}
|
|
threshold = getThresholdForMetric(thresholds, metricType)
|
|
}
|
|
|
|
// Determine the resource type from the alert's metadata or instance
|
|
// We need to check what kind of resource this is
|
|
if threshold == nil && !strings.Contains(resourceID, ":") && (alert.Instance == "Node" || alert.Instance == alert.Node) {
|
|
// This is a node alert
|
|
// Check if all node alerts are disabled
|
|
if m.config.DisableAllNodes {
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
continue
|
|
}
|
|
thresholds := m.config.NodeDefaults
|
|
if override, exists := m.config.Overrides[resourceID]; exists {
|
|
thresholds = m.applyThresholdOverride(thresholds, override)
|
|
}
|
|
threshold = getThresholdForMetric(thresholds, metricType)
|
|
} else if threshold == nil && (resourceTypeMeta == "storage" || alert.Instance == "Storage" || strings.Contains(alert.ResourceID, ":storage/")) {
|
|
// This is a storage alert
|
|
// Check if all storage alerts are disabled
|
|
if m.config.DisableAllStorage {
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
continue
|
|
}
|
|
if override, exists := m.config.Overrides[resourceID]; exists && override.Usage != nil {
|
|
threshold = override.Usage
|
|
} else {
|
|
threshold = &m.config.StorageDefault
|
|
}
|
|
} else if threshold == nil && alert.Instance == "PBS" {
|
|
// This is a PBS alert
|
|
// Check if all PBS alerts are disabled
|
|
if m.config.DisableAllPBS {
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
continue
|
|
}
|
|
thresholds := m.config.PBSDefaults
|
|
if override, exists := m.config.Overrides[resourceID]; exists {
|
|
if override.CPU != nil && metricType == "cpu" {
|
|
threshold = ensureHysteresisThreshold(override.CPU)
|
|
} else if override.Memory != nil && metricType == "memory" {
|
|
threshold = ensureHysteresisThreshold(override.Memory)
|
|
}
|
|
}
|
|
if threshold == nil {
|
|
threshold = getThresholdForMetric(thresholds, metricType)
|
|
}
|
|
}
|
|
|
|
if threshold == nil {
|
|
// This is a guest (qemu/lxc) alert
|
|
// Check if all guest alerts are disabled
|
|
if m.config.DisableAllGuests {
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
continue
|
|
}
|
|
// We need to evaluate custom rules, but we don't have the guest object here.
|
|
// For now, we'll mark these alerts for re-evaluation by the monitor.
|
|
// The next poll cycle will properly evaluate them with custom rules.
|
|
|
|
// Check if there's an override for this specific guest
|
|
if override, exists := m.config.Overrides[resourceID]; exists {
|
|
if override.Disabled {
|
|
// Alert is now disabled for this resource, resolve it
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
continue
|
|
}
|
|
threshold = getThresholdForMetricFromConfig(override, metricType)
|
|
}
|
|
|
|
// If no override or override doesn't have this metric, use defaults
|
|
// Note: This doesn't consider custom rules - those will be evaluated
|
|
// on the next poll cycle when we have the full guest object
|
|
if threshold == nil {
|
|
threshold = getThresholdForMetric(m.config.GuestDefaults, metricType)
|
|
}
|
|
}
|
|
|
|
// If no threshold found or threshold is disabled (trigger <= 0), resolve the alert
|
|
if threshold == nil || threshold.Trigger <= 0 {
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
continue
|
|
}
|
|
|
|
// Check if current value is now below the clear threshold
|
|
clearThreshold := threshold.Clear
|
|
if clearThreshold <= 0 {
|
|
clearThreshold = threshold.Trigger
|
|
}
|
|
|
|
if alert.Value <= clearThreshold {
|
|
// Alert should be resolved due to new threshold
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Float64("value", alert.Value).
|
|
Float64("oldThreshold", alert.Threshold).
|
|
Float64("newClearThreshold", clearThreshold).
|
|
Msg("Resolving alert due to threshold change")
|
|
} else if alert.Value < threshold.Trigger {
|
|
// Value is between clear and trigger thresholds after config change
|
|
// Resolve it to prevent confusion
|
|
alertsToResolve = append(alertsToResolve, alertID)
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Float64("value", alert.Value).
|
|
Float64("newTrigger", threshold.Trigger).
|
|
Float64("newClear", clearThreshold).
|
|
Msg("Resolving alert - value now below trigger threshold after config change")
|
|
}
|
|
}
|
|
|
|
// Resolve all alerts that should be cleared
|
|
for _, alertID := range alertsToResolve {
|
|
if alert, exists := m.activeAlerts[alertID]; exists {
|
|
resolvedAlert := &ResolvedAlert{
|
|
Alert: alert,
|
|
ResolvedTime: time.Now(),
|
|
}
|
|
|
|
// Remove any pending notification tracking for this alert since it's no longer valid.
|
|
if _, isPending := m.pendingAlerts[alertID]; isPending {
|
|
delete(m.pendingAlerts, alertID)
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Msg("Cleared pending alert after configuration update")
|
|
}
|
|
|
|
// Remove from active alerts
|
|
m.removeActiveAlertNoLock(alertID)
|
|
|
|
// Add to recently resolved while respecting lock ordering
|
|
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
|
|
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Msg("Alert auto-resolved after configuration change")
|
|
|
|
m.safeCallResolvedCallback(alertID, true)
|
|
}
|
|
}
|
|
|
|
// Save updated active alerts if any were resolved
|
|
if len(alertsToResolve) > 0 {
|
|
go func() {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (config update)")
|
|
}
|
|
}()
|
|
if err := m.SaveActiveAlerts(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to save active alerts after config update")
|
|
}
|
|
}()
|
|
}
|
|
}
|
|
|
|
// ReevaluateGuestAlert reevaluates a specific guest's alerts with full threshold resolution including custom rules
|
|
// This should be called by the monitor with the current guest state
|
|
func (m *Manager) ReevaluateGuestAlert(guest interface{}, guestID string) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Get the correct thresholds for this guest (includes custom rules evaluation)
|
|
thresholds := m.getGuestThresholds(guest, guestID)
|
|
|
|
// Check all metric types for this guest
|
|
metricTypes := []string{"cpu", "memory", "disk", "diskRead", "diskWrite", "networkIn", "networkOut"}
|
|
|
|
for _, metricType := range metricTypes {
|
|
alertID := fmt.Sprintf("%s-%s", guestID, metricType)
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
continue
|
|
}
|
|
|
|
// Get the threshold for this metric
|
|
var threshold *HysteresisThreshold
|
|
switch metricType {
|
|
case "cpu":
|
|
threshold = thresholds.CPU
|
|
case "memory":
|
|
threshold = thresholds.Memory
|
|
case "disk":
|
|
threshold = thresholds.Disk
|
|
case "diskRead":
|
|
threshold = thresholds.DiskRead
|
|
case "diskWrite":
|
|
threshold = thresholds.DiskWrite
|
|
case "networkIn":
|
|
threshold = thresholds.NetworkIn
|
|
case "networkOut":
|
|
threshold = thresholds.NetworkOut
|
|
}
|
|
|
|
// If threshold is disabled or doesn't exist, clear the alert
|
|
if threshold == nil || threshold.Trigger <= 0 {
|
|
m.clearAlertNoLock(alertID)
|
|
// Also clear any pending alert for this metric
|
|
if _, isPending := m.pendingAlerts[alertID]; isPending {
|
|
delete(m.pendingAlerts, alertID)
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Msg("Cleared pending alert - threshold disabled")
|
|
}
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Str("metric", metricType).
|
|
Msg("Cleared alert - threshold disabled")
|
|
continue
|
|
}
|
|
|
|
// Check if alert should be cleared based on new threshold
|
|
clearThreshold := threshold.Clear
|
|
if clearThreshold <= 0 {
|
|
clearThreshold = threshold.Trigger
|
|
}
|
|
|
|
if alert.Value <= clearThreshold || alert.Value < threshold.Trigger {
|
|
m.clearAlertNoLock(alertID)
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Str("metric", metricType).
|
|
Float64("value", alert.Value).
|
|
Float64("trigger", threshold.Trigger).
|
|
Float64("clear", clearThreshold).
|
|
Msg("Cleared alert - value now below threshold after config change")
|
|
}
|
|
}
|
|
}
|
|
|
|
// getThresholdForMetric returns the threshold for a specific metric type from a ThresholdConfig
|
|
func getThresholdForMetric(config ThresholdConfig, metricType string) *HysteresisThreshold {
|
|
switch metricType {
|
|
case "cpu":
|
|
return config.CPU
|
|
case "memory":
|
|
return config.Memory
|
|
case "disk":
|
|
return config.Disk
|
|
case "diskRead":
|
|
return config.DiskRead
|
|
case "diskWrite":
|
|
return config.DiskWrite
|
|
case "networkIn":
|
|
return config.NetworkIn
|
|
case "networkOut":
|
|
return config.NetworkOut
|
|
case "temperature":
|
|
return config.Temperature
|
|
case "usage":
|
|
return config.Usage
|
|
default:
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// getThresholdForMetricFromConfig returns the threshold for a specific metric type from a ThresholdConfig
|
|
// ensuring hysteresis is properly set
|
|
func getThresholdForMetricFromConfig(config ThresholdConfig, metricType string) *HysteresisThreshold {
|
|
var threshold *HysteresisThreshold
|
|
switch metricType {
|
|
case "cpu":
|
|
if config.CPU != nil {
|
|
threshold = ensureHysteresisThreshold(config.CPU)
|
|
}
|
|
case "memory":
|
|
if config.Memory != nil {
|
|
threshold = ensureHysteresisThreshold(config.Memory)
|
|
}
|
|
case "disk":
|
|
if config.Disk != nil {
|
|
threshold = ensureHysteresisThreshold(config.Disk)
|
|
}
|
|
case "diskRead":
|
|
if config.DiskRead != nil {
|
|
threshold = ensureHysteresisThreshold(config.DiskRead)
|
|
}
|
|
case "diskWrite":
|
|
if config.DiskWrite != nil {
|
|
threshold = ensureHysteresisThreshold(config.DiskWrite)
|
|
}
|
|
case "networkIn":
|
|
if config.NetworkIn != nil {
|
|
threshold = ensureHysteresisThreshold(config.NetworkIn)
|
|
}
|
|
case "networkOut":
|
|
if config.NetworkOut != nil {
|
|
threshold = ensureHysteresisThreshold(config.NetworkOut)
|
|
}
|
|
case "temperature":
|
|
if config.Temperature != nil {
|
|
threshold = ensureHysteresisThreshold(config.Temperature)
|
|
}
|
|
case "usage":
|
|
if config.Usage != nil {
|
|
threshold = ensureHysteresisThreshold(config.Usage)
|
|
}
|
|
}
|
|
return threshold
|
|
}
|
|
|
|
// isInQuietHours checks if the current time is within quiet hours
|
|
func (m *Manager) isInQuietHours() bool {
|
|
if !m.config.Schedule.QuietHours.Enabled {
|
|
return false
|
|
}
|
|
|
|
// Use cached location if available
|
|
loc := m.quietHoursLoc
|
|
if loc == nil {
|
|
// Fallback to loading if not cached yet (shouldn't happen with UpdateConfig)
|
|
var err error
|
|
loc, err = time.LoadLocation(m.config.Schedule.QuietHours.Timezone)
|
|
if err != nil {
|
|
log.Warn().Err(err).Str("timezone", m.config.Schedule.QuietHours.Timezone).Msg("Failed to load timezone, using local time")
|
|
loc = time.Local
|
|
}
|
|
m.quietHoursLoc = loc
|
|
}
|
|
|
|
now := time.Now().In(loc)
|
|
dayName := strings.ToLower(now.Format("Monday"))
|
|
|
|
// Check if today is enabled for quiet hours
|
|
if enabled, ok := m.config.Schedule.QuietHours.Days[dayName]; !ok || !enabled {
|
|
return false
|
|
}
|
|
|
|
// Parse start and end times
|
|
startTime, err := time.ParseInLocation("15:04", m.config.Schedule.QuietHours.Start, loc)
|
|
if err != nil {
|
|
log.Warn().Err(err).Str("start", m.config.Schedule.QuietHours.Start).Msg("Failed to parse quiet hours start time")
|
|
return false
|
|
}
|
|
|
|
endTime, err := time.ParseInLocation("15:04", m.config.Schedule.QuietHours.End, loc)
|
|
if err != nil {
|
|
log.Warn().Err(err).Str("end", m.config.Schedule.QuietHours.End).Msg("Failed to parse quiet hours end time")
|
|
return false
|
|
}
|
|
|
|
// Set to today's date
|
|
startTime = time.Date(now.Year(), now.Month(), now.Day(), startTime.Hour(), startTime.Minute(), 0, 0, loc)
|
|
endTime = time.Date(now.Year(), now.Month(), now.Day(), endTime.Hour(), endTime.Minute(), 0, 0, loc)
|
|
|
|
// Handle overnight quiet hours (e.g., 22:00 to 08:00)
|
|
if endTime.Before(startTime) {
|
|
// If we're past the start time or before the end time
|
|
if now.After(startTime) || now.Before(endTime) {
|
|
return true
|
|
}
|
|
} else {
|
|
// Normal case (e.g., 08:00 to 17:00)
|
|
if now.After(startTime) && now.Before(endTime) {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
func quietHoursCategoryForAlert(alert *Alert) string {
|
|
if alert == nil {
|
|
return ""
|
|
}
|
|
|
|
switch alert.Type {
|
|
case "cpu", "memory", "disk", "diskRead", "diskWrite", "networkIn", "networkOut", "temperature":
|
|
return "performance"
|
|
case "queue-depth", "queue-deferred", "queue-hold", "message-age",
|
|
"docker-container-health", "docker-container-restart-loop",
|
|
"docker-container-oom-kill", "docker-container-memory-limit":
|
|
return "performance"
|
|
case "usage", "disk-health", "disk-wearout", "zfs-pool-state", "zfs-pool-errors", "zfs-device":
|
|
return "storage"
|
|
case "connectivity", "offline", "powered-off", "docker-host-offline":
|
|
return "offline"
|
|
}
|
|
|
|
if strings.HasPrefix(alert.Type, "docker-container-") {
|
|
if alert.Type == "docker-container-state" {
|
|
return "offline"
|
|
}
|
|
return "performance"
|
|
}
|
|
|
|
return ""
|
|
}
|
|
|
|
func (m *Manager) shouldSuppressNotification(alert *Alert) (bool, string) {
|
|
if alert == nil {
|
|
return false, ""
|
|
}
|
|
|
|
if !m.isInQuietHours() {
|
|
return false, ""
|
|
}
|
|
|
|
if alert.Level != AlertLevelCritical {
|
|
return true, "non-critical"
|
|
}
|
|
|
|
category := quietHoursCategoryForAlert(alert)
|
|
switch category {
|
|
case "performance":
|
|
if m.config.Schedule.QuietHours.Suppress.Performance {
|
|
return true, category
|
|
}
|
|
case "storage":
|
|
if m.config.Schedule.QuietHours.Suppress.Storage {
|
|
return true, category
|
|
}
|
|
case "offline":
|
|
if m.config.Schedule.QuietHours.Suppress.Offline {
|
|
return true, category
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
// ShouldSuppressResolvedNotification checks if a recovery notification should be suppressed
|
|
// during quiet hours. Recovery notifications follow the same quiet hours rules as their
|
|
// corresponding alerts - if the original alert would have been suppressed, so is the recovery.
|
|
func (m *Manager) ShouldSuppressResolvedNotification(alert *Alert) bool {
|
|
if alert == nil {
|
|
return false
|
|
}
|
|
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
suppressed, reason := m.shouldSuppressNotification(alert)
|
|
if suppressed {
|
|
log.Debug().
|
|
Str("alertID", alert.ID).
|
|
Str("type", alert.Type).
|
|
Str("level", string(alert.Level)).
|
|
Str("quietHoursRule", reason).
|
|
Msg("Recovery notification suppressed during quiet hours")
|
|
}
|
|
return suppressed
|
|
}
|
|
|
|
// shouldNotifyAfterCooldown checks if enough time has passed since the last notification
|
|
// Returns true if notification should be sent, false if still in cooldown period
|
|
func (m *Manager) shouldNotifyAfterCooldown(alert *Alert) bool {
|
|
// If cooldown is 0 or negative, always allow notifications
|
|
if m.config.Schedule.Cooldown <= 0 {
|
|
return true
|
|
}
|
|
|
|
// If this is the first notification, allow it
|
|
if alert.LastNotified == nil {
|
|
return true
|
|
}
|
|
|
|
// Check if enough time has passed
|
|
cooldownDuration := time.Duration(m.config.Schedule.Cooldown) * time.Minute
|
|
timeSinceLastNotification := time.Since(*alert.LastNotified)
|
|
|
|
return timeSinceLastNotification >= cooldownDuration
|
|
}
|
|
|
|
// GetConfig returns the current alert configuration
|
|
func (m *Manager) GetConfig() AlertConfig {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
return m.config
|
|
}
|
|
|
|
// CheckGuest checks a guest (VM or container) against thresholds
|
|
func (m *Manager) CheckGuest(guest interface{}, instanceName string) {
|
|
m.mu.RLock()
|
|
enabled := m.config.Enabled
|
|
disableAllGuests := m.config.DisableAllGuests
|
|
disableAllGuestsOffline := m.config.DisableAllGuestsOffline
|
|
ignoredGuestPrefixes := m.config.IgnoredGuestPrefixes
|
|
guestTagWhitelist := m.config.GuestTagWhitelist
|
|
guestTagBlacklist := m.config.GuestTagBlacklist
|
|
m.mu.RUnlock()
|
|
|
|
if !enabled {
|
|
log.Debug().Msg("CheckGuest: alerts disabled globally")
|
|
return
|
|
}
|
|
if disableAllGuests {
|
|
log.Debug().Msg("CheckGuest: all guest alerts disabled")
|
|
return
|
|
}
|
|
|
|
var guestID, name, node, guestType, status string
|
|
var cpu, memUsage, diskUsage float64
|
|
var diskRead, diskWrite, netIn, netOut int64
|
|
var disks []models.Disk
|
|
var tags []string
|
|
|
|
// Extract data based on guest type
|
|
switch g := guest.(type) {
|
|
case models.VM:
|
|
guestID = g.ID
|
|
name = g.Name
|
|
node = g.Node
|
|
status = g.Status
|
|
guestType = "VM"
|
|
cpu = g.CPU * 100 // Convert to percentage
|
|
memUsage = g.Memory.Usage
|
|
diskUsage = g.Disk.Usage
|
|
diskRead = g.DiskRead
|
|
diskWrite = g.DiskWrite
|
|
netIn = g.NetworkIn
|
|
netOut = g.NetworkOut
|
|
disks = g.Disks
|
|
if len(g.Tags) > 0 {
|
|
tags = append(tags, g.Tags...)
|
|
}
|
|
|
|
// Debug logging for high memory VMs
|
|
if memUsage > 85 {
|
|
log.Debug().
|
|
Str("vm", name).
|
|
Float64("memUsage", memUsage).
|
|
Str("status", status).
|
|
Msg("VM with high memory detected in CheckGuest")
|
|
}
|
|
case models.Container:
|
|
guestID = g.ID
|
|
name = g.Name
|
|
node = g.Node
|
|
status = g.Status
|
|
guestType = "Container"
|
|
cpu = g.CPU * 100 // Convert to percentage
|
|
memUsage = g.Memory.Usage
|
|
diskUsage = g.Disk.Usage
|
|
diskRead = g.DiskRead
|
|
diskWrite = g.DiskWrite
|
|
netIn = g.NetworkIn
|
|
netOut = g.NetworkOut
|
|
disks = g.Disks
|
|
if len(g.Tags) > 0 {
|
|
tags = append(tags, g.Tags...)
|
|
}
|
|
default:
|
|
log.Debug().
|
|
Str("type", fmt.Sprintf("%T", guest)).
|
|
Msg("CheckGuest: unsupported guest type")
|
|
return
|
|
}
|
|
|
|
// Check ignored prefixes
|
|
for _, prefix := range ignoredGuestPrefixes {
|
|
if prefix != "" && strings.HasPrefix(name, prefix) {
|
|
if cleared := m.suppressGuestAlerts(guestID); cleared {
|
|
m.saveActiveAlertsAsync("ignored-prefix")
|
|
}
|
|
return
|
|
}
|
|
}
|
|
|
|
settings := parsePulseTags(tags)
|
|
if settings.Suppress {
|
|
if cleared := m.suppressGuestAlerts(guestID); cleared {
|
|
m.saveActiveAlertsAsync("pulse-no-alerts")
|
|
}
|
|
log.Debug().
|
|
Str("guestID", guestID).
|
|
Msg("Pulse no-alerts tag active; suppressing guest alerts")
|
|
return
|
|
}
|
|
|
|
// Custom Tag Filtering
|
|
if len(guestTagBlacklist) > 0 || len(guestTagWhitelist) > 0 {
|
|
// Normalize tags once for checking
|
|
normalizedTags := make(map[string]bool)
|
|
for _, tag := range tags {
|
|
normalizedTags[strings.ToLower(strings.TrimSpace(tag))] = true
|
|
}
|
|
|
|
// Check Blacklist
|
|
for _, block := range guestTagBlacklist {
|
|
if normalizedTags[strings.ToLower(strings.TrimSpace(block))] {
|
|
if cleared := m.suppressGuestAlerts(guestID); cleared {
|
|
m.saveActiveAlertsAsync("tag-blacklist")
|
|
}
|
|
log.Debug().Str("guestID", guestID).Msg("Guest suppressed by tag blacklist")
|
|
return
|
|
}
|
|
}
|
|
|
|
// Check Whitelist
|
|
if len(guestTagWhitelist) > 0 {
|
|
found := false
|
|
for _, allow := range guestTagWhitelist {
|
|
if normalizedTags[strings.ToLower(strings.TrimSpace(allow))] {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found {
|
|
if cleared := m.suppressGuestAlerts(guestID); cleared {
|
|
m.saveActiveAlertsAsync("tag-whitelist")
|
|
}
|
|
log.Debug().Str("guestID", guestID).Msg("Guest suppressed by tag whitelist (required tag not found)")
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
monitorOnly := settings.MonitorOnly
|
|
if monitorOnly || m.guestHasMonitorOnlyAlerts(guestID) {
|
|
log.Debug().
|
|
Str("guest", name).
|
|
Bool("monitorOnly", monitorOnly).
|
|
Msg("Pulse monitor-only status applied")
|
|
}
|
|
|
|
// Handle non-running guests
|
|
// Proxmox VM states: running, stopped, paused, suspended
|
|
if status != "running" {
|
|
// Check for powered-off state and generate alert if configured
|
|
if status == "stopped" {
|
|
if disableAllGuestsOffline {
|
|
// Clear any pending powered-off tracking and alerts when globally disabled
|
|
m.mu.Lock()
|
|
delete(m.offlineConfirmations, guestID)
|
|
m.mu.Unlock()
|
|
m.clearAlert(fmt.Sprintf("guest-powered-off-%s", guestID))
|
|
} else {
|
|
m.checkGuestPoweredOff(guestID, name, node, instanceName, guestType, monitorOnly)
|
|
}
|
|
} else {
|
|
// For paused/suspended, clear powered-off alert
|
|
m.clearGuestPoweredOffAlert(guestID, name)
|
|
}
|
|
|
|
// Clear all resource metric alerts (cpu, memory, disk, etc.) for non-running guests
|
|
m.mu.Lock()
|
|
alertsCleared := 0
|
|
for alertID, alert := range m.activeAlerts {
|
|
// Only clear resource metric alerts, not powered-off alerts
|
|
if alert.ResourceID == guestID && alert.Type != "powered-off" {
|
|
m.clearAlertNoLock(alertID)
|
|
alertsCleared++
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Str("guest", name).
|
|
Str("status", status).
|
|
Msg("Cleared metric alert for non-running guest")
|
|
}
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
if alertsCleared > 0 {
|
|
log.Debug().
|
|
Str("guest", name).
|
|
Str("status", status).
|
|
Int("alertsCleared", alertsCleared).
|
|
Msg("Cleared metric alerts for non-running guest")
|
|
}
|
|
return
|
|
}
|
|
|
|
// If guest is running, clear any powered-off alert
|
|
m.clearGuestPoweredOffAlert(guestID, name)
|
|
|
|
// Get thresholds (check custom rules, then overrides, then defaults)
|
|
m.mu.RLock()
|
|
thresholds := m.getGuestThresholds(guest, guestID)
|
|
m.mu.RUnlock()
|
|
|
|
if settings.Relaxed {
|
|
thresholds = applyRelaxedGuestThresholds(thresholds)
|
|
log.Info().
|
|
Str("guest", name).
|
|
Float64("trigger", thresholds.CPU.Trigger).
|
|
Msg("Applied relaxed thresholds for pulse-relaxed tag")
|
|
}
|
|
|
|
// If alerts are disabled for this guest, clear any existing alerts and return
|
|
if thresholds.Disabled {
|
|
m.mu.Lock()
|
|
for alertID, alert := range m.activeAlerts {
|
|
if alert.ResourceID == guestID {
|
|
m.clearAlertNoLock(alertID)
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Str("guest", name).
|
|
Msg("Cleared alert - guest has alerts disabled")
|
|
}
|
|
}
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
// Check each metric
|
|
log.Debug().
|
|
Str("guest", name).
|
|
Float64("cpu", cpu).
|
|
Float64("memory", memUsage).
|
|
Float64("disk", diskUsage).
|
|
Interface("thresholds", thresholds).
|
|
Msg("Checking guest thresholds")
|
|
|
|
// Check thresholds (checkMetric will skip if threshold is nil or <= 0)
|
|
cpuOpts := &metricOptions{MonitorOnly: monitorOnly}
|
|
memOpts := &metricOptions{MonitorOnly: monitorOnly}
|
|
diskOpts := &metricOptions{MonitorOnly: monitorOnly}
|
|
|
|
if !monitorOnly {
|
|
cpuOpts = nil
|
|
memOpts = nil
|
|
diskOpts = nil
|
|
}
|
|
|
|
m.checkMetric(guestID, name, node, instanceName, guestType, "cpu", cpu, thresholds.CPU, cpuOpts)
|
|
m.checkMetric(guestID, name, node, instanceName, guestType, "memory", memUsage, thresholds.Memory, memOpts)
|
|
m.checkMetric(guestID, name, node, instanceName, guestType, "disk", diskUsage, thresholds.Disk, diskOpts)
|
|
|
|
if thresholds.Disk != nil && thresholds.Disk.Trigger > 0 && len(disks) > 0 {
|
|
seenDisks := make(map[string]struct{})
|
|
for idx, disk := range disks {
|
|
if disk.Total <= 0 {
|
|
continue
|
|
}
|
|
if disk.Usage < 0 {
|
|
continue
|
|
}
|
|
|
|
label := strings.TrimSpace(disk.Mountpoint)
|
|
if label == "" {
|
|
label = strings.TrimSpace(disk.Device)
|
|
}
|
|
if label == "" {
|
|
label = fmt.Sprintf("Disk %d", idx+1)
|
|
}
|
|
|
|
keySource := label
|
|
if disk.Device != "" && !strings.EqualFold(disk.Device, label) {
|
|
keySource = fmt.Sprintf("%s-%s", label, disk.Device)
|
|
}
|
|
sanitizedKey := sanitizeAlertKey(keySource)
|
|
if sanitizedKey == "" {
|
|
sanitizedKey = fmt.Sprintf("disk-%d", idx+1)
|
|
}
|
|
|
|
// Avoid duplicate checks if two disks resolve to the same key
|
|
if _, exists := seenDisks[sanitizedKey]; exists {
|
|
continue
|
|
}
|
|
seenDisks[sanitizedKey] = struct{}{}
|
|
|
|
perDiskResourceID := fmt.Sprintf("%s-disk-%s", guestID, sanitizedKey)
|
|
message := fmt.Sprintf("%s disk (%s) at %.1f%%", guestType, label, disk.Usage)
|
|
|
|
log.Debug().
|
|
Str("guest", name).
|
|
Str("node", node).
|
|
Str("instance", instanceName).
|
|
Str("diskLabel", label).
|
|
Float64("usage", disk.Usage).
|
|
Msg("Evaluating individual disk for alert thresholds")
|
|
|
|
metadata := map[string]interface{}{
|
|
"mountpoint": disk.Mountpoint,
|
|
"device": disk.Device,
|
|
"diskType": disk.Type,
|
|
"totalBytes": disk.Total,
|
|
"usedBytes": disk.Used,
|
|
"freeBytes": disk.Free,
|
|
"diskIndex": idx,
|
|
"label": label,
|
|
}
|
|
|
|
m.checkMetric(perDiskResourceID, name, node, instanceName, guestType, "disk", disk.Usage, thresholds.Disk, &metricOptions{
|
|
Metadata: metadata,
|
|
Message: message,
|
|
MonitorOnly: monitorOnly,
|
|
})
|
|
}
|
|
}
|
|
|
|
// Check I/O metrics (convert bytes/s to MB/s) - checkMetric will skip if threshold is nil or <= 0
|
|
// Check I/O metrics (convert bytes/s to MB/s)
|
|
// We call checkMetric unconditionally. If the threshold is nil or disabled (Trigger <= 0),
|
|
// checkMetric will automatically clear any existing alerts for that metric.
|
|
{
|
|
readOpts := &metricOptions{MonitorOnly: monitorOnly}
|
|
if !monitorOnly {
|
|
readOpts = nil
|
|
}
|
|
m.checkMetric(guestID, name, node, instanceName, guestType, "diskRead", float64(diskRead)/1024/1024, thresholds.DiskRead, readOpts)
|
|
}
|
|
|
|
{
|
|
writeOpts := &metricOptions{MonitorOnly: monitorOnly}
|
|
if !monitorOnly {
|
|
writeOpts = nil
|
|
}
|
|
m.checkMetric(guestID, name, node, instanceName, guestType, "diskWrite", float64(diskWrite)/1024/1024, thresholds.DiskWrite, writeOpts)
|
|
}
|
|
|
|
{
|
|
netInOpts := &metricOptions{MonitorOnly: monitorOnly}
|
|
if !monitorOnly {
|
|
netInOpts = nil
|
|
}
|
|
m.checkMetric(guestID, name, node, instanceName, guestType, "networkIn", float64(netIn)/1024/1024, thresholds.NetworkIn, netInOpts)
|
|
}
|
|
|
|
{
|
|
netOutOpts := &metricOptions{MonitorOnly: monitorOnly}
|
|
if !monitorOnly {
|
|
netOutOpts = nil
|
|
}
|
|
m.checkMetric(guestID, name, node, instanceName, guestType, "networkOut", float64(netOut)/1024/1024, thresholds.NetworkOut, netOutOpts)
|
|
}
|
|
}
|
|
|
|
// CheckNode checks a node against thresholds
|
|
func (m *Manager) CheckNode(node models.Node) {
|
|
// Cache display name so all alerts (including guest alerts on this node) can resolve it.
|
|
m.UpdateNodeDisplayName(node.Instance, node.Name, node.DisplayName)
|
|
|
|
m.mu.RLock()
|
|
if !m.config.Enabled {
|
|
m.mu.RUnlock()
|
|
return
|
|
}
|
|
if m.config.DisableAllNodes {
|
|
m.mu.RUnlock()
|
|
// Clear any existing node alerts when all node alerts are disabled
|
|
m.mu.Lock()
|
|
// Clear offline tracking
|
|
delete(m.nodeOfflineCount, node.ID)
|
|
// Clear all possible node alert types
|
|
alertTypes := []string{"cpu", "memory", "disk", "temperature"}
|
|
for _, alertType := range alertTypes {
|
|
alertID := fmt.Sprintf("%s-%s", node.ID, alertType)
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
m.clearAlertNoLock(alertID)
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Str("node", node.Name).
|
|
Msg("Cleared node alert - all node alerts disabled")
|
|
}
|
|
}
|
|
// Clear offline alert
|
|
offlineAlertID := fmt.Sprintf("node-offline-%s", node.ID)
|
|
if _, exists := m.activeAlerts[offlineAlertID]; exists {
|
|
m.clearAlertNoLock(offlineAlertID)
|
|
log.Info().
|
|
Str("alertID", offlineAlertID).
|
|
Str("node", node.Name).
|
|
Msg("Cleared offline alert - all node alerts disabled")
|
|
}
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
disableNodesOffline := m.config.DisableAllNodesOffline
|
|
thresholds := m.config.NodeDefaults
|
|
if override, exists := m.config.Overrides[node.ID]; exists {
|
|
thresholds = m.applyThresholdOverride(thresholds, override)
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
if disableNodesOffline {
|
|
// Clear tracking and any existing offline alerts when globally disabled
|
|
m.mu.Lock()
|
|
delete(m.nodeOfflineCount, node.ID)
|
|
m.mu.Unlock()
|
|
m.clearAlert(fmt.Sprintf("node-offline-%s", node.ID))
|
|
} else {
|
|
// CRITICAL: Check if node is offline first
|
|
if node.Status == "offline" || node.ConnectionHealth == "error" || node.ConnectionHealth == "failed" {
|
|
m.checkNodeOffline(node)
|
|
|
|
// Clear resource alerts if node is offline/unreachable.
|
|
// This prevents stale alerts from persisting when we can't get new data.
|
|
metrics := []string{"cpu", "memory", "disk", "temperature"}
|
|
for _, metric := range metrics {
|
|
m.clearAlert(fmt.Sprintf("%s-%s", node.ID, metric))
|
|
}
|
|
} else {
|
|
// Clear any existing offline alert if node is back online
|
|
m.clearNodeOfflineAlert(node)
|
|
|
|
// Check each metric (only if node is online and reachable)
|
|
// Check for host agent deduplication: if a host agent is running on this node,
|
|
// prefer the host agent alerts and skip node metric alerts to avoid duplicates.
|
|
if m.hasHostAgentForNode(node.Name) {
|
|
log.Debug().
|
|
Str("node", node.Name).
|
|
Msg("Skipping node metric alerts - host agent is monitoring this machine")
|
|
} else {
|
|
m.checkMetric(node.ID, node.Name, node.Name, node.Instance, "Node", "cpu", node.CPU*100, thresholds.CPU, nil)
|
|
m.checkMetric(node.ID, node.Name, node.Name, node.Instance, "Node", "memory", node.Memory.Usage, thresholds.Memory, nil)
|
|
m.checkMetric(node.ID, node.Name, node.Name, node.Instance, "Node", "disk", node.Disk.Usage, thresholds.Disk, nil)
|
|
|
|
// Check temperature if available
|
|
// We pass the check unconditionally so that if the threshold triggers are disabled (set to 0),
|
|
// any existing alerts will be properly cleared.
|
|
var temp float64
|
|
if node.Temperature != nil && node.Temperature.Available {
|
|
// Use CPU package temp if available, otherwise use max core temp
|
|
temp = node.Temperature.CPUPackage
|
|
if temp == 0 {
|
|
temp = node.Temperature.CPUMax
|
|
}
|
|
}
|
|
m.checkMetric(node.ID, node.Name, node.Name, node.Instance, "Node", "temperature", temp, thresholds.Temperature, nil)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// RegisterHostAgentHostname registers a host agent hostname for deduplication.
|
|
// When a host agent is actively monitoring a machine, we prefer its alerts
|
|
// over Proxmox node alerts to avoid duplicate monitoring of the same machine.
|
|
func (m *Manager) RegisterHostAgentHostname(hostname string) {
|
|
normalized := strings.ToLower(strings.TrimSpace(hostname))
|
|
if normalized == "" {
|
|
return
|
|
}
|
|
m.mu.Lock()
|
|
m.hostAgentHostnames[normalized] = struct{}{}
|
|
m.mu.Unlock()
|
|
|
|
log.Debug().
|
|
Str("hostname", hostname).
|
|
Msg("Registered host agent hostname for deduplication")
|
|
}
|
|
|
|
// UnregisterHostAgentHostname removes a host agent hostname from deduplication tracking.
|
|
func (m *Manager) UnregisterHostAgentHostname(hostname string) {
|
|
normalized := strings.ToLower(strings.TrimSpace(hostname))
|
|
if normalized == "" {
|
|
return
|
|
}
|
|
m.mu.Lock()
|
|
delete(m.hostAgentHostnames, normalized)
|
|
m.mu.Unlock()
|
|
|
|
log.Debug().
|
|
Str("hostname", hostname).
|
|
Msg("Unregistered host agent hostname from deduplication")
|
|
}
|
|
|
|
// hasHostAgentForNode checks if a host agent is monitoring a machine with the same
|
|
// hostname as the given Proxmox node. If so, we should suppress node alerts to
|
|
// avoid duplicate alerting.
|
|
func (m *Manager) hasHostAgentForNode(nodeName string) bool {
|
|
normalized := strings.ToLower(strings.TrimSpace(nodeName))
|
|
if normalized == "" {
|
|
return false
|
|
}
|
|
m.mu.RLock()
|
|
_, exists := m.hostAgentHostnames[normalized]
|
|
m.mu.RUnlock()
|
|
return exists
|
|
}
|
|
|
|
// nodeDisplayNameCacheKey builds the lookup key for the per-instance display
// name cache: the trimmed instance and the trimmed node name joined by a NUL
// byte.
func nodeDisplayNameCacheKey(instance, name string) string {
	parts := []string{strings.TrimSpace(instance), strings.TrimSpace(name)}
	return strings.Join(parts, "\x00")
}
|
|
|
|
// UpdateNodeDisplayName caches the display name for a node/host so alerts
|
|
// can resolve it without needing the full model object.
|
|
func (m *Manager) UpdateNodeDisplayName(instance, name, displayName string) {
|
|
instance = strings.TrimSpace(instance)
|
|
name = strings.TrimSpace(name)
|
|
if name == "" {
|
|
return
|
|
}
|
|
displayName = strings.TrimSpace(displayName)
|
|
m.mu.Lock()
|
|
if instance != "" {
|
|
key := nodeDisplayNameCacheKey(instance, name)
|
|
if displayName != "" && displayName != name {
|
|
m.instanceNodeDisplayNames[key] = displayName
|
|
} else {
|
|
delete(m.instanceNodeDisplayNames, key)
|
|
}
|
|
} else {
|
|
if displayName != "" && displayName != name {
|
|
m.nodeDisplayNames[name] = displayName
|
|
} else {
|
|
delete(m.nodeDisplayNames, name)
|
|
}
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
// resolveNodeDisplayName returns the cached display name for a node, or empty
|
|
// string if none is set. Caller must hold m.mu (read or write).
|
|
func (m *Manager) resolveNodeDisplayName(instance, node string) string {
|
|
if instance = strings.TrimSpace(instance); instance != "" {
|
|
if displayName, ok := m.instanceNodeDisplayNames[nodeDisplayNameCacheKey(instance, node)]; ok {
|
|
return displayName
|
|
}
|
|
}
|
|
return m.nodeDisplayNames[node]
|
|
}
|
|
|
|
// hostResourceID returns the alert resource ID for a host agent: "host:"
// followed by the trimmed host ID, or "host:unknown" when the ID is blank.
func hostResourceID(hostID string) string {
	id := strings.TrimSpace(hostID)
	if id == "" {
		id = "unknown"
	}
	return fmt.Sprintf("host:%s", id)
}
|
|
|
|
func hostDisplayName(host models.Host) string {
|
|
if name := strings.TrimSpace(host.DisplayName); name != "" {
|
|
return name
|
|
}
|
|
if name := strings.TrimSpace(host.Hostname); name != "" {
|
|
return name
|
|
}
|
|
if host.ID != "" {
|
|
return host.ID
|
|
}
|
|
return "Host"
|
|
}
|
|
|
|
func hostInstanceName(host models.Host) string {
|
|
if platform := strings.TrimSpace(host.Platform); platform != "" {
|
|
return platform
|
|
}
|
|
if osName := strings.TrimSpace(host.OSName); osName != "" {
|
|
return osName
|
|
}
|
|
return "Host Agent"
|
|
}
|
|
|
|
// sanitizeHostComponent normalizes an arbitrary string for use inside a
// resource or alert ID. The value is lower-cased and trimmed; every run of
// characters outside [a-z0-9] collapses to a single '-', and leading/trailing
// hyphens are dropped. If nothing remains, "unknown" is returned.
func sanitizeHostComponent(value string) string {
	const fallback = "unknown"

	lowered := strings.TrimSpace(strings.ToLower(value))
	if lowered == "" {
		return fallback
	}

	var out strings.Builder
	out.Grow(len(lowered))

	// A separator is emitted lazily, only between two alphanumeric groups, so
	// the result never starts or ends with a hyphen.
	needSep := false
	for _, ch := range lowered {
		alnum := (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')
		if !alnum {
			needSep = true
			continue
		}
		if needSep && out.Len() > 0 {
			out.WriteByte('-')
		}
		needSep = false
		out.WriteRune(ch)
	}

	if out.Len() == 0 {
		return fallback
	}
	return out.String()
}
|
|
|
|
// sanitizeRAIDDevice sanitizes RAID device names for use in resource IDs.
|
|
func sanitizeRAIDDevice(device string) string {
|
|
// Remove /dev/ prefix if present
|
|
device = strings.TrimPrefix(device, "/dev/")
|
|
return sanitizeHostComponent(device)
|
|
}
|
|
|
|
func hostMatchesVendorHint(host models.Host, hints ...string) bool {
|
|
fields := []string{
|
|
host.Platform,
|
|
host.OSName,
|
|
host.OSVersion,
|
|
host.DisplayName,
|
|
host.Hostname,
|
|
}
|
|
for _, field := range fields {
|
|
value := strings.ToLower(strings.TrimSpace(field))
|
|
if value == "" {
|
|
continue
|
|
}
|
|
for _, hint := range hints {
|
|
if strings.Contains(value, hint) {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func isSynologyLikeHost(host models.Host) bool {
|
|
return hostMatchesVendorHint(host, "synology", "dsm")
|
|
}
|
|
|
|
func isQNAPLikeHost(host models.Host) bool {
|
|
return hostMatchesVendorHint(host, "qnap", "qts", "quts")
|
|
}
|
|
|
|
func shouldSuppressHostRAIDArray(host models.Host, array models.HostRAIDArray) bool {
|
|
deviceLower := strings.ToLower(strings.TrimSpace(strings.TrimPrefix(array.Device, "/dev/")))
|
|
switch {
|
|
case deviceLower == "":
|
|
return false
|
|
case isSynologyLikeHost(host):
|
|
return deviceLower == "md0" || deviceLower == "md1"
|
|
case isQNAPLikeHost(host):
|
|
return deviceLower == "md9" || deviceLower == "md13"
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
func hostDiskResourceID(host models.Host, disk models.Disk) (string, string) {
|
|
label := strings.TrimSpace(disk.Mountpoint)
|
|
if label == "" {
|
|
label = strings.TrimSpace(disk.Device)
|
|
}
|
|
if label == "" {
|
|
label = "disk"
|
|
}
|
|
resourceID := fmt.Sprintf("%s/disk:%s", hostResourceID(host.ID), sanitizeHostComponent(label))
|
|
resourceName := fmt.Sprintf("%s (%s)", hostDisplayName(host), label)
|
|
return resourceID, resourceName
|
|
}
|
|
|
|
// CheckHost evaluates host agent telemetry for alerts.
|
|
func (m *Manager) CheckHost(host models.Host) {
|
|
if host.ID == "" {
|
|
return
|
|
}
|
|
|
|
// Register this host agent hostname for deduplication with Proxmox nodes.
|
|
// This prevents duplicate alerts when both a Node and Host agent monitor the same machine.
|
|
if host.Hostname != "" {
|
|
m.RegisterHostAgentHostname(host.Hostname)
|
|
}
|
|
|
|
// Cache display name so host alerts show the user-configured name.
|
|
m.UpdateNodeDisplayName("", host.Hostname, host.DisplayName)
|
|
|
|
// Fresh telemetry marks the host as online and clears offline tracking.
|
|
m.HandleHostOnline(host)
|
|
|
|
m.mu.RLock()
|
|
alertsEnabled := m.config.Enabled
|
|
disableAllHosts := m.config.DisableAllHosts
|
|
thresholds := m.config.HostDefaults
|
|
override, hasOverride := m.config.Overrides[host.ID]
|
|
m.mu.RUnlock()
|
|
|
|
if !alertsEnabled {
|
|
return
|
|
}
|
|
|
|
if disableAllHosts {
|
|
// Clear any existing host alerts when all host alerts are disabled
|
|
m.clearHostMetricAlerts(host.ID)
|
|
m.clearHostDiskAlerts(host.ID)
|
|
m.clearHostRAIDAlerts(host.ID)
|
|
return
|
|
}
|
|
|
|
if hasOverride {
|
|
thresholds = m.applyThresholdOverride(thresholds, override)
|
|
if thresholds.Disabled {
|
|
m.clearHostMetricAlerts(host.ID)
|
|
m.clearHostDiskAlerts(host.ID)
|
|
m.clearHostRAIDAlerts(host.ID)
|
|
return
|
|
}
|
|
}
|
|
|
|
resourceID := hostResourceID(host.ID)
|
|
resourceName := hostDisplayName(host)
|
|
nodeName := strings.TrimSpace(host.Hostname)
|
|
instanceName := hostInstanceName(host)
|
|
|
|
baseMetadata := map[string]interface{}{
|
|
"resourceType": "Host",
|
|
"hostId": host.ID,
|
|
"hostname": host.Hostname,
|
|
"displayName": host.DisplayName,
|
|
"platform": host.Platform,
|
|
"osName": host.OSName,
|
|
"osVersion": host.OSVersion,
|
|
"agentVersion": host.AgentVersion,
|
|
"architecture": host.Architecture,
|
|
}
|
|
if len(host.Tags) > 0 {
|
|
baseMetadata["tags"] = append([]string(nil), host.Tags...)
|
|
}
|
|
|
|
if thresholds.CPU != nil {
|
|
cpuMetadata := cloneMetadata(baseMetadata)
|
|
cpuMetadata["metric"] = "cpu"
|
|
cpuMetadata["cpuUsagePercent"] = host.CPUUsage
|
|
if host.CPUCount > 0 {
|
|
cpuMetadata["cpuCount"] = host.CPUCount
|
|
}
|
|
m.checkMetric(resourceID, resourceName, nodeName, instanceName, "Host", "cpu", host.CPUUsage, thresholds.CPU, &metricOptions{Metadata: cpuMetadata})
|
|
} else {
|
|
m.clearHostMetricAlerts(host.ID, "cpu")
|
|
}
|
|
|
|
if thresholds.Memory != nil {
|
|
memMetadata := cloneMetadata(baseMetadata)
|
|
memMetadata["metric"] = "memory"
|
|
memMetadata["memoryUsagePercent"] = host.Memory.Usage
|
|
if host.Memory.Total > 0 {
|
|
memMetadata["memoryTotalBytes"] = host.Memory.Total
|
|
memMetadata["memoryUsedBytes"] = host.Memory.Used
|
|
memMetadata["memoryFreeBytes"] = host.Memory.Free
|
|
}
|
|
m.checkMetric(resourceID, resourceName, nodeName, instanceName, "Host", "memory", host.Memory.Usage, thresholds.Memory, &metricOptions{Metadata: memMetadata})
|
|
} else {
|
|
m.clearHostMetricAlerts(host.ID, "memory")
|
|
}
|
|
|
|
if thresholds.DiskTemperature != nil && thresholds.DiskTemperature.Trigger > 0 {
|
|
if len(host.Sensors.SMART) > 0 {
|
|
for _, disk := range host.Sensors.SMART {
|
|
if disk.Temperature > 0 && !disk.Standby {
|
|
// Use specific resource ID for the disk: hostID/disk-temp:device
|
|
tempResourceID := fmt.Sprintf("%s/disk_temp:%s", hostResourceID(host.ID), sanitizeHostComponent(disk.Device))
|
|
tempResourceName := fmt.Sprintf("%s (%s Temp)", host.DisplayName, disk.Device)
|
|
|
|
diskTempMetadata := cloneMetadata(baseMetadata)
|
|
diskTempMetadata["metric"] = "diskTemperature"
|
|
diskTempMetadata["device"] = disk.Device
|
|
diskTempMetadata["temperature"] = disk.Temperature
|
|
diskTempMetadata["model"] = disk.Model
|
|
|
|
m.checkMetric(tempResourceID, tempResourceName, nodeName, disk.Device, "Host", "diskTemperature", float64(disk.Temperature), thresholds.DiskTemperature, &metricOptions{Metadata: diskTempMetadata})
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// We can't easily clear all disk temp alerts without tracking them,
|
|
// but checkMetric logic handles auto-resolution if value drops.
|
|
// If feature is disabled, ideally we should clear existing alerts.
|
|
// For now simple implementation.
|
|
}
|
|
|
|
seenDisks := make(map[string]struct{}, len(host.Disks))
|
|
for _, disk := range host.Disks {
|
|
diskResourceID, diskName := hostDiskResourceID(host, disk)
|
|
seenDisks[diskResourceID] = struct{}{}
|
|
|
|
// Check for disk-specific override
|
|
m.mu.RLock()
|
|
diskOverride, hasDiskOverride := m.config.Overrides[diskResourceID]
|
|
m.mu.RUnlock()
|
|
|
|
// Determine the effective disk threshold
|
|
var effectiveDiskThreshold *HysteresisThreshold
|
|
if hasDiskOverride {
|
|
// If disk is disabled via override, skip alerting
|
|
if diskOverride.Disabled {
|
|
m.clearAlert(fmt.Sprintf("host-%s-disk-%s", host.ID, sanitizeHostComponent(disk.Mountpoint)))
|
|
continue
|
|
}
|
|
// Use disk-specific threshold if set
|
|
if diskOverride.Disk != nil {
|
|
effectiveDiskThreshold = ensureHysteresisThreshold(diskOverride.Disk)
|
|
} else if diskOverride.DiskLegacy != nil {
|
|
effectiveDiskThreshold = m.convertLegacyThreshold(diskOverride.DiskLegacy)
|
|
}
|
|
}
|
|
// Fall back to host-level threshold
|
|
if effectiveDiskThreshold == nil {
|
|
effectiveDiskThreshold = thresholds.Disk
|
|
}
|
|
|
|
// Skip if no threshold configured (nil)
|
|
// We DO NOT skip if Trigger <= 0 because we need to call checkMetric to clear any existing alerts.
|
|
if effectiveDiskThreshold == nil {
|
|
continue
|
|
}
|
|
|
|
diskMetadata := cloneMetadata(baseMetadata)
|
|
diskMetadata["metric"] = "disk"
|
|
diskMetadata["mountpoint"] = disk.Mountpoint
|
|
diskMetadata["device"] = disk.Device
|
|
diskMetadata["diskType"] = disk.Type
|
|
diskMetadata["diskUsagePercent"] = disk.Usage
|
|
if disk.Total > 0 {
|
|
diskMetadata["diskTotalBytes"] = disk.Total
|
|
diskMetadata["diskUsedBytes"] = disk.Used
|
|
diskMetadata["diskFreeBytes"] = disk.Free
|
|
}
|
|
|
|
m.checkMetric(diskResourceID, diskName, nodeName, instanceName, "Host Disk", "disk", disk.Usage, effectiveDiskThreshold, &metricOptions{Metadata: diskMetadata})
|
|
}
|
|
|
|
// Clear all disk alerts if host-level disk alerting is completely disabled and no disk-specific overrides
|
|
if thresholds.Disk == nil || thresholds.Disk.Trigger <= 0 {
|
|
// Only clear alerts for disks that don't have their own overrides
|
|
m.mu.RLock()
|
|
var disksToClear []string
|
|
for _, disk := range host.Disks {
|
|
diskResourceID, _ := hostDiskResourceID(host, disk)
|
|
if _, hasDiskOverride := m.config.Overrides[diskResourceID]; !hasDiskOverride {
|
|
disksToClear = append(disksToClear, fmt.Sprintf("host-%s-disk-%s", host.ID, sanitizeHostComponent(disk.Mountpoint)))
|
|
}
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
for _, alertID := range disksToClear {
|
|
m.clearAlert(alertID)
|
|
}
|
|
}
|
|
|
|
m.cleanupHostDiskAlerts(host, seenDisks)
|
|
|
|
// Check RAID arrays for degraded or failed state
|
|
if len(host.RAID) > 0 {
|
|
for _, array := range host.RAID {
|
|
// Skip vendor-managed system arrays that are not user-facing storage pools.
|
|
// Synology uses md0/md1, while QNAP uses md9/md13 for internal OS volumes.
|
|
if shouldSuppressHostRAIDArray(host, array) {
|
|
// Still clear any existing alerts for these devices
|
|
alertID := fmt.Sprintf("host-%s-raid-%s", host.ID, sanitizeRAIDDevice(array.Device))
|
|
m.clearAlert(alertID)
|
|
continue
|
|
}
|
|
|
|
raidResourceID := fmt.Sprintf("host-%s-raid-%s", host.ID, sanitizeRAIDDevice(array.Device))
|
|
raidName := fmt.Sprintf("%s - %s (%s)", resourceName, array.Device, array.Level)
|
|
|
|
raidMetadata := cloneMetadata(baseMetadata)
|
|
raidMetadata["metric"] = "raid"
|
|
raidMetadata["raidDevice"] = array.Device
|
|
raidMetadata["raidLevel"] = array.Level
|
|
raidMetadata["raidState"] = array.State
|
|
raidMetadata["raidTotalDevices"] = array.TotalDevices
|
|
raidMetadata["raidActiveDevices"] = array.ActiveDevices
|
|
raidMetadata["raidFailedDevices"] = array.FailedDevices
|
|
raidMetadata["raidSpareDevices"] = array.SpareDevices
|
|
if array.UUID != "" {
|
|
raidMetadata["raidUUID"] = array.UUID
|
|
}
|
|
if array.RebuildPercent > 0 {
|
|
raidMetadata["raidRebuildPercent"] = array.RebuildPercent
|
|
}
|
|
|
|
// Check for degraded or failed arrays
|
|
stateLower := strings.ToLower(array.State)
|
|
isDegraded := strings.Contains(stateLower, "degraded") || array.FailedDevices > 0
|
|
|
|
// A "check" state indicates data scrubbing (e.g., DSM scheduled scrub), not a rebuild.
|
|
// Only treat as rebuilding if state indicates actual recovery, not routine maintenance.
|
|
isChecking := strings.Contains(stateLower, "check")
|
|
isRebuilding := !isChecking && (strings.Contains(stateLower, "recover") ||
|
|
strings.Contains(stateLower, "resync") ||
|
|
(array.RebuildPercent > 0 && !strings.Contains(stateLower, "clean")))
|
|
|
|
alertID := fmt.Sprintf("host-%s-raid-%s", host.ID, sanitizeRAIDDevice(array.Device))
|
|
|
|
if isDegraded {
|
|
// Critical alert for degraded arrays
|
|
msg := fmt.Sprintf("RAID array %s is degraded", array.Device)
|
|
if array.FailedDevices > 0 {
|
|
msg = fmt.Sprintf("RAID array %s has %d failed device(s)", array.Device, array.FailedDevices)
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if _, exists := m.activeAlerts[alertID]; !exists {
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "raid",
|
|
Level: AlertLevelCritical,
|
|
ResourceID: raidResourceID,
|
|
ResourceName: raidName,
|
|
Node: nodeName,
|
|
NodeDisplayName: m.resolveNodeDisplayName(instanceName, nodeName),
|
|
Instance: instanceName,
|
|
Message: msg,
|
|
Value: float64(array.FailedDevices),
|
|
Threshold: 0,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: raidMetadata,
|
|
}
|
|
m.preserveAlertState(alertID, alert)
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
m.dispatchAlert(alert, false)
|
|
m.mu.Unlock()
|
|
|
|
log.Error().
|
|
Str("host", resourceName).
|
|
Str("hostID", host.ID).
|
|
Str("raidDevice", array.Device).
|
|
Str("raidLevel", array.Level).
|
|
Int("failedDevices", array.FailedDevices).
|
|
Msg("CRITICAL: RAID array degraded")
|
|
} else {
|
|
m.mu.Unlock()
|
|
}
|
|
} else if isRebuilding {
|
|
// Warning alert for rebuilding arrays
|
|
msg := fmt.Sprintf("RAID array %s is rebuilding", array.Device)
|
|
if array.RebuildPercent > 0 {
|
|
msg = fmt.Sprintf("RAID array %s is rebuilding (%.1f%% complete)", array.Device, array.RebuildPercent)
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if _, exists := m.activeAlerts[alertID]; !exists {
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "raid",
|
|
Level: AlertLevelWarning,
|
|
ResourceID: raidResourceID,
|
|
ResourceName: raidName,
|
|
Node: nodeName,
|
|
Instance: instanceName,
|
|
Message: msg,
|
|
Value: array.RebuildPercent,
|
|
Threshold: 100,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: raidMetadata,
|
|
}
|
|
m.preserveAlertState(alertID, alert)
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
m.dispatchAlert(alert, false)
|
|
m.mu.Unlock()
|
|
|
|
log.Warn().
|
|
Str("host", resourceName).
|
|
Str("hostID", host.ID).
|
|
Str("raidDevice", array.Device).
|
|
Str("raidLevel", array.Level).
|
|
Float64("rebuildPercent", array.RebuildPercent).
|
|
Msg("WARNING: RAID array rebuilding")
|
|
} else {
|
|
m.mu.Unlock()
|
|
}
|
|
} else {
|
|
// Array is healthy, clear any existing alerts
|
|
m.clearAlert(alertID)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// HandleHostOnline clears offline tracking and alerts for a host agent.
|
|
func (m *Manager) HandleHostOnline(host models.Host) {
|
|
if host.ID == "" {
|
|
return
|
|
}
|
|
|
|
alertID := fmt.Sprintf("host-offline-%s", host.ID)
|
|
resourceKey := hostResourceID(host.ID)
|
|
|
|
m.mu.Lock()
|
|
delete(m.offlineConfirmations, resourceKey)
|
|
_, exists := m.activeAlerts[alertID]
|
|
m.mu.Unlock()
|
|
|
|
if exists {
|
|
m.clearAlert(alertID)
|
|
}
|
|
}
|
|
|
|
// HandleHostRemoved clears alerts and tracking when a host agent is deleted.
|
|
func (m *Manager) HandleHostRemoved(host models.Host) {
|
|
if host.ID == "" {
|
|
return
|
|
}
|
|
|
|
// Unregister the host agent hostname since it's being removed.
|
|
if host.Hostname != "" {
|
|
m.UnregisterHostAgentHostname(host.Hostname)
|
|
}
|
|
|
|
m.HandleHostOnline(host)
|
|
m.clearHostMetricAlerts(host.ID)
|
|
m.clearHostDiskAlerts(host.ID)
|
|
m.clearHostRAIDAlerts(host.ID)
|
|
}
|
|
|
|
// HandleHostOffline raises an alert when a host agent stops reporting.
|
|
func (m *Manager) HandleHostOffline(host models.Host) {
|
|
if host.ID == "" {
|
|
return
|
|
}
|
|
|
|
// Unregister the host agent hostname since it's no longer actively monitoring.
|
|
// This allows node alerts to resume if a Proxmox node with the same hostname exists.
|
|
if host.Hostname != "" {
|
|
m.UnregisterHostAgentHostname(host.Hostname)
|
|
}
|
|
|
|
m.mu.RLock()
|
|
if !m.config.Enabled {
|
|
m.mu.RUnlock()
|
|
return
|
|
}
|
|
disableHostsOffline := m.config.DisableAllHostsOffline
|
|
m.mu.RUnlock()
|
|
|
|
alertID := fmt.Sprintf("host-offline-%s", host.ID)
|
|
resourceKey := hostResourceID(host.ID)
|
|
resourceName := hostDisplayName(host)
|
|
nodeName := strings.TrimSpace(host.Hostname)
|
|
instanceName := hostInstanceName(host)
|
|
|
|
if disableHostsOffline {
|
|
m.mu.Lock()
|
|
delete(m.offlineConfirmations, resourceKey)
|
|
m.mu.Unlock()
|
|
m.clearAlert(alertID)
|
|
return
|
|
}
|
|
|
|
var disableConnectivity bool
|
|
m.mu.RLock()
|
|
if override, exists := m.config.Overrides[host.ID]; exists {
|
|
disableConnectivity = override.DisableConnectivity || override.Disabled
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
if disableConnectivity {
|
|
m.clearAlert(alertID)
|
|
m.mu.Lock()
|
|
delete(m.offlineConfirmations, resourceKey)
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if alert, exists := m.activeAlerts[alertID]; exists && alert != nil {
|
|
alert.LastSeen = time.Now()
|
|
m.activeAlerts[alertID] = alert
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
m.offlineConfirmations[resourceKey]++
|
|
const requiredConfirmations = 3
|
|
if confirmations := m.offlineConfirmations[resourceKey]; confirmations < requiredConfirmations {
|
|
m.mu.Unlock()
|
|
log.Debug().
|
|
Str("host", resourceName).
|
|
Str("hostID", host.ID).
|
|
Int("confirmations", confirmations).
|
|
Int("required", requiredConfirmations).
|
|
Msg("Host agent appears offline, awaiting confirmation")
|
|
return
|
|
}
|
|
|
|
// Host is confirmed offline. Clear all resource metrics (CPU/Memory/Disk/RAID)
|
|
// before raising the offline alert, to avoid stale alerts persisting.
|
|
{
|
|
// Basic metrics
|
|
metricTypes := []string{"cpu", "memory"}
|
|
for _, mt := range metricTypes {
|
|
m.clearAlertNoLock(fmt.Sprintf("%s-%s", resourceKey, mt))
|
|
}
|
|
|
|
// Disks and RAID
|
|
// Note: Disks use ResourceID prefix, RAID uses AlertID prefix
|
|
diskResourcePrefix := fmt.Sprintf("%s/disk:", resourceKey)
|
|
raidAlertPrefix := fmt.Sprintf("host-%s-raid-", host.ID)
|
|
|
|
// Collect alert IDs first, then clear (avoids modifying map during iteration)
|
|
var alertsToClear []string
|
|
for alertID, a := range m.activeAlerts {
|
|
if a == nil {
|
|
continue
|
|
}
|
|
if strings.HasPrefix(a.ResourceID, diskResourcePrefix) {
|
|
alertsToClear = append(alertsToClear, alertID)
|
|
} else if strings.HasPrefix(alertID, raidAlertPrefix) {
|
|
alertsToClear = append(alertsToClear, alertID)
|
|
}
|
|
}
|
|
for _, alertID := range alertsToClear {
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
}
|
|
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "host-offline",
|
|
Level: AlertLevelCritical,
|
|
ResourceID: resourceKey,
|
|
ResourceName: resourceName,
|
|
Node: nodeName,
|
|
Instance: instanceName,
|
|
Message: fmt.Sprintf("Host '%s' is offline", resourceName),
|
|
Value: 0,
|
|
Threshold: 0,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: map[string]interface{}{
|
|
"resourceType": "Host",
|
|
"hostId": host.ID,
|
|
"hostname": host.Hostname,
|
|
"displayName": host.DisplayName,
|
|
"platform": host.Platform,
|
|
"osName": host.OSName,
|
|
"osVersion": host.OSVersion,
|
|
},
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
if !m.checkRateLimit(alertID) {
|
|
m.mu.Unlock()
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
|
|
Msg("Host offline alert suppressed due to rate limit")
|
|
return
|
|
}
|
|
|
|
m.dispatchAlert(alert, false)
|
|
m.mu.Unlock()
|
|
|
|
log.Error().
|
|
Str("host", resourceName).
|
|
Str("hostID", host.ID).
|
|
Str("hostname", host.Hostname).
|
|
Msg("CRITICAL: Host agent is offline")
|
|
}
|
|
|
|
func (m *Manager) clearHostMetricAlerts(hostID string, metrics ...string) {
|
|
if hostID == "" {
|
|
return
|
|
}
|
|
resourceID := hostResourceID(hostID)
|
|
if len(metrics) == 0 {
|
|
metrics = []string{"cpu", "memory"}
|
|
}
|
|
for _, metric := range metrics {
|
|
m.clearAlert(fmt.Sprintf("%s-%s", resourceID, metric))
|
|
}
|
|
}
|
|
|
|
func (m *Manager) clearHostDiskAlerts(hostID string) {
|
|
if hostID == "" {
|
|
return
|
|
}
|
|
|
|
prefix := fmt.Sprintf("%s/disk:", hostResourceID(hostID))
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
for alertID, alert := range m.activeAlerts {
|
|
if alert == nil {
|
|
continue
|
|
}
|
|
if !strings.HasPrefix(alert.ResourceID, prefix) {
|
|
continue
|
|
}
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
}
|
|
|
|
func (m *Manager) cleanupHostDiskAlerts(host models.Host, seen map[string]struct{}) {
|
|
if host.ID == "" {
|
|
return
|
|
}
|
|
|
|
prefix := fmt.Sprintf("%s/disk:", hostResourceID(host.ID))
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
for alertID, alert := range m.activeAlerts {
|
|
if alert == nil {
|
|
continue
|
|
}
|
|
if !strings.HasPrefix(alert.ResourceID, prefix) {
|
|
continue
|
|
}
|
|
if _, exists := seen[alert.ResourceID]; exists {
|
|
continue
|
|
}
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
}
|
|
|
|
func (m *Manager) clearHostRAIDAlerts(hostID string) {
|
|
if hostID == "" {
|
|
return
|
|
}
|
|
|
|
prefix := fmt.Sprintf("host-%s-raid-", hostID)
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
for alertID := range m.activeAlerts {
|
|
if strings.HasPrefix(alertID, prefix) {
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
}
|
|
}
|
|
|
|
// CheckPBS checks PBS instance metrics against thresholds
|
|
func (m *Manager) CheckPBS(pbs models.PBSInstance) {
|
|
m.mu.RLock()
|
|
if !m.config.Enabled {
|
|
m.mu.RUnlock()
|
|
return
|
|
}
|
|
if m.config.DisableAllPBS {
|
|
m.mu.RUnlock()
|
|
// Clear any existing PBS alerts when all PBS alerts are disabled
|
|
m.mu.Lock()
|
|
// Reset offline confirmation tracking
|
|
delete(m.offlineConfirmations, pbs.ID)
|
|
// Clear CPU alert
|
|
cpuAlertID := fmt.Sprintf("%s-cpu", pbs.ID)
|
|
if _, exists := m.activeAlerts[cpuAlertID]; exists {
|
|
m.clearAlertNoLock(cpuAlertID)
|
|
log.Info().
|
|
Str("alertID", cpuAlertID).
|
|
Str("pbs", pbs.Name).
|
|
Msg("Cleared CPU alert - all PBS alerts disabled")
|
|
}
|
|
// Clear Memory alert
|
|
memAlertID := fmt.Sprintf("%s-memory", pbs.ID)
|
|
if _, exists := m.activeAlerts[memAlertID]; exists {
|
|
m.clearAlertNoLock(memAlertID)
|
|
log.Info().
|
|
Str("alertID", memAlertID).
|
|
Str("pbs", pbs.Name).
|
|
Msg("Cleared Memory alert - all PBS alerts disabled")
|
|
}
|
|
// Clear offline alert
|
|
offlineAlertID := fmt.Sprintf("pbs-offline-%s", pbs.ID)
|
|
if _, exists := m.activeAlerts[offlineAlertID]; exists {
|
|
m.clearAlertNoLock(offlineAlertID)
|
|
log.Info().
|
|
Str("alertID", offlineAlertID).
|
|
Str("pbs", pbs.Name).
|
|
Msg("Cleared offline alert - all PBS alerts disabled")
|
|
}
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
// Check if there's an override for this PBS instance
|
|
override, hasOverride := m.config.Overrides[pbs.ID]
|
|
|
|
// Use PBS defaults (CPU, Memory)
|
|
cpuThreshold := m.config.PBSDefaults.CPU
|
|
memoryThreshold := m.config.PBSDefaults.Memory
|
|
disablePBSOffline := m.config.DisableAllPBSOffline
|
|
m.mu.RUnlock()
|
|
|
|
// Check override disable BEFORE offline detection to prevent spurious notifications
|
|
if hasOverride && override.Disabled {
|
|
m.mu.Lock()
|
|
// Reset offline confirmation tracking
|
|
delete(m.offlineConfirmations, pbs.ID)
|
|
// Clear CPU alert
|
|
cpuAlertID := fmt.Sprintf("%s-cpu", pbs.ID)
|
|
if _, exists := m.activeAlerts[cpuAlertID]; exists {
|
|
m.clearAlertNoLock(cpuAlertID)
|
|
log.Debug().
|
|
Str("alertID", cpuAlertID).
|
|
Str("pbs", pbs.Name).
|
|
Msg("Cleared CPU alert - PBS has alerts disabled")
|
|
}
|
|
// Clear Memory alert
|
|
memAlertID := fmt.Sprintf("%s-memory", pbs.ID)
|
|
if _, exists := m.activeAlerts[memAlertID]; exists {
|
|
m.clearAlertNoLock(memAlertID)
|
|
log.Debug().
|
|
Str("alertID", memAlertID).
|
|
Str("pbs", pbs.Name).
|
|
Msg("Cleared Memory alert - PBS has alerts disabled")
|
|
}
|
|
// Clear offline alert
|
|
offlineAlertID := fmt.Sprintf("pbs-offline-%s", pbs.ID)
|
|
if _, exists := m.activeAlerts[offlineAlertID]; exists {
|
|
m.clearAlertNoLock(offlineAlertID)
|
|
log.Debug().
|
|
Str("alertID", offlineAlertID).
|
|
Str("pbs", pbs.Name).
|
|
Msg("Cleared offline alert - PBS has alerts disabled")
|
|
}
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
if disablePBSOffline {
|
|
// Clear tracking and any existing offline alerts when globally disabled
|
|
m.mu.Lock()
|
|
delete(m.offlineConfirmations, pbs.ID)
|
|
m.mu.Unlock()
|
|
m.clearAlert(fmt.Sprintf("pbs-offline-%s", pbs.ID))
|
|
} else {
|
|
// Check if PBS is offline first (similar to nodes)
|
|
if pbs.Status == "offline" || pbs.ConnectionHealth == "error" || pbs.ConnectionHealth == "unhealthy" {
|
|
m.checkPBSOffline(pbs)
|
|
} else {
|
|
// Clear any existing offline alert if PBS is back online
|
|
m.clearPBSOfflineAlert(pbs)
|
|
}
|
|
}
|
|
|
|
// Check if there are custom thresholds for this PBS instance
|
|
if hasOverride {
|
|
if override.CPU != nil {
|
|
cpuThreshold = override.CPU
|
|
}
|
|
if override.Memory != nil {
|
|
memoryThreshold = override.Memory
|
|
}
|
|
}
|
|
|
|
// Check metrics only if PBS is online - checkMetric will skip if threshold is nil or <= 0
|
|
if pbs.Status != "offline" {
|
|
// PBS CPU is already a percentage
|
|
m.checkMetric(pbs.ID, pbs.Name, pbs.Host, pbs.Name, "PBS", "cpu", pbs.CPU, cpuThreshold, nil)
|
|
// PBS Memory is already a percentage
|
|
m.checkMetric(pbs.ID, pbs.Name, pbs.Host, pbs.Name, "PBS", "memory", pbs.Memory, memoryThreshold, nil)
|
|
}
|
|
}
|
|
|
|
// CheckPMG checks a Proxmox Mail Gateway instance against thresholds
|
|
func (m *Manager) CheckPMG(pmg models.PMGInstance) {
|
|
m.mu.RLock()
|
|
if !m.config.Enabled {
|
|
m.mu.RUnlock()
|
|
return
|
|
}
|
|
if m.config.DisableAllPMG {
|
|
m.mu.RUnlock()
|
|
// Clear any existing PMG alerts when all PMG alerts are disabled
|
|
m.mu.Lock()
|
|
// Reset offline confirmation tracking
|
|
delete(m.offlineConfirmations, pmg.ID)
|
|
// Clear all possible PMG alert types
|
|
alertTypes := []string{"queue-total", "queue-deferred", "queue-hold", "oldest-message"}
|
|
for _, alertType := range alertTypes {
|
|
alertID := fmt.Sprintf("%s-%s", pmg.ID, alertType)
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
m.clearAlertNoLock(alertID)
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Str("pmg", pmg.Name).
|
|
Msg("Cleared PMG alert - all PMG alerts disabled")
|
|
}
|
|
}
|
|
// Clear offline alert
|
|
offlineAlertID := fmt.Sprintf("pmg-offline-%s", pmg.ID)
|
|
if _, exists := m.activeAlerts[offlineAlertID]; exists {
|
|
m.clearAlertNoLock(offlineAlertID)
|
|
log.Info().
|
|
Str("alertID", offlineAlertID).
|
|
Str("pmg", pmg.Name).
|
|
Msg("Cleared offline alert - all PMG alerts disabled")
|
|
}
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
// Check if there's an override for this PMG instance
|
|
override, hasOverride := m.config.Overrides[pmg.ID]
|
|
disablePMGOffline := m.config.DisableAllPMGOffline
|
|
pmgDefaults := m.config.PMGDefaults
|
|
m.mu.RUnlock()
|
|
|
|
// Check override disable BEFORE offline detection to prevent spurious notifications
|
|
if hasOverride && override.Disabled {
|
|
m.mu.Lock()
|
|
// Reset offline confirmation tracking
|
|
delete(m.offlineConfirmations, pmg.ID)
|
|
// Clear all possible PMG alert types
|
|
alertTypes := []string{"queue-total", "queue-deferred", "queue-hold", "oldest-message"}
|
|
for _, alertType := range alertTypes {
|
|
alertID := fmt.Sprintf("%s-%s", pmg.ID, alertType)
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
m.clearAlertNoLock(alertID)
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Str("pmg", pmg.Name).
|
|
Msg("Cleared PMG alert - PMG has alerts disabled")
|
|
}
|
|
}
|
|
// Clear offline alert
|
|
offlineAlertID := fmt.Sprintf("pmg-offline-%s", pmg.ID)
|
|
if _, exists := m.activeAlerts[offlineAlertID]; exists {
|
|
m.clearAlertNoLock(offlineAlertID)
|
|
log.Debug().
|
|
Str("alertID", offlineAlertID).
|
|
Str("pmg", pmg.Name).
|
|
Msg("Cleared offline alert - PMG has alerts disabled")
|
|
}
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
// Handle offline detection
|
|
if disablePMGOffline {
|
|
// Clear tracking and any existing offline alerts when globally disabled
|
|
m.mu.Lock()
|
|
delete(m.offlineConfirmations, pmg.ID)
|
|
m.mu.Unlock()
|
|
m.clearAlert(fmt.Sprintf("pmg-offline-%s", pmg.ID))
|
|
} else {
|
|
// Check if PMG is offline (similar to PBS/nodes)
|
|
if pmg.Status == "offline" || pmg.ConnectionHealth == "error" || pmg.ConnectionHealth == "unhealthy" {
|
|
m.checkPMGOffline(pmg)
|
|
} else {
|
|
// Clear any existing offline alert if PMG is back online
|
|
m.clearPMGOfflineAlert(pmg)
|
|
}
|
|
}
|
|
|
|
// Check metrics only if PMG is online
|
|
if pmg.Status != "offline" {
|
|
// Check queue depths across all nodes
|
|
m.checkPMGQueueDepths(pmg, pmgDefaults)
|
|
// Check oldest message age across all nodes
|
|
m.checkPMGOldestMessage(pmg, pmgDefaults)
|
|
// Check quarantine backlog and growth
|
|
m.checkPMGQuarantineBacklog(pmg, pmgDefaults)
|
|
// Check spam/virus rate anomalies
|
|
m.checkPMGAnomalies(pmg, pmgDefaults)
|
|
// Check per-node queue health
|
|
m.checkPMGNodeQueues(pmg, pmgDefaults)
|
|
}
|
|
}
|
|
|
|
// dockerInstanceName returns the logical instance name used for Docker alerts.
|
|
func dockerInstanceName(host models.DockerHost) string {
|
|
name := strings.TrimSpace(host.DisplayName)
|
|
if name == "" {
|
|
name = strings.TrimSpace(host.Hostname)
|
|
}
|
|
if name == "" {
|
|
return "Docker"
|
|
}
|
|
return fmt.Sprintf("Docker:%s", name)
|
|
}
|
|
|
|
// dockerContainerDisplayName normalizes the container name for alert readability.
|
|
func dockerContainerDisplayName(container models.DockerContainer) string {
|
|
name := strings.TrimSpace(container.Name)
|
|
if strings.HasPrefix(name, "/") {
|
|
name = strings.TrimLeft(name, "/")
|
|
}
|
|
if name == "" {
|
|
id := strings.TrimSpace(container.ID)
|
|
if len(id) > 12 {
|
|
id = id[:12]
|
|
}
|
|
return id
|
|
}
|
|
return name
|
|
}
|
|
|
|
// dockerResourceID builds a stable identifier for Docker container alerts from
// the whitespace-trimmed host and container IDs:
//
//	both blank      -> "docker:unknown"
//	container blank -> "docker:<host>"
//	host blank      -> "docker:container/<container>"
//	otherwise       -> "docker:<host>/<container>"
func dockerResourceID(hostID, containerID string) string {
	host := strings.TrimSpace(hostID)
	ctr := strings.TrimSpace(containerID)

	switch {
	case host == "" && ctr == "":
		return "docker:unknown"
	case ctr == "":
		return fmt.Sprintf("docker:%s", host)
	case host == "":
		return fmt.Sprintf("docker:container/%s", ctr)
	default:
		return fmt.Sprintf("docker:%s/%s", host, ctr)
	}
}
|
|
|
|
// normalizeDockerUpdateTrackingPart canonicalizes one component of a Docker
// update tracking key by trimming surrounding whitespace and lower-casing it.
func normalizeDockerUpdateTrackingPart(part string) string {
	trimmed := strings.TrimSpace(part)
	return strings.ToLower(trimmed)
}
|
|
|
|
// dockerUpdateTrackingHostKey builds a stable host identity for Docker update timing.
|
|
func dockerUpdateTrackingHostKey(host models.DockerHost) string {
|
|
switch {
|
|
case normalizeDockerUpdateTrackingPart(host.AgentID) != "":
|
|
return "agent:" + normalizeDockerUpdateTrackingPart(host.AgentID)
|
|
case normalizeDockerUpdateTrackingPart(host.TokenID) != "":
|
|
return "token:" + normalizeDockerUpdateTrackingPart(host.TokenID)
|
|
case normalizeDockerUpdateTrackingPart(host.MachineID) != "":
|
|
return "machine:" + normalizeDockerUpdateTrackingPart(host.MachineID)
|
|
case normalizeDockerUpdateTrackingPart(host.Hostname) != "":
|
|
return "hostname:" + normalizeDockerUpdateTrackingPart(host.Hostname)
|
|
case normalizeDockerUpdateTrackingPart(host.ID) != "":
|
|
return "id:" + normalizeDockerUpdateTrackingPart(host.ID)
|
|
case normalizeDockerUpdateTrackingPart(host.DisplayName) != "":
|
|
return "name:" + normalizeDockerUpdateTrackingPart(host.DisplayName)
|
|
default:
|
|
return "unknown-host"
|
|
}
|
|
}
|
|
|
|
func dockerUpdateTrackingContainerKey(container models.DockerContainer) string {
|
|
if id := normalizeDockerUpdateTrackingPart(container.ID); id != "" {
|
|
return "id:" + id
|
|
}
|
|
|
|
name := normalizeDockerUpdateTrackingPart(container.Name)
|
|
name = strings.TrimPrefix(name, "/")
|
|
if name != "" {
|
|
return "name:" + name
|
|
}
|
|
|
|
if image := normalizeDockerUpdateTrackingPart(container.Image); image != "" {
|
|
return "image:" + image
|
|
}
|
|
|
|
return "unknown-container"
|
|
}
|
|
|
|
func dockerUpdateTrackingKey(host models.DockerHost, container models.DockerContainer) string {
|
|
return fmt.Sprintf("docker-update:%s/%s", dockerUpdateTrackingHostKey(host), dockerUpdateTrackingContainerKey(container))
|
|
}
|
|
|
|
func dockerUpdateTrackingHostPrefix(host models.DockerHost) string {
|
|
return fmt.Sprintf("docker-update:%s/", dockerUpdateTrackingHostKey(host))
|
|
}
|
|
|
|
// dockerServiceDisplayName normalizes the service name for alert readability.
|
|
func dockerServiceDisplayName(service models.DockerService) string {
|
|
name := strings.TrimSpace(service.Name)
|
|
if name != "" {
|
|
return name
|
|
}
|
|
id := strings.TrimSpace(service.ID)
|
|
if len(id) > 12 {
|
|
id = id[:12]
|
|
}
|
|
if id == "" {
|
|
return "service"
|
|
}
|
|
return id
|
|
}
|
|
|
|
// dockerServiceResourceID builds the resource identifier for a Docker service.
//
// The trimmed serviceID is used when present. Otherwise an identifier is
// derived from serviceName ("service" when blank): the name is lower-cased,
// ASCII letters, digits, '-' and '_' are kept, space, '/', '\', ':' and '.'
// become '-', and every other rune is dropped. Leading and trailing '-'/'_' are
// then stripped, an empty result becomes "service", and the result is capped at
// 32 bytes.
//
// The identifier is returned as "docker-service:<id>" when hostID is blank and
// as "docker:<hostID>/service/<id>" otherwise.
func dockerServiceResourceID(hostID, serviceID, serviceName string) string {
	host := strings.TrimSpace(hostID)
	id := strings.TrimSpace(serviceID)

	if id == "" {
		source := strings.TrimSpace(serviceName)
		if source == "" {
			source = "service"
		}
		// A negative return value makes strings.Map drop the rune.
		slug := strings.Map(func(r rune) rune {
			switch {
			case r >= 'a' && r <= 'z', r >= '0' && r <= '9', r == '-', r == '_':
				return r
			case r == ' ', r == '/', r == '\\', r == ':', r == '.':
				return '-'
			default:
				return -1
			}
		}, strings.ToLower(source))

		id = strings.Trim(slug, "-_")
		if id == "" {
			id = "service"
		}
		// The slug only contains ASCII at this point, so byte slicing is safe.
		if len(id) > 32 {
			id = id[:32]
		}
	}

	if host == "" {
		return fmt.Sprintf("docker-service:%s", id)
	}
	return fmt.Sprintf("docker:%s/service/%s", host, id)
}
|
|
|
|
// matchesDockerIgnoredPrefix reports whether the container name or ID starts
// with any of the configured prefixes. Name, ID and prefixes are compared after
// trimming surrounding whitespace and lower-casing; blank prefixes are skipped.
func matchesDockerIgnoredPrefix(name, id string, prefixes []string) bool {
	if len(prefixes) == 0 {
		return false
	}

	lowerName := strings.ToLower(strings.TrimSpace(name))
	lowerID := strings.ToLower(strings.TrimSpace(id))

	for _, raw := range prefixes {
		candidate := strings.ToLower(strings.TrimSpace(raw))
		if candidate == "" {
			continue
		}
		// candidate is non-empty here, so a blank name or ID can never match.
		if strings.HasPrefix(lowerName, candidate) || strings.HasPrefix(lowerID, candidate) {
			return true
		}
	}
	return false
}
|
|
|
|
// CheckDockerHost evaluates Docker host telemetry and container metrics for alerts.
|
|
func (m *Manager) CheckDockerHost(host models.DockerHost) {
|
|
if host.ID == "" {
|
|
return
|
|
}
|
|
|
|
// Fresh telemetry marks the host as online and clears any offline alert.
|
|
m.HandleDockerHostOnline(host)
|
|
|
|
m.mu.RLock()
|
|
alertsEnabled := m.config.Enabled
|
|
disableAllHosts := m.config.DisableAllDockerHosts
|
|
ignoredPrefixes := append([]string(nil), m.config.DockerIgnoredContainerPrefixes...)
|
|
m.mu.RUnlock()
|
|
if !alertsEnabled {
|
|
return
|
|
}
|
|
if disableAllHosts {
|
|
return
|
|
}
|
|
|
|
seen := make(map[string]struct{}, len(host.Containers)+len(host.Services))
|
|
seenUpdateTracking := make(map[string]struct{}, len(host.Containers))
|
|
for _, container := range host.Containers {
|
|
containerName := dockerContainerDisplayName(container)
|
|
resourceID := dockerResourceID(host.ID, container.ID)
|
|
updateTrackingKey := dockerUpdateTrackingKey(host, container)
|
|
|
|
if matchesDockerIgnoredPrefix(containerName, container.ID, ignoredPrefixes) {
|
|
log.Debug().
|
|
Str("container", containerName).
|
|
Str("host", host.DisplayName).
|
|
Msg("Skipping Docker container alert evaluation due to ignored prefix")
|
|
m.clearDockerContainerStateAlert(resourceID)
|
|
m.clearDockerContainerHealthAlert(resourceID)
|
|
m.clearDockerContainerMetricAlerts(resourceID)
|
|
m.clearAlert(fmt.Sprintf("docker-container-restart-loop-%s", resourceID))
|
|
m.clearAlert(fmt.Sprintf("docker-container-oom-%s", resourceID))
|
|
m.clearAlert(fmt.Sprintf("docker-container-memory-limit-%s", resourceID))
|
|
m.mu.Lock()
|
|
delete(m.dockerRestartTracking, resourceID)
|
|
delete(m.dockerLastExitCode, resourceID)
|
|
m.mu.Unlock()
|
|
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
|
|
continue
|
|
}
|
|
|
|
seen[resourceID] = struct{}{}
|
|
seenUpdateTracking[updateTrackingKey] = struct{}{}
|
|
m.evaluateDockerContainer(host, container, resourceID)
|
|
}
|
|
|
|
for _, service := range host.Services {
|
|
resourceID := dockerServiceResourceID(host.ID, service.ID, service.Name)
|
|
seen[resourceID] = struct{}{}
|
|
m.evaluateDockerService(host, service, resourceID)
|
|
}
|
|
|
|
m.cleanupDockerContainerAlertsWithTracking(host, seen, seenUpdateTracking)
|
|
}
|
|
|
|
func (m *Manager) evaluateDockerContainer(host models.DockerHost, container models.DockerContainer, resourceID string) {
|
|
m.mu.RLock()
|
|
disableAllContainers := m.config.DisableAllDockerContainers
|
|
m.mu.RUnlock()
|
|
if disableAllContainers {
|
|
return
|
|
}
|
|
|
|
containerName := dockerContainerDisplayName(container)
|
|
nodeName := strings.TrimSpace(host.Hostname)
|
|
instanceName := dockerInstanceName(host)
|
|
resourceType := "Docker Container"
|
|
|
|
m.mu.RLock()
|
|
overrideConfig, hasOverride := m.config.Overrides[resourceID]
|
|
m.mu.RUnlock()
|
|
if hasOverride && overrideConfig.Disabled {
|
|
// Alerts disabled via override; clear any existing alerts and skip evaluation.
|
|
m.clearDockerContainerStateAlert(resourceID)
|
|
m.clearDockerContainerHealthAlert(resourceID)
|
|
m.clearDockerContainerMetricAlerts(resourceID)
|
|
m.clearAlert(fmt.Sprintf("docker-container-update-%s", resourceID))
|
|
m.clearDockerContainerUpdateTracking(resourceID, dockerUpdateTrackingKey(host, container))
|
|
return
|
|
}
|
|
|
|
state := strings.ToLower(strings.TrimSpace(container.State))
|
|
if state == "" {
|
|
state = strings.ToLower(strings.TrimSpace(container.Status))
|
|
}
|
|
|
|
if state != "running" {
|
|
m.checkDockerContainerState(host, container, resourceID, containerName, instanceName, nodeName)
|
|
m.clearDockerContainerMetricAlerts(resourceID, "cpu", "memory", "disk")
|
|
} else {
|
|
m.clearDockerContainerStateAlert(resourceID)
|
|
|
|
// Use Docker-specific defaults for containers
|
|
thresholds := ThresholdConfig{
|
|
CPU: &m.config.DockerDefaults.CPU,
|
|
Memory: &m.config.DockerDefaults.Memory,
|
|
Disk: &m.config.DockerDefaults.Disk,
|
|
}
|
|
if hasOverride {
|
|
thresholds = m.applyThresholdOverride(thresholds, overrideConfig)
|
|
}
|
|
|
|
if thresholds.CPU != nil {
|
|
cpuMetadata := map[string]interface{}{
|
|
"resourceType": resourceType,
|
|
"hostId": host.ID,
|
|
"hostName": host.DisplayName,
|
|
"hostHostname": host.Hostname,
|
|
"containerId": container.ID,
|
|
"containerName": containerName,
|
|
"image": container.Image,
|
|
"state": container.State,
|
|
"status": container.Status,
|
|
"restartCount": container.RestartCount,
|
|
"metric": "cpu",
|
|
"cpuPercent": container.CPUPercent,
|
|
}
|
|
m.checkMetric(resourceID, containerName, nodeName, instanceName, resourceType, "cpu", container.CPUPercent, thresholds.CPU, &metricOptions{Metadata: cpuMetadata})
|
|
}
|
|
|
|
if thresholds.Memory != nil {
|
|
memMetadata := map[string]interface{}{
|
|
"resourceType": resourceType,
|
|
"hostId": host.ID,
|
|
"hostName": host.DisplayName,
|
|
"hostHostname": host.Hostname,
|
|
"containerId": container.ID,
|
|
"containerName": containerName,
|
|
"image": container.Image,
|
|
"state": container.State,
|
|
"status": container.Status,
|
|
"restartCount": container.RestartCount,
|
|
"metric": "memory",
|
|
"memoryPercent": container.MemoryPercent,
|
|
"memoryUsageBytes": container.MemoryUsage,
|
|
}
|
|
if container.MemoryLimit > 0 {
|
|
memMetadata["memoryLimitBytes"] = container.MemoryLimit
|
|
}
|
|
m.checkMetric(resourceID, containerName, nodeName, instanceName, resourceType, "memory", container.MemoryPercent, thresholds.Memory, &metricOptions{Metadata: memMetadata})
|
|
}
|
|
|
|
if thresholds.Disk != nil {
|
|
totalBytes := container.RootFilesystemBytes
|
|
usedBytes := container.WritableLayerBytes
|
|
if totalBytes > 0 && usedBytes >= 0 {
|
|
diskPercent := (float64(usedBytes) / float64(totalBytes)) * 100
|
|
diskMetadata := map[string]interface{}{
|
|
"resourceType": resourceType,
|
|
"hostId": host.ID,
|
|
"hostName": host.DisplayName,
|
|
"hostHostname": host.Hostname,
|
|
"containerId": container.ID,
|
|
"containerName": containerName,
|
|
"image": container.Image,
|
|
"state": container.State,
|
|
"status": container.Status,
|
|
"restartCount": container.RestartCount,
|
|
"metric": "disk",
|
|
"diskPercent": diskPercent,
|
|
"writableLayerBytes": usedBytes,
|
|
"rootFilesystemBytes": totalBytes,
|
|
"mountCount": len(container.Mounts),
|
|
}
|
|
if container.BlockIO != nil {
|
|
diskMetadata["blockIoReadBytes"] = container.BlockIO.ReadBytes
|
|
diskMetadata["blockIoWriteBytes"] = container.BlockIO.WriteBytes
|
|
}
|
|
m.checkMetric(resourceID, containerName, nodeName, instanceName, resourceType, "disk", diskPercent, thresholds.Disk, &metricOptions{Metadata: diskMetadata})
|
|
} else {
|
|
m.clearDockerContainerMetricAlerts(resourceID, "disk")
|
|
}
|
|
}
|
|
}
|
|
|
|
m.checkDockerContainerHealth(host, container, resourceID, containerName, instanceName, nodeName)
|
|
|
|
// Docker-specific checks
|
|
m.checkDockerContainerRestartLoop(host, container, resourceID, containerName, instanceName, nodeName)
|
|
m.checkDockerContainerOOMKill(host, container, resourceID, containerName, instanceName, nodeName)
|
|
m.checkDockerContainerMemoryLimit(host, container, resourceID, containerName, instanceName, nodeName)
|
|
m.checkDockerContainerImageUpdate(host, container, resourceID, containerName, instanceName, nodeName)
|
|
}
|
|
|
|
func (m *Manager) evaluateDockerService(host models.DockerHost, service models.DockerService, resourceID string) {
|
|
m.mu.RLock()
|
|
disableAllServices := m.config.DisableAllDockerServices
|
|
warnPct := m.config.DockerDefaults.ServiceWarnGapPct
|
|
critPct := m.config.DockerDefaults.ServiceCritGapPct
|
|
overrideConfig, hasOverride := m.config.Overrides[resourceID]
|
|
m.mu.RUnlock()
|
|
|
|
if disableAllServices {
|
|
m.clearDockerServiceAlert(resourceID)
|
|
return
|
|
}
|
|
if hasOverride && overrideConfig.Disabled {
|
|
m.clearDockerServiceAlert(resourceID)
|
|
return
|
|
}
|
|
|
|
desired := service.DesiredTasks
|
|
running := service.RunningTasks
|
|
if desired <= 0 {
|
|
m.clearDockerServiceAlert(resourceID)
|
|
return
|
|
}
|
|
|
|
missing := desired - running
|
|
if missing < 0 {
|
|
missing = 0
|
|
}
|
|
|
|
percentMissing := 0.0
|
|
if desired > 0 {
|
|
percentMissing = (float64(missing) / float64(desired)) * 100.0
|
|
}
|
|
|
|
severity := AlertLevel("")
|
|
thresholdValue := 0.0
|
|
if critPct > 0 && percentMissing >= float64(critPct) {
|
|
severity = AlertLevelCritical
|
|
thresholdValue = float64(critPct)
|
|
} else if warnPct > 0 && percentMissing >= float64(warnPct) {
|
|
severity = AlertLevelWarning
|
|
thresholdValue = float64(warnPct)
|
|
}
|
|
|
|
updateState := ""
|
|
updateMessage := ""
|
|
if service.UpdateStatus != nil {
|
|
updateState = strings.ToLower(strings.TrimSpace(service.UpdateStatus.State))
|
|
updateMessage = strings.TrimSpace(service.UpdateStatus.Message)
|
|
if severity == "" {
|
|
switch updateState {
|
|
case "paused", "rollback_started", "rollback_paused":
|
|
severity = AlertLevelWarning
|
|
case "rollback_failed":
|
|
severity = AlertLevelCritical
|
|
}
|
|
}
|
|
}
|
|
|
|
if severity == "" {
|
|
m.clearDockerServiceAlert(resourceID)
|
|
return
|
|
}
|
|
|
|
serviceName := dockerServiceDisplayName(service)
|
|
instanceName := dockerInstanceName(host)
|
|
nodeName := strings.TrimSpace(host.Hostname)
|
|
|
|
message := ""
|
|
if missing > 0 {
|
|
message = fmt.Sprintf("Docker service '%s' is running %d of %d desired tasks", serviceName, service.RunningTasks, service.DesiredTasks)
|
|
} else if updateState != "" {
|
|
message = fmt.Sprintf("Docker service '%s' update state: %s", serviceName, service.UpdateStatus.State)
|
|
} else {
|
|
message = fmt.Sprintf("Docker service '%s' triggered a Swarm alert", serviceName)
|
|
}
|
|
if updateMessage != "" {
|
|
message = fmt.Sprintf("%s (%s)", message, updateMessage)
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"resourceType": "Docker Service",
|
|
"hostId": host.ID,
|
|
"hostName": host.DisplayName,
|
|
"hostHostname": host.Hostname,
|
|
"serviceId": service.ID,
|
|
"serviceName": service.Name,
|
|
"stack": service.Stack,
|
|
"mode": service.Mode,
|
|
"desiredTasks": service.DesiredTasks,
|
|
"runningTasks": service.RunningTasks,
|
|
"completedTasks": service.CompletedTasks,
|
|
"missingTasks": missing,
|
|
"percentMissing": percentMissing,
|
|
}
|
|
if updateState != "" {
|
|
metadata["updateState"] = service.UpdateStatus.State
|
|
}
|
|
if updateMessage != "" {
|
|
metadata["updateMessage"] = updateMessage
|
|
}
|
|
if service.UpdateStatus != nil && service.UpdateStatus.CompletedAt != nil && !service.UpdateStatus.CompletedAt.IsZero() {
|
|
metadata["updateCompletedAt"] = service.UpdateStatus.CompletedAt.UTC()
|
|
}
|
|
|
|
alertID := fmt.Sprintf("docker-service-health-%s", resourceID)
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "docker-service-health",
|
|
Level: severity,
|
|
ResourceID: resourceID,
|
|
ResourceName: serviceName,
|
|
Node: nodeName,
|
|
Instance: instanceName,
|
|
Message: message,
|
|
Value: percentMissing,
|
|
Threshold: thresholdValue,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: metadata,
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if existing, exists := m.activeAlerts[alertID]; exists && existing != nil {
|
|
escalatedToCritical := existing.Level != AlertLevelCritical && alert.Level == AlertLevelCritical
|
|
m.preserveAlertState(alertID, alert)
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
|
|
if escalatedToCritical {
|
|
m.historyManager.AddAlert(*alert)
|
|
if m.checkRateLimit(alertID) {
|
|
m.dispatchAlert(alert, true)
|
|
log.Warn().
|
|
Str("service", serviceName).
|
|
Str("host", host.DisplayName).
|
|
Float64("percentMissing", percentMissing).
|
|
Str("fromLevel", string(existing.Level)).
|
|
Str("toLevel", string(alert.Level)).
|
|
Msg("Docker service alert escalated")
|
|
} else {
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
|
|
Msg("Docker service escalation notification suppressed due to rate limit")
|
|
}
|
|
}
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
if !m.checkRateLimit(alertID) {
|
|
m.mu.Unlock()
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
|
|
Msg("Docker service alert notification suppressed due to rate limit")
|
|
return
|
|
}
|
|
m.dispatchAlert(alert, true)
|
|
m.mu.Unlock()
|
|
|
|
log.Warn().
|
|
Str("service", serviceName).
|
|
Str("host", host.DisplayName).
|
|
Float64("percentMissing", percentMissing).
|
|
Msg("Docker service alert raised")
|
|
}
|
|
|
|
func (m *Manager) clearDockerServiceAlert(resourceID string) {
|
|
alertID := fmt.Sprintf("docker-service-health-%s", resourceID)
|
|
m.clearAlert(alertID)
|
|
}
|
|
|
|
// HandleDockerHostOnline clears offline tracking and alerts for a Docker host.
|
|
func (m *Manager) HandleDockerHostOnline(host models.DockerHost) {
|
|
if host.ID == "" {
|
|
return
|
|
}
|
|
|
|
alertID := fmt.Sprintf("docker-host-offline-%s", host.ID)
|
|
|
|
m.mu.Lock()
|
|
delete(m.dockerOfflineCount, host.ID)
|
|
_, exists := m.activeAlerts[alertID]
|
|
m.mu.Unlock()
|
|
|
|
if exists {
|
|
m.clearAlert(alertID)
|
|
}
|
|
}
|
|
|
|
// HandleDockerHostRemoved clears all alerts and tracking when a Docker host is deleted.
|
|
func (m *Manager) HandleDockerHostRemoved(host models.DockerHost) {
|
|
if host.ID == "" {
|
|
return
|
|
}
|
|
|
|
// Reuse the online handler to clear offline alerts and tracking.
|
|
m.HandleDockerHostOnline(host)
|
|
// Drop any container alerts and host-scoped tracking entries.
|
|
m.clearDockerHostContainerAlerts(host)
|
|
}
|
|
|
|
// HandleDockerHostOffline raises an alert when a Docker host stops reporting.
|
|
func (m *Manager) HandleDockerHostOffline(host models.DockerHost) {
|
|
if host.ID == "" {
|
|
return
|
|
}
|
|
|
|
m.mu.RLock()
|
|
if !m.config.Enabled {
|
|
m.mu.RUnlock()
|
|
return
|
|
}
|
|
disableDockerHostsOffline := m.config.DisableAllDockerHostsOffline
|
|
m.mu.RUnlock()
|
|
|
|
alertID := fmt.Sprintf("docker-host-offline-%s", host.ID)
|
|
resourceID := fmt.Sprintf("docker:%s", strings.TrimSpace(host.ID))
|
|
instanceName := dockerInstanceName(host)
|
|
nodeName := strings.TrimSpace(host.Hostname)
|
|
|
|
if disableDockerHostsOffline {
|
|
m.mu.Lock()
|
|
delete(m.dockerOfflineCount, host.ID)
|
|
m.mu.Unlock()
|
|
m.clearAlert(alertID)
|
|
return
|
|
}
|
|
|
|
var disableConnectivity bool
|
|
m.mu.RLock()
|
|
if override, exists := m.config.Overrides[host.ID]; exists {
|
|
disableConnectivity = override.DisableConnectivity
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
if disableConnectivity {
|
|
m.clearAlert(alertID)
|
|
m.mu.Lock()
|
|
delete(m.dockerOfflineCount, host.ID)
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if alert, exists := m.activeAlerts[alertID]; exists && alert != nil {
|
|
alert.LastSeen = time.Now()
|
|
m.activeAlerts[alertID] = alert
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
m.dockerOfflineCount[host.ID]++
|
|
confirmations := m.dockerOfflineCount[host.ID]
|
|
const requiredConfirmations = 3
|
|
if confirmations < requiredConfirmations {
|
|
m.mu.Unlock()
|
|
log.Debug().
|
|
Str("dockerHost", host.DisplayName).
|
|
Str("hostID", host.ID).
|
|
Int("confirmations", confirmations).
|
|
Int("required", requiredConfirmations).
|
|
Msg("Docker host appears offline, awaiting confirmation")
|
|
return
|
|
}
|
|
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "docker-host-offline",
|
|
Level: AlertLevelCritical,
|
|
ResourceID: resourceID,
|
|
ResourceName: host.DisplayName,
|
|
Node: nodeName,
|
|
Instance: instanceName,
|
|
Message: fmt.Sprintf("Docker host '%s' is offline", host.DisplayName),
|
|
Value: 0,
|
|
Threshold: 0,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: map[string]interface{}{
|
|
"resourceType": "DockerHost",
|
|
"hostId": host.ID,
|
|
"hostname": host.Hostname,
|
|
"agentId": host.AgentID,
|
|
"displayName": host.DisplayName,
|
|
},
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
|
|
// Trigger AI analysis callback unconditionally
|
|
if m.onAlertForAI != nil {
|
|
alertCopy := alert.Clone()
|
|
go func(a *Alert) {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().Interface("panic", r).Str("alertID", a.ID).Msg("Panic in AI alert callback")
|
|
}
|
|
}()
|
|
m.onAlertForAI(a)
|
|
}(alertCopy)
|
|
}
|
|
|
|
if !m.checkRateLimit(alertID) {
|
|
m.mu.Unlock()
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
|
|
Msg("Docker host offline alert suppressed due to rate limit")
|
|
return
|
|
}
|
|
|
|
m.dispatchAlert(alert, false)
|
|
m.mu.Unlock()
|
|
|
|
log.Error().
|
|
Str("dockerHost", host.DisplayName).
|
|
Str("hostID", host.ID).
|
|
Str("hostname", host.Hostname).
|
|
Msg("CRITICAL: Docker host is offline")
|
|
|
|
m.clearDockerHostContainerAlerts(host)
|
|
}
|
|
|
|
func (m *Manager) checkDockerContainerState(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
|
|
alertID := fmt.Sprintf("docker-container-state-%s", resourceID)
|
|
stateKey := resourceID
|
|
|
|
m.mu.RLock()
|
|
override, hasOverride := m.config.Overrides[resourceID]
|
|
defaultDisable := m.config.DockerDefaults.StateDisableConnectivity
|
|
defaultSeverity := normalizePoweredOffSeverity(m.config.DockerDefaults.StatePoweredOffSeverity)
|
|
m.mu.RUnlock()
|
|
|
|
disableConnectivity := defaultDisable
|
|
severity := defaultSeverity
|
|
if hasOverride {
|
|
if defaultDisable && !override.DisableConnectivity {
|
|
disableConnectivity = false
|
|
} else if override.DisableConnectivity {
|
|
disableConnectivity = true
|
|
}
|
|
|
|
if override.PoweredOffSeverity != "" {
|
|
severity = normalizePoweredOffSeverity(override.PoweredOffSeverity)
|
|
}
|
|
}
|
|
|
|
if disableConnectivity {
|
|
m.clearDockerContainerStateAlert(resourceID)
|
|
return
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if alert, exists := m.activeAlerts[alertID]; exists && alert != nil {
|
|
alert.LastSeen = time.Now()
|
|
alert.Level = severity
|
|
if alert.Metadata == nil {
|
|
alert.Metadata = make(map[string]interface{})
|
|
}
|
|
alert.Metadata["state"] = container.State
|
|
alert.Metadata["status"] = container.Status
|
|
m.activeAlerts[alertID] = alert
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
m.dockerStateConfirm[stateKey]++
|
|
confirmations := m.dockerStateConfirm[stateKey]
|
|
const requiredConfirmations = 2
|
|
if confirmations < requiredConfirmations {
|
|
m.mu.Unlock()
|
|
log.Debug().
|
|
Str("container", containerName).
|
|
Str("host", host.DisplayName).
|
|
Str("state", container.State).
|
|
Int("confirmations", confirmations).
|
|
Int("required", requiredConfirmations).
|
|
Msg("Docker container state change detected, awaiting confirmation")
|
|
return
|
|
}
|
|
|
|
message := fmt.Sprintf("Docker container '%s' is %s", containerName, strings.TrimSpace(container.Status))
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "docker-container-state",
|
|
Level: severity,
|
|
ResourceID: resourceID,
|
|
ResourceName: containerName,
|
|
Node: nodeName,
|
|
Instance: instanceName,
|
|
Message: message,
|
|
Value: 0,
|
|
Threshold: 0,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: map[string]interface{}{
|
|
"resourceType": "Docker Container",
|
|
"hostId": host.ID,
|
|
"hostName": host.DisplayName,
|
|
"hostHostname": host.Hostname,
|
|
"containerId": container.ID,
|
|
"containerName": containerName,
|
|
"image": container.Image,
|
|
"state": container.State,
|
|
"status": container.Status,
|
|
},
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
m.dispatchAlert(alert, true)
|
|
m.mu.Unlock()
|
|
|
|
log.Warn().
|
|
Str("container", containerName).
|
|
Str("host", host.DisplayName).
|
|
Str("state", container.State).
|
|
Msg("Docker container state alert raised")
|
|
}
|
|
|
|
func (m *Manager) clearDockerContainerStateAlert(resourceID string) {
|
|
alertID := fmt.Sprintf("docker-container-state-%s", resourceID)
|
|
m.mu.Lock()
|
|
delete(m.dockerStateConfirm, resourceID)
|
|
m.mu.Unlock()
|
|
m.clearAlert(alertID)
|
|
}
|
|
|
|
func (m *Manager) checkDockerContainerHealth(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
|
|
health := strings.ToLower(strings.TrimSpace(container.Health))
|
|
if health == "" || health == "none" || health == "healthy" || health == "starting" {
|
|
m.clearDockerContainerHealthAlert(resourceID)
|
|
return
|
|
}
|
|
|
|
level := AlertLevelWarning
|
|
if health == "unhealthy" {
|
|
level = AlertLevelCritical
|
|
}
|
|
|
|
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "docker-container-health",
|
|
Level: level,
|
|
ResourceID: resourceID,
|
|
ResourceName: containerName,
|
|
Node: nodeName,
|
|
Instance: instanceName,
|
|
Message: fmt.Sprintf("Docker container '%s' health is %s", containerName, container.Health),
|
|
Value: 0,
|
|
Threshold: 0,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: map[string]interface{}{
|
|
"resourceType": "Docker Container",
|
|
"hostId": host.ID,
|
|
"hostName": host.DisplayName,
|
|
"hostHostname": host.Hostname,
|
|
"containerId": container.ID,
|
|
"containerName": containerName,
|
|
"image": container.Image,
|
|
"state": container.State,
|
|
"status": container.Status,
|
|
"health": container.Health,
|
|
},
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if existing, exists := m.activeAlerts[alertID]; exists && existing != nil {
|
|
alert.StartTime = existing.StartTime
|
|
}
|
|
m.preserveAlertState(alertID, alert)
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
m.dispatchAlert(alert, false)
|
|
m.mu.Unlock()
|
|
|
|
log.Warn().
|
|
Str("container", containerName).
|
|
Str("host", host.DisplayName).
|
|
Str("health", container.Health).
|
|
Msg("Docker container health alert raised")
|
|
}
|
|
|
|
func (m *Manager) clearDockerContainerHealthAlert(resourceID string) {
|
|
alertID := fmt.Sprintf("docker-container-health-%s", resourceID)
|
|
m.clearAlert(alertID)
|
|
}
|
|
|
|
// checkDockerContainerRestartLoop detects containers stuck in a restart loop
|
|
func (m *Manager) checkDockerContainerRestartLoop(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
|
|
alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
|
|
now := time.Now()
|
|
|
|
// Get config values with defaults
|
|
restartThreshold := m.config.DockerDefaults.RestartCount
|
|
if restartThreshold == 0 {
|
|
restartThreshold = 3 // Default: 3 restarts
|
|
}
|
|
timeWindow := m.config.DockerDefaults.RestartWindow
|
|
if timeWindow == 0 {
|
|
timeWindow = 300 // Default: 5 minutes (300 seconds)
|
|
}
|
|
|
|
m.mu.Lock()
|
|
|
|
record, exists := m.dockerRestartTracking[resourceID]
|
|
if !exists {
|
|
record = &dockerRestartRecord{
|
|
count: container.RestartCount,
|
|
lastCount: container.RestartCount,
|
|
times: []time.Time{},
|
|
lastChecked: now,
|
|
}
|
|
m.dockerRestartTracking[resourceID] = record
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
// If restart count increased, track it
|
|
if container.RestartCount > record.lastCount {
|
|
newRestarts := container.RestartCount - record.lastCount
|
|
for i := 0; i < newRestarts; i++ {
|
|
record.times = append(record.times, now)
|
|
}
|
|
record.lastCount = container.RestartCount
|
|
}
|
|
|
|
// Clean up old restart times outside the window
|
|
cutoff := now.Add(-time.Duration(timeWindow) * time.Second)
|
|
var recentRestarts []time.Time
|
|
for _, t := range record.times {
|
|
if t.After(cutoff) {
|
|
recentRestarts = append(recentRestarts, t)
|
|
}
|
|
}
|
|
record.times = recentRestarts
|
|
record.lastChecked = now
|
|
|
|
recentCount := len(record.times)
|
|
m.mu.Unlock()
|
|
|
|
// Check if we have a restart loop
|
|
if recentCount > restartThreshold {
|
|
level := AlertLevelCritical
|
|
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "docker-container-restart-loop",
|
|
Level: level,
|
|
ResourceID: resourceID,
|
|
ResourceName: containerName,
|
|
Node: nodeName,
|
|
Instance: instanceName,
|
|
Message: fmt.Sprintf("Docker container '%s' has restarted %d times in the last %d minutes (restart loop detected)", containerName, recentCount, timeWindow/60),
|
|
StartTime: now,
|
|
LastSeen: now,
|
|
Metadata: map[string]interface{}{
|
|
"hostId": host.ID,
|
|
"hostName": host.DisplayName,
|
|
"containerId": container.ID,
|
|
"containerName": containerName,
|
|
"image": container.Image,
|
|
"state": container.State,
|
|
"status": container.Status,
|
|
"restartCount": container.RestartCount,
|
|
"recentRestarts": recentCount,
|
|
},
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if existing, exists := m.activeAlerts[alertID]; exists && existing != nil {
|
|
alert.StartTime = existing.StartTime
|
|
}
|
|
m.preserveAlertState(alertID, alert)
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
m.dispatchAlert(alert, false)
|
|
m.mu.Unlock()
|
|
|
|
log.Warn().
|
|
Str("container", containerName).
|
|
Str("host", host.DisplayName).
|
|
Int("restarts", recentCount).
|
|
Msg("Docker container restart loop detected")
|
|
} else {
|
|
// Clear alert if restart loop has stopped
|
|
m.clearAlert(alertID)
|
|
}
|
|
}
|
|
|
|
// checkDockerContainerOOMKill detects when a container was killed due to out of memory
|
|
func (m *Manager) checkDockerContainerOOMKill(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
|
|
alertID := fmt.Sprintf("docker-container-oom-%s", resourceID)
|
|
|
|
// Exit code 137 means the container was killed by SIGKILL, often due to OOM
|
|
// Only alert if the container exited (not running) with exit code 137
|
|
state := strings.ToLower(strings.TrimSpace(container.State))
|
|
if (state == "exited" || state == "dead") && container.ExitCode == 137 {
|
|
m.mu.Lock()
|
|
lastExitCode, tracked := m.dockerLastExitCode[resourceID]
|
|
|
|
// Only alert if this is a new OOM kill (exit code changed to 137)
|
|
if !tracked || lastExitCode != 137 {
|
|
m.dockerLastExitCode[resourceID] = 137
|
|
m.mu.Unlock()
|
|
|
|
level := AlertLevelCritical
|
|
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "docker-container-oom-kill",
|
|
Level: level,
|
|
ResourceID: resourceID,
|
|
ResourceName: containerName,
|
|
Node: nodeName,
|
|
Instance: instanceName,
|
|
Message: fmt.Sprintf("Docker container '%s' was killed due to out of memory (OOM)", containerName),
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: map[string]interface{}{
|
|
"hostId": host.ID,
|
|
"hostName": host.DisplayName,
|
|
"containerId": container.ID,
|
|
"containerName": containerName,
|
|
"image": container.Image,
|
|
"state": container.State,
|
|
"status": container.Status,
|
|
"exitCode": container.ExitCode,
|
|
"memoryUsageBytes": container.MemoryUsage,
|
|
"memoryLimitBytes": container.MemoryLimit,
|
|
},
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if existing, exists := m.activeAlerts[alertID]; exists && existing != nil {
|
|
alert.StartTime = existing.StartTime
|
|
}
|
|
m.preserveAlertState(alertID, alert)
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
m.dispatchAlert(alert, false)
|
|
m.mu.Unlock()
|
|
|
|
log.Error().
|
|
Str("container", containerName).
|
|
Str("host", host.DisplayName).
|
|
Int64("memoryUsage", container.MemoryUsage).
|
|
Int64("memoryLimit", container.MemoryLimit).
|
|
Msg("Docker container OOM killed")
|
|
} else {
|
|
m.mu.Unlock()
|
|
}
|
|
} else {
|
|
// Update last exit code if it changed
|
|
if container.ExitCode != 0 {
|
|
m.mu.Lock()
|
|
m.dockerLastExitCode[resourceID] = container.ExitCode
|
|
m.mu.Unlock()
|
|
}
|
|
// Clear OOM alert if container is running or exited with different code
|
|
m.clearAlert(alertID)
|
|
}
|
|
}
|
|
|
|
// checkDockerContainerMemoryLimit alerts when container approaches its memory limit
|
|
func (m *Manager) checkDockerContainerMemoryLimit(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
|
|
// Only check if container is running and has a memory limit
|
|
state := strings.ToLower(strings.TrimSpace(container.State))
|
|
if state != "running" || container.MemoryLimit <= 0 {
|
|
return
|
|
}
|
|
|
|
alertID := fmt.Sprintf("docker-container-memory-limit-%s", resourceID)
|
|
|
|
// Get config values with defaults
|
|
warnThreshold := float64(m.config.DockerDefaults.MemoryWarnPct)
|
|
if warnThreshold == 0 {
|
|
warnThreshold = 90.0 // Default: 90%
|
|
}
|
|
criticalThreshold := float64(m.config.DockerDefaults.MemoryCriticalPct)
|
|
if criticalThreshold == 0 {
|
|
criticalThreshold = 95.0 // Default: 95%
|
|
}
|
|
|
|
// Calculate percentage of limit used
|
|
limitPercent := (float64(container.MemoryUsage) / float64(container.MemoryLimit)) * 100
|
|
|
|
if limitPercent >= warnThreshold {
|
|
level := AlertLevelWarning
|
|
if limitPercent >= criticalThreshold {
|
|
level = AlertLevelCritical
|
|
}
|
|
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "docker-container-memory-limit",
|
|
Level: level,
|
|
ResourceID: resourceID,
|
|
ResourceName: containerName,
|
|
Node: nodeName,
|
|
Instance: instanceName,
|
|
Message: fmt.Sprintf("Docker container '%s' is using %.1f%% of its memory limit (%d MB / %d MB)", containerName, limitPercent, container.MemoryUsage/(1024*1024), container.MemoryLimit/(1024*1024)),
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: map[string]interface{}{
|
|
"hostId": host.ID,
|
|
"hostName": host.DisplayName,
|
|
"containerId": container.ID,
|
|
"containerName": containerName,
|
|
"image": container.Image,
|
|
"memoryUsageBytes": container.MemoryUsage,
|
|
"memoryLimitBytes": container.MemoryLimit,
|
|
"limitPercent": limitPercent,
|
|
},
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if existing, exists := m.activeAlerts[alertID]; exists && existing != nil {
|
|
alert.StartTime = existing.StartTime
|
|
existing.LastSeen = time.Now()
|
|
existing.Level = level
|
|
existing.Message = alert.Message
|
|
existing.Metadata = alert.Metadata
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
m.preserveAlertState(alertID, alert)
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
m.dispatchAlert(alert, false)
|
|
m.mu.Unlock()
|
|
|
|
log.Warn().
|
|
Str("container", containerName).
|
|
Str("host", host.DisplayName).
|
|
Float64("limitPercent", limitPercent).
|
|
Msg("Docker container approaching memory limit")
|
|
} else {
|
|
// Clear alert if below warning threshold minus 5% (hysteresis)
|
|
clearThreshold := warnThreshold - 5
|
|
if limitPercent < clearThreshold {
|
|
m.clearAlert(alertID)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *Manager) clearDockerContainerMetricAlerts(resourceID string, metrics ...string) {
|
|
if len(metrics) == 0 {
|
|
metrics = []string{"cpu", "memory", "disk"}
|
|
}
|
|
for _, metric := range metrics {
|
|
alertID := fmt.Sprintf("%s-%s", resourceID, metric)
|
|
m.clearAlert(alertID)
|
|
}
|
|
}
|
|
|
|
func (m *Manager) clearDockerContainerUpdateTracking(resourceID, trackingKey string) {
|
|
m.mu.Lock()
|
|
delete(m.dockerUpdateFirstSeen, resourceID)
|
|
if trackingKey != "" {
|
|
delete(m.dockerUpdateFirstSeenByIdentity, trackingKey)
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
func dockerUpdateTrackingKeyFromAlert(alert *Alert) string {
|
|
if alert == nil || alert.Metadata == nil {
|
|
return ""
|
|
}
|
|
|
|
host := models.DockerHost{
|
|
ID: strings.TrimSpace(fmt.Sprint(alert.Metadata["hostId"])),
|
|
DisplayName: strings.TrimSpace(fmt.Sprint(alert.Metadata["hostName"])),
|
|
Hostname: strings.TrimSpace(fmt.Sprint(alert.Metadata["hostHostname"])),
|
|
}
|
|
container := models.DockerContainer{
|
|
ID: strings.TrimSpace(fmt.Sprint(alert.Metadata["containerId"])),
|
|
Name: strings.TrimSpace(fmt.Sprint(alert.Metadata["containerName"])),
|
|
Image: strings.TrimSpace(fmt.Sprint(alert.Metadata["image"])),
|
|
}
|
|
|
|
if host.ID == "" && host.DisplayName == "" && host.Hostname == "" &&
|
|
container.ID == "" && container.Name == "" && container.Image == "" {
|
|
return ""
|
|
}
|
|
|
|
return dockerUpdateTrackingKey(host, container)
|
|
}
|
|
|
|
func (m *Manager) clearDockerContainerUpdateStateLocked(alert *Alert) {
|
|
if alert == nil {
|
|
return
|
|
}
|
|
|
|
if alert.ResourceID != "" {
|
|
delete(m.dockerUpdateFirstSeen, alert.ResourceID)
|
|
}
|
|
if trackingKey := dockerUpdateTrackingKeyFromAlert(alert); trackingKey != "" {
|
|
delete(m.dockerUpdateFirstSeenByIdentity, trackingKey)
|
|
}
|
|
}
|
|
|
|
func (m *Manager) clearDockerContainerUpdateAlertsLocked() {
|
|
toClear := make([]string, 0)
|
|
for alertID, alert := range m.activeAlerts {
|
|
if alert == nil {
|
|
continue
|
|
}
|
|
if alert.Type != "docker-container-update" && !strings.HasPrefix(alertID, "docker-container-update-") {
|
|
continue
|
|
}
|
|
m.clearDockerContainerUpdateStateLocked(alert)
|
|
toClear = append(toClear, alertID)
|
|
}
|
|
for _, alertID := range toClear {
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
}
|
|
|
|
func (m *Manager) shouldResolveDockerContainerUpdateAlertLocked(alert *Alert) bool {
|
|
if alert == nil {
|
|
return false
|
|
}
|
|
|
|
if m.config.DisableAllDockerContainers || m.config.DockerDefaults.UpdateAlertDelayHours < 0 {
|
|
m.clearDockerContainerUpdateStateLocked(alert)
|
|
return true
|
|
}
|
|
|
|
if override, exists := m.config.Overrides[alert.ResourceID]; exists && override.Disabled {
|
|
m.clearDockerContainerUpdateStateLocked(alert)
|
|
return true
|
|
}
|
|
|
|
containerName := strings.TrimSpace(alert.ResourceName)
|
|
containerID := ""
|
|
if alert.Metadata != nil {
|
|
if value, ok := alert.Metadata["containerName"].(string); ok && containerName == "" {
|
|
containerName = value
|
|
}
|
|
if value, ok := alert.Metadata["containerId"].(string); ok {
|
|
containerID = value
|
|
}
|
|
}
|
|
if matchesDockerIgnoredPrefix(containerName, containerID, m.config.DockerIgnoredContainerPrefixes) {
|
|
m.clearDockerContainerUpdateStateLocked(alert)
|
|
return true
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// checkDockerContainerImageUpdate checks if an image update has been pending for too long
|
|
func (m *Manager) checkDockerContainerImageUpdate(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
|
|
alertID := fmt.Sprintf("docker-container-update-%s", resourceID)
|
|
updateTrackingKey := dockerUpdateTrackingKey(host, container)
|
|
|
|
// Check if update detection is enabled
|
|
m.mu.RLock()
|
|
delayHours := m.config.DockerDefaults.UpdateAlertDelayHours
|
|
m.mu.RUnlock()
|
|
|
|
// Negative value means disabled
|
|
if delayHours < 0 {
|
|
m.clearAlert(alertID)
|
|
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
|
|
return
|
|
}
|
|
|
|
// Check if this container has an update status reported
|
|
if container.UpdateStatus == nil {
|
|
// No update status - clear any tracking and alerts
|
|
m.clearAlert(alertID)
|
|
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
|
|
return
|
|
}
|
|
|
|
// Check for errors in update detection (don't alert on errors)
|
|
if container.UpdateStatus.Error != "" {
|
|
// Update check failed - clear alert but keep tracking
|
|
m.clearAlert(alertID)
|
|
return
|
|
}
|
|
|
|
// Check if an update is available
|
|
if !container.UpdateStatus.UpdateAvailable {
|
|
// No update available - clear tracking and alert
|
|
m.clearAlert(alertID)
|
|
m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
|
|
return
|
|
}
|
|
|
|
// Update is available - track when we first saw it
|
|
m.mu.Lock()
|
|
firstSeen, exists := m.dockerUpdateFirstSeenByIdentity[updateTrackingKey]
|
|
if !exists {
|
|
firstSeen, exists = m.dockerUpdateFirstSeen[resourceID]
|
|
}
|
|
if !exists {
|
|
firstSeen = time.Now()
|
|
}
|
|
m.dockerUpdateFirstSeen[resourceID] = firstSeen
|
|
m.dockerUpdateFirstSeenByIdentity[updateTrackingKey] = firstSeen
|
|
m.mu.Unlock()
|
|
|
|
// Check if we've exceeded the delay threshold
|
|
pendingDuration := time.Since(firstSeen)
|
|
threshold := time.Duration(delayHours) * time.Hour
|
|
if pendingDuration < threshold {
|
|
// Not yet time to alert
|
|
log.Debug().
|
|
Str("container", containerName).
|
|
Str("host", host.DisplayName).
|
|
Str("image", container.Image).
|
|
Dur("pending", pendingDuration).
|
|
Dur("threshold", threshold).
|
|
Msg("Container update pending but below alert threshold")
|
|
return
|
|
}
|
|
|
|
// Create or update the alert
|
|
pendingHours := int(pendingDuration.Hours())
|
|
message := fmt.Sprintf("Docker container '%s' has an image update available for %d hours", containerName, pendingHours)
|
|
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "docker-container-update",
|
|
Level: AlertLevelWarning,
|
|
ResourceID: resourceID,
|
|
ResourceName: containerName,
|
|
Node: nodeName,
|
|
Instance: instanceName,
|
|
Message: message,
|
|
StartTime: firstSeen,
|
|
LastSeen: time.Now(),
|
|
Metadata: map[string]interface{}{
|
|
"resourceType": "Docker Container",
|
|
"hostId": host.ID,
|
|
"hostName": host.DisplayName,
|
|
"hostHostname": host.Hostname,
|
|
"containerId": container.ID,
|
|
"containerName": containerName,
|
|
"image": container.Image,
|
|
"currentDigest": container.UpdateStatus.CurrentDigest,
|
|
"latestDigest": container.UpdateStatus.LatestDigest,
|
|
"lastChecked": container.UpdateStatus.LastChecked,
|
|
"firstSeen": firstSeen,
|
|
"pendingHours": pendingHours,
|
|
"thresholdHours": delayHours,
|
|
},
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if existing, ok := m.activeAlerts[alertID]; ok && existing != nil {
|
|
// Update existing alert
|
|
existing.LastSeen = time.Now()
|
|
existing.Message = message
|
|
existing.Metadata = alert.Metadata
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
m.preserveAlertState(alertID, alert)
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
m.dispatchAlert(alert, false)
|
|
m.mu.Unlock()
|
|
|
|
log.Warn().
|
|
Str("container", containerName).
|
|
Str("host", host.DisplayName).
|
|
Str("image", container.Image).
|
|
Int("pendingHours", pendingHours).
|
|
Msg("Docker container has pending image update")
|
|
}
|
|
|
|
func (m *Manager) cleanupDockerContainerAlerts(host models.DockerHost, seen map[string]struct{}) {
|
|
m.cleanupDockerContainerAlertsWithTracking(host, seen, nil)
|
|
}
|
|
|
|
func (m *Manager) cleanupDockerContainerAlertsWithTracking(host models.DockerHost, seen map[string]struct{}, seenUpdateTracking map[string]struct{}) {
|
|
prefix := fmt.Sprintf("docker:%s/", strings.TrimSpace(host.ID))
|
|
updateTrackingPrefix := dockerUpdateTrackingHostPrefix(host)
|
|
|
|
m.mu.Lock()
|
|
toClear := make([]string, 0)
|
|
for alertID, alert := range m.activeAlerts {
|
|
if !strings.HasPrefix(alert.ResourceID, prefix) {
|
|
continue
|
|
}
|
|
if _, exists := seen[alert.ResourceID]; exists {
|
|
continue
|
|
}
|
|
toClear = append(toClear, alertID)
|
|
}
|
|
for resourceID := range m.dockerStateConfirm {
|
|
if strings.HasPrefix(resourceID, prefix) {
|
|
if _, exists := seen[resourceID]; !exists {
|
|
delete(m.dockerStateConfirm, resourceID)
|
|
}
|
|
}
|
|
}
|
|
// Cleanup update tracking for removed containers
|
|
for resourceID := range m.dockerUpdateFirstSeen {
|
|
if strings.HasPrefix(resourceID, prefix) {
|
|
if _, exists := seen[resourceID]; !exists {
|
|
delete(m.dockerUpdateFirstSeen, resourceID)
|
|
}
|
|
}
|
|
}
|
|
if seenUpdateTracking != nil {
|
|
for trackingKey := range m.dockerUpdateFirstSeenByIdentity {
|
|
if !strings.HasPrefix(trackingKey, updateTrackingPrefix) {
|
|
continue
|
|
}
|
|
if _, exists := seenUpdateTracking[trackingKey]; !exists {
|
|
delete(m.dockerUpdateFirstSeenByIdentity, trackingKey)
|
|
}
|
|
}
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
for _, alertID := range toClear {
|
|
m.clearAlert(alertID)
|
|
}
|
|
}
|
|
|
|
func (m *Manager) clearDockerHostContainerAlerts(host models.DockerHost) {
|
|
prefix := fmt.Sprintf("docker:%s/", strings.TrimSpace(host.ID))
|
|
updateTrackingPrefix := dockerUpdateTrackingHostPrefix(host)
|
|
|
|
m.mu.Lock()
|
|
toClear := make([]string, 0)
|
|
for alertID, alert := range m.activeAlerts {
|
|
if strings.HasPrefix(alert.ResourceID, prefix) {
|
|
toClear = append(toClear, alertID)
|
|
}
|
|
}
|
|
for resourceID := range m.dockerStateConfirm {
|
|
if strings.HasPrefix(resourceID, prefix) {
|
|
delete(m.dockerStateConfirm, resourceID)
|
|
}
|
|
}
|
|
for resourceID := range m.dockerRestartTracking {
|
|
if strings.HasPrefix(resourceID, prefix) {
|
|
delete(m.dockerRestartTracking, resourceID)
|
|
}
|
|
}
|
|
for resourceID := range m.dockerLastExitCode {
|
|
if strings.HasPrefix(resourceID, prefix) {
|
|
delete(m.dockerLastExitCode, resourceID)
|
|
}
|
|
}
|
|
for resourceID := range m.dockerUpdateFirstSeen {
|
|
if strings.HasPrefix(resourceID, prefix) {
|
|
delete(m.dockerUpdateFirstSeen, resourceID)
|
|
}
|
|
}
|
|
for trackingKey := range m.dockerUpdateFirstSeenByIdentity {
|
|
if strings.HasPrefix(trackingKey, updateTrackingPrefix) {
|
|
delete(m.dockerUpdateFirstSeenByIdentity, trackingKey)
|
|
}
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
for _, alertID := range toClear {
|
|
m.clearAlert(alertID)
|
|
}
|
|
}
|
|
|
|
// CheckStorage checks storage against thresholds
|
|
func (m *Manager) CheckStorage(storage models.Storage) {
|
|
m.mu.RLock()
|
|
if !m.config.Enabled {
|
|
m.mu.RUnlock()
|
|
return
|
|
}
|
|
if m.config.DisableAllStorage {
|
|
m.mu.RUnlock()
|
|
// Clear any existing storage alerts when all storage alerts are disabled
|
|
m.mu.Lock()
|
|
usageAlertID := fmt.Sprintf("%s-usage", storage.ID)
|
|
if _, exists := m.activeAlerts[usageAlertID]; exists {
|
|
m.clearAlertNoLock(usageAlertID)
|
|
log.Info().
|
|
Str("alertID", usageAlertID).
|
|
Str("storage", storage.Name).
|
|
Msg("Cleared usage alert - all storage alerts disabled")
|
|
}
|
|
offlineAlertID := fmt.Sprintf("storage-offline-%s", storage.ID)
|
|
if _, exists := m.activeAlerts[offlineAlertID]; exists {
|
|
m.clearAlertNoLock(offlineAlertID)
|
|
log.Info().
|
|
Str("alertID", offlineAlertID).
|
|
Str("storage", storage.Name).
|
|
Msg("Cleared offline alert - all storage alerts disabled")
|
|
}
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
// Check if there's an override for this storage device. Shared storage used
|
|
// to be keyed per reporting node before #1049 switched it to a stable
|
|
// cluster-wide ID, so we still honor legacy per-node override keys.
|
|
override, hasOverride, _ := findStorageOverride(m.config.Overrides, storage)
|
|
threshold := m.config.StorageDefault
|
|
|
|
// Apply override if it exists for usage threshold
|
|
if hasOverride && override.Usage != nil {
|
|
threshold = *override.Usage
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
// Check if storage is truly offline/unavailable (not just inactive from other nodes)
|
|
// Note: In a cluster, local storage from other nodes shows as inactive which is normal
|
|
if storage.Status == "offline" || storage.Status == "unavailable" {
|
|
m.checkStorageOffline(storage)
|
|
} else {
|
|
// Clear any existing offline alert if storage is back online
|
|
m.clearStorageOfflineAlert(storage)
|
|
}
|
|
|
|
// If alerts are disabled for this storage device, clear any existing alerts and return
|
|
if hasOverride && override.Disabled {
|
|
m.mu.Lock()
|
|
// Clear usage alert
|
|
usageAlertID := fmt.Sprintf("%s-usage", storage.ID)
|
|
if _, exists := m.activeAlerts[usageAlertID]; exists {
|
|
m.clearAlertNoLock(usageAlertID)
|
|
log.Info().
|
|
Str("alertID", usageAlertID).
|
|
Str("storage", storage.Name).
|
|
Msg("Cleared usage alert - storage has alerts disabled")
|
|
}
|
|
// Clear offline alert
|
|
offlineAlertID := fmt.Sprintf("storage-offline-%s", storage.ID)
|
|
if _, exists := m.activeAlerts[offlineAlertID]; exists {
|
|
m.clearAlertNoLock(offlineAlertID)
|
|
log.Info().
|
|
Str("alertID", offlineAlertID).
|
|
Str("storage", storage.Name).
|
|
Msg("Cleared offline alert - storage has alerts disabled")
|
|
}
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
// Check usage if storage has valid data (even if not currently active on this node)
|
|
// In clusters, storage may show as inactive on nodes where it's not currently mounted
|
|
// but we still want to alert on high usage
|
|
log.Debug().
|
|
Str("storage", storage.Name).
|
|
Str("id", storage.ID).
|
|
Float64("usage", storage.Usage).
|
|
Str("status", storage.Status).
|
|
Float64("trigger", threshold.Trigger).
|
|
Float64("clear", threshold.Clear).
|
|
Bool("hasOverride", hasOverride).
|
|
Msg("Checking storage thresholds")
|
|
|
|
// Check usage if storage is online - checkMetric will skip if threshold is nil or <= 0
|
|
if storage.Status != "offline" && storage.Status != "unavailable" && storage.Usage > 0 {
|
|
m.checkMetric(storage.ID, storage.Name, storage.Node, storage.Instance, "Storage", "usage", storage.Usage, &threshold, nil)
|
|
}
|
|
|
|
// Check ZFS pool status if this is ZFS storage
|
|
if storage.ZFSPool != nil {
|
|
m.checkZFSPoolHealth(storage)
|
|
}
|
|
}
|
|
|
|
// BuildGuestKey returns the canonical "instance:node:vmid" key for a guest.
// Surrounding whitespace is stripped from instance and node, and a blank
// instance falls back to the node name.
// This matches the format used by makeGuestID in the monitoring package.
func BuildGuestKey(instance, node string, vmid int) string {
	trimmedNode := strings.TrimSpace(node)
	owner := strings.TrimSpace(instance)
	if owner == "" {
		owner = trimmedNode
	}
	return strings.Join([]string{owner, trimmedNode, fmt.Sprint(vmid)}, ":")
}
|
|
|
|
func storageOverrideLookupKeys(storage models.Storage) []string {
|
|
keys := make([]string, 0, 1+len(storage.NodeIDs)+len(storage.Nodes))
|
|
seen := make(map[string]struct{})
|
|
|
|
addKey := func(key string) {
|
|
key = strings.TrimSpace(key)
|
|
if key == "" {
|
|
return
|
|
}
|
|
if _, exists := seen[key]; exists {
|
|
return
|
|
}
|
|
seen[key] = struct{}{}
|
|
keys = append(keys, key)
|
|
}
|
|
|
|
addKey(storage.ID)
|
|
|
|
if !storage.Shared {
|
|
return keys
|
|
}
|
|
|
|
name := strings.TrimSpace(storage.Name)
|
|
if name == "" {
|
|
return keys
|
|
}
|
|
|
|
for _, nodeID := range storage.NodeIDs {
|
|
nodeID = strings.TrimSpace(nodeID)
|
|
if nodeID == "" {
|
|
continue
|
|
}
|
|
addKey(fmt.Sprintf("%s-%s", nodeID, name))
|
|
}
|
|
|
|
instance := strings.TrimSpace(storage.Instance)
|
|
for _, node := range storage.Nodes {
|
|
node = strings.TrimSpace(node)
|
|
if node == "" || strings.EqualFold(node, "cluster") {
|
|
continue
|
|
}
|
|
|
|
prefix := node
|
|
if instance != "" && instance != node {
|
|
prefix = fmt.Sprintf("%s-%s", instance, node)
|
|
}
|
|
addKey(fmt.Sprintf("%s-%s", prefix, name))
|
|
}
|
|
|
|
return keys
|
|
}
|
|
|
|
func findStorageOverride(overrides map[string]ThresholdConfig, storage models.Storage) (ThresholdConfig, bool, string) {
|
|
for _, key := range storageOverrideLookupKeys(storage) {
|
|
override, exists := overrides[key]
|
|
if exists {
|
|
return override, true, key
|
|
}
|
|
}
|
|
return ThresholdConfig{}, false, ""
|
|
}
|
|
|
|
// CheckSnapshotsForInstance evaluates guest snapshots for age-based alerts.
|
|
func (m *Manager) CheckSnapshotsForInstance(instanceName string, snapshots []models.GuestSnapshot, guestNames map[string]string) {
|
|
m.mu.RLock()
|
|
enabled := m.config.Enabled
|
|
snapshotCfg := m.config.SnapshotDefaults
|
|
m.mu.RUnlock()
|
|
|
|
if !enabled {
|
|
return
|
|
}
|
|
|
|
if !snapshotCfg.Enabled {
|
|
m.clearSnapshotAlertsForInstance(instanceName)
|
|
return
|
|
}
|
|
|
|
now := time.Now()
|
|
validAlerts := make(map[string]struct{})
|
|
|
|
for _, snapshot := range snapshots {
|
|
if instanceName != "" && snapshot.Instance != "" && snapshot.Instance != instanceName {
|
|
continue
|
|
}
|
|
if snapshot.Time.IsZero() {
|
|
continue
|
|
}
|
|
|
|
ageHours := now.Sub(snapshot.Time).Hours()
|
|
if ageHours < 0 {
|
|
continue
|
|
}
|
|
ageDays := ageHours / 24
|
|
|
|
const gib = 1024.0 * 1024 * 1024
|
|
sizeGiB := 0.0
|
|
if snapshot.SizeBytes > 0 {
|
|
sizeGiB = float64(snapshot.SizeBytes) / gib
|
|
}
|
|
|
|
// Determine thresholds for this snapshot
|
|
resourceID := fmt.Sprintf("%s:%s:%d", snapshot.Instance, snapshot.Node, snapshot.VMID)
|
|
m.mu.RLock()
|
|
gh := m.getGuestThresholds(nil, resourceID)
|
|
m.mu.RUnlock()
|
|
|
|
if gh.Disabled {
|
|
continue
|
|
}
|
|
|
|
currentSnapshotCfg := snapshotCfg
|
|
if gh.Snapshot != nil {
|
|
currentSnapshotCfg = *gh.Snapshot
|
|
}
|
|
|
|
if !currentSnapshotCfg.Enabled {
|
|
continue
|
|
}
|
|
|
|
var ageLevel AlertLevel
|
|
var ageThreshold int
|
|
var sizeLevel AlertLevel
|
|
var sizeThreshold float64
|
|
var triggeredStats []string
|
|
|
|
if currentSnapshotCfg.CriticalDays > 0 && ageDays >= float64(currentSnapshotCfg.CriticalDays) {
|
|
ageLevel = AlertLevelCritical
|
|
ageThreshold = currentSnapshotCfg.CriticalDays
|
|
triggeredStats = append(triggeredStats, "age")
|
|
} else if currentSnapshotCfg.WarningDays > 0 && ageDays >= float64(currentSnapshotCfg.WarningDays) {
|
|
ageLevel = AlertLevelWarning
|
|
ageThreshold = currentSnapshotCfg.WarningDays
|
|
triggeredStats = append(triggeredStats, "age")
|
|
}
|
|
|
|
if snapshot.SizeBytes > 0 {
|
|
if currentSnapshotCfg.CriticalSizeGiB > 0 && sizeGiB >= currentSnapshotCfg.CriticalSizeGiB {
|
|
sizeLevel = AlertLevelCritical
|
|
sizeThreshold = currentSnapshotCfg.CriticalSizeGiB
|
|
triggeredStats = append(triggeredStats, "size")
|
|
} else if currentSnapshotCfg.WarningSizeGiB > 0 && sizeGiB >= currentSnapshotCfg.WarningSizeGiB {
|
|
sizeLevel = AlertLevelWarning
|
|
sizeThreshold = currentSnapshotCfg.WarningSizeGiB
|
|
triggeredStats = append(triggeredStats, "size")
|
|
}
|
|
}
|
|
|
|
if ageLevel == "" && sizeLevel == "" {
|
|
continue
|
|
}
|
|
|
|
var level AlertLevel
|
|
switch {
|
|
case ageLevel == AlertLevelCritical || sizeLevel == AlertLevelCritical:
|
|
level = AlertLevelCritical
|
|
case ageLevel == AlertLevelWarning || sizeLevel == AlertLevelWarning:
|
|
level = AlertLevelWarning
|
|
default:
|
|
continue
|
|
}
|
|
|
|
useSizePrimary := false
|
|
if sizeLevel == AlertLevelCritical && ageLevel != AlertLevelCritical {
|
|
useSizePrimary = true
|
|
} else if sizeLevel != "" && ageLevel == "" {
|
|
useSizePrimary = true
|
|
}
|
|
|
|
alertID := fmt.Sprintf("snapshot-age-%s", snapshot.ID)
|
|
validAlerts[alertID] = struct{}{}
|
|
|
|
guestKey := BuildGuestKey(snapshot.Instance, snapshot.Node, snapshot.VMID)
|
|
guestName := strings.TrimSpace(guestNames[guestKey])
|
|
|
|
guestType := "VM"
|
|
if strings.EqualFold(snapshot.Type, "lxc") {
|
|
guestType = "Container"
|
|
}
|
|
|
|
if guestName == "" {
|
|
switch guestType {
|
|
case "Container":
|
|
guestName = fmt.Sprintf("CT %d", snapshot.VMID)
|
|
default:
|
|
guestName = fmt.Sprintf("VM %d", snapshot.VMID)
|
|
}
|
|
}
|
|
|
|
snapshotName := strings.TrimSpace(snapshot.Name)
|
|
if snapshotName == "" {
|
|
snapshotName = "(unnamed)"
|
|
}
|
|
|
|
ageDaysRounded := math.Round(ageDays*10) / 10
|
|
sizeGiBRounded := math.Round(sizeGiB*10) / 10
|
|
reasons := make([]string, 0, 2)
|
|
if ageLevel != "" {
|
|
reasons = append(reasons, fmt.Sprintf("%.1f days old (threshold %d days)", ageDaysRounded, ageThreshold))
|
|
}
|
|
if sizeLevel != "" {
|
|
reasons = append(reasons, fmt.Sprintf("%.1f GiB (threshold %.1f GiB)", sizeGiBRounded, sizeThreshold))
|
|
}
|
|
reasonText := strings.Join(reasons, " and ")
|
|
message := fmt.Sprintf(
|
|
"%s snapshot '%s' for %s is %s on %s",
|
|
guestType,
|
|
snapshotName,
|
|
guestName,
|
|
reasonText,
|
|
snapshot.Node,
|
|
)
|
|
|
|
alertValue := ageDays
|
|
alertThreshold := float64(ageThreshold)
|
|
thresholdTime := now
|
|
if useSizePrimary {
|
|
alertValue = sizeGiB
|
|
alertThreshold = sizeThreshold
|
|
} else if ageThreshold > 0 {
|
|
thresholdTime = snapshot.Time.Add(time.Duration(ageThreshold) * 24 * time.Hour)
|
|
if thresholdTime.After(now) {
|
|
thresholdTime = now
|
|
}
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"snapshotName": snapshot.Name,
|
|
"snapshotCreatedAt": snapshot.Time,
|
|
"snapshotAgeDays": ageDays,
|
|
"snapshotAgeHours": ageHours,
|
|
"snapshotSizeBytes": snapshot.SizeBytes,
|
|
"snapshotSizeGiB": sizeGiB,
|
|
"guestName": guestName,
|
|
"guestType": guestType,
|
|
"guestInstance": snapshot.Instance,
|
|
"guestNode": snapshot.Node,
|
|
"guestVmid": snapshot.VMID,
|
|
"triggeredMetrics": triggeredStats,
|
|
"primaryMetric": "age",
|
|
}
|
|
if useSizePrimary {
|
|
metadata["primaryMetric"] = "size"
|
|
}
|
|
if ageLevel != "" {
|
|
metadata["thresholdDays"] = ageThreshold
|
|
}
|
|
if sizeLevel != "" {
|
|
metadata["thresholdSizeGiB"] = sizeThreshold
|
|
}
|
|
|
|
resourceName := fmt.Sprintf("%s snapshot '%s'", guestName, snapshotName)
|
|
|
|
m.mu.Lock()
|
|
if existing, exists := m.activeAlerts[alertID]; exists {
|
|
existing.LastSeen = now
|
|
existing.Level = level
|
|
existing.Value = alertValue
|
|
existing.Threshold = alertThreshold
|
|
existing.Message = message
|
|
existing.ResourceName = resourceName
|
|
if existing.Metadata == nil {
|
|
existing.Metadata = make(map[string]interface{})
|
|
}
|
|
for k, v := range metadata {
|
|
existing.Metadata[k] = v
|
|
}
|
|
m.mu.Unlock()
|
|
continue
|
|
}
|
|
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "snapshot-age",
|
|
Level: level,
|
|
ResourceID: snapshot.ID,
|
|
ResourceName: resourceName,
|
|
Node: snapshot.Node,
|
|
Instance: snapshot.Instance,
|
|
Message: message,
|
|
Value: alertValue,
|
|
Threshold: alertThreshold,
|
|
StartTime: thresholdTime,
|
|
LastSeen: now,
|
|
Metadata: metadata,
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
|
|
go func() {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (snapshot)")
|
|
}
|
|
}()
|
|
if err := m.SaveActiveAlerts(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to save active alerts after snapshot alert creation")
|
|
}
|
|
}()
|
|
|
|
if !m.checkRateLimit(alertID) {
|
|
m.mu.Unlock()
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Str("guest", guestName).
|
|
Msg("Snapshot alert suppressed due to rate limit")
|
|
continue
|
|
}
|
|
|
|
if m.onAlert != nil {
|
|
nowCopy := now
|
|
alert.LastNotified = &nowCopy
|
|
if m.dispatchAlert(alert, true) {
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Str("guest", guestName).
|
|
Msg("Snapshot age alert dispatched")
|
|
} else {
|
|
alert.LastNotified = nil
|
|
}
|
|
} else {
|
|
log.Warn().
|
|
Str("alertID", alertID).
|
|
Msg("Snapshot age alert created but no onAlert callback set")
|
|
}
|
|
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
m.mu.Lock()
|
|
for alertID, alert := range m.activeAlerts {
|
|
if alert == nil || alert.Type != "snapshot-age" {
|
|
continue
|
|
}
|
|
if instanceName != "" && alert.Instance != instanceName {
|
|
continue
|
|
}
|
|
if _, ok := validAlerts[alertID]; ok {
|
|
continue
|
|
}
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
// CheckBackups evaluates storage, PBS, and PMG backups for age-based alerts.
|
|
func (m *Manager) CheckBackups(
|
|
storageBackups []models.StorageBackup,
|
|
pbsBackups []models.PBSBackup,
|
|
pmgBackups []models.PMGBackup,
|
|
guestsByKey map[string]GuestLookup,
|
|
guestsByVMID map[string][]GuestLookup,
|
|
templateInventoryReady map[string]bool,
|
|
) {
|
|
m.mu.RLock()
|
|
enabled := m.config.Enabled
|
|
backupCfg := m.config.BackupDefaults
|
|
m.mu.RUnlock()
|
|
|
|
if backupCfg.AlertOrphaned == nil {
|
|
alertOrphaned := true
|
|
backupCfg.AlertOrphaned = &alertOrphaned
|
|
}
|
|
|
|
if !enabled || !backupCfg.Enabled {
|
|
m.clearBackupAlerts()
|
|
return
|
|
}
|
|
|
|
if backupCfg.WarningDays <= 0 && backupCfg.CriticalDays <= 0 {
|
|
if backupCfg.AlertOrphaned == nil || !*backupCfg.AlertOrphaned {
|
|
m.clearBackupAlerts()
|
|
return
|
|
}
|
|
}
|
|
|
|
type backupRecord struct {
|
|
key string
|
|
vmid string
|
|
lookup GuestLookup
|
|
fallbackName string
|
|
instance string
|
|
node string
|
|
source string
|
|
storage string
|
|
datastore string
|
|
backupType string
|
|
filename string
|
|
lastTime time.Time
|
|
}
|
|
|
|
records := make(map[string]*backupRecord)
|
|
|
|
updateRecord := func(key string, candidate backupRecord) {
|
|
if key == "" {
|
|
return
|
|
}
|
|
if existing, ok := records[key]; ok {
|
|
if candidate.lastTime.After(existing.lastTime) {
|
|
*existing = candidate
|
|
}
|
|
return
|
|
}
|
|
record := candidate
|
|
records[key] = &record
|
|
}
|
|
|
|
now := time.Now()
|
|
|
|
for _, backup := range storageBackups {
|
|
if backup.Time.IsZero() {
|
|
continue
|
|
}
|
|
|
|
key := BuildGuestKey(backup.Instance, backup.Node, backup.VMID)
|
|
vmid := ""
|
|
if backup.VMID > 0 {
|
|
vmid = strconv.Itoa(backup.VMID)
|
|
}
|
|
info := guestsByKey[key]
|
|
displayName := info.Name
|
|
if displayName == "" {
|
|
displayName = fmt.Sprintf("%s-%d", sanitizeAlertKey(backup.Node), backup.VMID)
|
|
}
|
|
|
|
updateRecord(key, backupRecord{
|
|
key: key,
|
|
vmid: vmid,
|
|
lookup: info,
|
|
fallbackName: displayName,
|
|
instance: backup.Instance,
|
|
node: backup.Node,
|
|
source: "PVE storage",
|
|
storage: backup.Storage,
|
|
backupType: backup.Type,
|
|
lastTime: backup.Time,
|
|
})
|
|
}
|
|
|
|
for _, backup := range pbsBackups {
|
|
if backup.BackupTime.IsZero() {
|
|
continue
|
|
}
|
|
if backup.VMID == "0" {
|
|
// Host configuration backups - skip from age alerts
|
|
continue
|
|
}
|
|
|
|
vmid := backup.VMID
|
|
guests, exists := guestsByVMID[backup.VMID]
|
|
var info GuestLookup
|
|
var key string
|
|
var displayName string
|
|
var instance string
|
|
var node string
|
|
|
|
if exists && len(guests) > 0 {
|
|
// If we have exactly one match, use it directly
|
|
// If we have multiple matches, try to disambiguate using the PBS namespace
|
|
if len(guests) == 1 {
|
|
info = guests[0]
|
|
} else if backup.Namespace != "" {
|
|
// Try to match namespace to instance name
|
|
for _, g := range guests {
|
|
if namespaceMatchesInstance(backup.Namespace, g.Instance) {
|
|
info = g
|
|
break
|
|
}
|
|
}
|
|
// If no namespace match found, info stays zero-value.
|
|
// The VMID is ambiguous across instances so we must not guess.
|
|
}
|
|
// else: multiple guests, no namespace — info stays zero-value (ambiguous)
|
|
if info.Instance != "" && info.Node != "" {
|
|
key = BuildGuestKey(info.Instance, info.Node, info.VMID)
|
|
displayName = info.Name
|
|
instance = info.Instance
|
|
node = info.Node
|
|
} else {
|
|
key = fmt.Sprintf("pbs:%s:%s:%s", backup.Instance, backup.BackupType, backup.VMID)
|
|
displayName = fmt.Sprintf("VMID %s", backup.VMID)
|
|
instance = fmt.Sprintf("PBS:%s", backup.Instance)
|
|
node = "Unknown"
|
|
}
|
|
} else {
|
|
key = fmt.Sprintf("pbs:%s:%s:%s", backup.Instance, backup.BackupType, backup.VMID)
|
|
displayName = fmt.Sprintf("VMID %s", backup.VMID)
|
|
instance = fmt.Sprintf("PBS:%s", backup.Instance)
|
|
node = "Unknown"
|
|
}
|
|
|
|
updateRecord(key, backupRecord{
|
|
key: key,
|
|
vmid: vmid,
|
|
lookup: info,
|
|
fallbackName: displayName,
|
|
instance: instance,
|
|
node: node,
|
|
source: "PBS",
|
|
datastore: backup.Datastore,
|
|
backupType: backup.BackupType,
|
|
lastTime: backup.BackupTime,
|
|
})
|
|
}
|
|
|
|
for _, backup := range pmgBackups {
|
|
if backup.BackupTime.IsZero() {
|
|
continue
|
|
}
|
|
|
|
instanceLabel := strings.TrimSpace(backup.Instance)
|
|
if instanceLabel == "" {
|
|
instanceLabel = "PMG"
|
|
}
|
|
|
|
nodeName := strings.TrimSpace(backup.Node)
|
|
keyComponent := nodeName
|
|
if keyComponent == "" {
|
|
keyComponent = strings.TrimSpace(backup.Filename)
|
|
}
|
|
if keyComponent == "" {
|
|
keyComponent = "unknown"
|
|
}
|
|
|
|
displayName := nodeName
|
|
if displayName == "" {
|
|
displayName = instanceLabel
|
|
}
|
|
if displayName == "" {
|
|
displayName = "PMG gateway"
|
|
} else {
|
|
displayName = fmt.Sprintf("PMG %s", displayName)
|
|
}
|
|
|
|
instanceField := fmt.Sprintf("PMG:%s", instanceLabel)
|
|
key := fmt.Sprintf("pmg:%s:%s", instanceLabel, keyComponent)
|
|
|
|
updateRecord(key, backupRecord{
|
|
key: key,
|
|
fallbackName: displayName,
|
|
instance: instanceField,
|
|
node: nodeName,
|
|
source: "PMG",
|
|
backupType: "pmg",
|
|
filename: backup.Filename,
|
|
lastTime: backup.BackupTime,
|
|
})
|
|
}
|
|
|
|
if len(records) == 0 {
|
|
m.clearBackupAlerts()
|
|
return
|
|
}
|
|
|
|
// Build a set of instances whose inventory is safe to use for orphan detection.
|
|
// When the monitor provides a template-inventory readiness map, prefer that
|
|
// signal because backup polling can race ahead of template discovery even when
|
|
// live guests already exist on the instance. Fall back to the legacy "has at
|
|
// least one live guest" heuristic for direct callers/tests that do not pass it.
|
|
instancesReadyForOrphanDetection := make(map[string]bool)
|
|
if templateInventoryReady != nil {
|
|
for instance, ready := range templateInventoryReady {
|
|
if ready {
|
|
instancesReadyForOrphanDetection[instance] = true
|
|
}
|
|
}
|
|
} else {
|
|
for _, guests := range guestsByVMID {
|
|
for _, g := range guests {
|
|
if g.ResourceID != "" && g.Instance != "" {
|
|
instancesReadyForOrphanDetection[g.Instance] = true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
validAlerts := make(map[string]struct{})
|
|
|
|
for key, record := range records {
|
|
age := now.Sub(record.lastTime)
|
|
if age < 0 {
|
|
continue
|
|
}
|
|
|
|
ageDays := age.Hours() / 24
|
|
if ageDays < 0 {
|
|
continue
|
|
}
|
|
ageDaysRounded := math.Round(ageDays*10) / 10
|
|
|
|
// Determine thresholds for this backup
|
|
currentBackupCfg := backupCfg
|
|
if record.lookup.ResourceID != "" {
|
|
m.mu.RLock()
|
|
gh := m.getGuestThresholds(nil, record.lookup.ResourceID)
|
|
m.mu.RUnlock()
|
|
if gh.Disabled {
|
|
continue
|
|
}
|
|
if gh.Backup != nil {
|
|
currentBackupCfg = *gh.Backup
|
|
}
|
|
}
|
|
|
|
currentBackupCfg.AlertOrphaned = backupCfg.AlertOrphaned
|
|
currentBackupCfg.IgnoreVMIDs = backupCfg.IgnoreVMIDs
|
|
|
|
if backupIgnoreVMID(record.vmid, currentBackupCfg.IgnoreVMIDs) {
|
|
continue
|
|
}
|
|
// Determine whether we have enough inventory to safely run orphan
|
|
// detection for this backup. For PVE storage backups the instance
|
|
// guard is strict: only check when that specific PVE instance has
|
|
// completed a template-aware inventory poll. For PBS/PMG backups
|
|
// (which span instances) it's enough that any instance is ready.
|
|
inventoryReady := false
|
|
if record.source == "PVE storage" {
|
|
inventoryReady = instancesReadyForOrphanDetection[record.instance]
|
|
} else {
|
|
inventoryReady = len(instancesReadyForOrphanDetection) > 0
|
|
}
|
|
if record.vmid != "" && record.lookup.ResourceID == "" && inventoryReady {
|
|
// Backup has a VMID but no matching live guest in its lookup.
|
|
//
|
|
// Check whether the VMID exists anywhere in live inventory.
|
|
// If it does, the backup is ambiguous (VMID collision) but not orphaned.
|
|
// Entries with empty ResourceID are persisted metadata for deleted guests
|
|
// and do not count as live inventory.
|
|
// For PVE storage backups, only match guests from the same instance —
|
|
// a live VMID on instance B does not mean instance A's backup isn't orphaned.
|
|
existsInInventory := false
|
|
if guests, ok := guestsByVMID[record.vmid]; ok {
|
|
for _, g := range guests {
|
|
if g.ResourceID == "" {
|
|
continue
|
|
}
|
|
if record.source == "PVE storage" && g.Instance != record.instance {
|
|
continue
|
|
}
|
|
existsInInventory = true
|
|
break
|
|
}
|
|
}
|
|
if !existsInInventory {
|
|
if g, ok := guestsByKey[record.key]; ok && g.ResourceID != "" {
|
|
existsInInventory = true
|
|
}
|
|
}
|
|
|
|
if !existsInInventory {
|
|
if currentBackupCfg.AlertOrphaned != nil && !*currentBackupCfg.AlertOrphaned {
|
|
continue
|
|
}
|
|
|
|
// Create a backup-orphaned alert immediately — no age threshold required.
|
|
alertKey := sanitizeAlertKey(key)
|
|
alertID := fmt.Sprintf("backup-orphaned-%s", alertKey)
|
|
validAlerts[alertID] = struct{}{}
|
|
|
|
displayName := record.fallbackName
|
|
if displayName == "" {
|
|
displayName = "Unknown guest"
|
|
}
|
|
|
|
node := record.node
|
|
if node == "" {
|
|
node = record.lookup.Node
|
|
}
|
|
instance := record.instance
|
|
if instance == "" {
|
|
instance = record.lookup.Instance
|
|
}
|
|
|
|
var sourceLabel string
|
|
switch record.source {
|
|
case "PBS":
|
|
sourceLabel = fmt.Sprintf("PBS datastore %s on %s", record.datastore, strings.TrimPrefix(instance, "PBS:"))
|
|
case "PMG":
|
|
if node != "" {
|
|
sourceLabel = fmt.Sprintf("PMG node %s", node)
|
|
} else {
|
|
sourceLabel = "PMG"
|
|
}
|
|
default:
|
|
sourceLabel = fmt.Sprintf("storage %s on %s", record.storage, node)
|
|
}
|
|
|
|
message := fmt.Sprintf(
|
|
"Orphaned backup: %s (VMID %s) via %s — guest no longer exists in inventory",
|
|
displayName,
|
|
record.vmid,
|
|
sourceLabel,
|
|
)
|
|
|
|
metadata := map[string]interface{}{
|
|
"source": record.source,
|
|
"lastBackupTime": record.lastTime,
|
|
"ageDays": ageDays,
|
|
"orphaned": true,
|
|
"vmid": record.vmid,
|
|
}
|
|
if record.storage != "" {
|
|
metadata["storage"] = record.storage
|
|
}
|
|
if record.datastore != "" {
|
|
metadata["datastore"] = record.datastore
|
|
}
|
|
if record.backupType != "" {
|
|
metadata["backupType"] = record.backupType
|
|
}
|
|
if record.filename != "" {
|
|
metadata["filename"] = record.filename
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if existing, exists := m.activeAlerts[alertID]; exists && existing != nil {
|
|
existing.LastSeen = now
|
|
existing.Level = AlertLevelWarning
|
|
existing.Value = ageDays
|
|
existing.Threshold = 0
|
|
existing.Message = message
|
|
if existing.Metadata == nil {
|
|
existing.Metadata = make(map[string]interface{})
|
|
}
|
|
for k, v := range metadata {
|
|
existing.Metadata[k] = v
|
|
}
|
|
m.mu.Unlock()
|
|
continue
|
|
}
|
|
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "backup-orphaned",
|
|
Level: AlertLevelWarning,
|
|
ResourceID: alertKey,
|
|
ResourceName: fmt.Sprintf("%s backup", displayName),
|
|
Node: node,
|
|
Instance: instance,
|
|
Message: message,
|
|
Value: ageDays,
|
|
Threshold: 0,
|
|
StartTime: now,
|
|
LastSeen: now,
|
|
Metadata: metadata,
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
|
|
go func() {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (backup-orphaned)")
|
|
}
|
|
}()
|
|
if err := m.SaveActiveAlerts(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to save active alerts after backup-orphaned alert creation")
|
|
}
|
|
}()
|
|
|
|
if !m.checkRateLimit(alertID) {
|
|
m.mu.Unlock()
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Str("resource", displayName).
|
|
Msg("Backup orphaned alert suppressed due to rate limit")
|
|
continue
|
|
}
|
|
|
|
if m.onAlert != nil {
|
|
notified := now
|
|
alert.LastNotified = ¬ified
|
|
if m.dispatchAlert(alert, true) {
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Str("resource", displayName).
|
|
Msg("Backup orphaned alert dispatched")
|
|
} else {
|
|
alert.LastNotified = nil
|
|
}
|
|
}
|
|
m.mu.Unlock()
|
|
continue
|
|
}
|
|
}
|
|
|
|
if !currentBackupCfg.Enabled {
|
|
continue
|
|
}
|
|
|
|
var level AlertLevel
|
|
var threshold int
|
|
switch {
|
|
case currentBackupCfg.CriticalDays > 0 && ageDays >= float64(currentBackupCfg.CriticalDays):
|
|
level = AlertLevelCritical
|
|
threshold = currentBackupCfg.CriticalDays
|
|
case currentBackupCfg.WarningDays > 0 && ageDays >= float64(currentBackupCfg.WarningDays):
|
|
level = AlertLevelWarning
|
|
threshold = currentBackupCfg.WarningDays
|
|
default:
|
|
continue
|
|
}
|
|
|
|
alertKey := sanitizeAlertKey(key)
|
|
alertID := fmt.Sprintf("backup-age-%s", alertKey)
|
|
validAlerts[alertID] = struct{}{}
|
|
|
|
displayName := record.lookup.Name
|
|
if displayName == "" {
|
|
displayName = record.fallbackName
|
|
}
|
|
if displayName == "" {
|
|
displayName = "Unknown guest"
|
|
}
|
|
|
|
node := record.node
|
|
if node == "" {
|
|
node = record.lookup.Node
|
|
}
|
|
instance := record.instance
|
|
if instance == "" {
|
|
instance = record.lookup.Instance
|
|
}
|
|
|
|
thresholdTime := record.lastTime.Add(time.Duration(threshold) * 24 * time.Hour)
|
|
if thresholdTime.After(now) {
|
|
thresholdTime = now
|
|
}
|
|
|
|
var sourceLabel string
|
|
switch record.source {
|
|
case "PBS":
|
|
sourceLabel = fmt.Sprintf("PBS datastore %s on %s", record.datastore, strings.TrimPrefix(instance, "PBS:"))
|
|
case "PMG":
|
|
if node != "" {
|
|
sourceLabel = fmt.Sprintf("PMG node %s", node)
|
|
} else {
|
|
sourceLabel = "PMG"
|
|
}
|
|
default:
|
|
sourceLabel = fmt.Sprintf("storage %s on %s", record.storage, node)
|
|
}
|
|
|
|
message := fmt.Sprintf(
|
|
"%s backup via %s is %.1f days old (threshold: %d days)",
|
|
displayName,
|
|
sourceLabel,
|
|
ageDaysRounded,
|
|
threshold,
|
|
)
|
|
|
|
metadata := map[string]interface{}{
|
|
"source": record.source,
|
|
"lastBackupTime": record.lastTime,
|
|
"ageDays": ageDays,
|
|
"thresholdDays": threshold,
|
|
}
|
|
if record.storage != "" {
|
|
metadata["storage"] = record.storage
|
|
}
|
|
if record.datastore != "" {
|
|
metadata["datastore"] = record.datastore
|
|
}
|
|
if record.backupType != "" {
|
|
metadata["backupType"] = record.backupType
|
|
}
|
|
if record.filename != "" {
|
|
metadata["filename"] = record.filename
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if existing, exists := m.activeAlerts[alertID]; exists {
|
|
existing.LastSeen = now
|
|
existing.Level = level
|
|
existing.Value = ageDays
|
|
existing.Threshold = float64(threshold)
|
|
existing.Message = message
|
|
if existing.Metadata == nil {
|
|
existing.Metadata = make(map[string]interface{})
|
|
}
|
|
for k, v := range metadata {
|
|
existing.Metadata[k] = v
|
|
}
|
|
m.mu.Unlock()
|
|
continue
|
|
}
|
|
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "backup-age",
|
|
Level: level,
|
|
ResourceID: alertKey,
|
|
ResourceName: fmt.Sprintf("%s backup", displayName),
|
|
Node: node,
|
|
Instance: instance,
|
|
Message: message,
|
|
Value: ageDays,
|
|
Threshold: float64(threshold),
|
|
StartTime: thresholdTime,
|
|
LastSeen: now,
|
|
Metadata: metadata,
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
|
|
go func() {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (backup)")
|
|
}
|
|
}()
|
|
if err := m.SaveActiveAlerts(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to save active alerts after backup alert creation")
|
|
}
|
|
}()
|
|
|
|
if !m.checkRateLimit(alertID) {
|
|
m.mu.Unlock()
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Str("resource", displayName).
|
|
Msg("Backup alert suppressed due to rate limit")
|
|
continue
|
|
}
|
|
|
|
if m.onAlert != nil {
|
|
notified := now
|
|
alert.LastNotified = ¬ified
|
|
if m.dispatchAlert(alert, true) {
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Str("resource", displayName).
|
|
Msg("Backup age alert dispatched")
|
|
} else {
|
|
alert.LastNotified = nil
|
|
}
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
m.mu.Lock()
|
|
for alertID, alert := range m.activeAlerts {
|
|
if alert == nil || (alert.Type != "backup-age" && alert.Type != "backup-orphaned") {
|
|
continue
|
|
}
|
|
if _, ok := validAlerts[alertID]; ok {
|
|
continue
|
|
}
|
|
// When no instances have inventory ready for orphan detection, preserve existing orphan
|
|
// alerts rather than clearing them — we can't confirm they're resolved.
|
|
if len(instancesReadyForOrphanDetection) == 0 && alert.Type == "backup-orphaned" {
|
|
continue
|
|
}
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
// checkZFSPoolHealth checks ZFS pool for errors and degraded state
|
|
func (m *Manager) checkZFSPoolHealth(storage models.Storage) {
|
|
pool := storage.ZFSPool
|
|
if pool == nil {
|
|
return
|
|
}
|
|
|
|
// Check pool state (DEGRADED, FAULTED, etc.)
|
|
stateAlertID := fmt.Sprintf("zfs-pool-state-%s", storage.ID)
|
|
if pool.State != "ONLINE" {
|
|
level := AlertLevelWarning
|
|
if pool.State == "FAULTED" || pool.State == "UNAVAIL" {
|
|
level = AlertLevelCritical
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if _, exists := m.activeAlerts[stateAlertID]; !exists {
|
|
alert := &Alert{
|
|
ID: stateAlertID,
|
|
Type: "zfs-pool-state",
|
|
Level: level,
|
|
ResourceID: storage.ID,
|
|
ResourceName: fmt.Sprintf("%s (%s)", storage.Name, pool.Name),
|
|
Node: storage.Node,
|
|
Instance: storage.Instance,
|
|
Message: fmt.Sprintf("ZFS pool '%s' is %s", pool.Name, pool.State),
|
|
Value: 0,
|
|
Threshold: 0,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: map[string]interface{}{
|
|
"pool_name": pool.Name,
|
|
"pool_state": pool.State,
|
|
},
|
|
}
|
|
|
|
m.preserveAlertState(stateAlertID, alert)
|
|
|
|
m.activeAlerts[stateAlertID] = alert
|
|
m.recentAlerts[stateAlertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
|
|
m.dispatchAlert(alert, false)
|
|
|
|
log.Warn().
|
|
Str("pool", pool.Name).
|
|
Str("state", pool.State).
|
|
Str("node", storage.Node).
|
|
Msg("ZFS pool is not healthy")
|
|
}
|
|
m.mu.Unlock()
|
|
} else {
|
|
// Clear state alert if pool is back online
|
|
m.clearAlert(stateAlertID)
|
|
}
|
|
|
|
// Check for read/write/checksum errors
|
|
totalErrors := pool.ReadErrors + pool.WriteErrors + pool.ChecksumErrors
|
|
errorsAlertID := fmt.Sprintf("zfs-pool-errors-%s", storage.ID)
|
|
if totalErrors > 0 {
|
|
m.mu.Lock()
|
|
existingAlert, exists := m.activeAlerts[errorsAlertID]
|
|
|
|
// Only create new alert or update if error count increased
|
|
if !exists || float64(totalErrors) > existingAlert.Value {
|
|
alert := &Alert{
|
|
ID: errorsAlertID,
|
|
Type: "zfs-pool-errors",
|
|
Level: AlertLevelWarning,
|
|
ResourceID: storage.ID,
|
|
ResourceName: fmt.Sprintf("%s (%s)", storage.Name, pool.Name),
|
|
Node: storage.Node,
|
|
Instance: storage.Instance,
|
|
Message: fmt.Sprintf("ZFS pool '%s' has errors: %d read, %d write, %d checksum",
|
|
pool.Name, pool.ReadErrors, pool.WriteErrors, pool.ChecksumErrors),
|
|
Value: float64(totalErrors),
|
|
Threshold: 0,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: map[string]interface{}{
|
|
"pool_name": pool.Name,
|
|
"read_errors": pool.ReadErrors,
|
|
"write_errors": pool.WriteErrors,
|
|
"checksum_errors": pool.ChecksumErrors,
|
|
},
|
|
}
|
|
|
|
if exists {
|
|
// Preserve original start time when updating
|
|
alert.StartTime = existingAlert.StartTime
|
|
}
|
|
|
|
m.preserveAlertState(errorsAlertID, alert)
|
|
|
|
m.activeAlerts[errorsAlertID] = alert
|
|
m.recentAlerts[errorsAlertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
|
|
m.dispatchAlert(alert, false)
|
|
|
|
log.Error().
|
|
Str("pool", pool.Name).
|
|
Int64("read_errors", pool.ReadErrors).
|
|
Int64("write_errors", pool.WriteErrors).
|
|
Int64("checksum_errors", pool.ChecksumErrors).
|
|
Str("node", storage.Node).
|
|
Msg("ZFS pool has I/O errors")
|
|
}
|
|
m.mu.Unlock()
|
|
} else {
|
|
m.clearAlert(errorsAlertID)
|
|
}
|
|
|
|
// Check individual devices for errors
|
|
m.mu.Lock()
|
|
for _, device := range pool.Devices {
|
|
alertID := fmt.Sprintf("zfs-device-%s-%s", storage.ID, device.Name)
|
|
|
|
// Skip SPARE devices unless they have actual errors
|
|
if (device.State != "ONLINE" && device.State != "SPARE") || device.ReadErrors > 0 || device.WriteErrors > 0 || device.ChecksumErrors > 0 {
|
|
if _, exists := m.activeAlerts[alertID]; !exists {
|
|
level := AlertLevelWarning
|
|
if device.State == "FAULTED" || device.State == "UNAVAIL" {
|
|
level = AlertLevelCritical
|
|
}
|
|
|
|
message := fmt.Sprintf("ZFS device '%s' in pool '%s'", device.Name, pool.Name)
|
|
if device.State != "ONLINE" {
|
|
message += fmt.Sprintf(" is %s", device.State)
|
|
}
|
|
if device.ReadErrors > 0 || device.WriteErrors > 0 || device.ChecksumErrors > 0 {
|
|
message += fmt.Sprintf(" has errors: %d read, %d write, %d checksum",
|
|
device.ReadErrors, device.WriteErrors, device.ChecksumErrors)
|
|
}
|
|
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "zfs-device",
|
|
Level: level,
|
|
ResourceID: storage.ID,
|
|
ResourceName: fmt.Sprintf("%s (%s/%s)", storage.Name, pool.Name, device.Name),
|
|
Node: storage.Node,
|
|
Instance: storage.Instance,
|
|
Message: message,
|
|
Value: float64(device.ReadErrors + device.WriteErrors + device.ChecksumErrors),
|
|
Threshold: 0,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: map[string]interface{}{
|
|
"pool_name": pool.Name,
|
|
"device_name": device.Name,
|
|
"device_state": device.State,
|
|
"read_errors": device.ReadErrors,
|
|
"write_errors": device.WriteErrors,
|
|
"checksum_errors": device.ChecksumErrors,
|
|
},
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
|
|
m.dispatchAlert(alert, false)
|
|
|
|
log.Warn().
|
|
Str("pool", pool.Name).
|
|
Str("device", device.Name).
|
|
Str("state", device.State).
|
|
Int64("errors", device.ReadErrors+device.WriteErrors+device.ChecksumErrors).
|
|
Str("node", storage.Node).
|
|
Msg("ZFS device has issues")
|
|
}
|
|
} else {
|
|
// Clear device alert if it's back to normal
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
// clearAlert removes an alert if it exists
|
|
func (m *Manager) clearAlert(alertID string) {
|
|
m.mu.Lock()
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if exists {
|
|
m.removeActiveAlertNoLock(alertID)
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
resolvedAlert := &ResolvedAlert{
|
|
Alert: alert,
|
|
ResolvedTime: time.Now(),
|
|
}
|
|
|
|
m.addRecentlyResolvedUnlocked(alertID, resolvedAlert)
|
|
|
|
m.safeCallResolvedCallback(alertID, false)
|
|
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Msg("Alert cleared")
|
|
}
|
|
|
|
// getTimeThreshold determines the delay to apply for a metric/resource combination.
|
|
func (m *Manager) getTimeThreshold(_ string, resourceType, metricType string) int {
|
|
if delay, ok := m.getMetricTimeThreshold(resourceType, metricType); ok {
|
|
return delay
|
|
}
|
|
|
|
base, hasTypeSpecific := m.getBaseTimeThreshold(resourceType)
|
|
|
|
if !hasTypeSpecific {
|
|
if delay, ok := m.getGlobalMetricTimeThreshold(metricType); ok {
|
|
return delay
|
|
}
|
|
}
|
|
|
|
return base
|
|
}
|
|
|
|
// getMetricTimeThreshold returns a metric-specific delay if configured at the resource-type level.
|
|
func (m *Manager) getMetricTimeThreshold(resourceType, metricType string) (int, bool) {
|
|
if len(m.config.MetricTimeThresholds) == 0 {
|
|
return 0, false
|
|
}
|
|
|
|
metricKey := strings.ToLower(strings.TrimSpace(metricType))
|
|
if metricKey == "" {
|
|
return 0, false
|
|
}
|
|
|
|
for _, typeKey := range canonicalResourceTypeKeys(resourceType) {
|
|
perType, ok := m.config.MetricTimeThresholds[typeKey]
|
|
if !ok || len(perType) == 0 {
|
|
continue
|
|
}
|
|
|
|
if delay, ok := perType[metricKey]; ok {
|
|
return delay, true
|
|
}
|
|
if delay, ok := perType["default"]; ok {
|
|
return delay, true
|
|
}
|
|
if delay, ok := perType["_default"]; ok {
|
|
return delay, true
|
|
}
|
|
if delay, ok := perType["*"]; ok {
|
|
return delay, true
|
|
}
|
|
}
|
|
|
|
return 0, false
|
|
}
|
|
|
|
// getBaseTimeThreshold returns the resource-type level delay.
|
|
func (m *Manager) getBaseTimeThreshold(resourceType string) (int, bool) {
|
|
if m.config.TimeThresholds != nil {
|
|
for _, key := range canonicalResourceTypeKeys(resourceType) {
|
|
if delay, ok := m.config.TimeThresholds[key]; ok {
|
|
return delay, true
|
|
}
|
|
}
|
|
if delay, ok := m.config.TimeThresholds["all"]; ok {
|
|
return delay, false
|
|
}
|
|
}
|
|
|
|
return m.config.TimeThreshold, false
|
|
}
|
|
|
|
func (m *Manager) getGlobalMetricTimeThreshold(metricType string) (int, bool) {
|
|
if len(m.config.MetricTimeThresholds) == 0 {
|
|
return 0, false
|
|
}
|
|
|
|
perType, ok := m.config.MetricTimeThresholds["all"]
|
|
if !ok || len(perType) == 0 {
|
|
return 0, false
|
|
}
|
|
|
|
metricKey := strings.ToLower(strings.TrimSpace(metricType))
|
|
if metricKey == "" {
|
|
return 0, false
|
|
}
|
|
|
|
if delay, ok := perType[metricKey]; ok {
|
|
return delay, true
|
|
}
|
|
if delay, ok := perType["default"]; ok {
|
|
return delay, true
|
|
}
|
|
if delay, ok := perType["_default"]; ok {
|
|
return delay, true
|
|
}
|
|
if delay, ok := perType["*"]; ok {
|
|
return delay, true
|
|
}
|
|
|
|
return 0, false
|
|
}
|
|
|
|
// canonicalResourceTypeKeys maps a resource type name (case-insensitive,
// surrounding whitespace ignored) to the ordered list of configuration keys
// to consult for it, most specific first. Unknown types map to their own
// normalized name; an empty name yields no keys.
func canonicalResourceTypeKeys(resourceType string) []string {
	switch normalized := strings.ToLower(strings.TrimSpace(resourceType)); normalized {
	case "guest", "qemu", "vm", "ct", "container", "lxc":
		return []string{"guest"}
	case "docker", "docker container", "dockercontainer":
		return []string{"docker", "guest"}
	case "docker host", "dockerhost":
		return []string{"dockerhost", "docker", "node"}
	case "node":
		return []string{"node"}
	case "pbs", "pbs server", "pbsserver":
		return []string{"pbs", "node"}
	case "storage":
		return []string{"storage"}
	case "":
		return nil
	default:
		return []string{normalized}
	}
}
|
|
|
|
// metricOptions carries the optional, per-call settings accepted by
// checkMetric. A nil *metricOptions is valid and means "no overrides".
type metricOptions struct {
	// Metadata holds extra key/value pairs that are merged into the alert's
	// metadata, overriding generated entries with the same key.
	Metadata map[string]interface{}
	// Message, when non-empty, replaces the generated alert message.
	Message string
	// MonitorOnly suppresses external notifications while still tracking the alert.
	MonitorOnly bool
}
|
|
|
|
func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resourceType, metricType string, value float64, threshold *HysteresisThreshold, opts *metricOptions) {
|
|
if threshold == nil || threshold.Trigger <= 0 {
|
|
alertID := fmt.Sprintf("%s-%s", resourceID, metricType)
|
|
m.clearAlert(alertID)
|
|
return
|
|
}
|
|
|
|
log.Debug().
|
|
Str("resource", resourceName).
|
|
Str("metric", metricType).
|
|
Float64("value", value).
|
|
Float64("trigger", threshold.Trigger).
|
|
Float64("clear", threshold.Clear).
|
|
Bool("exceeds", value >= threshold.Trigger).
|
|
Msg("Checking metric threshold")
|
|
|
|
alertID := fmt.Sprintf("%s-%s", resourceID, metricType)
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
existingAlert, exists := m.activeAlerts[alertID]
|
|
monitorOnly := opts != nil && opts.MonitorOnly
|
|
|
|
// Check for suppression
|
|
if suppressUntil, suppressed := m.suppressedUntil[alertID]; suppressed && time.Now().Before(suppressUntil) {
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Time("suppressedUntil", suppressUntil).
|
|
Msg("Alert suppressed")
|
|
return
|
|
}
|
|
|
|
if value >= threshold.Trigger {
|
|
// Threshold exceeded
|
|
if !exists {
|
|
alertStartTime := time.Now()
|
|
|
|
// Determine the appropriate time threshold based on resource/metric type
|
|
timeThreshold := m.getTimeThreshold(resourceID, resourceType, metricType)
|
|
|
|
// Check if we have a time threshold configured
|
|
if timeThreshold > 0 {
|
|
// Check if this threshold was already pending
|
|
if pendingTime, isPending := m.pendingAlerts[alertID]; isPending {
|
|
// Check if enough time has passed
|
|
if time.Since(pendingTime) >= time.Duration(timeThreshold)*time.Second {
|
|
// Time threshold met, proceed with alert
|
|
delete(m.pendingAlerts, alertID)
|
|
if !pendingTime.IsZero() {
|
|
alertStartTime = pendingTime
|
|
}
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Int("timeThreshold", timeThreshold).
|
|
Dur("elapsed", time.Since(pendingTime)).
|
|
Msg("Time threshold met, triggering alert")
|
|
} else {
|
|
// Still waiting for time threshold
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Int("timeThreshold", timeThreshold).
|
|
Dur("elapsed", time.Since(pendingTime)).
|
|
Msg("Threshold exceeded but waiting for time threshold")
|
|
return
|
|
}
|
|
} else {
|
|
// First time exceeding threshold, start tracking
|
|
m.pendingAlerts[alertID] = alertStartTime
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Int("timeThreshold", timeThreshold).
|
|
Msg("Threshold exceeded, starting time threshold tracking")
|
|
return
|
|
}
|
|
}
|
|
|
|
// Check for recent similar alert to prevent spam
|
|
if recent, hasRecent := m.recentAlerts[alertID]; hasRecent {
|
|
// Check minimum delta
|
|
if m.config.MinimumDelta > 0 &&
|
|
time.Since(recent.StartTime) < time.Duration(m.config.SuppressionWindow)*time.Minute &&
|
|
abs(recent.Value-value) < m.config.MinimumDelta {
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Float64("recentValue", recent.Value).
|
|
Float64("currentValue", value).
|
|
Float64("delta", abs(recent.Value-value)).
|
|
Float64("minimumDelta", m.config.MinimumDelta).
|
|
Msg("Alert suppressed due to minimum delta")
|
|
|
|
// Set suppression window
|
|
m.suppressedUntil[alertID] = time.Now().Add(time.Duration(m.config.SuppressionWindow) * time.Minute)
|
|
return
|
|
}
|
|
}
|
|
|
|
// New alert
|
|
message := ""
|
|
var unit string
|
|
if opts != nil && opts.Message != "" {
|
|
message = opts.Message
|
|
} else {
|
|
switch metricType {
|
|
case "usage":
|
|
message = fmt.Sprintf("%s at %.1f%%", resourceType, value)
|
|
case "diskRead", "diskWrite", "networkIn", "networkOut":
|
|
message = fmt.Sprintf("%s %s at %.1f MB/s", resourceType, metricType, value)
|
|
unit = "MB/s"
|
|
case "temperature", "disk_temperature", "diskTemperature":
|
|
message = fmt.Sprintf("%s %s at %.1f°C", resourceType, metricType, value)
|
|
unit = "°C"
|
|
default:
|
|
message = fmt.Sprintf("%s %s at %.1f%%", resourceType, metricType, value)
|
|
}
|
|
}
|
|
|
|
alertMetadata := map[string]interface{}{
|
|
"resourceType": resourceType,
|
|
"clearThreshold": threshold.Clear,
|
|
}
|
|
if unit != "" {
|
|
alertMetadata["unit"] = unit
|
|
}
|
|
if opts != nil && opts.Metadata != nil {
|
|
for k, v := range opts.Metadata {
|
|
alertMetadata[k] = v
|
|
}
|
|
}
|
|
alertMetadata["monitorOnly"] = monitorOnly
|
|
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: metricType,
|
|
Level: AlertLevelWarning,
|
|
ResourceID: resourceID,
|
|
ResourceName: resourceName,
|
|
Node: node,
|
|
NodeDisplayName: m.resolveNodeDisplayName(instance, node),
|
|
Instance: instance,
|
|
Message: message,
|
|
Value: value,
|
|
Threshold: threshold.Trigger,
|
|
StartTime: alertStartTime,
|
|
LastSeen: time.Now(),
|
|
Metadata: alertMetadata,
|
|
}
|
|
|
|
// Set level based on how much over threshold
|
|
if value >= threshold.Trigger+10 {
|
|
alert.Level = AlertLevelCritical
|
|
}
|
|
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Time("alertStartTime", alertStartTime).
|
|
Time("now", time.Now()).
|
|
Dur("initialDuration", time.Since(alertStartTime)).
|
|
Msg("Creating new alert with start time")
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
|
|
// Save active alerts after adding new one
|
|
go func() {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine")
|
|
}
|
|
}()
|
|
if err := m.SaveActiveAlerts(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to save active alerts after creation")
|
|
}
|
|
}()
|
|
|
|
log.Warn().
|
|
Str("alertID", alertID).
|
|
Str("resource", resourceName).
|
|
Str("metric", metricType).
|
|
Float64("value", value).
|
|
Float64("trigger", threshold.Trigger).
|
|
Float64("clear", threshold.Clear).
|
|
Int("activeAlerts", len(m.activeAlerts)).
|
|
Msg("Alert triggered")
|
|
|
|
// Trigger AI analysis callback unconditionally (bypasses notification suppression)
|
|
if m.onAlertForAI != nil {
|
|
alertCopy := alert.Clone()
|
|
go func(a *Alert) {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().Interface("panic", r).Str("alertID", a.ID).Msg("Panic in AI alert callback")
|
|
}
|
|
}()
|
|
m.onAlertForAI(a)
|
|
}(alertCopy)
|
|
}
|
|
|
|
// Check rate limit (but don't remove alert from tracking)
|
|
if !m.checkRateLimit(alertID) {
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
|
|
Msg("Alert notification suppressed due to rate limit")
|
|
// Don't delete the alert, just suppress notifications
|
|
return
|
|
}
|
|
|
|
// Notify callback (may be suppressed by quiet hours)
|
|
if m.onAlert != nil {
|
|
now := time.Now()
|
|
alert.LastNotified = &now
|
|
if m.dispatchAlert(alert, true) {
|
|
log.Info().Str("alertID", alertID).Msg("Calling onAlert callback")
|
|
} else {
|
|
alert.LastNotified = nil
|
|
}
|
|
} else {
|
|
log.Warn().Msg("No onAlert callback set!")
|
|
}
|
|
} else {
|
|
// Update existing alert
|
|
existingAlert.LastSeen = time.Now()
|
|
existingAlert.Value = value
|
|
// Keep display name current (handles upgrades and renames).
|
|
if dn := m.resolveNodeDisplayName(existingAlert.Instance, existingAlert.Node); dn != "" {
|
|
existingAlert.NodeDisplayName = dn
|
|
}
|
|
if existingAlert.Metadata == nil {
|
|
existingAlert.Metadata = map[string]interface{}{}
|
|
}
|
|
existingAlert.Metadata["resourceType"] = resourceType
|
|
existingAlert.Metadata["clearThreshold"] = threshold.Clear
|
|
existingAlert.Metadata["monitorOnly"] = monitorOnly
|
|
if opts != nil {
|
|
if opts.Message != "" {
|
|
existingAlert.Message = opts.Message
|
|
}
|
|
if opts.Metadata != nil {
|
|
for k, v := range opts.Metadata {
|
|
existingAlert.Metadata[k] = v
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update level if needed
|
|
oldLevel := existingAlert.Level
|
|
if value >= threshold.Trigger+10 {
|
|
existingAlert.Level = AlertLevelCritical
|
|
} else {
|
|
existingAlert.Level = AlertLevelWarning
|
|
}
|
|
|
|
// Check if we should re-notify based on cooldown period
|
|
// Never re-notify acknowledged alerts (user has already seen it)
|
|
shouldRenotify := false
|
|
if existingAlert.Acknowledged {
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Msg("Alert is acknowledged, skipping re-notification")
|
|
} else if m.shouldNotifyAfterCooldown(existingAlert) {
|
|
shouldRenotify = true
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Dur("cooldown", time.Duration(m.config.Schedule.Cooldown)*time.Minute).
|
|
Msg("Cooldown period has passed, will re-notify")
|
|
} else if oldLevel != existingAlert.Level && existingAlert.Level == AlertLevelCritical {
|
|
// Always re-notify if alert escalated to critical
|
|
shouldRenotify = true
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Msg("Alert escalated to critical, will re-notify despite cooldown")
|
|
}
|
|
|
|
// Send re-notification if appropriate (may be suppressed by quiet hours)
|
|
if shouldRenotify && m.onAlert != nil {
|
|
now := time.Now()
|
|
existingAlert.LastNotified = &now
|
|
// Dispatch asynchronously so callback I/O cannot block alert evaluation.
|
|
if m.dispatchAlert(existingAlert, true) {
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Str("level", string(existingAlert.Level)).
|
|
Msg("Re-notifying for existing alert")
|
|
} else {
|
|
existingAlert.LastNotified = nil
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// Value is below trigger threshold
|
|
// Clear any pending alert for this metric
|
|
if _, isPending := m.pendingAlerts[alertID]; isPending {
|
|
delete(m.pendingAlerts, alertID)
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Msg("Value dropped below threshold, clearing pending alert")
|
|
}
|
|
|
|
if exists {
|
|
// Use hysteresis for resolution - only resolve if below clear threshold
|
|
clearThreshold := threshold.Clear
|
|
if clearThreshold <= 0 {
|
|
clearThreshold = threshold.Trigger // Fallback to trigger if clear not set
|
|
}
|
|
|
|
if value <= clearThreshold {
|
|
// Threshold cleared with hysteresis - auto resolve
|
|
resolvedAlert := &ResolvedAlert{
|
|
Alert: existingAlert,
|
|
ResolvedTime: time.Now(),
|
|
}
|
|
|
|
// Remove from active alerts
|
|
m.removeActiveAlertNoLock(alertID)
|
|
|
|
// Save active alerts after resolution
|
|
go func() {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (resolution)")
|
|
}
|
|
}()
|
|
if err := m.SaveActiveAlerts(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to save active alerts after resolution")
|
|
}
|
|
}()
|
|
|
|
// Add to recently resolved while preventing lock-order inversions
|
|
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
|
|
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Msg("Added alert to recently resolved")
|
|
|
|
log.Info().
|
|
Str("resource", resourceName).
|
|
Str("metric", metricType).
|
|
Float64("value", value).
|
|
Float64("clearThreshold", clearThreshold).
|
|
Bool("wasAcknowledged", existingAlert.Acknowledged).
|
|
Msg("Alert resolved with hysteresis")
|
|
|
|
if m.onResolved != nil {
|
|
go m.onResolved(alertID)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// sanitizeAlertKey turns an arbitrary label (for example a mount point) into a
// lower-case key made only of [a-z0-9.] with single dashes between runs of
// other characters.
//
// Blank input yields "". A label that is only path separators yields "root".
// If nothing usable survives sanitising, "disk" is returned.
func sanitizeAlertKey(label string) string {
	key := strings.TrimSpace(label)
	switch key {
	case "":
		return ""
	case "/":
		return "root"
	}

	// Strip surrounding separators; a label that was nothing but separators
	// is treated as the root.
	if key = strings.Trim(key, "/\\ "); key == "" {
		key = "root"
	}
	key = strings.ToLower(key)

	var out strings.Builder
	out.Grow(len(key))
	lastWasDash := false
	for _, ch := range key {
		switch {
		case ch >= 'a' && ch <= 'z', ch >= '0' && ch <= '9', ch == '.':
			out.WriteRune(ch)
			lastWasDash = false
		case !lastWasDash:
			// Collapse each run of disallowed characters into one dash.
			out.WriteRune('-')
			lastWasDash = true
		}
	}

	if cleaned := strings.Trim(out.String(), "-."); cleaned != "" {
		return cleaned
	}
	return "disk"
}
|
|
|
|
// abs returns the absolute value of x.
//
// It delegates to math.Abs, so negative zero becomes positive zero and NaN is
// returned unchanged.
func abs(x float64) float64 {
	return math.Abs(x)
}
|
|
|
|
// namespaceMatchesInstance reports whether a PBS namespace plausibly refers to
// the given PVE instance. It is used to disambiguate backups when several PVE
// instances contain guests with the same VMID.
//
// Both names are lower-cased and stripped of everything except [a-z0-9]; they
// match when one normalised name is a suffix of the other (equality included).
// Examples: "pve1" matches "pve1", "nat" matches "pve-nat", "backups-pve"
// matches "pve"; "nat" does not match "nat-pve". Names that are empty after
// normalisation never match.
func namespaceMatchesInstance(namespace, instance string) bool {
	canon := func(s string) string {
		return strings.Map(func(r rune) rune {
			if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') {
				return r
			}
			return -1 // drop separators and any other character
		}, strings.ToLower(s))
	}

	ns, inst := canon(namespace), canon(instance)
	if ns == "" || inst == "" {
		return false
	}

	// Suffix matching is deliberately stricter than substring matching, and
	// HasSuffix also covers the exact-match case.
	return strings.HasSuffix(inst, ns) || strings.HasSuffix(ns, inst)
}
|
|
|
|
// AcknowledgeAlert acknowledges an alert
|
|
func (m *Manager) AcknowledgeAlert(alertID, user string) error {
|
|
m.mu.Lock()
|
|
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
m.mu.Unlock()
|
|
return fmt.Errorf("alert not found: %s", alertID)
|
|
}
|
|
|
|
alert.Acknowledged = true
|
|
now := time.Now()
|
|
alert.AckTime = &now
|
|
alert.AckUser = user
|
|
|
|
// Write the modified alert back to the map
|
|
m.activeAlerts[alertID] = alert
|
|
m.ackState[alertID] = ackRecord{
|
|
acknowledged: true,
|
|
user: user,
|
|
time: now,
|
|
}
|
|
|
|
alertCopy := alert.Clone()
|
|
m.mu.Unlock()
|
|
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Str("user", user).
|
|
Time("ackTime", now).
|
|
Msg("Alert acknowledgment recorded")
|
|
|
|
m.safeCallAcknowledgedCallback(alertCopy, user)
|
|
return nil
|
|
}
|
|
|
|
// UnacknowledgeAlert removes the acknowledged status from an alert
|
|
func (m *Manager) UnacknowledgeAlert(alertID string) error {
|
|
m.mu.Lock()
|
|
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
m.mu.Unlock()
|
|
return fmt.Errorf("alert not found: %s", alertID)
|
|
}
|
|
|
|
alert.Acknowledged = false
|
|
alert.AckTime = nil
|
|
alert.AckUser = ""
|
|
|
|
// Write the modified alert back to the map
|
|
m.activeAlerts[alertID] = alert
|
|
delete(m.ackState, alertID)
|
|
|
|
alertCopy := alert.Clone()
|
|
m.mu.Unlock()
|
|
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Msg("Alert unacknowledged")
|
|
|
|
m.safeCallUnacknowledgedCallback(alertCopy, "")
|
|
return nil
|
|
}
|
|
|
|
// preserveAlertState copies acknowledgement and escalation metadata from an existing alert
|
|
// into a freshly constructed alert before it replaces the existing entry in the map. This
|
|
// prevents UI state from regressing when alerts are rebuilt during polling.
|
|
func (m *Manager) preserveAlertState(alertID string, updated *Alert) {
|
|
if updated == nil {
|
|
return
|
|
}
|
|
|
|
// Auto-resolve node display name if not already set.
|
|
if updated.NodeDisplayName == "" && updated.Node != "" {
|
|
updated.NodeDisplayName = m.resolveNodeDisplayName(updated.Instance, updated.Node)
|
|
}
|
|
|
|
existing, exists := m.activeAlerts[alertID]
|
|
if exists && existing != nil {
|
|
// Preserve the original start time so duration calculations are correct
|
|
updated.StartTime = existing.StartTime
|
|
if existing.LastNotified != nil {
|
|
t := *existing.LastNotified
|
|
updated.LastNotified = &t
|
|
} else {
|
|
updated.LastNotified = nil
|
|
}
|
|
updated.Acknowledged = existing.Acknowledged
|
|
updated.AckUser = existing.AckUser
|
|
if existing.AckTime != nil {
|
|
t := *existing.AckTime
|
|
updated.AckTime = &t
|
|
} else {
|
|
updated.AckTime = nil
|
|
}
|
|
updated.LastEscalation = existing.LastEscalation
|
|
if len(existing.EscalationTimes) > 0 {
|
|
updated.EscalationTimes = append([]time.Time(nil), existing.EscalationTimes...)
|
|
} else {
|
|
updated.EscalationTimes = nil
|
|
}
|
|
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Time("originalStartTime", existing.StartTime).
|
|
Dur("currentDuration", time.Since(existing.StartTime)).
|
|
Msg("Preserving alert state including StartTime")
|
|
return
|
|
}
|
|
|
|
// Fall back to previously recorded acknowledgement state for this alert ID (e.g., flapping alerts)
|
|
if record, ok := m.ackState[alertID]; ok && record.acknowledged {
|
|
updated.Acknowledged = true
|
|
updated.AckUser = record.user
|
|
t := record.time
|
|
updated.AckTime = &t
|
|
}
|
|
}
|
|
|
|
func (m *Manager) removeActiveAlertNoLock(alertID string) {
|
|
// Before deleting, update the history entry with the alert's final LastSeen
|
|
// timestamp so the stored duration reflects how long the alert was actually active.
|
|
if alert, exists := m.activeAlerts[alertID]; exists && alert != nil {
|
|
m.historyManager.UpdateAlertLastSeen(alertID, alert.LastSeen)
|
|
}
|
|
delete(m.activeAlerts, alertID)
|
|
// NOTE: Don't delete ackState here - preserve it so if the same alert
|
|
// reappears (e.g., powered-off VM during backup), the acknowledgement
|
|
// is restored via preserveAlertState. ackState is cleaned up in Cleanup().
|
|
// Update inactiveAt so the cleanup TTL is measured from removal time, not ack time.
|
|
if record, exists := m.ackState[alertID]; exists {
|
|
record.inactiveAt = time.Now()
|
|
m.ackState[alertID] = record
|
|
}
|
|
}
|
|
|
|
// GetActiveAlerts returns all active alerts
|
|
func (m *Manager) GetActiveAlerts() []Alert {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
alerts := make([]Alert, 0, len(m.activeAlerts))
|
|
for _, alert := range m.activeAlerts {
|
|
a := *alert
|
|
// Ensure display name is current (handles upgrades, renames, and
|
|
// alerts created before the cache was populated).
|
|
if dn := m.resolveNodeDisplayName(a.Instance, a.Node); dn != "" {
|
|
a.NodeDisplayName = dn
|
|
}
|
|
alerts = append(alerts, a)
|
|
}
|
|
|
|
// Sort to ensure stable ordering across poll cycles (map iteration is random)
|
|
sort.Slice(alerts, func(i, j int) bool {
|
|
if alerts[i].Node != alerts[j].Node {
|
|
return alerts[i].Node < alerts[j].Node
|
|
}
|
|
return alerts[i].ID < alerts[j].ID
|
|
})
|
|
|
|
return alerts
|
|
}
|
|
|
|
// NotifyExistingAlert re-dispatches a notification for an existing active alert
|
|
// Used when activation state changes from pending to active
|
|
func (m *Manager) NotifyExistingAlert(alertID string) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
// Dispatch notification for existing alert while holding lock
|
|
// dispatchAlert expects caller to hold m.mu for checkFlapping safety
|
|
m.dispatchAlert(alert, true)
|
|
}
|
|
|
|
// GetRecentlyResolved returns recently resolved alerts
|
|
func (m *Manager) GetRecentlyResolved() []models.ResolvedAlert {
|
|
m.resolvedMutex.RLock()
|
|
defer m.resolvedMutex.RUnlock()
|
|
|
|
resolved := make([]models.ResolvedAlert, 0, len(m.recentlyResolved))
|
|
for _, alert := range m.recentlyResolved {
|
|
resolved = append(resolved, models.ResolvedAlert{
|
|
Alert: models.Alert{
|
|
ID: alert.ID,
|
|
Type: alert.Type,
|
|
Level: string(alert.Level),
|
|
ResourceID: alert.ResourceID,
|
|
ResourceName: alert.ResourceName,
|
|
Node: alert.Node,
|
|
Instance: alert.Instance,
|
|
Message: alert.Message,
|
|
Value: alert.Value,
|
|
Threshold: alert.Threshold,
|
|
StartTime: alert.StartTime,
|
|
Acknowledged: alert.Acknowledged,
|
|
},
|
|
ResolvedTime: alert.ResolvedTime,
|
|
})
|
|
}
|
|
return resolved
|
|
}
|
|
|
|
// GetResolvedAlert returns a copy of a recently resolved alert by ID.
|
|
func (m *Manager) GetResolvedAlert(alertID string) *ResolvedAlert {
|
|
m.resolvedMutex.RLock()
|
|
defer m.resolvedMutex.RUnlock()
|
|
|
|
resolved, ok := m.recentlyResolved[alertID]
|
|
if !ok || resolved == nil || resolved.Alert == nil {
|
|
return nil
|
|
}
|
|
|
|
return &ResolvedAlert{
|
|
Alert: resolved.Alert.Clone(),
|
|
ResolvedTime: resolved.ResolvedTime,
|
|
}
|
|
}
|
|
|
|
// GetAlertHistory returns alert history
|
|
func (m *Manager) GetAlertHistory(limit int) []Alert {
|
|
return m.historyManager.GetAllHistory(limit)
|
|
}
|
|
|
|
// GetAlertHistorySince returns alert history entries created after the provided time.
|
|
func (m *Manager) GetAlertHistorySince(since time.Time, limit int) []Alert {
|
|
if since.IsZero() {
|
|
return m.GetAlertHistory(limit)
|
|
}
|
|
|
|
return m.historyManager.GetHistory(since, limit)
|
|
}
|
|
|
|
// ClearAlertHistory clears all alert history
|
|
func (m *Manager) ClearAlertHistory() error {
|
|
return m.historyManager.ClearAllHistory()
|
|
}
|
|
|
|
// OnAlertHistory registers a callback to be called when alerts are added to history.
|
|
// This enables external systems like pattern detection to track alerts.
|
|
func (m *Manager) OnAlertHistory(cb AlertCallback) {
|
|
if m.historyManager != nil {
|
|
m.historyManager.OnAlert(cb)
|
|
}
|
|
}
|
|
|
|
// checkNodeOffline creates an alert for offline nodes after confirmation
|
|
func (m *Manager) checkNodeOffline(node models.Node) {
|
|
alertID := fmt.Sprintf("node-offline-%s", node.ID)
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Check if node connectivity alerts are disabled
|
|
if override, exists := m.config.Overrides[node.ID]; exists && override.DisableConnectivity {
|
|
// Node connectivity alerts are disabled, clear any existing alert and return
|
|
if _, alertExists := m.activeAlerts[alertID]; alertExists {
|
|
m.clearAlertNoLock(alertID)
|
|
log.Debug().
|
|
Str("node", node.Name).
|
|
Msg("Node offline alert cleared (connectivity alerts disabled)")
|
|
}
|
|
delete(m.nodeOfflineCount, node.ID)
|
|
return
|
|
}
|
|
|
|
// Check if alert already exists
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
// Alert already exists, just update last seen time
|
|
m.activeAlerts[alertID].LastSeen = time.Now()
|
|
return
|
|
}
|
|
|
|
// Increment offline count
|
|
m.nodeOfflineCount[node.ID]++
|
|
offlineCount := m.nodeOfflineCount[node.ID]
|
|
|
|
log.Debug().
|
|
Str("node", node.Name).
|
|
Str("instance", node.Instance).
|
|
Int("offlineCount", offlineCount).
|
|
Msg("Node offline detection count")
|
|
|
|
// Require 3 consecutive offline polls (~15 seconds) before alerting
|
|
// This prevents false positives from transient cluster communication issues
|
|
const requiredOfflineCount = 3
|
|
if offlineCount < requiredOfflineCount {
|
|
log.Info().
|
|
Str("node", node.Name).
|
|
Int("count", offlineCount).
|
|
Int("required", requiredOfflineCount).
|
|
Msg("Node appears offline, waiting for confirmation")
|
|
return
|
|
}
|
|
|
|
// Create new offline alert after confirmation
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "connectivity",
|
|
Level: AlertLevelCritical, // Node offline is always critical
|
|
ResourceID: node.ID,
|
|
ResourceName: node.Name,
|
|
Node: node.Name,
|
|
NodeDisplayName: m.resolveNodeDisplayName(node.Instance, node.Name),
|
|
Instance: node.Instance,
|
|
Message: fmt.Sprintf("Node '%s' is offline", node.Name),
|
|
Value: 0, // Not applicable for offline status
|
|
Threshold: 0, // Not applicable for offline status
|
|
StartTime: time.Now(),
|
|
Acknowledged: false,
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
|
|
// Add to history
|
|
m.historyManager.AddAlert(*alert)
|
|
|
|
// Send notification after confirmation
|
|
if !m.checkRateLimit(alertID) {
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
|
|
Msg("Node offline alert suppressed due to rate limit")
|
|
return
|
|
}
|
|
|
|
m.dispatchAlert(alert, false)
|
|
|
|
// Log the critical event
|
|
log.Error().
|
|
Str("node", node.Name).
|
|
Str("instance", node.Instance).
|
|
Str("status", node.Status).
|
|
Str("connectionHealth", node.ConnectionHealth).
|
|
Int("confirmedAfter", requiredOfflineCount).
|
|
Msg("CRITICAL: Node is offline (confirmed)")
|
|
}
|
|
|
|
// clearNodeOfflineAlert removes offline alert when node comes back online
|
|
func (m *Manager) clearNodeOfflineAlert(node models.Node) {
|
|
alertID := fmt.Sprintf("node-offline-%s", node.ID)
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Reset offline count when node comes back online
|
|
if m.nodeOfflineCount[node.ID] > 0 {
|
|
log.Debug().
|
|
Str("node", node.Name).
|
|
Int("previousCount", m.nodeOfflineCount[node.ID]).
|
|
Msg("Node back online, resetting offline count")
|
|
delete(m.nodeOfflineCount, node.ID)
|
|
}
|
|
|
|
// Check if offline alert exists
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
// Remove from active alerts
|
|
m.removeActiveAlertNoLock(alertID)
|
|
|
|
resolvedAlert := &ResolvedAlert{
|
|
Alert: alert,
|
|
ResolvedTime: time.Now(),
|
|
}
|
|
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
|
|
|
|
// Send recovery notification (async to avoid blocking alert cleanup while holding m.mu.Lock)
|
|
m.safeCallResolvedCallback(alertID, true)
|
|
|
|
// Log recovery
|
|
log.Info().
|
|
Str("node", node.Name).
|
|
Str("instance", node.Instance).
|
|
Dur("downtime", time.Since(alert.StartTime)).
|
|
Msg("Node is back online")
|
|
}
|
|
|
|
// checkPBSOffline creates an alert for offline PBS instances
|
|
func (m *Manager) checkPBSOffline(pbs models.PBSInstance) {
|
|
alertID := fmt.Sprintf("pbs-offline-%s", pbs.ID)
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Check if PBS offline alerts are disabled via disableConnectivity flag
|
|
if override, exists := m.config.Overrides[pbs.ID]; exists && (override.Disabled || override.DisableConnectivity) {
|
|
// PBS connectivity alerts are disabled, clear any existing alert and return
|
|
if _, alertExists := m.activeAlerts[alertID]; alertExists {
|
|
m.clearAlertNoLock(alertID)
|
|
log.Debug().
|
|
Str("pbs", pbs.Name).
|
|
Msg("PBS offline alert cleared (connectivity alerts disabled)")
|
|
}
|
|
return
|
|
}
|
|
|
|
// Track confirmation count for this PBS
|
|
m.offlineConfirmations[pbs.ID]++
|
|
|
|
// Require 3 consecutive offline polls (~15 seconds) before alerting
|
|
if m.offlineConfirmations[pbs.ID] < 3 {
|
|
log.Debug().
|
|
Str("pbs", pbs.Name).
|
|
Int("confirmations", m.offlineConfirmations[pbs.ID]).
|
|
Msg("PBS offline detected, waiting for confirmation")
|
|
return
|
|
}
|
|
|
|
// Check if alert already exists
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
// Update last seen time
|
|
m.activeAlerts[alertID].LastSeen = time.Now()
|
|
return
|
|
}
|
|
|
|
// Create new offline alert after confirmation
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "offline",
|
|
Level: AlertLevelCritical,
|
|
ResourceID: pbs.ID,
|
|
ResourceName: pbs.Name,
|
|
Node: pbs.Host,
|
|
Instance: pbs.Name,
|
|
Message: fmt.Sprintf("PBS instance %s is offline", pbs.Name),
|
|
Value: 0,
|
|
Threshold: 0,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
|
|
// Log and notify
|
|
log.Error().
|
|
Str("pbs", pbs.Name).
|
|
Str("host", pbs.Host).
|
|
Int("confirmations", m.offlineConfirmations[pbs.ID]).
|
|
Msg("PBS instance is offline")
|
|
|
|
if !m.checkRateLimit(alertID) {
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
|
|
Msg("PBS offline alert suppressed due to rate limit")
|
|
return
|
|
}
|
|
|
|
m.dispatchAlert(alert, true)
|
|
}
|
|
|
|
// clearPBSOfflineAlert removes offline alert when PBS comes back online
|
|
func (m *Manager) clearPBSOfflineAlert(pbs models.PBSInstance) {
|
|
alertID := fmt.Sprintf("pbs-offline-%s", pbs.ID)
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Reset offline confirmation count
|
|
if count, exists := m.offlineConfirmations[pbs.ID]; exists && count > 0 {
|
|
log.Debug().
|
|
Str("pbs", pbs.Name).
|
|
Int("previousCount", count).
|
|
Msg("PBS is online, resetting offline confirmation count")
|
|
delete(m.offlineConfirmations, pbs.ID)
|
|
}
|
|
|
|
// Check if offline alert exists
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
// Remove from active alerts
|
|
m.removeActiveAlertNoLock(alertID)
|
|
|
|
resolvedAlert := &ResolvedAlert{
|
|
Alert: alert,
|
|
ResolvedTime: time.Now(),
|
|
}
|
|
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
|
|
|
|
// Send recovery notification (async to avoid blocking alert cleanup while holding m.mu.Lock)
|
|
m.safeCallResolvedCallback(alertID, true)
|
|
|
|
// Log recovery
|
|
log.Info().
|
|
Str("pbs", pbs.Name).
|
|
Str("host", pbs.Host).
|
|
Dur("downtime", time.Since(alert.StartTime)).
|
|
Msg("PBS instance is back online")
|
|
}
|
|
|
|
// checkPMGOffline creates an alert for offline PMG instances
|
|
func (m *Manager) checkPMGOffline(pmg models.PMGInstance) {
|
|
alertID := fmt.Sprintf("pmg-offline-%s", pmg.ID)
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Check if PMG offline alerts are disabled via disableConnectivity flag
|
|
if override, exists := m.config.Overrides[pmg.ID]; exists && (override.Disabled || override.DisableConnectivity) {
|
|
// PMG connectivity alerts are disabled, clear any existing alert and return
|
|
if _, alertExists := m.activeAlerts[alertID]; alertExists {
|
|
m.clearAlertNoLock(alertID)
|
|
log.Debug().
|
|
Str("pmg", pmg.Name).
|
|
Msg("PMG offline alert cleared (connectivity alerts disabled)")
|
|
}
|
|
return
|
|
}
|
|
|
|
// Track confirmation count for this PMG
|
|
m.offlineConfirmations[pmg.ID]++
|
|
|
|
// Require 3 consecutive offline polls (~15 seconds) before alerting
|
|
if m.offlineConfirmations[pmg.ID] < 3 {
|
|
log.Debug().
|
|
Str("pmg", pmg.Name).
|
|
Int("confirmations", m.offlineConfirmations[pmg.ID]).
|
|
Msg("PMG offline detected, waiting for confirmation")
|
|
return
|
|
}
|
|
|
|
// Check if alert already exists
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
// Update last seen time
|
|
m.activeAlerts[alertID].LastSeen = time.Now()
|
|
return
|
|
}
|
|
|
|
// Create new offline alert after confirmation
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "offline",
|
|
Level: AlertLevelCritical,
|
|
ResourceID: pmg.ID,
|
|
ResourceName: pmg.Name,
|
|
Node: pmg.Host,
|
|
Instance: pmg.Name,
|
|
Message: fmt.Sprintf("PMG instance %s is offline", pmg.Name),
|
|
Value: 0,
|
|
Threshold: 0,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
|
|
// Log and notify
|
|
log.Error().
|
|
Str("pmg", pmg.Name).
|
|
Str("host", pmg.Host).
|
|
Int("confirmations", m.offlineConfirmations[pmg.ID]).
|
|
Msg("PMG instance is offline")
|
|
|
|
if !m.checkRateLimit(alertID) {
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
|
|
Msg("PMG offline alert suppressed due to rate limit")
|
|
return
|
|
}
|
|
|
|
m.dispatchAlert(alert, true)
|
|
}
|
|
|
|
// clearPMGOfflineAlert removes offline alert when PMG comes back online
|
|
func (m *Manager) clearPMGOfflineAlert(pmg models.PMGInstance) {
|
|
alertID := fmt.Sprintf("pmg-offline-%s", pmg.ID)
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Reset offline confirmation count
|
|
if count, exists := m.offlineConfirmations[pmg.ID]; exists && count > 0 {
|
|
log.Debug().
|
|
Str("pmg", pmg.Name).
|
|
Int("previousCount", count).
|
|
Msg("PMG is online, resetting offline confirmation count")
|
|
delete(m.offlineConfirmations, pmg.ID)
|
|
}
|
|
|
|
// Check if offline alert exists
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
// Remove from active alerts
|
|
m.removeActiveAlertNoLock(alertID)
|
|
|
|
resolvedAlert := &ResolvedAlert{
|
|
Alert: alert,
|
|
ResolvedTime: time.Now(),
|
|
}
|
|
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
|
|
|
|
// Send recovery notification (async to avoid blocking alert cleanup while holding m.mu.Lock)
|
|
m.safeCallResolvedCallback(alertID, true)
|
|
|
|
// Log recovery
|
|
log.Info().
|
|
Str("pmg", pmg.Name).
|
|
Str("host", pmg.Host).
|
|
Dur("downtime", time.Since(alert.StartTime)).
|
|
Msg("PMG instance is back online")
|
|
}
|
|
|
|
// checkPMGQueueDepths checks PMG mail queue depths and creates alerts
|
|
// Evaluates all queue types (total, deferred, hold) independently
|
|
func (m *Manager) checkPMGQueueDepths(pmg models.PMGInstance, defaults PMGThresholdConfig) {
|
|
// Aggregate queue totals across all nodes
|
|
var totalQueue, totalDeferred, totalHold int
|
|
|
|
for _, node := range pmg.Nodes {
|
|
if node.QueueStatus != nil {
|
|
totalQueue += node.QueueStatus.Total
|
|
totalDeferred += node.QueueStatus.Deferred
|
|
totalHold += node.QueueStatus.Hold
|
|
}
|
|
}
|
|
|
|
// Check total queue depth
|
|
if defaults.QueueTotalWarning > 0 || defaults.QueueTotalCritical > 0 {
|
|
alertID := fmt.Sprintf("%s-queue-total", pmg.ID)
|
|
var level AlertLevel
|
|
var threshold int
|
|
var shouldAlert bool
|
|
|
|
if defaults.QueueTotalCritical > 0 && totalQueue >= defaults.QueueTotalCritical {
|
|
level = AlertLevelCritical
|
|
threshold = defaults.QueueTotalCritical
|
|
shouldAlert = true
|
|
} else if defaults.QueueTotalWarning > 0 && totalQueue >= defaults.QueueTotalWarning {
|
|
level = AlertLevelWarning
|
|
threshold = defaults.QueueTotalWarning
|
|
shouldAlert = true
|
|
}
|
|
|
|
if !shouldAlert {
|
|
m.clearAlert(alertID)
|
|
} else {
|
|
m.mu.Lock()
|
|
if alert, exists := m.activeAlerts[alertID]; exists {
|
|
alert.LastSeen = time.Now()
|
|
alert.Value = float64(totalQueue)
|
|
alert.Threshold = float64(threshold)
|
|
alert.Level = level
|
|
} else {
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "queue-depth",
|
|
Level: level,
|
|
ResourceID: pmg.ID,
|
|
ResourceName: pmg.Name,
|
|
Node: pmg.Host,
|
|
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, pmg.Host),
|
|
Instance: pmg.Name,
|
|
Message: fmt.Sprintf("PMG %s has %d total messages in queue (threshold: %d)", pmg.Name, totalQueue, threshold),
|
|
Value: float64(totalQueue),
|
|
Threshold: float64(threshold),
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
}
|
|
m.activeAlerts[alertID] = alert
|
|
m.dispatchAlert(alert, true)
|
|
log.Warn().
|
|
Str("pmg", pmg.Name).
|
|
Int("total_queue", totalQueue).
|
|
Int("threshold", threshold).
|
|
Str("level", string(level)).
|
|
Msg("PMG total queue depth alert triggered")
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
}
|
|
|
|
// Check deferred queue depth
|
|
if defaults.DeferredQueueWarn > 0 || defaults.DeferredQueueCritical > 0 {
|
|
alertID := fmt.Sprintf("%s-queue-deferred", pmg.ID)
|
|
var level AlertLevel
|
|
var threshold int
|
|
var shouldAlert bool
|
|
|
|
if defaults.DeferredQueueCritical > 0 && totalDeferred >= defaults.DeferredQueueCritical {
|
|
level = AlertLevelCritical
|
|
threshold = defaults.DeferredQueueCritical
|
|
shouldAlert = true
|
|
} else if defaults.DeferredQueueWarn > 0 && totalDeferred >= defaults.DeferredQueueWarn {
|
|
level = AlertLevelWarning
|
|
threshold = defaults.DeferredQueueWarn
|
|
shouldAlert = true
|
|
}
|
|
|
|
if !shouldAlert {
|
|
m.clearAlert(alertID)
|
|
} else {
|
|
m.mu.Lock()
|
|
if alert, exists := m.activeAlerts[alertID]; exists {
|
|
alert.LastSeen = time.Now()
|
|
alert.Value = float64(totalDeferred)
|
|
alert.Threshold = float64(threshold)
|
|
alert.Level = level
|
|
} else {
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "queue-deferred",
|
|
Level: level,
|
|
ResourceID: pmg.ID,
|
|
ResourceName: pmg.Name,
|
|
Node: pmg.Host,
|
|
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, pmg.Host),
|
|
Instance: pmg.Name,
|
|
Message: fmt.Sprintf("PMG %s has %d deferred messages (threshold: %d)", pmg.Name, totalDeferred, threshold),
|
|
Value: float64(totalDeferred),
|
|
Threshold: float64(threshold),
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
}
|
|
m.activeAlerts[alertID] = alert
|
|
m.dispatchAlert(alert, true)
|
|
log.Warn().
|
|
Str("pmg", pmg.Name).
|
|
Int("deferred_queue", totalDeferred).
|
|
Int("threshold", threshold).
|
|
Str("level", string(level)).
|
|
Msg("PMG deferred queue depth alert triggered")
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
}
|
|
|
|
// Check hold queue depth
|
|
if defaults.HoldQueueWarn > 0 || defaults.HoldQueueCritical > 0 {
|
|
alertID := fmt.Sprintf("%s-queue-hold", pmg.ID)
|
|
var level AlertLevel
|
|
var threshold int
|
|
var shouldAlert bool
|
|
|
|
if defaults.HoldQueueCritical > 0 && totalHold >= defaults.HoldQueueCritical {
|
|
level = AlertLevelCritical
|
|
threshold = defaults.HoldQueueCritical
|
|
shouldAlert = true
|
|
} else if defaults.HoldQueueWarn > 0 && totalHold >= defaults.HoldQueueWarn {
|
|
level = AlertLevelWarning
|
|
threshold = defaults.HoldQueueWarn
|
|
shouldAlert = true
|
|
}
|
|
|
|
if !shouldAlert {
|
|
m.clearAlert(alertID)
|
|
} else {
|
|
m.mu.Lock()
|
|
if alert, exists := m.activeAlerts[alertID]; exists {
|
|
alert.LastSeen = time.Now()
|
|
alert.Value = float64(totalHold)
|
|
alert.Threshold = float64(threshold)
|
|
alert.Level = level
|
|
} else {
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "queue-hold",
|
|
Level: level,
|
|
ResourceID: pmg.ID,
|
|
ResourceName: pmg.Name,
|
|
Node: pmg.Host,
|
|
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, pmg.Host),
|
|
Instance: pmg.Name,
|
|
Message: fmt.Sprintf("PMG %s has %d held messages (threshold: %d)", pmg.Name, totalHold, threshold),
|
|
Value: float64(totalHold),
|
|
Threshold: float64(threshold),
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
}
|
|
m.activeAlerts[alertID] = alert
|
|
m.dispatchAlert(alert, true)
|
|
log.Warn().
|
|
Str("pmg", pmg.Name).
|
|
Int("hold_queue", totalHold).
|
|
Int("threshold", threshold).
|
|
Str("level", string(level)).
|
|
Msg("PMG hold queue depth alert triggered")
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
}
|
|
}
|
|
|
|
// checkPMGOldestMessage checks oldest queued message age and creates alerts
|
|
func (m *Manager) checkPMGOldestMessage(pmg models.PMGInstance, defaults PMGThresholdConfig) {
|
|
if defaults.OldestMessageWarnMins <= 0 && defaults.OldestMessageCritMins <= 0 {
|
|
return
|
|
}
|
|
|
|
// Find the oldest message age across all nodes
|
|
var oldestAge int64 // in seconds
|
|
for _, node := range pmg.Nodes {
|
|
if node.QueueStatus != nil && node.QueueStatus.OldestAge > oldestAge {
|
|
oldestAge = node.QueueStatus.OldestAge
|
|
}
|
|
}
|
|
|
|
if oldestAge == 0 {
|
|
// No messages in queue, clear any existing alert
|
|
m.clearAlert(fmt.Sprintf("%s-oldest-message", pmg.ID))
|
|
return
|
|
}
|
|
|
|
alertID := fmt.Sprintf("%s-oldest-message", pmg.ID)
|
|
oldestMinutes := oldestAge / 60
|
|
|
|
var level AlertLevel
|
|
var threshold int64
|
|
|
|
if defaults.OldestMessageCritMins > 0 && oldestMinutes >= int64(defaults.OldestMessageCritMins) {
|
|
level = AlertLevelCritical
|
|
threshold = int64(defaults.OldestMessageCritMins)
|
|
} else if defaults.OldestMessageWarnMins > 0 && oldestMinutes >= int64(defaults.OldestMessageWarnMins) {
|
|
level = AlertLevelWarning
|
|
threshold = int64(defaults.OldestMessageWarnMins)
|
|
} else {
|
|
// Oldest message is below thresholds, clear any existing alert
|
|
m.clearAlert(alertID)
|
|
return
|
|
}
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Check if alert already exists
|
|
if alert, exists := m.activeAlerts[alertID]; exists {
|
|
// Update existing alert
|
|
alert.LastSeen = time.Now()
|
|
alert.Value = float64(oldestMinutes)
|
|
alert.Threshold = float64(threshold)
|
|
alert.Level = level
|
|
return
|
|
}
|
|
|
|
// Create new alert
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "message-age",
|
|
Level: level,
|
|
ResourceID: pmg.ID,
|
|
ResourceName: pmg.Name,
|
|
Node: pmg.Host,
|
|
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, pmg.Host),
|
|
Instance: pmg.Name,
|
|
Message: fmt.Sprintf("PMG %s has messages queued for %d minutes (threshold: %d minutes)", pmg.Name, oldestMinutes, threshold),
|
|
Value: float64(oldestMinutes),
|
|
Threshold: float64(threshold),
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
}
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
m.dispatchAlert(alert, true)
|
|
|
|
log.Warn().
|
|
Str("pmg", pmg.Name).
|
|
Int64("oldest_minutes", oldestMinutes).
|
|
Int64("threshold", threshold).
|
|
Str("level", string(level)).
|
|
Msg("PMG oldest message age alert triggered")
|
|
}
|
|
|
|
// checkPMGNodeQueues checks individual PMG node queue health
|
|
// Uses scaled thresholds (60% warn, 80% crit) and outlier detection
|
|
func (m *Manager) checkPMGNodeQueues(pmg models.PMGInstance, defaults PMGThresholdConfig) {
|
|
if len(pmg.Nodes) == 0 {
|
|
return
|
|
}
|
|
|
|
// Calculate median queue values across nodes for outlier detection
|
|
nodeQueueTotals := make([]int, 0, len(pmg.Nodes))
|
|
nodeQueueDeferred := make([]int, 0, len(pmg.Nodes))
|
|
nodeQueueHold := make([]int, 0, len(pmg.Nodes))
|
|
|
|
for _, node := range pmg.Nodes {
|
|
if node.QueueStatus != nil {
|
|
nodeQueueTotals = append(nodeQueueTotals, node.QueueStatus.Total)
|
|
nodeQueueDeferred = append(nodeQueueDeferred, node.QueueStatus.Deferred)
|
|
nodeQueueHold = append(nodeQueueHold, node.QueueStatus.Hold)
|
|
}
|
|
}
|
|
|
|
medianTotal := calculateMedianInt(nodeQueueTotals)
|
|
medianDeferred := calculateMedianInt(nodeQueueDeferred)
|
|
medianHold := calculateMedianInt(nodeQueueHold)
|
|
|
|
// Scaled thresholds: 60% for warning, 80% for critical (computed once, used for all nodes)
|
|
scaledQueueWarn := scaleThreshold(defaults.QueueTotalWarning, 0.6)
|
|
scaledQueueCrit := scaleThreshold(defaults.QueueTotalCritical, 0.8)
|
|
scaledDeferredWarn := scaleThreshold(defaults.DeferredQueueWarn, 0.6)
|
|
scaledDeferredCrit := scaleThreshold(defaults.DeferredQueueCritical, 0.8)
|
|
scaledHoldWarn := scaleThreshold(defaults.HoldQueueWarn, 0.6)
|
|
scaledHoldCrit := scaleThreshold(defaults.HoldQueueCritical, 0.8)
|
|
scaledAgeWarn := scaleThreshold(defaults.OldestMessageWarnMins, 0.6)
|
|
scaledAgeCrit := scaleThreshold(defaults.OldestMessageCritMins, 0.8)
|
|
|
|
// Check each node
|
|
for _, node := range pmg.Nodes {
|
|
if node.QueueStatus == nil {
|
|
continue
|
|
}
|
|
|
|
// Check total queue - always check thresholds
|
|
if scaledQueueWarn > 0 || scaledQueueCrit > 0 {
|
|
total := node.QueueStatus.Total
|
|
alertID := fmt.Sprintf("%s-%s-queue-total", pmg.ID, node.Name)
|
|
var level AlertLevel
|
|
var threshold int
|
|
|
|
if scaledQueueCrit > 0 && total >= scaledQueueCrit {
|
|
level = AlertLevelCritical
|
|
threshold = scaledQueueCrit
|
|
} else if scaledQueueWarn > 0 && total >= scaledQueueWarn {
|
|
level = AlertLevelWarning
|
|
threshold = scaledQueueWarn
|
|
} else {
|
|
m.clearAlert(alertID)
|
|
continue
|
|
}
|
|
|
|
// Add outlier indicator to message if applicable
|
|
isOutlier := isQueueOutlier(total, medianTotal)
|
|
outlierNote := ""
|
|
if isOutlier {
|
|
outlierNote = ", outlier"
|
|
}
|
|
|
|
m.createOrUpdateNodeAlert(alertID, pmg, node.Name, "queue-total", level, float64(total), float64(threshold),
|
|
fmt.Sprintf("PMG node %s on %s has %d total messages in queue (threshold: %d%s)",
|
|
node.Name, pmg.Name, total, threshold, outlierNote))
|
|
}
|
|
|
|
// Check deferred queue - always check thresholds
|
|
if scaledDeferredWarn > 0 || scaledDeferredCrit > 0 {
|
|
deferred := node.QueueStatus.Deferred
|
|
alertID := fmt.Sprintf("%s-%s-queue-deferred", pmg.ID, node.Name)
|
|
var level AlertLevel
|
|
var threshold int
|
|
|
|
if scaledDeferredCrit > 0 && deferred >= scaledDeferredCrit {
|
|
level = AlertLevelCritical
|
|
threshold = scaledDeferredCrit
|
|
} else if scaledDeferredWarn > 0 && deferred >= scaledDeferredWarn {
|
|
level = AlertLevelWarning
|
|
threshold = scaledDeferredWarn
|
|
} else {
|
|
m.clearAlert(alertID)
|
|
continue
|
|
}
|
|
|
|
// Add outlier indicator to message if applicable
|
|
isOutlier := isQueueOutlier(deferred, medianDeferred)
|
|
outlierNote := ""
|
|
if isOutlier {
|
|
outlierNote = ", outlier"
|
|
}
|
|
|
|
m.createOrUpdateNodeAlert(alertID, pmg, node.Name, "queue-deferred", level, float64(deferred), float64(threshold),
|
|
fmt.Sprintf("PMG node %s on %s has %d deferred messages (threshold: %d%s)",
|
|
node.Name, pmg.Name, deferred, threshold, outlierNote))
|
|
}
|
|
|
|
// Check hold queue - always check thresholds
|
|
if scaledHoldWarn > 0 || scaledHoldCrit > 0 {
|
|
hold := node.QueueStatus.Hold
|
|
alertID := fmt.Sprintf("%s-%s-queue-hold", pmg.ID, node.Name)
|
|
var level AlertLevel
|
|
var threshold int
|
|
|
|
if scaledHoldCrit > 0 && hold >= scaledHoldCrit {
|
|
level = AlertLevelCritical
|
|
threshold = scaledHoldCrit
|
|
} else if scaledHoldWarn > 0 && hold >= scaledHoldWarn {
|
|
level = AlertLevelWarning
|
|
threshold = scaledHoldWarn
|
|
} else {
|
|
m.clearAlert(alertID)
|
|
continue
|
|
}
|
|
|
|
// Add outlier indicator to message if applicable
|
|
isOutlier := isQueueOutlier(hold, medianHold)
|
|
outlierNote := ""
|
|
if isOutlier {
|
|
outlierNote = ", outlier"
|
|
}
|
|
|
|
m.createOrUpdateNodeAlert(alertID, pmg, node.Name, "queue-hold", level, float64(hold), float64(threshold),
|
|
fmt.Sprintf("PMG node %s on %s has %d held messages (threshold: %d%s)",
|
|
node.Name, pmg.Name, hold, threshold, outlierNote))
|
|
}
|
|
|
|
// Check oldest message age per node
|
|
if scaledAgeWarn > 0 || scaledAgeCrit > 0 {
|
|
oldestAge := node.QueueStatus.OldestAge
|
|
if oldestAge > 0 {
|
|
oldestMinutes := oldestAge / 60
|
|
alertID := fmt.Sprintf("%s-%s-oldest-message", pmg.ID, node.Name)
|
|
var level AlertLevel
|
|
var threshold int64
|
|
|
|
if scaledAgeCrit > 0 && oldestMinutes >= int64(scaledAgeCrit) {
|
|
level = AlertLevelCritical
|
|
threshold = int64(scaledAgeCrit)
|
|
} else if scaledAgeWarn > 0 && oldestMinutes >= int64(scaledAgeWarn) {
|
|
level = AlertLevelWarning
|
|
threshold = int64(scaledAgeWarn)
|
|
} else {
|
|
m.clearAlert(alertID)
|
|
continue
|
|
}
|
|
|
|
m.createOrUpdateNodeAlert(alertID, pmg, node.Name, "message-age", level, float64(oldestMinutes), float64(threshold),
|
|
fmt.Sprintf("PMG node %s on %s has messages queued for %d minutes (threshold: %d min, node-specific)",
|
|
node.Name, pmg.Name, oldestMinutes, threshold))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// isQueueOutlier reports whether value lies more than 40% above median.
// With a zero median any positive value counts as an outlier.
func isQueueOutlier(value, median int) bool {
	if median != 0 {
		excess := float64(value - median)
		percentAbove := excess / float64(median) * 100
		return percentAbove > 40
	}
	return value > 0
}
|
|
|
|
// scaleThreshold multiplies threshold by scaleFactor and rounds the result up.
// A non-positive threshold yields 0 (disabled); any positive threshold yields
// at least 1, so scaling can never turn an enabled threshold into a disabled one.
func scaleThreshold(threshold int, scaleFactor float64) int {
	if threshold <= 0 {
		return 0
	}
	// Round up rather than truncate so small thresholds are not scaled to zero.
	if scaled := int(math.Ceil(float64(threshold) * scaleFactor)); scaled >= 1 {
		return scaled
	}
	return 1
}
|
|
|
|
// calculateMedianInt returns the median of values, or 0 for an empty slice.
// For an even number of elements the two middle values are averaged using
// integer division. The input slice is not modified.
func calculateMedianInt(values []int) int {
	if len(values) == 0 {
		return 0
	}

	// Sort a copy so the caller's slice keeps its order.
	sorted := make([]int, len(values))
	copy(sorted, values)
	sort.Ints(sorted)

	mid := len(sorted) / 2
	if len(sorted)%2 == 0 {
		return (sorted[mid-1] + sorted[mid]) / 2
	}
	return sorted[mid]
}
|
|
|
|
// createOrUpdateNodeAlert creates or updates a per-node alert
|
|
func (m *Manager) createOrUpdateNodeAlert(alertID string, pmg models.PMGInstance, nodeName, alertType string, level AlertLevel, value, threshold float64, message string) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Check if alert already exists
|
|
if alert, exists := m.activeAlerts[alertID]; exists {
|
|
alert.LastSeen = time.Now()
|
|
alert.Value = value
|
|
alert.Threshold = threshold
|
|
alert.Level = level
|
|
alert.Message = message
|
|
return
|
|
}
|
|
|
|
// Create new alert
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: alertType,
|
|
Level: level,
|
|
ResourceID: pmg.ID,
|
|
ResourceName: pmg.Name,
|
|
Node: nodeName,
|
|
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, nodeName),
|
|
Instance: pmg.Name,
|
|
Message: message,
|
|
Value: value,
|
|
Threshold: threshold,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
}
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
m.dispatchAlert(alert, true)
|
|
|
|
log.Warn().
|
|
Str("pmg", pmg.Name).
|
|
Str("node", nodeName).
|
|
Str("type", alertType).
|
|
Float64("value", value).
|
|
Float64("threshold", threshold).
|
|
Str("level", string(level)).
|
|
Msg("PMG per-node alert triggered")
|
|
}
|
|
|
|
// checkPMGQuarantineBacklog checks quarantine backlog and growth rates
|
|
func (m *Manager) checkPMGQuarantineBacklog(pmg models.PMGInstance, defaults PMGThresholdConfig) {
|
|
if pmg.Quarantine == nil {
|
|
m.clearAlert(fmt.Sprintf("%s-quarantine-spam", pmg.ID))
|
|
m.clearAlert(fmt.Sprintf("%s-quarantine-virus", pmg.ID))
|
|
return
|
|
}
|
|
|
|
now := time.Now()
|
|
currentSpam := pmg.Quarantine.Spam
|
|
currentVirus := pmg.Quarantine.Virus
|
|
|
|
// Store current snapshot
|
|
m.mu.Lock()
|
|
snapshot := pmgQuarantineSnapshot{
|
|
Spam: currentSpam,
|
|
Virus: currentVirus,
|
|
Timestamp: now,
|
|
}
|
|
|
|
// Get or create history for this PMG instance
|
|
history := m.pmgQuarantineHistory[pmg.ID]
|
|
history = append(history, snapshot)
|
|
|
|
// Clean old snapshots (keep last 3 hours)
|
|
cutoff := now.Add(-3 * time.Hour)
|
|
validSnapshots := make([]pmgQuarantineSnapshot, 0, len(history))
|
|
for _, snap := range history {
|
|
if snap.Timestamp.After(cutoff) {
|
|
validSnapshots = append(validSnapshots, snap)
|
|
}
|
|
}
|
|
// Limit to max 48 samples to prevent unbounded growth
|
|
const maxQuarantineSnapshots = 48
|
|
if len(validSnapshots) > maxQuarantineSnapshots {
|
|
validSnapshots = validSnapshots[len(validSnapshots)-maxQuarantineSnapshots:]
|
|
}
|
|
m.pmgQuarantineHistory[pmg.ID] = validSnapshots
|
|
m.mu.Unlock()
|
|
|
|
// Find snapshot from ~2 hours ago (within ±15 min tolerance)
|
|
var twoHoursAgo *pmgQuarantineSnapshot
|
|
targetTime := now.Add(-2 * time.Hour)
|
|
minDiff := 15 * time.Minute
|
|
|
|
for i := range validSnapshots {
|
|
snap := &validSnapshots[i]
|
|
diff := snap.Timestamp.Sub(targetTime)
|
|
if diff < 0 {
|
|
diff = -diff
|
|
}
|
|
if diff < minDiff {
|
|
minDiff = diff
|
|
twoHoursAgo = snap
|
|
}
|
|
}
|
|
|
|
// Check spam quarantine
|
|
m.checkQuarantineMetric(pmg, "spam", currentSpam, twoHoursAgo, defaults)
|
|
|
|
// Check virus quarantine
|
|
m.checkQuarantineMetric(pmg, "virus", currentVirus, twoHoursAgo, defaults)
|
|
}
|
|
|
|
// checkQuarantineMetric checks a single quarantine metric (spam or virus)
|
|
func (m *Manager) checkQuarantineMetric(pmg models.PMGInstance, metricType string, current int, twoHoursAgo *pmgQuarantineSnapshot, defaults PMGThresholdConfig) {
|
|
alertID := fmt.Sprintf("%s-quarantine-%s", pmg.ID, metricType)
|
|
|
|
var absoluteWarn, absoluteCrit int
|
|
var previousCount int
|
|
|
|
// Get thresholds and previous count based on metric type
|
|
if metricType == "spam" {
|
|
absoluteWarn = defaults.QuarantineSpamWarn
|
|
absoluteCrit = defaults.QuarantineSpamCritical
|
|
if twoHoursAgo != nil {
|
|
previousCount = twoHoursAgo.Spam
|
|
}
|
|
} else { // virus
|
|
absoluteWarn = defaults.QuarantineVirusWarn
|
|
absoluteCrit = defaults.QuarantineVirusCritical
|
|
if twoHoursAgo != nil {
|
|
previousCount = twoHoursAgo.Virus
|
|
}
|
|
}
|
|
|
|
var level AlertLevel
|
|
var message string
|
|
var threshold int
|
|
var alertTriggered bool
|
|
|
|
// Check absolute thresholds first
|
|
if absoluteCrit > 0 && current >= absoluteCrit {
|
|
level = AlertLevelCritical
|
|
threshold = absoluteCrit
|
|
message = fmt.Sprintf("PMG %s has %d %s messages in quarantine (threshold: %d)", pmg.Name, current, metricType, threshold)
|
|
alertTriggered = true
|
|
} else if absoluteWarn > 0 && current >= absoluteWarn {
|
|
level = AlertLevelWarning
|
|
threshold = absoluteWarn
|
|
message = fmt.Sprintf("PMG %s has %d %s messages in quarantine (threshold: %d)", pmg.Name, current, metricType, threshold)
|
|
alertTriggered = true
|
|
}
|
|
|
|
// Check growth thresholds if we have historical data
|
|
if twoHoursAgo != nil && previousCount > 0 {
|
|
growth := current - previousCount
|
|
growthPct := (float64(growth) / float64(previousCount)) * 100
|
|
|
|
// Critical growth: ≥50% AND ≥500 messages
|
|
if defaults.QuarantineGrowthCritPct > 0 && defaults.QuarantineGrowthCritMin > 0 {
|
|
if growthPct >= float64(defaults.QuarantineGrowthCritPct) && growth >= defaults.QuarantineGrowthCritMin {
|
|
if level != AlertLevelCritical { // Only override if not already critical from absolute
|
|
level = AlertLevelCritical
|
|
threshold = previousCount + defaults.QuarantineGrowthCritMin
|
|
message = fmt.Sprintf("PMG %s %s quarantine growing rapidly: +%d messages (+%.1f%%) in 2 hours", pmg.Name, metricType, growth, growthPct)
|
|
alertTriggered = true
|
|
}
|
|
}
|
|
}
|
|
|
|
// Warning growth: ≥25% AND ≥250 messages (if not already critical)
|
|
if level != AlertLevelCritical && defaults.QuarantineGrowthWarnPct > 0 && defaults.QuarantineGrowthWarnMin > 0 {
|
|
if growthPct >= float64(defaults.QuarantineGrowthWarnPct) && growth >= defaults.QuarantineGrowthWarnMin {
|
|
level = AlertLevelWarning
|
|
threshold = previousCount + defaults.QuarantineGrowthWarnMin
|
|
message = fmt.Sprintf("PMG %s %s quarantine growing: +%d messages (+%.1f%%) in 2 hours", pmg.Name, metricType, growth, growthPct)
|
|
alertTriggered = true
|
|
}
|
|
}
|
|
}
|
|
|
|
// Clear alert if no thresholds exceeded
|
|
if !alertTriggered {
|
|
m.clearAlert(alertID)
|
|
return
|
|
}
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Check if alert already exists
|
|
if alert, exists := m.activeAlerts[alertID]; exists {
|
|
// Update existing alert
|
|
alert.LastSeen = time.Now()
|
|
alert.Value = float64(current)
|
|
alert.Threshold = float64(threshold)
|
|
alert.Level = level
|
|
alert.Message = message
|
|
return
|
|
}
|
|
|
|
// Create new alert
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: fmt.Sprintf("quarantine-%s", metricType),
|
|
Level: level,
|
|
ResourceID: pmg.ID,
|
|
ResourceName: pmg.Name,
|
|
Node: pmg.Host,
|
|
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, pmg.Host),
|
|
Instance: pmg.Name,
|
|
Message: message,
|
|
Value: float64(current),
|
|
Threshold: float64(threshold),
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
}
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
m.dispatchAlert(alert, true)
|
|
|
|
log.Warn().
|
|
Str("pmg", pmg.Name).
|
|
Str("type", metricType).
|
|
Int("current", current).
|
|
Int("threshold", threshold).
|
|
Str("level", string(level)).
|
|
Msg("PMG quarantine backlog alert triggered")
|
|
}
|
|
|
|
// calculateTrimmedBaseline computes a baseline from historical samples.
//
// Fewer than 12 samples are considered a warm-up period and yield (0, false).
// With 12 to 23 samples the plain arithmetic mean is returned. With 24 or more
// samples the two smallest and two largest values are dropped and the mean of
// the remainder is used, unless it deviates from the median by more than 40%
// of the median, in which case the median is returned instead. A median of
// exactly zero yields a baseline of zero. The input slice is not modified.
func calculateTrimmedBaseline(samples []float64) (baseline float64, trustworthy bool) {
	sampleCount := len(samples)

	// Need at least 12 samples for a trustworthy baseline (warmup period).
	if sampleCount < 12 {
		return 0, false
	}

	// For 12-23 samples, use the simple mean (not enough data for trimming).
	if sampleCount < 24 {
		sum := 0.0
		for _, val := range samples {
			sum += val
		}
		return sum / float64(sampleCount), true
	}

	// Sort a copy so the caller's slice keeps its order.
	sorted := make([]float64, sampleCount)
	copy(sorted, samples)
	sort.Float64s(sorted)

	var median float64
	mid := sampleCount / 2
	if sampleCount%2 == 0 {
		median = (sorted[mid-1] + sorted[mid]) / 2
	} else {
		median = sorted[mid]
	}

	// A relative deviation cannot be computed against a zero median. Both
	// outcomes of the fallback rule below reduce to 0 in that case (either the
	// trimmed mean is also 0, or it deviates infinitely and the median wins),
	// so return it directly instead of dividing by zero.
	if median == 0 {
		return 0, true
	}

	// Trimmed mean: drop the 2 lowest and 2 highest values, average the rest.
	trimmed := sorted[2 : sampleCount-2]
	sum := 0.0
	for _, val := range trimmed {
		sum += val
	}
	trimmedMean := sum / float64(len(trimmed))

	// Fallback rule: if the trimmed mean differs from the median by >40%, use the median.
	diff := trimmedMean - median
	if diff < 0 {
		diff = -diff
	}
	if (diff/median)*100 > 40 {
		return median, true
	}
	return trimmedMean, true
}
|
|
|
|
// checkPMGAnomalies detects spam/virus rate anomalies using trimmed baseline
|
|
func (m *Manager) checkPMGAnomalies(pmg models.PMGInstance, _ PMGThresholdConfig) {
|
|
// Need mail count data
|
|
if len(pmg.MailCount) == 0 {
|
|
return
|
|
}
|
|
|
|
// Get the latest hourly sample (most recent)
|
|
latest := pmg.MailCount[len(pmg.MailCount)-1]
|
|
now := time.Now()
|
|
|
|
// Get or create anomaly tracker for this PMG instance
|
|
m.mu.Lock()
|
|
tracker := m.pmgAnomalyTrackers[pmg.ID]
|
|
if tracker == nil {
|
|
tracker = &pmgAnomalyTracker{
|
|
Samples: make([]pmgMailMetricSample, 0, 48),
|
|
Baselines: make(map[string]pmgBaselineCache),
|
|
}
|
|
m.pmgAnomalyTrackers[pmg.ID] = tracker
|
|
}
|
|
|
|
// Create sample from latest mail count
|
|
sample := pmgMailMetricSample{
|
|
SpamIn: latest.SpamIn,
|
|
SpamOut: latest.SpamOut,
|
|
VirusIn: latest.VirusIn,
|
|
VirusOut: latest.VirusOut,
|
|
Timestamp: latest.Timestamp,
|
|
}
|
|
|
|
// Check for duplicate timestamp (already processed this sample)
|
|
if !tracker.LastSampleTime.IsZero() && !sample.Timestamp.After(tracker.LastSampleTime) {
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
// Check for timestamp gaps (>90 min indicates data discontinuity)
|
|
if !tracker.LastSampleTime.IsZero() {
|
|
gap := sample.Timestamp.Sub(tracker.LastSampleTime)
|
|
if gap > 90*time.Minute {
|
|
// Discard old samples - data gap detected
|
|
log.Debug().
|
|
Str("pmg", pmg.Name).
|
|
Dur("gap", gap).
|
|
Msg("PMG mail count data gap detected, resetting anomaly history")
|
|
tracker.Samples = make([]pmgMailMetricSample, 0, 48)
|
|
tracker.SampleCount = 0
|
|
}
|
|
}
|
|
|
|
// Add sample to ring buffer
|
|
tracker.Samples = append(tracker.Samples, sample)
|
|
tracker.SampleCount++
|
|
tracker.LastSampleTime = sample.Timestamp
|
|
|
|
// Maintain ring buffer size (keep last 48)
|
|
if len(tracker.Samples) > 48 {
|
|
tracker.Samples = tracker.Samples[len(tracker.Samples)-48:]
|
|
}
|
|
|
|
sampleCount := len(tracker.Samples)
|
|
m.mu.Unlock()
|
|
|
|
// Need at least 12 samples for baseline warmup
|
|
if sampleCount < 12 {
|
|
log.Debug().
|
|
Str("pmg", pmg.Name).
|
|
Int("samples", sampleCount).
|
|
Msg("PMG anomaly detection warming up (need 12 samples)")
|
|
return
|
|
}
|
|
|
|
// Calculate baselines and check each metric
|
|
metrics := []struct {
|
|
name string
|
|
current float64
|
|
extractor func(pmgMailMetricSample) float64
|
|
}{
|
|
{"spamIn", sample.SpamIn, func(s pmgMailMetricSample) float64 { return s.SpamIn }},
|
|
{"spamOut", sample.SpamOut, func(s pmgMailMetricSample) float64 { return s.SpamOut }},
|
|
{"virusIn", sample.VirusIn, func(s pmgMailMetricSample) float64 { return s.VirusIn }},
|
|
{"virusOut", sample.VirusOut, func(s pmgMailMetricSample) float64 { return s.VirusOut }},
|
|
}
|
|
|
|
for _, metric := range metrics {
|
|
m.checkAnomalyMetric(pmg, tracker, metric.name, metric.current, metric.extractor, now)
|
|
}
|
|
}
|
|
|
|
// checkAnomalyMetric checks a single spam/virus metric for anomalies
|
|
func (m *Manager) checkAnomalyMetric(pmg models.PMGInstance, tracker *pmgAnomalyTracker, metricName string, current float64, extractor func(pmgMailMetricSample) float64, now time.Time) {
|
|
// Extract historical values for this metric (excluding current sample)
|
|
m.mu.RLock()
|
|
samples := tracker.Samples
|
|
m.mu.RUnlock()
|
|
|
|
if len(samples) < 2 {
|
|
return
|
|
}
|
|
|
|
// Get previous 24 samples (or all available if less than 25 total)
|
|
startIdx := 0
|
|
if len(samples) > 25 {
|
|
startIdx = len(samples) - 25
|
|
}
|
|
historicalSamples := samples[startIdx : len(samples)-1] // Exclude current (last) sample
|
|
|
|
// Extract metric values
|
|
values := make([]float64, 0, len(historicalSamples))
|
|
for _, s := range historicalSamples {
|
|
values = append(values, extractor(s))
|
|
}
|
|
|
|
// Calculate baseline
|
|
baseline, trustworthy := calculateTrimmedBaseline(values)
|
|
if !trustworthy {
|
|
return
|
|
}
|
|
|
|
// Handle zero baseline edge case
|
|
if baseline == 0 && current > 0 {
|
|
baseline = 1.0 // Treat as 1 for ratio math
|
|
}
|
|
|
|
// Determine warning and critical thresholds
|
|
var warnRatio, critRatio float64
|
|
var warnDelta, critDelta float64
|
|
|
|
if baseline < 40 {
|
|
// Quiet site: use minimum absolute deltas
|
|
warnRatio = 0
|
|
critRatio = 0
|
|
warnDelta = baseline + 60
|
|
critDelta = baseline + 120
|
|
} else {
|
|
// Normal site: use ratio + absolute delta
|
|
warnRatio = 1.8
|
|
critRatio = 2.5
|
|
warnDelta = baseline + 150
|
|
critDelta = baseline + 300
|
|
}
|
|
|
|
alertID := fmt.Sprintf("%s-anomaly-%s", pmg.ID, metricName)
|
|
pendingKey := fmt.Sprintf("pmg-anomaly-%s-%s", pmg.ID, metricName)
|
|
|
|
var level AlertLevel
|
|
var triggered bool
|
|
var ratio float64
|
|
|
|
if baseline > 0 {
|
|
ratio = current / baseline
|
|
}
|
|
|
|
// Check critical threshold
|
|
if critRatio > 0 && ratio >= critRatio && current >= critDelta {
|
|
level = AlertLevelCritical
|
|
triggered = true
|
|
} else if warnRatio > 0 && ratio >= warnRatio && current >= warnDelta {
|
|
level = AlertLevelWarning
|
|
triggered = true
|
|
} else if baseline < 40 {
|
|
// Quiet site absolute check
|
|
if current >= critDelta {
|
|
level = AlertLevelCritical
|
|
triggered = true
|
|
} else if current >= warnDelta {
|
|
level = AlertLevelWarning
|
|
triggered = true
|
|
}
|
|
}
|
|
|
|
// Two-sample confirmation using pendingAlerts
|
|
if triggered {
|
|
m.mu.Lock()
|
|
firstSeen, pending := m.pendingAlerts[pendingKey]
|
|
if !pending {
|
|
// First sample above threshold - mark as pending
|
|
m.pendingAlerts[pendingKey] = now
|
|
m.mu.Unlock()
|
|
log.Debug().
|
|
Str("pmg", pmg.Name).
|
|
Str("metric", metricName).
|
|
Float64("current", current).
|
|
Float64("baseline", baseline).
|
|
Msg("PMG anomaly pending confirmation (first sample)")
|
|
return
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
// Second consecutive sample above threshold - issue alert
|
|
log.Debug().
|
|
Str("pmg", pmg.Name).
|
|
Str("metric", metricName).
|
|
Float64("current", current).
|
|
Float64("baseline", baseline).
|
|
Dur("pending", now.Sub(firstSeen)).
|
|
Msg("PMG anomaly confirmed (second sample)")
|
|
|
|
m.mu.Lock()
|
|
delete(m.pendingAlerts, pendingKey) // Clear pending
|
|
|
|
// Check if alert already exists
|
|
if alert, exists := m.activeAlerts[alertID]; exists {
|
|
alert.LastSeen = now
|
|
alert.Value = current
|
|
alert.Threshold = baseline
|
|
alert.Level = level
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
// Create new alert
|
|
message := fmt.Sprintf("PMG %s anomaly detected: %s is %.1f messages/hour (%.1fx baseline of %.1f)",
|
|
pmg.Name, metricName, current, ratio, baseline)
|
|
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: fmt.Sprintf("anomaly-%s", metricName),
|
|
Level: level,
|
|
ResourceID: pmg.ID,
|
|
ResourceName: pmg.Name,
|
|
Node: pmg.Host,
|
|
NodeDisplayName: m.resolveNodeDisplayName(pmg.Name, pmg.Host),
|
|
Instance: pmg.Name,
|
|
Message: message,
|
|
Value: current,
|
|
Threshold: baseline,
|
|
StartTime: now,
|
|
LastSeen: now,
|
|
}
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
m.mu.Unlock()
|
|
m.dispatchAlert(alert, true)
|
|
|
|
log.Warn().
|
|
Str("pmg", pmg.Name).
|
|
Str("metric", metricName).
|
|
Float64("current", current).
|
|
Float64("baseline", baseline).
|
|
Float64("ratio", ratio).
|
|
Str("level", string(level)).
|
|
Msg("PMG anomaly alert triggered")
|
|
} else {
|
|
// Below threshold - clear pending and alert
|
|
m.mu.Lock()
|
|
delete(m.pendingAlerts, pendingKey)
|
|
m.mu.Unlock()
|
|
m.clearAlert(alertID)
|
|
}
|
|
}
|
|
|
|
// checkStorageOffline creates an alert for offline/unavailable storage
|
|
func (m *Manager) checkStorageOffline(storage models.Storage) {
|
|
alertID := fmt.Sprintf("storage-offline-%s", storage.ID)
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Check if storage offline alerts are disabled
|
|
if override, exists, _ := findStorageOverride(m.config.Overrides, storage); exists && override.Disabled {
|
|
// Storage alerts are disabled, clear any existing alert and return
|
|
if _, alertExists := m.activeAlerts[alertID]; alertExists {
|
|
m.clearAlertNoLock(alertID)
|
|
log.Debug().
|
|
Str("storage", storage.Name).
|
|
Msg("Storage offline alert cleared (alerts disabled)")
|
|
}
|
|
return
|
|
}
|
|
|
|
// Track confirmation count for this storage
|
|
m.offlineConfirmations[storage.ID]++
|
|
|
|
// Require 2 consecutive offline polls (~10 seconds) before alerting for storage
|
|
// (less than nodes since storage status can be more transient)
|
|
if m.offlineConfirmations[storage.ID] < 2 {
|
|
log.Debug().
|
|
Str("storage", storage.Name).
|
|
Int("confirmations", m.offlineConfirmations[storage.ID]).
|
|
Msg("Storage offline detected, waiting for confirmation")
|
|
return
|
|
}
|
|
|
|
// Check if alert already exists
|
|
if _, exists := m.activeAlerts[alertID]; exists {
|
|
// Update last seen time
|
|
m.activeAlerts[alertID].LastSeen = time.Now()
|
|
return
|
|
}
|
|
|
|
// Create new offline alert after confirmation
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "offline",
|
|
Level: AlertLevelWarning, // Storage offline is Warning, not Critical
|
|
ResourceID: storage.ID,
|
|
ResourceName: storage.Name,
|
|
Node: storage.Node,
|
|
Instance: storage.Instance,
|
|
Message: fmt.Sprintf("Storage %s on node %s is unavailable", storage.Name, storage.Node),
|
|
Value: 0,
|
|
Threshold: 0,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
|
|
// Log and notify
|
|
log.Warn().
|
|
Str("storage", storage.Name).
|
|
Str("node", storage.Node).
|
|
Int("confirmations", m.offlineConfirmations[storage.ID]).
|
|
Msg("Storage is offline/unavailable")
|
|
|
|
if !m.checkRateLimit(alertID) {
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
|
|
Msg("Storage offline alert suppressed due to rate limit")
|
|
return
|
|
}
|
|
|
|
m.dispatchAlert(alert, true)
|
|
}
|
|
|
|
// clearStorageOfflineAlert removes offline alert when storage comes back online
|
|
func (m *Manager) clearStorageOfflineAlert(storage models.Storage) {
|
|
alertID := fmt.Sprintf("storage-offline-%s", storage.ID)
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Reset offline confirmation count
|
|
if count, exists := m.offlineConfirmations[storage.ID]; exists && count > 0 {
|
|
log.Debug().
|
|
Str("storage", storage.Name).
|
|
Int("previousCount", count).
|
|
Msg("Storage is online, resetting offline confirmation count")
|
|
delete(m.offlineConfirmations, storage.ID)
|
|
}
|
|
|
|
// Check if offline alert exists
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
// Remove from active alerts
|
|
m.removeActiveAlertNoLock(alertID)
|
|
|
|
resolvedAlert := &ResolvedAlert{
|
|
Alert: alert,
|
|
ResolvedTime: time.Now(),
|
|
}
|
|
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
|
|
|
|
// Send recovery notification (async to avoid blocking alert cleanup while holding m.mu.Lock)
|
|
m.safeCallResolvedCallback(alertID, true)
|
|
|
|
// Log recovery
|
|
log.Info().
|
|
Str("storage", storage.Name).
|
|
Str("node", storage.Node).
|
|
Dur("downtime", time.Since(alert.StartTime)).
|
|
Msg("Storage is back online")
|
|
}
|
|
|
|
// checkGuestPoweredOff creates an alert for powered-off guests
|
|
func (m *Manager) checkGuestPoweredOff(guestID, name, node, instanceName, guestType string, monitorOnly bool) {
|
|
alertID := fmt.Sprintf("guest-powered-off-%s", guestID)
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Get thresholds to check if powered-off alerts are disabled
|
|
var thresholds ThresholdConfig
|
|
if override, exists := m.config.Overrides[guestID]; exists {
|
|
thresholds = override
|
|
} else {
|
|
thresholds = m.config.GuestDefaults
|
|
}
|
|
|
|
severity := normalizePoweredOffSeverity(thresholds.PoweredOffSeverity)
|
|
|
|
// Check if powered-off alerts are disabled for this guest
|
|
if thresholds.Disabled || thresholds.DisableConnectivity {
|
|
// Powered-off alerts are disabled, clear any existing alert and return
|
|
if _, alertExists := m.activeAlerts[alertID]; alertExists {
|
|
m.clearAlertNoLock(alertID)
|
|
log.Debug().
|
|
Str("guest", name).
|
|
Msg("Guest powered-off alert cleared (alerts disabled)")
|
|
}
|
|
delete(m.offlineConfirmations, guestID)
|
|
return
|
|
}
|
|
|
|
// Check if alert already exists
|
|
if alert, exists := m.activeAlerts[alertID]; exists {
|
|
// Alert already exists, just update LastSeen
|
|
alert.LastSeen = time.Now()
|
|
alert.Level = severity
|
|
if alert.Metadata == nil {
|
|
alert.Metadata = map[string]interface{}{}
|
|
}
|
|
alert.Metadata["monitorOnly"] = monitorOnly
|
|
return
|
|
}
|
|
|
|
// Increment confirmation count
|
|
m.offlineConfirmations[guestID]++
|
|
confirmCount := m.offlineConfirmations[guestID]
|
|
|
|
log.Debug().
|
|
Str("guest", name).
|
|
Str("type", guestType).
|
|
Int("confirmations", confirmCount).
|
|
Msg("Guest powered-off detected")
|
|
|
|
// Require 2 consecutive powered-off polls (~10 seconds) before alerting
|
|
// This prevents false positives from transient states
|
|
const requiredConfirmations = 2
|
|
if confirmCount < requiredConfirmations {
|
|
log.Debug().
|
|
Str("guest", name).
|
|
Int("count", confirmCount).
|
|
Int("required", requiredConfirmations).
|
|
Msg("Guest appears powered-off, waiting for confirmation")
|
|
return
|
|
}
|
|
|
|
// Create new powered-off alert after confirmation
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "powered-off",
|
|
Level: severity,
|
|
ResourceID: guestID,
|
|
ResourceName: name,
|
|
Node: node,
|
|
Instance: instanceName,
|
|
Message: fmt.Sprintf("%s '%s' is powered off", guestType, name),
|
|
Value: 0, // Not applicable for powered-off status
|
|
Threshold: 0, // Not applicable for powered-off status
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Acknowledged: false,
|
|
Metadata: map[string]interface{}{
|
|
"monitorOnly": monitorOnly,
|
|
},
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
|
|
// Add to history
|
|
m.historyManager.AddAlert(*alert)
|
|
|
|
// Send notification after confirmation
|
|
m.dispatchAlert(alert, false)
|
|
|
|
// Log the event
|
|
log.Warn().
|
|
Str("guest", name).
|
|
Str("type", guestType).
|
|
Str("node", node).
|
|
Str("instance", instanceName).
|
|
Int("confirmedAfter", requiredConfirmations).
|
|
Msg("Guest is powered off (confirmed)")
|
|
}
|
|
|
|
// clearGuestPoweredOffAlert removes powered-off alert when guest starts running
|
|
func (m *Manager) clearGuestPoweredOffAlert(guestID, name string) {
|
|
alertID := fmt.Sprintf("guest-powered-off-%s", guestID)
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Reset confirmation count when guest comes back online
|
|
if count, exists := m.offlineConfirmations[guestID]; exists && count > 0 {
|
|
log.Debug().
|
|
Str("guest", name).
|
|
Int("previousCount", count).
|
|
Msg("Guest is running, resetting powered-off confirmation count")
|
|
delete(m.offlineConfirmations, guestID)
|
|
}
|
|
|
|
// Check if powered-off alert exists
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
// Remove from active alerts
|
|
m.removeActiveAlertNoLock(alertID)
|
|
|
|
downtime := time.Since(alert.StartTime)
|
|
resolvedAlert := &ResolvedAlert{
|
|
Alert: alert,
|
|
ResolvedTime: time.Now(),
|
|
}
|
|
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
|
|
|
|
// Send recovery notification (async to avoid blocking alert cleanup while holding m.mu.Lock)
|
|
m.safeCallResolvedCallback(alertID, true)
|
|
|
|
// Log recovery
|
|
log.Info().
|
|
Str("guest", name).
|
|
Dur("downtime", downtime).
|
|
Msg("Guest is now running")
|
|
}
|
|
|
|
// ClearAlert removes an alert from active alerts (but keeps in history)
|
|
func (m *Manager) ClearAlert(alertID string) bool {
|
|
m.mu.Lock()
|
|
if _, exists := m.activeAlerts[alertID]; !exists {
|
|
m.mu.Unlock()
|
|
return false
|
|
}
|
|
|
|
m.clearAlertNoLock(alertID)
|
|
delete(m.recentAlerts, alertID)
|
|
delete(m.pendingAlerts, alertID)
|
|
delete(m.suppressedUntil, alertID)
|
|
delete(m.alertRateLimit, alertID)
|
|
m.mu.Unlock()
|
|
|
|
m.saveActiveAlertsAsync("manual-clear")
|
|
return true
|
|
}
|
|
|
|
// Cleanup removes old acknowledged alerts and cleans up tracking maps
|
|
func (m *Manager) Cleanup(maxAge time.Duration) {
|
|
m.mu.Lock()
|
|
now := time.Now()
|
|
var autoAcked []*Alert
|
|
|
|
lastSeenTooOld := func(alert *Alert, cutoff time.Duration) bool {
|
|
if alert == nil {
|
|
return true
|
|
}
|
|
lastSeen := alert.LastSeen
|
|
if lastSeen.IsZero() {
|
|
lastSeen = alert.StartTime
|
|
}
|
|
return now.Sub(lastSeen) > cutoff
|
|
}
|
|
|
|
// Auto-acknowledge old alerts if configured
|
|
if m.config.AutoAcknowledgeAfterHours > 0 {
|
|
autoAckThreshold := time.Duration(m.config.AutoAcknowledgeAfterHours) * time.Hour
|
|
for id, alert := range m.activeAlerts {
|
|
if !alert.Acknowledged && now.Sub(alert.StartTime) > autoAckThreshold {
|
|
log.Info().
|
|
Str("alertID", id).
|
|
Dur("age", now.Sub(alert.StartTime)).
|
|
Msg("Auto-acknowledging old alert")
|
|
alert.Acknowledged = true
|
|
ackTime := now
|
|
alert.AckTime = &ackTime
|
|
alert.AckUser = "system-auto"
|
|
autoAcked = append(autoAcked, alert.Clone())
|
|
|
|
if recordAlertAcknowledged != nil {
|
|
recordAlertAcknowledged()
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Clean up acknowledged alerts based on TTL
|
|
if m.config.MaxAcknowledgedAgeDays > 0 {
|
|
acknowledgedTTL := time.Duration(m.config.MaxAcknowledgedAgeDays) * 24 * time.Hour
|
|
for id, alert := range m.activeAlerts {
|
|
if alert.Acknowledged && alert.AckTime != nil &&
|
|
now.Sub(*alert.AckTime) > acknowledgedTTL &&
|
|
lastSeenTooOld(alert, acknowledgedTTL) {
|
|
log.Info().
|
|
Str("alertID", id).
|
|
Dur("age", now.Sub(*alert.AckTime)).
|
|
Msg("Cleaning up old acknowledged alert (TTL)")
|
|
m.removeActiveAlertNoLock(id)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Clean up old unacknowledged alerts based on TTL
|
|
if m.config.MaxAlertAgeDays > 0 {
|
|
alertTTL := time.Duration(m.config.MaxAlertAgeDays) * 24 * time.Hour
|
|
for id, alert := range m.activeAlerts {
|
|
if !alert.Acknowledged && now.Sub(alert.StartTime) > alertTTL {
|
|
log.Info().
|
|
Str("alertID", id).
|
|
Dur("age", now.Sub(alert.StartTime)).
|
|
Msg("Cleaning up old unacknowledged alert (TTL)")
|
|
m.removeActiveAlertNoLock(id)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Original cleanup for acknowledged alerts (fallback if TTL not configured)
|
|
for id, alert := range m.activeAlerts {
|
|
if alert.Acknowledged && alert.AckTime != nil &&
|
|
now.Sub(*alert.AckTime) > maxAge &&
|
|
lastSeenTooOld(alert, maxAge) {
|
|
m.removeActiveAlertNoLock(id)
|
|
}
|
|
}
|
|
|
|
// Clean up stale ackState entries for alerts that no longer exist
|
|
// Keep ackState for 1 hour after the alert was removed (not from ack time)
|
|
// to handle transient alert clears (e.g., backups of powered-off VMs)
|
|
ackStateTTL := 1 * time.Hour
|
|
for id, record := range m.ackState {
|
|
if _, alertExists := m.activeAlerts[id]; !alertExists {
|
|
// Use inactiveAt (when alert was removed) for TTL, not ack time
|
|
checkTime := record.inactiveAt
|
|
if checkTime.IsZero() {
|
|
// Fallback for legacy entries without inactiveAt
|
|
checkTime = record.time
|
|
}
|
|
if now.Sub(checkTime) > ackStateTTL {
|
|
delete(m.ackState, id)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Clean up recent alerts older than suppression window
|
|
suppressionWindow := time.Duration(m.config.SuppressionWindow) * time.Minute
|
|
if suppressionWindow == 0 {
|
|
suppressionWindow = 5 * time.Minute // Default
|
|
}
|
|
|
|
for id, alert := range m.recentAlerts {
|
|
if now.Sub(alert.StartTime) > suppressionWindow {
|
|
delete(m.recentAlerts, id)
|
|
}
|
|
}
|
|
|
|
// Clean up expired suppressions
|
|
for id, suppressUntil := range m.suppressedUntil {
|
|
if now.After(suppressUntil) {
|
|
delete(m.suppressedUntil, id)
|
|
}
|
|
}
|
|
|
|
// Clean up old rate limit entries (older than 1 hour)
|
|
cutoff := now.Add(-1 * time.Hour)
|
|
for alertID, times := range m.alertRateLimit {
|
|
var recentTimes []time.Time
|
|
for _, t := range times {
|
|
if t.After(cutoff) {
|
|
recentTimes = append(recentTimes, t)
|
|
}
|
|
}
|
|
if len(recentTimes) == 0 {
|
|
// No recent alerts, remove the entry entirely
|
|
delete(m.alertRateLimit, alertID)
|
|
} else {
|
|
// Update with only recent times
|
|
m.alertRateLimit[alertID] = recentTimes
|
|
}
|
|
}
|
|
|
|
// Clean up old recently resolved alerts (older than 5 minutes)
|
|
fiveMinutesAgo := now.Add(-5 * time.Minute)
|
|
m.resolvedMutex.Lock()
|
|
for alertID, resolved := range m.recentlyResolved {
|
|
if resolved.ResolvedTime.Before(fiveMinutesAgo) {
|
|
delete(m.recentlyResolved, alertID)
|
|
}
|
|
}
|
|
m.resolvedMutex.Unlock()
|
|
|
|
// Clean up stale pending alerts (older than max time threshold window)
|
|
// This prevents memory leak from deleted resources that never triggered alerts
|
|
maxPendingAge := 10 * time.Minute // Longest time threshold + safety buffer
|
|
for id, pendingTime := range m.pendingAlerts {
|
|
if now.Sub(pendingTime) > maxPendingAge {
|
|
delete(m.pendingAlerts, id)
|
|
log.Debug().
|
|
Str("resourceID", id).
|
|
Dur("age", now.Sub(pendingTime)).
|
|
Msg("Cleaned up stale pending alert entry")
|
|
}
|
|
}
|
|
|
|
// Clean up flapping history for resolved/inactive alerts
|
|
flappingCleanupAge := 1 * time.Hour
|
|
for alertID := range m.flappingHistory {
|
|
// If alert is no longer active and flapping cooldown has expired
|
|
if _, exists := m.activeAlerts[alertID]; !exists {
|
|
if suppressUntil, suppressed := m.suppressedUntil[alertID]; !suppressed || now.After(suppressUntil.Add(flappingCleanupAge)) {
|
|
delete(m.flappingHistory, alertID)
|
|
delete(m.flappingActive, alertID)
|
|
log.Debug().
|
|
Str("alertID", alertID).
|
|
Msg("Cleaned up flapping history for inactive alert")
|
|
}
|
|
}
|
|
}
|
|
|
|
// Clean up old Docker restart tracking (containers not seen in 24h)
|
|
// Prevents memory leak from ephemeral containers in CI/CD environments
|
|
for resourceID, record := range m.dockerRestartTracking {
|
|
if now.Sub(record.lastChecked) > 24*time.Hour {
|
|
delete(m.dockerRestartTracking, resourceID)
|
|
log.Debug().
|
|
Str("resourceID", resourceID).
|
|
Msg("Cleaned up stale Docker restart tracking entry")
|
|
}
|
|
}
|
|
|
|
// Clean up stale PMG anomaly trackers (no samples in 24h)
|
|
// Prevents memory leak from decommissioned or transient PMG instances
|
|
staleTrackerAge := 24 * time.Hour
|
|
for pmgID, tracker := range m.pmgAnomalyTrackers {
|
|
if tracker != nil && !tracker.LastSampleTime.IsZero() {
|
|
if now.Sub(tracker.LastSampleTime) > staleTrackerAge {
|
|
delete(m.pmgAnomalyTrackers, pmgID)
|
|
log.Debug().
|
|
Str("pmgID", pmgID).
|
|
Time("lastSampleTime", tracker.LastSampleTime).
|
|
Msg("Cleaned up stale PMG anomaly tracker")
|
|
}
|
|
}
|
|
}
|
|
|
|
// Clean up stale PMG quarantine history (no recent snapshots in 7 days)
|
|
// Prevents memory leak from deleted PMG instances
|
|
staleHistoryAge := 7 * 24 * time.Hour
|
|
for pmgID, snapshots := range m.pmgQuarantineHistory {
|
|
// If no snapshots remain or last snapshot is very old
|
|
if len(snapshots) == 0 {
|
|
delete(m.pmgQuarantineHistory, pmgID)
|
|
log.Debug().
|
|
Str("pmgID", pmgID).
|
|
Msg("Cleaned up empty PMG quarantine history")
|
|
continue
|
|
}
|
|
|
|
lastSnapshot := snapshots[len(snapshots)-1]
|
|
if now.Sub(lastSnapshot.Timestamp) > staleHistoryAge {
|
|
delete(m.pmgQuarantineHistory, pmgID)
|
|
log.Debug().
|
|
Str("pmgID", pmgID).
|
|
Time("lastSnapshot", lastSnapshot.Timestamp).
|
|
Msg("Cleaned up stale PMG quarantine history")
|
|
}
|
|
}
|
|
|
|
m.mu.Unlock()
|
|
|
|
for _, alert := range autoAcked {
|
|
m.safeCallAcknowledgedCallback(alert, "system-auto")
|
|
}
|
|
}
|
|
|
|
// convertLegacyThreshold converts a legacy float64 threshold to HysteresisThreshold
|
|
func (m *Manager) convertLegacyThreshold(legacy *float64) *HysteresisThreshold {
|
|
if legacy == nil || *legacy <= 0 {
|
|
return nil
|
|
}
|
|
margin := m.config.HysteresisMargin
|
|
if margin <= 0 {
|
|
margin = 5.0 // Default 5% margin
|
|
}
|
|
return &HysteresisThreshold{
|
|
Trigger: *legacy,
|
|
Clear: *legacy - margin,
|
|
}
|
|
}
|
|
|
|
func cloneThreshold(threshold *HysteresisThreshold) *HysteresisThreshold {
|
|
if threshold == nil {
|
|
return nil
|
|
}
|
|
clone := *threshold
|
|
return &clone
|
|
}
|
|
|
|
// cloneStringPtr returns a pointer to a fresh copy of the string value points
// to, or nil when value is nil. The result never aliases the input.
func cloneStringPtr(value *string) *string {
	if value != nil {
		dup := *value
		return &dup
	}
	return nil
}
|
|
|
|
func cloneThresholdConfig(cfg ThresholdConfig) ThresholdConfig {
|
|
clone := cfg
|
|
clone.CPU = cloneThreshold(cfg.CPU)
|
|
clone.Memory = cloneThreshold(cfg.Memory)
|
|
clone.Disk = cloneThreshold(cfg.Disk)
|
|
clone.DiskRead = cloneThreshold(cfg.DiskRead)
|
|
clone.DiskWrite = cloneThreshold(cfg.DiskWrite)
|
|
clone.NetworkIn = cloneThreshold(cfg.NetworkIn)
|
|
clone.NetworkOut = cloneThreshold(cfg.NetworkOut)
|
|
clone.Temperature = cloneThreshold(cfg.Temperature)
|
|
clone.DiskTemperature = cloneThreshold(cfg.DiskTemperature)
|
|
clone.Usage = cloneThreshold(cfg.Usage)
|
|
clone.Note = cloneStringPtr(cfg.Note)
|
|
return clone
|
|
}
|
|
|
|
func (m *Manager) applyThresholdOverride(base ThresholdConfig, override ThresholdConfig) ThresholdConfig {
|
|
result := base
|
|
|
|
if override.Disabled {
|
|
result.Disabled = true
|
|
}
|
|
if override.DisableConnectivity {
|
|
result.DisableConnectivity = true
|
|
}
|
|
|
|
if override.CPU != nil {
|
|
result.CPU = ensureHysteresisThreshold(cloneThreshold(override.CPU))
|
|
} else if override.CPULegacy != nil {
|
|
result.CPU = m.convertLegacyThreshold(override.CPULegacy)
|
|
}
|
|
|
|
if override.Memory != nil {
|
|
result.Memory = ensureHysteresisThreshold(cloneThreshold(override.Memory))
|
|
} else if override.MemoryLegacy != nil {
|
|
result.Memory = m.convertLegacyThreshold(override.MemoryLegacy)
|
|
}
|
|
|
|
if override.Disk != nil {
|
|
result.Disk = ensureHysteresisThreshold(cloneThreshold(override.Disk))
|
|
} else if override.DiskLegacy != nil {
|
|
result.Disk = m.convertLegacyThreshold(override.DiskLegacy)
|
|
}
|
|
|
|
if override.DiskRead != nil {
|
|
result.DiskRead = ensureHysteresisThreshold(cloneThreshold(override.DiskRead))
|
|
} else if override.DiskReadLegacy != nil {
|
|
result.DiskRead = m.convertLegacyThreshold(override.DiskReadLegacy)
|
|
}
|
|
|
|
if override.DiskWrite != nil {
|
|
result.DiskWrite = ensureHysteresisThreshold(cloneThreshold(override.DiskWrite))
|
|
} else if override.DiskWriteLegacy != nil {
|
|
result.DiskWrite = m.convertLegacyThreshold(override.DiskWriteLegacy)
|
|
}
|
|
|
|
if override.NetworkIn != nil {
|
|
result.NetworkIn = ensureHysteresisThreshold(cloneThreshold(override.NetworkIn))
|
|
} else if override.NetworkInLegacy != nil {
|
|
result.NetworkIn = m.convertLegacyThreshold(override.NetworkInLegacy)
|
|
}
|
|
|
|
if override.NetworkOut != nil {
|
|
result.NetworkOut = ensureHysteresisThreshold(cloneThreshold(override.NetworkOut))
|
|
} else if override.NetworkOutLegacy != nil {
|
|
result.NetworkOut = m.convertLegacyThreshold(override.NetworkOutLegacy)
|
|
}
|
|
|
|
if override.Temperature != nil {
|
|
result.Temperature = ensureHysteresisThreshold(cloneThreshold(override.Temperature))
|
|
}
|
|
|
|
if override.DiskTemperature != nil {
|
|
result.DiskTemperature = ensureHysteresisThreshold(cloneThreshold(override.DiskTemperature))
|
|
}
|
|
|
|
if override.Usage != nil {
|
|
result.Usage = ensureHysteresisThreshold(cloneThreshold(override.Usage))
|
|
}
|
|
|
|
if override.Note != nil {
|
|
note := strings.TrimSpace(*override.Note)
|
|
if note == "" {
|
|
result.Note = nil
|
|
} else {
|
|
noteCopy := note
|
|
result.Note = ¬eCopy
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// ensureHysteresisThreshold ensures a threshold has hysteresis configured
|
|
func ensureHysteresisThreshold(threshold *HysteresisThreshold) *HysteresisThreshold {
|
|
if threshold == nil {
|
|
return nil
|
|
}
|
|
if threshold.Clear <= 0 {
|
|
threshold.Clear = threshold.Trigger - 5.0 // Default 5% margin
|
|
}
|
|
return threshold
|
|
}
|
|
|
|
// pulseTagSettings records which of the recognised "pulse-*" tags were found
// in a tag list.
type pulseTagSettings struct {
	Suppress    bool // set by the "pulse-no-alerts" tag
	MonitorOnly bool // set by the "pulse-monitor-only" tag
	Relaxed     bool // set by the "pulse-relaxed" tag
}

// parsePulseTags scans tags for the recognised "pulse-*" tags and reports
// which were present. Matching ignores case and surrounding whitespace;
// unrecognised tags are skipped.
func parsePulseTags(tags []string) pulseTagSettings {
	var parsed pulseTagSettings
	for i := range tags {
		lowered := strings.ToLower(tags[i])
		switch strings.TrimSpace(lowered) {
		case "pulse-relaxed":
			parsed.Relaxed = true
		case "pulse-monitor-only":
			parsed.MonitorOnly = true
		case "pulse-no-alerts":
			parsed.Suppress = true
		}
	}
	return parsed
}
|
|
|
|
func applyRelaxedGuestThresholds(cfg ThresholdConfig) ThresholdConfig {
|
|
relaxed := cloneThresholdConfig(cfg)
|
|
|
|
adjust := func(th **HysteresisThreshold, minTrigger float64) {
|
|
if *th == nil {
|
|
*th = &HysteresisThreshold{Trigger: minTrigger, Clear: minTrigger - 5}
|
|
return
|
|
}
|
|
ensureHysteresisThreshold(*th)
|
|
if (*th).Trigger < minTrigger {
|
|
(*th).Trigger = minTrigger
|
|
}
|
|
if (*th).Clear >= (*th).Trigger {
|
|
(*th).Clear = (*th).Trigger - 5
|
|
}
|
|
if (*th).Clear < 0 {
|
|
(*th).Clear = 0
|
|
}
|
|
}
|
|
|
|
adjust(&relaxed.CPU, 95)
|
|
adjust(&relaxed.Memory, 92)
|
|
adjust(&relaxed.Disk, 95)
|
|
|
|
return relaxed
|
|
}
|
|
|
|
func (m *Manager) suppressGuestAlerts(guestID string) bool {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
cleared := false
|
|
|
|
for alertID, alert := range m.activeAlerts {
|
|
if alert == nil {
|
|
continue
|
|
}
|
|
if alert.ResourceID == guestID || strings.HasPrefix(alert.ResourceID, guestID+"/") || strings.HasPrefix(alertID, guestID) {
|
|
m.clearAlertNoLock(alertID)
|
|
delete(m.recentAlerts, alertID)
|
|
delete(m.pendingAlerts, alertID)
|
|
delete(m.suppressedUntil, alertID)
|
|
delete(m.alertRateLimit, alertID)
|
|
cleared = true
|
|
}
|
|
}
|
|
|
|
for key := range m.pendingAlerts {
|
|
if strings.HasPrefix(key, guestID) {
|
|
delete(m.pendingAlerts, key)
|
|
}
|
|
}
|
|
for key := range m.recentAlerts {
|
|
if strings.HasPrefix(key, guestID) {
|
|
delete(m.recentAlerts, key)
|
|
}
|
|
}
|
|
for key := range m.suppressedUntil {
|
|
if strings.HasPrefix(key, guestID) {
|
|
delete(m.suppressedUntil, key)
|
|
}
|
|
}
|
|
for key := range m.alertRateLimit {
|
|
if strings.HasPrefix(key, guestID) {
|
|
delete(m.alertRateLimit, key)
|
|
}
|
|
}
|
|
|
|
delete(m.offlineConfirmations, guestID)
|
|
|
|
return cleared
|
|
}
|
|
|
|
func (m *Manager) guestHasMonitorOnlyAlerts(guestID string) bool {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
for _, alert := range m.activeAlerts {
|
|
if alert == nil {
|
|
continue
|
|
}
|
|
if alert.ResourceID != guestID {
|
|
continue
|
|
}
|
|
if isMonitorOnlyAlert(alert) {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// checkRateLimit checks if an alert has exceeded rate limit
|
|
func (m *Manager) checkRateLimit(alertID string) bool {
|
|
if m.config.Schedule.MaxAlertsHour <= 0 {
|
|
return true // No rate limit
|
|
}
|
|
|
|
now := time.Now()
|
|
cutoff := now.Add(-1 * time.Hour)
|
|
|
|
// Clean old entries and count recent alerts
|
|
var recentAlerts []time.Time
|
|
if times, exists := m.alertRateLimit[alertID]; exists {
|
|
for _, t := range times {
|
|
if t.After(cutoff) {
|
|
recentAlerts = append(recentAlerts, t)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if we've hit the limit
|
|
if len(recentAlerts) >= m.config.Schedule.MaxAlertsHour {
|
|
return false
|
|
}
|
|
|
|
// Add current time
|
|
recentAlerts = append(recentAlerts, now)
|
|
m.alertRateLimit[alertID] = recentAlerts
|
|
|
|
return true
|
|
}
|
|
|
|
// escalationChecker runs periodically to check for alerts that need escalation and cleanup
|
|
func (m *Manager) escalationChecker() {
|
|
ticker := time.NewTicker(1 * time.Minute)
|
|
cleanupTicker := time.NewTicker(10 * time.Minute) // Run cleanup every 10 minutes
|
|
defer ticker.Stop()
|
|
defer cleanupTicker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
m.checkEscalations()
|
|
case <-cleanupTicker.C:
|
|
m.Cleanup(24 * time.Hour) // Clean up acknowledged alerts older than 24 hours
|
|
case <-m.escalationStop:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// checkEscalations checks all active alerts for escalation
|
|
func (m *Manager) checkEscalations() {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Respect global alert and activation controls before escalating.
|
|
// Escalations should never bypass a user disabling alerts.
|
|
if !m.config.Enabled || m.config.ActivationState != ActivationActive {
|
|
return
|
|
}
|
|
|
|
if !m.config.Schedule.Escalation.Enabled {
|
|
return
|
|
}
|
|
|
|
now := time.Now()
|
|
for _, alert := range m.activeAlerts {
|
|
// Skip acknowledged alerts
|
|
if alert.Acknowledged {
|
|
continue
|
|
}
|
|
|
|
// Check each escalation level
|
|
for i, level := range m.config.Schedule.Escalation.Levels {
|
|
// Skip if we've already escalated to this level
|
|
if alert.LastEscalation >= i+1 {
|
|
continue
|
|
}
|
|
|
|
// Check if it's time to escalate
|
|
escalateTime := alert.StartTime.Add(time.Duration(level.After) * time.Minute)
|
|
if now.After(escalateTime) {
|
|
// Update alert escalation state
|
|
alert.LastEscalation = i + 1
|
|
alert.EscalationTimes = append(alert.EscalationTimes, now)
|
|
|
|
log.Info().
|
|
Str("alertID", alert.ID).
|
|
Int("level", i+1).
|
|
Str("notify", level.Notify).
|
|
Msg("Alert escalated")
|
|
|
|
// Trigger escalation callback
|
|
m.safeCallEscalateCallback(alert, i+1)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Stop stops the alert manager and saves history
|
|
func (m *Manager) Stop() {
|
|
close(m.escalationStop)
|
|
close(m.cleanupStop)
|
|
m.historyManager.Stop()
|
|
|
|
// Give background goroutines time to exit cleanly
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
// Save active alerts before stopping
|
|
if err := m.SaveActiveAlerts(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to save active alerts on stop")
|
|
}
|
|
}
|
|
|
|
// SaveActiveAlerts persists active alerts to disk
|
|
func (m *Manager) SaveActiveAlerts() error {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
// Create directory if it doesn't exist
|
|
alertsDir := filepath.Join(utils.GetDataDir(), "alerts")
|
|
if err := os.MkdirAll(alertsDir, 0755); err != nil {
|
|
return fmt.Errorf("failed to create alerts directory: %w", err)
|
|
}
|
|
|
|
// Convert map to slice for JSON encoding
|
|
alerts := make([]*Alert, 0, len(m.activeAlerts))
|
|
for _, alert := range m.activeAlerts {
|
|
alerts = append(alerts, alert)
|
|
}
|
|
|
|
data, err := json.Marshal(alerts)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to marshal active alerts: %w", err)
|
|
}
|
|
|
|
// Write to temporary file first, then rename (atomic operation)
|
|
// Use a unique temp file to avoid race conditions between concurrent saves (e.g., periodic vs shutdown)
|
|
tmpFile, err := os.CreateTemp(alertsDir, "active-alerts-*.json.tmp")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create temp file: %w", err)
|
|
}
|
|
tmpName := tmpFile.Name()
|
|
|
|
// Ensure cleanup of temp file in case of failure
|
|
defer os.Remove(tmpName)
|
|
|
|
if _, err := tmpFile.Write(data); err != nil {
|
|
tmpFile.Close()
|
|
return fmt.Errorf("failed to write active alerts: %w", err)
|
|
}
|
|
if err := tmpFile.Close(); err != nil {
|
|
return fmt.Errorf("failed to close temp file: %w", err)
|
|
}
|
|
|
|
finalFile := filepath.Join(alertsDir, "active-alerts.json")
|
|
if err := os.Rename(tmpName, finalFile); err != nil {
|
|
return fmt.Errorf("failed to rename active alerts file: %w", err)
|
|
}
|
|
|
|
log.Debug().Int("count", len(alerts)).Msg("Saved active alerts to disk")
|
|
return nil
|
|
}
|
|
|
|
func (m *Manager) saveActiveAlertsAsync(context string) {
|
|
go func() {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().
|
|
Interface("panic", r).
|
|
Str("context", context).
|
|
Msg("Panic in SaveActiveAlerts goroutine")
|
|
}
|
|
}()
|
|
if err := m.SaveActiveAlerts(); err != nil {
|
|
log.Error().
|
|
Err(err).
|
|
Str("context", context).
|
|
Msg("Failed to save active alerts")
|
|
}
|
|
}()
|
|
}
|
|
|
|
// LoadActiveAlerts restores active alerts from disk
|
|
func (m *Manager) LoadActiveAlerts() error {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
alertsFile := filepath.Join(utils.GetDataDir(), "alerts", "active-alerts.json")
|
|
data, err := os.ReadFile(alertsFile)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
log.Info().Msg("No active alerts file found, starting fresh")
|
|
return nil
|
|
}
|
|
return fmt.Errorf("failed to read active alerts: %w", err)
|
|
}
|
|
|
|
var alerts []*Alert
|
|
if err := json.Unmarshal(data, &alerts); err != nil {
|
|
return fmt.Errorf("failed to unmarshal active alerts: %w", err)
|
|
}
|
|
|
|
// Restore alerts to the map with deduplication
|
|
now := time.Now()
|
|
restoredCount := 0
|
|
duplicateCount := 0
|
|
seen := make(map[string]bool)
|
|
|
|
for _, alert := range alerts {
|
|
// Migrate legacy guest alert IDs to the canonical guest format.
|
|
// Check if this is a guest-related alert by looking at common alert types
|
|
isGuestAlert := strings.Contains(alert.Type, "cpu") || strings.Contains(alert.Type, "memory") ||
|
|
strings.Contains(alert.Type, "disk") || strings.Contains(alert.Type, "network") ||
|
|
alert.Type == "guest-offline"
|
|
if isGuestAlert {
|
|
parts := strings.Split(alert.ResourceID, "-")
|
|
|
|
if alert.Node != "" && len(parts) >= 2 {
|
|
var newResourceID string
|
|
oldResourceID := alert.ResourceID
|
|
|
|
// Try to extract VMID (should be last part)
|
|
vmidStr := parts[len(parts)-1]
|
|
if _, err := strconv.Atoi(vmidStr); err == nil {
|
|
vmid, _ := strconv.Atoi(vmidStr)
|
|
newResourceID = BuildGuestKey(alert.Instance, alert.Node, vmid)
|
|
|
|
if newResourceID != "" && newResourceID != oldResourceID {
|
|
log.Info().
|
|
Str("oldID", oldResourceID).
|
|
Str("newID", newResourceID).
|
|
Str("alertType", alert.Type).
|
|
Msg("Migrating active alert from legacy guest ID format")
|
|
|
|
// Update resource ID
|
|
alert.ResourceID = newResourceID
|
|
|
|
// Update alert ID (usually contains resource ID)
|
|
alert.ID = strings.Replace(alert.ID, oldResourceID, newResourceID, 1)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Skip duplicates
|
|
if seen[alert.ID] {
|
|
duplicateCount++
|
|
log.Warn().Str("alertID", alert.ID).Msg("Skipping duplicate alert during restore")
|
|
continue
|
|
}
|
|
seen[alert.ID] = true
|
|
|
|
// Skip very old alerts (older than 24 hours)
|
|
if now.Sub(alert.StartTime) > 24*time.Hour {
|
|
log.Debug().Str("alertID", alert.ID).Msg("Skipping old alert during restore")
|
|
continue
|
|
}
|
|
|
|
// Skip acknowledged alerts older than 1 hour from activeAlerts,
|
|
// but still preserve the ackState so if the same alert reappears
|
|
// (e.g., backup-age alerts) it won't retrigger notifications.
|
|
if alert.Acknowledged && alert.AckTime != nil && now.Sub(*alert.AckTime) > time.Hour {
|
|
log.Debug().Str("alertID", alert.ID).Msg("Skipping old acknowledged alert from activeAlerts but preserving ackState")
|
|
ackTime := alert.StartTime
|
|
if alert.AckTime != nil {
|
|
ackTime = *alert.AckTime
|
|
}
|
|
m.ackState[alert.ID] = ackRecord{
|
|
acknowledged: true,
|
|
user: alert.AckUser,
|
|
time: ackTime,
|
|
}
|
|
continue
|
|
}
|
|
|
|
m.activeAlerts[alert.ID] = alert
|
|
if alert.Acknowledged {
|
|
ackTime := alert.StartTime
|
|
if alert.AckTime != nil {
|
|
ackTime = *alert.AckTime
|
|
}
|
|
m.ackState[alert.ID] = ackRecord{
|
|
acknowledged: true,
|
|
user: alert.AckUser,
|
|
time: ackTime,
|
|
}
|
|
}
|
|
restoredCount++
|
|
|
|
// For critical alerts that are still active after restart, send notifications
|
|
// This ensures users are notified about ongoing critical issues even after service restarts
|
|
// Only notify for alerts that started recently (within last 2 hours) to avoid spam
|
|
if alert.Level == AlertLevelCritical && now.Sub(alert.StartTime) < 2*time.Hour {
|
|
// Use a goroutine and add a small delay to avoid notification spam on startup
|
|
alertCopy := alert.Clone()
|
|
go func(a *Alert) {
|
|
// Wait for system to stabilize or cancellation
|
|
select {
|
|
case <-time.After(10 * time.Second):
|
|
log.Info().
|
|
Str("alertID", a.ID).
|
|
Str("resource", a.ResourceName).
|
|
Msg("Attempting to send notification for restored critical alert")
|
|
|
|
// Acquire lock before calling dispatchAlert (it accesses maps)
|
|
m.mu.Lock()
|
|
m.dispatchAlert(a, false) // Use dispatchAlert to respect activation state and quiet hours
|
|
m.mu.Unlock()
|
|
case <-m.escalationStop:
|
|
log.Debug().
|
|
Str("alertID", a.ID).
|
|
Msg("Cancelled startup notification due to shutdown")
|
|
return
|
|
}
|
|
}(alertCopy)
|
|
}
|
|
}
|
|
|
|
log.Info().
|
|
Int("restored", restoredCount).
|
|
Int("total", len(alerts)).
|
|
Int("duplicates", duplicateCount).
|
|
Msg("Restored active alerts from disk")
|
|
return nil
|
|
}
|
|
|
|
// CleanupAlertsForNodes removes alerts for nodes that no longer exist
|
|
func (m *Manager) CleanupAlertsForNodes(existingNodes map[string]bool) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
log.Debug().
|
|
Int("totalAlerts", len(m.activeAlerts)).
|
|
Int("existingNodes", len(existingNodes)).
|
|
Interface("nodes", existingNodes).
|
|
Msg("Starting alert cleanup for non-existent nodes")
|
|
|
|
removedCount := 0
|
|
for alertID, alert := range m.activeAlerts {
|
|
if alert == nil {
|
|
continue
|
|
}
|
|
|
|
// Skip alerts that are not tied to Proxmox nodes. Docker and PBS resources use
|
|
// synthetic node identifiers that won't appear in the Proxmox node list, so we
|
|
// must preserve their alerts here.
|
|
if strings.HasPrefix(alertID, "docker-") || strings.HasPrefix(alert.ResourceID, "docker:") {
|
|
continue
|
|
}
|
|
if strings.HasPrefix(alertID, "pbs-") || alert.Type == "pbs-offline" {
|
|
continue
|
|
}
|
|
// Use the Node field from the alert itself, which is more reliable
|
|
node := alert.Node
|
|
|
|
// If we couldn't get a node or the node doesn't exist, remove the alert
|
|
if node == "" || !existingNodes[node] {
|
|
m.removeActiveAlertNoLock(alertID)
|
|
removedCount++
|
|
log.Debug().Str("alertID", alertID).Str("node", node).Msg("Removed alert for non-existent node")
|
|
}
|
|
}
|
|
|
|
if removedCount > 0 {
|
|
log.Debug().Int("removed", removedCount).Int("remaining", len(m.activeAlerts)).Msg("Cleaned up alerts for non-existent nodes")
|
|
// Save the cleaned up state
|
|
go func() {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (cleanup)")
|
|
}
|
|
}()
|
|
if err := m.SaveActiveAlerts(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to save alerts after cleanup")
|
|
}
|
|
}()
|
|
} else {
|
|
log.Info().Msg("No alerts needed cleanup")
|
|
}
|
|
}
|
|
|
|
// ClearActiveAlerts removes all active and pending alerts, resetting the manager state.
|
|
func (m *Manager) ClearActiveAlerts() {
|
|
m.mu.Lock()
|
|
if len(m.activeAlerts) == 0 && len(m.pendingAlerts) == 0 {
|
|
m.mu.Unlock()
|
|
return
|
|
}
|
|
m.activeAlerts = make(map[string]*Alert)
|
|
m.pendingAlerts = make(map[string]time.Time)
|
|
m.recentAlerts = make(map[string]*Alert)
|
|
m.suppressedUntil = make(map[string]time.Time)
|
|
m.alertRateLimit = make(map[string][]time.Time)
|
|
m.nodeOfflineCount = make(map[string]int)
|
|
m.offlineConfirmations = make(map[string]int)
|
|
m.dockerOfflineCount = make(map[string]int)
|
|
m.dockerStateConfirm = make(map[string]int)
|
|
m.dockerRestartTracking = make(map[string]*dockerRestartRecord)
|
|
m.dockerLastExitCode = make(map[string]int)
|
|
m.dockerUpdateFirstSeen = make(map[string]time.Time)
|
|
m.dockerUpdateFirstSeenByIdentity = make(map[string]time.Time)
|
|
m.ackState = make(map[string]ackRecord)
|
|
m.mu.Unlock()
|
|
|
|
m.resolvedMutex.Lock()
|
|
m.recentlyResolved = make(map[string]*ResolvedAlert)
|
|
m.resolvedMutex.Unlock()
|
|
|
|
log.Info().Msg("Cleared all active and pending alerts")
|
|
|
|
go func() {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (clear)")
|
|
}
|
|
}()
|
|
if err := m.SaveActiveAlerts(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to persist cleared alerts")
|
|
}
|
|
}()
|
|
}
|
|
|
|
// periodicSaveAlerts saves active alerts to disk periodically
|
|
func (m *Manager) periodicSaveAlerts() {
|
|
ticker := time.NewTicker(1 * time.Minute)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
if err := m.SaveActiveAlerts(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to save active alerts during periodic save")
|
|
}
|
|
case <-m.escalationStop:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// trackingMapCleanup periodically cleans up stale entries from tracking maps
|
|
// to prevent unbounded memory growth from deleted/decommissioned resources.
|
|
func (m *Manager) trackingMapCleanup() {
|
|
// Run cleanup every hour
|
|
ticker := time.NewTicker(1 * time.Hour)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
m.cleanupStaleMaps()
|
|
case <-m.cleanupStop:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// cleanupStaleMaps removes stale entries from tracking maps.
|
|
// Entries are considered stale if they haven't been updated in 24 hours
|
|
// and don't correspond to any active alert.
|
|
func (m *Manager) cleanupStaleMaps() {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
now := time.Now()
|
|
staleThreshold := StaleTrackingThreshold
|
|
cleaned := 0
|
|
|
|
// Clean up flapping history for resources without active alerts
|
|
for alertID, history := range m.flappingHistory {
|
|
if _, hasAlert := m.activeAlerts[alertID]; !hasAlert {
|
|
// Check if history is stale (last entry older than threshold)
|
|
if len(history) == 0 || now.Sub(history[len(history)-1]) > staleThreshold {
|
|
delete(m.flappingHistory, alertID)
|
|
delete(m.flappingActive, alertID)
|
|
cleaned++
|
|
}
|
|
}
|
|
}
|
|
|
|
// Clean up suppressedUntil entries that have expired
|
|
for alertID, suppressUntil := range m.suppressedUntil {
|
|
if now.After(suppressUntil) {
|
|
delete(m.suppressedUntil, alertID)
|
|
cleaned++
|
|
}
|
|
}
|
|
|
|
// Clean up pending alerts older than threshold without active alerts
|
|
for alertID, pendingTime := range m.pendingAlerts {
|
|
if _, hasAlert := m.activeAlerts[alertID]; !hasAlert {
|
|
if now.Sub(pendingTime) > staleThreshold {
|
|
delete(m.pendingAlerts, alertID)
|
|
cleaned++
|
|
}
|
|
}
|
|
}
|
|
|
|
// Clean up offline confirmation counts for resources without active alerts
|
|
for resourceID := range m.offlineConfirmations {
|
|
hasRelatedAlert := false
|
|
for alertID := range m.activeAlerts {
|
|
if strings.Contains(alertID, resourceID) {
|
|
hasRelatedAlert = true
|
|
break
|
|
}
|
|
}
|
|
if !hasRelatedAlert {
|
|
delete(m.offlineConfirmations, resourceID)
|
|
cleaned++
|
|
}
|
|
}
|
|
|
|
// Clean up node offline counts (legacy)
|
|
for nodeID := range m.nodeOfflineCount {
|
|
hasRelatedAlert := false
|
|
for alertID := range m.activeAlerts {
|
|
if strings.Contains(alertID, nodeID) {
|
|
hasRelatedAlert = true
|
|
break
|
|
}
|
|
}
|
|
if !hasRelatedAlert {
|
|
delete(m.nodeOfflineCount, nodeID)
|
|
cleaned++
|
|
}
|
|
}
|
|
|
|
// Clean up Docker tracking maps
|
|
for containerID := range m.dockerStateConfirm {
|
|
hasRelatedAlert := false
|
|
for alertID := range m.activeAlerts {
|
|
if strings.Contains(alertID, containerID) {
|
|
hasRelatedAlert = true
|
|
break
|
|
}
|
|
}
|
|
if !hasRelatedAlert {
|
|
delete(m.dockerStateConfirm, containerID)
|
|
cleaned++
|
|
}
|
|
}
|
|
|
|
for hostID := range m.dockerOfflineCount {
|
|
hasRelatedAlert := false
|
|
for alertID := range m.activeAlerts {
|
|
if strings.Contains(alertID, hostID) {
|
|
hasRelatedAlert = true
|
|
break
|
|
}
|
|
}
|
|
if !hasRelatedAlert {
|
|
delete(m.dockerOfflineCount, hostID)
|
|
cleaned++
|
|
}
|
|
}
|
|
|
|
// Clean up Docker restart tracking for stale containers
|
|
for containerID, record := range m.dockerRestartTracking {
|
|
if record != nil && now.Sub(record.lastChecked) > staleThreshold {
|
|
delete(m.dockerRestartTracking, containerID)
|
|
delete(m.dockerLastExitCode, containerID)
|
|
cleaned++
|
|
}
|
|
}
|
|
|
|
// Clean up Docker update tracking for stale entries
|
|
for containerID, firstSeen := range m.dockerUpdateFirstSeen {
|
|
if now.Sub(firstSeen) > staleThreshold {
|
|
delete(m.dockerUpdateFirstSeen, containerID)
|
|
cleaned++
|
|
}
|
|
}
|
|
for containerID, firstSeen := range m.dockerUpdateFirstSeenByIdentity {
|
|
if now.Sub(firstSeen) > staleThreshold {
|
|
delete(m.dockerUpdateFirstSeenByIdentity, containerID)
|
|
cleaned++
|
|
}
|
|
}
|
|
|
|
// Clean up rate limit entries older than 1 hour
|
|
rateLimitThreshold := RateLimitCleanupWindow
|
|
for resourceID, times := range m.alertRateLimit {
|
|
// Filter to keep only recent entries
|
|
var recent []time.Time
|
|
for _, t := range times {
|
|
if now.Sub(t) < rateLimitThreshold {
|
|
recent = append(recent, t)
|
|
}
|
|
}
|
|
if len(recent) == 0 {
|
|
delete(m.alertRateLimit, resourceID)
|
|
cleaned++
|
|
} else if len(recent) < len(times) {
|
|
m.alertRateLimit[resourceID] = recent
|
|
}
|
|
}
|
|
|
|
// Clean up recent alerts older than suppression window
|
|
suppressWindow := time.Duration(m.config.SuppressionWindow) * time.Minute
|
|
if suppressWindow <= 0 {
|
|
suppressWindow = 5 * time.Minute
|
|
}
|
|
for alertID, alert := range m.recentAlerts {
|
|
if now.Sub(alert.LastSeen) > suppressWindow {
|
|
delete(m.recentAlerts, alertID)
|
|
cleaned++
|
|
}
|
|
}
|
|
|
|
// Clean up ackState for alerts that no longer exist and are older than threshold
|
|
for alertID, record := range m.ackState {
|
|
if _, hasAlert := m.activeAlerts[alertID]; !hasAlert {
|
|
// Use inactiveAt (when alert was removed) for TTL, not ack time
|
|
checkTime := record.inactiveAt
|
|
if checkTime.IsZero() {
|
|
checkTime = record.time
|
|
}
|
|
if now.Sub(checkTime) > staleThreshold {
|
|
delete(m.ackState, alertID)
|
|
cleaned++
|
|
}
|
|
}
|
|
}
|
|
|
|
// Auto-resolve stale alerts - alerts where the resource hasn't been polled in 24 hours.
|
|
// This handles cases where a resource (e.g., Docker container, storage) stops being
|
|
// monitored but its alert remains active. Without this, alerts would persist indefinitely.
|
|
staleAlerts := make([]string, 0)
|
|
for alertID, alert := range m.activeAlerts {
|
|
if alert != nil && now.Sub(alert.LastSeen) > staleThreshold {
|
|
staleAlerts = append(staleAlerts, alertID)
|
|
}
|
|
}
|
|
staleResolved := 0
|
|
for _, alertID := range staleAlerts {
|
|
alert := m.activeAlerts[alertID]
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Str("resourceName", alert.ResourceName).
|
|
Time("lastSeen", alert.LastSeen).
|
|
Dur("staleFor", now.Sub(alert.LastSeen)).
|
|
Msg("Auto-resolving stale alert - resource no longer being monitored")
|
|
m.clearAlertNoLock(alertID)
|
|
cleaned++
|
|
staleResolved++
|
|
}
|
|
|
|
// Persist changes if we resolved any stale alerts
|
|
if staleResolved > 0 {
|
|
go func() {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Error().Interface("panic", r).Msg("Panic in SaveActiveAlerts goroutine (stale cleanup)")
|
|
}
|
|
}()
|
|
if err := m.SaveActiveAlerts(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to save active alerts after stale cleanup")
|
|
}
|
|
}()
|
|
log.Info().
|
|
Int("count", staleResolved).
|
|
Msg("Auto-resolved stale alerts")
|
|
}
|
|
|
|
if cleaned > 0 {
|
|
log.Debug().
|
|
Int("entriesCleaned", cleaned).
|
|
Msg("Cleaned stale entries from alert tracking maps")
|
|
}
|
|
}
|
|
|
|
// hasKnownFirmwareBug reports whether model names a drive whose firmware is
// known to produce false health status reports. Such drives may report FAILED
// or other error states (e.g. because of incorrect temperature thresholds)
// while actually healthy, so callers use this to suppress false health alerts
// while still monitoring wearout.
//
// The match is case-insensitive, ignores surrounding whitespace, and succeeds
// when the model contains "SAMSUNG SSD <series>" or "SAMSUNG <series>" for the
// 980 and 990 series.
//
// Related to GitHub issue #547: Samsung 980/990 SSDs report false health failures
func hasKnownFirmwareBug(model string) bool {
	name := strings.ToUpper(strings.TrimSpace(model))

	// Samsung 980/990 series: users should update to the latest firmware.
	for _, series := range [...]string{"980", "990"} {
		if strings.Contains(name, "SAMSUNG SSD "+series) {
			return true
		}
		if strings.Contains(name, "SAMSUNG "+series) {
			return true
		}
	}
	return false
}
|
|
|
|
// CheckDiskHealth checks disk health and creates alerts if needed
|
|
func (m *Manager) CheckDiskHealth(instance, node string, disk proxmox.Disk) {
|
|
// Create unique alert ID for this disk
|
|
alertID := fmt.Sprintf("disk-health-%s-%s-%s", instance, node, disk.DevPath)
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
// Check if disk health is not PASSED
|
|
normalizedHealth := strings.ToUpper(strings.TrimSpace(disk.Health))
|
|
healthCheckNeeded := normalizedHealth != "" && normalizedHealth != "UNKNOWN" && normalizedHealth != "PASSED" && normalizedHealth != "OK"
|
|
|
|
// Skip health alerts for drives with known firmware bugs that cause false reports
|
|
// These drives may report FAILED status due to firmware issues even when healthy
|
|
// We still monitor wearout below, which is more reliable for these drives
|
|
if healthCheckNeeded && hasKnownFirmwareBug(disk.Model) {
|
|
log.Debug().
|
|
Str("node", node).
|
|
Str("disk", disk.DevPath).
|
|
Str("model", disk.Model).
|
|
Str("health", disk.Health).
|
|
Msg("Skipping health alert for drive with known firmware bug - health status unreliable")
|
|
|
|
// Clear any existing health alert since we now recognize this is a false positive
|
|
m.clearAlertNoLock(alertID)
|
|
healthCheckNeeded = false // Skip to wearout check
|
|
}
|
|
|
|
if healthCheckNeeded {
|
|
// Check if alert already exists
|
|
if _, exists := m.activeAlerts[alertID]; !exists {
|
|
// Create new health alert
|
|
alert := &Alert{
|
|
ID: alertID,
|
|
Type: "disk-health",
|
|
Level: AlertLevelCritical,
|
|
ResourceID: fmt.Sprintf("%s-%s", node, disk.DevPath),
|
|
ResourceName: fmt.Sprintf("%s (%s)", disk.Model, disk.DevPath),
|
|
Node: node,
|
|
Instance: instance,
|
|
Message: fmt.Sprintf("Disk health check failed: %s", disk.Health),
|
|
Value: 0, // Not applicable for health status
|
|
Threshold: 0,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: map[string]interface{}{
|
|
"disk_path": disk.DevPath,
|
|
"disk_model": disk.Model,
|
|
"disk_serial": disk.Serial,
|
|
"disk_type": disk.Type,
|
|
"disk_health": disk.Health,
|
|
"disk_size": disk.Size,
|
|
},
|
|
}
|
|
|
|
m.preserveAlertState(alertID, alert)
|
|
|
|
m.activeAlerts[alertID] = alert
|
|
m.recentAlerts[alertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
|
|
m.dispatchAlert(alert, false)
|
|
|
|
log.Error().
|
|
Str("node", node).
|
|
Str("disk", disk.DevPath).
|
|
Str("model", disk.Model).
|
|
Str("health", disk.Health).
|
|
Msg("Disk health alert created")
|
|
}
|
|
} else {
|
|
// Disk is healthy, clear alert if it exists
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
|
|
// Check for low wearout (SSD life remaining)
|
|
if disk.Wearout > 0 && disk.Wearout < 10 {
|
|
wearoutAlertID := fmt.Sprintf("disk-wearout-%s-%s-%s", instance, node, disk.DevPath)
|
|
message := fmt.Sprintf("SSD has less than 10%% life remaining (%d%% wearout)", disk.Wearout)
|
|
resourceID := fmt.Sprintf("%s-%s", node, disk.DevPath)
|
|
resourceName := fmt.Sprintf("%s (%s)", disk.Model, disk.DevPath)
|
|
|
|
if existing, exists := m.activeAlerts[wearoutAlertID]; exists {
|
|
// Refresh details so legacy alerts pick up updated wording and metadata
|
|
existing.LastSeen = time.Now()
|
|
existing.Value = float64(disk.Wearout)
|
|
existing.Message = message
|
|
existing.ResourceID = resourceID
|
|
existing.ResourceName = resourceName
|
|
existing.Node = node
|
|
existing.NodeDisplayName = m.resolveNodeDisplayName(existing.Instance, node)
|
|
existing.Instance = instance
|
|
if existing.Metadata == nil {
|
|
existing.Metadata = map[string]interface{}{}
|
|
}
|
|
existing.Metadata["disk_path"] = disk.DevPath
|
|
existing.Metadata["disk_model"] = disk.Model
|
|
existing.Metadata["disk_serial"] = disk.Serial
|
|
existing.Metadata["disk_type"] = disk.Type
|
|
existing.Metadata["disk_wearout"] = disk.Wearout
|
|
delete(existing.Metadata, "disk_wearout_used")
|
|
} else {
|
|
// Create wearout alert
|
|
alert := &Alert{
|
|
ID: wearoutAlertID,
|
|
Type: "disk-wearout",
|
|
Level: AlertLevelWarning,
|
|
ResourceID: resourceID,
|
|
ResourceName: resourceName,
|
|
Node: node,
|
|
Instance: instance,
|
|
Message: message,
|
|
Value: float64(disk.Wearout),
|
|
Threshold: 10.0,
|
|
StartTime: time.Now(),
|
|
LastSeen: time.Now(),
|
|
Metadata: map[string]interface{}{
|
|
"disk_path": disk.DevPath,
|
|
"disk_model": disk.Model,
|
|
"disk_serial": disk.Serial,
|
|
"disk_type": disk.Type,
|
|
"disk_wearout": disk.Wearout,
|
|
},
|
|
}
|
|
|
|
m.preserveAlertState(wearoutAlertID, alert)
|
|
|
|
m.activeAlerts[wearoutAlertID] = alert
|
|
m.recentAlerts[wearoutAlertID] = alert
|
|
m.historyManager.AddAlert(*alert)
|
|
|
|
m.dispatchAlert(alert, false)
|
|
|
|
log.Warn().
|
|
Str("node", node).
|
|
Str("disk", disk.DevPath).
|
|
Str("model", disk.Model).
|
|
Int("wearout", disk.Wearout).
|
|
Msg("Disk wearout alert created")
|
|
}
|
|
} else if disk.Wearout >= 10 {
|
|
// Wearout is acceptable, clear alert if it exists
|
|
wearoutAlertID := fmt.Sprintf("disk-wearout-%s-%s-%s", instance, node, disk.DevPath)
|
|
m.clearAlertNoLock(wearoutAlertID)
|
|
}
|
|
}
|
|
|
|
// clearAlertNoLock clears an alert without locking (must be called with lock held)
|
|
func (m *Manager) clearAlertNoLock(alertID string) {
|
|
alert, exists := m.activeAlerts[alertID]
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
// Record metric for resolved alert
|
|
if recordAlertResolved != nil {
|
|
recordAlertResolved(alert)
|
|
}
|
|
|
|
m.removeActiveAlertNoLock(alertID)
|
|
resolvedAlert := &ResolvedAlert{
|
|
Alert: alert,
|
|
ResolvedTime: time.Now(),
|
|
}
|
|
|
|
m.addRecentlyResolvedWithPrimaryLock(alertID, resolvedAlert)
|
|
|
|
m.safeCallResolvedCallback(alertID, true) // Make async to prevent deadlock
|
|
|
|
log.Info().
|
|
Str("alertID", alertID).
|
|
Msg("Alert cleared")
|
|
}
|
|
|
|
func (m *Manager) clearSnapshotAlertsForInstance(instance string) {
|
|
m.mu.Lock()
|
|
m.clearSnapshotAlertsForInstanceLocked(instance)
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
func (m *Manager) clearSnapshotAlertsForInstanceLocked(instance string) {
|
|
for alertID, alert := range m.activeAlerts {
|
|
if alert == nil || alert.Type != "snapshot-age" {
|
|
continue
|
|
}
|
|
if instance != "" && alert.Instance != instance {
|
|
continue
|
|
}
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
}
|
|
|
|
func (m *Manager) clearBackupAlerts() {
|
|
m.mu.Lock()
|
|
m.clearBackupAlertsLocked()
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
func (m *Manager) clearBackupAlertsLocked() {
|
|
for alertID, alert := range m.activeAlerts {
|
|
if alert == nil || (alert.Type != "backup-age" && alert.Type != "backup-orphaned") {
|
|
continue
|
|
}
|
|
m.clearAlertNoLock(alertID)
|
|
}
|
|
}
|