// Pulse/internal/alerts/alerts.go

package alerts

import (
	"encoding/json"
	"errors"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"slices"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	alertspecs "github.com/rcourtman/pulse-go-rewrite/internal/alerts/specs"
	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/internal/recovery"
	"github.com/rcourtman/pulse-go-rewrite/internal/storagehealth"
	"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
	"github.com/rcourtman/pulse-go-rewrite/internal/utils"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
	"github.com/rs/zerolog/log"
)

// AlertLevel represents the severity of an alert. It is a string type, so a
// level is stored and serialized as the plain strings listed below.
type AlertLevel string

// Severity levels declared in this file: "warning" and "critical".
const (
	AlertLevelWarning  AlertLevel = "warning"
	AlertLevelCritical AlertLevel = "critical"
)

// ErrAlertNotFound is a sentinel error carrying the message "alert not found".
// NOTE(review): presumably returned by alert lookups elsewhere in the package
// (the call sites are not in this part of the file); compare with errors.Is.
var ErrAlertNotFound = errors.New("alert not found")

// ActivationState represents the alert notification activation state.
// It is a string type; AlertConfig stores it under the "activationState" key.
type ActivationState string

// Activation states. Note that ActivationPending serializes as
// "pending_review", not "pending".
const (
	ActivationPending ActivationState = "pending_review"
	ActivationActive  ActivationState = "active"
	ActivationSnoozed ActivationState = "snoozed"
)

// Cleanup intervals and filesystem permission bits.
//
// StaleTrackingThreshold (24 hours) and RateLimitCleanupWindow (1 hour) are
// durations. alertsDirPerm (0o700) and alertsFilePerm (0o600) are owner-only
// permission bits.
// NOTE(review): presumably the permissions are applied to the alerts directory
// and the files written inside it — confirm at the call sites, which are not
// in this part of the file.
const (
	StaleTrackingThreshold = 24 * time.Hour
	RateLimitCleanupWindow = 1 * time.Hour
	alertsDirPerm          = 0o700
	alertsFilePerm         = 0o600
)

// normalizePoweredOffSeverity maps a configured powered-off severity onto one
// of the two supported levels. The comparison is case-insensitive: any casing
// of "critical" yields AlertLevelCritical, and every other value (including
// the empty string) falls back to AlertLevelWarning. Surrounding whitespace is
// not trimmed.
func normalizePoweredOffSeverity(level AlertLevel) AlertLevel {
	if strings.ToLower(string(level)) == string(AlertLevelCritical) {
		return AlertLevelCritical
	}
	return AlertLevelWarning
}

// Alert represents an active alert.
//
// Every field is JSON-tagged; fields marked omitempty are left out of the
// encoded object when they hold their zero value. AckTime, LastNotified,
// EscalationTimes and Metadata are pointer, slice, or map fields, so a plain
// struct copy shares them with the original — use Clone when an independent
// copy is required.
type Alert struct {
	ID              string                 `json:"id"`
	Type            string                 `json:"type"` // cpu, memory, disk, etc.
	Level           AlertLevel             `json:"level"`
	ResourceID      string                 `json:"resourceId"` // guest or node ID
	CanonicalSpecID string                 `json:"canonicalSpecId,omitempty"`
	CanonicalKind   string                 `json:"canonicalKind,omitempty"`
	CanonicalState  string                 `json:"canonicalState,omitempty"`
	ResourceName    string                 `json:"resourceName"`
	Node            string                 `json:"node"`
	NodeDisplayName string                 `json:"nodeDisplayName,omitempty"`
	Instance        string                 `json:"instance"`
	Message         string                 `json:"message"`
	Value           float64                `json:"value"`
	Threshold       float64                `json:"threshold"`
	StartTime       time.Time              `json:"startTime"`
	LastSeen        time.Time              `json:"lastSeen"`
	Acknowledged    bool                   `json:"acknowledged"`
	AckTime         *time.Time             `json:"ackTime,omitempty"`
	AckUser         string                 `json:"ackUser,omitempty"`
	Metadata        map[string]interface{} `json:"metadata,omitempty"`
	// Notification tracking
	LastNotified *time.Time `json:"lastNotified,omitempty"` // Last time notification was sent
	// Escalation tracking
	LastEscalation  int         `json:"lastEscalation,omitempty"`  // Last escalation level notified
	EscalationTimes []time.Time `json:"escalationTimes,omitempty"` // Times when escalations were sent
}

// Clone returns a copy of the alert that can be handed to another goroutine
// without aliasing the receiver's AckTime, LastNotified, EscalationTimes, or
// Metadata. A nil receiver yields nil.
//
// Metadata is copied through cloneMetadata. An EscalationTimes slice of length
// zero is left exactly as the struct assignment copied it.
func (a *Alert) Clone() *Alert {
	if a == nil {
		return nil
	}

	// Start from a shallow copy, then replace each reference-typed field.
	dup := new(Alert)
	*dup = *a

	if ack := a.AckTime; ack != nil {
		ackCopy := *ack
		dup.AckTime = &ackCopy
	}

	if notified := a.LastNotified; notified != nil {
		notifiedCopy := *notified
		dup.LastNotified = &notifiedCopy
	}

	if n := len(a.EscalationTimes); n > 0 {
		times := make([]time.Time, n)
		copy(times, a.EscalationTimes)
		dup.EscalationTimes = times
	}

	if a.Metadata != nil {
		dup.Metadata = cloneMetadata(a.Metadata)
	}

	return dup
}

// cloneMetadata returns a copy of src in which every value has been passed
// through cloneMetadataValue. A nil map yields nil; an empty non-nil map
// yields a new empty map.
func cloneMetadata(src map[string]interface{}) map[string]interface{} {
	if src == nil {
		return nil
	}

	out := make(map[string]interface{}, len(src))
	for key := range src {
		out[key] = cloneMetadataValue(src[key])
	}
	return out
}

// cloneMetadataValue copies the container types it recognizes and returns
// every other value unchanged (so unrecognized maps, slices, and pointers stay
// shared with the source).
//
// Recognized types: map[string]interface{} and []interface{} are copied
// recursively; []string, []int and []float64 are copied element-wise into a
// new non-nil slice; map[string]string is copied into a new
// map[string]interface{}, so the result's dynamic type differs from the
// input's in that one case.
func cloneMetadataValue(val interface{}) interface{} {
	switch typed := val.(type) {
	case map[string]interface{}:
		return cloneMetadata(typed)
	case map[string]string:
		// Deliberately widened to map[string]interface{}, matching the shape
		// of the other nested maps.
		widened := make(map[string]interface{}, len(typed))
		for k := range typed {
			widened[k] = typed[k]
		}
		return widened
	case []interface{}:
		items := make([]interface{}, 0, len(typed))
		for _, item := range typed {
			items = append(items, cloneMetadataValue(item))
		}
		return items
	case []string:
		return append(make([]string, 0, len(typed)), typed...)
	case []int:
		return append(make([]int, 0, len(typed)), typed...)
	case []float64:
		return append(make([]float64, 0, len(typed)), typed...)
	}
	return val
}

// ResolvedAlert represents a recently resolved alert. It embeds *Alert, so the
// alert's fields are promoted onto ResolvedAlert (and, because the embedded
// field has no JSON tag, appear in the same JSON object), and adds the time at
// which the alert was resolved.
type ResolvedAlert struct {
	*Alert
	ResolvedTime time.Time `json:"resolvedTime"`
}

// HysteresisThreshold represents a threshold with hysteresis: the value that
// triggers an alert and the value that clears it are configured separately.
type HysteresisThreshold struct {
	Trigger float64 `json:"trigger"` // Threshold to trigger alert
	Clear   float64 `json:"clear"`   // Threshold to clear alert
}

// ThresholdConfig represents threshold configuration. AlertConfig uses it both
// for the per-resource-type defaults and for the per-resource Overrides map.
//
// Every field is omitempty, and the metric thresholds are pointers, so an
// unset threshold is simply absent from the encoded JSON.
// NOTE(review): presumably a nil threshold means "not configured here, fall
// back to the defaults" — confirm against the evaluation code.
type ThresholdConfig struct {
	Disabled            bool                 `json:"disabled,omitempty"`            // Completely disable alerts for this guest
	DisableConnectivity bool                 `json:"disableConnectivity,omitempty"` // Disable node offline/connectivity/powered-off alerts
	PoweredOffSeverity  AlertLevel           `json:"poweredOffSeverity,omitempty"`  // Severity for powered-off alerts
	CPU                 *HysteresisThreshold `json:"cpu,omitempty"`
	Memory              *HysteresisThreshold `json:"memory,omitempty"`
	Disk                *HysteresisThreshold `json:"disk,omitempty"`
	DiskRead            *HysteresisThreshold `json:"diskRead,omitempty"`
	DiskWrite           *HysteresisThreshold `json:"diskWrite,omitempty"`
	NetworkIn           *HysteresisThreshold `json:"networkIn,omitempty"`
	NetworkOut          *HysteresisThreshold `json:"networkOut,omitempty"`
	Usage               *HysteresisThreshold `json:"usage,omitempty"`           // For storage devices
	Temperature         *HysteresisThreshold `json:"temperature,omitempty"`     // For node CPU temperature
	DiskTemperature     *HysteresisThreshold `json:"diskTemperature,omitempty"` // For host SMART temperatures
	Backup              *BackupAlertConfig   `json:"backup,omitempty"`
	Snapshot            *SnapshotAlertConfig `json:"snapshot,omitempty"`
	Note                *string              `json:"note,omitempty"`
}

// QuietHours represents quiet hours configuration: a Start/End window given as
// "HH:MM" strings, the timezone the window is expressed in, the days it
// applies to, and which alert categories it suppresses.
// NOTE(review): Timezone is presumably an IANA zone name resolved with
// time.LoadLocation (Manager keeps a cached *time.Location for quiet hours) —
// confirm where the config is applied.
type QuietHours struct {
	Enabled  bool                  `json:"enabled"`
	Start    string                `json:"start"` // 24-hour format "HH:MM"
	End      string                `json:"end"`   // 24-hour format "HH:MM"
	Timezone string                `json:"timezone"`
	Days     map[string]bool       `json:"days"` // monday, tuesday, etc.
	Suppress QuietHoursSuppression `json:"suppress"`
}

// QuietHoursSuppression controls which alert categories are silenced during
// quiet hours. Each flag selects one category: performance, storage, or
// offline alerts.
type QuietHoursSuppression struct {
	Performance bool `json:"performance"`
	Storage     bool `json:"storage"`
	Offline     bool `json:"offline"`
}

// EscalationLevel represents one escalation rule: how long after the initial
// alert it applies and which notification channel(s) it targets.
type EscalationLevel struct {
	After  int    `json:"after"`  // minutes after initial alert
	Notify string `json:"notify"` // "email", "webhook", or "all"
}

// EscalationConfig represents alert escalation configuration: an on/off
// switch plus the list of escalation rules.
type EscalationConfig struct {
	Enabled bool              `json:"enabled"`
	Levels  []EscalationLevel `json:"levels"`
}

// GroupingConfig represents alert grouping configuration: whether grouping is
// enabled, the grouping window in seconds, and the dimensions to group by.
type GroupingConfig struct {
	Enabled bool `json:"enabled"`
	Window  int  `json:"window"`  // seconds
	ByNode  bool `json:"byNode"`  // Group alerts by node
	ByGuest bool `json:"byGuest"` // Group alerts by guest type
}

// ScheduleConfig represents alerting schedule configuration. It bundles quiet
// hours, the notification cooldown, the per-resource hourly alert cap, the
// resolve-notification switch, and the escalation and grouping settings.
type ScheduleConfig struct {
	QuietHours      QuietHours       `json:"quietHours"`
	Cooldown        int              `json:"cooldown"`        // minutes
	MaxAlertsHour   int              `json:"maxAlertsHour"`   // max alerts per hour per resource
	NotifyOnResolve bool             `json:"notifyOnResolve"` // Send notification when alert clears
	Escalation      EscalationConfig `json:"escalation"`
	Grouping        GroupingConfig   `json:"grouping"`
}

// FilterCondition represents a single filter condition. Type selects the kind
// of condition; the remaining fields are optional and omitted from JSON when
// empty. Value is untyped, so after decoding it holds whatever encoding/json
// produced for the incoming JSON value.
type FilterCondition struct {
	Type     string      `json:"type"` // "metric", "text", or "raw"
	Field    string      `json:"field,omitempty"`
	Operator string      `json:"operator,omitempty"`
	Value    interface{} `json:"value,omitempty"`
	RawText  string      `json:"rawText,omitempty"`
}

// FilterStack represents a collection of filter conditions together with the
// logical operator ("AND" or "OR") used to combine them.
type FilterStack struct {
	Filters         []FilterCondition `json:"filters"`
	LogicalOperator string            `json:"logicalOperator"` // "AND" or "OR"
}

// CustomAlertRule represents a custom alert rule: a set of filter conditions
// paired with the thresholds to use, plus a priority, an enabled flag, and
// creation/update timestamps.
//
// Notifications holds optional per-channel settings; Email and Webhook are
// pointers and are omitted from JSON when nil.
type CustomAlertRule struct {
	ID               string          `json:"id"`
	Name             string          `json:"name"`
	Description      string          `json:"description,omitempty"`
	FilterConditions FilterStack     `json:"filterConditions"`
	Thresholds       ThresholdConfig `json:"thresholds"`
	Priority         int             `json:"priority"`
	Enabled          bool            `json:"enabled"`
	Notifications    struct {
		Email *struct {
			Enabled    bool     `json:"enabled"`
			Recipients []string `json:"recipients"`
		} `json:"email,omitempty"`
		Webhook *struct {
			Enabled bool   `json:"enabled"`
			URL     string `json:"url"`
		} `json:"webhook,omitempty"`
	} `json:"notifications"`
	CreatedAt time.Time `json:"createdAt"`
	UpdatedAt time.Time `json:"updatedAt"`
}

// DockerThresholdConfig represents Docker-specific alert thresholds.
//
// The JSON keys for ServiceWarnGapPct and ServiceCritGapPct spell out
// "Percent" ("serviceWarnGapPercent", "serviceCriticalGapPercent") and so do
// not match the Go field names. The defaults quoted in the field comments are
// not set in this type.
type DockerThresholdConfig struct {
	CPU                      HysteresisThreshold `json:"cpu"`                                // CPU usage % threshold (default: 80%)
	Memory                   HysteresisThreshold `json:"memory"`                             // Memory usage % threshold (default: 85%)
	Disk                     HysteresisThreshold `json:"disk"`                               // Writable layer usage % threshold (default: 85%)
	RestartCount             int                 `json:"restartCount"`                       // Number of restarts to trigger alert (default: 3)
	RestartWindow            int                 `json:"restartWindow"`                      // Time window in seconds for restart loop detection (default: 300 = 5min)
	MemoryWarnPct            int                 `json:"memoryWarnPct"`                      // Memory limit % to trigger warning (default: 90)
	MemoryCriticalPct        int                 `json:"memoryCriticalPct"`                  // Memory limit % to trigger critical (default: 95)
	ServiceWarnGapPct        int                 `json:"serviceWarnGapPercent"`              // % of desired tasks missing to trigger warning (default: 10)
	ServiceCritGapPct        int                 `json:"serviceCriticalGapPercent"`          // % of desired tasks missing to trigger critical (default: 50)
	StateDisableConnectivity bool                `json:"stateDisableConnectivity,omitempty"` // Disable container offline/state alerts globally
	StatePoweredOffSeverity  AlertLevel          `json:"statePoweredOffSeverity,omitempty"`  // Default severity for container state/offline alerts
	UpdateAlertDelayHours    int                 `json:"updateAlertDelayHours,omitempty"`    // Hours to wait before alerting on available image updates (default: 24, -1 = disabled)
}

// PMGThresholdConfig represents Proxmox Mail Gateway-specific alert
// thresholds. Fields come in warning/critical pairs covering queue depth,
// oldest-message age, deferred and hold queues, quarantine sizes, and
// quarantine growth. The defaults quoted in the field comments are not set in
// this type.
type PMGThresholdConfig struct {
	QueueTotalWarning       int `json:"queueTotalWarning"`       // Total queue depth warning threshold (default: 500)
	QueueTotalCritical      int `json:"queueTotalCritical"`      // Total queue depth critical threshold (default: 1000)
	OldestMessageWarnMins   int `json:"oldestMessageWarnMins"`   // Oldest queued message age warning in minutes (default: 30)
	OldestMessageCritMins   int `json:"oldestMessageCritMins"`   // Oldest queued message age critical in minutes (default: 60)
	DeferredQueueWarn       int `json:"deferredQueueWarn"`       // Deferred queue depth warning (default: 200)
	DeferredQueueCritical   int `json:"deferredQueueCritical"`   // Deferred queue depth critical (default: 500)
	HoldQueueWarn           int `json:"holdQueueWarn"`           // Hold queue depth warning (default: 100)
	HoldQueueCritical       int `json:"holdQueueCritical"`       // Hold queue depth critical (default: 300)
	QuarantineSpamWarn      int `json:"quarantineSpamWarn"`      // Spam quarantine absolute warning (default: 2000)
	QuarantineSpamCritical  int `json:"quarantineSpamCritical"`  // Spam quarantine absolute critical (default: 5000)
	QuarantineVirusWarn     int `json:"quarantineVirusWarn"`     // Virus quarantine absolute warning (default: 2000)
	QuarantineVirusCritical int `json:"quarantineVirusCritical"` // Virus quarantine absolute critical (default: 5000)
	QuarantineGrowthWarnPct int `json:"quarantineGrowthWarnPct"` // Growth % to trigger warning (default: 25)
	QuarantineGrowthWarnMin int `json:"quarantineGrowthWarnMin"` // Minimum message growth for warning (default: 250)
	QuarantineGrowthCritPct int `json:"quarantineGrowthCritPct"` // Growth % to trigger critical (default: 50)
	QuarantineGrowthCritMin int `json:"quarantineGrowthCritMin"` // Minimum message growth for critical (default: 500)
}

// SnapshotAlertConfig represents snapshot age alert configuration: warning and
// critical ages in days, plus optional warning and critical sizes in GiB (the
// size fields are omitted from JSON when zero).
type SnapshotAlertConfig struct {
	Enabled         bool    `json:"enabled"`
	WarningDays     int     `json:"warningDays"`
	CriticalDays    int     `json:"criticalDays"`
	WarningSizeGiB  float64 `json:"warningSizeGiB,omitempty"`
	CriticalSizeGiB float64 `json:"criticalSizeGiB,omitempty"`
}

// BackupAlertConfig represents backup age alert configuration. Besides the
// warning/critical ages in days it carries the dashboard freshness indicator
// thresholds and the global backup alert filters.
//
// AlertOrphaned is a *bool so that "unset" (nil) can be told apart from an
// explicit false.
type BackupAlertConfig struct {
	Enabled      bool `json:"enabled"`
	WarningDays  int  `json:"warningDays"`
	CriticalDays int  `json:"criticalDays"`
	// Indicator thresholds for the dashboard (separate from alert thresholds)
	FreshHours int `json:"freshHours"` // Backups newer than this show as green (default: 24)
	StaleHours int `json:"staleHours"` // Backups older than FreshHours but newer than this show as amber (default: 72)
	// Global backup alert filters
	AlertOrphaned *bool    `json:"alertOrphaned,omitempty"` // Alert on backups that do not match a known guest (default: true)
	IgnoreVMIDs   []string `json:"ignoreVMIDs,omitempty"`   // Skip alerts for matching VMIDs (supports prefix*)
}

// GuestLookup describes a guest identity used for snapshot/backup evaluations.
// It carries no JSON tags.
type GuestLookup struct {
	ResourceID string
	Name       string
	Instance   string
	Node       string
	Type       string
	VMID       int
}

// AlertConfig represents the complete alert configuration: per-resource-type
// default thresholds, per-resource overrides, custom rules, scheduling, the
// global disable flags, and the deduplication, TTL and flapping settings.
//
// AlertConfig has a custom UnmarshalJSON, so decoding it from JSON also runs
// NormalizeAlertConfigAliases on the result.
type AlertConfig struct {
	Enabled                        bool                       `json:"enabled"`
	ActivationState                ActivationState            `json:"activationState,omitempty"`
	ObservationWindowHours         int                        `json:"observationWindowHours,omitempty"`
	ActivationTime                 *time.Time                 `json:"activationTime,omitempty"`
	GuestDefaults                  ThresholdConfig            `json:"guestDefaults"`
	NodeDefaults                   ThresholdConfig            `json:"nodeDefaults"`
	AgentDefaults                  ThresholdConfig            `json:"agentDefaults"`
	StorageDefault                 HysteresisThreshold        `json:"storageDefault"`
	DockerDefaults                 DockerThresholdConfig      `json:"dockerDefaults"`
	DockerIgnoredContainerPrefixes []string                   `json:"dockerIgnoredContainerPrefixes,omitempty"`
	IgnoredGuestPrefixes           []string                   `json:"ignoredGuestPrefixes,omitempty"`
	GuestTagWhitelist              []string                   `json:"guestTagWhitelist,omitempty"`
	GuestTagBlacklist              []string                   `json:"guestTagBlacklist,omitempty"`
	PMGDefaults                    PMGThresholdConfig         `json:"pmgDefaults"`
	PBSDefaults                    ThresholdConfig            `json:"pbsDefaults"`
	SnapshotDefaults               SnapshotAlertConfig        `json:"snapshotDefaults"`
	BackupDefaults                 BackupAlertConfig          `json:"backupDefaults"`
	Overrides                      map[string]ThresholdConfig `json:"overrides"` // keyed by resource ID
	CustomRules                    []CustomAlertRule          `json:"customRules,omitempty"`
	Schedule                       ScheduleConfig             `json:"schedule"`
	// Global disable flags per resource type
	DisableAllNodes              bool `json:"disableAllNodes"`              // Disable all alerts for Proxmox nodes
	DisableAllGuests             bool `json:"disableAllGuests"`             // Disable all alerts for VMs/containers
	DisableAllAgents             bool `json:"disableAllAgents"`             // Disable all alerts for Pulse agents
	DisableAllStorage            bool `json:"disableAllStorage"`            // Disable all alerts for storage
	DisableAllPBS                bool `json:"disableAllPBS"`                // Disable all alerts for PBS servers
	DisableAllPMG                bool `json:"disableAllPMG"`                // Disable all alerts for PMG instances
	DisableAllDockerHosts        bool `json:"disableAllDockerHosts"`        // Disable all alerts for Docker hosts
	DisableAllDockerContainers   bool `json:"disableAllDockerContainers"`   // Disable all alerts for Docker containers
	DisableAllDockerServices     bool `json:"disableAllDockerServices"`     // Disable all alerts for Docker services
	DisableAllNodesOffline       bool `json:"disableAllNodesOffline"`       // Disable node offline/connectivity alerts globally
	DisableAllGuestsOffline      bool `json:"disableAllGuestsOffline"`      // Disable guest powered-off alerts globally
	DisableAllAgentsOffline      bool `json:"disableAllAgentsOffline"`      // Disable agent offline alerts globally
	DisableAllPBSOffline         bool `json:"disableAllPBSOffline"`         // Disable PBS offline alerts globally
	DisableAllPMGOffline         bool `json:"disableAllPMGOffline"`         // Disable PMG offline alerts globally
	DisableAllDockerHostsOffline bool `json:"disableAllDockerHostsOffline"` // Disable Docker host offline alerts globally
	// New configuration options
	MinimumDelta         float64                   `json:"minimumDelta"`         // Minimum % change to trigger new alert
	SuppressionWindow    int                       `json:"suppressionWindow"`    // Minutes to suppress duplicate alerts
	HysteresisMargin     float64                   `json:"hysteresisMargin"`     // Default margin for legacy thresholds
	TimeThresholds       map[string]int            `json:"timeThresholds"`       // Per-type delays: guest, node, agent, storage, pbs
	MetricTimeThresholds map[string]map[string]int `json:"metricTimeThresholds"` // Optional per-metric delays keyed by resource type
	// Alert TTL and auto-cleanup
	MaxAlertAgeDays           int `json:"maxAlertAgeDays"`           // Maximum age for alerts before auto-cleanup (0 = disabled)
	MaxAcknowledgedAgeDays    int `json:"maxAcknowledgedAgeDays"`    // Maximum age for acknowledged alerts (0 = disabled)
	AutoAcknowledgeAfterHours int `json:"autoAcknowledgeAfterHours"` // Auto-acknowledge alerts after X hours (0 = disabled)
	// Flapping detection
	FlappingEnabled         bool `json:"flappingEnabled"`         // Enable flapping detection
	FlappingWindowSeconds   int  `json:"flappingWindowSeconds"`   // Time window for counting state changes
	FlappingThreshold       int  `json:"flappingThreshold"`       // Number of state changes to trigger flapping
	FlappingCooldownMinutes int  `json:"flappingCooldownMinutes"` // Cooldown period after flapping detected
}

// UnmarshalJSON decodes an alert configuration from JSON and then normalizes
// it with NormalizeAlertConfigAliases.
//
// The receiver is overwritten wholesale: any field absent from data ends up at
// its zero value rather than keeping what the receiver held before the call.
// On a decode error the receiver is left untouched and the encoding/json error
// is returned unchanged.
func (c *AlertConfig) UnmarshalJSON(data []byte) error {
	// alias has AlertConfig's fields but none of its methods, so decoding into
	// it uses the default struct decoding instead of recursing into this
	// method.
	type alias AlertConfig

	var decoded alias
	if err := json.Unmarshal(data, &decoded); err != nil {
		return err
	}

	*c = AlertConfig(decoded)
	NormalizeAlertConfigAliases(c)
	return nil
}

// NormalizeAlertConfigAliases removes unsupported legacy resource-type keys
// from config.TimeThresholds and config.MetricTimeThresholds, in place.
//
// Each key is canonicalized with canonicalAlertResourceType. Keys whose
// canonical form is "" or "all" are always kept; any other key is deleted when
// isUnsupportedLegacyAlertResourceType reports true for its canonical form.
// A nil config, and nil or empty maps, are left untouched.
func NormalizeAlertConfigAliases(config *AlertConfig) {
	if config == nil {
		return
	}

	// shouldStrip reports whether the entry stored under key must be removed.
	shouldStrip := func(key string) bool {
		switch canonical := canonicalAlertResourceType(key); canonical {
		case "", "all":
			return false
		default:
			return isUnsupportedLegacyAlertResourceType(canonical)
		}
	}

	// Deleting from a map while ranging over it is safe in Go, and ranging
	// over a nil map is a no-op.
	for resourceType := range config.TimeThresholds {
		if shouldStrip(resourceType) {
			delete(config.TimeThresholds, resourceType)
		}
	}

	for resourceType := range config.MetricTimeThresholds {
		if shouldStrip(resourceType) {
			delete(config.MetricTimeThresholds, resourceType)
		}
	}
}

// cloneStringIntMap returns an independent copy of src. A nil or empty map
// yields nil (not an empty map).
func cloneStringIntMap(src map[string]int) map[string]int {
	size := len(src)
	if size == 0 {
		return nil
	}

	out := make(map[string]int, size)
	for k := range src {
		out[k] = src[k]
	}
	return out
}

// pmgQuarantineSnapshot stores the spam and virus quarantine counts observed
// at a point in time, for growth detection.
type pmgQuarantineSnapshot struct {
	Spam      int
	Virus     int
	Timestamp time.Time
}

// pmgMailMetricSample stores a single hourly mail count sample: inbound and
// outbound spam and virus counts plus the time the sample was taken.
type pmgMailMetricSample struct {
	SpamIn    float64
	SpamOut   float64
	VirusIn   float64
	VirusOut  float64
	Timestamp time.Time
}

// pmgAnomalyTracker tracks mail metric history for anomaly detection.
// NOTE(review): the 48-sample cap mentioned on Samples is not enforced by this
// type; it is presumably enforced by the code that appends samples.
type pmgAnomalyTracker struct {
	Samples        []pmgMailMetricSample // Ring buffer (max 48 samples)
	LastSampleTime time.Time             // Timestamp of most recent sample
	SampleCount    int                   // Total samples collected (for warmup check)
}

// Manager handles alert monitoring and state. (The Manager type itself is
// declared further down, after the metric hook variables.)
//
// Lock Ordering Documentation:
// The Manager has several mutexes; the ordering rules below cover these two:
//  1. m.mu (primary lock) - protects most manager state
//  2. m.resolvedMutex - protects only the recentlyResolved map
//
// Lock Ordering Rules:
//   - NEVER hold m.mu when acquiring resolvedMutex
//   - ALWAYS release m.mu before acquiring resolvedMutex
//   - resolvedMutex can be held independently without m.mu
//   - When both locks are needed, acquire m.mu first, then release it before acquiring resolvedMutex
//
// This ordering prevents deadlock scenarios where different goroutines acquire locks in different orders.

// Metric hooks for integrating with Prometheus.
//
// These package-level callbacks are nil until SetMetricHooks installs them.
// They are plain variables with no synchronization of their own.
var (
	recordAlertFired        func(*Alert)
	recordAlertResolved     func(*Alert)
	recordAlertSuppressed   func(string)
	recordAlertAcknowledged func()
)

// SetMetricHooks installs the package-level callbacks used to record alert
// metrics. Each call replaces all four hooks; passing nil clears a hook.
//
//   - fired: called when an alert is dispatched (in dispatchAlert)
//   - resolved: called when an alert is cleared (in clearAlertNoLock)
//   - suppressed: called when an alert is suppressed due to flapping
//   - acknowledged: called when an alert is acknowledged
//
// The hooks are stored in plain package variables without locking.
func SetMetricHooks(fired func(*Alert), resolved func(*Alert), suppressed func(string), acknowledged func()) {
	recordAlertFired, recordAlertResolved = fired, resolved
	recordAlertSuppressed, recordAlertAcknowledged = suppressed, acknowledged
}

// Manager holds the alert configuration, the active and recently resolved
// alerts, the registered callbacks, and the per-resource tracking maps used
// while evaluating alerts.
//
// It contains mutexes and a sync.Once, so it must not be copied after first
// use; NewManager and NewManagerWithDataDir return a *Manager. The lock
// ordering rules for mu and resolvedMutex are spelled out in the
// "Lock Ordering Documentation" comment earlier in this file.
type Manager struct {
	mu               sync.RWMutex
	saveMu           sync.Mutex
	callbackMu       sync.RWMutex
	alertsDir        string
	config           AlertConfig
	activeAlerts     map[string]*Alert
	activeAlertAlias map[string]string
	historyManager   *HistoryManager
	onAlert          func(alert *Alert)
	alertSubs        map[int]func(alert *Alert)
	onResolved       func(alertID string)
	resolvedSubs     map[int]func(alertID string)
	onAcknowledged   func(alert *Alert, user string)
	onUnacknowledged func(alert *Alert, user string)
	onEscalate       func(alert *Alert, level int)
	onAlertForAI     func(alert *Alert) // AI analysis callback - bypasses notification suppression
	alertForAISubs   map[int]func(alert *Alert)
	nextCallbackID   int
	escalationStop   chan struct{}
	alertRateLimit   map[string][]time.Time // Track alert times for rate limiting
	// New fields for deduplication and suppression
	recentAlerts    map[string]*Alert    // Track recent alerts for deduplication
	suppressedUntil map[string]time.Time // Track suppression windows
	// Recently resolved alerts (kept for 5 minutes)
	recentlyResolved map[string]*ResolvedAlert
	resolvedAlias    map[string]string
	resolvedMutex    sync.RWMutex // Secondary lock - see Lock Ordering Documentation above
	// Time threshold tracking
	pendingAlerts map[string]time.Time // Track when thresholds were first exceeded
	// Offline confirmation tracking
	nodeOfflineCount      map[string]int                  // Track consecutive offline counts for nodes (legacy)
	offlineConfirmations  map[string]int                  // Track consecutive offline counts for all resources
	dockerOfflineCount    map[string]int                  // Track consecutive offline counts for Docker hosts
	dockerStateConfirm    map[string]int                  // Track consecutive state confirmations for Docker containers
	dockerRestartTracking map[string]*dockerRestartRecord // Track restart counts and times for restart loop detection
	dockerLastExitCode    map[string]int                  // Track last exit code for OOM detection
	dockerUpdateFirstSeen map[string]time.Time            // Track when image updates were first detected for alert delay
	// Stable identity tracking prevents update-delay resets when host IDs churn.
	dockerUpdateFirstSeenByIdentity map[string]time.Time
	// PMG quarantine growth tracking
	pmgQuarantineHistory map[string][]pmgQuarantineSnapshot // Track quarantine snapshots for growth detection
	// PMG anomaly detection tracking
	pmgAnomalyTrackers map[string]*pmgAnomalyTracker // Track mail metrics for anomaly detection per PMG instance
	// Persistent acknowledgement state so quick alert rebuilds keep user acknowledgements
	ackState map[string]ackRecord
	// Canonical acknowledgement state is keyed by resource_id + spec_id so later
	// alert-ID migration can preserve user state across storage-key changes.
	ackStateByCanonical map[string]ackRecord
	// Flapping detection tracking
	flappingHistory map[string][]time.Time // Track state change times for flapping detection
	flappingActive  map[string]bool        // Track which alerts are currently in flapping state
	// Cleanup control
	cleanupStop chan struct{} // Signal to stop cleanup goroutine
	// Host agent deduplication: track hostnames of active host agents
	// When a host agent is running on a Proxmox node, we prefer the host agent
	// alerts and suppress the node alerts to avoid duplicate monitoring.
	hostAgentHostnames map[string]struct{} // Normalized hostnames (lowercase)
	// Node display name caches. Proxmox nodes can share the same raw node name
	// across multiple configured instances, so keep instance-scoped entries in
	// addition to the legacy raw-name cache used by instance-less resources.
	nodeDisplayNames         map[string]string
	instanceNodeDisplayNames map[string]string
	// License checking for Pro-only alert features
	hasProFeature func(feature string) bool

	// Cached timezone for quiet hours
	quietHoursLoc *time.Location
	stopOnce      sync.Once
}

// ackRecord is the acknowledgement state remembered for an alert. Manager
// keeps these in ackState and ackStateByCanonical.
type ackRecord struct {
	acknowledged bool
	user         string
	time         time.Time // When the alert was acknowledged
	inactiveAt   time.Time // When the alert was removed (zero if still active)
}

// dockerRestartRecord is the restart bookkeeping stored in
// Manager.dockerRestartTracking for restart loop detection.
// NOTE(review): the exact meaning of count versus lastCount is not visible in
// this part of the file — confirm against the restart-tracking code.
type dockerRestartRecord struct {
	count       int
	lastCount   int
	times       []time.Time // Track restart times for loop detection
	lastChecked time.Time
}

// NewManager creates a new alert manager rooted at the global data directory
// reported by utils.GetDataDir.
// For multi-tenant deployments, use NewManagerWithDataDir instead.
func NewManager() *Manager {
	globalDataDir := utils.GetDataDir()
	return NewManagerWithDataDir(globalDataDir)
}

// NewManagerWithDataDir creates a new alert manager with a custom data directory.
// This enables tenant-scoped alert persistence in multi-tenant deployments.
//
// A blank (empty or whitespace-only) dataDir falls back to utils.GetDataDir().
// Alert state lives in the "alerts" subdirectory of the chosen data directory.
//
// The returned manager has every tracking map initialised, carries the
// built-in default AlertConfig shown below, has attempted to load previously
// saved active alerts (a load failure is logged, not returned), and has three
// background goroutines running: the escalation checker, the periodic
// active-alert saver, and the tracking-map cleanup loop.
func NewManagerWithDataDir(dataDir string) *Manager {
	if strings.TrimSpace(dataDir) == "" {
		dataDir = utils.GetDataDir()
	}

	alertsDir := filepath.Join(dataDir, "alerts")
	// Addressable local so BackupDefaults.AlertOrphaned can point at "true".
	alertOrphaned := true
	m := &Manager{
		alertsDir:                       alertsDir,
		activeAlerts:                    make(map[string]*Alert),
		activeAlertAlias:                make(map[string]string),
		historyManager:                  NewHistoryManager(alertsDir),
		escalationStop:                  make(chan struct{}),
		alertSubs:                       make(map[int]func(*Alert)),
		resolvedSubs:                    make(map[int]func(string)),
		alertForAISubs:                  make(map[int]func(*Alert)),
		alertRateLimit:                  make(map[string][]time.Time),
		recentAlerts:                    make(map[string]*Alert),
		suppressedUntil:                 make(map[string]time.Time),
		recentlyResolved:                make(map[string]*ResolvedAlert),
		resolvedAlias:                   make(map[string]string),
		pendingAlerts:                   make(map[string]time.Time),
		nodeOfflineCount:                make(map[string]int),
		offlineConfirmations:            make(map[string]int),
		dockerOfflineCount:              make(map[string]int),
		dockerStateConfirm:              make(map[string]int),
		dockerRestartTracking:           make(map[string]*dockerRestartRecord),
		dockerLastExitCode:              make(map[string]int),
		dockerUpdateFirstSeen:           make(map[string]time.Time),
		dockerUpdateFirstSeenByIdentity: make(map[string]time.Time),
		pmgQuarantineHistory:            make(map[string][]pmgQuarantineSnapshot),
		pmgAnomalyTrackers:              make(map[string]*pmgAnomalyTracker),
		ackState:                        make(map[string]ackRecord),
		ackStateByCanonical:             make(map[string]ackRecord),
		flappingHistory:                 make(map[string][]time.Time),
		flappingActive:                  make(map[string]bool),
		cleanupStop:                     make(chan struct{}),
		hostAgentHostnames:              make(map[string]struct{}),
		nodeDisplayNames:                make(map[string]string),
		instanceNodeDisplayNames:        make(map[string]string),
		config: AlertConfig{
			Enabled:                true,
			ActivationState:        ActivationPending,
			ObservationWindowHours: 24,
			GuestDefaults: ThresholdConfig{
				PoweredOffSeverity: AlertLevelWarning,
				CPU:                &HysteresisThreshold{Trigger: 80, Clear: 75},
				Memory:             &HysteresisThreshold{Trigger: 85, Clear: 80},
				Disk:               &HysteresisThreshold{Trigger: 90, Clear: 85},
				DiskRead:           &HysteresisThreshold{Trigger: 0, Clear: 0}, // Off by default
				DiskWrite:          &HysteresisThreshold{Trigger: 0, Clear: 0}, // Off by default
				NetworkIn:          &HysteresisThreshold{Trigger: 0, Clear: 0}, // Off by default
				NetworkOut:         &HysteresisThreshold{Trigger: 0, Clear: 0}, // Off by default
			},
			NodeDefaults: ThresholdConfig{
				CPU:         &HysteresisThreshold{Trigger: 80, Clear: 75},
				Memory:      &HysteresisThreshold{Trigger: 85, Clear: 80},
				Disk:        &HysteresisThreshold{Trigger: 90, Clear: 85},
				Temperature: &HysteresisThreshold{Trigger: 80, Clear: 75}, // Warning at 80°C, clear at 75°C
			},
			AgentDefaults: ThresholdConfig{
				CPU:             &HysteresisThreshold{Trigger: 80, Clear: 75},
				Memory:          &HysteresisThreshold{Trigger: 85, Clear: 80},
				Disk:            &HysteresisThreshold{Trigger: 90, Clear: 85},
				DiskTemperature: &HysteresisThreshold{Trigger: 55, Clear: 50},
			},
			DockerDefaults: DockerThresholdConfig{
				CPU:                     HysteresisThreshold{Trigger: 80, Clear: 75},
				Memory:                  HysteresisThreshold{Trigger: 85, Clear: 80},
				Disk:                    HysteresisThreshold{Trigger: 85, Clear: 80},
				RestartCount:            3,
				RestartWindow:           300, // 5 minutes
				MemoryWarnPct:           90,
				MemoryCriticalPct:       95,
				StatePoweredOffSeverity: AlertLevelWarning,
			},
			PMGDefaults: PMGThresholdConfig{
				QueueTotalWarning:       500,  // Warning at 500 total queued messages
				QueueTotalCritical:      1000, // Critical at 1000 total queued messages
				OldestMessageWarnMins:   30,   // Warning if oldest message is 30+ minutes old
				OldestMessageCritMins:   60,   // Critical if oldest message is 60+ minutes old
				DeferredQueueWarn:       200,  // Warning at 200 deferred messages
				DeferredQueueCritical:   500,  // Critical at 500 deferred messages
				HoldQueueWarn:           100,  // Warning at 100 held messages
				HoldQueueCritical:       300,  // Critical at 300 held messages
				QuarantineSpamWarn:      2000, // Warning at 2000 spam quarantined
				QuarantineSpamCritical:  5000, // Critical at 5000 spam quarantined
				QuarantineVirusWarn:     2000, // Warning at 2000 virus quarantined
				QuarantineVirusCritical: 5000, // Critical at 5000 virus quarantined
				QuarantineGrowthWarnPct: 25,   // Warning if growth ≥25%
				QuarantineGrowthWarnMin: 250,  // AND ≥250 messages
				QuarantineGrowthCritPct: 50,   // Critical if growth ≥50%
				QuarantineGrowthCritMin: 500,  // AND ≥500 messages
			},
			SnapshotDefaults: SnapshotAlertConfig{
				Enabled:         false,
				WarningDays:     30,
				CriticalDays:    45,
				WarningSizeGiB:  0,
				CriticalSizeGiB: 0,
			},
			BackupDefaults: BackupAlertConfig{
				Enabled:       false,
				WarningDays:   7,
				CriticalDays:  14,
				FreshHours:    24,
				StaleHours:    72,
				AlertOrphaned: &alertOrphaned,
				IgnoreVMIDs:   []string{},
			},
			PBSDefaults: ThresholdConfig{
				CPU:    &HysteresisThreshold{Trigger: 80, Clear: 75},
				Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
			},
			StorageDefault:    HysteresisThreshold{Trigger: 85, Clear: 80},
			MinimumDelta:      2.0, // 2% minimum change
			SuppressionWindow: 5,   // 5 minutes
			HysteresisMargin:  5.0, // 5% default margin
			TimeThresholds: map[string]int{
				"guest":   5,
				"node":    5,
				"agent":   5,
				"storage": 5,
				"pbs":     5,
			},
			Overrides: make(map[string]ThresholdConfig),
			Schedule: ScheduleConfig{
				QuietHours: QuietHours{
					Enabled:  false, // OFF - users should opt-in to quiet hours
					Start:    "22:00",
					End:      "08:00",
					Timezone: "America/New_York",
					Days: map[string]bool{
						"monday":    true,
						"tuesday":   true,
						"wednesday": true,
						"thursday":  true,
						"friday":    true,
						"saturday":  false,
						"sunday":    false,
					},
					Suppress: QuietHoursSuppression{},
				},
				Cooldown:        5,  // ON - 5 minutes prevents spam
				MaxAlertsHour:   10, // ON - 10 alerts/hour prevents flooding
				NotifyOnResolve: true,
				Escalation: EscalationConfig{
					Enabled: false, // OFF - requires user configuration
					Levels: []EscalationLevel{
						{After: 15, Notify: "email"},
						{After: 30, Notify: "webhook"},
						{After: 60, Notify: "all"},
					},
				},
				Grouping: GroupingConfig{
					Enabled: true,  // ON - reduces notification noise
					Window:  30,    // 30 second window for grouping
					ByNode:  true,  // Group by node for mass node issues
					ByGuest: false, // Don't group by guest by default
				},
			},
			// Alert TTL defaults
			MaxAlertAgeDays:           7,  // Auto-cleanup alerts older than 7 days
			MaxAcknowledgedAgeDays:    1,  // Auto-cleanup acknowledged alerts older than 1 day
			AutoAcknowledgeAfterHours: 24, // Auto-acknowledge alerts after 24 hours
			// Flapping detection defaults
			FlappingEnabled:         true, // Enable flapping detection
			FlappingWindowSeconds:   300,  // 5 minute window
			FlappingThreshold:       5,    // 5 state changes triggers flapping
			FlappingCooldownMinutes: 15,   // 15 minute cooldown
		},
	}

	// Load saved active alerts. A failure is logged only; the manager still
	// starts with whatever state was loaded.
	if err := m.LoadActiveAlerts(); err != nil {
		log.Error().Err(err).Msg("failed to load active alerts")
	}

	// Start escalation checker
	go m.escalationChecker()

	// Start periodic save of active alerts
	go m.periodicSaveAlerts()

	// Start periodic cleanup of stale tracking map entries
	go m.trackingMapCleanup()

	return m
}

// SetLicenseChecker installs the predicate the manager consults to decide
// whether a Pro-licensed feature is available, which gates Pro-only alert
// features such as update alerts. The assignment is made under m.mu.
func (m *Manager) SetLicenseChecker(checker func(feature string) bool) {
	m.mu.Lock()
	m.hasProFeature = checker
	m.mu.Unlock()
}

// addRecentlyResolvedUnlocked records a resolved alert assuming the caller does not hold m.mu.
//
// A nil record, or a record without an Alert, is ignored. Otherwise the record
// is stored in recentlyResolved under its storage key and its alias is
// registered, both under resolvedMutex.
func (m *Manager) addRecentlyResolvedUnlocked(resolved *ResolvedAlert) {
	// Validate before locking: the checks only touch the caller's argument.
	if resolved == nil || resolved.Alert == nil {
		return
	}

	m.resolvedMutex.Lock()
	// Deferred so the mutex is released even if a helper below panics.
	defer m.resolvedMutex.Unlock()

	storageKey := activeAlertStorageKey(resolved.Alert, resolved.Alert.ID)
	m.recentlyResolved[storageKey] = resolved
	m.registerResolvedAliasUnlocked(storageKey, resolved)
}

// addRecentlyResolvedWithPrimaryLock records a resolved alert while preserving the caller's
// ownership of m.mu. Callers must hold m.mu before invoking this helper.
//
// m.mu is released for the duration of the call and re-acquired before
// returning, so any state guarded by m.mu may have changed by the time this
// helper returns.
func (m *Manager) addRecentlyResolvedWithPrimaryLock(resolved *ResolvedAlert) {
	m.mu.Unlock()
	// Re-acquire via defer so the caller still owns m.mu even if recording
	// panics; otherwise the caller's own deferred Unlock would hit an
	// unlocked mutex.
	defer m.mu.Lock()
	m.addRecentlyResolvedUnlocked(resolved)
}

// SetAlertCallback installs cb in the legacy single-slot callback for new
// alerts, replacing any previous value. The write is guarded by callbackMu.
func (m *Manager) SetAlertCallback(cb func(alert *Alert)) {
	m.callbackMu.Lock()
	m.onAlert = cb
	m.callbackMu.Unlock()
}

// SubscribeAlertCallback adds cb as an extra new-alert subscriber, leaving the
// legacy single callback slot untouched. Calling the returned function removes
// the subscription. A nil cb registers nothing and yields a no-op remover.
func (m *Manager) SubscribeAlertCallback(cb func(alert *Alert)) func() {
	if cb == nil {
		return func() {}
	}

	// Allocate the next subscription ID and store the callback under it.
	m.callbackMu.Lock()
	m.nextCallbackID++
	subscriptionID := m.nextCallbackID
	m.alertSubs[subscriptionID] = cb
	m.callbackMu.Unlock()

	unsubscribe := func() {
		m.callbackMu.Lock()
		defer m.callbackMu.Unlock()
		delete(m.alertSubs, subscriptionID)
	}
	return unsubscribe
}

// SetAlertForAICallback installs the legacy single-slot callback used for AI
// analysis when alerts are created, and logs that it was registered.
// Unlike SetAlertCallback, this callback is invoked unconditionally - it bypasses
// activation state, quiet hours, and other notification suppression checks.
// This allows AI to analyze alerts even when the user hasn't finished setup.
func (m *Manager) SetAlertForAICallback(cb func(alert *Alert)) {
	m.callbackMu.Lock()
	m.onAlertForAI = cb
	log.Info().Msg("alert-for-AI callback registered (bypasses notification suppression)")
	m.callbackMu.Unlock()
}

// SubscribeAlertForAICallback adds cb as an extra AI alert subscriber, leaving
// the legacy single callback slot untouched. Calling the returned function
// removes the subscription. A nil cb registers nothing and yields a no-op
// remover.
func (m *Manager) SubscribeAlertForAICallback(cb func(alert *Alert)) func() {
	if cb == nil {
		return func() {}
	}

	// Allocate the next subscription ID and store the callback under it.
	m.callbackMu.Lock()
	m.nextCallbackID++
	subscriptionID := m.nextCallbackID
	m.alertForAISubs[subscriptionID] = cb
	m.callbackMu.Unlock()

	unsubscribe := func() {
		m.callbackMu.Lock()
		defer m.callbackMu.Unlock()
		delete(m.alertForAISubs, subscriptionID)
	}
	return unsubscribe
}

// SetResolvedCallback installs cb in the legacy single-slot callback for
// resolved alerts, replacing any previous value. The write is guarded by
// callbackMu.
func (m *Manager) SetResolvedCallback(cb func(alertID string)) {
	m.callbackMu.Lock()
	m.onResolved = cb
	m.callbackMu.Unlock()
}

// SubscribeResolvedCallback adds cb as an extra resolved-alert subscriber,
// leaving the legacy single callback slot untouched. Calling the returned
// function removes the subscription. A nil cb registers nothing and yields a
// no-op remover.
func (m *Manager) SubscribeResolvedCallback(cb func(alertID string)) func() {
	if cb == nil {
		return func() {}
	}

	// Allocate the next subscription ID and store the callback under it.
	m.callbackMu.Lock()
	m.nextCallbackID++
	subscriptionID := m.nextCallbackID
	m.resolvedSubs[subscriptionID] = cb
	m.callbackMu.Unlock()

	unsubscribe := func() {
		m.callbackMu.Lock()
		defer m.callbackMu.Unlock()
		delete(m.resolvedSubs, subscriptionID)
	}
	return unsubscribe
}

// SetAcknowledgedCallback installs cb as the callback for acknowledged alerts,
// replacing any previous value. The write is guarded by callbackMu.
func (m *Manager) SetAcknowledgedCallback(cb func(alert *Alert, user string)) {
	m.callbackMu.Lock()
	m.onAcknowledged = cb
	m.callbackMu.Unlock()
}

// SetUnacknowledgedCallback installs cb as the callback for unacknowledged
// alerts, replacing any previous value. The write is guarded by callbackMu.
func (m *Manager) SetUnacknowledgedCallback(cb func(alert *Alert, user string)) {
	m.callbackMu.Lock()
	m.onUnacknowledged = cb
	m.callbackMu.Unlock()
}

// SetEscalateCallback installs cb as the callback for escalated alerts,
// replacing any previous value. The write is guarded by callbackMu.
func (m *Manager) SetEscalateCallback(cb func(alert *Alert, level int)) {
	m.callbackMu.Lock()
	m.onEscalate = cb
	m.callbackMu.Unlock()
}

// getAlertCallback returns the legacy single-slot new-alert callback, read
// under callbackMu. The result is nil when no callback has been set.
func (m *Manager) getAlertCallback() func(alert *Alert) {
	m.callbackMu.RLock()
	defer m.callbackMu.RUnlock()
	return m.onAlert
}

// getAlertCallbacks snapshots every new-alert callback under callbackMu: the
// legacy slot first (when set), followed by all non-nil subscribers in map
// iteration order. The returned slice is safe to use after the lock is
// released.
func (m *Manager) getAlertCallbacks() []func(alert *Alert) {
	m.callbackMu.RLock()
	defer m.callbackMu.RUnlock()

	snapshot := make([]func(alert *Alert), 0, 1+len(m.alertSubs))
	if legacy := m.onAlert; legacy != nil {
		snapshot = append(snapshot, legacy)
	}
	for _, subscriber := range m.alertSubs {
		if subscriber == nil {
			continue
		}
		snapshot = append(snapshot, subscriber)
	}
	return snapshot
}

// getAlertForAICallback returns the legacy single-slot AI alert callback, read
// under callbackMu. The result is nil when no callback has been set.
func (m *Manager) getAlertForAICallback() func(alert *Alert) {
	m.callbackMu.RLock()
	defer m.callbackMu.RUnlock()
	return m.onAlertForAI
}

// getAlertForAICallbacks snapshots every AI alert callback under callbackMu:
// the legacy slot first (when set), followed by all non-nil subscribers in map
// iteration order. The returned slice is safe to use after the lock is
// released.
func (m *Manager) getAlertForAICallbacks() []func(alert *Alert) {
	m.callbackMu.RLock()
	defer m.callbackMu.RUnlock()

	snapshot := make([]func(alert *Alert), 0, 1+len(m.alertForAISubs))
	if legacy := m.onAlertForAI; legacy != nil {
		snapshot = append(snapshot, legacy)
	}
	for _, subscriber := range m.alertForAISubs {
		if subscriber == nil {
			continue
		}
		snapshot = append(snapshot, subscriber)
	}
	return snapshot
}

// getResolvedCallback returns the legacy single-slot resolved-alert callback,
// read under callbackMu. The result is nil when no callback has been set.
func (m *Manager) getResolvedCallback() func(alertID string) {
	m.callbackMu.RLock()
	defer m.callbackMu.RUnlock()
	return m.onResolved
}

// getResolvedCallbacks snapshots every resolved-alert callback under
// callbackMu: the legacy slot first (when set), followed by all non-nil
// subscribers in map iteration order. The returned slice is safe to use after
// the lock is released.
func (m *Manager) getResolvedCallbacks() []func(alertID string) {
	m.callbackMu.RLock()
	defer m.callbackMu.RUnlock()

	snapshot := make([]func(alertID string), 0, 1+len(m.resolvedSubs))
	if legacy := m.onResolved; legacy != nil {
		snapshot = append(snapshot, legacy)
	}
	for _, subscriber := range m.resolvedSubs {
		if subscriber == nil {
			continue
		}
		snapshot = append(snapshot, subscriber)
	}
	return snapshot
}

// getAcknowledgedCallback returns the acknowledged-alert callback, read under
// callbackMu. The result is nil when no callback has been set.
func (m *Manager) getAcknowledgedCallback() func(alert *Alert, user string) {
	m.callbackMu.RLock()
	defer m.callbackMu.RUnlock()
	return m.onAcknowledged
}

// getUnacknowledgedCallback returns the unacknowledged-alert callback, read
// under callbackMu. The result is nil when no callback has been set.
func (m *Manager) getUnacknowledgedCallback() func(alert *Alert, user string) {
	m.callbackMu.RLock()
	defer m.callbackMu.RUnlock()
	return m.onUnacknowledged
}

// getEscalateCallback returns the escalation callback, read under callbackMu.
// The result is nil when no callback has been set.
func (m *Manager) getEscalateCallback() func(alert *Alert, level int) {
	m.callbackMu.RLock()
	defer m.callbackMu.RUnlock()
	return m.onEscalate
}

// safeCallResolvedAlertCallback invokes every resolved-alert callback (the
// legacy onResolved slot plus all subscribers) with panic recovery while
// preserving canonical state as the internal identity and emitting the public
// alert ID to external callbacks for compatibility.
//
// Each callback receives the public ID derived from alert and fallbackID.
// Panics are recovered and logged per callback, so one failing subscriber does
// not stop the remaining ones from being notified. When async is true the
// callbacks run sequentially on a new goroutine; otherwise they run on the
// caller's goroutine before this method returns.
func (m *Manager) safeCallResolvedAlertCallback(alert *Alert, fallbackID string, async bool) {
	callbacks := m.getResolvedCallbacks()
	if len(callbacks) == 0 {
		return
	}

	publicID := exportedAlertID(alert, fallbackID)
	trackingKey := canonicalTrackingKeyForAlert(alert)

	// invoke isolates a single callback behind its own recover.
	invoke := func(callback func(alertID string)) {
		defer func() {
			if r := recover(); r != nil {
				log.Error().
					Interface("panic", r).
					Str("alertID", publicID).
					Str("trackingKey", trackingKey).
					Msg("Panic in onResolved callback")
			}
		}()
		callback(publicID)
	}

	notifyAll := func() {
		for _, callback := range callbacks {
			invoke(callback)
		}
	}

	if async {
		go notifyAll()
		return
	}
	notifyAll()
}

// safeCallAcknowledgedCallback fires the onAcknowledged callback on its own
// goroutine, handing it a clone of alert and the acting user. It does nothing
// when alert is nil or no callback is registered. A panic raised by the
// callback is recovered and logged.
func (m *Manager) safeCallAcknowledgedCallback(alert *Alert, user string) {
	if alert == nil {
		return
	}
	onAck := m.getAcknowledgedCallback()
	if onAck == nil {
		return
	}

	// The callback works on a clone rather than the manager's own alert.
	snapshot := cloneAlertForOutput(alert)
	go func() {
		defer func() {
			recovered := recover()
			if recovered == nil {
				return
			}
			log.Error().
				Interface("panic", recovered).
				Str("alertID", snapshot.ID).
				Msg("Panic in onAcknowledged callback")
		}()
		onAck(snapshot, user)
	}()
}

// safeCallUnacknowledgedCallback fires the onUnacknowledged callback on its
// own goroutine, handing it a clone of alert and the acting user. It does
// nothing when alert is nil or no callback is registered. A panic raised by
// the callback is recovered and logged.
func (m *Manager) safeCallUnacknowledgedCallback(alert *Alert, user string) {
	if alert == nil {
		return
	}
	onUnack := m.getUnacknowledgedCallback()
	if onUnack == nil {
		return
	}

	// The callback works on a clone rather than the manager's own alert.
	snapshot := cloneAlertForOutput(alert)
	go func() {
		defer func() {
			recovered := recover()
			if recovered == nil {
				return
			}
			log.Error().
				Interface("panic", recovered).
				Str("alertID", snapshot.ID).
				Msg("Panic in onUnacknowledged callback")
		}()
		onUnack(snapshot, user)
	}()
}

// safeCallEscalateCallback invokes onEscalate with panic recovery and alert cloning.
//
// The callback runs on its own goroutine with a clone of alert and the
// escalation level. Nothing happens when no callback is registered or when
// alert is nil, matching the acknowledged/unacknowledged helpers.
func (m *Manager) safeCallEscalateCallback(alert *Alert, level int) {
	callback := m.getEscalateCallback()
	// Guard against nil alerts as well: the recovery handler reads a.ID.
	if callback == nil || alert == nil {
		return
	}

	// Clone alert to prevent concurrent modification
	alertCopy := cloneAlertForOutput(alert)
	go func(a *Alert, lvl int) {
		defer func() {
			if r := recover(); r != nil {
				log.Error().
					Interface("panic", r).
					Str("alertID", a.ID).
					Int("level", lvl).
					Msg("Panic in onEscalate callback")
			}
		}()
		callback(a, lvl)
	}(alertCopy, level)
}

// checkFlappingLocked detects alert flapping and returns true if alert should be suppressed.
// It modifies flappingHistory, flappingActive, and suppressedUntil maps.
// IMPORTANT: Caller MUST hold m.mu before calling this function.
//
// Every call records one state change for trackingKey at the current time and
// discards recorded changes older than FlappingWindowSeconds. The key is
// flapping once the number of changes left in the window reaches
// FlappingThreshold. On the first detection the key is marked active, a
// cooldown of FlappingCooldownMinutes is written to suppressedUntil, and the
// suppression metric is recorded. Returns false when flapping detection is
// disabled.
func (m *Manager) checkFlappingLocked(trackingKey string) bool {
	if !m.config.FlappingEnabled {
		return false
	}

	now := time.Now()
	windowDuration := time.Duration(m.config.FlappingWindowSeconds) * time.Second
	threshold := m.config.FlappingThreshold

	// Record this state change, then keep only the changes inside the window.
	history := append(m.flappingHistory[trackingKey], now)
	validHistory := make([]time.Time, 0, len(history))
	for _, t := range history {
		if now.Sub(t) <= windowDuration {
			validHistory = append(validHistory, t)
		}
	}

	// Bound the history to prevent unbounded growth. The bound must never be
	// smaller than the configured threshold, otherwise a threshold above the
	// default bound could never be reached and flapping would go undetected.
	const maxFlappingHistory = 10
	historyLimit := maxFlappingHistory
	if threshold > historyLimit {
		historyLimit = threshold
	}
	if len(validHistory) > historyLimit {
		validHistory = validHistory[len(validHistory)-historyLimit:]
	}
	m.flappingHistory[trackingKey] = validHistory

	// Below the threshold: not flapping.
	if len(validHistory) < threshold {
		return false
	}

	// Only the transition into the flapping state logs, starts the cooldown
	// and records the metric; later calls while still flapping just suppress.
	if !m.flappingActive[trackingKey] {
		log.Warn().
			Str("trackingKey", trackingKey).
			Int("stateChanges", len(validHistory)).
			Int("threshold", threshold).
			Int("windowSeconds", m.config.FlappingWindowSeconds).
			Msg("Flapping detected - suppressing alert")

		m.flappingActive[trackingKey] = true

		// Set cooldown period
		cooldownDuration := time.Duration(m.config.FlappingCooldownMinutes) * time.Minute
		m.suppressedUntil[trackingKey] = now.Add(cooldownDuration)

		// Record suppression metric
		if recordAlertSuppressed != nil {
			recordAlertSuppressed("flapping")
		}
	}
	return true
}

// dispatchAlert delivers alert to every registered new-alert callback (the
// legacy onAlert slot plus all subscribers) unless the notification is
// suppressed. It returns true when the callbacks were invoked (or scheduled,
// when async is true) and false when nothing was dispatched.
//
// Suppression checks run in this order: no callbacks or nil alert, alert
// already acknowledged, flapping, activation state other than active, quiet
// hours, monitor-only metadata. The flapping check records a state change, so
// it has side effects even when a later check suppresses the alert.
//
// Callers must hold m.mu: the method reads m.config and calls
// checkFlappingLocked. With async=false the callbacks therefore run while the
// caller still holds that lock.
//
// Callbacks receive a clone of the alert. Panics are recovered and logged per
// callback so one failing subscriber cannot prevent the others from running.
func (m *Manager) dispatchAlert(alert *Alert, async bool) bool {
	callbacks := m.getAlertCallbacks()
	if len(callbacks) == 0 || alert == nil {
		return false
	}

	// Don't dispatch notifications for acknowledged alerts
	if alert.Acknowledged {
		log.Debug().
			Str("alertID", alert.ID).
			Str("ackUser", alert.AckUser).
			Msg("Alert notification suppressed - already acknowledged")
		return false
	}

	trackingKey := canonicalTrackingKeyForAlert(alert)

	// Check for flapping (caller must hold m.mu)
	if m.checkFlappingLocked(trackingKey) {
		log.Debug().
			Str("alertID", alert.ID).
			Str("trackingKey", trackingKey).
			Msg("Alert suppressed due to flapping")
		return false
	}

	// Check activation state - only dispatch notifications if active
	if m.config.ActivationState != ActivationActive {
		log.Debug().
			Str("alertID", alert.ID).
			Str("activationState", string(m.config.ActivationState)).
			Msg("Alert notification suppressed - not activated")
		return false
	}

	if suppressed, reason := m.shouldSuppressNotification(alert); suppressed {
		log.Debug().
			Str("alertID", alert.ID).
			Str("type", alert.Type).
			Str("level", string(alert.Level)).
			Str("quietHoursRule", reason).
			Msg("Alert notification suppressed during quiet hours")
		return false
	}

	if isMonitorOnlyAlert(alert) {
		log.Info().
			Str("alertID", alert.ID).
			Str("resource", alert.ResourceName).
			Bool("monitorOnly", true).
			Msg("Monitor-only alert detected, skipping alert dispatch")
		return false
	}

	// Record metric for fired alert
	if recordAlertFired != nil {
		recordAlertFired(alert)
	}

	alertCopy := cloneAlertForOutput(alert)

	// notifyAll runs the callbacks in order, each behind its own recover so a
	// panic in one subscriber does not skip the rest or crash the service.
	notifyAll := func(a *Alert, fns []func(*Alert), panicMsg string) {
		for _, callback := range fns {
			func() {
				defer func() {
					if r := recover(); r != nil {
						log.Error().
							Interface("panic", r).
							Str("alertID", a.ID).
							Str("type", a.Type).
							Msg(panicMsg)
					}
				}()
				callback(a)
			}()
		}
	}

	if async {
		go notifyAll(alertCopy, callbacks, "Panic in onAlert callback")
	} else {
		notifyAll(alertCopy, callbacks, "Panic in onAlert callback (synchronous)")
	}
	return true
}

// isMonitorOnlyAlert reports whether the alert's metadata flags it as
// monitor-only. The "monitorOnly" entry counts when it is the bool true or a
// string equal to "true" ignoring case; a nil alert, nil metadata, a missing
// key, or any other value type yields false.
func isMonitorOnlyAlert(alert *Alert) bool {
	if alert == nil || alert.Metadata == nil {
		return false
	}

	raw, present := alert.Metadata["monitorOnly"]
	if !present {
		return false
	}
	if flag, isBool := raw.(bool); isBool {
		return flag
	}
	if text, isString := raw.(string); isString {
		return strings.EqualFold(text, "true")
	}
	return false
}

// ensureValidHysteresis repairs a threshold whose clear value is not strictly
// below its trigger. Nil thresholds and disabled ones (trigger <= 0) are left
// alone. An invalid pair is logged under metricName and the clear value is
// reset to 5 units below the trigger, floored at 0.
func ensureValidHysteresis(threshold *HysteresisThreshold, metricName string) {
	// Nothing to validate for a missing or disabled threshold.
	if threshold == nil || threshold.Trigger <= 0 {
		return
	}
	// Written as a negated >= so NaN values behave exactly like a valid pair.
	if !(threshold.Clear >= threshold.Trigger) {
		return
	}

	log.Warn().
		Str("metric", metricName).
		Float64("trigger", threshold.Trigger).
		Float64("clear", threshold.Clear).
		Msg("Invalid hysteresis: clear >= trigger, auto-fixing")

	// Auto-fix: place clear 5 units below trigger, never below zero.
	corrected := threshold.Trigger - 5
	if corrected < 0 {
		corrected = 0
	}
	threshold.Clear = corrected
}

// UpdateConfig updates the alert configuration.
//
// The whole update runs under m.mu. The incoming config (received by value)
// is normalized and validated before it replaces m.config:
//   - an empty ActivationState inherits the current state (and activation
//     time, when the incoming one is nil);
//   - every defaults section is normalized, powered-off severities are
//     canonicalized, and Docker ignored-container prefixes are normalized;
//   - the activation-state migration, hysteresis validation, and quiet-hours
//     timezone validation run last.
//
// After the swap the per-resource overrides are normalized, the cached
// quiet-hours location is refreshed (time.Local when quiet hours are
// disabled, no timezone is set, or the timezone fails to load), snapshot and
// backup alerts are cleared if their sections are disabled, global offline
// settings are applied, and active alerts are re-evaluated against the new
// thresholds.
func (m *Manager) UpdateConfig(config AlertConfig) {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Preserve activation state/time when clients update the config without including it.
	// This avoids unintentionally resetting alerts to pending review when saving thresholds.
	if config.ActivationState == "" && m.config.ActivationState != "" {
		config.ActivationState = m.config.ActivationState
		if config.ActivationTime == nil && m.config.ActivationTime != nil {
			config.ActivationTime = m.config.ActivationTime
		}
	}

	// Normalize all config sections
	NormalizeAlertConfigAliases(&config)
	normalizeStorageDefaults(&config)
	normalizeDockerDefaults(&config)
	normalizePMGDefaults(&config)
	normalizeSnapshotDefaults(&config)
	normalizeBackupDefaults(&config)
	normalizeNodeDefaults(&config)
	normalizeAgentDefaults(&config)
	normalizeGeneralSettings(&config)
	normalizeTimeThresholds(&config)

	config.GuestDefaults.PoweredOffSeverity = normalizePoweredOffSeverity(config.GuestDefaults.PoweredOffSeverity)
	config.NodeDefaults.PoweredOffSeverity = normalizePoweredOffSeverity(config.NodeDefaults.PoweredOffSeverity)
	config.DockerIgnoredContainerPrefixes = NormalizeDockerIgnoredPrefixes(config.DockerIgnoredContainerPrefixes)

	// Migration logic for activation state (backward compatibility)
	m.migrateActivationState(&config)

	// Validate hysteresis thresholds to prevent stuck alerts
	validateHysteresisThresholds(&config)

	// Validate timezone if quiet hours are enabled
	validateQuietHoursTimezone(&config)

	m.config = config
	normalizeOverrides(m.config.Overrides)

	// Update cached quiet hours location
	if m.config.Schedule.QuietHours.Enabled && m.config.Schedule.QuietHours.Timezone != "" {
		loc, err := time.LoadLocation(m.config.Schedule.QuietHours.Timezone)
		if err == nil {
			m.quietHoursLoc = loc
		} else {
			// Unloadable timezone: fall back to the server's local zone.
			m.quietHoursLoc = time.Local
		}
	} else {
		m.quietHoursLoc = time.Local
	}

	// Drop alerts belonging to sections that are disabled in the new config.
	if !m.config.SnapshotDefaults.Enabled {
		m.clearSnapshotAlertsForInstanceLocked("")
	}
	if !m.config.BackupDefaults.Enabled {
		m.clearBackupAlertsLocked()
	}

	m.applyGlobalOfflineSettingsLocked()

	log.Info().
		Bool("enabled", config.Enabled).
		Interface("guestDefaults", config.GuestDefaults).
		Msg("Alert configuration updated")

	// Re-evaluate active alerts against new thresholds
	m.reevaluateActiveAlertsLocked()
}

// normalizeStorageDefaults fills in the storage default threshold in place.
// A negative trigger is replaced by the 85/80 default pair. Trigger=0 is
// allowed, means "disable storage alerting", and forces clear to 0 as well.
// A positive trigger with a non-positive clear gets clear set to 5 below the
// trigger, floored at 0.
func normalizeStorageDefaults(config *AlertConfig) {
	storage := &config.StorageDefault

	switch {
	case storage.Trigger < 0:
		storage.Trigger = 85
		storage.Clear = 80
	case storage.Trigger == 0:
		// Disabled: keep the pair consistent.
		storage.Clear = 0
	case storage.Clear <= 0:
		storage.Clear = storage.Trigger - 5
		if storage.Clear < 0 {
			storage.Clear = 0
		}
	}
}

// normalizeDockerThreshold returns a normalized copy of a single Docker
// threshold; the argument is received by value and the caller's copy is not
// modified.
//
// A negative trigger counts as unset and becomes defaultTrigger. A trigger of
// 0 is an explicit disable: it is kept, and only a negative clear is raised to
// 0. Otherwise a non-positive clear becomes trigger-5 (floored at 0) and the
// pair is passed through ensureValidHysteresis under metricName.
func normalizeDockerThreshold(th HysteresisThreshold, defaultTrigger float64, metricName string) HysteresisThreshold {
	if th.Trigger < 0 {
		th.Trigger = defaultTrigger
	}

	// Explicit disable: nothing further to derive or validate.
	if th.Trigger == 0 {
		if th.Clear < 0 {
			th.Clear = 0
		}
		return th
	}

	if th.Clear <= 0 {
		derived := th.Trigger - 5
		if derived < 0 {
			derived = 0
		}
		th.Clear = derived
	}

	ensureValidHysteresis(&th, metricName)
	return th
}

// normalizeDockerDefaults fills in and sanitizes the Docker defaults in place.
// CPU, memory and disk thresholds go through normalizeDockerThreshold. Counts,
// windows and percentages that are zero or negative fall back to their
// built-in values. The service critical gap is raised to the warning gap when
// it is smaller. The powered-off severity is canonicalized, and an update
// alert delay of exactly 0 becomes 24 hours.
func normalizeDockerDefaults(config *AlertConfig) {
	docker := &config.DockerDefaults

	docker.CPU = normalizeDockerThreshold(docker.CPU, 80, "docker.cpu")
	docker.Memory = normalizeDockerThreshold(docker.Memory, 85, "docker.memory")
	docker.Disk = normalizeDockerThreshold(docker.Disk, 85, "docker.disk")

	// Restart-loop detection.
	if docker.RestartCount <= 0 {
		docker.RestartCount = 3
	}
	if docker.RestartWindow <= 0 {
		docker.RestartWindow = 300 // 5 minutes
	}

	// Memory percentage levels.
	if docker.MemoryWarnPct <= 0 {
		docker.MemoryWarnPct = 90
	}
	if docker.MemoryCriticalPct <= 0 {
		docker.MemoryCriticalPct = 95
	}

	// Service gap levels; the critical gap is always positive once defaulted.
	if docker.ServiceWarnGapPct <= 0 {
		docker.ServiceWarnGapPct = 10
	}
	if docker.ServiceCritGapPct <= 0 {
		docker.ServiceCritGapPct = 50
	}
	if docker.ServiceCritGapPct < docker.ServiceWarnGapPct {
		log.Warn().
			Int("warnGapPercent", docker.ServiceWarnGapPct).
			Int("criticalGapPercent", docker.ServiceCritGapPct).
			Msg("Adjusting Docker service critical gap to match warning gap")
		docker.ServiceCritGapPct = docker.ServiceWarnGapPct
	}

	if docker.StatePoweredOffSeverity == "" {
		docker.StatePoweredOffSeverity = AlertLevelWarning
	}
	docker.StatePoweredOffSeverity = normalizePoweredOffSeverity(docker.StatePoweredOffSeverity)

	// Only an unset (zero) delay is defaulted, so a negative value such as -1
	// survives as an explicit "disabled" setting.
	if docker.UpdateAlertDelayHours == 0 {
		docker.UpdateAlertDelayHours = 24
	}
}

// normalizePMGDefaults fills in PMG (Proxmox Mail Gateway) defaults in place.
// Every threshold that is zero or negative is replaced by its built-in value;
// positive values are left untouched.
func normalizePMGDefaults(config *AlertConfig) {
	pmg := &config.PMGDefaults

	// Total queue depth.
	if pmg.QueueTotalWarning <= 0 {
		pmg.QueueTotalWarning = 500
	}
	if pmg.QueueTotalCritical <= 0 {
		pmg.QueueTotalCritical = 1000
	}

	// Age of the oldest queued message, in minutes.
	if pmg.OldestMessageWarnMins <= 0 {
		pmg.OldestMessageWarnMins = 30
	}
	if pmg.OldestMessageCritMins <= 0 {
		pmg.OldestMessageCritMins = 60
	}

	// Deferred and hold queues.
	if pmg.DeferredQueueWarn <= 0 {
		pmg.DeferredQueueWarn = 200
	}
	if pmg.DeferredQueueCritical <= 0 {
		pmg.DeferredQueueCritical = 500
	}
	if pmg.HoldQueueWarn <= 0 {
		pmg.HoldQueueWarn = 100
	}
	if pmg.HoldQueueCritical <= 0 {
		pmg.HoldQueueCritical = 300
	}

	// Quarantine sizes.
	if pmg.QuarantineSpamWarn <= 0 {
		pmg.QuarantineSpamWarn = 2000
	}
	if pmg.QuarantineSpamCritical <= 0 {
		pmg.QuarantineSpamCritical = 5000
	}
	if pmg.QuarantineVirusWarn <= 0 {
		pmg.QuarantineVirusWarn = 2000
	}
	if pmg.QuarantineVirusCritical <= 0 {
		pmg.QuarantineVirusCritical = 5000
	}

	// Quarantine growth: percentage plus minimum message count.
	if pmg.QuarantineGrowthWarnPct <= 0 {
		pmg.QuarantineGrowthWarnPct = 25
	}
	if pmg.QuarantineGrowthWarnMin <= 0 {
		pmg.QuarantineGrowthWarnMin = 250
	}
	if pmg.QuarantineGrowthCritPct <= 0 {
		pmg.QuarantineGrowthCritPct = 50
	}
	if pmg.QuarantineGrowthCritMin <= 0 {
		pmg.QuarantineGrowthCritMin = 500
	}
}

// normalizeSnapshotDefaults sanitizes the snapshot alert thresholds in place.
// The same rules apply to the age (days) pair and the size (GiB) pair:
// negative values become 0, a warning above a non-zero critical is lowered to
// the critical value, and a zero critical inherits a positive warning.
func normalizeSnapshotDefaults(config *AlertConfig) {
	snap := &config.SnapshotDefaults

	// Age thresholds.
	if snap.WarningDays < 0 {
		snap.WarningDays = 0
	}
	if snap.CriticalDays < 0 {
		snap.CriticalDays = 0
	}
	if snap.CriticalDays > 0 && snap.WarningDays > snap.CriticalDays {
		snap.WarningDays = snap.CriticalDays
	}
	if snap.CriticalDays == 0 && snap.WarningDays > 0 {
		snap.CriticalDays = snap.WarningDays
	}

	// Size thresholds.
	if snap.WarningSizeGiB < 0 {
		snap.WarningSizeGiB = 0
	}
	if snap.CriticalSizeGiB < 0 {
		snap.CriticalSizeGiB = 0
	}
	if snap.CriticalSizeGiB > 0 && snap.WarningSizeGiB > snap.CriticalSizeGiB {
		snap.WarningSizeGiB = snap.CriticalSizeGiB
	}
	if snap.CriticalSizeGiB == 0 && snap.WarningSizeGiB > 0 {
		snap.CriticalSizeGiB = snap.WarningSizeGiB
	}
}

// normalizeBackupDefaults sanitizes the backup alert settings in place.
// Negative day thresholds become 0 and a warning above a non-zero critical is
// lowered to it. FreshHours and StaleHours default to 24 and 72 when not
// positive, and StaleHours is never below FreshHours. A nil AlertOrphaned
// becomes a pointer to true. A non-empty IgnoreVMIDs list is trimmed, stripped
// of blank entries and de-duplicated, keeping first-seen order.
func normalizeBackupDefaults(config *AlertConfig) {
	backup := &config.BackupDefaults

	// Alert thresholds, in days.
	if backup.WarningDays < 0 {
		backup.WarningDays = 0
	}
	if backup.CriticalDays < 0 {
		backup.CriticalDays = 0
	}
	if backup.CriticalDays > 0 && backup.WarningDays > backup.CriticalDays {
		backup.WarningDays = backup.CriticalDays
	}

	// Default indicator thresholds for dashboard (separate from alert thresholds).
	if backup.FreshHours <= 0 {
		backup.FreshHours = 24
	}
	if backup.StaleHours <= 0 {
		backup.StaleHours = 72
	}
	if backup.StaleHours < backup.FreshHours {
		backup.StaleHours = backup.FreshHours
	}

	if backup.AlertOrphaned == nil {
		enabled := true
		backup.AlertOrphaned = &enabled
	}

	// An empty (or nil) ignore list is left exactly as it is.
	if len(backup.IgnoreVMIDs) == 0 {
		return
	}
	unique := make([]string, 0, len(backup.IgnoreVMIDs))
	seen := make(map[string]struct{}, len(backup.IgnoreVMIDs))
	for _, raw := range backup.IgnoreVMIDs {
		id := strings.TrimSpace(raw)
		if id == "" {
			continue
		}
		if _, duplicate := seen[id]; duplicate {
			continue
		}
		seen[id] = struct{}{}
		unique = append(unique, id)
	}
	backup.IgnoreVMIDs = unique
}

// backupIgnoreVMID reports whether vmID matches any entry of ignoreList.
//
// Entries are whitespace-trimmed before comparison and blank entries are
// skipped. An entry ending in "*" is a prefix pattern: it matches when vmID
// starts with the text preceding that final "*" (a bare "*" matches nothing).
// Every other entry must equal vmID exactly. An empty vmID never matches.
func backupIgnoreVMID(vmID string, ignoreList []string) bool {
	if vmID == "" || len(ignoreList) == 0 {
		return false
	}
	for _, raw := range ignoreList {
		pattern := strings.TrimSpace(raw)
		switch {
		case pattern == "":
			// Blank entries carry no pattern.
		case strings.HasSuffix(pattern, "*"):
			// Only the single trailing "*" is stripped; an empty stem never matches.
			if stem := pattern[:len(pattern)-1]; stem != "" && strings.HasPrefix(vmID, stem) {
				return true
			}
		case pattern == vmID:
			return true
		}
	}
	return false
}

// normalizeNodeDefaults makes sure the node temperature threshold is usable.
//
// A missing threshold or a negative Trigger is replaced by the stock 80/75
// pair so high temperatures alert out of the box. Trigger == 0 means
// "temperature alerting disabled" and forces Clear to 0 as well. For a positive
// Trigger with no usable Clear, Clear becomes Trigger-5, falling back to 75 if
// that result is not positive.
func normalizeNodeDefaults(config *AlertConfig) {
	temp := config.NodeDefaults.Temperature
	switch {
	case temp == nil || temp.Trigger < 0:
		config.NodeDefaults.Temperature = &HysteresisThreshold{Trigger: 80, Clear: 75}
	case temp.Trigger == 0:
		// Disabled: keep Clear in step with Trigger.
		temp.Clear = 0
	case temp.Clear <= 0:
		temp.Clear = temp.Trigger - 5
		if temp.Clear <= 0 {
			temp.Clear = 75
		}
	}
}

// normalizeAgentDefaults makes sure every host-agent threshold is usable.
//
// The same rules are applied to CPU (80/75), Memory (85/80), Disk (90/85) and
// DiskTemperature (55/50), where the pair is the stock Trigger/Clear:
//   - a missing threshold or a negative Trigger is replaced by the stock pair;
//   - Trigger == 0 means "alerting disabled for this metric" and forces Clear to 0;
//   - a positive Trigger with no usable Clear gets Clear = Trigger-5, falling
//     back to the stock Clear if that result is not positive.
//
// DiskTemperature is additionally passed through ensureValidHysteresis.
func normalizeAgentDefaults(config *AlertConfig) {
	fill := func(current *HysteresisThreshold, stockTrigger, stockClear float64) *HysteresisThreshold {
		switch {
		case current == nil || current.Trigger < 0:
			return &HysteresisThreshold{Trigger: stockTrigger, Clear: stockClear}
		case current.Trigger == 0:
			// Disabled: keep Clear in step with Trigger.
			current.Clear = 0
		case current.Clear <= 0:
			current.Clear = current.Trigger - 5
			if current.Clear <= 0 {
				current.Clear = stockClear
			}
		}
		return current
	}

	config.AgentDefaults.CPU = fill(config.AgentDefaults.CPU, 80, 75)
	config.AgentDefaults.Memory = fill(config.AgentDefaults.Memory, 85, 80)
	config.AgentDefaults.Disk = fill(config.AgentDefaults.Disk, 90, 85)

	config.AgentDefaults.DiskTemperature = fill(config.AgentDefaults.DiskTemperature, 55, 50)
	ensureValidHysteresis(config.AgentDefaults.DiskTemperature, "agent.diskTemperature")
}

// normalizeGeneralSettings replaces non-positive general alert settings with
// their defaults. Each setting is handled independently of the others.
func normalizeGeneralSettings(config *AlertConfig) {
	// Flap detection: window 300, threshold 5, cooldown 15.
	if config.FlappingWindowSeconds <= 0 {
		config.FlappingWindowSeconds = 300
	}
	if config.FlappingThreshold <= 0 {
		config.FlappingThreshold = 5
	}
	if config.FlappingCooldownMinutes <= 0 {
		config.FlappingCooldownMinutes = 15
	}

	// Threshold evaluation: delta 2.0, hysteresis margin 5.0.
	if config.MinimumDelta <= 0 {
		config.MinimumDelta = 2.0
	}
	if config.HysteresisMargin <= 0 {
		config.HysteresisMargin = 5.0
	}

	// Timing windows: suppression 5, observation 24.
	if config.SuppressionWindow <= 0 {
		config.SuppressionWindow = 5
	}
	if config.ObservationWindowHours <= 0 {
		config.ObservationWindowHours = 24
	}
}

// normalizeTimeThresholds normalizes config aliases and per-metric delays, then
// guarantees a valid delay for each core resource type.
//
// The "guest", "node", "storage", "pbs" and "agent" keys are set to the default
// delay of 5 when missing or negative. The optional "all" key is only corrected
// when it is present and negative; it is never created here.
func normalizeTimeThresholds(config *AlertConfig) {
	NormalizeAlertConfigAliases(config)
	config.MetricTimeThresholds = normalizeMetricTimeThresholds(config.MetricTimeThresholds)

	const fallbackDelay = 5
	if config.TimeThresholds == nil {
		config.TimeThresholds = make(map[string]int)
	}

	for _, resourceType := range []string{"guest", "node", "storage", "pbs", "agent"} {
		if current, present := config.TimeThresholds[resourceType]; !present || current < 0 {
			config.TimeThresholds[resourceType] = fallbackDelay
		}
	}

	if current, present := config.TimeThresholds["all"]; present && current < 0 {
		config.TimeThresholds["all"] = fallbackDelay
	}
}

// migrateActivationState fills in a missing activation state for configs that
// predate the field; configs that already carry a state are left untouched.
//
// An installation counts as existing when the manager holds active alerts or
// the config carries overrides. Existing installations are activated
// immediately (with ActivationTime set to now) so their behavior is preserved;
// anything else starts in the pending-review state.
func (m *Manager) migrateActivationState(config *AlertConfig) {
	if config.ActivationState != "" {
		return
	}

	if len(m.activeAlerts) == 0 && len(config.Overrides) == 0 {
		// Nothing indicates prior use: treat as a fresh install.
		config.ActivationState = ActivationPending
		log.Info().Msg("new installation: alerts pending activation")
		return
	}

	activatedAt := time.Now()
	config.ActivationState = ActivationActive
	config.ActivationTime = &activatedAt
	log.Info().Msg("migrating existing installation to active alert state")
}

// validateHysteresisThresholds runs ensureValidHysteresis over the guest, node
// and storage default thresholds, in that order, so that misconfigured
// hysteresis cannot leave alerts stuck.
func validateHysteresisThresholds(config *AlertConfig) {
	checks := []struct {
		threshold *HysteresisThreshold
		label     string
	}{
		{config.GuestDefaults.CPU, "guest.cpu"},
		{config.GuestDefaults.Memory, "guest.memory"},
		{config.GuestDefaults.Disk, "guest.disk"},
		{config.NodeDefaults.CPU, "node.cpu"},
		{config.NodeDefaults.Memory, "node.memory"},
		{config.NodeDefaults.Temperature, "node.temperature"},
		{&config.StorageDefault, "storage"},
	}
	for _, check := range checks {
		ensureValidHysteresis(check.threshold, check.label)
	}
}

// validateQuietHoursTimezone checks that the configured quiet-hours timezone
// can be loaded. Nothing is checked when quiet hours are disabled or no
// timezone is set. If loading fails, quiet hours are switched off rather than
// silently running against the wrong timezone.
func validateQuietHoursTimezone(config *AlertConfig) {
	tz := config.Schedule.QuietHours.Timezone
	if !config.Schedule.QuietHours.Enabled || tz == "" {
		return
	}

	if _, err := time.LoadLocation(tz); err != nil {
		log.Error().
			Err(err).
			Str("timezone", tz).
			Msg("Invalid timezone in quiet hours config, disabling quiet hours")
		config.Schedule.QuietHours.Enabled = false
	}
}

// normalizeOverrides normalizes all threshold overrides in place.
//
// For every entry the powered-off severity is normalized and a non-nil Usage
// threshold is passed through ensureHysteresisThreshold. IDs that parse as a
// canonical guest key are re-keyed to their clustered (stable) override key
// when one is available.
//
// When several IDs collapse onto the same key, an entry that was already stored
// under that key (priority 1) beats a re-keyed one (priority 0). Among entries
// of equal priority the lexicographically last original ID wins: IDs are
// visited in sorted order so the outcome does not depend on Go's randomized map
// iteration order.
func normalizeOverrides(overrides map[string]ThresholdConfig) {
	// Sort the IDs up front so collisions are resolved deterministically.
	ids := make([]string, 0, len(overrides))
	for id := range overrides {
		ids = append(ids, id)
	}
	sort.Strings(ids)

	normalized := make(map[string]ThresholdConfig, len(overrides))
	priorityByKey := make(map[string]int, len(overrides))
	for _, id := range ids {
		override := overrides[id]
		override.PoweredOffSeverity = normalizePoweredOffSeverity(override.PoweredOffSeverity)
		if override.Usage != nil {
			override.Usage = ensureHysteresisThreshold(override.Usage)
		}

		normalizedKey := id
		if ident, ok := parseCanonicalGuestKey(id); ok {
			if stableKey := clusteredGuestOverrideKey(ident); stableKey != "" {
				normalizedKey = stableKey
			}
		}

		// An entry already stored under its final key outranks a re-keyed one.
		priority := 0
		if normalizedKey == id {
			priority = 1
		}
		if existingPriority, exists := priorityByKey[normalizedKey]; exists && existingPriority > priority {
			continue
		}
		priorityByKey[normalizedKey] = priority
		normalized[normalizedKey] = override
	}

	// Rewrite the caller's map in place so existing references observe the result.
	for key := range overrides {
		delete(overrides, key)
	}
	for key, override := range normalized {
		overrides[key] = override
	}
}

// normalizeMetricTimeThresholds returns a cleaned copy of the per-resource,
// per-metric delay overrides.
//
// Resource types are canonicalized via canonicalAlertResourceType; types that
// canonicalize to "" or are unsupported legacy types (other than "all") are
// dropped. Metric names are trimmed and lower-cased; blank names and negative
// delays are dropped. The result is nil when the input is empty or nothing
// survives the cleaning.
func normalizeMetricTimeThresholds(input map[string]map[string]int) map[string]map[string]int {
	if len(input) == 0 {
		return nil
	}

	result := make(map[string]map[string]int)
	for rawType, delays := range input {
		resourceType := canonicalAlertResourceType(rawType)
		if resourceType == "" || len(delays) == 0 {
			continue
		}
		if resourceType != "all" && isUnsupportedLegacyAlertResourceType(resourceType) {
			continue
		}

		for rawMetric, value := range delays {
			metric := strings.ToLower(strings.TrimSpace(rawMetric))
			if metric == "" || value < 0 {
				continue
			}
			// Create the per-type bucket lazily so empty types never appear in the result.
			bucket, ok := result[resourceType]
			if !ok {
				bucket = make(map[string]int)
				result[resourceType] = bucket
			}
			bucket[metric] = value
		}
	}

	if len(result) == 0 {
		return nil
	}
	return result
}

// NormalizeMetricTimeThresholds is the exported entry point to
// normalizeMetricTimeThresholds, for use by other packages (e.g. config
// persistence). It returns that function's result unchanged.
func NormalizeMetricTimeThresholds(input map[string]map[string]int) map[string]map[string]int {
	cleaned := normalizeMetricTimeThresholds(input)
	return cleaned
}

// NormalizeDockerIgnoredPrefixes trims and de-duplicates the ignored Docker
// container prefixes.
//
// Blank entries are dropped. Duplicates are detected case-insensitively, and
// the first occurrence is kept with the user's original casing. The result is
// nil when the input is empty or nothing survives.
func NormalizeDockerIgnoredPrefixes(prefixes []string) []string {
	if len(prefixes) == 0 {
		return nil
	}

	folded := make(map[string]struct{}, len(prefixes))
	kept := make([]string, 0, len(prefixes))
	for _, raw := range prefixes {
		candidate := strings.TrimSpace(raw)
		if candidate == "" {
			continue
		}
		// Compare on the lower-cased form but keep the original spelling.
		lowered := strings.ToLower(candidate)
		if _, dup := folded[lowered]; !dup {
			folded[lowered] = struct{}{}
			kept = append(kept, candidate)
		}
	}

	if len(kept) == 0 {
		return nil
	}
	return kept
}

// applyGlobalOfflineSettingsLocked clears tracking and active alerts for globally disabled offline detectors.
// Caller must hold m.mu.
//
// Each Disable* flag is handled independently:
//   - DisableAllNodesOffline: clears connectivity alerts whose resourceType
//     metadata is "node" and resets nodeOfflineCount.
//   - DisableAllPBSOffline: clears connectivity alerts whose resourceType
//     metadata is "pbs" and drops the offline confirmation counter of each of
//     those PBS resources. Counters of other resource types are left alone.
//   - DisableAllGuestsOffline: clears powered-state alerts and drops their
//     offline confirmation counters.
//   - DisableAllDockerHostsOffline: clears connectivity alerts whose
//     resourceType metadata is "docker-host" and resets dockerOfflineCount.
//   - DisableAllDockerContainers: clears alerts whose effective ID starts with
//     "docker-container-" and resets all container tracking maps.
//   - DisableAllDockerServices: clears alerts whose effective ID starts with
//     "docker-service-".
//
// Matching keys are collected first and cleared afterwards, so the active-alert
// map is not modified while it is being ranged over.
func (m *Manager) applyGlobalOfflineSettingsLocked() {
	if m.config.DisableAllNodesOffline {
		var nodeAlerts []string
		for storageKey, alert := range m.activeAlerts {
			if alert != nil && alert.CanonicalKind == string(alertspecs.AlertSpecKindConnectivity) {
				if resourceType, _ := alert.Metadata["resourceType"].(string); resourceType == "node" {
					nodeAlerts = append(nodeAlerts, storageKey)
				}
			}
		}
		for _, alertID := range nodeAlerts {
			m.clearAlertNoLock(alertID)
		}
		m.nodeOfflineCount = make(map[string]int)
	}

	if m.config.DisableAllPBSOffline {
		var pbsAlerts []string
		for storageKey, alert := range m.activeAlerts {
			if alert != nil && alert.CanonicalKind == string(alertspecs.AlertSpecKindConnectivity) {
				if resourceType, _ := alert.Metadata["resourceType"].(string); resourceType == "pbs" {
					pbsAlerts = append(pbsAlerts, storageKey)
					// Only PBS resources lose their confirmation counter; other
					// connectivity alerts are unrelated to this flag.
					delete(m.offlineConfirmations, alert.ResourceID)
				}
			}
		}
		for _, alertID := range pbsAlerts {
			m.clearAlertNoLock(alertID)
		}
	}

	if m.config.DisableAllGuestsOffline {
		var guestAlerts []string
		for storageKey, alert := range m.activeAlerts {
			if alert != nil && alert.CanonicalKind == string(alertspecs.AlertSpecKindPoweredState) {
				guestAlerts = append(guestAlerts, storageKey)
				delete(m.offlineConfirmations, alert.ResourceID)
			}
		}
		for _, alertID := range guestAlerts {
			m.clearAlertNoLock(alertID)
		}
	}

	if m.config.DisableAllDockerHostsOffline {
		var hostAlerts []string
		for storageKey, alert := range m.activeAlerts {
			if alert != nil && alert.CanonicalKind == string(alertspecs.AlertSpecKindConnectivity) {
				if resourceType, _ := alert.Metadata["resourceType"].(string); resourceType == "docker-host" {
					hostAlerts = append(hostAlerts, storageKey)
				}
			}
		}
		for _, alertID := range hostAlerts {
			m.clearAlertNoLock(alertID)
		}
		m.dockerOfflineCount = make(map[string]int)
	}

	if m.config.DisableAllDockerContainers {
		var containerAlerts []string
		for storageKey, alert := range m.activeAlerts {
			// Container alerts are recognised by ID prefix rather than canonical kind.
			id := effectiveAlertID(alert, storageKey)
			if strings.HasPrefix(id, "docker-container-") {
				containerAlerts = append(containerAlerts, id)
			}
		}
		for _, alertID := range containerAlerts {
			m.clearAlertNoLock(alertID)
		}
		m.dockerStateConfirm = make(map[string]int)
		m.dockerRestartTracking = make(map[string]*dockerRestartRecord)
		m.dockerLastExitCode = make(map[string]int)
		m.dockerUpdateFirstSeen = make(map[string]time.Time)
		m.dockerUpdateFirstSeenByIdentity = make(map[string]time.Time)
	}
	if m.config.DisableAllDockerServices {
		var serviceAlerts []string
		for storageKey, alert := range m.activeAlerts {
			id := effectiveAlertID(alert, storageKey)
			if strings.HasPrefix(id, "docker-service-") {
				serviceAlerts = append(serviceAlerts, id)
			}
		}
		for _, alertID := range serviceAlerts {
			m.clearAlertNoLock(alertID)
		}
	}
}

// reevaluateActiveAlertsLocked re-evaluates all active alerts against the current configuration
// This should only be called with m.mu already locked
//
// The function works in two passes. The first pass walks every active alert,
// resolves the threshold that applies to it under the current config and
// collects the IDs of alerts that should no longer be active (resource class
// disabled, resource override disabled, threshold missing/disabled, or stored
// value below the new trigger/clear level). The second pass removes those
// alerts, records them as recently resolved, fires the resolved callback and
// finally persists the active alerts from a background goroutine.
//
// Evaluation uses the value stored on the alert (alert.Value); no fresh metric
// sample is consulted here.
func (m *Manager) reevaluateActiveAlertsLocked() {
	if len(m.activeAlerts) == 0 {
		return
	}

	// Track alerts that should be resolved
	alertsToResolve := make([]string, 0)

	for storageKey, alert := range m.activeAlerts {
		alertID := effectiveAlertID(alert, storageKey)
		backfillCanonicalIdentity(alert)
		// NOTE(review): alert is dereferenced below without a nil check, while
		// applyGlobalOfflineSettingsLocked guards against nil entries — confirm
		// that m.activeAlerts can never hold a nil alert here.
		resourceID := alert.ResourceID
		metricType := alert.Type
		if resourceID == "" || metricType == "" {
			// Fall back to the "{resourceID}-{metricType}" ID layout: the last
			// dash-separated segment is the metric, the rest is the resource ID.
			parts := strings.Split(alertID, "-")
			if len(parts) < 2 {
				continue
			}
			metricType = parts[len(parts)-1]
			resourceID = strings.Join(parts[:len(parts)-1], "-")
		}

		// Get the appropriate threshold based on resource type and ID
		var threshold *HysteresisThreshold

		resourceTypeMeta := ""
		if alert.Metadata != nil {
			if metaType, ok := alert.Metadata["resourceType"].(string); ok {
				resourceTypeMeta = canonicalAlertResourceType(metaType)
			}
		}

		// Check for PMG alerts by Type
		if alert.Type == "queue-depth" || alert.Type == "queue-deferred" || alert.Type == "queue-hold" || alert.Type == "message-age" {
			// This is a PMG alert
			if m.config.DisableAllPMG {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}
			// NOTE(review): when PMG alerts are enabled, control falls through to
			// the generic resolution below, where getThresholdForMetric has no
			// case for these types and the alert ends up resolved — confirm this
			// is intended.
		}

		// Check for agent alerts by canonicalized resourceType metadata.
		isAgentResource := false
		for _, key := range CanonicalResourceTypeKeys(resourceTypeMeta) {
			if key == "agent" {
				isAgentResource = true
				break
			}
		}
		if isAgentResource {
			if m.config.DisableAllAgents {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}
			// Overrides are keyed by raw host ID (without the "agent:" prefix
			// that hostResourceID adds to the resource ID used in alert IDs).
			rawHostID := stripHostResourcePrefix(resourceID)
			thresholds := m.resolveResourceThresholds("agent", rawHostID)
			if thresholds.Disabled {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}
			threshold = getThresholdForMetric(thresholds, metricType)
		}

		if alert.Type == "docker-host-offline" ||
			alert.Type == "docker-container-health" ||
			alert.Type == "docker-container-state" ||
			alert.Type == "docker-container-restart-loop" ||
			alert.Type == "docker-container-oom-kill" ||
			alert.Type == "docker-container-memory-limit" {
			// Non-metric Docker alerts are not governed by thresholds
			continue
		}

		if resourceTypeMeta == "docker-host" {
			// Check if all Docker host alerts are disabled
			if m.config.DisableAllDockerHosts {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}
			// No threshold evaluation for Docker hosts (connectivity handled separately)
			continue
		}
		if resourceTypeMeta == "app-container" {
			// Check if all Docker container alerts are disabled
			if m.config.DisableAllDockerContainers {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}
			// Gather the lower-cased container name/ID used for ignored-prefix
			// matching; the metadata name is only used when ResourceName is blank.
			containerName := strings.ToLower(strings.TrimSpace(alert.ResourceName))
			containerID := ""
			if alert.Metadata != nil {
				if val, ok := alert.Metadata["containerId"].(string); ok {
					containerID = strings.ToLower(strings.TrimSpace(val))
				}
				if val, ok := alert.Metadata["containerName"].(string); ok && containerName == "" {
					containerName = strings.ToLower(strings.TrimSpace(val))
				}
			}
			if matchesDockerIgnoredPrefix(containerName, containerID, m.config.DockerIgnoredContainerPrefixes) {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}
			// Start from the Docker defaults and layer a per-resource override on top.
			thresholds := ThresholdConfig{
				CPU:    cloneThreshold(&m.config.DockerDefaults.CPU),
				Memory: cloneThreshold(&m.config.DockerDefaults.Memory),
				Disk:   cloneThreshold(&m.config.DockerDefaults.Disk),
			}
			if override, exists := m.config.Overrides[resourceID]; exists {
				if override.Disabled {
					alertsToResolve = append(alertsToResolve, alertID)
					continue
				}
				thresholds = m.applyThresholdOverride(thresholds, override)
			}
			threshold = getThresholdForMetric(thresholds, metricType)
		}

		// Determine the resource type from the alert's metadata or instance
		// We need to check what kind of resource this is
		// Each branch below only applies while no threshold has been resolved yet.
		if threshold == nil && !strings.Contains(resourceID, ":") && (alert.Instance == "Node" || alert.Instance == alert.Node) {
			// This is a node alert
			// Check if all node alerts are disabled
			if m.config.DisableAllNodes {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}
			thresholds := m.resolveResourceThresholds("node", resourceID)
			if thresholds.Disabled {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}
			threshold = getThresholdForMetric(thresholds, metricType)
		} else if threshold == nil && (alert.Instance == "Storage" || strings.Contains(alert.ResourceID, ":storage/")) {
			// This is a storage alert
			// Check if all storage alerts are disabled
			if m.config.DisableAllStorage {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}
			thresholds := m.resolveResourceThresholds("storage", resourceID)
			if thresholds.Disabled {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}
			threshold = getThresholdForMetric(thresholds, metricType)
		} else if threshold == nil && alert.Instance == "PBS" {
			// This is a PBS alert
			// Check if all PBS alerts are disabled
			if m.config.DisableAllPBS {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}
			thresholds := m.resolveResourceThresholds("pbs", resourceID)
			if thresholds.Disabled {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}
			threshold = getThresholdForMetric(thresholds, metricType)
		}

		if threshold == nil {
			// This is a guest (vm/system-container) alert
			// Check if all guest alerts are disabled
			// Note that this branch is also the catch-all for any alert whose
			// threshold is still unresolved after the checks above.
			if m.config.DisableAllGuests {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}
			// We need to evaluate custom rules, but we don't have the guest object here.
			// For now, we'll mark these alerts for re-evaluation by the monitor.
			// The next poll cycle will properly evaluate them with custom rules.

			thresholds := m.resolveGuestThresholdOverride(cloneThresholdConfig(m.config.GuestDefaults), nil, resourceID)
			if thresholds.Disabled {
				alertsToResolve = append(alertsToResolve, alertID)
				continue
			}

			// If no custom rule context is available, reevaluation still uses the
			// shared default+override resolution path and waits for the next poll
			// to apply filter-driven guest rules.
			// Note: This doesn't consider custom rules - those will be evaluated
			// on the next poll cycle when we have the full guest object
			threshold = getThresholdForMetric(thresholds, metricType)
		}

		// If no threshold found or threshold is disabled (trigger <= 0), resolve the alert
		if threshold == nil || threshold.Trigger <= 0 {
			alertsToResolve = append(alertsToResolve, alertID)
			continue
		}

		// Check if current value is now below the clear threshold
		// A non-positive Clear means "no separate clear level": use Trigger instead.
		clearThreshold := threshold.Clear
		if clearThreshold <= 0 {
			clearThreshold = threshold.Trigger
		}

		if alert.Value <= clearThreshold {
			// Alert should be resolved due to new threshold
			alertsToResolve = append(alertsToResolve, alertID)
			log.Info().
				Str("alertID", alertID).
				Float64("value", alert.Value).
				Float64("oldThreshold", alert.Threshold).
				Float64("newClearThreshold", clearThreshold).
				Msg("Resolving alert due to threshold change")
		} else if alert.Value < threshold.Trigger {
			// Value is between clear and trigger thresholds after config change
			// Resolve it to prevent confusion
			alertsToResolve = append(alertsToResolve, alertID)
			log.Info().
				Str("alertID", alertID).
				Float64("value", alert.Value).
				Float64("newTrigger", threshold.Trigger).
				Float64("newClear", clearThreshold).
				Msg("Resolving alert - value now below trigger threshold after config change")
		}
	}

	// Resolve all alerts that should be cleared
	for _, alertID := range alertsToResolve {
		if alert, exists := m.getActiveAlertNoLock(alertID); exists {
			resolvedAlert := &ResolvedAlert{
				Alert:        alert,
				ResolvedTime: time.Now(),
			}

			// Remove any pending notification tracking for this alert since it's no longer valid.
			trackingKey := canonicalTrackingKeyForAlert(alert)
			if _, isPending := m.pendingAlerts[trackingKey]; isPending {
				delete(m.pendingAlerts, trackingKey)
				log.Debug().
					Str("alertID", alertID).
					Msg("Cleared pending alert after configuration update")
			}

			// Remove from active alerts
			m.removeActiveAlertNoLock(alertID)

			// Add to recently resolved while respecting lock ordering
			m.addRecentlyResolvedWithPrimaryLock(resolvedAlert)

			log.Info().
				Str("alertID", alertID).
				Msg("Alert auto-resolved after configuration change")

			m.safeCallResolvedAlertCallback(resolvedAlert.Alert, alertID, true)
		}
	}

	// Save updated active alerts if any were resolved
	// The save runs in its own goroutine; a panic inside it is recovered and logged.
	if len(alertsToResolve) > 0 {
		go func() {
			defer func() {
				if r := recover(); r != nil {
					log.Error().Interface("panic", r).Msg("panic in SaveActiveAlerts goroutine (config update)")
				}
			}()
			if err := m.SaveActiveAlerts(); err != nil {
				log.Error().Err(err).Msg("failed to save active alerts after config update")
			}
		}()
	}
}

// ReevaluateGuestAlert re-checks one guest's active metric alerts against its
// fully resolved thresholds (including custom rules). It is meant to be called
// by the monitor with the current guest state and takes m.mu for the duration.
//
// For each of the guest metrics (cpu, memory, disk, diskRead, diskWrite,
// networkIn, networkOut) the active alert is looked up by its canonical ID,
// falling back to the legacy "<guestID>-<metric>" ID. The alert is cleared when
// the metric's threshold is missing or disabled (Trigger <= 0), or when the
// stored alert value is at or below the clear level or below the trigger.
func (m *Manager) ReevaluateGuestAlert(guest any, guestID string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Resolve once per call; this includes custom rule evaluation.
	resolved := m.getGuestThresholds(guest, guestID)

	for _, metricType := range []string{"cpu", "memory", "disk", "diskRead", "diskWrite", "networkIn", "networkOut"} {
		alertID := canonicalMetricStateID(guestID, metricType)
		alert, found := m.getActiveAlertNoLock(alertID)
		if !found {
			// Fall back to the legacy "<guestID>-<metric>" ID.
			alert, found = m.getActiveAlertNoLock(fmt.Sprintf("%s-%s", guestID, metricType))
			if !found {
				continue
			}
		}

		trackingKey := canonicalTrackingKeyForAlert(alert)
		if trackingKey == "" {
			trackingKey = alertID
		}

		var limit *HysteresisThreshold
		switch metricType {
		case "cpu":
			limit = resolved.CPU
		case "memory":
			limit = resolved.Memory
		case "disk":
			limit = resolved.Disk
		case "diskRead":
			limit = resolved.DiskRead
		case "diskWrite":
			limit = resolved.DiskWrite
		case "networkIn":
			limit = resolved.NetworkIn
		case "networkOut":
			limit = resolved.NetworkOut
		}

		// Missing or disabled threshold: drop the alert and any pending entry.
		if limit == nil || limit.Trigger <= 0 {
			m.clearAlertNoLock(trackingKey)
			if _, pending := m.pendingAlerts[trackingKey]; pending {
				delete(m.pendingAlerts, trackingKey)
				log.Debug().
					Str("alertID", alertID).
					Msg("Cleared pending alert - threshold disabled")
			}
			log.Info().
				Str("alertID", alertID).
				Str("metric", metricType).
				Msg("Cleared alert - threshold disabled")
			continue
		}

		// A non-positive Clear means the trigger doubles as the clear level.
		clearAt := limit.Clear
		if clearAt <= 0 {
			clearAt = limit.Trigger
		}

		if alert.Value > clearAt && alert.Value >= limit.Trigger {
			// Still above both levels: the alert stays active.
			continue
		}
		m.clearAlertNoLock(trackingKey)
		log.Info().
			Str("alertID", alertID).
			Str("metric", metricType).
			Float64("value", alert.Value).
			Float64("trigger", limit.Trigger).
			Float64("clear", clearAt).
			Msg("Cleared alert - value now below threshold after config change")
	}
}

// getThresholdForMetric picks the threshold for metricType out of config.
//
// Recognised metric types are "cpu", "memory", "disk", "diskRead",
// "diskWrite", "networkIn", "networkOut", "temperature" and "usage". Any other
// metric type yields nil, as does a recognised metric whose field is nil.
func getThresholdForMetric(config ThresholdConfig, metricType string) *HysteresisThreshold {
	var selected *HysteresisThreshold
	switch metricType {
	case "cpu":
		selected = config.CPU
	case "memory":
		selected = config.Memory
	case "disk":
		selected = config.Disk
	case "diskRead":
		selected = config.DiskRead
	case "diskWrite":
		selected = config.DiskWrite
	case "networkIn":
		selected = config.NetworkIn
	case "networkOut":
		selected = config.NetworkOut
	case "temperature":
		selected = config.Temperature
	case "usage":
		selected = config.Usage
	}
	return selected
}

// getThresholdForMetricFromConfig looks up the threshold for metricType via
// getThresholdForMetric and, when one exists, passes it through
// ensureHysteresisThreshold so its hysteresis is properly set. It returns nil
// when the config has no threshold for that metric.
func getThresholdForMetricFromConfig(config ThresholdConfig, metricType string) *HysteresisThreshold {
	if found := getThresholdForMetric(config, metricType); found != nil {
		return ensureHysteresisThreshold(found)
	}
	return nil
}

// isInQuietHours reports whether the current time falls inside the configured
// quiet-hours window.
//
// It returns false when quiet hours are disabled, when the current weekday (in
// the quiet-hours timezone) is not enabled, or when the start/end times fail to
// parse as "15:04". The timezone comes from the cached m.quietHoursLoc; if no
// location is cached it is loaded from the config (falling back to local time
// on error) and cached. A window whose end is earlier than its start is treated
// as spanning midnight. Both window bounds are exclusive.
func (m *Manager) isInQuietHours() bool {
	quiet := m.config.Schedule.QuietHours
	if !quiet.Enabled {
		return false
	}

	loc := m.quietHoursLoc
	if loc == nil {
		// Not cached yet (shouldn't happen with UpdateConfig): load and remember it.
		loaded, err := time.LoadLocation(quiet.Timezone)
		if err != nil {
			log.Warn().Err(err).Str("timezone", quiet.Timezone).Msg("failed to load timezone, using local time")
			loaded = time.Local
		}
		loc = loaded
		m.quietHoursLoc = loc
	}

	now := time.Now().In(loc)

	// Days is keyed by lower-cased English weekday name; a missing day counts as disabled.
	if !quiet.Days[strings.ToLower(now.Format("Monday"))] {
		return false
	}

	startClock, err := time.ParseInLocation("15:04", quiet.Start, loc)
	if err != nil {
		log.Warn().Err(err).Str("start", quiet.Start).Msg("failed to parse quiet hours start time")
		return false
	}
	endClock, err := time.ParseInLocation("15:04", quiet.End, loc)
	if err != nil {
		log.Warn().Err(err).Str("end", quiet.End).Msg("failed to parse quiet hours end time")
		return false
	}

	// Anchor both clock times to today's date in the quiet-hours timezone.
	year, month, day := now.Date()
	windowStart := time.Date(year, month, day, startClock.Hour(), startClock.Minute(), 0, 0, loc)
	windowEnd := time.Date(year, month, day, endClock.Hour(), endClock.Minute(), 0, 0, loc)

	afterStart := now.After(windowStart)
	beforeEnd := now.Before(windowEnd)

	if windowEnd.Before(windowStart) {
		// Overnight window (e.g. 22:00 to 08:00): either side of midnight counts.
		return afterStart || beforeEnd
	}
	// Same-day window (e.g. 08:00 to 17:00).
	return afterStart && beforeEnd
}

// quietHoursCategoryForAlert maps an alert's Type to the quiet-hours category
// used for suppression decisions: "performance", "storage" or "offline".
//
// "resource-incident" alerts are "offline" when their incidentCategory metadata
// equals the availability category and "performance" otherwise. Any
// "docker-container-*" type not listed explicitly is "performance", except
// "docker-container-state", which is "offline". A nil alert or an unrecognised
// type yields "".
func quietHoursCategoryForAlert(alert *Alert) string {
	if alert == nil {
		return ""
	}

	const (
		performance = "performance"
		storage     = "storage"
		offline     = "offline"
	)

	switch alert.Type {
	case "connectivity", "offline", "powered-off", "docker-host-offline", "docker-container-state":
		return offline
	case "usage", "disk-health", "disk-wearout", "zfs-pool-state", "zfs-pool-errors", "zfs-device", "storage-incident", "backup-storage-incident", "backup-posture-incident":
		return storage
	case "cpu", "memory", "disk", "diskRead", "diskWrite", "networkIn", "networkOut", "temperature",
		"queue-depth", "queue-deferred", "queue-hold", "message-age":
		return performance
	case "resource-incident":
		if metadataStringValue(alert.Metadata, "incidentCategory") == unifiedresources.IncidentCategoryAvailability {
			return offline
		}
		return performance
	}

	// Every remaining Docker container alert type is a performance concern.
	if strings.HasPrefix(alert.Type, "docker-container-") {
		return performance
	}
	return ""
}

// shouldSuppressNotification decides whether a notification for alert should be
// held back because of quiet hours. It returns the decision and, when
// suppressing, the rule that caused it.
//
// Nothing is suppressed for a nil alert or outside quiet hours. During quiet
// hours every non-critical alert is suppressed with rule "non-critical";
// critical alerts are suppressed only when the Suppress flag for their category
// ("performance", "storage" or "offline") is set, and the category is returned
// as the rule.
func (m *Manager) shouldSuppressNotification(alert *Alert) (bool, string) {
	if alert == nil || !m.isInQuietHours() {
		return false, ""
	}
	if alert.Level != AlertLevelCritical {
		return true, "non-critical"
	}

	category := quietHoursCategoryForAlert(alert)
	suppress := false
	switch category {
	case "performance":
		suppress = m.config.Schedule.QuietHours.Suppress.Performance
	case "storage":
		suppress = m.config.Schedule.QuietHours.Suppress.Storage
	case "offline":
		suppress = m.config.Schedule.QuietHours.Suppress.Offline
	}

	if !suppress {
		return false, ""
	}
	return true, category
}

// ShouldSuppressResolvedNotification reports whether the recovery notification
// for alert should be suppressed during quiet hours. Recovery notifications
// follow the same quiet-hours rules as the alert itself: if the original alert
// would have been suppressed, so is its recovery. A nil alert is never
// suppressed. The check runs under m.mu's read lock.
func (m *Manager) ShouldSuppressResolvedNotification(alert *Alert) bool {
	if alert == nil {
		return false
	}

	m.mu.RLock()
	defer m.mu.RUnlock()

	suppress, rule := m.shouldSuppressNotification(alert)
	if !suppress {
		return false
	}

	log.Debug().
		Str("alertID", alert.ID).
		Str("type", alert.Type).
		Str("level", string(alert.Level)).
		Str("quietHoursRule", rule).
		Msg("Recovery notification suppressed during quiet hours")
	return true
}

// shouldNotifyAfterCooldown reports whether a notification for alert may be
// sent now. It returns true when no cooldown is configured (Cooldown <= 0),
// when the alert has never been notified, or when at least Cooldown minutes
// have passed since the last notification; otherwise the alert is still in its
// cooldown period and false is returned.
func (m *Manager) shouldNotifyAfterCooldown(alert *Alert) bool {
	cooldownMinutes := m.config.Schedule.Cooldown
	if cooldownMinutes <= 0 || alert.LastNotified == nil {
		return true
	}

	elapsed := time.Since(*alert.LastNotified)
	return elapsed >= time.Duration(cooldownMinutes)*time.Minute
}

// GetConfig returns the current alert configuration, read under m.mu's read
// lock. The struct is returned by value, so map and pointer fields still refer
// to the same underlying data as the manager's config.
func (m *Manager) GetConfig() AlertConfig {
	m.mu.RLock()
	defer m.mu.RUnlock()

	current := m.config
	return current
}

// CheckGuest checks a guest (VM or container) against thresholds.
//
// guest must be a value that extractGuestSnapshot understands; any other type
// is logged at debug level and ignored. instanceName is attached to every alert
// raised for the guest.
//
// The evaluation proceeds in this order, returning as soon as one step decides
// the guest must not be alerted on:
//  1. Global switches (alerts disabled, all guest alerts disabled).
//  2. Suppression by ignored name prefix, Pulse control tags, tag blacklist and
//     tag whitelist. Each suppression clears the guest's existing alerts and
//     schedules an asynchronous save when something was cleared.
//  3. Non-running guests: "stopped" guests go through the powered-off check
//     (unless powered-off alerts are globally disabled); every other
//     non-running status clears the powered-off alert. In both cases all
//     other alerts for the guest are cleared.
//  4. Running guests: the powered-off alert is cleared, thresholds are
//     resolved (optionally relaxed via tag), and the standard metrics plus each
//     individual disk are evaluated.
func (m *Manager) CheckGuest(guest any, instanceName string) {
	// Snapshot the configuration fields we need so the lock is not held while
	// evaluating the guest.
	m.mu.RLock()
	enabled := m.config.Enabled
	disableAllGuests := m.config.DisableAllGuests
	disableAllGuestsOffline := m.config.DisableAllGuestsOffline
	ignoredGuestPrefixes := m.config.IgnoredGuestPrefixes
	guestTagWhitelist := m.config.GuestTagWhitelist
	guestTagBlacklist := m.config.GuestTagBlacklist
	m.mu.RUnlock()

	if !enabled {
		log.Debug().Msg("checkGuest: alerts disabled globally")
		return
	}
	if disableAllGuests {
		log.Debug().Msg("checkGuest: all guest alerts disabled")
		return
	}

	snapshot, ok := extractGuestSnapshot(guest)
	if !ok {
		log.Debug().
			Str("type", fmt.Sprintf("%T", guest)).
			Msg("CheckGuest: unsupported guest type")
		return
	}

	guestID := snapshot.ID
	name := snapshot.Name
	node := snapshot.Node
	guestType := snapshot.displayType()
	status := snapshot.Status
	cpu := snapshot.CPUPercent
	memUsage := snapshot.MemUsage
	diskUsage := snapshot.DiskUsage
	diskRead := snapshot.DiskRead
	diskWrite := snapshot.DiskWrite
	netIn := snapshot.NetworkIn
	netOut := snapshot.NetworkOut
	disks := snapshot.Disks
	tags := snapshot.Tags

	// Debug logging for high memory VMs
	if snapshot.Kind == guestKindVM && memUsage > 85 {
		log.Debug().
			Str("vm", name).
			Float64("memUsage", memUsage).
			Str("status", status).
			Msg("VM with high memory detected in CheckGuest")
	}

	// Check ignored prefixes. Empty prefixes are skipped because they would
	// match every guest name.
	for _, prefix := range ignoredGuestPrefixes {
		if prefix != "" && strings.HasPrefix(name, prefix) {
			if cleared := m.suppressGuestAlerts(guestID); cleared {
				m.saveActiveAlertsAsync("ignored-prefix")
			}
			return
		}
	}

	// Pulse control tags carried by the guest (suppress / monitor-only / relaxed).
	settings := parsePulseTags(tags)
	if settings.Suppress {
		if cleared := m.suppressGuestAlerts(guestID); cleared {
			m.saveActiveAlertsAsync("pulse-no-alerts")
		}
		log.Debug().
			Str("guestID", guestID).
			Msg("Pulse no-alerts tag active; suppressing guest alerts")
		return
	}

	// Custom Tag Filtering
	if len(guestTagBlacklist) > 0 || len(guestTagWhitelist) > 0 {
		// Normalize tags once for checking (trimmed, lower-cased).
		normalizedTags := make(map[string]bool)
		for _, tag := range tags {
			normalizedTags[strings.ToLower(strings.TrimSpace(tag))] = true
		}

		// Check Blacklist: any matching tag suppresses the guest.
		for _, block := range guestTagBlacklist {
			if normalizedTags[strings.ToLower(strings.TrimSpace(block))] {
				if cleared := m.suppressGuestAlerts(guestID); cleared {
					m.saveActiveAlertsAsync("tag-blacklist")
				}
				log.Debug().Str("guestID", guestID).Msg("guest suppressed by tag blacklist")
				return
			}
		}

		// Check Whitelist: when configured, at least one listed tag is required.
		if len(guestTagWhitelist) > 0 {
			found := false
			for _, allow := range guestTagWhitelist {
				if normalizedTags[strings.ToLower(strings.TrimSpace(allow))] {
					found = true
					break
				}
			}
			if !found {
				if cleared := m.suppressGuestAlerts(guestID); cleared {
					m.saveActiveAlertsAsync("tag-whitelist")
				}
				log.Debug().Str("guestID", guestID).Msg("guest suppressed by tag whitelist (required tag not found)")
				return
			}
		}
	}

	monitorOnly := settings.MonitorOnly
	if monitorOnly || m.guestHasMonitorOnlyAlerts(guestID) {
		log.Debug().
			Str("guest", name).
			Bool("monitorOnly", monitorOnly).
			Msg("Pulse monitor-only status applied")
	}

	// Handle non-running guests
	// Proxmox VM states: running, stopped, paused, suspended
	if status != "running" {
		// Check for powered-off state and generate alert if configured
		if status == "stopped" {
			if disableAllGuestsOffline {
				// Clear any pending powered-off tracking and alerts when globally disabled
				m.mu.Lock()
				delete(m.offlineConfirmations, guestID)
				m.mu.Unlock()
				m.clearAlert(canonicalPoweredStateStateID(guestID))
			} else {
				m.mu.RLock()
				thresholds := m.getGuestThresholds(guest, guestID)
				m.mu.RUnlock()
				m.checkGuestPoweredOffWithThresholds(guestID, name, node, instanceName, guestType, thresholds, monitorOnly)
			}
		} else {
			// For paused/suspended, clear powered-off alert
			m.clearGuestPoweredOffAlert(guestID, name)
		}

		// Clear all resource metric alerts (cpu, memory, disk, etc.) for non-running guests
		m.mu.Lock()
		alertsCleared := 0
		for storageKey, alert := range m.activeAlerts {
			alertID := effectiveAlertID(alert, storageKey)
			// Only clear resource metric alerts, not powered-off alerts
			if alert.ResourceID == guestID && alert.Type != "powered-off" {
				m.clearAlertNoLock(alertID)
				alertsCleared++
				log.Debug().
					Str("alertID", alertID).
					Str("guest", name).
					Str("status", status).
					Msg("Cleared metric alert for non-running guest")
			}
		}
		m.mu.Unlock()

		if alertsCleared > 0 {
			log.Debug().
				Str("guest", name).
				Str("status", status).
				Int("alertsCleared", alertsCleared).
				Msg("Cleared metric alerts for non-running guest")
		}
		return
	}

	// If guest is running, clear any powered-off alert
	m.clearGuestPoweredOffAlert(guestID, name)

	// Get thresholds (check custom rules, then overrides, then defaults)
	m.mu.RLock()
	thresholds := m.getGuestThresholds(guest, guestID)
	m.mu.RUnlock()

	if settings.Relaxed {
		thresholds = applyRelaxedGuestThresholds(thresholds)
		relaxedLog := log.Info().Str("guest", name)
		// Threshold fields are pointers and may be unset; only report the CPU
		// trigger when it exists so this log line can never dereference nil.
		if thresholds.CPU != nil {
			relaxedLog = relaxedLog.Float64("trigger", thresholds.CPU.Trigger)
		}
		relaxedLog.Msg("Applied relaxed thresholds for pulse-relaxed tag")
	}

	// If alerts are disabled for this guest, clear any existing alerts and return
	if thresholds.Disabled {
		m.mu.Lock()
		for storageKey, alert := range m.activeAlerts {
			alertID := effectiveAlertID(alert, storageKey)
			if alert.ResourceID == guestID {
				m.clearAlertNoLock(alertID)
				log.Info().
					Str("alertID", alertID).
					Str("guest", name).
					Msg("Cleared alert - guest has alerts disabled")
			}
		}
		m.mu.Unlock()
		return
	}

	// Check each metric
	log.Debug().
		Str("guest", name).
		Float64("cpu", cpu).
		Float64("memory", memUsage).
		Float64("disk", diskUsage).
		Interface("thresholds", thresholds).
		Msg("Checking guest thresholds")

	// Evaluate standard metrics through unified path. I/O and network counters
	// are divided by 1024*1024 before evaluation.
	var evalOpts *metricOptions
	if monitorOnly {
		evalOpts = &metricOptions{MonitorOnly: true}
	}
	m.evaluateUnifiedMetrics(&UnifiedResourceInput{
		ID:         guestID,
		Type:       snapshot.resourceType(),
		Name:       name,
		Node:       node,
		Instance:   instanceName,
		CPU:        &UnifiedResourceMetric{Percent: cpu},
		Memory:     &UnifiedResourceMetric{Percent: memUsage},
		Disk:       &UnifiedResourceMetric{Percent: diskUsage},
		DiskRead:   &UnifiedResourceMetric{Value: float64(diskRead) / 1024 / 1024},
		DiskWrite:  &UnifiedResourceMetric{Value: float64(diskWrite) / 1024 / 1024},
		NetworkIn:  &UnifiedResourceMetric{Value: float64(netIn) / 1024 / 1024},
		NetworkOut: &UnifiedResourceMetric{Value: float64(netOut) / 1024 / 1024},
	}, thresholds, evalOpts)

	// Per-disk evaluation: each disk gets its own resource ID derived from the
	// guest ID and a sanitized mountpoint/device label.
	if thresholds.Disk != nil && thresholds.Disk.Trigger > 0 && len(disks) > 0 {
		seenDisks := make(map[string]struct{})
		for idx, disk := range disks {
			// Skip disks without a usable size or usage reading.
			if disk.Total <= 0 {
				continue
			}
			if disk.Usage < 0 {
				continue
			}

			// Human-readable label: mountpoint, then device, then a positional name.
			label := strings.TrimSpace(disk.Mountpoint)
			if label == "" {
				label = strings.TrimSpace(disk.Device)
			}
			if label == "" {
				label = fmt.Sprintf("Disk %d", idx+1)
			}

			// Include the device in the key when it differs from the label so two
			// disks sharing a label still get distinct keys.
			keySource := label
			if disk.Device != "" && !strings.EqualFold(disk.Device, label) {
				keySource = fmt.Sprintf("%s-%s", label, disk.Device)
			}
			sanitizedKey := sanitizeAlertKey(keySource)
			if sanitizedKey == "" {
				sanitizedKey = fmt.Sprintf("disk-%d", idx+1)
			}

			// Avoid duplicate checks if two disks resolve to the same key
			if _, exists := seenDisks[sanitizedKey]; exists {
				continue
			}
			seenDisks[sanitizedKey] = struct{}{}

			perDiskResourceID := fmt.Sprintf("%s-disk-%s", guestID, sanitizedKey)
			message := fmt.Sprintf("%s disk (%s) at %.1f%%", guestType, label, disk.Usage)

			log.Debug().
				Str("guest", name).
				Str("node", node).
				Str("instance", instanceName).
				Str("diskLabel", label).
				Float64("usage", disk.Usage).
				Msg("Evaluating individual disk for alert thresholds")

			metadata := map[string]interface{}{
				"mountpoint": disk.Mountpoint,
				"device":     disk.Device,
				"diskType":   disk.Type,
				"totalBytes": disk.Total,
				"usedBytes":  disk.Used,
				"freeBytes":  disk.Free,
				"diskIndex":  idx,
				"label":      label,
			}
			resourceType, ok := unifiedMetricResourceType(snapshot.resourceType())
			if !ok {
				// No unified resource type for this guest kind: fall back to the
				// non-canonical metric check.
				m.checkMetric(perDiskResourceID, name, node, instanceName, snapshot.resourceType(), "disk", disk.Usage, thresholds.Disk, &metricOptions{
					Metadata:    metadata,
					Message:     message,
					MonitorOnly: monitorOnly,
				})
				continue
			}
			spec, err := buildCanonicalMetricSpec(perDiskResourceID, name, resourceType, "disk", thresholds.Disk)
			if err != nil {
				log.Warn().
					Err(err).
					Str("resourceID", perDiskResourceID).
					Str("guest", name).
					Msg("Skipping invalid canonical guest disk metric spec")
				continue
			}

			m.checkMetricWithCanonicalSpec(spec, name, node, instanceName, snapshot.resourceType(), disk.Usage, thresholds.Disk, &metricOptions{
				Metadata:    metadata,
				Message:     message,
				MonitorOnly: monitorOnly,
			})
		}
	}
}

// CheckNode checks a node against thresholds.
//
// The node's display name is cached first so that alerts (including guest
// alerts on this node) can resolve it. After that:
//   - If alerts are globally disabled, nothing else happens.
//   - If all node alerts are disabled, or the node's resolved thresholds are
//     disabled, offline tracking and every node alert (cpu, memory, disk,
//     temperature, connectivity) are cleared.
//   - Connectivity: when offline alerts are disabled (globally or for this
//     node) the offline tracking and alert are cleared; otherwise an
//     unreachable node goes through checkNodeOffline and a reachable node has
//     its offline alert cleared.
//   - Metrics: an unreachable node has its metric alerts cleared because no
//     fresh data is available. A reachable node is evaluated unless a host
//     agent is registered under the same name, in which case the host agent's
//     alerts take precedence.
//
// Connectivity handling and metric evaluation are independent: disabling
// offline alerts does not disable metric alerts.
func (m *Manager) CheckNode(node models.Node) {
	// Cache display name so all alerts (including guest alerts on this node) can resolve it.
	m.UpdateNodeDisplayName(node.Instance, node.Name, node.DisplayName)

	// Metric alert types that can exist for a node.
	nodeMetrics := []string{"cpu", "memory", "disk", "temperature"}

	m.mu.RLock()
	if !m.config.Enabled {
		m.mu.RUnlock()
		return
	}
	if m.config.DisableAllNodes {
		m.mu.RUnlock()
		// Clear any existing node alerts when all node alerts are disabled
		m.mu.Lock()
		// Clear offline tracking
		delete(m.nodeOfflineCount, node.ID)
		// Clear all possible node alert types
		for _, alertType := range nodeMetrics {
			alertID := canonicalMetricStateID(node.ID, alertType)
			if m.hasActiveAlertNoLock(alertID) {
				m.clearAlertNoLock(alertID)
				log.Info().
					Str("alertID", alertID).
					Str("node", node.Name).
					Msg("Cleared node alert - all node alerts disabled")
			}
		}
		// Clear offline alert
		offlineAlertID := canonicalConnectivityStateID(node.ID)
		if m.hasActiveAlertNoLock(offlineAlertID) {
			m.clearAlertNoLock(offlineAlertID)
			log.Info().
				Str("alertID", offlineAlertID).
				Str("node", node.Name).
				Msg("Cleared offline alert - all node alerts disabled")
		}
		m.mu.Unlock()
		return
	}
	disableNodesOffline := m.config.DisableAllNodesOffline
	thresholds := m.resolveResourceThresholds("node", node.ID)
	m.mu.RUnlock()

	if thresholds.Disabled {
		m.mu.Lock()
		delete(m.nodeOfflineCount, node.ID)
		m.mu.Unlock()
		for _, metric := range nodeMetrics {
			m.clearAlert(canonicalMetricStateID(node.ID, metric))
		}
		m.clearAlert(canonicalConnectivityStateID(node.ID))
		return
	}

	unreachable := node.Status == "offline" || node.ConnectionHealth == "error" || node.ConnectionHealth == "failed"

	// CRITICAL: handle connectivity first.
	switch {
	case disableNodesOffline || thresholds.DisableConnectivity:
		// Clear tracking and any existing offline alerts when offline alerting is disabled
		m.mu.Lock()
		delete(m.nodeOfflineCount, node.ID)
		m.mu.Unlock()
		m.clearAlert(canonicalConnectivityStateID(node.ID))
	case unreachable:
		m.checkNodeOffline(node)
	default:
		// Clear any existing offline alert if node is back online
		m.clearNodeOfflineAlert(node)
	}

	if unreachable {
		// Clear resource alerts if node is offline/unreachable.
		// This prevents stale alerts from persisting when we can't get new data.
		for _, metric := range nodeMetrics {
			m.clearAlert(canonicalMetricStateID(node.ID, metric))
		}
		return
	}

	// Check each metric (only if node is online and reachable)
	// Check for host agent deduplication: if a host agent is running on this node,
	// prefer the host agent alerts and skip node metric alerts to avoid duplicates.
	if m.hasHostAgentForNode(node.Name) {
		log.Debug().
			Str("node", node.Name).
			Msg("Skipping node metric alerts - host agent is monitoring this machine")
		return
	}

	m.evaluateUnifiedMetrics(&UnifiedResourceInput{
		ID:       node.ID,
		Type:     "node",
		Name:     node.Name,
		Node:     node.Name,
		Instance: node.Instance,
		CPU:      &UnifiedResourceMetric{Percent: node.CPU * 100},
		Memory:   &UnifiedResourceMetric{Percent: node.Memory.Usage},
		Disk:     &UnifiedResourceMetric{Percent: node.Disk.Usage},
	}, thresholds, nil)

	// Check temperature if available
	// We pass the check unconditionally so that if the threshold triggers are disabled (set to 0),
	// any existing alerts will be properly cleared.
	var temp float64
	if node.Temperature != nil && node.Temperature.Available {
		// Use CPU package temp if available, otherwise use max core temp
		temp = node.Temperature.CPUPackage
		if temp == 0 {
			temp = node.Temperature.CPUMax
		}
	}
	spec, err := buildCanonicalMetricSpec(node.ID, node.Name, unifiedresources.ResourceType("node"), "temperature", thresholds.Temperature)
	if err != nil {
		log.Warn().
			Err(err).
			Str("resourceID", node.ID).
			Str("node", node.Name).
			Msg("Skipping invalid canonical node temperature metric spec")
		return
	}
	m.checkMetricWithCanonicalSpec(spec, node.Name, node.Name, node.Instance, "node", temp, thresholds.Temperature, nil)
}

// RegisterHostAgentHostname records hostname as being monitored by a host
// agent. The name is stored trimmed and lower-cased; a blank name is ignored.
// hasHostAgentForNode consults this set so that Proxmox node alerts can be
// skipped in favour of the host agent's alerts for the same machine.
func (m *Manager) RegisterHostAgentHostname(hostname string) {
	key := strings.TrimSpace(hostname)
	key = strings.ToLower(key)
	if len(key) == 0 {
		return
	}

	m.mu.Lock()
	m.hostAgentHostnames[key] = struct{}{}
	m.mu.Unlock()

	// The log carries the hostname as supplied, not the normalized key.
	log.Debug().
		Str("hostname", hostname).
		Msg("Registered host agent hostname for deduplication")
}

// UnregisterHostAgentHostname drops hostname from the set of machines
// monitored by a host agent. The name is trimmed and lower-cased before the
// lookup; a blank name is ignored. Removing an unknown name is a no-op apart
// from the debug log.
func (m *Manager) UnregisterHostAgentHostname(hostname string) {
	key := strings.TrimSpace(hostname)
	key = strings.ToLower(key)
	if len(key) == 0 {
		return
	}

	m.mu.Lock()
	delete(m.hostAgentHostnames, key)
	m.mu.Unlock()

	// The log carries the hostname as supplied, not the normalized key.
	log.Debug().
		Str("hostname", hostname).
		Msg("Unregistered host agent hostname from deduplication")
}

// hasHostAgentForNode reports whether a host agent has been registered under
// the same (trimmed, case-insensitive) name as the given Proxmox node. Callers
// use a true result to skip node metric alerts and avoid duplicate alerting.
// A blank node name never matches. Takes the manager's read lock itself.
func (m *Manager) hasHostAgentForNode(nodeName string) bool {
	key := strings.TrimSpace(nodeName)
	key = strings.ToLower(key)
	if len(key) == 0 {
		return false
	}

	m.mu.RLock()
	_, registered := m.hostAgentHostnames[key]
	m.mu.RUnlock()

	return registered
}

// nodeDisplayNameCacheKey builds the map key used for instance-scoped node
// display names: the trimmed instance and the trimmed node name joined by a
// NUL byte.
func nodeDisplayNameCacheKey(instance, name string) string {
	parts := []string{strings.TrimSpace(instance), strings.TrimSpace(name)}
	return strings.Join(parts, "\x00")
}

// UpdateNodeDisplayName caches the display name for a node/host so alerts can
// resolve it without needing the full model object.
//
// It accepts either (name, displayName) or (instance, name, displayName); any
// other argument count is ignored. All values are trimmed, and a blank name is
// ignored. When instance is non-empty the entry lives in the instance-scoped
// cache, otherwise in the name-only cache. A display name that is blank or
// equal to the node name removes the cached entry instead of storing it.
func (m *Manager) UpdateNodeDisplayName(parts ...string) {
	count := len(parts)
	if count != 2 && count != 3 {
		return
	}

	// The last two arguments are always name and displayName; a third, leading
	// argument is the instance.
	instance := ""
	if count == 3 {
		instance = strings.TrimSpace(parts[0])
	}
	name := strings.TrimSpace(parts[count-2])
	if name == "" {
		return
	}
	displayName := strings.TrimSpace(parts[count-1])
	hasCustomName := displayName != "" && displayName != name

	m.mu.Lock()
	switch {
	case instance != "" && hasCustomName:
		m.instanceNodeDisplayNames[nodeDisplayNameCacheKey(instance, name)] = displayName
	case instance != "":
		delete(m.instanceNodeDisplayNames, nodeDisplayNameCacheKey(instance, name))
	case hasCustomName:
		m.nodeDisplayNames[name] = displayName
	default:
		delete(m.nodeDisplayNames, name)
	}
	m.mu.Unlock()
}

// resolveNodeDisplayName returns the cached display name for a node, or empty
// string if none is set. Caller must hold m.mu (read or write).
//
// An instance-scoped entry is preferred when instance is non-blank; otherwise,
// or when no such entry exists, the name-only cache is consulted. Both instance
// and node are trimmed before any lookup so they match the trimmed keys that
// UpdateNodeDisplayName stores.
func (m *Manager) resolveNodeDisplayName(instance, node string) string {
	// Trim once up front: the name-only cache is keyed by the trimmed node name,
	// so an untrimmed lookup would miss entries the instance-scoped path finds.
	node = strings.TrimSpace(node)
	if instance = strings.TrimSpace(instance); instance != "" {
		if displayName, ok := m.instanceNodeDisplayNames[nodeDisplayNameCacheKey(instance, node)]; ok {
			return displayName
		}
	}
	return m.nodeDisplayNames[node]
}

// hostResourceID returns the alert resource ID for a host agent: "agent:"
// followed by the trimmed host ID, or "agent:unknown" when the ID is blank.
func hostResourceID(hostID string) string {
	id := strings.TrimSpace(hostID)
	if id == "" {
		id = "unknown"
	}
	return fmt.Sprintf("agent:%s", id)
}

// stripHostResourcePrefix is the inverse of hostResourceID's prefixing: it
// trims the input, removes a single leading "agent:" if present, and trims the
// remainder again.
func stripHostResourcePrefix(resourceID string) string {
	id := strings.TrimSpace(resourceID)
	if rest, hadPrefix := strings.CutPrefix(id, "agent:"); hadPrefix {
		id = rest
	}
	return strings.TrimSpace(id)
}

// hostDisplayName picks the name shown for a host in alerts: the trimmed
// DisplayName if non-blank, else the trimmed Hostname if non-blank, else the
// host ID as-is, and finally the literal "Agent" when the ID is empty too.
func hostDisplayName(host models.Host) string {
	for _, candidate := range []string{host.DisplayName, host.Hostname} {
		if trimmed := strings.TrimSpace(candidate); trimmed != "" {
			return trimmed
		}
	}
	if host.ID == "" {
		return "Agent"
	}
	return host.ID
}

// hostInstanceName picks the value used as the alert "instance" for a host:
// the trimmed Platform if non-blank, else the trimmed OSName if non-blank,
// else the literal "Agent".
func hostInstanceName(host models.Host) string {
	for _, candidate := range []string{host.Platform, host.OSName} {
		if trimmed := strings.TrimSpace(candidate); trimmed != "" {
			return trimmed
		}
	}
	return "Agent"
}

// sanitizeHostComponent converts value into a token safe for use inside a
// resource ID. The input is lower-cased and trimmed; ASCII letters and digits
// are kept, every run of other characters becomes a single '-', and no '-'
// appears at the start or end. If nothing usable remains the result is
// "unknown".
func sanitizeHostComponent(value string) string {
	normalized := strings.TrimSpace(strings.ToLower(value))
	if normalized == "" {
		return "unknown"
	}

	var out strings.Builder
	// separatorPending is set when one or more non-alphanumeric runes have been
	// seen since the last kept rune. The '-' is only emitted once another kept
	// rune follows, which collapses runs and drops leading/trailing separators.
	separatorPending := false
	for _, r := range normalized {
		keep := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
		if !keep {
			separatorPending = true
			continue
		}
		if separatorPending && out.Len() > 0 {
			out.WriteRune('-')
		}
		separatorPending = false
		out.WriteRune(r)
	}

	if out.Len() == 0 {
		return "unknown"
	}
	return out.String()
}

// sanitizeRAIDDevice sanitizes RAID device names for use in resource IDs: a
// leading "/dev/" is dropped and the remainder is passed through
// sanitizeHostComponent.
func sanitizeRAIDDevice(device string) string {
	withoutDevPrefix := strings.TrimPrefix(device, "/dev/")
	return sanitizeHostComponent(withoutDevPrefix)
}

// hostDiskResourceIDWithPrefix derives the alert resource ID and display name
// for a host disk. The label is the trimmed mountpoint, falling back to the
// trimmed device and then to the literal "disk". The ID is
// "<resourcePrefix>/disk:<sanitized label>" and the name is
// "<host display name> (<label>)".
func hostDiskResourceIDWithPrefix(host models.Host, disk models.Disk, resourcePrefix string) (string, string) {
	label := "disk"
	for _, candidate := range []string{disk.Mountpoint, disk.Device} {
		if trimmed := strings.TrimSpace(candidate); trimmed != "" {
			label = trimmed
			break
		}
	}

	id := fmt.Sprintf("%s/disk:%s", resourcePrefix, sanitizeHostComponent(label))
	displayName := fmt.Sprintf("%s (%s)", hostDisplayName(host), label)
	return id, displayName
}

// hostDiskResourceID returns the resource ID and display name for a host disk,
// using the host's own resource ID (see hostResourceID) as the prefix.
func hostDiskResourceID(host models.Host, disk models.Disk) (string, string) {
	prefix := hostResourceID(host.ID)
	return hostDiskResourceIDWithPrefix(host, disk, prefix)
}

// hostSMARTDiskResourceID derives the alert resource ID and display name for a
// SMART-reporting disk. The label is the first non-blank of: the device with a
// leading "/dev/" removed, the serial, the WWN, the model; if all are blank the
// literal "smart-disk" is used. The ID is
// "<host resource ID>/disk:<sanitized label>" and the name is
// "<host display name> (<label>)".
func hostSMARTDiskResourceID(host models.Host, disk models.HostDiskSMART) (string, string) {
	candidates := []string{
		strings.TrimPrefix(disk.Device, "/dev/"),
		disk.Serial,
		disk.WWN,
		disk.Model,
	}

	label := "smart-disk"
	for _, candidate := range candidates {
		if trimmed := strings.TrimSpace(candidate); trimmed != "" {
			label = trimmed
			break
		}
	}

	id := fmt.Sprintf("%s/disk:%s", hostResourceID(host.ID), sanitizeHostComponent(label))
	displayName := fmt.Sprintf("%s (%s)", hostDisplayName(host), label)
	return id, displayName
}

// CheckHost evaluates host agent telemetry for alerts.
//
// A host with an empty ID is ignored. Otherwise, regardless of configuration,
// the call registers the host's hostname for node/host deduplication, caches
// its display name, and marks the host online (clearing offline tracking).
//
// If alerts are globally disabled nothing further happens. If all agent alerts
// are disabled, or the host's resolved thresholds are disabled, every metric,
// disk, RAID and Unraid alert for the host is cleared. Otherwise the following
// are evaluated in order:
//   - CPU and memory usage (a nil threshold clears the corresponding alert);
//   - per-disk SMART temperature, when a positive trigger is configured;
//   - SMART disk risk/health alerts;
//   - per-mountpoint disk usage, honouring disk-specific overrides;
//   - Unraid storage state;
//   - RAID array health (arrays md0/md1 are skipped and their alerts cleared).
func (m *Manager) CheckHost(host models.Host) {
	if host.ID == "" {
		return
	}

	// Register this host agent hostname for deduplication with Proxmox nodes.
	// This prevents duplicate alerts when both a Node and Host agent monitor the same machine.
	if host.Hostname != "" {
		m.RegisterHostAgentHostname(host.Hostname)
	}

	// Cache display name so host alerts show the user-configured name.
	m.UpdateNodeDisplayName("", host.Hostname, host.DisplayName)

	// Fresh telemetry marks the host as online and clears offline tracking.
	m.HandleHostOnline(host)

	m.mu.RLock()
	alertsEnabled := m.config.Enabled
	disableAllAgents := m.config.DisableAllAgents
	thresholds := m.resolveResourceThresholds("agent", host.ID)
	m.mu.RUnlock()

	if !alertsEnabled {
		return
	}

	// Alerting is off for this host, either globally for all agents or through
	// its own thresholds: clear whatever is still active and stop.
	if disableAllAgents || thresholds.Disabled {
		m.clearHostMetricAlerts(host.ID)
		m.clearHostDiskAlerts(host.ID)
		m.clearHostRAIDAlerts(host.ID)
		m.clearHostUnraidAlerts(host.ID)
		return
	}

	resourceID := hostResourceID(host.ID)
	resourceName := hostDisplayName(host)
	nodeName := strings.TrimSpace(host.Hostname)
	instanceName := hostInstanceName(host)

	// Metadata shared by every alert raised for this host; each alert works on
	// its own clone.
	baseMetadata := map[string]interface{}{
		"resourceType": "agent",
		"hostId":       host.ID,
		"hostname":     host.Hostname,
		"displayName":  host.DisplayName,
		"platform":     host.Platform,
		"osName":       host.OSName,
		"osVersion":    host.OSVersion,
		"agentVersion": host.AgentVersion,
		"architecture": host.Architecture,
	}
	if len(host.Tags) > 0 {
		baseMetadata["tags"] = append([]string(nil), host.Tags...)
	}

	if thresholds.CPU != nil {
		cpuMetadata := cloneMetadata(baseMetadata)
		cpuMetadata["metric"] = "cpu"
		cpuMetadata["cpuUsagePercent"] = host.CPUUsage
		if host.CPUCount > 0 {
			cpuMetadata["cpuCount"] = host.CPUCount
		}
		spec, err := buildCanonicalMetricSpec(resourceID, resourceName, unifiedresources.ResourceTypeAgent, "cpu", thresholds.CPU)
		if err != nil {
			log.Warn().
				Err(err).
				Str("resourceID", resourceID).
				Str("host", resourceName).
				Msg("Skipping invalid canonical host CPU metric spec")
		} else {
			m.checkMetricWithCanonicalSpec(spec, resourceName, nodeName, instanceName, "agent", host.CPUUsage, thresholds.CPU, &metricOptions{Metadata: cpuMetadata})
		}
	} else {
		m.clearHostMetricAlerts(host.ID, "cpu")
	}

	if thresholds.Memory != nil {
		memMetadata := cloneMetadata(baseMetadata)
		memMetadata["metric"] = "memory"
		memMetadata["memoryUsagePercent"] = host.Memory.Usage
		if host.Memory.Total > 0 {
			memMetadata["memoryTotalBytes"] = host.Memory.Total
			memMetadata["memoryUsedBytes"] = host.Memory.Used
			memMetadata["memoryFreeBytes"] = host.Memory.Free
		}
		spec, err := buildCanonicalMetricSpec(resourceID, resourceName, unifiedresources.ResourceTypeAgent, "memory", thresholds.Memory)
		if err != nil {
			log.Warn().
				Err(err).
				Str("resourceID", resourceID).
				Str("host", resourceName).
				Msg("Skipping invalid canonical host memory metric spec")
		} else {
			m.checkMetricWithCanonicalSpec(spec, resourceName, nodeName, instanceName, "agent", host.Memory.Usage, thresholds.Memory, &metricOptions{Metadata: memMetadata})
		}
	} else {
		m.clearHostMetricAlerts(host.ID, "memory")
	}

	// Disk temperature alerts are only evaluated with a positive trigger.
	// NOTE: when the feature is disabled, existing disk temperature alerts are
	// not cleared here because they are not tracked per host; ideally they
	// would be.
	if thresholds.DiskTemperature != nil && thresholds.DiskTemperature.Trigger > 0 {
		for _, disk := range host.Sensors.SMART {
			// Ignore disks without a reading and disks in standby.
			if disk.Temperature <= 0 || disk.Standby {
				continue
			}

			// Use specific resource ID for the disk: <host resource ID>/disk_temp:<device>
			tempResourceID := fmt.Sprintf("%s/disk_temp:%s", hostResourceID(host.ID), sanitizeHostComponent(disk.Device))
			// Use the resolved host name (display name, hostname or ID) so the
			// alert name is never left with an empty host part.
			tempResourceName := fmt.Sprintf("%s (%s Temp)", resourceName, disk.Device)

			diskTempMetadata := cloneMetadata(baseMetadata)
			diskTempMetadata["metric"] = "diskTemperature"
			diskTempMetadata["device"] = disk.Device
			diskTempMetadata["temperature"] = disk.Temperature
			diskTempMetadata["model"] = disk.Model
			spec, err := buildCanonicalMetricSpec(tempResourceID, tempResourceName, unifiedresources.ResourceType("agent-disk"), "diskTemperature", thresholds.DiskTemperature)
			if err != nil {
				log.Warn().
					Err(err).
					Str("resourceID", tempResourceID).
					Str("host", resourceName).
					Str("device", disk.Device).
					Msg("Skipping invalid canonical host disk temperature metric spec")
				continue
			}

			// NOTE(review): the device is passed in the instance position here,
			// unlike the other metric checks which pass instanceName — confirm
			// this is intended.
			m.checkMetricWithCanonicalSpec(spec, tempResourceName, nodeName, disk.Device, "agent", float64(disk.Temperature), thresholds.DiskTemperature, &metricOptions{Metadata: diskTempMetadata})
		}
	}

	// seenDisks collects the disk resource IDs reported in this cycle so that
	// cleanupHostDiskAlerts can deal with alerts for disks no longer present.
	seenDisks := make(map[string]struct{}, len(host.Disks))
	for _, disk := range host.Sensors.SMART {
		diskResourceID, diskName := hostSMARTDiskResourceID(host, disk)
		if host.LinkedNodeID == "" {
			seenDisks[diskResourceID] = struct{}{}
			m.syncHostSMARTDiskRiskAlerts(host, disk, diskResourceID, diskName, nodeName, instanceName, baseMetadata)
			continue
		}
		// Host is linked to a node: sync the health and wearout alerts with a
		// nil final argument.
		// NOTE(review): presumably nil means "no active condition" and clears
		// host-side alerts in favour of the linked node — confirm.
		m.syncHostSMARTDiskAlert(host, disk, diskResourceID, diskName, nodeName, instanceName, baseMetadata, "disk-health", nil)
		m.syncHostSMARTDiskAlert(host, disk, diskResourceID, diskName, nodeName, instanceName, baseMetadata, "disk-wearout", nil)
	}

	for _, disk := range host.Disks {
		diskResourceID, diskName := hostDiskResourceID(host, disk)
		seenDisks[diskResourceID] = struct{}{}

		// Check for disk-specific override
		m.mu.RLock()
		diskOverride, hasDiskOverride := m.config.Overrides[diskResourceID]
		m.mu.RUnlock()

		// Determine the effective disk threshold
		var effectiveDiskThreshold *HysteresisThreshold
		if hasDiskOverride {
			// If disk is disabled via override, skip alerting
			if diskOverride.Disabled {
				m.clearAlert(canonicalMetricStateID(diskResourceID, "disk"))
				continue
			}
			// Use disk-specific threshold if set
			if diskOverride.Disk != nil {
				effectiveDiskThreshold = ensureHysteresisThreshold(diskOverride.Disk)
			}
		}
		// Fall back to host-level threshold
		if effectiveDiskThreshold == nil {
			effectiveDiskThreshold = thresholds.Disk
		}

		// Skip if no threshold configured (nil)
		// We DO NOT skip if Trigger <= 0 because we need to call checkMetric to clear any existing alerts.
		if effectiveDiskThreshold == nil {
			continue
		}

		diskMetadata := cloneMetadata(baseMetadata)
		diskMetadata["metric"] = "disk"
		diskMetadata["mountpoint"] = disk.Mountpoint
		diskMetadata["device"] = disk.Device
		diskMetadata["diskType"] = disk.Type
		diskMetadata["diskUsagePercent"] = disk.Usage
		if disk.Total > 0 {
			diskMetadata["diskTotalBytes"] = disk.Total
			diskMetadata["diskUsedBytes"] = disk.Used
			diskMetadata["diskFreeBytes"] = disk.Free
		}
		spec, err := buildCanonicalMetricSpec(diskResourceID, diskName, unifiedresources.ResourceType("agent-disk"), "disk", effectiveDiskThreshold)
		if err != nil {
			log.Warn().
				Err(err).
				Str("resourceID", diskResourceID).
				Str("host", resourceName).
				Str("mountpoint", disk.Mountpoint).
				Msg("Skipping invalid canonical host disk metric spec")
			continue
		}

		m.checkMetricWithCanonicalSpec(spec, diskName, nodeName, instanceName, "agent-disk", disk.Usage, effectiveDiskThreshold, &metricOptions{Metadata: diskMetadata})
	}

	// Clear all disk alerts if host-level disk alerting is completely disabled and no disk-specific overrides
	if thresholds.Disk == nil || thresholds.Disk.Trigger <= 0 {
		// Only clear alerts for disks that don't have their own overrides
		m.mu.RLock()
		var disksToClear []string
		for _, disk := range host.Disks {
			diskResourceID, _ := hostDiskResourceID(host, disk)
			_, hasDiskOverride := m.config.Overrides[diskResourceID]
			if !hasDiskOverride {
				disksToClear = append(disksToClear, canonicalMetricStateID(diskResourceID, "disk"))
			}
		}
		m.mu.RUnlock()

		// Clearing happens after the read lock is released.
		for _, alertID := range disksToClear {
			m.clearAlert(alertID)
		}
	}

	m.cleanupHostDiskAlerts(host, seenDisks)

	if host.Unraid != nil {
		m.syncHostUnraidStorageAlert(host, nodeName, instanceName, resourceName, baseMetadata)
	} else {
		m.clearHostUnraidAlerts(host.ID)
	}

	// Check RAID arrays for degraded or failed state
	for _, array := range host.RAID {
		raidSpecResourceID := fmt.Sprintf("%s/raid:%s", hostResourceID(host.ID), sanitizeRAIDDevice(array.Device))

		// Skip Synology internal system arrays (md0/md1) which often report false positives.
		// DSM handles these differently and they're not user-facing storage arrays.
		deviceLower := strings.ToLower(strings.TrimPrefix(array.Device, "/dev/"))
		if deviceLower == "md0" || deviceLower == "md1" {
			// Still clear any existing alerts for these devices
			m.clearAlert(buildCanonicalStateID(raidSpecResourceID, raidSpecResourceID+"-health"))
			continue
		}

		// The same "host-<id>-raid-<device>" value serves as both the alert ID
		// and the alert's resource ID.
		raidResourceID := fmt.Sprintf("host-%s-raid-%s", host.ID, sanitizeRAIDDevice(array.Device))
		raidName := fmt.Sprintf("%s - %s (%s)", resourceName, array.Device, array.Level)

		raidMetadata := cloneMetadata(baseMetadata)
		raidMetadata["metric"] = "raid"
		raidMetadata["raidDevice"] = array.Device
		raidMetadata["raidLevel"] = array.Level
		raidMetadata["raidState"] = array.State
		raidMetadata["raidTotalDevices"] = array.TotalDevices
		raidMetadata["raidActiveDevices"] = array.ActiveDevices
		raidMetadata["raidFailedDevices"] = array.FailedDevices
		raidMetadata["raidSpareDevices"] = array.SpareDevices
		if array.UUID != "" {
			raidMetadata["raidUUID"] = array.UUID
		}
		if array.RebuildPercent > 0 {
			raidMetadata["raidRebuildPercent"] = array.RebuildPercent
		}

		assessment := storagehealth.AssessHostRAIDArray(array)
		// The second return value is deliberately ignored: only a non-nil
		// Transition on the result triggers the logging below.
		// NOTE(review): presumably it is an "evaluated" flag — confirm.
		result, _ := m.syncCanonicalHealthAssessmentAlert(canonicalHealthAssessmentAlertParams{
			SpecID:         raidSpecResourceID + "-health",
			Signal:         "host-raid",
			Codes:          raidAssessmentCodes,
			Reasons:        assessment.Reasons,
			AlertID:        raidResourceID,
			AlertType:      "raid",
			SpecResourceID: raidSpecResourceID,
			ResourceID:     raidResourceID,
			ResourceName:   raidName,
			ResourceType:   unifiedresources.ResourceTypeAgent,
			Node:           nodeName,
			Instance:       instanceName,
			Metadata:       raidMetadata,
			// Builds the alert message plus two numbers from the array state:
			// critical -> (failed device count, 0); warning -> (rebuild
			// percent, 100); anything else -> (0, 0).
			// NOTE(review): the two numbers are presumably value and
			// threshold — confirm against canonicalHealthAssessmentAlertParams.
			MessageBuilder: func(result alertspecs.EvaluationResult) (string, float64, float64) {
				message := strings.Join(storageHealthReasonSummaries(assessment.Reasons), "; ")
				switch result.State.Severity {
				case alertspecs.AlertSeverityCritical:
					return message, float64(array.FailedDevices), 0
				case alertspecs.AlertSeverityWarning:
					return message, array.RebuildPercent, 100
				default:
					return message, 0, 0
				}
			},
		})

		// Log only when the alert has just been activated, not on every cycle.
		if result.Transition != nil && result.Transition.Kind == alertspecs.EvaluationTransitionActivated {
			switch result.State.Severity {
			case alertspecs.AlertSeverityCritical:
				log.Error().
					Str("host", resourceName).
					Str("hostID", host.ID).
					Str("raidDevice", array.Device).
					Str("raidLevel", array.Level).
					Int("failedDevices", array.FailedDevices).
					Msg("CRITICAL: RAID array degraded")
			case alertspecs.AlertSeverityWarning:
				log.Warn().
					Str("host", resourceName).
					Str("hostID", host.ID).
					Str("raidDevice", array.Device).
					Str("raidLevel", array.Level).
					Float64("rebuildPercent", array.RebuildPercent).
					Msg("WARNING: RAID array rebuilding")
			}
		}
	}
}

// HandleHostOnline clears offline tracking and alerts for a host agent.
//
// A host with an empty ID is ignored. The pending offline confirmation entry
// for the host is removed under the write lock; the connectivity alert is then
// cleared, after the lock has been released, only if it was active.
func (m *Manager) HandleHostOnline(host models.Host) {
	if host.ID == "" {
		return
	}

	key := hostResourceID(host.ID)
	connectivityAlertID := canonicalConnectivityStateID(key)

	m.mu.Lock()
	delete(m.offlineConfirmations, key)
	wasOffline := m.hasActiveAlertNoLock(connectivityAlertID)
	m.mu.Unlock()

	if !wasOffline {
		return
	}
	m.clearAlert(connectivityAlertID)
}

// HandleHostRemoved tears down alert state for a deleted host agent: its
// hostname registration, offline tracking, and the connectivity, metric, disk,
// RAID and Unraid alerts tied to the host ID. Hosts without an ID are ignored.
func (m *Manager) HandleHostRemoved(host models.Host) {
	if host.ID == "" {
		return
	}

	// The agent is going away, so its hostname no longer needs to stay registered.
	if hostname := host.Hostname; hostname != "" {
		m.UnregisterHostAgentHostname(hostname)
	}

	// Reuse the online path to drop offline tracking and the connectivity alert.
	m.HandleHostOnline(host)

	hostID := host.ID
	m.clearHostMetricAlerts(hostID)
	m.clearHostDiskAlerts(hostID)
	m.clearHostRAIDAlerts(hostID)
	m.clearHostUnraidAlerts(hostID)
}

// HandleHostOffline raises an alert when a host agent stops reporting.
//
// The call is ignored for hosts without an ID and when alerting is disabled.
// When offline alerts are disabled globally (DisableAllAgentsOffline) or for
// this resource (Disabled / DisableConnectivity), the offline tracking entry and
// any existing connectivity alert are cleared instead. Otherwise a canonical
// connectivity spec is evaluated with "not connected" evidence; once the
// evaluation reports an activation transition, the host's CPU/memory, disk and
// RAID alerts are cleared and a critical log line is emitted.
func (m *Manager) HandleHostOffline(host models.Host) {
	if host.ID == "" {
		return
	}

	// Unregister the host agent hostname since it's no longer actively monitoring.
	// This allows node alerts to resume if a Proxmox node with the same hostname exists.
	if host.Hostname != "" {
		m.UnregisterHostAgentHostname(host.Hostname)
	}

	// Snapshot the configuration under the read lock; the lock is released
	// before any alert is cleared or evaluated.
	m.mu.RLock()
	if !m.config.Enabled {
		m.mu.RUnlock()
		return
	}
	disableHostsOffline := m.config.DisableAllAgentsOffline
	thresholds := m.resolveResourceThresholds("agent", host.ID)
	m.mu.RUnlock()

	resourceKey := hostResourceID(host.ID)
	alertID := canonicalConnectivityStateID(resourceKey)
	resourceName := hostDisplayName(host)
	nodeName := strings.TrimSpace(host.Hostname)
	instanceName := hostInstanceName(host)

	// Global switch: agent offline alerts are turned off for everyone.
	if disableHostsOffline {
		m.mu.Lock()
		delete(m.offlineConfirmations, resourceKey)
		m.mu.Unlock()
		m.clearAlert(alertID)
		return
	}

	// Per-resource switch: this host has alerts or connectivity alerts disabled.
	if thresholds.Disabled || thresholds.DisableConnectivity {
		m.clearAlert(alertID)
		m.mu.Lock()
		delete(m.offlineConfirmations, resourceKey)
		m.mu.Unlock()
		return
	}

	// NOTE(review): the literal 3 presumably is the number of consecutive
	// confirmations required; it matches the "required" value logged below.
	spec, err := buildCanonicalConnectivitySpec(resourceKey, resourceName, unifiedresources.ResourceTypeAgent, AlertLevelCritical, 3, false)
	if err != nil {
		log.Warn().
			Err(err).
			Str("host", resourceName).
			Str("hostID", host.ID).
			Msg("Skipping invalid canonical host connectivity spec")
		return
	}

	result, ok := m.evaluateCanonicalLifecycleAlert(canonicalLifecycleAlertParams{
		Spec: spec,
		Evidence: alertspecs.AlertEvidence{
			ObservedAt: time.Now(),
			Connectivity: &alertspecs.ConnectivityEvidence{
				Signal:    "status",
				Connected: false,
			},
		},
		Tracking:     m.offlineConfirmations,
		TrackingKey:  resourceKey,
		AlertID:      alertID,
		AlertType:    "host-offline",
		ResourceID:   resourceKey,
		ResourceName: resourceName,
		Node:         nodeName,
		Instance:     instanceName,
		Message:      fmt.Sprintf("Host '%s' is offline", resourceName),
		Metadata: map[string]interface{}{
			"resourceType": "agent",
			"hostId":       host.ID,
			"hostname":     host.Hostname,
			"displayName":  host.DisplayName,
			"platform":     host.Platform,
			"osName":       host.OSName,
			"osVersion":    host.OSVersion,
		},
		AddToRecent:   true,
		AddToHistory:  true,
		RateLimit:     true,
		DispatchAsync: false,
	})
	if !ok {
		return
	}
	// Still pending: record progress at debug level and wait for more reports.
	if result.State.State == alertspecs.AlertStatePending {
		log.Debug().
			Str("host", resourceName).
			Str("hostID", host.ID).
			Int("confirmations", result.State.ConsecutiveMatches).
			Int("required", 3).
			Msg("Host agent appears offline, awaiting confirmation")
		return
	}
	// Only the evaluation that reports the activation transition performs the cleanup below.
	if result.Transition == nil || result.Transition.Kind != alertspecs.EvaluationTransitionActivated {
		return
	}

	// Host is confirmed offline. Clear all host-scoped metrics and storage-health alerts
	// so the connectivity alert becomes the only active signal for this agent.
	// NOTE(review): RAID alerts are matched here by alert-ID prefix
	// "host-<id>-raid-" while clearHostRAIDAlerts matches on alert Type and
	// ResourceID/CanonicalSpecID prefix, and the Unraid array alert is not
	// cleared here - confirm both are intended.
	m.mu.Lock()
	for _, mt := range []string{"cpu", "memory"} {
		m.clearAlertNoLock(canonicalMetricStateID(resourceKey, mt))
	}

	diskResourcePrefixes := []string{
		fmt.Sprintf("%s/disk:", resourceKey),
	}
	raidAlertPrefix := fmt.Sprintf("host-%s-raid-", host.ID)
	// Collect matching keys first, then clear them once the scan is complete.
	var alertsToClear []string
	for activeAlertID, a := range m.activeAlerts {
		if a == nil {
			continue
		}
		matchesDiskPrefix := false
		for _, diskResourcePrefix := range diskResourcePrefixes {
			if strings.HasPrefix(a.ResourceID, diskResourcePrefix) {
				matchesDiskPrefix = true
				break
			}
		}
		if matchesDiskPrefix || strings.HasPrefix(activeAlertID, raidAlertPrefix) {
			alertsToClear = append(alertsToClear, activeAlertID)
		}
	}
	for _, staleAlertID := range alertsToClear {
		m.clearAlertNoLock(staleAlertID)
	}
	m.mu.Unlock()

	log.Error().
		Str("host", resourceName).
		Str("hostID", host.ID).
		Str("hostname", host.Hostname).
		Msg("CRITICAL: Host agent is offline")
}

// clearHostMetricAlerts clears the canonical metric alerts of a host agent.
// When no metrics are named, the "cpu" and "memory" alerts are cleared.
// An empty hostID is a no-op.
func (m *Manager) clearHostMetricAlerts(hostID string, metrics ...string) {
	if hostID == "" {
		return
	}
	if len(metrics) == 0 {
		metrics = []string{"cpu", "memory"}
	}

	resourceID := hostResourceID(hostID)
	for _, name := range metrics {
		m.clearAlert(canonicalMetricStateID(resourceID, name))
	}
}

// clearHostDiskAlerts clears every active alert whose ResourceID lies under the
// host's "<host resource ID>/disk:" namespace. An empty hostID is a no-op.
// The manager lock is held for the whole scan.
func (m *Manager) clearHostDiskAlerts(hostID string) {
	if hostID == "" {
		return
	}

	diskPrefix := fmt.Sprintf("%s/disk:", hostResourceID(hostID))

	m.mu.Lock()
	defer m.mu.Unlock()

	for storageKey, alert := range m.activeAlerts {
		// Check for nil before handing the entry to any helper, and resolve
		// the alert ID only for entries that are actually going to be cleared.
		if alert == nil || !strings.HasPrefix(alert.ResourceID, diskPrefix) {
			continue
		}
		m.clearAlertNoLock(effectiveAlertID(alert, storageKey))
	}
}

// cleanupHostDiskAlerts clears active disk alerts of a host whose ResourceID is
// not present in seen. Only alerts under the host's "<host resource ID>/disk:"
// namespace are considered; a host without an ID is a no-op.
// The manager lock is held for the whole scan.
func (m *Manager) cleanupHostDiskAlerts(host models.Host, seen map[string]struct{}) {
	if host.ID == "" {
		return
	}

	diskPrefix := fmt.Sprintf("%s/disk:", hostResourceID(host.ID))

	m.mu.Lock()
	defer m.mu.Unlock()

	for storageKey, alert := range m.activeAlerts {
		// Check for nil before handing the entry to any helper, and resolve
		// the alert ID only for entries that are actually going to be cleared.
		if alert == nil || !strings.HasPrefix(alert.ResourceID, diskPrefix) {
			continue
		}
		// Resources listed in seen keep their alerts.
		if _, stillPresent := seen[alert.ResourceID]; stillPresent {
			continue
		}
		m.clearAlertNoLock(effectiveAlertID(alert, storageKey))
	}
}

// syncHostSMARTDiskRiskAlerts assesses a host disk's SMART data and syncs two
// alerts from the result: "disk-health" and "disk-wearout", each fed with its
// own subset of the assessment reasons.
func (m *Manager) syncHostSMARTDiskRiskAlerts(host models.Host, disk models.HostDiskSMART, resourceID, resourceName, nodeName, instanceName string, baseMetadata map[string]interface{}) {
	health, wear := splitSMARTAlertReasons(storagehealth.AssessHostSMARTDisk(disk).Reasons)

	m.syncHostSMARTDiskAlert(host, disk, resourceID, resourceName, nodeName, instanceName, baseMetadata, "disk-health", health)
	m.syncHostSMARTDiskAlert(host, disk, resourceID, resourceName, nodeName, instanceName, baseMetadata, "disk-wearout", wear)
}

// splitSMARTAlertReasons partitions SMART assessment reasons into health
// reasons (first result) and wear-out reasons (second result).
// Reasons that are neither warning nor critical are dropped, as are
// "temperature_high" reasons. Both results are non-nil.
func splitSMARTAlertReasons(reasons []storagehealth.Reason) ([]storagehealth.Reason, []storagehealth.Reason) {
	health := make([]storagehealth.Reason, 0, len(reasons))
	wear := make([]storagehealth.Reason, 0, len(reasons))

	for _, r := range reasons {
		actionable := r.Severity == storagehealth.RiskWarning || r.Severity == storagehealth.RiskCritical
		if !actionable || r.Code == "temperature_high" {
			continue
		}
		if r.Code == "wearout_low" || r.Code == "nvme_available_spare_low" || r.Code == "nvme_percentage_used_high" {
			wear = append(wear, r)
		} else {
			health = append(health, r)
		}
	}

	return health, wear
}

// Reason-code sets handed to canonical health-assessment specs as their Codes.
var (
	// smartHealthAssessmentCodes is used for SMART alerts other than "disk-wearout".
	smartHealthAssessmentCodes = []string{
		"health_status",
		"pending_sectors",
		"offline_uncorrectable",
		"media_errors",
		"reallocated_sectors",
	}
	// smartWearoutAssessmentCodes is used for "disk-wearout" SMART alerts.
	smartWearoutAssessmentCodes = []string{
		"wearout_low",
		"nvme_available_spare_low",
		"nvme_percentage_used_high",
	}
	// raidAssessmentCodes is used for host RAID array alerts.
	raidAssessmentCodes = []string{
		"raid_degraded",
		"raid_unavailable",
		"raid_rebuilding",
	}
	// zfsPoolAssessmentCodes: presumably for ZFS pool state alerts (usage not visible here).
	zfsPoolAssessmentCodes = []string{
		"zfs_pool_state",
	}
	// zfsPoolErrorAssessmentCodes: presumably for ZFS pool error alerts (usage not visible here).
	zfsPoolErrorAssessmentCodes = []string{
		"zfs_pool_errors",
	}
	// zfsDeviceAssessmentCodes lists the reason codes produced by zfsDeviceAssessment.
	zfsDeviceAssessmentCodes = []string{
		"zfs_device_state",
		"zfs_device_errors",
	}
)

// canonicalHealthAssessmentAlertParams bundles the inputs of
// syncCanonicalHealthAssessmentAlert.
type canonicalHealthAssessmentAlertParams struct {
	// SpecID, together with SpecResourceID, identifies the canonical spec and
	// the state ID that is cleared when Reasons is empty.
	SpecID         string
	// Signal is passed to the spec and repeated in the evaluation evidence.
	Signal         string
	// Codes is the set of reason codes handed to the spec builder.
	Codes          []string
	// Reasons drive the evidence severity, evidence codes and default message.
	Reasons        []storagehealth.Reason
	// AlertID and AlertType are forwarded to the stateful alert evaluation.
	AlertID        string
	AlertType      string
	// SpecResourceID is the resource ID used for the spec; ResourceID is the
	// one forwarded to the alert evaluation.
	SpecResourceID string
	ResourceID     string
	ResourceName   string
	ResourceType   unifiedresources.ResourceType
	Node           string
	Instance       string
	// Metadata is cloned before it is forwarded to the alert evaluation.
	Metadata       map[string]interface{}
	// Disabled is handed to the spec builder.
	Disabled       bool
	// MessageBuilder is optional and forwarded as-is; callers in this file
	// return a message plus two float64 values from it.
	MessageBuilder func(alertspecs.EvaluationResult) (string, float64, float64)
}

// storageHealthReasonCodes returns the distinct, whitespace-trimmed reason
// codes in ascending order. Blank codes are skipped and the result is never nil.
func storageHealthReasonCodes(reasons []storagehealth.Reason) []string {
	codes := make([]string, 0, len(reasons))
	for i := range reasons {
		if code := strings.TrimSpace(reasons[i].Code); code != "" {
			codes = append(codes, code)
		}
	}

	// After sorting, duplicates sit next to each other and can be compacted away.
	slices.Sort(codes)
	return slices.Compact(codes)
}

// storageHealthReasonSummaries returns the whitespace-trimmed summaries of the
// given reasons in input order, skipping blank ones. The result is never nil.
func storageHealthReasonSummaries(reasons []storagehealth.Reason) []string {
	out := make([]string, 0, len(reasons))
	for i := range reasons {
		if text := strings.TrimSpace(reasons[i].Summary); text != "" {
			out = append(out, text)
		}
	}
	return out
}

// storageHealthAssessmentSeverity maps reasons to the highest alert severity
// they contain: critical if any reason is critical, otherwise warning if any
// reason is a warning, otherwise the empty severity.
func storageHealthAssessmentSeverity(reasons []storagehealth.Reason) alertspecs.AlertSeverity {
	sawWarning := false
	for i := range reasons {
		if reasons[i].Severity == storagehealth.RiskCritical {
			// Nothing outranks critical, so stop scanning.
			return alertspecs.AlertSeverityCritical
		}
		if reasons[i].Severity == storagehealth.RiskWarning {
			sawWarning = true
		}
	}
	if sawWarning {
		return alertspecs.AlertSeverityWarning
	}
	return alertspecs.AlertSeverity("")
}

// activeAlertValue returns the Value of the active alert with the given ID.
// The second result is false when no such alert is found.
func (m *Manager) activeAlertValue(alertID string) (float64, bool) {
	m.mu.RLock()
	defer m.mu.RUnlock()

	if alert, found := m.getActiveAlertNoLock(alertID); found && alert != nil {
		return alert.Value, true
	}
	return 0, false
}

// filterStorageHealthReasonsByCodes keeps the reasons whose trimmed Code is one
// of codes (also trimmed; blank entries ignored), preserving input order.
// It returns nil when either input is empty and a non-nil slice otherwise.
func filterStorageHealthReasonsByCodes(reasons []storagehealth.Reason, codes []string) []storagehealth.Reason {
	if len(reasons) == 0 || len(codes) == 0 {
		return nil
	}

	wanted := make(map[string]struct{}, len(codes))
	for _, raw := range codes {
		if trimmed := strings.TrimSpace(raw); trimmed != "" {
			wanted[trimmed] = struct{}{}
		}
	}

	kept := make([]storagehealth.Reason, 0, len(reasons))
	for i := range reasons {
		_, match := wanted[strings.TrimSpace(reasons[i].Code)]
		if !match {
			continue
		}
		kept = append(kept, reasons[i])
	}
	return kept
}

// zfsDeviceAssessment derives a storage-health assessment for one ZFS device.
//
// The device state (trimmed, upper-cased) contributes a "zfs_device_state"
// reason: none for "", "ONLINE" or "SPARE", a warning for "DEGRADED", and a
// critical reason for any other state. A non-zero sum of read, write and
// checksum errors adds a "zfs_device_errors" warning. The assessment level is
// the highest severity among the recorded reasons, or healthy if there are none.
func zfsDeviceAssessment(device models.ZFSDevice) storagehealth.Assessment {
	assessment := storagehealth.Assessment{Level: storagehealth.RiskHealthy}
	// addReason records a reason and raises the overall level; a warning never
	// downgrades an already critical assessment. Blank summaries are ignored.
	addReason := func(code string, severity storagehealth.RiskLevel, summary string) {
		if strings.TrimSpace(summary) == "" {
			return
		}
		assessment.Reasons = append(assessment.Reasons, storagehealth.Reason{
			Code:     code,
			Severity: severity,
			Summary:  summary,
		})
		switch severity {
		case storagehealth.RiskCritical:
			assessment.Level = storagehealth.RiskCritical
		case storagehealth.RiskWarning:
			if assessment.Level != storagehealth.RiskCritical {
				assessment.Level = storagehealth.RiskWarning
			}
		}
	}

	state := strings.ToUpper(strings.TrimSpace(device.State))
	switch state {
	case "", "ONLINE", "SPARE":
		// These states contribute no reason.
	case "DEGRADED":
		addReason("zfs_device_state", storagehealth.RiskWarning, fmt.Sprintf("ZFS device %s is DEGRADED", device.Name))
	default:
		addReason("zfs_device_state", storagehealth.RiskCritical, fmt.Sprintf("ZFS device %s is %s", device.Name, state))
	}

	// Named totalErrors rather than errors so the imported errors package is
	// not shadowed inside this function.
	totalErrors := device.ReadErrors + device.WriteErrors + device.ChecksumErrors
	if totalErrors > 0 {
		addReason(
			"zfs_device_errors",
			storagehealth.RiskWarning,
			fmt.Sprintf("ZFS device %s has errors: %d read, %d write, %d checksum", device.Name, device.ReadErrors, device.WriteErrors, device.ChecksumErrors),
		)
	}

	return assessment
}

// syncCanonicalHealthAssessmentAlert reconciles one canonical health-assessment
// alert with the supplied reasons.
//
// With no reasons, the canonical state ID built from SpecResourceID and SpecID
// is cleared and (zero result, true) is returned. Otherwise a spec is built; if
// that fails a warning is logged and (zero result, false) is returned. On
// success the evaluation result of evaluateCanonicalStatefulAlert is returned.
func (m *Manager) syncCanonicalHealthAssessmentAlert(params canonicalHealthAssessmentAlertParams) (alertspecs.EvaluationResult, bool) {
	reasons := params.Reasons
	if len(reasons) == 0 {
		m.clearAlert(buildCanonicalStateID(params.SpecResourceID, params.SpecID))
		return alertspecs.EvaluationResult{}, true
	}

	spec, err := buildCanonicalHealthAssessmentSpec(
		params.SpecID,
		params.SpecResourceID,
		params.ResourceName,
		params.ResourceType,
		params.Signal,
		params.Codes,
		params.Disabled,
	)
	if err != nil {
		log.Warn().
			Err(err).
			Str("alertID", params.AlertID).
			Str("resourceID", params.SpecResourceID).
			Msg("Skipping invalid canonical health assessment spec")
		return alertspecs.EvaluationResult{}, false
	}

	// The evidence carries the worst severity and the distinct codes of the reasons.
	evidence := alertspecs.AlertEvidence{
		ObservedAt: time.Now(),
		HealthAssessment: &alertspecs.HealthAssessmentEvidence{
			Signal:   params.Signal,
			Severity: storageHealthAssessmentSeverity(reasons),
			Codes:    storageHealthReasonCodes(reasons),
		},
	}
	defaultMessage := strings.Join(storageHealthReasonSummaries(reasons), "; ")

	return m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
		Spec:           spec,
		Evidence:       evidence,
		AlertID:        params.AlertID,
		AlertType:      params.AlertType,
		ResourceID:     params.ResourceID,
		ResourceName:   params.ResourceName,
		Node:           params.Node,
		Instance:       params.Instance,
		Message:        defaultMessage,
		Metadata:       cloneMetadata(params.Metadata),
		AddToRecent:    true,
		AddToHistory:   true,
		MessageBuilder: params.MessageBuilder,
	})
}

// syncHostSMARTDiskAlert syncs a single SMART alert ("disk-health" or
// "disk-wearout", per alertType) for one host disk from the given reasons.
// The alert ID has the form "host-<hostID>-<alertType>-<disk part of resourceID>".
// The evaluation result is intentionally discarded.
func (m *Manager) syncHostSMARTDiskAlert(host models.Host, disk models.HostDiskSMART, resourceID, resourceName, nodeName, instanceName string, baseMetadata map[string]interface{}, alertType string, reasons []storagehealth.Reason) {
	diskSuffix := strings.TrimPrefix(resourceID, hostResourceID(host.ID)+"/disk:")
	alertID := fmt.Sprintf("host-%s-%s-%s", host.ID, alertType, diskSuffix)

	// Enrich a copy of the base metadata with disk identity and risk details.
	metadata := cloneMetadata(baseMetadata)
	metadata["riskCodes"] = storageHealthReasonCodes(reasons)
	metadata["riskSummaries"] = storageHealthReasonSummaries(reasons)
	metadata["metric"] = alertType
	metadata["device"] = disk.Device
	metadata["model"] = disk.Model
	metadata["serial"] = disk.Serial
	metadata["wwn"] = disk.WWN
	metadata["diskHealth"] = disk.Health
	if disk.Temperature > 0 {
		metadata["temperature"] = disk.Temperature
	}

	var specCodes []string
	switch alertType {
	case "disk-wearout":
		specCodes = smartWearoutAssessmentCodes
	default:
		specCodes = smartHealthAssessmentCodes
	}

	_, _ = m.syncCanonicalHealthAssessmentAlert(canonicalHealthAssessmentAlertParams{
		SpecID:         resourceID + "-" + alertType,
		Signal:         "host-smart",
		Codes:          specCodes,
		Reasons:        reasons,
		AlertID:        alertID,
		AlertType:      alertType,
		SpecResourceID: resourceID,
		ResourceID:     resourceID,
		ResourceName:   resourceName,
		ResourceType:   unifiedresources.ResourceTypeAgent,
		Node:           nodeName,
		Instance:       instanceName,
		Metadata:       metadata,
	})
}

// clearHostRAIDAlerts clears every active alert of type "raid" whose ResourceID
// or CanonicalSpecID lies under the host's "<host resource ID>/raid:" namespace.
// An empty hostID is a no-op.
func (m *Manager) clearHostRAIDAlerts(hostID string) {
	if hostID == "" {
		return
	}

	raidPrefix := hostResourceID(hostID) + "/raid:"

	m.mu.Lock()
	defer m.mu.Unlock()

	for key, active := range m.activeAlerts {
		if active == nil {
			continue
		}
		if active.Type != "raid" {
			continue
		}
		belongsToHost := strings.HasPrefix(active.ResourceID, raidPrefix) ||
			strings.HasPrefix(active.CanonicalSpecID, raidPrefix)
		if !belongsToHost {
			continue
		}
		m.clearAlertNoLock(key)
	}
}

// clearHostUnraidAlerts clears the canonical Unraid array health alert of a
// host agent. An empty hostID is a no-op.
func (m *Manager) clearHostUnraidAlerts(hostID string) {
	if hostID == "" {
		return
	}

	arrayResourceID := fmt.Sprintf("%s/storage:unraid-array", hostResourceID(hostID))
	stateID := buildCanonicalStateID(arrayResourceID, arrayResourceID+"-health")
	m.clearAlert(stateID)
}

// syncHostUnraidStorageAlert syncs the "storage-topology" alert for a host's
// Unraid array. When the host reports no Unraid data, the Unraid alert is
// cleared. Otherwise the array is assessed and only warning/critical reasons
// are forwarded. The evaluation result is intentionally discarded.
func (m *Manager) syncHostUnraidStorageAlert(host models.Host, nodeName, instanceName, resourceName string, baseMetadata map[string]interface{}) {
	unraid := host.Unraid
	if unraid == nil {
		m.clearHostUnraidAlerts(host.ID)
		return
	}

	assessed := storagehealth.AssessUnraidStorage(*unraid).Reasons
	actionable := make([]storagehealth.Reason, 0, len(assessed))
	for _, r := range assessed {
		switch r.Severity {
		case storagehealth.RiskWarning, storagehealth.RiskCritical:
			actionable = append(actionable, r)
		}
	}

	arrayResourceID := fmt.Sprintf("%s/storage:unraid-array", hostResourceID(host.ID))

	// Enrich a copy of the base metadata with the array's topology and risk details.
	metadata := cloneMetadata(baseMetadata)
	metadata["riskCodes"] = storageHealthReasonCodes(actionable)
	metadata["riskSummaries"] = storageHealthReasonSummaries(actionable)
	metadata["metric"] = "storageTopology"
	metadata["storagePlatform"] = "unraid"
	metadata["storageTopology"] = "array"
	metadata["arrayState"] = unraid.ArrayState
	metadata["syncAction"] = unraid.SyncAction
	metadata["syncProgress"] = unraid.SyncProgress
	metadata["numProtected"] = unraid.NumProtected
	metadata["numDisabled"] = unraid.NumDisabled
	metadata["numInvalid"] = unraid.NumInvalid
	metadata["numMissing"] = unraid.NumMissing

	_, _ = m.syncCanonicalHealthAssessmentAlert(canonicalHealthAssessmentAlertParams{
		SpecID:         arrayResourceID + "-health",
		Signal:         "unraid-storage",
		Reasons:        actionable,
		AlertID:        fmt.Sprintf("host-%s-unraid-array", host.ID),
		AlertType:      "storage-topology",
		SpecResourceID: arrayResourceID,
		ResourceID:     arrayResourceID,
		ResourceName:   fmt.Sprintf("%s - Unraid Array", resourceName),
		ResourceType:   unifiedresources.ResourceTypeAgent,
		Node:           nodeName,
		Instance:       instanceName,
		Metadata:       metadata,
	})
}

// isPBSOffline reports whether a PBS instance counts as offline: its status is
// "offline" or its connection health is "error", "failed" or "unhealthy".
// Both fields are compared after trimming and lower-casing.
func isPBSOffline(pbs models.PBSInstance) bool {
	if strings.ToLower(strings.TrimSpace(pbs.Status)) == "offline" {
		return true
	}
	switch strings.ToLower(strings.TrimSpace(pbs.ConnectionHealth)) {
	case "error", "failed", "unhealthy":
		return true
	}
	return false
}

// clearPBSMetricAlerts clears the canonical CPU and memory alerts of a PBS
// instance. A blank pbsID is a no-op.
func (m *Manager) clearPBSMetricAlerts(pbsID string) {
	if strings.TrimSpace(pbsID) == "" {
		return
	}

	for _, metric := range [...]string{"cpu", "memory"} {
		m.clearAlert(canonicalMetricStateID(pbsID, metric))
	}
}

// isPMGOffline reports whether a PMG instance counts as offline: its status is
// "offline" or its connection health is "error", "failed" or "unhealthy".
// Both fields are compared after trimming and lower-casing.
func isPMGOffline(pmg models.PMGInstance) bool {
	if strings.ToLower(strings.TrimSpace(pmg.Status)) == "offline" {
		return true
	}
	switch strings.ToLower(strings.TrimSpace(pmg.ConnectionHealth)) {
	case "error", "failed", "unhealthy":
		return true
	}
	return false
}

// clearPMGMetricAlerts clears every active alert whose ResourceID equals the
// trimmed pmgID, except the instance's connectivity alert. Matching keys are
// collected under the read lock and cleared after it is released.
// A blank pmgID is a no-op.
func (m *Manager) clearPMGMetricAlerts(pmgID string) {
	pmgID = strings.TrimSpace(pmgID)
	if pmgID == "" {
		return
	}

	connectivityID := canonicalConnectivityStateID(pmgID)

	var pending []string
	m.mu.RLock()
	for key, active := range m.activeAlerts {
		if key == connectivityID || active == nil || active.ResourceID != pmgID {
			continue
		}
		pending = append(pending, key)
	}
	m.mu.RUnlock()

	for _, key := range pending {
		m.clearAlert(key)
	}
}

// CheckPBS checks PBS instance metrics against thresholds.
//
// Order of evaluation:
//  1. Alerts disabled globally: return without touching anything.
//  2. DisableAllPBS or a per-instance Disabled threshold: clear the CPU, memory
//     and offline alerts plus the offline tracking entry, then return.
//  3. Offline handling: either cleared (when offline alerts are disabled
//     globally or for this instance) or evaluated via checkPBSOffline /
//     clearPBSOfflineAlert.
//  4. If the instance is offline, its metric alerts are cleared and evaluation
//     stops; otherwise CPU and memory are evaluated as unified metrics.
func (m *Manager) CheckPBS(pbs models.PBSInstance) {
	m.mu.RLock()
	if !m.config.Enabled {
		m.mu.RUnlock()
		return
	}
	if m.config.DisableAllPBS {
		// The read lock is released before the write lock is taken.
		m.mu.RUnlock()
		// Clear any existing PBS alerts when all PBS alerts are disabled
		m.mu.Lock()
		// Reset offline confirmation tracking
		delete(m.offlineConfirmations, pbs.ID)
		// Clear CPU alert
		cpuAlertID := canonicalMetricStateID(pbs.ID, "cpu")
		if m.clearActiveAlertIfPresentNoLock(cpuAlertID) {
			log.Info().
				Str("alertID", cpuAlertID).
				Str("pbs", pbs.Name).
				Msg("Cleared CPU alert - all PBS alerts disabled")
		}
		// Clear Memory alert
		memAlertID := canonicalMetricStateID(pbs.ID, "memory")
		if m.clearActiveAlertIfPresentNoLock(memAlertID) {
			log.Info().
				Str("alertID", memAlertID).
				Str("pbs", pbs.Name).
				Msg("Cleared Memory alert - all PBS alerts disabled")
		}
		// Clear offline alert
		offlineAlertID := canonicalConnectivityStateID(pbs.ID)
		if m.clearActiveAlertIfPresentNoLock(offlineAlertID) {
			log.Info().
				Str("alertID", offlineAlertID).
				Str("pbs", pbs.Name).
				Msg("Cleared offline alert - all PBS alerts disabled")
		}
		m.mu.Unlock()
		return
	}

	// Snapshot the remaining configuration while still holding the read lock.
	thresholds := m.resolveResourceThresholds("pbs", pbs.ID)
	disablePBSOffline := m.config.DisableAllPBSOffline
	m.mu.RUnlock()

	// Check override disable BEFORE offline detection to prevent spurious notifications
	if thresholds.Disabled {
		// Same clearing as the global-disable branch above, logged at debug level.
		m.mu.Lock()
		// Reset offline confirmation tracking
		delete(m.offlineConfirmations, pbs.ID)
		// Clear CPU alert
		cpuAlertID := canonicalMetricStateID(pbs.ID, "cpu")
		if m.clearActiveAlertIfPresentNoLock(cpuAlertID) {
			log.Debug().
				Str("alertID", cpuAlertID).
				Str("pbs", pbs.Name).
				Msg("Cleared CPU alert - PBS has alerts disabled")
		}
		// Clear Memory alert
		memAlertID := canonicalMetricStateID(pbs.ID, "memory")
		if m.clearActiveAlertIfPresentNoLock(memAlertID) {
			log.Debug().
				Str("alertID", memAlertID).
				Str("pbs", pbs.Name).
				Msg("Cleared Memory alert - PBS has alerts disabled")
		}
		// Clear offline alert
		offlineAlertID := canonicalConnectivityStateID(pbs.ID)
		if m.clearActiveAlertIfPresentNoLock(offlineAlertID) {
			log.Debug().
				Str("alertID", offlineAlertID).
				Str("pbs", pbs.Name).
				Msg("Cleared offline alert - PBS has alerts disabled")
		}
		m.mu.Unlock()
		return
	}

	pbsOffline := isPBSOffline(pbs)

	if disablePBSOffline || thresholds.DisableConnectivity {
		// Clear tracking and any existing offline alerts when offline alerting
		// is disabled globally or for this instance
		m.mu.Lock()
		delete(m.offlineConfirmations, pbs.ID)
		m.mu.Unlock()
		m.clearAlert(canonicalConnectivityStateID(pbs.ID))
	} else {
		// Check if PBS is offline first (similar to nodes)
		if pbsOffline {
			m.checkPBSOffline(pbs)
		} else {
			// Clear any existing offline alert if PBS is back online
			m.clearPBSOfflineAlert(pbs)
		}
	}

	// When PBS is offline/unhealthy, clear stale metric alerts immediately.
	// This happens even when offline alerting itself is disabled.
	if pbsOffline {
		m.clearPBSMetricAlerts(pbs.ID)
		return
	}

	// Online: evaluate CPU and memory percentages against the resolved thresholds.
	m.evaluateUnifiedMetrics(&UnifiedResourceInput{
		ID:       pbs.ID,
		Type:     "pbs",
		Name:     pbs.Name,
		Node:     pbs.Host,
		Instance: pbs.Name,
		CPU:      &UnifiedResourceMetric{Percent: pbs.CPU},
		Memory:   &UnifiedResourceMetric{Percent: pbs.Memory},
	}, thresholds, nil)
}

// CheckPMG checks a Proxmox Mail Gateway instance against thresholds.
//
// Order of evaluation:
//  1. Alerts disabled globally: return without touching anything.
//  2. DisableAllPMG or a per-instance override with Disabled set: clear the
//     instance's alerts and offline tracking, then return.
//  3. Offline handling: cleared when DisableAllPMGOffline is set, otherwise
//     evaluated via checkPMGOffline / clearPMGOfflineAlert.
//  4. If the instance is offline, its remaining alerts are cleared and
//     evaluation stops; otherwise the queue, message-age, quarantine, anomaly
//     and per-node queue checks run with the PMG defaults.
//
// NOTE(review): unlike CheckPBS, this reads m.config.Overrides directly rather
// than resolveResourceThresholds and has no per-instance DisableConnectivity
// check - confirm that is intended.
func (m *Manager) CheckPMG(pmg models.PMGInstance) {
	m.mu.RLock()
	if !m.config.Enabled {
		m.mu.RUnlock()
		return
	}
	if m.config.DisableAllPMG {
		// The read lock is released before the write lock is taken.
		m.mu.RUnlock()
		// Clear any existing PMG alerts when all PMG alerts are disabled.
		m.mu.Lock()
		delete(m.offlineConfirmations, pmg.ID)
		m.mu.Unlock()
		m.clearPMGMetricAlerts(pmg.ID)
		m.clearAlert(canonicalConnectivityStateID(pmg.ID))
		return
	}

	// Check if there's an override for this PMG instance
	override, hasOverride := m.config.Overrides[pmg.ID]
	disablePMGOffline := m.config.DisableAllPMGOffline
	pmgDefaults := m.config.PMGDefaults
	m.mu.RUnlock()

	// Check override disable BEFORE offline detection to prevent spurious notifications
	if hasOverride && override.Disabled {
		m.mu.Lock()
		delete(m.offlineConfirmations, pmg.ID)
		m.mu.Unlock()
		m.clearPMGMetricAlerts(pmg.ID)
		m.clearAlert(canonicalConnectivityStateID(pmg.ID))
		return
	}

	pmgOffline := isPMGOffline(pmg)

	// Handle offline detection
	if disablePMGOffline {
		// Clear tracking and any existing offline alerts when globally disabled
		m.mu.Lock()
		delete(m.offlineConfirmations, pmg.ID)
		m.mu.Unlock()
		m.clearAlert(canonicalConnectivityStateID(pmg.ID))
	} else {
		// Check if PMG is offline (similar to PBS/nodes)
		if pmgOffline {
			m.checkPMGOffline(pmg)
		} else {
			// Clear any existing offline alert if PMG is back online
			m.clearPMGOfflineAlert(pmg)
		}
	}

	// When PMG is offline/unhealthy, clear stale metric alerts immediately.
	// This happens even when offline alerting itself is disabled.
	if pmgOffline {
		m.clearPMGMetricAlerts(pmg.ID)
		return
	}

	// Check queue depths across all nodes
	m.checkPMGQueueDepths(pmg, pmgDefaults)
	// Check oldest message age across all nodes
	m.checkPMGOldestMessage(pmg, pmgDefaults)
	// Check quarantine backlog and growth
	m.checkPMGQuarantineBacklog(pmg, pmgDefaults)
	// Check spam/virus rate anomalies
	m.checkPMGAnomalies(pmg, pmgDefaults)
	// Check per-node queue health
	m.checkPMGNodeQueues(pmg, pmgDefaults)
}

// dockerInstanceName returns the logical instance name used for Docker alerts:
// "Docker:<name>", where name is the trimmed display name or, failing that,
// the trimmed hostname. It returns plain "Docker" when both are blank.
func dockerInstanceName(host models.DockerHost) string {
	for _, candidate := range [...]string{host.DisplayName, host.Hostname} {
		if trimmed := strings.TrimSpace(candidate); trimmed != "" {
			return fmt.Sprintf("Docker:%s", trimmed)
		}
	}
	return "Docker"
}

// dockerContainerDisplayName normalizes the container name for alert
// readability: the name is trimmed and stripped of leading slashes. When that
// leaves nothing, the trimmed container ID is returned, cut to 12 bytes.
func dockerContainerDisplayName(container models.DockerContainer) string {
	if name := strings.TrimLeft(strings.TrimSpace(container.Name), "/"); name != "" {
		return name
	}

	shortID := strings.TrimSpace(container.ID)
	if len(shortID) > 12 {
		shortID = shortID[:12]
	}
	return shortID
}

// dockerResourceID builds a stable identifier for Docker container alerts from
// the trimmed host and container IDs:
//
//	both set       -> "docker:<host>/<container>"
//	host only      -> "docker:<host>"
//	container only -> "docker:container/<container>"
//	neither        -> "docker:unknown"
func dockerResourceID(hostID, containerID string) string {
	host, container := strings.TrimSpace(hostID), strings.TrimSpace(containerID)

	switch {
	case host == "" && container == "":
		return "docker:unknown"
	case container == "":
		return fmt.Sprintf("docker:%s", host)
	case host == "":
		return fmt.Sprintf("docker:container/%s", container)
	default:
		return fmt.Sprintf("docker:%s/%s", host, container)
	}
}

// normalizeDockerUpdateTrackingPart trims surrounding whitespace from part and
// lower-cases the result.
func normalizeDockerUpdateTrackingPart(part string) string {
	trimmed := strings.TrimSpace(part)
	return strings.ToLower(trimmed)
}

// dockerUpdateTrackingHostKey builds a stable host identity for Docker update
// timing. The first non-blank normalized field wins, in this order: agent ID,
// token ID, machine ID, hostname, ID, display name; each is prefixed with its
// kind. "unknown-host" is returned when all of them are blank.
func dockerUpdateTrackingHostKey(host models.DockerHost) string {
	candidates := [...]struct {
		prefix string
		value  string
	}{
		{"agent:", host.AgentID},
		{"token:", host.TokenID},
		{"machine:", host.MachineID},
		{"hostname:", host.Hostname},
		{"id:", host.ID},
		{"name:", host.DisplayName},
	}

	for _, c := range candidates {
		if normalized := normalizeDockerUpdateTrackingPart(c.value); normalized != "" {
			return c.prefix + normalized
		}
	}
	return "unknown-host"
}

// dockerUpdateTrackingContainerKey builds the container part of a Docker update
// tracking key. It prefers the normalized container ID ("id:"), then the
// normalized name with one leading slash removed ("name:"), then the normalized
// image ("image:"), and falls back to "unknown-container".
func dockerUpdateTrackingContainerKey(container models.DockerContainer) string {
	id := normalizeDockerUpdateTrackingPart(container.ID)
	if id != "" {
		return "id:" + id
	}

	name := strings.TrimPrefix(normalizeDockerUpdateTrackingPart(container.Name), "/")
	if name != "" {
		return "name:" + name
	}

	image := normalizeDockerUpdateTrackingPart(container.Image)
	if image != "" {
		return "image:" + image
	}

	return "unknown-container"
}

// dockerUpdateTrackingKey returns the update tracking key for a container on a
// host, in the form "docker-update:<host key>/<container key>".
func dockerUpdateTrackingKey(host models.DockerHost, container models.DockerContainer) string {
	hostKey := dockerUpdateTrackingHostKey(host)
	containerKey := dockerUpdateTrackingContainerKey(container)
	return fmt.Sprintf("docker-update:%s/%s", hostKey, containerKey)
}

// dockerUpdateTrackingHostPrefix returns the prefix shared by all update
// tracking keys of a host: "docker-update:<host key>/".
func dockerUpdateTrackingHostPrefix(host models.DockerHost) string {
	hostKey := dockerUpdateTrackingHostKey(host)
	return fmt.Sprintf("docker-update:%s/", hostKey)
}

// dockerServiceDisplayName normalizes the service name for alert readability.
// It returns the trimmed name when present, otherwise the trimmed service ID
// cut to 12 bytes, and "service" when both are blank.
func dockerServiceDisplayName(service models.DockerService) string {
	if name := strings.TrimSpace(service.Name); name != "" {
		return name
	}

	switch id := strings.TrimSpace(service.ID); {
	case id == "":
		return "service"
	case len(id) > 12:
		return id[:12]
	default:
		return id
	}
}

// dockerServiceResourceID builds the resource identifier for a Docker service.
//
// The trimmed serviceID is used when present. Otherwise a slug is derived from
// serviceName ("service" if blank): lower-cased, keeping a-z, 0-9, '-' and '_',
// turning space, '/', '\', ':' and '.' into '-', dropping everything else,
// trimming leading/trailing '-' and '_', falling back to "service" when empty,
// and finally cutting to 32 bytes.
//
// The result is "docker:<host>/service/<id>", or "docker-service:<id>" when the
// trimmed hostID is blank.
func dockerServiceResourceID(hostID, serviceID, serviceName string) string {
	host := strings.TrimSpace(hostID)
	id := strings.TrimSpace(serviceID)

	if id == "" {
		source := strings.TrimSpace(serviceName)
		if source == "" {
			source = "service"
		}

		// A negative return value tells strings.Map to drop the rune.
		slug := strings.Map(func(r rune) rune {
			switch {
			case r >= 'a' && r <= 'z', r >= '0' && r <= '9', r == '-', r == '_':
				return r
			case r == ' ', r == '/', r == '\\', r == ':', r == '.':
				return '-'
			default:
				return -1
			}
		}, strings.ToLower(source))

		id = strings.Trim(slug, "-_")
		if id == "" {
			id = "service"
		}
		// The slug is pure ASCII at this point, so byte slicing is safe.
		if len(id) > 32 {
			id = id[:32]
		}
	}

	if host != "" {
		return fmt.Sprintf("docker:%s/service/%s", host, id)
	}
	return fmt.Sprintf("docker-service:%s", id)
}

// matchesDockerIgnoredPrefix reports whether the container name or ID starts
// with any of the given prefixes. All values are compared after trimming and
// lower-casing; blank prefixes and blank name/ID values never match.
func matchesDockerIgnoredPrefix(name, id string, prefixes []string) bool {
	candidates := make([]string, 0, 2)
	for _, value := range [...]string{name, id} {
		if normalized := strings.ToLower(strings.TrimSpace(value)); normalized != "" {
			candidates = append(candidates, normalized)
		}
	}

	for _, raw := range prefixes {
		prefix := strings.ToLower(strings.TrimSpace(raw))
		if prefix == "" {
			continue
		}
		for _, candidate := range candidates {
			if strings.HasPrefix(candidate, prefix) {
				return true
			}
		}
	}

	return false
}

// CheckDockerHost evaluates Docker host telemetry and container metrics for alerts.
//
// A host without an ID is ignored. The host is first marked online; this
// happens even when alerting is disabled. If alerts are disabled globally or
// for all Docker hosts, nothing else is done. Otherwise every container is
// either evaluated or, when it matches an ignored prefix, has its alerts and
// tracking cleared; every service is evaluated; and finally alerts and update
// tracking for resources not seen in this report are cleaned up.
func (m *Manager) CheckDockerHost(host models.DockerHost) {
	if host.ID == "" {
		return
	}

	// Fresh telemetry marks the host as online and clears any offline alert.
	m.HandleDockerHostOnline(host)

	m.mu.RLock()
	alertsEnabled := m.config.Enabled
	disableAllDockerHosts := m.config.DisableAllDockerHosts
	// Copy the prefix list so it can be used after the lock is released.
	ignoredPrefixes := append([]string(nil), m.config.DockerIgnoredContainerPrefixes...)
	m.mu.RUnlock()
	if !alertsEnabled {
		return
	}
	if disableAllDockerHosts {
		return
	}

	// seen holds the resource IDs of evaluated containers and services;
	// seenUpdateTracking holds the update tracking keys of evaluated containers.
	// Both are handed to the cleanup call at the end.
	seen := make(map[string]struct{}, len(host.Containers)+len(host.Services))
	seenUpdateTracking := make(map[string]struct{}, len(host.Containers))
	for _, container := range host.Containers {
		containerName := dockerContainerDisplayName(container)
		resourceID := dockerResourceID(host.ID, container.ID)
		updateTrackingKey := dockerUpdateTrackingKey(host, container)

		// Ignored containers are not evaluated; any alerts and tracking they
		// already have are cleared, and they are not recorded as seen.
		if matchesDockerIgnoredPrefix(containerName, container.ID, ignoredPrefixes) {
			log.Debug().
				Str("container", containerName).
				Str("host", host.DisplayName).
				Msg("Skipping Docker container alert evaluation due to ignored prefix")
			m.clearDockerContainerStateAlert(resourceID)
			m.clearDockerContainerHealthAlert(resourceID)
			m.clearDockerContainerMetricAlerts(resourceID)
			m.clearAlert(fmt.Sprintf("docker-container-restart-loop-%s", resourceID))
			m.clearAlert(fmt.Sprintf("docker-container-oom-%s", resourceID))
			m.clearAlert(fmt.Sprintf("docker-container-memory-limit-%s", resourceID))
			m.mu.Lock()
			delete(m.dockerRestartTracking, resourceID)
			delete(m.dockerLastExitCode, resourceID)
			m.mu.Unlock()
			m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
			continue
		}

		seen[resourceID] = struct{}{}
		seenUpdateTracking[updateTrackingKey] = struct{}{}
		m.evaluateDockerContainer(host, container, resourceID)
	}

	for _, service := range host.Services {
		resourceID := dockerServiceResourceID(host.ID, service.ID, service.Name)
		seen[resourceID] = struct{}{}
		m.evaluateDockerService(host, service, resourceID)
	}

	// Drop alerts and update tracking for resources not recorded as seen above.
	m.cleanupDockerContainerAlertsWithTracking(host, seen, seenUpdateTracking)
}

// evaluateDockerContainer runs every per-container alert check for a single
// Docker container reported by host.
//
// Behaviour, in order:
//   - returns immediately when container alerts are globally disabled;
//   - when the container's override is marked Disabled, clears its state,
//     health and metric alerts and returns;
//   - a container that is not "running" gets the runtime-state check and has
//     its CPU/memory/disk alerts cleared; a running container has its state
//     alert cleared and each configured metric threshold evaluated;
//   - the health, restart-loop, OOM-kill, memory-limit and image-update checks
//     then run regardless of state.
func (m *Manager) evaluateDockerContainer(host models.DockerHost, container models.DockerContainer, resourceID string) {
	// Snapshot everything needed from the shared config under one read lock.
	// The default thresholds are copied so that the pointers handed to the
	// metric checks below never alias config that may be replaced concurrently.
	m.mu.RLock()
	disableAllContainers := m.config.DisableAllDockerContainers
	overrideConfig, hasOverride := m.config.Overrides[resourceID]
	defaultCPU := m.config.DockerDefaults.CPU
	defaultMemory := m.config.DockerDefaults.Memory
	defaultDisk := m.config.DockerDefaults.Disk
	m.mu.RUnlock()

	if disableAllContainers {
		return
	}

	containerName := dockerContainerDisplayName(container)
	nodeName := strings.TrimSpace(host.Hostname)
	instanceName := dockerInstanceName(host)
	resourceType := "app-container"

	if hasOverride && overrideConfig.Disabled {
		// Alerts disabled via override; clear any existing alerts and skip evaluation.
		m.clearDockerContainerStateAlert(resourceID)
		m.clearDockerContainerHealthAlert(resourceID)
		m.clearDockerContainerMetricAlerts(resourceID)
		return
	}

	// metricMetadata builds the metadata shared by all three metric alerts.
	metricMetadata := func(metric string) map[string]interface{} {
		metadata := dockerContainerAlertMetadata(host, container, containerName)
		metadata["restartCount"] = container.RestartCount
		metadata["metric"] = metric
		return metadata
	}

	// Fall back to Status when State is empty.
	state := strings.ToLower(strings.TrimSpace(container.State))
	if state == "" {
		state = strings.ToLower(strings.TrimSpace(container.Status))
	}

	if state != "running" {
		m.checkDockerContainerState(host, container, resourceID, containerName, instanceName, nodeName)
		m.clearDockerContainerMetricAlerts(resourceID, "cpu", "memory", "disk")
	} else {
		m.clearDockerContainerStateAlert(resourceID)

		// Use Docker-specific defaults for containers, then layer the
		// per-container override on top when one exists.
		thresholds := ThresholdConfig{
			CPU:    &defaultCPU,
			Memory: &defaultMemory,
			Disk:   &defaultDisk,
		}
		if hasOverride {
			thresholds = m.applyThresholdOverride(thresholds, overrideConfig)
		}

		if thresholds.CPU != nil {
			cpuMetadata := metricMetadata("cpu")
			cpuMetadata["cpuPercent"] = container.CPUPercent

			spec, err := buildCanonicalMetricSpec(resourceID, containerName, unifiedresources.ResourceTypeAppContainer, "cpu", thresholds.CPU)
			if err != nil {
				log.Warn().
					Err(err).
					Str("resourceID", resourceID).
					Str("container", containerName).
					Msg("Skipping invalid canonical docker container CPU metric spec")
			} else {
				m.checkMetricWithCanonicalSpec(spec, containerName, nodeName, instanceName, resourceType, container.CPUPercent, thresholds.CPU, &metricOptions{Metadata: cpuMetadata})
			}
		}

		if thresholds.Memory != nil {
			memMetadata := metricMetadata("memory")
			memMetadata["memoryPercent"] = container.MemoryPercent
			memMetadata["memoryUsageBytes"] = container.MemoryUsage
			if container.MemoryLimit > 0 {
				memMetadata["memoryLimitBytes"] = container.MemoryLimit
			}

			spec, err := buildCanonicalMetricSpec(resourceID, containerName, unifiedresources.ResourceTypeAppContainer, "memory", thresholds.Memory)
			if err != nil {
				log.Warn().
					Err(err).
					Str("resourceID", resourceID).
					Str("container", containerName).
					Msg("Skipping invalid canonical docker container memory metric spec")
			} else {
				m.checkMetricWithCanonicalSpec(spec, containerName, nodeName, instanceName, resourceType, container.MemoryPercent, thresholds.Memory, &metricOptions{Metadata: memMetadata})
			}
		}

		if thresholds.Disk != nil {
			totalBytes := container.RootFilesystemBytes
			usedBytes := container.WritableLayerBytes
			if totalBytes > 0 && usedBytes >= 0 {
				// Disk usage is the writable layer as a share of the root filesystem.
				diskPercent := (float64(usedBytes) / float64(totalBytes)) * 100

				diskMetadata := metricMetadata("disk")
				diskMetadata["diskPercent"] = diskPercent
				diskMetadata["writableLayerBytes"] = usedBytes
				diskMetadata["rootFilesystemBytes"] = totalBytes
				diskMetadata["mountCount"] = len(container.Mounts)
				if container.BlockIO != nil {
					diskMetadata["blockIoReadBytes"] = container.BlockIO.ReadBytes
					diskMetadata["blockIoWriteBytes"] = container.BlockIO.WriteBytes
				}

				spec, err := buildCanonicalMetricSpec(resourceID, containerName, unifiedresources.ResourceTypeAppContainer, "disk", thresholds.Disk)
				if err != nil {
					log.Warn().
						Err(err).
						Str("resourceID", resourceID).
						Str("container", containerName).
						Msg("Skipping invalid canonical docker container disk metric spec")
				} else {
					m.checkMetricWithCanonicalSpec(spec, containerName, nodeName, instanceName, resourceType, diskPercent, thresholds.Disk, &metricOptions{Metadata: diskMetadata})
				}
			} else {
				// No usable size information, so a disk percentage cannot be computed.
				m.clearDockerContainerMetricAlerts(resourceID, "disk")
			}
		}
	}

	m.checkDockerContainerHealth(host, container, resourceID, containerName, instanceName, nodeName)

	// Docker-specific checks
	m.checkDockerContainerRestartLoop(host, container, resourceID, containerName, instanceName, nodeName)
	m.checkDockerContainerOOMKill(host, container, resourceID, containerName, instanceName, nodeName)
	m.checkDockerContainerMemoryLimit(host, container, resourceID, containerName, instanceName, nodeName)
	m.checkDockerContainerImageUpdate(host, container, resourceID, containerName, instanceName, nodeName)
}

// evaluateDockerService evaluates the "docker-service-health" alert for one
// Docker service.
//
// The alert is cleared when service alerts are globally disabled, when the
// service's override is marked Disabled, or when the service has no desired
// tasks. Otherwise the share of missing tasks is compared against the
// configured warning/critical gap percentages. If neither is breached, the
// service's update state ("paused", "rollback_started", "rollback_paused",
// "rollback_failed") is evaluated instead; if that is also unremarkable the
// alert is cleared.
func (m *Manager) evaluateDockerService(host models.DockerHost, service models.DockerService, resourceID string) {
	m.mu.RLock()
	servicesDisabled := m.config.DisableAllDockerServices
	warnPct := m.config.DockerDefaults.ServiceWarnGapPct
	critPct := m.config.DockerDefaults.ServiceCritGapPct
	override, overridden := m.config.Overrides[resourceID]
	m.mu.RUnlock()

	desired, running := service.DesiredTasks, service.RunningTasks

	// Nothing to evaluate: alerts are switched off or no tasks are expected.
	if servicesDisabled || (overridden && override.Disabled) || desired <= 0 {
		m.clearDockerServiceAlert(resourceID)
		return
	}

	// Over-provisioned services count as having zero missing tasks.
	missing := desired - running
	if missing < 0 {
		missing = 0
	}
	// desired is strictly positive here, so the division is safe.
	percentMissing := (float64(missing) / float64(desired)) * 100.0

	serviceName := dockerServiceDisplayName(service)
	instanceName := dockerInstanceName(host)
	nodeName := strings.TrimSpace(host.Hostname)
	alertID := fmt.Sprintf("docker-service-health-%s", resourceID)

	metadata := map[string]interface{}{
		"resourceType":   "docker-service",
		"hostId":         host.ID,
		"hostName":       host.DisplayName,
		"hostHostname":   host.Hostname,
		"serviceId":      service.ID,
		"serviceName":    service.Name,
		"stack":          service.Stack,
		"mode":           service.Mode,
		"desiredTasks":   service.DesiredTasks,
		"runningTasks":   service.RunningTasks,
		"completedTasks": service.CompletedTasks,
		"missingTasks":   missing,
		"percentMissing": percentMissing,
	}

	// The critical threshold takes precedence over the warning threshold;
	// zero means neither configured threshold was breached.
	var thresholdValue float64
	switch {
	case critPct > 0 && percentMissing >= float64(critPct):
		thresholdValue = float64(critPct)
	case warnPct > 0 && percentMissing >= float64(warnPct):
		thresholdValue = float64(warnPct)
	}

	var (
		updateState    string
		updateMessage  string
		updateSeverity AlertLevel
	)
	if status := service.UpdateStatus; status != nil {
		updateState = strings.ToLower(strings.TrimSpace(status.State))
		updateMessage = strings.TrimSpace(status.Message)

		switch updateState {
		case "paused", "rollback_started", "rollback_paused":
			updateSeverity = AlertLevelWarning
		case "rollback_failed":
			updateSeverity = AlertLevelCritical
		}

		if completedAt := status.CompletedAt; completedAt != nil && !completedAt.IsZero() {
			metadata["updateCompletedAt"] = completedAt.UTC()
		}
		if updateState != "" {
			// Metadata keeps the state exactly as reported, not the normalized form.
			metadata["updateState"] = status.State
		}
		if updateMessage != "" {
			metadata["updateMessage"] = updateMessage
		}
	}

	// Fields common to both the update-state and the task-gap evaluation.
	params := canonicalStatefulAlertParams{
		AlertID:                      alertID,
		AlertType:                    "docker-service-health",
		ResourceID:                   resourceID,
		ResourceName:                 serviceName,
		Node:                         nodeName,
		Instance:                     instanceName,
		Value:                        percentMissing,
		Metadata:                     metadata,
		AddToRecent:                  true,
		AddToHistory:                 true,
		RateLimit:                    true,
		NotifyOnSeverityChange:       true,
		AddToHistoryOnSeverityChange: true,
		DispatchAsync:                true,
	}

	if thresholdValue == 0 {
		if updateSeverity == "" {
			m.clearDockerServiceAlert(resourceID)
			return
		}

		// No task gap, but the rolling update is in a noteworthy state.
		// NOTE(review): this path does not clear a previously raised
		// service-gap alert; confirm that is handled elsewhere.
		spec, err := buildCanonicalDiscreteStateSpec(resourceID, serviceName, unifiedresources.ResourceTypeDockerService, updateSeverity, 1, false, "update-state",
			[]string{"paused", "rollback_started", "rollback_paused", "rollback_failed"})
		if err != nil {
			log.Warn().
				Err(err).
				Str("service", serviceName).
				Str("resourceID", resourceID).
				Msg("Skipping invalid canonical docker service update-state spec")
			return
		}

		message := fmt.Sprintf("Docker service '%s' update state: %s", serviceName, service.UpdateStatus.State)
		if updateMessage != "" {
			message = fmt.Sprintf("%s (%s)", message, updateMessage)
		}

		params.Spec = spec
		params.Evidence = alertspecs.AlertEvidence{
			ObservedAt:    time.Now(),
			DiscreteState: &alertspecs.DiscreteStateEvidence{StateKey: "update-state", Observed: updateState},
		}
		params.Message = message
		params.Threshold = 0
		_, _ = m.evaluateCanonicalStatefulAlert(params)
		return
	}

	spec, err := buildCanonicalServiceGapSpec(resourceID, serviceName, unifiedresources.ResourceTypeDockerService, serviceName, float64(warnPct), float64(critPct), false)
	if err != nil {
		log.Warn().
			Err(err).
			Str("service", serviceName).
			Str("resourceID", resourceID).
			Msg("Skipping invalid canonical docker service gap spec")
		m.clearDockerServiceAlert(resourceID)
		return
	}

	params.Spec = spec
	params.Evidence = alertspecs.AlertEvidence{
		ObservedAt: time.Now(),
		ServiceGap: &alertspecs.ServiceGapEvidence{
			Service: serviceName,
			Desired: desired,
			Running: running,
		},
	}
	params.Message = fmt.Sprintf("Docker service '%s' is running %d of %d desired tasks", serviceName, service.RunningTasks, service.DesiredTasks)
	params.Threshold = thresholdValue
	_, _ = m.evaluateCanonicalStatefulAlert(params)
}

// clearDockerServiceAlert clears both canonical alert states a Docker service
// can hold: the service-gap state and the "update-state" discrete state.
func (m *Manager) clearDockerServiceAlert(resourceID string) {
	stateIDs := [...]string{
		canonicalServiceGapStateID(resourceID),
		canonicalDiscreteStateStateID(resourceID, "update-state"),
	}
	for _, stateID := range stateIDs {
		m.clearAlert(stateID)
	}
}

// HandleDockerHostOnline resets the offline counter for a Docker host and
// clears its connectivity alert when one is currently active. Hosts without an
// ID are ignored.
func (m *Manager) HandleDockerHostOnline(host models.DockerHost) {
	hostID := host.ID
	if hostID == "" {
		return
	}

	connectivityAlertID := canonicalConnectivityStateID(fmt.Sprintf("docker:%s", strings.TrimSpace(hostID)))

	// Drop the counter and probe for the alert under a single lock acquisition.
	m.mu.Lock()
	delete(m.dockerOfflineCount, hostID)
	offlineAlertActive := m.hasActiveAlertNoLock(connectivityAlertID)
	m.mu.Unlock()

	if !offlineAlertActive {
		return
	}
	m.clearAlert(connectivityAlertID)
}

// HandleDockerHostRemoved drops the alert state tied to a Docker host that has
// been deleted. Hosts without an ID are ignored.
func (m *Manager) HandleDockerHostRemoved(host models.DockerHost) {
	if removedID := host.ID; removedID == "" {
		return
	}

	// The online handler already resets the offline counter and clears the
	// connectivity alert, so it is reused for that part of the teardown.
	m.HandleDockerHostOnline(host)

	// Then remove container alerts and host-scoped tracking entries.
	m.clearDockerHostContainerAlerts(host)
}

// HandleDockerHostOffline evaluates the connectivity alert for a Docker host
// that has stopped reporting.
//
// Nothing happens when the host has no ID or alerts are globally disabled.
// When offline alerts are disabled, either for all Docker hosts or through the
// host's own override, the offline counter and any existing alert are cleared.
// Otherwise a disconnected observation is fed to the lifecycle evaluator; if
// that activates the alert, registered AI callbacks are invoked in a goroutine
// and the host's container alerts are cleared.
func (m *Manager) HandleDockerHostOffline(host models.DockerHost) {
	if host.ID == "" {
		return
	}

	m.mu.RLock()
	alertsEnabled := m.config.Enabled
	offlineAlertsDisabled := m.config.DisableAllDockerHostsOffline
	m.mu.RUnlock()
	if !alertsEnabled {
		return
	}

	resourceID := fmt.Sprintf("docker:%s", strings.TrimSpace(host.ID))
	alertID := canonicalConnectivityStateID(resourceID)
	instanceName := dockerInstanceName(host)
	nodeName := strings.TrimSpace(host.Hostname)

	// forgetOfflineCount removes the host's offline counter.
	forgetOfflineCount := func() {
		m.mu.Lock()
		delete(m.dockerOfflineCount, host.ID)
		m.mu.Unlock()
	}

	if offlineAlertsDisabled {
		forgetOfflineCount()
		m.clearAlert(alertID)
		return
	}

	// A per-host override can opt this host out of connectivity alerts.
	m.mu.RLock()
	hostOverride, overridden := m.config.Overrides[host.ID]
	m.mu.RUnlock()
	if overridden && hostOverride.DisableConnectivity {
		m.clearAlert(alertID)
		forgetOfflineCount()
		return
	}

	spec, err := buildCanonicalConnectivitySpec(resourceID, host.DisplayName, unifiedresources.ResourceType("docker-host"), AlertLevelCritical, 3, false)
	if err != nil {
		log.Warn().
			Err(err).
			Str("dockerHost", host.DisplayName).
			Str("hostID", host.ID).
			Msg("Skipping invalid canonical docker host connectivity spec")
		return
	}

	evidence := alertspecs.AlertEvidence{
		ObservedAt:   time.Now(),
		Connectivity: &alertspecs.ConnectivityEvidence{Signal: "status", Connected: false},
	}
	hostMetadata := map[string]interface{}{
		"resourceType": "docker-host",
		"hostId":       host.ID,
		"hostname":     host.Hostname,
		"agentId":      host.AgentID,
		"displayName":  host.DisplayName,
	}

	result, ok := m.evaluateCanonicalLifecycleAlert(canonicalLifecycleAlertParams{
		Spec:          spec,
		Evidence:      evidence,
		Tracking:      m.dockerOfflineCount,
		TrackingKey:   host.ID,
		AlertID:       alertID,
		AlertType:     "docker-host-offline",
		ResourceID:    resourceID,
		ResourceName:  host.DisplayName,
		Node:          nodeName,
		Instance:      instanceName,
		Message:       fmt.Sprintf("Docker host '%s' is offline", host.DisplayName),
		Metadata:      hostMetadata,
		AddToRecent:   true,
		AddToHistory:  true,
		RateLimit:     true,
		DispatchAsync: false,
	})

	// Only a fresh activation triggers the follow-up work below.
	activated := ok && result.Transition != nil && result.Transition.Kind == alertspecs.EvaluationTransitionActivated
	if !activated {
		return
	}

	m.mu.RLock()
	activeAlert, _ := m.getActiveAlertNoLock(alertID)
	m.mu.RUnlock()

	if activeAlert != nil {
		callbacks := m.getAlertForAICallbacks()
		if len(callbacks) > 0 {
			// Callbacks receive a clone of the alert and run in their own
			// goroutine; a panic in one of them is recovered and logged.
			snapshot := cloneAlertForOutput(activeAlert)
			go func(a *Alert, fns []func(*Alert)) {
				defer func() {
					if r := recover(); r != nil {
						log.Error().Interface("panic", r).Str("alertID", a.ID).Msg("panic in AI alert callback")
					}
				}()
				for _, fn := range fns {
					fn(a)
				}
			}(snapshot, callbacks)
		}
	}

	m.clearDockerHostContainerAlerts(host)
}

// checkDockerContainerState evaluates the runtime-state alert for a container.
//
// Whether the check is disabled, and with which severity it fires, comes from
// the Docker defaults; a per-container override replaces the disable flag and,
// when it sets PoweredOffSeverity, the severity too. A disabled check clears
// the state alert. Otherwise the lower-cased container state ("unknown" when
// empty) is handed to the lifecycle evaluator.
func (m *Manager) checkDockerContainerState(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
	m.mu.RLock()
	override, hasOverride := m.config.Overrides[resourceID]
	disableConnectivity := m.config.DockerDefaults.StateDisableConnectivity
	severity := normalizePoweredOffSeverity(m.config.DockerDefaults.StatePoweredOffSeverity)
	m.mu.RUnlock()

	if hasOverride {
		// An override wins over the default in both directions: it can enable
		// a check the defaults disable, or disable one the defaults enable.
		disableConnectivity = override.DisableConnectivity
		if override.PoweredOffSeverity != "" {
			severity = normalizePoweredOffSeverity(override.PoweredOffSeverity)
		}
	}

	if disableConnectivity {
		m.clearDockerContainerStateAlert(resourceID)
		return
	}

	observedState := strings.ToLower(strings.TrimSpace(container.State))
	if observedState == "" {
		observedState = "unknown"
	}

	alertStates := []string{"created", "restarting", "removing", "paused", "exited", "dead", "unknown"}
	spec, err := buildCanonicalDiscreteStateSpec(resourceID, containerName, unifiedresources.ResourceTypeAppContainer, severity, 2, false, "runtime-state", alertStates)
	if err != nil {
		log.Warn().
			Err(err).
			Str("resourceID", resourceID).
			Str("container", containerName).
			Msg("Skipping invalid canonical docker container state spec")
		return
	}

	evidence := alertspecs.AlertEvidence{
		ObservedAt: time.Now(),
		DiscreteState: &alertspecs.DiscreteStateEvidence{
			StateKey: "runtime-state",
			Observed: observedState,
		},
	}

	_, _ = m.evaluateCanonicalLifecycleAlert(canonicalLifecycleAlertParams{
		Spec:          spec,
		Evidence:      evidence,
		Tracking:      m.dockerStateConfirm,
		TrackingKey:   resourceID,
		AlertID:       fmt.Sprintf("docker-container-state-%s", resourceID),
		AlertType:     "docker-container-state",
		ResourceID:    resourceID,
		ResourceName:  containerName,
		Node:          nodeName,
		Instance:      instanceName,
		Message:       fmt.Sprintf("Docker container '%s' is %s", containerName, strings.TrimSpace(container.Status)),
		Metadata:      dockerContainerAlertMetadata(host, container, containerName),
		AddToRecent:   true,
		AddToHistory:  true,
		DispatchAsync: true,
	})
}

// clearDockerContainerStateAlert forgets the state-confirmation tracking entry
// for a container and clears its "runtime-state" alert.
func (m *Manager) clearDockerContainerStateAlert(resourceID string) {
	stateID := canonicalDiscreteStateStateID(resourceID, "runtime-state")

	m.mu.Lock()
	delete(m.dockerStateConfirm, resourceID)
	m.mu.Unlock()

	m.clearAlert(stateID)
}

// dockerContainerAlertMetadata returns a fresh metadata map describing a
// container and its host. Callers may add alert-specific keys to the result.
func dockerContainerAlertMetadata(host models.DockerHost, container models.DockerContainer, containerName string) map[string]interface{} {
	metadata := make(map[string]interface{}, 9)

	metadata["resourceType"] = "app-container"

	// Host identity.
	metadata["hostId"] = host.ID
	metadata["hostName"] = host.DisplayName
	metadata["hostHostname"] = host.Hostname

	// Container identity and current status.
	metadata["containerId"] = container.ID
	metadata["containerName"] = containerName
	metadata["image"] = container.Image
	metadata["state"] = container.State
	metadata["status"] = container.Status

	return metadata
}

// checkDockerContainerHealth evaluates the health alert for a container.
//
// An empty, "none", "healthy" or "starting" health value clears the alert.
// Any other value is evaluated as a health assessment: "unhealthy" is
// critical, everything else is a warning.
func (m *Manager) checkDockerContainerHealth(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
	health := strings.ToLower(strings.TrimSpace(container.Health))
	switch health {
	case "", "none", "healthy", "starting":
		m.clearDockerContainerHealthAlert(resourceID)
		return
	}

	spec, err := buildCanonicalHealthAssessmentSpec(resourceID+"-health", resourceID, containerName, unifiedresources.ResourceTypeAppContainer, "docker-container-health", nil, false)
	if err != nil {
		log.Warn().
			Err(err).
			Str("resourceID", resourceID).
			Str("container", containerName).
			Msg("Skipping invalid canonical docker container health spec")
		return
	}

	var severity = alertspecs.AlertSeverityWarning
	if health == "unhealthy" {
		severity = alertspecs.AlertSeverityCritical
	}

	metadata := dockerContainerAlertMetadata(host, container, containerName)
	metadata["health"] = container.Health

	evidence := alertspecs.AlertEvidence{
		ObservedAt: time.Now(),
		HealthAssessment: &alertspecs.HealthAssessmentEvidence{
			Signal:   "docker-container-health",
			Severity: severity,
			Codes:    []string{health},
		},
	}

	_, _ = m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
		Spec:                         spec,
		Evidence:                     evidence,
		AlertID:                      fmt.Sprintf("docker-container-health-%s", resourceID),
		AlertType:                    "docker-container-health",
		ResourceID:                   resourceID,
		ResourceName:                 containerName,
		Node:                         nodeName,
		Instance:                     instanceName,
		Message:                      fmt.Sprintf("Docker container '%s' health is %s", containerName, container.Health),
		Metadata:                     metadata,
		AddToRecent:                  true,
		AddToHistory:                 true,
		DispatchAsync:                false,
		NotifyOnSeverityChange:       true,
		AddToHistoryOnSeverityChange: true,
	})

	// Logged on every evaluation that reaches this point, not only on the
	// first activation of the alert.
	log.Warn().
		Str("container", containerName).
		Str("host", host.DisplayName).
		Str("health", container.Health).
		Msg("Docker container health alert raised")
}

// clearDockerContainerHealthAlert clears the health alert for a container.
// The spec ID matches the one used when the health alert is evaluated.
func (m *Manager) clearDockerContainerHealthAlert(resourceID string) {
	healthSpecID := resourceID + "-health"
	m.clearAlert(buildCanonicalStateID(resourceID, healthSpecID))
}

// checkDockerContainerRestartLoop detects containers stuck in a restart loop.
//
// Each increase of the container's restart counter is recorded with the time
// it was observed. The alert is evaluated when the number of restarts seen
// within the configured window exceeds the configured threshold, and cleared
// otherwise. The threshold defaults to 3 restarts and the window to 300
// seconds when the config leaves them at zero. The first observation of a
// container only records a baseline and never alerts.
func (m *Manager) checkDockerContainerRestartLoop(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
	alertID := fmt.Sprintf("docker-container-restart-loop-%s", resourceID)
	now := time.Now()

	// Read config under the read lock, like every other config access in this
	// file, so a concurrent config update cannot race with this read.
	m.mu.RLock()
	restartThreshold := m.config.DockerDefaults.RestartCount
	timeWindow := m.config.DockerDefaults.RestartWindow
	m.mu.RUnlock()

	if restartThreshold == 0 {
		restartThreshold = 3 // Default: 3 restarts
	}
	if timeWindow == 0 {
		timeWindow = 300 // Default: 5 minutes (300 seconds)
	}

	m.mu.Lock()

	record, exists := m.dockerRestartTracking[resourceID]
	if !exists {
		// First sighting: remember the current counter as the baseline.
		m.dockerRestartTracking[resourceID] = &dockerRestartRecord{
			count:       container.RestartCount,
			lastCount:   container.RestartCount,
			times:       []time.Time{},
			lastChecked: now,
		}
		m.mu.Unlock()
		return
	}

	switch {
	case container.RestartCount > record.lastCount:
		// Restart count increased: record one timestamp per new restart.
		newRestarts := container.RestartCount - record.lastCount
		for i := 0; i < newRestarts; i++ {
			record.times = append(record.times, now)
		}
		record.lastCount = container.RestartCount
	case container.RestartCount < record.lastCount:
		// The reported counter went backwards. Resync the baseline so that
		// later restarts are counted instead of being hidden until the counter
		// climbs past the stale value.
		record.lastCount = container.RestartCount
	}

	// Keep only restart times that are still inside the window.
	cutoff := now.Add(-time.Duration(timeWindow) * time.Second)
	var recentRestarts []time.Time
	for _, t := range record.times {
		if t.After(cutoff) {
			recentRestarts = append(recentRestarts, t)
		}
	}
	record.times = recentRestarts
	record.lastChecked = now

	recentCount := len(record.times)
	m.mu.Unlock()

	if recentCount <= restartThreshold {
		// Clear alert if restart loop has stopped
		m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-restart-loop"))
		return
	}

	// The spec's critical threshold is restartThreshold+1, matching the
	// strict "more than threshold" comparison above.
	spec, err := buildCanonicalSeverityThresholdSpec(resourceID+"-restart-loop", resourceID, containerName, unifiedresources.ResourceTypeAppContainer, "restart-count-window", 0, float64(restartThreshold+1), false)
	if err != nil {
		log.Warn().
			Err(err).
			Str("resourceID", resourceID).
			Str("container", containerName).
			Msg("Skipping invalid canonical docker container restart loop spec")
		return
	}

	metadata := dockerContainerAlertMetadata(host, container, containerName)
	metadata["restartCount"] = container.RestartCount
	metadata["recentRestarts"] = recentCount

	_, _ = m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
		Spec: spec,
		Evidence: alertspecs.AlertEvidence{
			ObservedAt: now,
			SeverityThreshold: &alertspecs.SeverityThresholdEvidence{
				Metric:    "restart-count-window",
				Direction: alertspecs.ThresholdDirectionAbove,
				Observed:  float64(recentCount),
			},
		},
		AlertID:      alertID,
		AlertType:    "docker-container-restart-loop",
		ResourceID:   resourceID,
		ResourceName: containerName,
		Node:         nodeName,
		Instance:     instanceName,
		Message:      fmt.Sprintf("Docker container '%s' has restarted %d times in the last %d minutes (restart loop detected)", containerName, recentCount, timeWindow/60),
		Metadata:     metadata,
		AddToRecent:  true,
		AddToHistory: true,
	})

	log.Warn().
		Str("container", containerName).
		Str("host", host.DisplayName).
		Int("restarts", recentCount).
		Msg("Docker container restart loop detected")
}

// checkDockerContainerOOMKill raises the OOM-kill alert for a container that is
// "exited" or "dead" with exit code 137, and clears it in every other case.
// The most recent non-zero exit code is remembered per container.
func (m *Manager) checkDockerContainerOOMKill(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
	// Exit code 137 means the container was killed by SIGKILL, often due to OOM.
	// It only counts when the container has actually stopped.
	state := strings.ToLower(strings.TrimSpace(container.State))
	stopped := state == "exited" || state == "dead"

	if !stopped || container.ExitCode != 137 {
		// Remember any other non-zero exit code, then drop a stale OOM alert.
		if container.ExitCode != 0 {
			m.mu.Lock()
			m.dockerLastExitCode[resourceID] = container.ExitCode
			m.mu.Unlock()
		}
		m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-oom-kill"))
		return
	}

	m.mu.Lock()
	m.dockerLastExitCode[resourceID] = 137
	m.mu.Unlock()

	spec, err := buildCanonicalHealthAssessmentSpec(resourceID+"-oom-kill", resourceID, containerName, unifiedresources.ResourceTypeAppContainer, "docker-container-exit", []string{"oom-kill"}, false)
	if err != nil {
		log.Warn().
			Err(err).
			Str("resourceID", resourceID).
			Str("container", containerName).
			Msg("Skipping invalid canonical docker container OOM spec")
		return
	}

	metadata := dockerContainerAlertMetadata(host, container, containerName)
	metadata["exitCode"] = container.ExitCode
	metadata["memoryUsageBytes"] = container.MemoryUsage
	metadata["memoryLimitBytes"] = container.MemoryLimit

	evidence := alertspecs.AlertEvidence{
		ObservedAt: time.Now(),
		HealthAssessment: &alertspecs.HealthAssessmentEvidence{
			Signal:   "docker-container-exit",
			Severity: alertspecs.AlertSeverityCritical,
			Codes:    []string{"oom-kill"},
		},
	}

	_, _ = m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
		Spec:         spec,
		Evidence:     evidence,
		AlertID:      fmt.Sprintf("docker-container-oom-%s", resourceID),
		AlertType:    "docker-container-oom-kill",
		ResourceID:   resourceID,
		ResourceName: containerName,
		Node:         nodeName,
		Instance:     instanceName,
		Message:      fmt.Sprintf("Docker container '%s' was killed due to out of memory (OOM)", containerName),
		Metadata:     metadata,
		AddToRecent:  true,
		AddToHistory: true,
	})

	log.Error().
		Str("container", containerName).
		Str("host", host.DisplayName).
		Int64("memoryUsage", container.MemoryUsage).
		Int64("memoryLimit", container.MemoryLimit).
		Msg("Docker container OOM killed")
}

// checkDockerContainerMemoryLimit alerts when a running container approaches
// its configured memory limit.
//
// Usage is expressed as a percentage of the limit and evaluated against the
// warning and critical percentages from the Docker defaults (90% and 95% when
// left at zero), with recovery set five points below the warning threshold.
// When the check does not apply, because the container is not running or has
// no memory limit, any previously raised memory-limit alert is cleared.
func (m *Manager) checkDockerContainerMemoryLimit(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
	state := strings.ToLower(strings.TrimSpace(container.State))
	if state != "running" || container.MemoryLimit <= 0 {
		// Without this clear, an alert raised while the container was running
		// would stay active after it stopped or had its limit removed.
		m.clearAlert(buildCanonicalStateID(resourceID, resourceID+"-memory-limit"))
		return
	}

	alertID := fmt.Sprintf("docker-container-memory-limit-%s", resourceID)

	// Read config under the read lock, like every other config access in this
	// file, so a concurrent config update cannot race with this read.
	m.mu.RLock()
	warnThreshold := float64(m.config.DockerDefaults.MemoryWarnPct)
	criticalThreshold := float64(m.config.DockerDefaults.MemoryCriticalPct)
	m.mu.RUnlock()

	if warnThreshold == 0 {
		warnThreshold = 90.0 // Default: 90%
	}
	if criticalThreshold == 0 {
		criticalThreshold = 95.0 // Default: 95%
	}

	// Calculate percentage of limit used
	limitPercent := (float64(container.MemoryUsage) / float64(container.MemoryLimit)) * 100

	// Recover five points below the warning threshold. The variable is not
	// called "recovery" so it does not shadow the imported recovery package.
	recoveryThreshold := warnThreshold - 5
	spec, err := buildCanonicalSeverityThresholdSpecWithRecovery(resourceID+"-memory-limit", resourceID, containerName, unifiedresources.ResourceTypeAppContainer, "memory-limit-percent", warnThreshold, criticalThreshold, &recoveryThreshold, false)
	if err != nil {
		log.Warn().
			Err(err).
			Str("resourceID", resourceID).
			Str("container", containerName).
			Msg("Skipping invalid canonical docker container memory limit spec")
		return
	}

	metadata := dockerContainerAlertMetadata(host, container, containerName)
	metadata["memoryUsageBytes"] = container.MemoryUsage
	metadata["memoryLimitBytes"] = container.MemoryLimit
	metadata["limitPercent"] = limitPercent

	_, _ = m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
		Spec: spec,
		Evidence: alertspecs.AlertEvidence{
			ObservedAt: time.Now(),
			SeverityThreshold: &alertspecs.SeverityThresholdEvidence{
				Metric:    "memory-limit-percent",
				Direction: alertspecs.ThresholdDirectionAbove,
				Observed:  limitPercent,
			},
		},
		AlertID:      alertID,
		AlertType:    "docker-container-memory-limit",
		ResourceID:   resourceID,
		ResourceName: containerName,
		Node:         nodeName,
		Instance:     instanceName,
		Message:      fmt.Sprintf("Docker container '%s' is using %.1f%% of its memory limit (%d MB / %d MB)", containerName, limitPercent, container.MemoryUsage/(1024*1024), container.MemoryLimit/(1024*1024)),
		Metadata:     metadata,
		AddToRecent:  true,
		AddToHistory: true,
	})

	if limitPercent >= warnThreshold {
		log.Warn().
			Str("container", containerName).
			Str("host", host.DisplayName).
			Float64("limitPercent", limitPercent).
			Msg("Docker container approaching memory limit")
	}
}

// clearDockerContainerMetricAlerts clears the metric alerts of a container for
// the named metrics. With no metrics given it clears "cpu", "memory" and "disk".
func (m *Manager) clearDockerContainerMetricAlerts(resourceID string, metrics ...string) {
	targets := metrics
	if len(targets) == 0 {
		targets = []string{"cpu", "memory", "disk"}
	}

	for i := range targets {
		m.clearAlert(canonicalMetricStateID(resourceID, targets[i]))
	}
}

// clearDockerContainerUpdateTracking removes the "update first seen" entry
// stored under resourceID and, when trackingKey is non-empty, the entry stored
// under that key as well.
func (m *Manager) clearDockerContainerUpdateTracking(resourceID, trackingKey string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	delete(m.dockerUpdateFirstSeen, resourceID)
	if trackingKey == "" {
		return
	}
	delete(m.dockerUpdateFirstSeenByIdentity, trackingKey)
}

// checkDockerContainerImageUpdate raises an alert once an image update has
// been pending for a container for at least the configured delay.
//
// A negative DockerDefaults.UpdateAlertDelayHours disables the check. The
// alert and the first-seen tracking are both cleared when the check is
// disabled, when the container reports no update status, or when no update is
// available. When the update check itself reported an error, only the alert is
// cleared so the first-seen time is kept.
//
// An empty update tracking key is never used as a key of the identity map,
// matching clearDockerContainerUpdateTracking which ignores empty keys;
// otherwise unrelated containers would share a single first-seen timestamp
// that is never removed.
func (m *Manager) checkDockerContainerImageUpdate(host models.DockerHost, container models.DockerContainer, resourceID, containerName, instanceName, nodeName string) {
	alertID := fmt.Sprintf("docker-container-update-%s", resourceID)
	specID := resourceID + "-image-update"
	updateTrackingKey := dockerUpdateTrackingKey(host, container)

	// clearUpdateAlert drops the canonical image-update alert, if any.
	clearUpdateAlert := func() {
		m.clearAlert(buildCanonicalStateID(resourceID, specID))
	}
	// clearAlertAndTracking additionally forgets when the update was first seen.
	clearAlertAndTracking := func() {
		clearUpdateAlert()
		m.clearDockerContainerUpdateTracking(resourceID, updateTrackingKey)
	}

	// Check if update detection is enabled
	m.mu.RLock()
	delayHours := m.config.DockerDefaults.UpdateAlertDelayHours
	m.mu.RUnlock()

	// Negative delay means disabled; a missing update status means there is
	// nothing to track.
	if delayHours < 0 || container.UpdateStatus == nil {
		clearAlertAndTracking()
		return
	}

	// Don't alert on failed update checks - clear the alert but keep tracking.
	if container.UpdateStatus.Error != "" {
		clearUpdateAlert()
		return
	}

	// No update available - clear tracking and alert.
	if !container.UpdateStatus.UpdateAvailable {
		clearAlertAndTracking()
		return
	}

	// Update is available - track when we first saw it. The identity-keyed
	// entry takes precedence over the resource-keyed one.
	m.mu.Lock()
	var (
		firstSeen time.Time
		exists    bool
	)
	if updateTrackingKey != "" {
		firstSeen, exists = m.dockerUpdateFirstSeenByIdentity[updateTrackingKey]
	}
	if !exists {
		firstSeen, exists = m.dockerUpdateFirstSeen[resourceID]
	}
	if !exists {
		firstSeen = time.Now()
	}
	m.dockerUpdateFirstSeen[resourceID] = firstSeen
	if updateTrackingKey != "" {
		m.dockerUpdateFirstSeenByIdentity[updateTrackingKey] = firstSeen
	}
	m.mu.Unlock()

	// Check if we've exceeded the delay threshold
	pendingDuration := time.Since(firstSeen)
	threshold := time.Duration(delayHours) * time.Hour
	if pendingDuration < threshold {
		// Not yet time to alert
		log.Debug().
			Str("container", containerName).
			Str("host", host.DisplayName).
			Str("image", container.Image).
			Dur("pending", pendingDuration).
			Dur("threshold", threshold).
			Msg("Container update pending but below alert threshold")
		return
	}

	// Create or update the alert
	pendingHours := int(pendingDuration.Hours())
	spec, err := buildCanonicalSeverityThresholdSpec(specID, resourceID, containerName, unifiedresources.ResourceTypeAppContainer, "image-update-hours", float64(delayHours), 0, false)
	if err != nil {
		log.Warn().
			Err(err).
			Str("resourceID", resourceID).
			Str("container", containerName).
			Msg("Skipping invalid canonical docker container update spec")
		return
	}

	metadata := dockerContainerAlertMetadata(host, container, containerName)
	metadata["currentDigest"] = container.UpdateStatus.CurrentDigest
	metadata["latestDigest"] = container.UpdateStatus.LatestDigest
	metadata["lastChecked"] = container.UpdateStatus.LastChecked
	metadata["firstSeen"] = firstSeen
	metadata["pendingHours"] = pendingHours
	metadata["thresholdHours"] = delayHours

	// The evaluation result is intentionally unused: nothing below depends on
	// whether the alert transitioned.
	_, _ = m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
		Spec: spec,
		Evidence: alertspecs.AlertEvidence{
			ObservedAt: time.Now(),
			SeverityThreshold: &alertspecs.SeverityThresholdEvidence{
				Metric:    "image-update-hours",
				Direction: alertspecs.ThresholdDirectionAbove,
				Observed:  pendingDuration.Hours(),
			},
		},
		AlertID:           alertID,
		AlertType:         "docker-container-update",
		ResourceID:        resourceID,
		ResourceName:      containerName,
		Node:              nodeName,
		Instance:          instanceName,
		Message:           fmt.Sprintf("Docker container '%s' has an image update available for %d hours", containerName, pendingHours),
		StartTimeOverride: firstSeen,
		Metadata:          metadata,
		AddToRecent:       true,
		AddToHistory:      true,
	})

	log.Warn().
		Str("container", containerName).
		Str("host", host.DisplayName).
		Str("image", container.Image).
		Int("pendingHours", pendingHours).
		Msg("Docker container has pending image update")
}

// cleanupDockerContainerAlerts clears alerts and per-container tracking state
// for containers of host that are absent from seen. No set of seen update
// tracking keys is supplied, so identity-keyed update tracking is left as is.
func (m *Manager) cleanupDockerContainerAlerts(host models.DockerHost, seen map[string]struct{}) {
	var noSeenUpdateTracking map[string]struct{} // nil: skip identity-keyed cleanup
	m.cleanupDockerContainerAlertsWithTracking(host, seen, noSeenUpdateTracking)
}

// cleanupDockerContainerAlertsWithTracking clears alerts and tracking state
// for containers of host that no longer appear in seen.
//
// seen holds the resource IDs of the containers currently reported by the
// host. seenUpdateTracking holds the update tracking keys currently in use;
// when it is nil, the identity-keyed update tracking map is not pruned.
//
// Map pruning happens under the manager lock. The alerts themselves are
// cleared afterwards through clearAlert, which acquires the lock on its own.
func (m *Manager) cleanupDockerContainerAlertsWithTracking(host models.DockerHost, seen map[string]struct{}, seenUpdateTracking map[string]struct{}) {
	prefix := fmt.Sprintf("docker:%s/", strings.TrimSpace(host.ID))
	updateTrackingPrefix := dockerUpdateTrackingHostPrefix(host)

	// isStale reports whether resourceID belongs to this host but was not seen.
	isStale := func(resourceID string) bool {
		if !strings.HasPrefix(resourceID, prefix) {
			return false
		}
		_, present := seen[resourceID]
		return !present
	}

	m.mu.Lock()
	var toClear []string
	for storageKey, alert := range m.activeAlerts {
		// Skip nil entries instead of dereferencing them, as the other scans
		// over activeAlerts in this file do.
		if alert == nil {
			continue
		}
		if isStale(alert.ResourceID) {
			toClear = append(toClear, effectiveAlertID(alert, storageKey))
		}
	}
	for resourceID := range m.dockerStateConfirm {
		if isStale(resourceID) {
			delete(m.dockerStateConfirm, resourceID)
		}
	}
	// Cleanup update tracking for removed containers
	for resourceID := range m.dockerUpdateFirstSeen {
		if isStale(resourceID) {
			delete(m.dockerUpdateFirstSeen, resourceID)
		}
	}
	if seenUpdateTracking != nil {
		for trackingKey := range m.dockerUpdateFirstSeenByIdentity {
			if !strings.HasPrefix(trackingKey, updateTrackingPrefix) {
				continue
			}
			if _, present := seenUpdateTracking[trackingKey]; !present {
				delete(m.dockerUpdateFirstSeenByIdentity, trackingKey)
			}
		}
	}
	m.mu.Unlock()

	for _, alertID := range toClear {
		m.clearAlert(alertID)
	}
}

// clearDockerHostContainerAlerts clears every container alert of host and
// drops all per-container tracking state kept for that host: state
// confirmations, restart tracking, last exit codes and both update first-seen
// maps.
//
// Map pruning happens under the manager lock. The alerts themselves are
// cleared afterwards through clearAlert, which acquires the lock on its own.
func (m *Manager) clearDockerHostContainerAlerts(host models.DockerHost) {
	prefix := fmt.Sprintf("docker:%s/", strings.TrimSpace(host.ID))
	updateTrackingPrefix := dockerUpdateTrackingHostPrefix(host)

	m.mu.Lock()
	var toClear []string
	for storageKey, alert := range m.activeAlerts {
		// Skip nil entries instead of dereferencing them, as the other scans
		// over activeAlerts in this file do.
		if alert == nil {
			continue
		}
		if strings.HasPrefix(alert.ResourceID, prefix) {
			toClear = append(toClear, effectiveAlertID(alert, storageKey))
		}
	}
	for resourceID := range m.dockerStateConfirm {
		if strings.HasPrefix(resourceID, prefix) {
			delete(m.dockerStateConfirm, resourceID)
		}
	}
	for resourceID := range m.dockerRestartTracking {
		if strings.HasPrefix(resourceID, prefix) {
			delete(m.dockerRestartTracking, resourceID)
		}
	}
	for resourceID := range m.dockerLastExitCode {
		if strings.HasPrefix(resourceID, prefix) {
			delete(m.dockerLastExitCode, resourceID)
		}
	}
	for resourceID := range m.dockerUpdateFirstSeen {
		if strings.HasPrefix(resourceID, prefix) {
			delete(m.dockerUpdateFirstSeen, resourceID)
		}
	}
	for trackingKey := range m.dockerUpdateFirstSeenByIdentity {
		if strings.HasPrefix(trackingKey, updateTrackingPrefix) {
			delete(m.dockerUpdateFirstSeenByIdentity, trackingKey)
		}
	}
	m.mu.Unlock()

	for _, alertID := range toClear {
		m.clearAlert(alertID)
	}
}

// CheckStorage evaluates a storage resource against its alert thresholds.
//
// It returns immediately when alerting is disabled. When all storage alerts
// are disabled in the config, or the thresholds resolved for this storage are
// marked Disabled, any existing usage and connectivity alerts for the storage
// are cleared and nothing else is evaluated. Otherwise it handles connectivity
// (unless DisableConnectivity is set), evaluates usage while the storage is
// neither "offline" nor "unavailable" and reports a usage above zero, and
// checks ZFS pool health when pool details are present.
func (m *Manager) CheckStorage(storage models.Storage) {
	m.mu.RLock()
	if !m.config.Enabled {
		m.mu.RUnlock()
		return
	}
	if m.config.DisableAllStorage {
		// Swap the read lock for the write lock before mutating active alerts.
		m.mu.RUnlock()
		// Clear any existing storage alerts when all storage alerts are disabled
		m.mu.Lock()
		usageAlertID := canonicalMetricStateID(storage.ID, "usage")
		if m.clearActiveAlertIfPresentNoLock(usageAlertID) {
			log.Info().
				Str("alertID", usageAlertID).
				Str("storage", storage.Name).
				Msg("Cleared usage alert - all storage alerts disabled")
		}
		offlineAlertID := canonicalConnectivityStateID(storage.ID)
		if m.clearActiveAlertIfPresentNoLock(offlineAlertID) {
			log.Info().
				Str("alertID", offlineAlertID).
				Str("storage", storage.Name).
				Msg("Cleared offline alert - all storage alerts disabled")
		}
		m.mu.Unlock()
		return
	}

	// Thresholds are resolved while the read lock is still held.
	thresholds := m.resolveResourceThresholds("storage", storage.ID)
	m.mu.RUnlock()

	// Alerts disabled for this storage: drop pending offline confirmations and
	// clear both alert kinds.
	if thresholds.Disabled {
		m.mu.Lock()
		delete(m.offlineConfirmations, storage.ID)
		m.mu.Unlock()
		m.clearAlert(canonicalMetricStateID(storage.ID, "usage"))
		m.clearAlert(canonicalConnectivityStateID(storage.ID))
		return
	}

	// Check if storage is truly offline/unavailable (not just inactive from other nodes)
	// Note: In a cluster, local storage from other nodes shows as inactive which is normal
	if thresholds.DisableConnectivity {
		// Connectivity alerts are disabled: forget confirmations and clear the alert.
		m.mu.Lock()
		delete(m.offlineConfirmations, storage.ID)
		m.mu.Unlock()
		m.clearAlert(canonicalConnectivityStateID(storage.ID))
	} else if storage.Status == "offline" || storage.Status == "unavailable" {
		m.checkStorageOffline(storage)
	} else {
		// Clear any existing offline alert if storage is back online
		m.clearStorageOfflineAlert(storage)
	}

	// Check usage if storage has valid data (even if not currently active on this node)
	// In clusters, storage may show as inactive on nodes where it's not currently mounted
	// but we still want to alert on high usage
	log.Debug().
		Str("storage", storage.Name).
		Str("id", storage.ID).
		Float64("usage", storage.Usage).
		Str("status", storage.Status).
		Float64("trigger", thresholds.Usage.Trigger).
		Float64("clear", thresholds.Usage.Clear).
		Msg("Checking storage thresholds")

	// Evaluate usage only when the storage is neither offline nor unavailable
	// and reports a usage above zero. Usage is passed as the Disk metric.
	if storage.Status != "offline" && storage.Status != "unavailable" && storage.Usage > 0 {
		m.evaluateUnifiedMetrics(&UnifiedResourceInput{
			ID:       storage.ID,
			Type:     "storage",
			Name:     storage.Name,
			Node:     storage.Node,
			Instance: storage.Instance,
			Disk:     &UnifiedResourceMetric{Percent: storage.Usage},
		}, thresholds, nil)
	}

	// Check ZFS pool status if this is ZFS storage
	if storage.ZFSPool != nil {
		m.checkZFSPoolHealth(storage)
	}
}

// BuildGuestKey returns the canonical guest key "instance:node:vmid".
// Instance and node are trimmed of surrounding whitespace, and an empty
// instance falls back to the node name.
// This matches the format used by makeGuestID in the monitoring package.
func BuildGuestKey(instance, node string, vmID int) string {
	nodeName := strings.TrimSpace(node)
	instanceName := strings.TrimSpace(instance)
	if instanceName == "" {
		// No instance known: the node name stands in for it.
		instanceName = nodeName
	}
	return fmt.Sprintf("%s:%s:%d", instanceName, nodeName, vmID)
}

// backupRecord is the most recent successful backup known for one backup
// subject, as collected by CheckBackups from the protection rollups.
type backupRecord struct {
	key          string              // grouping key: guest key when a guest was resolved, else the rollup ID
	vmID         string              // VMID in decimal form when one could be determined, else empty
	lookup       GuestLookup         // matched guest; zero value when no guest was matched
	fallbackName string              // display name used when lookup.Name is empty
	instance     string              // instance of the resolved guest, if any
	node         string              // node of the resolved guest, if any
	source       string              // coarse source label: "PMG", "PBS", "PVE" or "Recovery"
	rollupID     string              // trimmed ID of the originating rollup
	providers    []recovery.Provider // copy of the rollup's providers
	lastTime     time.Time           // time of the last successful backup, in UTC
}

// canonicalGuestResourceType maps a guest type string to its unified resource
// type. "lxc" (ignoring case and surrounding whitespace) maps to a system
// container; every other value is treated as a VM.
func canonicalGuestResourceType(guestType string) unifiedresources.ResourceType {
	normalized := strings.ToLower(strings.TrimSpace(guestType))
	if normalized == "lxc" {
		return unifiedresources.ResourceTypeSystemContainer
	}
	return unifiedresources.ResourceTypeVM
}

// canonicalBackupSubjectResourceType picks the unified resource type for a
// backup subject: the matched guest's type when known, VM when only a VMID is
// known, and the generic "backup-subject" type otherwise.
func canonicalBackupSubjectResourceType(record backupRecord) unifiedresources.ResourceType {
	switch {
	case record.lookup.Type != "":
		return canonicalGuestResourceType(record.lookup.Type)
	case strings.TrimSpace(record.vmID) != "":
		return unifiedresources.ResourceTypeVM
	default:
		return unifiedresources.ResourceType("backup-subject")
	}
}

// canonicalBackupSubjectResourceID returns the resource ID used for a backup
// subject's alert spec. When the record carries an instance, a node and a
// positive numeric VMID, the guest key is used; otherwise the ID is
// "backup-subject:" followed by the sanitized alert key.
func canonicalBackupSubjectResourceID(alertKey string, record backupRecord) string {
	hasGuestIdentity := record.instance != "" && record.node != "" && record.vmID != ""
	if !hasGuestIdentity {
		return "backup-subject:" + sanitizeAlertKey(alertKey)
	}

	vmid, err := strconv.Atoi(record.vmID)
	if err != nil || vmid <= 0 {
		return "backup-subject:" + sanitizeAlertKey(alertKey)
	}
	return BuildGuestKey(record.instance, record.node, vmid)
}

// asyncSaveActiveAlerts runs save on a new goroutine. A returned error is
// logged together with reason, and a panic raised by save is recovered and
// logged instead of propagating.
func asyncSaveActiveAlerts(reason string, save func() error) {
	// logPanic must be the deferred function itself so that recover takes effect.
	logPanic := func() {
		if r := recover(); r != nil {
			log.Error().Interface("panic", r).Str("reason", reason).Msg("panic in SaveActiveAlerts goroutine")
		}
	}

	go func() {
		defer logPanic()

		err := save()
		if err == nil {
			return
		}
		log.Error().Err(err).Str("reason", reason).Msg("failed to save active alerts")
	}()
}

// CheckSnapshotsForInstance evaluates guest snapshots for age- and size-based
// alerts.
//
// instanceName restricts the pass to one instance; snapshots that name a
// different non-empty instance are skipped. guestNames maps guest keys (see
// BuildGuestKey) to display names. Nothing happens when alerting is disabled,
// and all snapshot alerts of the instance are cleared when snapshot alerting is
// disabled in the defaults.
//
// Each snapshot is checked against the default snapshot thresholds or the
// guest's override. Active "snapshot-age" alerts of the instance that were not
// re-validated during this pass are cleared at the end. Active alerts are
// persisted asynchronously once per pass when at least one alert was
// activated, rather than once per activated alert.
func (m *Manager) CheckSnapshotsForInstance(instanceName string, snapshots []models.GuestSnapshot, guestNames map[string]string) {
	m.mu.RLock()
	enabled := m.config.Enabled
	snapshotCfg := m.config.SnapshotDefaults
	m.mu.RUnlock()

	if !enabled {
		return
	}

	if !snapshotCfg.Enabled {
		m.clearSnapshotAlertsForInstance(instanceName)
		return
	}

	now := time.Now()
	validAlerts := make(map[string]struct{})
	// activated records whether any alert was newly activated during this pass
	// so that the active alerts are saved once at the end.
	activated := false

	for _, snapshot := range snapshots {
		if instanceName != "" && snapshot.Instance != "" && snapshot.Instance != instanceName {
			continue
		}
		if snapshot.Time.IsZero() {
			continue
		}

		ageHours := now.Sub(snapshot.Time).Hours()
		if ageHours < 0 {
			// Snapshot timestamp lies in the future.
			continue
		}
		ageDays := ageHours / 24

		const gib = 1024.0 * 1024 * 1024
		sizeGiB := 0.0
		if snapshot.SizeBytes > 0 {
			sizeGiB = float64(snapshot.SizeBytes) / gib
		}

		// Determine thresholds for this snapshot
		// NOTE(review): this lookup key is built without the trimming and
		// empty-instance fallback that BuildGuestKey applies below - confirm
		// which form guest threshold overrides are keyed by.
		resourceID := fmt.Sprintf("%s:%s:%d", snapshot.Instance, snapshot.Node, snapshot.VMID)
		m.mu.RLock()
		gh := m.getGuestThresholds(nil, resourceID)
		m.mu.RUnlock()

		if gh.Disabled {
			continue
		}

		currentSnapshotCfg := snapshotCfg
		if gh.Snapshot != nil {
			currentSnapshotCfg = *gh.Snapshot
		}

		if !currentSnapshotCfg.Enabled {
			continue
		}

		var ageLevel AlertLevel
		var ageThreshold int
		var sizeLevel AlertLevel
		var sizeThreshold float64
		var triggeredStats []string

		if currentSnapshotCfg.CriticalDays > 0 && ageDays >= float64(currentSnapshotCfg.CriticalDays) {
			ageLevel = AlertLevelCritical
			ageThreshold = currentSnapshotCfg.CriticalDays
			triggeredStats = append(triggeredStats, "age")
		} else if currentSnapshotCfg.WarningDays > 0 && ageDays >= float64(currentSnapshotCfg.WarningDays) {
			ageLevel = AlertLevelWarning
			ageThreshold = currentSnapshotCfg.WarningDays
			triggeredStats = append(triggeredStats, "age")
		}

		if snapshot.SizeBytes > 0 {
			if currentSnapshotCfg.CriticalSizeGiB > 0 && sizeGiB >= currentSnapshotCfg.CriticalSizeGiB {
				sizeLevel = AlertLevelCritical
				sizeThreshold = currentSnapshotCfg.CriticalSizeGiB
				triggeredStats = append(triggeredStats, "size")
			} else if currentSnapshotCfg.WarningSizeGiB > 0 && sizeGiB >= currentSnapshotCfg.WarningSizeGiB {
				sizeLevel = AlertLevelWarning
				sizeThreshold = currentSnapshotCfg.WarningSizeGiB
				triggeredStats = append(triggeredStats, "size")
			}
		}

		if ageLevel == "" && sizeLevel == "" {
			continue
		}

		// Size becomes the primary metric when it is the only trigger, or when
		// it is critical while age is not.
		useSizePrimary := false
		if sizeLevel == AlertLevelCritical && ageLevel != AlertLevelCritical {
			useSizePrimary = true
		} else if sizeLevel != "" && ageLevel == "" {
			useSizePrimary = true
		}

		alertID := fmt.Sprintf("snapshot-age-%s", snapshot.ID)

		guestKey := BuildGuestKey(snapshot.Instance, snapshot.Node, snapshot.VMID)
		guestName := strings.TrimSpace(guestNames[guestKey])

		guestType := "VM"
		if strings.EqualFold(snapshot.Type, "lxc") {
			guestType = "Container"
		}

		if guestName == "" {
			switch guestType {
			case "Container":
				guestName = fmt.Sprintf("CT %d", snapshot.VMID)
			default:
				guestName = fmt.Sprintf("VM %d", snapshot.VMID)
			}
		}

		snapshotName := strings.TrimSpace(snapshot.Name)
		if snapshotName == "" {
			snapshotName = "(unnamed)"
		}

		ageDaysRounded := math.Round(ageDays*10) / 10
		sizeGiBRounded := math.Round(sizeGiB*10) / 10
		reasons := make([]string, 0, 2)
		if ageLevel != "" {
			reasons = append(reasons, fmt.Sprintf("%.1f days old (threshold %d days)", ageDaysRounded, ageThreshold))
		}
		if sizeLevel != "" {
			reasons = append(reasons, fmt.Sprintf("%.1f GiB (threshold %.1f GiB)", sizeGiBRounded, sizeThreshold))
		}
		reasonText := strings.Join(reasons, " and ")
		message := fmt.Sprintf(
			"%s snapshot '%s' for %s is %s on %s",
			guestType,
			snapshotName,
			guestName,
			reasonText,
			snapshot.Node,
		)

		// The alert start time is the moment the age threshold was crossed,
		// capped at now; size-primary alerts start now.
		alertValue := ageDays
		alertThreshold := float64(ageThreshold)
		thresholdTime := now
		if useSizePrimary {
			alertValue = sizeGiB
			alertThreshold = sizeThreshold
		} else if ageThreshold > 0 {
			thresholdTime = snapshot.Time.Add(time.Duration(ageThreshold) * 24 * time.Hour)
			if thresholdTime.After(now) {
				thresholdTime = now
			}
		}

		metadata := map[string]interface{}{
			"snapshotName":      snapshot.Name,
			"snapshotCreatedAt": snapshot.Time,
			"snapshotAgeDays":   ageDays,
			"snapshotAgeHours":  ageHours,
			"snapshotSizeBytes": snapshot.SizeBytes,
			"snapshotSizeGiB":   sizeGiB,
			"guestName":         guestName,
			"guestType":         guestType,
			"guestInstance":     snapshot.Instance,
			"guestNode":         snapshot.Node,
			"guestVmid":         snapshot.VMID,
			"triggeredMetrics":  triggeredStats,
			"primaryMetric":     "age",
		}
		if useSizePrimary {
			metadata["primaryMetric"] = "size"
		}
		if ageLevel != "" {
			metadata["thresholdDays"] = ageThreshold
		}
		if sizeLevel != "" {
			metadata["thresholdSizeGiB"] = sizeThreshold
		}

		resourceName := fmt.Sprintf("%s snapshot '%s'", guestName, snapshotName)
		guestResourceType := canonicalGuestResourceType(snapshot.Type)
		guestResourceID := guestKey
		// A metric is only named in the spec when at least one of its
		// thresholds is configured.
		sizeMetric := ""
		var sizeValue *float64
		if currentSnapshotCfg.WarningSizeGiB > 0 || currentSnapshotCfg.CriticalSizeGiB > 0 {
			sizeMetric = "snapshot-size-gib"
			sizeValue = &sizeGiB
		}
		ageMetric := ""
		if currentSnapshotCfg.WarningDays > 0 || currentSnapshotCfg.CriticalDays > 0 {
			ageMetric = "snapshot-age-days"
		}

		spec, err := buildCanonicalPostureThresholdSpec(
			guestResourceID+"/snapshot:"+snapshot.ID,
			guestResourceID,
			resourceName,
			guestResourceType,
			ageMetric,
			float64(currentSnapshotCfg.WarningDays),
			float64(currentSnapshotCfg.CriticalDays),
			sizeMetric,
			currentSnapshotCfg.WarningSizeGiB,
			currentSnapshotCfg.CriticalSizeGiB,
			false,
		)
		if err != nil {
			log.Warn().
				Err(err).
				Str("snapshotID", snapshot.ID).
				Str("resourceID", guestResourceID).
				Msg("Skipping invalid canonical snapshot posture spec")
			continue
		}
		validAlerts[canonicalTrackingKeyForSpec(spec, alertID)] = struct{}{}

		result, _ := m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
			Spec: spec,
			Evidence: alertspecs.AlertEvidence{
				ObservedAt: now,
				PostureThreshold: &alertspecs.PostureThresholdEvidence{
					AgeMetric:  ageMetric,
					AgeValue:   ageDays,
					SizeMetric: sizeMetric,
					SizeValue:  sizeValue,
				},
			},
			AlertID:           alertID,
			AlertType:         "snapshot-age",
			ResourceID:        spec.ResourceID,
			ResourceName:      resourceName,
			Node:              snapshot.Node,
			Instance:          snapshot.Instance,
			Value:             alertValue,
			Threshold:         alertThreshold,
			StartTimeOverride: thresholdTime,
			Metadata:          metadata,
			AddToRecent:       true,
			AddToHistory:      true,
			RateLimit:         true,
			DispatchAsync:     true,
			MessageBuilder: func(result alertspecs.EvaluationResult) (string, float64, float64) {
				return message, alertValue, alertThreshold
			},
		})
		if result.Transition != nil && result.Transition.Kind == alertspecs.EvaluationTransitionActivated {
			activated = true
		}
	}

	// Clear snapshot alerts of this instance that were not re-validated above.
	m.mu.Lock()
	for storageKey, alert := range m.activeAlerts {
		if alert == nil || alert.Type != "snapshot-age" {
			continue
		}
		if instanceName != "" && alert.Instance != instanceName {
			continue
		}
		if _, ok := validAlerts[storageKey]; ok {
			continue
		}
		m.clearAlertNoLock(storageKey)
	}
	m.mu.Unlock()

	// Persist once per pass instead of spawning one save per activated alert.
	if activated {
		asyncSaveActiveAlerts("snapshot", m.SaveActiveAlerts)
	}
}

// CheckBackups evaluates storage, PBS, and PMG backups for age-based alerts.
//
// rollups are the protection rollups to inspect; rollups without a last
// success time are ignored. guestsByKey maps guest keys (see BuildGuestKey) to
// guests, and guestsByVMID maps a VMID string to every guest carrying that
// VMID. All backup alerts are cleared when alerting or backup alerting is
// disabled, when neither a warning nor a critical age is configured, or when
// no usable rollup remains.
//
// Rollups are reduced to the most recent backup per subject. Each subject is
// checked against the default backup thresholds or the guest's override.
// Active "backup-age" alerts not re-validated during this pass are cleared at
// the end. Active alerts are persisted asynchronously once per pass when at
// least one alert was activated, rather than once per activated alert.
func (m *Manager) CheckBackups(
	rollups []recovery.ProtectionRollup,
	guestsByKey map[string]GuestLookup,
	guestsByVMID map[string][]GuestLookup,
) {
	m.mu.RLock()
	enabled := m.config.Enabled
	backupCfg := m.config.BackupDefaults
	m.mu.RUnlock()

	// Orphaned-backup alerting defaults to on when unset.
	if backupCfg.AlertOrphaned == nil {
		alertOrphaned := true
		backupCfg.AlertOrphaned = &alertOrphaned
	}

	if !enabled || !backupCfg.Enabled {
		m.clearBackupAlerts()
		return
	}

	if backupCfg.WarningDays <= 0 && backupCfg.CriticalDays <= 0 {
		m.clearBackupAlerts()
		return
	}

	records := make(map[string]*backupRecord)

	// updateRecord keeps, per key, the candidate with the latest backup time.
	updateRecord := func(key string, candidate backupRecord) {
		if key == "" {
			return
		}
		if existing, ok := records[key]; ok {
			if candidate.lastTime.After(existing.lastTime) {
				*existing = candidate
			}
			return
		}
		record := candidate
		records[key] = &record
	}

	now := time.Now()

	for _, rollup := range rollups {
		if rollup.LastSuccessAt == nil || rollup.LastSuccessAt.IsZero() {
			continue
		}

		lastTime := rollup.LastSuccessAt.UTC()
		providers := append([]recovery.Provider(nil), rollup.Providers...)

		source := "Recovery"
		if slicesContainsProvider(providers, recovery.ProviderProxmoxPMG) {
			source = "PMG"
		} else if slicesContainsProvider(providers, recovery.ProviderProxmoxPBS) {
			source = "PBS"
		} else if slicesContainsProvider(providers, recovery.ProviderProxmoxPVE) {
			source = "PVE"
		}

		var (
			info        GuestLookup
			key         string
			displayName string
			instance    string
			node        string
			vmID        string
		)

		ref := rollup.SubjectRef

		// Primary: subjectRef.ID is the canonical proxmox guest source ID (instance:node:vmid) when linked.
		if ref != nil && strings.TrimSpace(ref.ID) != "" {
			if inst, nd, vmid, ok := parseGuestID(ref.ID); ok {
				key = BuildGuestKey(inst, nd, vmid)
				info = guestsByKey[key]
				instance = inst
				node = nd
				vmID = strconv.Itoa(vmid)
			}
		}

		// Secondary: attempt to map by VMID for orphaned/ambiguous backups.
		if key == "" && ref != nil {
			vmidStr := strings.TrimSpace(ref.ID)
			if vmidStr == "" {
				vmidStr = strings.TrimSpace(ref.Name)
			}
			if vmidStr != "" {
				if vmid, err := strconv.Atoi(vmidStr); err == nil && vmid > 0 {
					vmID = vmidStr
					guests := guestsByVMID[vmidStr]
					if len(guests) == 1 {
						info = guests[0]
					} else if len(guests) > 1 && strings.TrimSpace(ref.Namespace) != "" {
						// Several guests share the VMID: disambiguate by namespace.
						for _, g := range guests {
							if namespaceMatchesInstance(ref.Namespace, g.Instance) {
								info = g
								break
							}
						}
					}
					if info.Instance != "" && info.Node != "" {
						key = BuildGuestKey(info.Instance, info.Node, info.VMID)
						instance = info.Instance
						node = info.Node
					}
				}
			}
		}

		if key == "" {
			// Stable fallback for non-guest subjects and orphans.
			key = strings.TrimSpace(rollup.RollupID)
			if key == "" {
				continue
			}
		}

		displayName = strings.TrimSpace(info.Name)
		if displayName == "" && ref != nil {
			displayName = strings.TrimSpace(ref.Name)
		}
		if displayName == "" && vmID != "" {
			displayName = fmt.Sprintf("VMID %s", vmID)
		}
		if displayName == "" {
			displayName = "Unknown"
		}

		updateRecord(key, backupRecord{
			key:          key,
			vmID:         vmID,
			lookup:       info,
			fallbackName: displayName,
			instance:     instance,
			node:         node,
			source:       source,
			rollupID:     strings.TrimSpace(rollup.RollupID),
			providers:    providers,
			lastTime:     lastTime,
		})
	}

	if len(records) == 0 {
		m.clearBackupAlerts()
		return
	}

	validAlerts := make(map[string]struct{})
	// activated records whether any alert was newly activated during this pass
	// so that the active alerts are saved once at the end.
	activated := false

	for key, record := range records {
		age := now.Sub(record.lastTime)
		if age < 0 {
			// Backup timestamp lies in the future.
			continue
		}

		ageDays := age.Hours() / 24
		ageDaysRounded := math.Round(ageDays*10) / 10

		// Determine thresholds for this backup
		currentBackupCfg := backupCfg
		if record.lookup.ResourceID != "" {
			m.mu.RLock()
			gh := m.getGuestThresholds(nil, record.lookup.ResourceID)
			m.mu.RUnlock()
			if gh.Disabled {
				continue
			}
			if gh.Backup != nil {
				currentBackupCfg = *gh.Backup
			}
		}

		// Orphan handling and the ignore list always come from the defaults,
		// even when a guest override supplies the thresholds.
		currentBackupCfg.AlertOrphaned = backupCfg.AlertOrphaned
		currentBackupCfg.IgnoreVMIDs = backupCfg.IgnoreVMIDs

		if backupIgnoreVMID(record.vmID, currentBackupCfg.IgnoreVMIDs) {
			continue
		}
		// A VMID without a matched guest is an orphaned backup.
		if record.vmID != "" && record.lookup.ResourceID == "" {
			if currentBackupCfg.AlertOrphaned != nil && !*currentBackupCfg.AlertOrphaned {
				continue
			}
		}

		if !currentBackupCfg.Enabled {
			continue
		}

		var threshold int
		switch {
		case currentBackupCfg.CriticalDays > 0 && ageDays >= float64(currentBackupCfg.CriticalDays):
			threshold = currentBackupCfg.CriticalDays
		case currentBackupCfg.WarningDays > 0 && ageDays >= float64(currentBackupCfg.WarningDays):
			threshold = currentBackupCfg.WarningDays
		default:
			continue
		}

		alertKey := sanitizeAlertKey(key)
		alertID := fmt.Sprintf("backup-age-%s", alertKey)

		displayName := record.lookup.Name
		if displayName == "" {
			displayName = record.fallbackName
		}
		if displayName == "" {
			displayName = "Unknown guest"
		}

		node := record.node
		if node == "" {
			node = record.lookup.Node
		}
		instance := record.instance
		if instance == "" {
			instance = record.lookup.Instance
		}

		// The alert start time is the moment the threshold was crossed, capped
		// at now.
		thresholdTime := record.lastTime.Add(time.Duration(threshold) * 24 * time.Hour)
		if thresholdTime.After(now) {
			thresholdTime = now
		}

		// Prefer the list of provider names over the coarse source label.
		sourceLabel := record.source
		if len(record.providers) > 0 {
			parts := make([]string, 0, len(record.providers))
			for _, p := range record.providers {
				if s := strings.TrimSpace(string(p)); s != "" {
					parts = append(parts, s)
				}
			}
			if len(parts) > 0 {
				sourceLabel = strings.Join(parts, ", ")
			}
		}

		message := fmt.Sprintf(
			"%s backup via %s is %.1f days old (threshold: %d days)",
			displayName,
			sourceLabel,
			ageDaysRounded,
			threshold,
		)

		metadata := map[string]interface{}{
			"source":         record.source,
			"providers":      record.providers,
			"rollupId":       record.rollupID,
			"lastBackupTime": record.lastTime,
			"ageDays":        ageDays,
			"thresholdDays":  threshold,
		}
		specResourceID := canonicalBackupSubjectResourceID(alertKey, *record)
		specResourceType := canonicalBackupSubjectResourceType(*record)
		spec, err := buildCanonicalPostureThresholdSpec(
			specResourceID+"-backup-age",
			specResourceID,
			displayName+" backup",
			specResourceType,
			"backup-age-days",
			float64(currentBackupCfg.WarningDays),
			float64(currentBackupCfg.CriticalDays),
			"",
			0,
			0,
			false,
		)
		if err != nil {
			log.Warn().
				Err(err).
				Str("alertID", alertID).
				Str("resourceID", specResourceID).
				Msg("Skipping invalid canonical backup posture spec")
			continue
		}
		validAlerts[canonicalTrackingKeyForSpec(spec, alertID)] = struct{}{}

		result, _ := m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
			Spec: spec,
			Evidence: alertspecs.AlertEvidence{
				ObservedAt: now,
				PostureThreshold: &alertspecs.PostureThresholdEvidence{
					AgeMetric: "backup-age-days",
					AgeValue:  ageDays,
				},
			},
			AlertID:           alertID,
			AlertType:         "backup-age",
			ResourceID:        spec.ResourceID,
			ResourceName:      fmt.Sprintf("%s backup", displayName),
			Node:              node,
			Instance:          instance,
			Value:             ageDays,
			Threshold:         float64(threshold),
			StartTimeOverride: thresholdTime,
			Metadata:          metadata,
			AddToRecent:       true,
			AddToHistory:      true,
			RateLimit:         true,
			DispatchAsync:     true,
			MessageBuilder: func(result alertspecs.EvaluationResult) (string, float64, float64) {
				return message, ageDays, float64(threshold)
			},
		})
		if result.Transition != nil && result.Transition.Kind == alertspecs.EvaluationTransitionActivated {
			activated = true
		}
	}

	// Clear backup alerts that were not re-validated above.
	m.mu.Lock()
	for storageKey, alert := range m.activeAlerts {
		if alert == nil || alert.Type != "backup-age" {
			continue
		}
		if _, ok := validAlerts[storageKey]; ok {
			continue
		}
		m.clearAlertNoLock(storageKey)
	}
	m.mu.Unlock()

	// Persist once per pass instead of spawning one save per activated alert.
	if activated {
		asyncSaveActiveAlerts("backup", m.SaveActiveAlerts)
	}
}

// slicesContainsProvider reports whether target is present in providers.
// A nil or empty slice never contains the target.
func slicesContainsProvider(providers []recovery.Provider, target recovery.Provider) bool {
	return slices.Contains(providers, target)
}

// parseGuestID splits a guest ID of the form "instance:node:vmid".
//
// The last colon-separated segment must be a positive integer VMID and the
// segment before it is the node; everything in front of that (which may itself
// contain colons) is the instance. All parts are trimmed of surrounding
// whitespace. ok is false when the input is blank, has fewer than three
// segments, or the VMID is not a positive integer.
func parseGuestID(raw string) (instance string, node string, vmid int, ok bool) {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return "", "", 0, false
	}

	// Peel the VMID off the right-hand side.
	vmidSep := strings.LastIndex(trimmed, ":")
	if vmidSep < 0 {
		return "", "", 0, false
	}
	head, vmidPart := trimmed[:vmidSep], trimmed[vmidSep+1:]

	// Then peel off the node; whatever remains is the instance.
	nodeSep := strings.LastIndex(head, ":")
	if nodeSep < 0 {
		return "", "", 0, false
	}
	instancePart, nodePart := head[:nodeSep], head[nodeSep+1:]

	id, err := strconv.Atoi(strings.TrimSpace(vmidPart))
	if err != nil || id <= 0 {
		return "", "", 0, false
	}
	return strings.TrimSpace(instancePart), strings.TrimSpace(nodePart), id, true
}

// checkZFSPoolHealth syncs the ZFS alerts of a storage resource: one for the
// pool state, one for the pool's aggregate read/write/checksum errors, and one
// per pool device. It does nothing when the storage carries no pool details.
func (m *Manager) checkZFSPoolHealth(storage models.Storage) {
	pool := storage.ZFSPool
	if pool == nil {
		return
	}

	// Every spec ID below derives from the same pool-scoped resource ID.
	poolSpecResourceID := storage.ID + "/zfs-pool:" + sanitizeHostComponent(pool.Name)
	poolDisplayName := fmt.Sprintf("%s (%s)", storage.Name, pool.Name)
	assessment := storagehealth.AssessZFSPool(*pool)

	// Pool state (DEGRADED, FAULTED, etc.)
	stateResult, _ := m.syncCanonicalHealthAssessmentAlert(canonicalHealthAssessmentAlertParams{
		SpecID:         poolSpecResourceID + "-state",
		Signal:         "zfs-pool",
		Codes:          zfsPoolAssessmentCodes,
		Reasons:        filterStorageHealthReasonsByCodes(assessment.Reasons, zfsPoolAssessmentCodes),
		AlertID:        fmt.Sprintf("zfs-pool-state-%s", storage.ID),
		AlertType:      "zfs-pool-state",
		SpecResourceID: poolSpecResourceID,
		ResourceID:     storage.ID,
		ResourceName:   poolDisplayName,
		ResourceType:   unifiedresources.ResourceTypeStorage,
		Node:           storage.Node,
		Instance:       storage.Instance,
		Metadata: map[string]interface{}{
			"pool_name":  pool.Name,
			"pool_state": pool.State,
		},
	})
	if stateResult.Transition != nil && stateResult.Transition.Kind == alertspecs.EvaluationTransitionActivated {
		log.Warn().
			Str("pool", pool.Name).
			Str("state", pool.State).
			Str("node", storage.Node).
			Msg("ZFS pool is not healthy")
	}

	// Aggregate read/write/checksum errors of the pool
	totalErrors := pool.ReadErrors + pool.WriteErrors + pool.ChecksumErrors
	errorsSpecID := poolSpecResourceID + "-errors"
	errorsStateID := buildCanonicalStateID(poolSpecResourceID, errorsSpecID)
	if totalErrors <= 0 {
		m.clearAlert(errorsStateID)
	} else {
		// Only re-sync when there is no alert yet or the error count grew.
		previousTotal, alreadyActive := m.activeAlertValue(errorsStateID)
		if !alreadyActive || float64(totalErrors) > previousTotal {
			errorsResult, _ := m.syncCanonicalHealthAssessmentAlert(canonicalHealthAssessmentAlertParams{
				SpecID:         errorsSpecID,
				Signal:         "zfs-pool-errors",
				Codes:          zfsPoolErrorAssessmentCodes,
				Reasons:        filterStorageHealthReasonsByCodes(assessment.Reasons, zfsPoolErrorAssessmentCodes),
				AlertID:        fmt.Sprintf("zfs-pool-errors-%s", storage.ID),
				AlertType:      "zfs-pool-errors",
				SpecResourceID: poolSpecResourceID,
				ResourceID:     storage.ID,
				ResourceName:   poolDisplayName,
				ResourceType:   unifiedresources.ResourceTypeStorage,
				Node:           storage.Node,
				Instance:       storage.Instance,
				Metadata: map[string]interface{}{
					"pool_name":       pool.Name,
					"read_errors":     pool.ReadErrors,
					"write_errors":    pool.WriteErrors,
					"checksum_errors": pool.ChecksumErrors,
				},
				MessageBuilder: func(result alertspecs.EvaluationResult) (string, float64, float64) {
					summary := fmt.Sprintf(
						"ZFS pool '%s' has errors: %d read, %d write, %d checksum",
						pool.Name, pool.ReadErrors, pool.WriteErrors, pool.ChecksumErrors,
					)
					return summary, float64(totalErrors), 0
				},
			})
			if errorsResult.Transition != nil && errorsResult.Transition.Kind == alertspecs.EvaluationTransitionActivated {
				log.Error().
					Str("pool", pool.Name).
					Int64("read_errors", pool.ReadErrors).
					Int64("write_errors", pool.WriteErrors).
					Int64("checksum_errors", pool.ChecksumErrors).
					Str("node", storage.Node).
					Msg("ZFS pool has I/O errors")
			}
		}
	}

	// Individual devices of the pool
	for _, device := range pool.Devices {
		deviceSpecResourceID := poolSpecResourceID + "/device:" + sanitizeHostComponent(device.Name)
		deviceAssessment := zfsDeviceAssessment(device)
		deviceErrors := device.ReadErrors + device.WriteErrors + device.ChecksumErrors

		deviceResult, _ := m.syncCanonicalHealthAssessmentAlert(canonicalHealthAssessmentAlertParams{
			SpecID:         deviceSpecResourceID + "-health",
			Signal:         "zfs-device",
			Codes:          zfsDeviceAssessmentCodes,
			Reasons:        deviceAssessment.Reasons,
			AlertID:        fmt.Sprintf("zfs-device-%s-%s", storage.ID, device.Name),
			AlertType:      "zfs-device",
			SpecResourceID: deviceSpecResourceID,
			ResourceID:     storage.ID,
			ResourceName:   fmt.Sprintf("%s (%s/%s)", storage.Name, pool.Name, device.Name),
			ResourceType:   unifiedresources.ResourceTypeStorage,
			Node:           storage.Node,
			Instance:       storage.Instance,
			Metadata: map[string]interface{}{
				"pool_name":       pool.Name,
				"device_name":     device.Name,
				"device_state":    device.State,
				"read_errors":     device.ReadErrors,
				"write_errors":    device.WriteErrors,
				"checksum_errors": device.ChecksumErrors,
			},
			MessageBuilder: func(result alertspecs.EvaluationResult) (string, float64, float64) {
				summary := strings.Join(storageHealthReasonSummaries(deviceAssessment.Reasons), "; ")
				return summary, float64(deviceErrors), 0
			},
		})
		if deviceResult.Transition == nil || deviceResult.Transition.Kind != alertspecs.EvaluationTransitionActivated {
			continue
		}
		log.Warn().
			Str("pool", pool.Name).
			Str("device", device.Name).
			Str("state", device.State).
			Int64("errors", deviceErrors).
			Str("node", storage.Node).
			Msg("ZFS device has issues")
	}
}

// clearAlert drops the active alert stored under alertID, if there is one,
// records it as recently resolved, invokes the resolved-alert callback and
// logs the removal under the alert's effective ID. It is a no-op when no
// active alert is found for alertID.
func (m *Manager) clearAlert(alertID string) {
	m.mu.Lock()
	alert, found := m.getActiveAlertNoLock(alertID)
	if !found {
		m.mu.Unlock()
		return
	}
	m.removeActiveAlertNoLock(alertID)
	m.mu.Unlock()

	// Everything below runs after m.mu has been released.
	publicID := effectiveAlertID(alert, alertID)
	m.addRecentlyResolvedUnlocked(&ResolvedAlert{
		Alert:        alert,
		ResolvedTime: time.Now(),
	})
	m.safeCallResolvedAlertCallback(alert, publicID, false)

	log.Info().Str("alertID", publicID).Msg("Alert cleared")
}

// getTimeThreshold picks the delay for a metric/resource combination. The
// first (resource ID) argument is ignored.
//
// Lookup order:
//  1. a metric-specific delay configured for the resource type;
//  2. the resource-type base delay, when one is configured for the type itself;
//  3. a metric-specific delay from the global metric thresholds;
//  4. whatever base value getBaseTimeThreshold reported.
func (m *Manager) getTimeThreshold(_ string, resourceType, metricType string) int {
	if metricDelay, found := m.getMetricTimeThreshold(resourceType, metricType); found {
		return metricDelay
	}

	baseDelay, typeSpecific := m.getBaseTimeThreshold(resourceType)
	if typeSpecific {
		return baseDelay
	}

	if globalDelay, found := m.getGlobalMetricTimeThreshold(metricType); found {
		return globalDelay
	}
	return baseDelay
}

// getMetricTimeThreshold looks for a metric-specific delay configured at the
// resource-type level. For each key returned by CanonicalResourceTypeKeys, in
// order, it tries the normalized metric name and then the wildcard entries
// "default", "_default" and "*". The boolean result reports whether any entry
// matched.
func (m *Manager) getMetricTimeThreshold(resourceType, metricType string) (int, bool) {
	if len(m.config.MetricTimeThresholds) == 0 {
		return 0, false
	}

	metricKey := strings.ToLower(strings.TrimSpace(metricType))
	if metricKey == "" {
		return 0, false
	}

	lookupOrder := [...]string{metricKey, "default", "_default", "*"}
	for _, typeKey := range CanonicalResourceTypeKeys(resourceType) {
		// A missing type yields a nil map, on which every lookup simply misses.
		perType := m.config.MetricTimeThresholds[typeKey]
		for _, candidate := range lookupOrder {
			if delay, found := perType[candidate]; found {
				return delay, true
			}
		}
	}

	return 0, false
}

// getBaseTimeThreshold returns the resource-type level delay. The boolean is
// true only when the delay was found under one of the resource type's own
// canonical keys; the "all" entry (or 0 when that is absent too) is returned
// with false.
func (m *Manager) getBaseTimeThreshold(resourceType string) (int, bool) {
	thresholds := m.config.TimeThresholds
	if thresholds == nil {
		return 0, false
	}

	for _, typeKey := range CanonicalResourceTypeKeys(resourceType) {
		if delay, found := thresholds[typeKey]; found {
			return delay, true
		}
	}

	// A missing "all" entry reads as the zero value, i.e. no delay.
	return thresholds["all"], false
}

// getGlobalMetricTimeThreshold looks for a delay in the "all" section of the
// metric time thresholds. It tries the normalized metric name first and then
// the wildcard entries "default", "_default" and "*". The boolean result
// reports whether any entry matched.
func (m *Manager) getGlobalMetricTimeThreshold(metricType string) (int, bool) {
	// Indexing a nil or empty outer map yields a nil inner map, so a single
	// length check covers both "no config" and "no global section".
	globalThresholds := m.config.MetricTimeThresholds["all"]
	if len(globalThresholds) == 0 {
		return 0, false
	}

	metricKey := strings.ToLower(strings.TrimSpace(metricType))
	if metricKey == "" {
		return 0, false
	}

	for _, candidate := range [...]string{metricKey, "default", "_default", "*"} {
		if delay, found := globalThresholds[candidate]; found {
			return delay, true
		}
	}

	return 0, false
}

// CanonicalResourceTypeKeys returns the ordered list of keys to consult when
// looking up thresholds for resourceType: the normalized type itself followed
// by progressively more general fallback types. It returns nil when the
// normalized type is empty or is an unsupported legacy alias.
func CanonicalResourceTypeKeys(resourceType string) []string {
	typeKey := canonicalAlertResourceType(resourceType)
	if typeKey == "" || isUnsupportedLegacyAlertResourceType(typeKey) {
		return nil
	}

	// Fallback keys tried after the type's own key, most specific first.
	// Types not listed here (including "guest", "node" and "storage") have
	// no fallbacks.
	var fallbacks []string
	switch typeKey {
	case "vm", "system-container", "app-container", "k8s-cluster", "pod":
		fallbacks = []string{"guest"}
	case "oci-container":
		fallbacks = []string{"system-container", "guest"}
	case "docker-service":
		fallbacks = []string{"app-container", "guest"}
	case "docker-host", "agent", "pbs", "pmg", "k8s-node":
		fallbacks = []string{"node"}
	case "agent-disk":
		fallbacks = []string{"agent", "storage"}
	case "disk", "pool", "dataset", "ceph":
		fallbacks = []string{"storage"}
	case "datastore":
		fallbacks = []string{"storage", "pbs"}
	case "physical_disk":
		fallbacks = []string{"disk", "storage"}
	}

	keys := make([]string, 0, 1+len(fallbacks))
	keys = append(keys, typeKey)
	return append(keys, fallbacks...)
}

// isUnsupportedLegacyAlertResourceType reports whether typeKey is a legacy
// resource-type alias that alert threshold lookups refuse to handle. A key is
// rejected when unifiedresources flags it as an unsupported legacy alias or
// when it appears in the alert-specific list below. typeKey is compared
// verbatim, so callers are expected to pass an already-normalized value.
func isUnsupportedLegacyAlertResourceType(typeKey string) bool {
	if unifiedresources.IsUnsupportedLegacyResourceTypeAlias(typeKey) {
		return true
	}

	switch typeKey {
	case "host", "qemu", "container", "lxc":
		return true
	case "docker", "docker container", "dockercontainer",
		"docker host", "dockerhost",
		"docker service", "dockerservice":
		return true
	case "k8s", "k8s pod", "kubernetes", "kubernetes-cluster":
		return true
	case "agent disk", "agentdisk":
		return true
	case "pbs server", "pbsserver", "pmg server", "proxmox mail gateway":
		return true
	}
	return false
}

// canonicalAlertResourceType normalizes a resource type for lookups by
// stripping surrounding whitespace and lower-casing the remainder.
func canonicalAlertResourceType(resourceType string) string {
	trimmed := strings.TrimSpace(resourceType)
	return strings.ToLower(trimmed)
}

// metricOptions carries optional per-call overrides for checkMetric.
type metricOptions struct {
	// Metadata entries are copied into the alert's metadata map; Message, when
	// non-empty, is used as the alert message instead of the generated one.
	Metadata map[string]interface{}
	Message  string
	// MonitorOnly suppresses external notifications while still tracking the alert.
	MonitorOnly bool
}

// checkMetric checks a single metric against its threshold with hysteresis.
//
// The alert is identified by "<resourceID>-<metricType>" and by a canonical
// state ID built from resourceID and the spec ID "metric-threshold:<metricType>".
//
// Behaviour, in order:
//   - A nil threshold or a Trigger <= 0 clears any alert under either ID and returns.
//   - While a suppression entry for the tracking key is still in the future,
//     nothing else is evaluated.
//   - value >= Trigger with no existing alert: an optional time threshold
//     (see getTimeThreshold) must elapse first, then the minimum-delta check
//     may suppress the alert; otherwise a new alert is created (warning, or
//     critical when value >= Trigger+10), stored, added to history, persisted
//     in a goroutine, handed to the AI callbacks and, subject to the rate
//     limit, dispatched.
//   - value >= Trigger with an existing alert: the alert is refreshed in place
//     and re-dispatched when the cooldown has passed or it escalated to
//     critical, unless it is acknowledged.
//   - value < Trigger: any pending entry is dropped; an existing alert is
//     resolved once value <= Clear (or <= Trigger when Clear is not positive).
//
// opts may be nil. m.mu is acquired by this method and held until it returns.
func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resourceType, metricType string, value float64, threshold *HysteresisThreshold, opts *metricOptions) {
	alertID := fmt.Sprintf("%s-%s", resourceID, metricType)
	canonicalSpecID := "metric-threshold:" + metricType
	canonicalStateID := buildCanonicalStateID(resourceID, canonicalSpecID)

	// No usable threshold: drop any alert stored under either ID form.
	if threshold == nil || threshold.Trigger <= 0 {
		m.clearAlert(canonicalStateID)
		m.clearAlert(alertID)
		return
	}

	log.Debug().
		Str("resource", resourceName).
		Str("metric", metricType).
		Float64("value", value).
		Float64("trigger", threshold.Trigger).
		Float64("clear", threshold.Clear).
		Bool("exceeds", value >= threshold.Trigger).
		Msg("Checking metric threshold")

	m.mu.Lock()
	migratedAlertIdentity := false
	// Deferred calls run last-in-first-out, so the mutex below is released
	// before asyncSaveActiveAlerts is invoked here.
	defer func() {
		if migratedAlertIdentity {
			asyncSaveActiveAlerts("guest metric node move", m.SaveActiveAlerts)
		}
	}()
	defer m.mu.Unlock()

	// Look the alert up under alertID first, then under the canonical state
	// ID, and finally ask migrateGuestMetricAlertNoLock for one to adopt.
	existingAlert, exists := m.getActiveAlertNoLock(alertID)
	if !exists && canonicalStateID != "" {
		existingAlert, exists = m.getActiveAlertNoLock(canonicalStateID)
	}
	if !exists && canonicalStateID != "" {
		if migrated := m.migrateGuestMetricAlertNoLock(canonicalStateID, canonicalSpecID, string(alertspecs.AlertSpecKindMetricThreshold), resourceID, resourceName, node, instance, resourceType); migrated != nil {
			existingAlert = migrated
			exists = true
			migratedAlertIdentity = true
		}
	}
	// trackingKey indexes the pending, recent and suppression maps below.
	trackingKey := canonicalTrackingKeyOrFallback(existingAlert, canonicalStateID)
	if trackingKey == "" {
		trackingKey = canonicalStateID
	}
	monitorOnly := opts != nil && opts.MonitorOnly

	// Check for suppression
	if suppressUntil, suppressed := m.suppressedUntil[trackingKey]; suppressed && time.Now().Before(suppressUntil) {
		log.Debug().
			Str("alertID", alertID).
			Str("trackingKey", trackingKey).
			Time("suppressedUntil", suppressUntil).
			Msg("Alert suppressed")
		return
	}

	if value >= threshold.Trigger {
		// Threshold exceeded
		if !exists {
			alertStartTime := time.Now()

			// Determine the appropriate time threshold based on resource/metric type
			timeThreshold := m.getTimeThreshold(resourceID, resourceType, metricType)

			// Check if we have a time threshold configured
			if timeThreshold > 0 {
				// Check if this threshold was already pending
				if pendingTime, isPending := m.pendingAlerts[trackingKey]; isPending {
					// Check if enough time has passed
					if time.Since(pendingTime) >= time.Duration(timeThreshold)*time.Second {
						// Time threshold met, proceed with alert
						delete(m.pendingAlerts, trackingKey)
						// Date the alert from when the threshold was first exceeded.
						if !pendingTime.IsZero() {
							alertStartTime = pendingTime
						}
						log.Debug().
							Str("alertID", alertID).
							Int("timeThreshold", timeThreshold).
							Dur("elapsed", time.Since(pendingTime)).
							Msg("Time threshold met, triggering alert")
					} else {
						// Still waiting for time threshold
						log.Debug().
							Str("alertID", alertID).
							Int("timeThreshold", timeThreshold).
							Dur("elapsed", time.Since(pendingTime)).
							Msg("Threshold exceeded but waiting for time threshold")
						return
					}
				} else {
					// First time exceeding threshold, start tracking
					m.pendingAlerts[trackingKey] = alertStartTime
					log.Debug().
						Str("alertID", alertID).
						Str("trackingKey", trackingKey).
						Int("timeThreshold", timeThreshold).
						Msg("Threshold exceeded, starting time threshold tracking")
					return
				}
			}

			// Check for recent similar alert to prevent spam
			if recent, hasRecent := m.recentAlerts[trackingKey]; hasRecent {
				// Check minimum delta
				if m.config.MinimumDelta > 0 &&
					time.Since(recent.StartTime) < time.Duration(m.config.SuppressionWindow)*time.Minute &&
					abs(recent.Value-value) < m.config.MinimumDelta {
					log.Debug().
						Str("alertID", alertID).
						Float64("recentValue", recent.Value).
						Float64("currentValue", value).
						Float64("delta", abs(recent.Value-value)).
						Float64("minimumDelta", m.config.MinimumDelta).
						Msg("Alert suppressed due to minimum delta")

					// Set suppression window
					m.suppressedUntil[trackingKey] = time.Now().Add(time.Duration(m.config.SuppressionWindow) * time.Minute)
					return
				}
			}

			// New alert
			message := ""
			var unit string
			if opts != nil && opts.Message != "" {
				message = opts.Message
			} else {
				// Generate a default message; the unit depends on the metric type.
				switch metricType {
				case "usage":
					message = fmt.Sprintf("%s at %.1f%%", resourceType, value)
				case "diskRead", "diskWrite", "networkIn", "networkOut":
					message = fmt.Sprintf("%s %s at %.1f MB/s", resourceType, metricType, value)
					unit = "MB/s"
				case "temperature", "disk_temperature", "diskTemperature":
					message = fmt.Sprintf("%s %s at %.1f°C", resourceType, metricType, value)
					unit = "°C"
				default:
					message = fmt.Sprintf("%s %s at %.1f%%", resourceType, metricType, value)
				}
			}

			alertMetadata := map[string]interface{}{
				"resourceType":   resourceType,
				"clearThreshold": threshold.Clear,
			}
			if unit != "" {
				alertMetadata["unit"] = unit
			}
			// Caller-supplied metadata may override the defaults above;
			// "monitorOnly" is written last and always reflects opts.
			if opts != nil && opts.Metadata != nil {
				for k, v := range opts.Metadata {
					alertMetadata[k] = v
				}
			}
			alertMetadata["monitorOnly"] = monitorOnly

			alert := &Alert{
				ID:              alertID,
				Type:            metricType,
				Level:           AlertLevelWarning,
				ResourceID:      resourceID,
				ResourceName:    resourceName,
				Node:            node,
				NodeDisplayName: m.resolveNodeDisplayName(instance, node),
				Instance:        instance,
				Message:         message,
				Value:           value,
				Threshold:       threshold.Trigger,
				StartTime:       alertStartTime,
				LastSeen:        time.Now(),
				Metadata:        alertMetadata,
			}
			applyCanonicalIdentity(alert, canonicalSpecID, string(alertspecs.AlertSpecKindMetricThreshold))

			// Set level based on how much over threshold
			// (critical once the value is at least 10 above the trigger).
			if value >= threshold.Trigger+10 {
				alert.Level = AlertLevelCritical
			}

			log.Debug().
				Str("alertID", alertID).
				Time("alertStartTime", alertStartTime).
				Time("now", time.Now()).
				Dur("initialDuration", time.Since(alertStartTime)).
				Msg("Creating new alert with start time")

			m.preserveAlertState(canonicalStateID, alert)
			trackingKey = canonicalTrackingKeyOrFallback(alert, canonicalStateID)
			m.setActiveAlertNoLock(canonicalStateID, alert)
			m.recentAlerts[trackingKey] = alert
			m.historyManager.AddAlert(*alert)

			// Save active alerts after adding new one
			go func() {
				defer func() {
					if r := recover(); r != nil {
						log.Error().Interface("panic", r).Msg("panic in SaveActiveAlerts goroutine")
					}
				}()
				if err := m.SaveActiveAlerts(); err != nil {
					log.Error().Err(err).Msg("failed to save active alerts after creation")
				}
			}()

			log.Warn().
				Str("alertID", alertID).
				Str("resource", resourceName).
				Str("metric", metricType).
				Float64("value", value).
				Float64("trigger", threshold.Trigger).
				Float64("clear", threshold.Clear).
				Int("activeAlerts", len(m.activeAlerts)).
				Msg("Alert triggered")

			// Trigger AI analysis callback unconditionally (bypasses notification suppression)
			if callbacks := m.getAlertForAICallbacks(); len(callbacks) > 0 {
				// The callbacks receive a copy and run in their own goroutine.
				alertCopy := cloneAlertForOutput(alert)
				go func(a *Alert, fns []func(*Alert)) {
					defer func() {
						if r := recover(); r != nil {
							log.Error().Interface("panic", r).Str("alertID", a.ID).Msg("panic in AI alert callback")
						}
					}()
					for _, callback := range fns {
						callback(a)
					}
				}(alertCopy, callbacks)
			}

			// Check rate limit (but don't remove alert from tracking)
			if !m.checkRateLimit(trackingKey) {
				log.Debug().
					Str("alertID", alertID).
					Str("trackingKey", trackingKey).
					Int("maxPerHour", m.config.Schedule.MaxAlertsHour).
					Msg("Alert notification suppressed due to rate limit")
				// Don't delete the alert, just suppress notifications
				return
			}

			// Notify callback (may be suppressed by quiet hours)
			if len(m.getAlertCallbacks()) > 0 {
				// LastNotified is set before dispatching and rolled back when
				// dispatchAlert reports that nothing was sent.
				now := time.Now()
				alert.LastNotified = &now
				if m.dispatchAlert(alert, true) {
					log.Info().Str("alertID", alertID).Msg("calling onAlert callback")
				} else {
					alert.LastNotified = nil
				}
			} else {
				log.Warn().Msg("no onAlert callback set!")
			}
		} else {
			// Update existing alert
			applyCanonicalIdentity(existingAlert, canonicalSpecID, string(alertspecs.AlertSpecKindMetricThreshold))
			m.setActiveAlertNoLock(canonicalStateID, existingAlert)
			existingAlert.LastSeen = time.Now()
			existingAlert.Value = value
			// Keep display name current (handles upgrades and renames).
			if dn := m.resolveNodeDisplayName(existingAlert.Instance, existingAlert.Node); dn != "" {
				existingAlert.NodeDisplayName = dn
			}
			if existingAlert.Metadata == nil {
				existingAlert.Metadata = map[string]interface{}{}
			}
			existingAlert.Metadata["resourceType"] = resourceType
			existingAlert.Metadata["clearThreshold"] = threshold.Clear
			existingAlert.Metadata["monitorOnly"] = monitorOnly
			if opts != nil {
				if opts.Message != "" {
					existingAlert.Message = opts.Message
				}
				if opts.Metadata != nil {
					for k, v := range opts.Metadata {
						existingAlert.Metadata[k] = v
					}
				}
			}

			// Update level if needed
			oldLevel := existingAlert.Level
			if value >= threshold.Trigger+10 {
				existingAlert.Level = AlertLevelCritical
			} else {
				existingAlert.Level = AlertLevelWarning
			}

			// Check if we should re-notify based on cooldown period
			// Never re-notify acknowledged alerts (user has already seen it)
			shouldRenotify := false
			if existingAlert.Acknowledged {
				log.Debug().
					Str("alertID", alertID).
					Msg("Alert is acknowledged, skipping re-notification")
			} else if m.shouldNotifyAfterCooldown(existingAlert) {
				shouldRenotify = true
				log.Debug().
					Str("alertID", alertID).
					Dur("cooldown", time.Duration(m.config.Schedule.Cooldown)*time.Minute).
					Msg("Cooldown period has passed, will re-notify")
			} else if oldLevel != existingAlert.Level && existingAlert.Level == AlertLevelCritical {
				// Always re-notify if alert escalated to critical
				shouldRenotify = true
				log.Debug().
					Str("alertID", alertID).
					Msg("Alert escalated to critical, will re-notify despite cooldown")
			}

			// Send re-notification if appropriate (may be suppressed by quiet hours)
			if shouldRenotify && len(m.getAlertCallbacks()) > 0 {
				now := time.Now()
				existingAlert.LastNotified = &now
				// Dispatch asynchronously so callback I/O cannot block alert evaluation.
				if m.dispatchAlert(existingAlert, true) {
					log.Info().
						Str("alertID", alertID).
						Str("level", string(existingAlert.Level)).
						Msg("Re-notifying for existing alert")
				} else {
					existingAlert.LastNotified = nil
				}
			}
		}
	} else {
		// Value is below trigger threshold
		// Clear any pending alert for this metric
		if _, isPending := m.pendingAlerts[trackingKey]; isPending {
			delete(m.pendingAlerts, trackingKey)
			log.Debug().
				Str("alertID", alertID).
				Str("trackingKey", trackingKey).
				Msg("Value dropped below threshold, clearing pending alert")
		}

		if exists {
			// Use hysteresis for resolution - only resolve if below clear threshold
			clearThreshold := threshold.Clear
			if clearThreshold <= 0 {
				clearThreshold = threshold.Trigger // Fallback to trigger if clear not set
			}

			if value <= clearThreshold {
				// Threshold cleared with hysteresis - auto resolve
				resolvedAlert := &ResolvedAlert{
					Alert:        existingAlert,
					ResolvedTime: time.Now(),
				}

				// Remove from active alerts
				m.removeActiveAlertNoLock(alertID)

				// Save active alerts after resolution
				go func() {
					defer func() {
						if r := recover(); r != nil {
							log.Error().Interface("panic", r).Msg("panic in SaveActiveAlerts goroutine (resolution)")
						}
					}()
					if err := m.SaveActiveAlerts(); err != nil {
						log.Error().Err(err).Msg("failed to save active alerts after resolution")
					}
				}()

				// Add to recently resolved while preventing lock-order inversions
				m.addRecentlyResolvedWithPrimaryLock(resolvedAlert)

				log.Info().
					Str("alertID", alertID).
					Msg("Added alert to recently resolved")

				log.Info().
					Str("resource", resourceName).
					Str("metric", metricType).
					Float64("value", value).
					Float64("clearThreshold", clearThreshold).
					Bool("wasAcknowledged", existingAlert.Acknowledged).
					Msg("Alert resolved with hysteresis")

				m.safeCallResolvedAlertCallback(existingAlert, alertID, true)
			}
		}
	}
}

// sanitizeAlertKey turns a free-form label (typically a mount point or device
// path) into a lowercase key made of [a-z0-9.] with single dashes standing in
// for every run of other characters.
//
// Special cases: a blank label yields ""; "/" or a label consisting only of
// slashes, backslashes and spaces yields "root"; a label that sanitizes to
// nothing yields "disk". Leading and trailing dashes and dots are removed.
func sanitizeAlertKey(label string) string {
	name := strings.TrimSpace(label)
	switch name {
	case "":
		return ""
	case "/":
		return "root"
	}

	name = strings.Trim(name, "/\\ ")
	if name == "" {
		name = "root"
	}
	name = strings.ToLower(name)

	var out strings.Builder
	out.Grow(len(name))
	lastWasDash := false
	for _, r := range name {
		switch {
		case r == '.', 'a' <= r && r <= 'z', '0' <= r && r <= '9':
			out.WriteRune(r)
			lastWasDash = false
		case !lastWasDash:
			// Collapse any run of disallowed characters into one dash.
			out.WriteRune('-')
			lastWasDash = true
		}
	}

	if result := strings.Trim(out.String(), "-."); result != "" {
		return result
	}
	return "disk"
}

// abs returns the absolute value of x. It defers to math.Abs so that the
// special cases are handled per the standard library: abs(-0) is +0,
// abs(±Inf) is +Inf and abs(NaN) is NaN.
func abs(x float64) float64 {
	return math.Abs(x)
}

// namespaceMatchesInstance reports whether a PBS namespace likely belongs to a
// PVE instance. It is used to disambiguate backups when several PVE instances
// contain VMs with the same VMID.
//
// Both names are lower-cased and reduced to their ASCII letters and digits.
// They match when, after that normalization, one is a suffix of the other
// (which includes being equal):
//
//   - namespace "pve1" matches instance "pve1"
//   - namespace "nat" matches instance "pve-nat" (normalized "pvenat")
//   - namespace "backups-pve" matches instance "pve"
//   - namespace "pve" does NOT match instance "pve-nat"
//   - namespace "pvebackups" does NOT match instance "pve"
//
// An empty name, or one that normalizes to nothing, never matches.
func namespaceMatchesInstance(namespace, instance string) bool {
	if namespace == "" || instance == "" {
		return false
	}

	// keepAlnum drops every rune that is not an ASCII lowercase letter or digit.
	keepAlnum := func(r rune) rune {
		if ('a' <= r && r <= 'z') || ('0' <= r && r <= '9') {
			return r
		}
		return -1
	}
	ns := strings.Map(keepAlnum, strings.ToLower(namespace))
	inst := strings.Map(keepAlnum, strings.ToLower(instance))
	if ns == "" || inst == "" {
		return false
	}

	// Suffix matching is deliberately stricter than substring matching; a
	// string is a suffix of itself, so equality is covered as well.
	return strings.HasSuffix(inst, ns) || strings.HasSuffix(ns, inst)
}

// AcknowledgeAlert marks the active alert identified by alertID as
// acknowledged by user, stamps the acknowledgement time, stores an ack record
// and then invokes the acknowledged callback with a clone of the alert after
// releasing m.mu. It returns an error wrapping ErrAlertNotFound when no active
// alert matches alertID.
func (m *Manager) AcknowledgeAlert(alertID, user string) error {
	m.mu.Lock()

	var alert *Alert
	key, found := m.resolveActiveAlertKeyNoLock(alertID)
	if found {
		alert, found = m.getActiveAlertNoLock(key)
	}
	if !found || alert == nil {
		m.mu.Unlock()
		return fmt.Errorf("%w: %s", ErrAlertNotFound, alertID)
	}

	ackedAt := time.Now()
	alert.Acknowledged = true
	alert.AckTime = &ackedAt
	alert.AckUser = user

	m.setActiveAlertNoLock(key, alert)
	record := ackRecord{
		acknowledged: true,
		user:         user,
		time:         ackedAt,
	}
	m.setAckRecordNoLock(alert, alertID, record)

	// Clone while still holding the lock; the callback runs without it.
	snapshot := alert.Clone()
	m.mu.Unlock()

	log.Debug().
		Str("alertID", alertID).
		Str("user", user).
		Time("ackTime", ackedAt).
		Msg("Alert acknowledgment recorded")

	m.safeCallAcknowledgedCallback(snapshot, user)
	return nil
}

// UnacknowledgeAlert clears the acknowledged flag, time and user on the active
// alert identified by alertID, deletes its ack record and then invokes the
// unacknowledged callback with a clone of the alert after releasing m.mu. It
// returns an error wrapping ErrAlertNotFound when no active alert matches
// alertID.
func (m *Manager) UnacknowledgeAlert(alertID string) error {
	m.mu.Lock()

	var alert *Alert
	key, found := m.resolveActiveAlertKeyNoLock(alertID)
	if found {
		alert, found = m.getActiveAlertNoLock(key)
	}
	if !found || alert == nil {
		m.mu.Unlock()
		return fmt.Errorf("%w: %s", ErrAlertNotFound, alertID)
	}

	alert.AckUser = ""
	alert.AckTime = nil
	alert.Acknowledged = false

	m.setActiveAlertNoLock(key, alert)
	m.deleteAckRecordNoLock(alert, alertID)

	// Clone while still holding the lock; the callback runs without it.
	snapshot := alert.Clone()
	m.mu.Unlock()

	log.Info().Str("alertID", alertID).Msg("Alert unacknowledged")

	m.safeCallUnacknowledgedCallback(snapshot, "")
	return nil
}

// preserveAlertState carries user-visible state over to a freshly built alert
// before it replaces the entry stored under alertID, so that rebuilding alerts
// during polling does not reset what the UI shows.
//
// When an active alert exists under alertID, its start time, last-notified
// time, acknowledgement fields and escalation history are copied onto updated.
// Otherwise a stored ack record, if one is marked acknowledged, is applied.
// In both cases the canonical identity is backfilled and an empty node display
// name is resolved first. A nil updated is ignored.
func (m *Manager) preserveAlertState(alertID string, updated *Alert) {
	if updated == nil {
		return
	}
	backfillCanonicalIdentity(updated)

	// Fill in the node display name when it has not been set yet.
	if updated.Node != "" && updated.NodeDisplayName == "" {
		updated.NodeDisplayName = m.resolveNodeDisplayName(updated.Instance, updated.Node)
	}

	previous, found := m.getActiveAlertNoLock(alertID)
	if !found || previous == nil {
		// No live alert to copy from: fall back to the remembered ack record.
		record, ok := m.getAckRecordNoLock(updated, alertID)
		if ok && record.acknowledged {
			ackedAt := record.time
			updated.Acknowledged = true
			updated.AckUser = record.user
			updated.AckTime = &ackedAt
		}
		return
	}

	// copyTime returns an independent copy of the pointed-to time, or nil.
	copyTime := func(src *time.Time) *time.Time {
		if src == nil {
			return nil
		}
		dup := *src
		return &dup
	}

	var escalations []time.Time
	if len(previous.EscalationTimes) > 0 {
		escalations = append([]time.Time(nil), previous.EscalationTimes...)
	}

	// Keep the original start time so duration calculations stay correct.
	updated.StartTime = previous.StartTime
	updated.LastNotified = copyTime(previous.LastNotified)
	updated.Acknowledged = previous.Acknowledged
	updated.AckUser = previous.AckUser
	updated.AckTime = copyTime(previous.AckTime)
	updated.LastEscalation = previous.LastEscalation
	updated.EscalationTimes = escalations

	log.Debug().
		Str("alertID", alertID).
		Time("originalStartTime", previous.StartTime).
		Dur("currentDuration", time.Since(previous.StartTime)).
		Msg("Preserving alert state including StartTime")
}

// removeActiveAlertNoLock deletes the active alert identified by alertID from
// m.activeAlerts. The caller must hold m.mu.
//
// alertID is first resolved to a storage key via resolveActiveAlertKeyNoLock
// and, failing that, via resolveActiveAlertKeyByCanonicalStateNoLock. When
// neither resolves, nothing happens.
//
// The alert is read back through the resolved key rather than the raw
// alertID, so an alert that was only found through the canonical-state
// fallback still gets its history entry updated and its alias unregistered
// before the map entry is deleted.
func (m *Manager) removeActiveAlertNoLock(alertID string) {
	key, exists := m.resolveActiveAlertKeyNoLock(alertID)
	if !exists {
		key, exists = m.resolveActiveAlertKeyByCanonicalStateNoLock(alertID)
	}
	if !exists {
		return
	}

	publicID := alertID
	var currentAlert *Alert
	if alert := m.activeAlerts[key]; alert != nil {
		currentAlert = alert
		backfillCanonicalIdentity(alert)
		publicID = effectiveAlertID(alert, alertID)
		// Before deleting, update the history entry with the alert's final
		// LastSeen timestamp so the stored duration reflects how long the
		// alert was actually active.
		m.historyManager.UpdateAlertLastSeenForAlert(alert, alert.LastSeen)
		m.unregisterActiveAlertAliasNoLock(key, alert)
	}
	delete(m.activeAlerts, key)

	// NOTE: Don't delete ackState here - preserve it so if the same alert
	// reappears (e.g., powered-off VM during backup), the acknowledgement
	// is restored via preserveAlertState. ackState is cleaned up in Cleanup().
	// Update inactiveAt so the cleanup TTL is measured from removal time, not ack time.
	m.markAckInactiveNoLock(currentAlert, publicID, time.Now())
}

// GetActiveAlerts returns output copies of all active alerts, sorted so the
// most important come first. A nil Manager yields nil.
//
// Sort order: severity, protection, recoverability, impact and type ranks
// (each descending), then start time (oldest first), node name and finally ID.
func (m *Manager) GetActiveAlerts() []Alert {
	if m == nil {
		return nil
	}

	m.mu.RLock()
	defer m.mu.RUnlock()

	alerts := make([]Alert, 0, len(m.activeAlerts))
	for _, stored := range m.activeAlerts {
		exported := *cloneAlertForOutput(stored)
		// Ensure display name is current (handles upgrades, renames, and
		// alerts created before the cache was populated).
		displayName := m.resolveNodeDisplayName(exported.Instance, exported.Node)
		if displayName != "" {
			exported.NodeDisplayName = displayName
		}
		alerts = append(alerts, exported)
	}

	// Rank functions in priority order; a higher rank sorts earlier.
	rankers := [...]func(Alert) int{
		alertSeveritySortRank,
		alertProtectionSortRank,
		alertRecoverabilitySortRank,
		alertImpactSortRank,
		alertTypeSortRank,
	}

	sort.Slice(alerts, func(i, j int) bool {
		for _, rank := range rankers {
			left, right := rank(alerts[i]), rank(alerts[j])
			if left != right {
				return left > right
			}
		}

		switch {
		case !alerts[i].StartTime.Equal(alerts[j].StartTime):
			return alerts[i].StartTime.Before(alerts[j].StartTime)
		case alerts[i].Node != alerts[j].Node:
			return alerts[i].Node < alerts[j].Node
		default:
			return alerts[i].ID < alerts[j].ID
		}
	})

	return alerts
}

// alertProtectionSortRank ranks an alert by its protection metadata: 2 when
// "protectionReduced" is truthy, 1 when "rebuildInProgress" is truthy, and 0
// otherwise.
func alertProtectionSortRank(alert Alert) int {
	if metadataBoolValue(alert.Metadata, "protectionReduced") {
		return 2
	}
	if metadataBoolValue(alert.Metadata, "rebuildInProgress") {
		return 1
	}
	return 0
}

// alertSeveritySortRank ranks an alert by level: critical is 2, warning is 1
// and any other level is 0.
func alertSeveritySortRank(alert Alert) int {
	if alert.Level == AlertLevelCritical {
		return 2
	}
	if alert.Level == AlertLevelWarning {
		return 1
	}
	return 0
}

// alertImpactSortRank ranks an alert by the integer value of its
// "consumerCount" metadata entry; a missing entry or nil metadata ranks 0.
func alertImpactSortRank(alert Alert) int {
	// Reading from a nil map yields nil, which metadataIntValue maps to 0.
	consumers := alert.Metadata["consumerCount"]
	return metadataIntValue(consumers)
}

// alertRecoverabilitySortRank ranks an alert by its backup-related metadata.
// Alerts flagged "backupTarget" or "backupServer" rank 2 when
// "protectedWorkloadCount" is positive and 1 otherwise; all other alerts
// rank 0.
func alertRecoverabilitySortRank(alert Alert) int {
	isBackupTarget := metadataBoolValue(alert.Metadata, "backupTarget")
	isBackupServer := metadataBoolValue(alert.Metadata, "backupServer")
	if !isBackupTarget && !isBackupServer {
		return 0
	}

	if metadataIntValue(alert.Metadata["protectedWorkloadCount"]) > 0 {
		return 2
	}
	return 1
}

// alertTypeSortRank ranks an alert by its Type, from 6 (backup posture
// incidents) down to 1 for any type not listed.
func alertTypeSortRank(alert Alert) int {
	rank := 1
	switch alert.Type {
	case "backup-posture-incident":
		rank = 6
	case "backup-storage-incident":
		rank = 5
	case "storage-incident", "zfs-pool-state", "zfs-pool-errors", "resource-incident":
		rank = 4
	case "disk-health", "disk-wearout", "zfs-device":
		rank = 3
	case "offline", "connectivity", "powered-off", "docker-host-offline":
		rank = 2
	}
	return rank
}

// metadataIntValue converts a loosely typed metadata value to an int.
//
// Supported inputs are all built-in integer and float types, json.Number
// (when it holds an integer) and strings containing a base-10 integer,
// optionally surrounded by whitespace. Anything else, including nil and
// unparsable text, yields 0.
//
// Conversions saturate instead of wrapping: values beyond the int range are
// clamped to math.MaxInt / math.MinInt, and a NaN float yields 0. Floats are
// otherwise truncated toward zero.
func metadataIntValue(value interface{}) int {
	switch v := value.(type) {
	case int:
		return v
	case int8:
		return int(v)
	case int16:
		return int(v)
	case int32:
		return int(v)
	case int64:
		return metadataClampInt64(v)
	case uint:
		return metadataClampUint64(uint64(v))
	case uint8:
		return int(v)
	case uint16:
		return int(v)
	case uint32:
		return metadataClampUint64(uint64(v))
	case uint64:
		return metadataClampUint64(v)
	case float32:
		return metadataClampFloat64(float64(v))
	case float64:
		return metadataClampFloat64(v)
	case json.Number:
		if parsed, err := v.Int64(); err == nil {
			return metadataClampInt64(parsed)
		}
	case string:
		if parsed, err := strconv.Atoi(strings.TrimSpace(v)); err == nil {
			return parsed
		}
	}
	return 0
}

// metadataClampInt64 converts v to int, saturating at the int range.
func metadataClampInt64(v int64) int {
	if v > math.MaxInt {
		return math.MaxInt
	}
	if v < math.MinInt {
		return math.MinInt
	}
	return int(v)
}

// metadataClampUint64 converts v to int, saturating at math.MaxInt.
func metadataClampUint64(v uint64) int {
	if v > uint64(math.MaxInt) {
		return math.MaxInt
	}
	return int(v)
}

// metadataClampFloat64 truncates f toward zero and converts it to int. NaN
// yields 0 and values outside the int range (including ±Inf) saturate, because
// the Go specification leaves out-of-range float-to-int conversions
// implementation-dependent.
func metadataClampFloat64(f float64) int {
	switch {
	case math.IsNaN(f):
		return 0
	case f >= float64(math.MaxInt):
		return math.MaxInt
	case f <= float64(math.MinInt):
		return math.MinInt
	}
	return int(f)
}

// metadataStringValue returns the metadata entry stored under key as a
// whitespace-trimmed string. Strings, byte slices and json.Number values are
// used as-is; any other value (including a stored nil) is rendered with
// fmt.Sprint. A nil map or a missing key yields "".
func metadataStringValue(metadata map[string]interface{}, key string) string {
	// Reading from a nil map is safe and reports the key as absent.
	raw, present := metadata[key]
	if !present {
		return ""
	}

	var text string
	switch typed := raw.(type) {
	case string:
		text = typed
	case []byte:
		text = string(typed)
	case json.Number:
		text = typed.String()
	default:
		text = fmt.Sprint(typed)
	}
	return strings.TrimSpace(text)
}

// metadataBoolValue interprets the metadata entry stored under key as a
// boolean. Booleans are returned as-is; strings are true when they equal "1",
// "true", "yes" or "on" (case-insensitive, surrounding whitespace ignored);
// integers and floats are true when non-zero; a json.Number is true when it
// parses as a non-zero integer. A nil map, a missing key or any other value
// type yields false.
func metadataBoolValue(metadata map[string]interface{}, key string) bool {
	// Reading from a nil map is safe and reports the key as absent.
	raw, present := metadata[key]
	if !present {
		return false
	}

	switch typed := raw.(type) {
	case bool:
		return typed
	case string:
		normalized := strings.ToLower(strings.TrimSpace(typed))
		return normalized == "1" || normalized == "true" || normalized == "yes" || normalized == "on"
	case json.Number:
		parsed, err := typed.Int64()
		return err == nil && parsed != 0
	case float64:
		return typed != 0
	case float32:
		return typed != 0
	case int:
		return typed != 0
	case int64:
		return typed != 0
	case int32:
		return typed != 0
	case int16:
		return typed != 0
	case int8:
		return typed != 0
	case uint:
		return typed != 0
	case uint64:
		return typed != 0
	case uint32:
		return typed != 0
	case uint16:
		return typed != 0
	case uint8:
		return typed != 0
	}
	return false
}

// NotifyExistingAlert re-dispatches a notification for the active alert stored
// under alertID; it does nothing when no such alert exists. It is used when
// the activation state changes from pending to active.
func (m *Manager) NotifyExistingAlert(alertID string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	// dispatchAlert expects the caller to hold m.mu (checkFlapping safety),
	// so the dispatch happens while the lock is still held.
	if alert, found := m.getActiveAlertNoLock(alertID); found {
		m.dispatchAlert(alert, true)
	}
}

// GetRecentlyResolved returns the recently resolved alerts converted to their
// models representation. A nil Manager yields nil; iteration order follows
// m.recentlyResolved.
func (m *Manager) GetRecentlyResolved() []models.ResolvedAlert {
	if m == nil {
		return nil
	}

	m.resolvedMutex.RLock()
	defer m.resolvedMutex.RUnlock()

	out := make([]models.ResolvedAlert, 0, len(m.recentlyResolved))
	for _, entry := range m.recentlyResolved {
		src := cloneAlertForOutput(entry.Alert)
		exported := models.Alert{
			ID:           src.ID,
			Type:         src.Type,
			Level:        string(src.Level),
			ResourceID:   src.ResourceID,
			ResourceName: src.ResourceName,
			Node:         src.Node,
			Instance:     src.Instance,
			Message:      src.Message,
			Value:        src.Value,
			Threshold:    src.Threshold,
			StartTime:    src.StartTime,
			Acknowledged: src.Acknowledged,
		}
		out = append(out, models.ResolvedAlert{
			Alert:        exported,
			ResolvedTime: entry.ResolvedTime,
		})
	}
	return out
}

// GetResolvedAlert looks up a recently resolved alert by ID and returns a
// fresh ResolvedAlert whose Alert comes from cloneAlertForOutput. It returns
// nil when the ID is unknown or the stored entry carries no alert.
func (m *Manager) GetResolvedAlert(alertID string) *ResolvedAlert {
	m.resolvedMutex.RLock()
	defer m.resolvedMutex.RUnlock()

	entry, found := m.getResolvedAlertNoLock(alertID)
	if !found || entry == nil {
		return nil
	}
	if entry.Alert == nil {
		return nil
	}

	copied := ResolvedAlert{
		Alert:        cloneAlertForOutput(entry.Alert),
		ResolvedTime: entry.ResolvedTime,
	}
	return &copied
}

// GetAlertHistory fetches history entries from the history manager, passing
// limit through unchanged, and canonicalizes them for output.
func (m *Manager) GetAlertHistory(limit int) []Alert {
	entries := m.historyManager.GetAllHistory(limit)
	return canonicalizeAlertHistoryForOutput(entries)
}

// GetAlertHistorySince returns alert history entries created after the
// provided time, canonicalized for output. A zero since value falls back to
// GetAlertHistory with the same limit.
func (m *Manager) GetAlertHistorySince(since time.Time, limit int) []Alert {
	if !since.IsZero() {
		entries := m.historyManager.GetHistory(since, limit)
		return canonicalizeAlertHistoryForOutput(entries)
	}
	return m.GetAlertHistory(limit)
}

// ClearAlertHistory clears all alert history by delegating to the history
// manager and returns whatever error that call reports.
func (m *Manager) ClearAlertHistory() error {
	err := m.historyManager.ClearAllHistory()
	return err
}

// OnAlertHistory registers a callback to be invoked when alerts are added to
// history, so external systems such as pattern detection can track alerts.
// It is a no-op when the Manager has no history manager.
func (m *Manager) OnAlertHistory(cb AlertCallback) {
	if m.historyManager == nil {
		return
	}
	m.historyManager.OnAlert(cb)
}

// clearResourceOfflineAlert removes an offline alert when a resource comes
// back online.
//
// It resets the resource's offline confirmation counter and, if a
// connectivity alert is active for resourceID, removes it, records it as
// recently resolved and triggers the resolved-alert callback. resourceKind
// (e.g. "PBS") is used for log messages and, lower-cased, as the log field
// name. alertPrefix is accepted but not consulted by this implementation.
func (m *Manager) clearResourceOfflineAlert(alertPrefix, resourceID, resourceName, host, resourceKind string) {
	stateID := canonicalConnectivityStateID(resourceID)
	kindField := strings.ToLower(resourceKind)

	m.mu.Lock()
	defer m.mu.Unlock()

	// The resource is reachable again, so any partial offline streak is void.
	if previous, tracked := m.offlineConfirmations[resourceID]; tracked && previous > 0 {
		log.Debug().
			Str(kindField, resourceName).
			Int("previousCount", previous).
			Msg(resourceKind + " is online, resetting offline confirmation count")
		delete(m.offlineConfirmations, resourceID)
	}

	active, found := m.getActiveAlertNoLock(stateID)
	if !found {
		return
	}

	m.removeActiveAlertNoLock(stateID)
	m.addRecentlyResolvedWithPrimaryLock(&ResolvedAlert{
		Alert:        active,
		ResolvedTime: time.Now(),
	})

	// Recovery notification is sent asynchronously to avoid a deadlock: the
	// callback acquires m.mu.RLock via ShouldSuppressResolvedNotification
	// while this goroutine still holds m.mu.Lock.
	m.safeCallResolvedAlertCallback(active, stateID, true)

	log.Info().
		Str(kindField, resourceName).
		Str("host", host).
		Dur("downtime", time.Since(active.StartTime)).
		Msg(resourceKind + " instance is back online")
}

// checkNodeOffline feeds a "not connected" observation for the node into the
// canonical lifecycle evaluator, which creates the offline alert after
// confirmation. Tracking state lives in m.nodeOfflineCount keyed by node ID.
// Connectivity alerting is disabled when the node's resolved thresholds have
// Disabled or DisableConnectivity set. An invalid spec is logged and skipped.
func (m *Manager) checkNodeOffline(node models.Node) {
	thresholds := m.resolveResourceThresholds("node", node.ID)
	connectivityDisabled := thresholds.Disabled || thresholds.DisableConnectivity

	spec, err := buildCanonicalConnectivitySpec(node.ID, node.Name, unifiedresources.ResourceType("node"), AlertLevelCritical, 3, connectivityDisabled)
	if err != nil {
		log.Warn().
			Err(err).
			Str("node", node.Name).
			Str("nodeID", node.ID).
			Msg("Skipping invalid canonical node connectivity spec")
		return
	}

	evidence := alertspecs.AlertEvidence{
		ObservedAt:   time.Now(),
		Connectivity: &alertspecs.ConnectivityEvidence{Signal: "status", Connected: false},
	}
	metadata := map[string]interface{}{
		"resourceType":     "node",
		"status":           node.Status,
		"connectionHealth": node.ConnectionHealth,
	}
	params := canonicalLifecycleAlertParams{
		Spec:          spec,
		Evidence:      evidence,
		Tracking:      m.nodeOfflineCount,
		TrackingKey:   node.ID,
		AlertID:       fmt.Sprintf("node-offline-%s", node.ID),
		AlertType:     "connectivity",
		ResourceID:    node.ID,
		ResourceName:  node.Name,
		Node:          node.Name,
		Instance:      node.Instance,
		Message:       fmt.Sprintf("Node '%s' is offline", node.Name),
		Metadata:      metadata,
		AddToRecent:   true,
		AddToHistory:  true,
		RateLimit:     true,
		DispatchAsync: false,
	}

	// Both return values are intentionally discarded, as in every other
	// connectivity check in this file.
	_, _ = m.evaluateCanonicalLifecycleAlert(params)
}

// clearNodeOfflineAlert removes the offline alert when a node comes back
// online. It resets the node's offline counter and, if a connectivity alert is
// active for the node, removes it, records it as recently resolved and
// triggers the resolved-alert callback.
func (m *Manager) clearNodeOfflineAlert(node models.Node) {
	stateID := canonicalConnectivityStateID(node.ID)

	m.mu.Lock()
	defer m.mu.Unlock()

	// The node is reachable again, so any partial offline streak is void.
	if misses := m.nodeOfflineCount[node.ID]; misses > 0 {
		log.Debug().
			Str("node", node.Name).
			Int("previousCount", misses).
			Msg("Node back online, resetting offline count")
		delete(m.nodeOfflineCount, node.ID)
	}

	active, found := m.getActiveAlertNoLock(stateID)
	if !found {
		return
	}

	m.removeActiveAlertNoLock(stateID)
	m.addRecentlyResolvedWithPrimaryLock(&ResolvedAlert{
		Alert:        active,
		ResolvedTime: time.Now(),
	})

	// Recovery notification is sent asynchronously to avoid a deadlock: the
	// callback acquires m.mu.RLock via ShouldSuppressResolvedNotification
	// while this goroutine still holds m.mu.Lock.
	m.safeCallResolvedAlertCallback(active, stateID, true)

	log.Info().
		Str("node", node.Name).
		Str("instance", node.Instance).
		Dur("downtime", time.Since(active.StartTime)).
		Msg("Node is back online")
}

// checkPBSOffline feeds a "not connected" observation for the PBS instance
// into the canonical lifecycle evaluator. Tracking state lives in
// m.offlineConfirmations keyed by the PBS ID. Connectivity alerting is
// disabled when the resolved thresholds have Disabled or DisableConnectivity
// set. An invalid spec is logged and skipped.
func (m *Manager) checkPBSOffline(pbs models.PBSInstance) {
	thresholds := m.resolveResourceThresholds("pbs", pbs.ID)
	connectivityDisabled := thresholds.Disabled || thresholds.DisableConnectivity

	spec, err := buildCanonicalConnectivitySpec(pbs.ID, pbs.Name, unifiedresources.ResourceTypePBS, AlertLevelCritical, 3, connectivityDisabled)
	if err != nil {
		log.Warn().
			Err(err).
			Str("pbs", pbs.Name).
			Str("pbsID", pbs.ID).
			Msg("Skipping invalid canonical PBS connectivity spec")
		return
	}

	evidence := alertspecs.AlertEvidence{
		ObservedAt:   time.Now(),
		Connectivity: &alertspecs.ConnectivityEvidence{Signal: "status", Connected: false},
	}
	metadata := map[string]interface{}{
		"resourceType":     "pbs",
		"status":           pbs.Status,
		"connectionHealth": pbs.ConnectionHealth,
	}
	params := canonicalLifecycleAlertParams{
		Spec:          spec,
		Evidence:      evidence,
		Tracking:      m.offlineConfirmations,
		TrackingKey:   pbs.ID,
		AlertID:       fmt.Sprintf("pbs-offline-%s", pbs.ID),
		AlertType:     "offline",
		ResourceID:    pbs.ID,
		ResourceName:  pbs.Name,
		Node:          pbs.Host,
		Instance:      pbs.Name,
		Message:       fmt.Sprintf("PBS instance %s is offline", pbs.Name),
		Metadata:      metadata,
		RateLimit:     true,
		DispatchAsync: true,
	}

	// Both return values are intentionally discarded.
	_, _ = m.evaluateCanonicalLifecycleAlert(params)
}

// clearPBSOfflineAlert removes the offline alert when a PBS instance comes
// back online by delegating to clearResourceOfflineAlert.
func (m *Manager) clearPBSOfflineAlert(pbs models.PBSInstance) {
	const (
		alertPrefix  = "pbs-offline"
		resourceKind = "PBS"
	)
	m.clearResourceOfflineAlert(alertPrefix, pbs.ID, pbs.Name, pbs.Host, resourceKind)
}

// checkPMGOffline feeds a "not connected" observation for the PMG instance
// into the canonical lifecycle evaluator. Tracking state lives in
// m.offlineConfirmations keyed by the PMG ID. Connectivity alerting is
// disabled when a per-resource override exists with Disabled or
// DisableConnectivity set. An invalid spec is logged and skipped.
func (m *Manager) checkPMGOffline(pmg models.PMGInstance) {
	// Only the override lookup needs the read lock.
	m.mu.RLock()
	override, hasOverride := m.config.Overrides[pmg.ID]
	m.mu.RUnlock()

	connectivityDisabled := false
	if hasOverride {
		connectivityDisabled = override.Disabled || override.DisableConnectivity
	}

	spec, err := buildCanonicalConnectivitySpec(pmg.ID, pmg.Name, unifiedresources.ResourceTypePMG, AlertLevelCritical, 3, connectivityDisabled)
	if err != nil {
		log.Warn().
			Err(err).
			Str("pmg", pmg.Name).
			Str("pmgID", pmg.ID).
			Msg("Skipping invalid canonical PMG connectivity spec")
		return
	}

	evidence := alertspecs.AlertEvidence{
		ObservedAt:   time.Now(),
		Connectivity: &alertspecs.ConnectivityEvidence{Signal: "status", Connected: false},
	}
	metadata := map[string]interface{}{
		"resourceType":     "pmg",
		"status":           pmg.Status,
		"connectionHealth": pmg.ConnectionHealth,
	}
	params := canonicalLifecycleAlertParams{
		Spec:          spec,
		Evidence:      evidence,
		Tracking:      m.offlineConfirmations,
		TrackingKey:   pmg.ID,
		AlertID:       fmt.Sprintf("pmg-offline-%s", pmg.ID),
		AlertType:     "offline",
		ResourceID:    pmg.ID,
		ResourceName:  pmg.Name,
		Node:          pmg.Host,
		Instance:      pmg.Name,
		Message:       fmt.Sprintf("PMG instance %s is offline", pmg.Name),
		Metadata:      metadata,
		RateLimit:     true,
		DispatchAsync: true,
	}

	// Both return values are intentionally discarded.
	_, _ = m.evaluateCanonicalLifecycleAlert(params)
}

// clearPMGOfflineAlert removes the offline alert when a PMG instance comes
// back online by delegating to clearResourceOfflineAlert.
func (m *Manager) clearPMGOfflineAlert(pmg models.PMGInstance) {
	const (
		alertPrefix  = "pmg-offline"
		resourceKind = "PMG"
	)
	m.clearResourceOfflineAlert(alertPrefix, pmg.ID, pmg.Name, pmg.Host, resourceKind)
}

// checkPMGQueueDepths checks PMG mail queue depths and creates alerts.
// Queue counts are summed over every node that reports a queue status, and
// the total, deferred and hold queues are then evaluated independently, in
// that order, against their own warning/critical thresholds.
func (m *Manager) checkPMGQueueDepths(pmg models.PMGInstance, defaults PMGThresholdConfig) {
	var sumTotal, sumDeferred, sumHold int
	for _, node := range pmg.Nodes {
		status := node.QueueStatus
		if status == nil {
			continue
		}
		sumTotal += status.Total
		sumDeferred += status.Deferred
		sumHold += status.Hold
	}

	checks := []struct {
		warn, crit, value int
		suffix, alertType string
		format, logField  string
	}{
		{
			warn: defaults.QueueTotalWarning, crit: defaults.QueueTotalCritical, value: sumTotal,
			suffix: "queue-total", alertType: "queue-depth",
			format:   "PMG %s has %d total messages in queue (threshold: %d)",
			logField: "total_queue",
		},
		{
			warn: defaults.DeferredQueueWarn, crit: defaults.DeferredQueueCritical, value: sumDeferred,
			suffix: "queue-deferred", alertType: "queue-deferred",
			format:   "PMG %s has %d deferred messages (threshold: %d)",
			logField: "deferred_queue",
		},
		{
			warn: defaults.HoldQueueWarn, crit: defaults.HoldQueueCritical, value: sumHold,
			suffix: "queue-hold", alertType: "queue-hold",
			format:   "PMG %s has %d held messages (threshold: %d)",
			logField: "hold_queue",
		},
	}
	for _, check := range checks {
		m.checkPMGQueueDepth(pmg, check.warn, check.crit, check.value, check.suffix, check.alertType, check.format, check.logField)
	}
}

// thresholdForCanonicalSeverity picks the threshold to report for a severity.
// Critical prefers criticalThreshold and warning prefers warningThreshold;
// when the preferred value is not positive the other one is returned instead.
// Any other severity yields 0.
func thresholdForCanonicalSeverity(severity alertspecs.AlertSeverity, warningThreshold, criticalThreshold float64) float64 {
	var preferred, fallback float64
	switch severity {
	case alertspecs.AlertSeverityCritical:
		preferred, fallback = criticalThreshold, warningThreshold
	case alertspecs.AlertSeverityWarning:
		preferred, fallback = warningThreshold, criticalThreshold
	default:
		return 0
	}

	if preferred > 0 {
		return preferred
	}
	return fallback
}

// quarantineAlertThreshold returns the threshold value to report for a
// quarantine alert, given the metric ("spam" or "virus") and the evaluation
// reason. "current" reasons map to the metric's absolute warn/critical limit;
// "growth" reasons map to previousCount plus the configured minimum growth.
// Unknown metrics or reasons yield 0.
func quarantineAlertThreshold(metricType, reason string, previousCount int, defaults PMGThresholdConfig) float64 {
	// Only the absolute limits differ between the two metrics; the growth
	// minimums are shared.
	var absoluteWarn, absoluteCrit int
	switch metricType {
	case "spam":
		absoluteWarn, absoluteCrit = defaults.QuarantineSpamWarn, defaults.QuarantineSpamCritical
	case "virus":
		absoluteWarn, absoluteCrit = defaults.QuarantineVirusWarn, defaults.QuarantineVirusCritical
	default:
		return 0
	}

	switch reason {
	case "change-threshold-current-critical":
		return float64(absoluteCrit)
	case "change-threshold-current-warning":
		return float64(absoluteWarn)
	case "change-threshold-growth-critical":
		return float64(previousCount + defaults.QuarantineGrowthCritMin)
	case "change-threshold-growth-warning":
		return float64(previousCount + defaults.QuarantineGrowthWarnMin)
	}
	return 0
}

// quarantineAlertMessage builds the human-readable message for a quarantine
// alert. Growth reasons report the absolute and percentage increase since
// previousCount (percentage is 0 when previousCount is not positive); every
// other reason reports the current count against the threshold resolved by
// quarantineAlertThreshold.
func quarantineAlertMessage(pmg models.PMGInstance, metricType string, current, previousCount int, reason string, defaults PMGThresholdConfig) string {
	rapidGrowth := reason == "change-threshold-growth-critical"
	if !rapidGrowth && reason != "change-threshold-growth-warning" {
		limit := int(quarantineAlertThreshold(metricType, reason, previousCount, defaults))
		return fmt.Sprintf("PMG %s has %d %s messages in quarantine (threshold: %d)", pmg.Name, current, metricType, limit)
	}

	delta := current - previousCount
	var deltaPct float64
	if previousCount > 0 {
		deltaPct = (float64(delta) / float64(previousCount)) * 100
	}

	if rapidGrowth {
		return fmt.Sprintf("PMG %s %s quarantine growing rapidly: +%d messages (+%.1f%%) in 2 hours", pmg.Name, metricType, delta, deltaPct)
	}
	return fmt.Sprintf("PMG %s %s quarantine growing: +%d messages (+%.1f%%) in 2 hours", pmg.Name, metricType, delta, deltaPct)
}

// checkPMGQueueDepth evaluates one instance-level PMG queue metric against
// its warning/critical thresholds through the canonical stateful evaluator.
//
// The alert ID is "<pmg.ID>-<alertIDSuffix>". messageFormat must accept the
// PMG name, the observed value and the threshold (in that order). logField
// names the structured-log field that carries the observed value. Nothing
// happens when both thresholds are non-positive. An invalid spec is logged
// and skipped. A warning is logged when the evaluation activates the alert.
func (m *Manager) checkPMGQueueDepth(pmg models.PMGInstance, warningThreshold, criticalThreshold, value int, alertIDSuffix, alertType, messageFormat, logField string) {
	if warningThreshold <= 0 && criticalThreshold <= 0 {
		return
	}

	warn := float64(warningThreshold)
	crit := float64(criticalThreshold)
	observed := float64(value)
	thresholdFor := func(severity alertspecs.AlertSeverity) float64 {
		return thresholdForCanonicalSeverity(severity, warn, crit)
	}

	alertID := fmt.Sprintf("%s-%s", pmg.ID, alertIDSuffix)
	spec, err := buildCanonicalSeverityThresholdSpec(alertID, pmg.ID, pmg.Name, unifiedresources.ResourceTypePMG, alertType, warn, crit, false)
	if err != nil {
		log.Warn().
			Err(err).
			Str("pmg", pmg.Name).
			Str("alertID", alertID).
			Msg("Skipping invalid canonical PMG queue spec")
		return
	}

	params := canonicalStatefulAlertParams{
		Spec: spec,
		Evidence: alertspecs.AlertEvidence{
			ObservedAt: time.Now(),
			SeverityThreshold: &alertspecs.SeverityThresholdEvidence{
				Metric:    alertType,
				Direction: alertspecs.ThresholdDirectionAbove,
				Observed:  observed,
			},
		},
		AlertID:      alertID,
		AlertType:    alertType,
		ResourceID:   pmg.ID,
		ResourceName: pmg.Name,
		Node:         pmg.Host,
		Instance:     pmg.Name,
		MessageBuilder: func(evaluated alertspecs.EvaluationResult) (string, float64, float64) {
			limit := thresholdFor(evaluated.State.Severity)
			return fmt.Sprintf(messageFormat, pmg.Name, value, int(limit)), observed, limit
		},
		DispatchAsync: true,
	}
	outcome, _ := m.evaluateCanonicalStatefulAlert(params)

	// Only a fresh activation is worth a log line.
	transition := outcome.Transition
	if transition == nil || transition.Kind != alertspecs.EvaluationTransitionActivated {
		return
	}

	level, ok := alertLevelFromCanonicalSeverity(outcome.State.Severity)
	if !ok {
		level = AlertLevelWarning
	}
	log.Warn().
		Str("pmg", pmg.Name).
		Int(logField, value).
		Int("threshold", int(thresholdFor(outcome.State.Severity))).
		Str("level", string(level)).
		Msg("PMG " + alertType + " alert triggered")
}

// checkPMGOldestMessage checks the oldest queued message age and creates
// alerts. The age is the maximum OldestAge (seconds) over all nodes that
// report a queue status, converted to whole minutes before comparison with
// the minute-based thresholds. An age of zero clears any existing alert.
// Nothing happens when both thresholds are non-positive.
func (m *Manager) checkPMGOldestMessage(pmg models.PMGInstance, defaults PMGThresholdConfig) {
	warnMins, critMins := defaults.OldestMessageWarnMins, defaults.OldestMessageCritMins
	if warnMins <= 0 && critMins <= 0 {
		return
	}

	var maxAgeSeconds int64
	for _, node := range pmg.Nodes {
		status := node.QueueStatus
		if status == nil {
			continue
		}
		if status.OldestAge > maxAgeSeconds {
			maxAgeSeconds = status.OldestAge
		}
	}

	alertID := fmt.Sprintf("%s-oldest-message", pmg.ID)
	if maxAgeSeconds == 0 {
		// No messages in queue: make sure no stale alert lingers.
		m.clearAlert(buildCanonicalStateID(pmg.ID, alertID))
		return
	}

	ageMinutes := maxAgeSeconds / 60
	warn, crit := float64(warnMins), float64(critMins)
	thresholdFor := func(severity alertspecs.AlertSeverity) float64 {
		return thresholdForCanonicalSeverity(severity, warn, crit)
	}

	spec, err := buildCanonicalSeverityThresholdSpec(alertID, pmg.ID, pmg.Name, unifiedresources.ResourceTypePMG, "message-age", warn, crit, false)
	if err != nil {
		log.Warn().
			Err(err).
			Str("pmg", pmg.Name).
			Str("alertID", alertID).
			Msg("Skipping invalid canonical PMG oldest-message spec")
		return
	}

	params := canonicalStatefulAlertParams{
		Spec: spec,
		Evidence: alertspecs.AlertEvidence{
			ObservedAt: time.Now(),
			SeverityThreshold: &alertspecs.SeverityThresholdEvidence{
				Metric:    "message-age",
				Direction: alertspecs.ThresholdDirectionAbove,
				Observed:  float64(ageMinutes),
			},
		},
		AlertID:      alertID,
		AlertType:    "message-age",
		ResourceID:   pmg.ID,
		ResourceName: pmg.Name,
		Node:         pmg.Host,
		Instance:     pmg.Name,
		MessageBuilder: func(evaluated alertspecs.EvaluationResult) (string, float64, float64) {
			limit := thresholdFor(evaluated.State.Severity)
			return fmt.Sprintf("PMG %s has messages queued for %d minutes (threshold: %d minutes)", pmg.Name, ageMinutes, int64(limit)), float64(ageMinutes), limit
		},
		DispatchAsync: true,
	}
	outcome, _ := m.evaluateCanonicalStatefulAlert(params)

	// Only a fresh activation is worth a log line.
	transition := outcome.Transition
	if transition == nil || transition.Kind != alertspecs.EvaluationTransitionActivated {
		return
	}

	level, ok := alertLevelFromCanonicalSeverity(outcome.State.Severity)
	if !ok {
		level = AlertLevelWarning
	}
	log.Warn().
		Str("pmg", pmg.Name).
		Int64("oldest_minutes", ageMinutes).
		Int64("threshold", int64(thresholdFor(outcome.State.Severity))).
		Str("level", string(level)).
		Msg("PMG oldest message age alert triggered")
}

// checkPMGNodeQueues checks individual PMG node queue health.
//
// Every node that reports a queue status is evaluated for total, deferred and
// held queue depth and for oldest-message age, against thresholds scaled down
// from the instance-level defaults (60% of the warning value, 80% of the
// critical value). Queue-depth messages are tagged ", outlier" when
// isQueueOutlier flags the node's value against the median of all reporting
// nodes.
//
// Each metric is evaluated on its own: clearing one metric's alert, or
// failing to build one metric's spec, does not skip the remaining metrics for
// that node. A node that reports no queued message age (OldestAge <= 0) has
// its message-age alert cleared, matching the instance-level check in
// checkPMGOldestMessage.
func (m *Manager) checkPMGNodeQueues(pmg models.PMGInstance, defaults PMGThresholdConfig) {
	if len(pmg.Nodes) == 0 {
		return
	}

	// Collect per-metric values from the nodes that report a queue status;
	// their medians feed the outlier annotation below.
	nodeQueueTotals := make([]int, 0, len(pmg.Nodes))
	nodeQueueDeferred := make([]int, 0, len(pmg.Nodes))
	nodeQueueHold := make([]int, 0, len(pmg.Nodes))
	for _, node := range pmg.Nodes {
		if status := node.QueueStatus; status != nil {
			nodeQueueTotals = append(nodeQueueTotals, status.Total)
			nodeQueueDeferred = append(nodeQueueDeferred, status.Deferred)
			nodeQueueHold = append(nodeQueueHold, status.Hold)
		}
	}

	medianTotal := calculateMedianInt(nodeQueueTotals)
	medianDeferred := calculateMedianInt(nodeQueueDeferred)
	medianHold := calculateMedianInt(nodeQueueHold)

	// Scaled thresholds: 60% for warning, 80% for critical (computed once,
	// used for all nodes).
	scaledQueueWarn := scaleThreshold(defaults.QueueTotalWarning, 0.6)
	scaledQueueCrit := scaleThreshold(defaults.QueueTotalCritical, 0.8)
	scaledDeferredWarn := scaleThreshold(defaults.DeferredQueueWarn, 0.6)
	scaledDeferredCrit := scaleThreshold(defaults.DeferredQueueCritical, 0.8)
	scaledHoldWarn := scaleThreshold(defaults.HoldQueueWarn, 0.6)
	scaledHoldCrit := scaleThreshold(defaults.HoldQueueCritical, 0.8)
	scaledAgeWarn := scaleThreshold(defaults.OldestMessageWarnMins, 0.6)
	scaledAgeCrit := scaleThreshold(defaults.OldestMessageCritMins, 0.8)

	outlierNote := func(value, median int) string {
		if isQueueOutlier(value, median) {
			return ", outlier"
		}
		return ""
	}

	for _, node := range pmg.Nodes {
		status := node.QueueStatus
		if status == nil {
			continue
		}

		// Per-iteration copies so the message closures never share state
		// across nodes.
		nodeName := node.Name
		total, deferred, hold := status.Total, status.Deferred, status.Hold

		m.evaluatePMGNodeQueueThreshold(pmg, nodeName, "queue-total", "queue-total",
			scaledQueueWarn, scaledQueueCrit, int64(total),
			"Skipping invalid canonical PMG node queue spec",
			func(threshold int) string {
				return fmt.Sprintf("PMG node %s on %s has %d total messages in queue (threshold: %d%s)", nodeName, pmg.Name, total, threshold, outlierNote(total, medianTotal))
			})

		m.evaluatePMGNodeQueueThreshold(pmg, nodeName, "queue-deferred", "queue-deferred",
			scaledDeferredWarn, scaledDeferredCrit, int64(deferred),
			"Skipping invalid canonical PMG node deferred queue spec",
			func(threshold int) string {
				return fmt.Sprintf("PMG node %s on %s has %d deferred messages (threshold: %d%s)", nodeName, pmg.Name, deferred, threshold, outlierNote(deferred, medianDeferred))
			})

		m.evaluatePMGNodeQueueThreshold(pmg, nodeName, "queue-hold", "queue-hold",
			scaledHoldWarn, scaledHoldCrit, int64(hold),
			"Skipping invalid canonical PMG node hold queue spec",
			func(threshold int) string {
				return fmt.Sprintf("PMG node %s on %s has %d held messages (threshold: %d%s)", nodeName, pmg.Name, hold, threshold, outlierNote(hold, medianHold))
			})

		// OldestAge is in seconds; thresholds are in minutes. A non-positive
		// age is treated as zero minutes, which is below any active
		// threshold and therefore clears the node's message-age alert.
		var oldestMinutes int64
		if status.OldestAge > 0 {
			oldestMinutes = status.OldestAge / 60
		}
		m.evaluatePMGNodeQueueThreshold(pmg, nodeName, "oldest-message", "message-age",
			scaledAgeWarn, scaledAgeCrit, oldestMinutes,
			"Skipping invalid canonical PMG node message-age spec",
			func(threshold int) string {
				return fmt.Sprintf("PMG node %s on %s has messages queued for %d minutes (threshold: %d min, node-specific)", nodeName, pmg.Name, oldestMinutes, threshold)
			})
	}
}

// evaluatePMGNodeQueueThreshold evaluates a single per-node PMG metric.
//
// The alert ID is "<pmg.ID>-<nodeName>-<alertIDSuffix>". When both thresholds
// are non-positive the metric is not monitored and nothing happens. When the
// observed value is below every active threshold the alert is cleared.
// Otherwise the observation is handed to the canonical stateful evaluator,
// with describe supplying the alert message for the threshold that applies.
// invalidSpecMsg is the log message used when the spec cannot be built.
func (m *Manager) evaluatePMGNodeQueueThreshold(pmg models.PMGInstance, nodeName, alertIDSuffix, metric string, warn, crit int, observed int64, invalidSpecMsg string, describe func(threshold int) string) {
	if warn <= 0 && crit <= 0 {
		return
	}

	alertID := fmt.Sprintf("%s-%s-%s", pmg.ID, nodeName, alertIDSuffix)

	belowCrit := crit <= 0 || observed < int64(crit)
	belowWarn := warn <= 0 || observed < int64(warn)
	if belowCrit && belowWarn {
		m.clearAlert(buildCanonicalStateID(pmg.ID, alertID))
		return
	}

	spec, err := buildCanonicalSeverityThresholdSpec(alertID, pmg.ID, pmg.Name, unifiedresources.ResourceTypePMG, metric, float64(warn), float64(crit), false)
	if err != nil {
		log.Warn().Err(err).Str("pmg", pmg.Name).Str("alertID", alertID).Msg(invalidSpecMsg)
		return
	}

	// Both return values are intentionally discarded; per-node checks do not
	// log activations.
	_, _ = m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
		Spec: spec,
		Evidence: alertspecs.AlertEvidence{
			ObservedAt: time.Now(),
			SeverityThreshold: &alertspecs.SeverityThresholdEvidence{
				Metric:    metric,
				Direction: alertspecs.ThresholdDirectionAbove,
				Observed:  float64(observed),
			},
		},
		AlertID:      alertID,
		AlertType:    metric,
		ResourceID:   pmg.ID,
		ResourceName: pmg.Name,
		Node:         nodeName,
		Instance:     pmg.Name,
		MessageBuilder: func(result alertspecs.EvaluationResult) (string, float64, float64) {
			currentThreshold := thresholdForCanonicalSeverity(result.State.Severity, float64(warn), float64(crit))
			return describe(int(currentThreshold)), float64(observed), currentThreshold
		},
		DispatchAsync: true,
	})
}

// isQueueOutlier reports whether a node's queue value is a significant
// outlier: more than 40% above the median across all nodes. With a zero
// median, any positive value counts as an outlier.
func isQueueOutlier(value, median int) bool {
	const outlierPercent = 40

	if median != 0 {
		excess := float64(value - median)
		return excess/float64(median)*100 > outlierPercent
	}
	return value > 0
}

// scaleThreshold multiplies a threshold by scaleFactor and rounds up, so that
// small thresholds are not truncated to zero. A non-positive threshold stays
// 0 (disabled); any positive threshold scales to at least 1.
func scaleThreshold(threshold int, scaleFactor float64) int {
	if threshold <= 0 {
		return 0
	}

	product := scaleFactor * float64(threshold)
	if rounded := int(math.Ceil(product)); rounded >= 1 {
		return rounded
	}
	return 1
}

// calculateMedianInt returns the median of values, or 0 for an empty slice.
// For an even number of elements it returns the integer average of the two
// middle values. The input slice is left unmodified.
func calculateMedianInt(values []int) int {
	if len(values) == 0 {
		return 0
	}

	// Sort a private copy so the caller's ordering is preserved.
	sorted := slices.Clone(values)
	slices.Sort(sorted)

	mid := len(sorted) / 2
	if len(sorted)%2 == 0 {
		return (sorted[mid-1] + sorted[mid]) / 2
	}
	return sorted[mid]
}

// checkPMGQuarantineBacklog checks quarantine backlog and growth rates.
//
// Each call records the current spam/virus counts in the per-instance
// history, which is pruned to the last 3 hours and at most 48 snapshots. The
// snapshot closest to two hours ago, within 15 minutes, is then used as the
// comparison point for the spam and virus checks (nil when none qualifies).
// When the instance reports no quarantine data, both alerts are cleared.
func (m *Manager) checkPMGQuarantineBacklog(pmg models.PMGInstance, defaults PMGThresholdConfig) {
	quarantine := pmg.Quarantine
	if quarantine == nil {
		spamAlertID := fmt.Sprintf("%s-quarantine-spam", pmg.ID)
		virusAlertID := fmt.Sprintf("%s-quarantine-virus", pmg.ID)
		m.clearAlert(buildCanonicalStateID(pmg.ID, spamAlertID))
		m.clearAlert(buildCanonicalStateID(pmg.ID, virusAlertID))
		return
	}

	const (
		retention              = 3 * time.Hour
		maxQuarantineSnapshots = 48 // hard cap to prevent unbounded growth
		lookback               = 2 * time.Hour
		lookbackTolerance      = 15 * time.Minute
	)

	observedAt := time.Now()
	spamCount, virusCount := quarantine.Spam, quarantine.Virus

	// Update the stored history under the primary lock. The lock is released
	// explicitly before the metric checks run.
	m.mu.Lock()
	previous := m.pmgQuarantineHistory[pmg.ID]
	oldestKept := observedAt.Add(-retention)
	retained := make([]pmgQuarantineSnapshot, 0, len(previous)+1)
	for _, snap := range previous {
		if snap.Timestamp.After(oldestKept) {
			retained = append(retained, snap)
		}
	}
	// The new snapshot is stamped "now", so it always survives the cutoff.
	retained = append(retained, pmgQuarantineSnapshot{
		Spam:      spamCount,
		Virus:     virusCount,
		Timestamp: observedAt,
	})
	if overflow := len(retained) - maxQuarantineSnapshots; overflow > 0 {
		retained = retained[overflow:]
	}
	m.pmgQuarantineHistory[pmg.ID] = retained
	m.mu.Unlock()

	// Pick the snapshot nearest to the lookback point, strictly inside the
	// tolerance window.
	var reference *pmgQuarantineSnapshot
	target := observedAt.Add(-lookback)
	closest := lookbackTolerance
	for idx := range retained {
		gap := retained[idx].Timestamp.Sub(target)
		if gap < 0 {
			gap = -gap
		}
		if gap < closest {
			closest = gap
			reference = &retained[idx]
		}
	}

	m.checkQuarantineMetric(pmg, "spam", spamCount, reference, defaults)
	m.checkQuarantineMetric(pmg, "virus", virusCount, reference, defaults)
}

// checkQuarantineMetric checks a single quarantine metric (spam or virus).
//
// metricType "spam" selects the spam thresholds and previous count; any other
// value is treated as virus. twoHoursAgo may be nil, in which case no
// previous observation is supplied. A previous count of zero is likewise not
// supplied as evidence. When no absolute threshold and no minimum-growth
// threshold is configured, the alert is cleared and nothing is evaluated.
// A warning is logged when the evaluation activates the alert.
func (m *Manager) checkQuarantineMetric(pmg models.PMGInstance, metricType string, current int, twoHoursAgo *pmgQuarantineSnapshot, defaults PMGThresholdConfig) {
	alertID := fmt.Sprintf("%s-quarantine-%s", pmg.ID, metricType)

	var (
		absoluteWarn, absoluteCrit int
		previousCount              int
	)
	switch metricType {
	case "spam":
		absoluteWarn, absoluteCrit = defaults.QuarantineSpamWarn, defaults.QuarantineSpamCritical
		if twoHoursAgo != nil {
			previousCount = twoHoursAgo.Spam
		}
	default: // virus
		absoluteWarn, absoluteCrit = defaults.QuarantineVirusWarn, defaults.QuarantineVirusCritical
		if twoHoursAgo != nil {
			previousCount = twoHoursAgo.Virus
		}
	}

	absoluteConfigured := absoluteWarn > 0 || absoluteCrit > 0
	growthConfigured := defaults.QuarantineGrowthWarnMin > 0 || defaults.QuarantineGrowthCritMin > 0
	if !absoluteConfigured && !growthConfigured {
		m.clearAlert(buildCanonicalStateID(pmg.ID, alertID))
		return
	}

	spec, err := buildCanonicalChangeThresholdSpec(
		alertID,
		pmg.ID,
		pmg.Name,
		unifiedresources.ResourceTypePMG,
		"quarantine-"+metricType,
		float64(absoluteWarn),
		float64(absoluteCrit),
		float64(defaults.QuarantineGrowthWarnMin),
		float64(defaults.QuarantineGrowthCritMin),
		float64(defaults.QuarantineGrowthWarnPct),
		float64(defaults.QuarantineGrowthCritPct),
		2*time.Hour,
		false,
	)
	if err != nil {
		log.Warn().
			Err(err).
			Str("pmg", pmg.Name).
			Str("alertID", alertID).
			Str("metricType", metricType).
			Msg("Skipping invalid canonical PMG quarantine spec")
		return
	}

	change := &alertspecs.ChangeThresholdEvidence{
		Metric:   "quarantine-" + metricType,
		Observed: float64(current),
	}
	evidence := alertspecs.AlertEvidence{
		ObservedAt:      time.Now(),
		ChangeThreshold: change,
	}
	if previousCount > 0 {
		previous := float64(previousCount)
		change.PreviousObserved = &previous
	}

	thresholdFor := func(reason string) float64 {
		return quarantineAlertThreshold(metricType, reason, previousCount, defaults)
	}

	outcome, _ := m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
		Spec:         spec,
		Evidence:     evidence,
		AlertID:      alertID,
		AlertType:    fmt.Sprintf("quarantine-%s", metricType),
		ResourceID:   pmg.ID,
		ResourceName: pmg.Name,
		Node:         pmg.Host,
		Instance:     pmg.Name,
		MessageBuilder: func(evaluated alertspecs.EvaluationResult) (string, float64, float64) {
			reason := evaluated.State.Reason
			limit := thresholdFor(reason)
			return quarantineAlertMessage(pmg, metricType, current, previousCount, reason, defaults), float64(current), limit
		},
		DispatchAsync: true,
	})

	// Only a fresh activation is worth a log line.
	transition := outcome.Transition
	if transition == nil || transition.Kind != alertspecs.EvaluationTransitionActivated {
		return
	}

	level, ok := alertLevelFromCanonicalSeverity(outcome.State.Severity)
	if !ok {
		level = AlertLevelWarning
	}
	log.Warn().
		Str("pmg", pmg.Name).
		Str("type", metricType).
		Int("current", current).
		Int("threshold", int(thresholdFor(outcome.State.Reason))).
		Str("level", string(level)).
		Msg("PMG quarantine backlog alert triggered")
}

// calculateTrimmedBaseline computes a robust baseline from historical samples.
//
// The strategy depends on how many samples are available:
//   - fewer than 12: the baseline is not trustworthy; returns (0, false).
//   - 12 to 23: the plain arithmetic mean of all samples.
//   - 24 or more: the mean of the samples left after discarding the two
//     smallest and the two largest values. If that trimmed mean differs from
//     the median by more than 40% of the median, the median is returned
//     instead.
//
// The input slice is never modified; sorting happens on a private copy.
func calculateTrimmedBaseline(samples []float64) (baseline float64, trustworthy bool) {
	const (
		warmupSamples   = 12 // minimum history before any baseline is reported
		trimmingSamples = 24 // minimum history before outliers are trimmed
		trimPerSide     = 2  // values dropped from each end of the sorted data
		maxDeviationPct = 40 // trimmed-mean vs. median disagreement tolerated
	)

	if len(samples) < warmupSamples {
		return 0, false
	}

	// Not enough data to trim outliers: fall back to the simple mean.
	if len(samples) < trimmingSamples {
		sum := 0.0
		for _, val := range samples {
			sum += val
		}
		return sum / float64(len(samples)), true
	}

	// Sort a copy so the caller's slice keeps its chronological order.
	sorted := slices.Clone(samples)
	slices.Sort(sorted)

	mid := len(sorted) / 2
	median := sorted[mid]
	if len(sorted)%2 == 0 {
		median = (sorted[mid-1] + sorted[mid]) / 2
	}

	trimmed := sorted[trimPerSide : len(sorted)-trimPerSide]
	sum := 0.0
	for _, val := range trimmed {
		sum += val
	}
	trimmedMean := sum / float64(len(trimmed))

	// A zero median makes the relative deviation undefined (division by
	// zero). Any non-zero trimmed mean is then infinitely far from the median
	// and a zero trimmed mean equals it, so the result is the median either way.
	if median == 0 {
		return median, true
	}

	deviationPct := math.Abs(trimmedMean-median) / median * 100
	if deviationPct > maxDeviationPct {
		return median, true
	}
	return trimmedMean, true
}

// anomalyAlertMessage formats the human-readable text for a PMG anomaly alert.
// The reported multiple is current/baseline, or 0 when the baseline is not
// positive.
func anomalyAlertMessage(pmg models.PMGInstance, metricName string, current, baseline float64) string {
	var multiple float64
	if baseline > 0 {
		multiple = current / baseline
	}
	const format = "PMG %s anomaly detected: %s is %.1f messages/hour (%.1fx baseline of %.1f)"
	return fmt.Sprintf(format, pmg.Name, metricName, current, multiple, baseline)
}

// checkPMGAnomalies detects spam/virus rate anomalies using trimmed baseline.
//
// It takes the most recent entry of pmg.MailCount, records it in the
// per-instance tracker (keyed by pmg.ID) and, once at least 12 samples have
// accumulated, evaluates the spamIn, spamOut, virusIn and virusOut metrics via
// checkAnomalyMetric. The threshold config parameter is currently unused.
func (m *Manager) checkPMGAnomalies(pmg models.PMGInstance, _ PMGThresholdConfig) {
	// Need mail count data
	if len(pmg.MailCount) == 0 {
		return
	}

	// Get the latest hourly sample (most recent)
	latest := pmg.MailCount[len(pmg.MailCount)-1]
	now := time.Now()

	// Get or create anomaly tracker for this PMG instance.
	// m.mu is held from here until the explicit Unlock calls below; every
	// return path inside the locked region must unlock first.
	m.mu.Lock()
	tracker := m.pmgAnomalyTrackers[pmg.ID]
	if tracker == nil {
		tracker = &pmgAnomalyTracker{
			Samples: make([]pmgMailMetricSample, 0, 48),
		}
		m.pmgAnomalyTrackers[pmg.ID] = tracker
	}

	// Create sample from latest mail count
	sample := pmgMailMetricSample{
		SpamIn:    latest.SpamIn,
		SpamOut:   latest.SpamOut,
		VirusIn:   latest.VirusIn,
		VirusOut:  latest.VirusOut,
		Timestamp: latest.Timestamp,
	}

	// Skip samples that are not strictly newer than the last one recorded
	// (duplicate or out-of-order timestamp: already processed).
	if !tracker.LastSampleTime.IsZero() && !sample.Timestamp.After(tracker.LastSampleTime) {
		m.mu.Unlock()
		return
	}

	// Check for timestamp gaps (>90 min indicates data discontinuity)
	if !tracker.LastSampleTime.IsZero() {
		gap := sample.Timestamp.Sub(tracker.LastSampleTime)
		if gap > 90*time.Minute {
			// Discard old samples - data gap detected. The new sample appended
			// below becomes the first entry of the fresh history.
			log.Debug().
				Str("pmg", pmg.Name).
				Dur("gap", gap).
				Msg("PMG mail count data gap detected, resetting anomaly history")
			tracker.Samples = make([]pmgMailMetricSample, 0, 48)
			tracker.SampleCount = 0
		}
	}

	// Append the sample to the history
	tracker.Samples = append(tracker.Samples, sample)
	tracker.SampleCount++
	tracker.LastSampleTime = sample.Timestamp

	// Bound the history: keep only the most recent 48 samples
	if len(tracker.Samples) > 48 {
		tracker.Samples = tracker.Samples[len(tracker.Samples)-48:]
	}

	// Capture the length while still locked; the lock is released before the
	// per-metric evaluation, which takes its own read lock.
	sampleCount := len(tracker.Samples)
	m.mu.Unlock()

	// Need at least 12 samples for baseline warmup
	if sampleCount < 12 {
		log.Debug().
			Str("pmg", pmg.Name).
			Int("samples", sampleCount).
			Msg("PMG anomaly detection warming up (need 12 samples)")
		return
	}

	// Calculate baselines and check each metric. Each entry pairs the current
	// value with an extractor that pulls the same field out of a stored sample.
	metrics := []struct {
		name      string
		current   float64
		extractor func(pmgMailMetricSample) float64
	}{
		{"spamIn", sample.SpamIn, func(s pmgMailMetricSample) float64 { return s.SpamIn }},
		{"spamOut", sample.SpamOut, func(s pmgMailMetricSample) float64 { return s.SpamOut }},
		{"virusIn", sample.VirusIn, func(s pmgMailMetricSample) float64 { return s.VirusIn }},
		{"virusOut", sample.VirusOut, func(s pmgMailMetricSample) float64 { return s.VirusOut }},
	}

	for _, metric := range metrics {
		m.checkAnomalyMetric(pmg, tracker, metric.name, metric.current, metric.extractor, now)
	}
}

// checkAnomalyMetric evaluates one spam/virus metric of a PMG instance against
// a baseline derived from the tracker's history.
//
// The baseline is computed from up to 24 samples preceding the newest one; the
// newest sample itself is excluded. If the baseline is not trustworthy or the
// canonical spec cannot be built, the metric is skipped. Otherwise the
// observation is handed to evaluateCanonicalStatefulAlert and the outcome is
// logged.
func (m *Manager) checkAnomalyMetric(pmg models.PMGInstance, tracker *pmgAnomalyTracker, metricName string, current float64, extractor func(pmgMailMetricSample) float64, now time.Time) {
	// Snapshot the slice header under the read lock.
	m.mu.RLock()
	history := tracker.Samples
	m.mu.RUnlock()

	total := len(history)
	if total < 2 {
		return
	}

	// Window of at most 24 samples ending just before the newest one.
	first := total - 25
	if first < 0 {
		first = 0
	}
	window := history[first : total-1]

	values := make([]float64, len(window))
	for i := range window {
		values[i] = extractor(window[i])
	}

	baseline, trustworthy := calculateTrimmedBaseline(values)
	if !trustworthy {
		return
	}

	alertID := fmt.Sprintf("%s-anomaly-%s", pmg.ID, metricName)
	pendingKey := fmt.Sprintf("pmg-anomaly-%s-%s", pmg.ID, metricName)

	spec, err := buildCanonicalBaselineAnomalySpec(alertID, pmg.ID, pmg.Name, unifiedresources.ResourceTypePMG, metricName, 2, false)
	if err != nil {
		log.Warn().
			Err(err).
			Str("pmg", pmg.Name).
			Str("metric", metricName).
			Str("alertID", alertID).
			Msg("Skipping invalid canonical PMG anomaly spec")
		return
	}

	evidence := alertspecs.AlertEvidence{
		ObservedAt: now,
		BaselineAnomaly: &alertspecs.BaselineAnomalyEvidence{
			Metric:   metricName,
			Observed: current,
			Baseline: baseline,
		},
	}
	buildMessage := func(_ alertspecs.EvaluationResult) (string, float64, float64) {
		return anomalyAlertMessage(pmg, metricName, current, baseline), current, baseline
	}

	result, _ := m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
		Spec:            spec,
		Evidence:        evidence,
		PendingTracking: m.pendingAlerts,
		PendingKey:      pendingKey,
		AlertID:         alertID,
		AlertType:       "anomaly-" + metricName,
		ResourceID:      pmg.ID,
		ResourceName:    pmg.Name,
		Node:            pmg.Host,
		Instance:        pmg.Name,
		MessageBuilder:  buildMessage,
		DispatchAsync:   true,
	})

	// Still waiting for a confirming sample: log and stop here.
	if result.State.State == alertspecs.AlertStatePending {
		log.Debug().
			Str("pmg", pmg.Name).
			Str("metric", metricName).
			Float64("current", current).
			Float64("baseline", baseline).
			Msg("PMG anomaly pending confirmation (first sample)")
		return
	}

	// Only a fresh activation is worth logging below.
	if result.Transition == nil || result.Transition.Kind != alertspecs.EvaluationTransitionActivated {
		return
	}

	pendingSince := result.Previous.FirstMatchedAt
	if pendingSince.IsZero() {
		pendingSince = now
	}

	level, ok := alertLevelFromCanonicalSeverity(result.State.Severity)
	if !ok {
		level = AlertLevelWarning
	}

	// For the logged ratio, treat a zero baseline as 1 when there is activity
	// so the ratio stays finite.
	divisor := baseline
	if baseline == 0 && current > 0 {
		divisor = 1
	}
	var ratio float64
	if divisor > 0 {
		ratio = current / divisor
	}

	log.Debug().
		Str("pmg", pmg.Name).
		Str("metric", metricName).
		Float64("current", current).
		Float64("baseline", baseline).
		Dur("pending", now.Sub(pendingSince)).
		Msg("PMG anomaly confirmed (second sample)")

	log.Warn().
		Str("pmg", pmg.Name).
		Str("metric", metricName).
		Float64("current", current).
		Float64("baseline", baseline).
		Float64("ratio", ratio).
		Str("level", string(level)).
		Msg("PMG anomaly alert triggered")
}

// checkStorageOffline reports a disconnected-storage observation to the
// canonical lifecycle evaluator, which owns confirmation counting, rate
// limiting and dispatch. The spec is built as disabled when the storage's
// resolved thresholds disable alerts or connectivity checks.
func (m *Manager) checkStorageOffline(storage models.Storage) {
	alertID := fmt.Sprintf("storage-offline-%s", storage.ID)

	thresholds := m.resolveResourceThresholds("storage", storage.ID)
	disabled := thresholds.Disabled || thresholds.DisableConnectivity

	spec, err := buildCanonicalConnectivitySpec(storage.ID, storage.Name, unifiedresources.ResourceTypeStorage, AlertLevelWarning, 2, disabled)
	if err != nil {
		log.Warn().
			Err(err).
			Str("storage", storage.Name).
			Str("storageID", storage.ID).
			Msg("Skipping invalid canonical storage connectivity spec")
		return
	}

	evidence := alertspecs.AlertEvidence{
		ObservedAt:   time.Now(),
		Connectivity: &alertspecs.ConnectivityEvidence{Signal: "status", Connected: false},
	}
	metadata := map[string]interface{}{
		"resourceType": "storage",
		"status":       storage.Status,
	}

	params := canonicalLifecycleAlertParams{
		Spec:          spec,
		Evidence:      evidence,
		Tracking:      m.offlineConfirmations,
		TrackingKey:   storage.ID,
		AlertID:       alertID,
		AlertType:     "offline",
		ResourceID:    storage.ID,
		ResourceName:  storage.Name,
		Node:          storage.Node,
		Instance:      storage.Instance,
		Message:       fmt.Sprintf("Storage %s on node %s is unavailable", storage.Name, storage.Node),
		Metadata:      metadata,
		RateLimit:     true,
		DispatchAsync: true,
	}

	// The evaluation result is intentionally unused here.
	_, _ = m.evaluateCanonicalLifecycleAlert(params)
}

// clearStorageOfflineAlert removes offline alert when storage comes back online.
// It delegates to clearResourceOfflineAlert, passing "storage-offline", the
// storage's ID, name and node, and the label "Storage".
func (m *Manager) clearStorageOfflineAlert(storage models.Storage) {
	m.clearResourceOfflineAlert("storage-offline", storage.ID, storage.Name, storage.Node, "Storage")
}

// checkGuestPoweredOff creates an alert for powered-off guests.
//
// It resolves the guest's effective thresholds from a clone of the configured
// guest defaults while holding the read lock, releases the lock, and then
// delegates to checkGuestPoweredOffWithThresholds.
func (m *Manager) checkGuestPoweredOff(guestID, name, node, instanceName, guestType string, monitorOnly bool) {
	// Only the threshold resolution reads m.config, so the lock is released
	// before the evaluation call.
	m.mu.RLock()
	thresholds := m.resolveGuestThresholdOverride(cloneThresholdConfig(m.config.GuestDefaults), nil, guestID)
	m.mu.RUnlock()
	m.checkGuestPoweredOffWithThresholds(guestID, name, node, instanceName, guestType, thresholds, monitorOnly)
}

// checkGuestPoweredOffWithThresholds reports a powered-off observation for a
// guest to the canonical lifecycle evaluator using already-resolved thresholds.
//
// A guestType of "container" (case-insensitive) maps to the system-container
// resource type; anything else is treated as a VM. The alert severity comes
// from thresholds.PoweredOffSeverity after normalization, and the spec is
// built as disabled when the thresholds disable alerts or connectivity checks.
func (m *Manager) checkGuestPoweredOffWithThresholds(guestID, name, node, instanceName, guestType string, thresholds ThresholdConfig, monitorOnly bool) {
	alertID := fmt.Sprintf("guest-powered-off-%s", guestID)
	severity := normalizePoweredOffSeverity(thresholds.PoweredOffSeverity)

	var resourceType = unifiedresources.ResourceTypeVM
	switch {
	case strings.EqualFold(guestType, "container"):
		resourceType = unifiedresources.ResourceTypeSystemContainer
	}

	disabled := thresholds.Disabled || thresholds.DisableConnectivity
	spec, err := buildCanonicalPoweredStateSpec(guestID, name, resourceType, severity, 2, disabled)
	if err != nil {
		log.Warn().
			Err(err).
			Str("guest", name).
			Str("guestID", guestID).
			Msg("Skipping invalid canonical guest powered-state spec")
		return
	}

	evidence := alertspecs.AlertEvidence{
		ObservedAt: time.Now(),
		PoweredState: &alertspecs.PoweredStateEvidence{
			Expected: alertspecs.PowerStateOn,
			Observed: alertspecs.PowerStateOff,
		},
	}
	metadata := map[string]interface{}{
		"monitorOnly":  monitorOnly,
		"resourceType": strings.ToLower(guestType),
	}

	// The evaluation result is intentionally unused here.
	_, _ = m.evaluateCanonicalLifecycleAlert(canonicalLifecycleAlertParams{
		Spec:          spec,
		Evidence:      evidence,
		Tracking:      m.offlineConfirmations,
		TrackingKey:   guestID,
		AlertID:       alertID,
		AlertType:     "powered-off",
		ResourceID:    guestID,
		ResourceName:  name,
		Node:          node,
		Instance:      instanceName,
		Message:       fmt.Sprintf("%s '%s' is powered off", guestType, name),
		Metadata:      metadata,
		AddToRecent:   true,
		AddToHistory:  true,
		DispatchAsync: false,
	})
}

// clearGuestPoweredOffAlert removes powered-off alert when guest starts running.
//
// Under the write lock it drops any pending powered-off confirmation count for
// the guest, and, if a powered-off alert is active, removes it, records it as
// recently resolved, triggers the resolved-alert callback and logs the downtime
// (time since the alert's StartTime).
func (m *Manager) clearGuestPoweredOffAlert(guestID, name string) {
	alertID := canonicalPoweredStateStateID(guestID)

	m.mu.Lock()
	defer m.mu.Unlock()

	// Reset confirmation count when guest comes back online
	if count, exists := m.offlineConfirmations[guestID]; exists && count > 0 {
		log.Debug().
			Str("guest", name).
			Int("previousCount", count).
			Msg("Guest is running, resetting powered-off confirmation count")
		delete(m.offlineConfirmations, guestID)
	}

	// Check if powered-off alert exists; nothing more to do if it does not
	alert, exists := m.getActiveAlertNoLock(alertID)
	if !exists {
		return
	}

	// Remove from active alerts
	m.removeActiveAlertNoLock(alertID)

	// Capture downtime before building the resolved record
	downtime := time.Since(alert.StartTime)
	resolvedAlert := &ResolvedAlert{
		Alert:        alert,
		ResolvedTime: time.Now(),
	}
	m.addRecentlyResolvedWithPrimaryLock(resolvedAlert)

	// Send recovery notification (async to avoid deadlock — callback acquires m.mu.RLock
	// via ShouldSuppressResolvedNotification, and we currently hold m.mu.Lock)
	m.safeCallResolvedAlertCallback(alert, effectiveAlertID(alert, alertID), true)

	// Log recovery
	log.Info().
		Str("guest", name).
		Dur("downtime", downtime).
		Msg("Guest is now running")
}

// ClearAlert removes an alert from active alerts (but keeps in history).
//
// It returns false when no active alert with the given ID exists. Otherwise
// the alert is cleared, its suppression/pending/rate-limit bookkeeping is
// dropped for both the alert ID and (when different) its canonical tracking
// key, and an asynchronous save of the active alerts is requested.
func (m *Manager) ClearAlert(alertID string) bool {
	m.mu.Lock()

	alert, found := m.getActiveAlertNoLock(alertID)
	if !found || alert == nil {
		m.mu.Unlock()
		return false
	}

	// Bookkeeping may be keyed by the alert ID, the tracking key, or both.
	keys := []string{alertID}
	if trackingKey := canonicalTrackingKeyForAlert(alert); trackingKey != "" && trackingKey != alertID {
		keys = append(keys, trackingKey)
	}

	m.clearAlertNoLock(alertID)
	for _, key := range keys {
		delete(m.recentAlerts, key)
		delete(m.pendingAlerts, key)
		delete(m.suppressedUntil, key)
		delete(m.alertRateLimit, key)
	}

	m.mu.Unlock()

	// Persist outside the lock.
	m.saveActiveAlertsAsync("manual-clear")
	return true
}

// Cleanup removes old acknowledged alerts and cleans up tracking maps.
//
// maxAge is the fallback retention for acknowledged alerts: an acknowledged
// alert is removed once both its acknowledgement time and its last-seen time
// are older than maxAge. Configured TTLs (MaxAcknowledgedAgeDays,
// MaxAlertAgeDays) and auto-acknowledgement (AutoAcknowledgeAfterHours) are
// applied first when set.
//
// All map maintenance runs under m.mu (with m.resolvedMutex nested for the
// recently-resolved map). Acknowledged callbacks for auto-acknowledged alerts
// are invoked only after m.mu has been released.
func (m *Manager) Cleanup(maxAge time.Duration) {
	m.mu.Lock()
	now := time.Now()
	// Clones of alerts auto-acknowledged in this pass; notified after unlock.
	var autoAcked []*Alert

	// lastSeenTooOld reports whether the alert has not been seen within
	// cutoff. LastSeen falls back to StartTime when unset; a nil alert counts
	// as too old.
	lastSeenTooOld := func(alert *Alert, cutoff time.Duration) bool {
		if alert == nil {
			return true
		}
		lastSeen := alert.LastSeen
		if lastSeen.IsZero() {
			lastSeen = alert.StartTime
		}
		return now.Sub(lastSeen) > cutoff
	}

	// Auto-acknowledge old alerts if configured
	if m.config.AutoAcknowledgeAfterHours > 0 {
		autoAckThreshold := time.Duration(m.config.AutoAcknowledgeAfterHours) * time.Hour
		for id, alert := range m.activeAlerts {
			if !alert.Acknowledged && now.Sub(alert.StartTime) > autoAckThreshold {
				log.Info().
					Str("alertID", id).
					Dur("age", now.Sub(alert.StartTime)).
					Msg("Auto-acknowledging old alert")
				alert.Acknowledged = true
				ackTime := now
				alert.AckTime = &ackTime
				alert.AckUser = "system-auto"
				autoAcked = append(autoAcked, alert.Clone())

				if recordAlertAcknowledged != nil {
					recordAlertAcknowledged()
				}
			}
		}
	}

	// Clean up acknowledged alerts based on TTL. Both the acknowledgement and
	// the last sighting must be older than the TTL.
	if m.config.MaxAcknowledgedAgeDays > 0 {
		acknowledgedTTL := time.Duration(m.config.MaxAcknowledgedAgeDays) * 24 * time.Hour
		for id, alert := range m.activeAlerts {
			if alert.Acknowledged && alert.AckTime != nil &&
				now.Sub(*alert.AckTime) > acknowledgedTTL &&
				lastSeenTooOld(alert, acknowledgedTTL) {
				log.Info().
					Str("alertID", id).
					Dur("age", now.Sub(*alert.AckTime)).
					Msg("Cleaning up old acknowledged alert (TTL)")
				m.removeActiveAlertNoLock(id)
			}
		}
	}

	// Clean up old unacknowledged alerts based on TTL (measured from StartTime)
	if m.config.MaxAlertAgeDays > 0 {
		alertTTL := time.Duration(m.config.MaxAlertAgeDays) * 24 * time.Hour
		for id, alert := range m.activeAlerts {
			if !alert.Acknowledged && now.Sub(alert.StartTime) > alertTTL {
				log.Info().
					Str("alertID", id).
					Dur("age", now.Sub(alert.StartTime)).
					Msg("Cleaning up old unacknowledged alert (TTL)")
				m.removeActiveAlertNoLock(id)
			}
		}
	}

	// Original cleanup for acknowledged alerts (fallback if TTL not configured).
	// This pass always runs, using the caller-supplied maxAge.
	for id, alert := range m.activeAlerts {
		if alert.Acknowledged && alert.AckTime != nil &&
			now.Sub(*alert.AckTime) > maxAge &&
			lastSeenTooOld(alert, maxAge) {
			m.removeActiveAlertNoLock(id)
		}
	}

	// Clean up stale ackState entries for alerts that no longer exist
	// Keep ackState for 1 hour after the alert was removed (not from ack time)
	// to handle transient alert clears (e.g., backups of powered-off VMs)
	ackStateTTL := 1 * time.Hour
	for id, record := range m.ackState {
		if !m.hasActiveAlertNoLock(id) {
			// Use inactiveAt (when alert was removed) for TTL, not ack time
			checkTime := record.inactiveAt
			if checkTime.IsZero() {
				// Fallback for legacy entries without inactiveAt
				checkTime = record.time
			}
			if now.Sub(checkTime) > ackStateTTL {
				delete(m.ackState, id)
			}
		}
	}
	// Same TTL rule for ack state keyed by canonical tracking key
	for canonicalID, record := range m.ackStateByCanonical {
		if m.hasActiveAlertTrackingKeyNoLock(canonicalID) {
			continue
		}
		checkTime := record.inactiveAt
		if checkTime.IsZero() {
			checkTime = record.time
		}
		if now.Sub(checkTime) > ackStateTTL {
			delete(m.ackStateByCanonical, canonicalID)
		}
	}

	// Clean up recent alerts older than suppression window
	// (SuppressionWindow is configured in minutes)
	suppressionWindow := time.Duration(m.config.SuppressionWindow) * time.Minute
	if suppressionWindow == 0 {
		suppressionWindow = 5 * time.Minute // Default
	}

	for id, alert := range m.recentAlerts {
		if now.Sub(alert.StartTime) > suppressionWindow {
			delete(m.recentAlerts, id)
		}
	}

	// Clean up expired suppressions
	for id, suppressUntil := range m.suppressedUntil {
		if now.After(suppressUntil) {
			delete(m.suppressedUntil, id)
		}
	}

	// Clean up old rate limit entries (older than 1 hour)
	cutoff := now.Add(-1 * time.Hour)
	for alertID, times := range m.alertRateLimit {
		var recentTimes []time.Time
		for _, t := range times {
			if t.After(cutoff) {
				recentTimes = append(recentTimes, t)
			}
		}
		if len(recentTimes) == 0 {
			// No recent alerts, remove the entry entirely
			delete(m.alertRateLimit, alertID)
		} else {
			// Update with only recent times
			m.alertRateLimit[alertID] = recentTimes
		}
	}

	// Clean up old recently resolved alerts (older than 5 minutes).
	// resolvedMutex is taken while m.mu is already held.
	fiveMinutesAgo := now.Add(-5 * time.Minute)
	m.resolvedMutex.Lock()
	for alertID, resolved := range m.recentlyResolved {
		if resolved.ResolvedTime.Before(fiveMinutesAgo) {
			m.removeResolvedAlertUnlocked(alertID)
		}
	}
	m.resolvedMutex.Unlock()

	// Clean up stale pending alerts (older than max time threshold window)
	// This prevents memory leak from deleted resources that never triggered alerts
	maxPendingAge := 10 * time.Minute // Longest time threshold + safety buffer
	for id, pendingTime := range m.pendingAlerts {
		if now.Sub(pendingTime) > maxPendingAge {
			delete(m.pendingAlerts, id)
			log.Debug().
				Str("resourceID", id).
				Dur("age", now.Sub(pendingTime)).
				Msg("Cleaned up stale pending alert entry")
		}
	}

	// Clean up flapping history for resolved/inactive alerts.
	// NOTE(review): expired suppressions were already deleted above, so any
	// entry still present in suppressedUntil lies in the future and the
	// "expired more than an hour ago" branch cannot be true in this pass;
	// effectively history is dropped only when no suppression entry remains.
	flappingCleanupAge := 1 * time.Hour
	for alertID := range m.flappingHistory {
		// If alert is no longer active and flapping cooldown has expired
		if !m.hasActiveAlertTrackingKeyNoLock(alertID) {
			if suppressUntil, suppressed := m.suppressedUntil[alertID]; !suppressed || now.After(suppressUntil.Add(flappingCleanupAge)) {
				delete(m.flappingHistory, alertID)
				delete(m.flappingActive, alertID)
				log.Debug().
					Str("alertID", alertID).
					Msg("Cleaned up flapping history for inactive alert")
			}
		}
	}

	// Clean up old Docker restart tracking (containers not seen in 24h)
	// Prevents memory leak from ephemeral containers in CI/CD environments
	for resourceID, record := range m.dockerRestartTracking {
		if now.Sub(record.lastChecked) > 24*time.Hour {
			delete(m.dockerRestartTracking, resourceID)
			log.Debug().
				Str("resourceID", resourceID).
				Msg("Cleaned up stale Docker restart tracking entry")
		}
	}

	// Clean up stale PMG anomaly trackers (no samples in 24h)
	// Prevents memory leak from decommissioned or transient PMG instances.
	// Trackers that are nil or have never recorded a sample are left alone.
	staleTrackerAge := 24 * time.Hour
	for pmgID, tracker := range m.pmgAnomalyTrackers {
		if tracker != nil && !tracker.LastSampleTime.IsZero() {
			if now.Sub(tracker.LastSampleTime) > staleTrackerAge {
				delete(m.pmgAnomalyTrackers, pmgID)
				log.Debug().
					Str("pmgID", pmgID).
					Time("lastSampleTime", tracker.LastSampleTime).
					Msg("Cleaned up stale PMG anomaly tracker")
			}
		}
	}

	// Clean up stale PMG quarantine history (no recent snapshots in 7 days)
	// Prevents memory leak from deleted PMG instances
	staleHistoryAge := 7 * 24 * time.Hour
	for pmgID, snapshots := range m.pmgQuarantineHistory {
		// If no snapshots remain or last snapshot is very old
		if len(snapshots) == 0 {
			delete(m.pmgQuarantineHistory, pmgID)
			log.Debug().
				Str("pmgID", pmgID).
				Msg("Cleaned up empty PMG quarantine history")
			continue
		}

		// The last element is treated as the most recent snapshot
		lastSnapshot := snapshots[len(snapshots)-1]
		if now.Sub(lastSnapshot.Timestamp) > staleHistoryAge {
			delete(m.pmgQuarantineHistory, pmgID)
			log.Debug().
				Str("pmgID", pmgID).
				Time("lastSnapshot", lastSnapshot.Timestamp).
				Msg("Cleaned up stale PMG quarantine history")
		}
	}

	m.mu.Unlock()

	// Notify about auto-acknowledged alerts only after releasing m.mu, using
	// the clones captured above.
	for _, alert := range autoAcked {
		m.safeCallAcknowledgedCallback(alert, "system-auto")
	}
}

// cloneThreshold returns a pointer to a copy of *threshold, or nil when
// threshold is nil.
func cloneThreshold(threshold *HysteresisThreshold) *HysteresisThreshold {
	if threshold != nil {
		dup := *threshold
		return &dup
	}
	return nil
}

// cloneStringPtr returns a pointer to a fresh copy of *value, or nil when
// value is nil.
func cloneStringPtr(value *string) *string {
	if value != nil {
		dup := *value
		return &dup
	}
	return nil
}

// cloneSnapshotConfig returns a pointer to a shallow copy of *cfg, or nil when
// cfg is nil.
func cloneSnapshotConfig(cfg *SnapshotAlertConfig) *SnapshotAlertConfig {
	if cfg != nil {
		dup := *cfg
		return &dup
	}
	return nil
}

// cloneBackupConfig returns a copy of *cfg, or nil when cfg is nil. The
// AlertOrphaned pointer and a non-empty IgnoreVMIDs slice are duplicated so
// the copy does not share them with the source; all other fields are copied
// by value.
func cloneBackupConfig(cfg *BackupAlertConfig) *BackupAlertConfig {
	if cfg == nil {
		return nil
	}

	dup := new(BackupAlertConfig)
	*dup = *cfg

	if orphaned := cfg.AlertOrphaned; orphaned != nil {
		flag := *orphaned
		dup.AlertOrphaned = &flag
	}

	if n := len(cfg.IgnoreVMIDs); n > 0 {
		ids := make([]string, n)
		copy(ids, cfg.IgnoreVMIDs)
		dup.IgnoreVMIDs = ids
	}

	return dup
}

// cloneThresholdConfig returns a copy of cfg in which every threshold pointer,
// the Backup and Snapshot configs and the Note are re-allocated, so mutating
// the copy through those pointers does not affect cfg.
func cloneThresholdConfig(cfg ThresholdConfig) ThresholdConfig {
	dup := cfg

	// Each slot initially aliases the source threshold; replace it with a copy.
	slots := []**HysteresisThreshold{
		&dup.CPU,
		&dup.Memory,
		&dup.Disk,
		&dup.DiskRead,
		&dup.DiskWrite,
		&dup.NetworkIn,
		&dup.NetworkOut,
		&dup.Temperature,
		&dup.DiskTemperature,
		&dup.Usage,
	}
	for _, slot := range slots {
		*slot = cloneThreshold(*slot)
	}

	dup.Backup = cloneBackupConfig(cfg.Backup)
	dup.Snapshot = cloneSnapshotConfig(cfg.Snapshot)
	dup.Note = cloneStringPtr(cfg.Note)
	return dup
}

// applyThresholdOverride layers override on top of base and returns the result.
//
// Boolean flags are only ever switched on by the override. A non-empty
// PoweredOffSeverity replaces the base value after normalization. Every
// non-nil threshold in the override replaces the corresponding base threshold
// with a cloned copy that has hysteresis ensured. Backup and Snapshot configs
// are replaced by clones when set. A non-nil Note replaces the base note with
// its trimmed text, or clears it when the trimmed text is empty.
func (m *Manager) applyThresholdOverride(base ThresholdConfig, override ThresholdConfig) ThresholdConfig {
	result := base

	result.Disabled = result.Disabled || override.Disabled
	result.DisableConnectivity = result.DisableConnectivity || override.DisableConnectivity

	if override.PoweredOffSeverity != "" {
		result.PoweredOffSeverity = normalizePoweredOffSeverity(override.PoweredOffSeverity)
	}

	// Pair each destination slot with the override's value for that metric.
	hysteresis := []struct {
		dst **HysteresisThreshold
		src *HysteresisThreshold
	}{
		{&result.CPU, override.CPU},
		{&result.Memory, override.Memory},
		{&result.Disk, override.Disk},
		{&result.DiskRead, override.DiskRead},
		{&result.DiskWrite, override.DiskWrite},
		{&result.NetworkIn, override.NetworkIn},
		{&result.NetworkOut, override.NetworkOut},
		{&result.Temperature, override.Temperature},
		{&result.DiskTemperature, override.DiskTemperature},
		{&result.Usage, override.Usage},
	}
	for _, entry := range hysteresis {
		if entry.src == nil {
			continue
		}
		*entry.dst = ensureHysteresisThreshold(cloneThreshold(entry.src))
	}

	if override.Backup != nil {
		result.Backup = cloneBackupConfig(override.Backup)
	}
	if override.Snapshot != nil {
		result.Snapshot = cloneSnapshotConfig(override.Snapshot)
	}

	if override.Note != nil {
		// Whitespace-only notes clear the note instead of storing blanks.
		if trimmed := strings.TrimSpace(*override.Note); trimmed != "" {
			result.Note = &trimmed
		} else {
			result.Note = nil
		}
	}

	return result
}

// proxmoxDiskCanonicalResourceID builds the resource ID for a Proxmox disk in
// the form "<instance>:<node>:disk:<sanitized devPath>". Instance and node are
// trimmed of surrounding whitespace; devPath is passed through
// sanitizeAlertKey.
func proxmoxDiskCanonicalResourceID(instance, node, devPath string) string {
	trimmedInstance := strings.TrimSpace(instance)
	trimmedNode := strings.TrimSpace(node)
	return fmt.Sprintf("%s:%s:disk:%s", trimmedInstance, trimmedNode, sanitizeAlertKey(devPath))
}

// proxmoxDiskAlertMetadata returns the alert metadata map for a Proxmox disk:
// its device path, model, serial, type and size under the disk_* keys.
func proxmoxDiskAlertMetadata(disk proxmox.Disk) map[string]interface{} {
	metadata := make(map[string]interface{}, 5)
	metadata["disk_path"] = disk.DevPath
	metadata["disk_model"] = disk.Model
	metadata["disk_serial"] = disk.Serial
	metadata["disk_type"] = disk.Type
	metadata["disk_size"] = disk.Size
	return metadata
}

// ensureHysteresisThreshold ensures a threshold has hysteresis configured.
// When Clear is unset (<= 0) it is set to 5 units below Trigger. The threshold
// is modified in place and returned; a nil threshold is returned unchanged.
func ensureHysteresisThreshold(threshold *HysteresisThreshold) *HysteresisThreshold {
	if threshold != nil && threshold.Clear <= 0 {
		// Default margin: clear 5 below the trigger value.
		threshold.Clear = threshold.Trigger - 5.0
	}
	return threshold
}

// pulseTagSettings holds the flags recognised by parsePulseTags.
type pulseTagSettings struct {
	Suppress    bool // set by the "pulse-no-alerts" tag
	MonitorOnly bool // set by the "pulse-monitor-only" tag
	Relaxed     bool // set by the "pulse-relaxed" tag
}

// parsePulseTags scans tags for the Pulse control tags and reports which ones
// are present. Matching is case-insensitive and ignores surrounding
// whitespace; unrecognised tags are skipped.
func parsePulseTags(tags []string) pulseTagSettings {
	var parsed pulseTagSettings
	for i := range tags {
		normalized := strings.TrimSpace(strings.ToLower(tags[i]))
		parsed.Suppress = parsed.Suppress || normalized == "pulse-no-alerts"
		parsed.MonitorOnly = parsed.MonitorOnly || normalized == "pulse-monitor-only"
		parsed.Relaxed = parsed.Relaxed || normalized == "pulse-relaxed"
	}
	return parsed
}

// applyRelaxedGuestThresholds returns a copy of cfg whose CPU, Memory and Disk
// thresholds trigger no lower than 95, 92 and 95 respectively.
//
// A missing threshold is created at the floor with Clear 5 below it. An
// existing threshold has hysteresis ensured, its Trigger raised to the floor
// if lower, and its Clear forced below Trigger (Trigger-5) and clamped at 0.
// cfg itself is not modified.
func applyRelaxedGuestThresholds(cfg ThresholdConfig) ThresholdConfig {
	relaxed := cloneThresholdConfig(cfg)

	floors := []struct {
		slot       **HysteresisThreshold
		minTrigger float64
	}{
		{&relaxed.CPU, 95},
		{&relaxed.Memory, 92},
		{&relaxed.Disk, 95},
	}

	for _, floor := range floors {
		current := *floor.slot
		if current == nil {
			*floor.slot = &HysteresisThreshold{Trigger: floor.minTrigger, Clear: floor.minTrigger - 5}
			continue
		}

		ensureHysteresisThreshold(current)
		if current.Trigger < floor.minTrigger {
			current.Trigger = floor.minTrigger
		}
		if current.Clear >= current.Trigger {
			current.Clear = current.Trigger - 5
		}
		if current.Clear < 0 {
			current.Clear = 0
		}
	}

	return relaxed
}

// suppressGuestAlerts clears every active alert belonging to the guest and
// drops the guest's alert bookkeeping. It reports whether at least one active
// alert was cleared.
//
// An active alert belongs to the guest when its ResourceID equals guestID,
// its ResourceID starts with guestID+"/", or its effective alert ID starts
// with guestID. Entries in the pending, recent, suppressed and rate-limit
// maps whose key starts with guestID are removed as well, together with the
// guest's offline confirmation count. The write lock is held throughout.
func (m *Manager) suppressGuestAlerts(guestID string) bool {
	m.mu.Lock()
	defer m.mu.Unlock()

	cleared := false

	for storageKey, alert := range m.activeAlerts {
		// Skip nil entries before handing the alert to any helper, so the
		// helpers are never called with a nil alert.
		if alert == nil {
			continue
		}
		alertID := effectiveAlertID(alert, storageKey)
		trackingKey := canonicalTrackingKeyForAlert(alert)
		if alert.ResourceID == guestID || strings.HasPrefix(alert.ResourceID, guestID+"/") || strings.HasPrefix(alertID, guestID) {
			m.clearAlertNoLock(alertID)
			delete(m.recentAlerts, trackingKey)
			delete(m.pendingAlerts, trackingKey)
			delete(m.suppressedUntil, trackingKey)
			delete(m.alertRateLimit, trackingKey)
			cleared = true
		}
	}

	// Sweep bookkeeping that is keyed by a guest-prefixed ID, including
	// entries that never had an active alert.
	for key := range m.pendingAlerts {
		if strings.HasPrefix(key, guestID) {
			delete(m.pendingAlerts, key)
		}
	}
	for key := range m.recentAlerts {
		if strings.HasPrefix(key, guestID) {
			delete(m.recentAlerts, key)
		}
	}
	for key := range m.suppressedUntil {
		if strings.HasPrefix(key, guestID) {
			delete(m.suppressedUntil, key)
		}
	}
	for key := range m.alertRateLimit {
		if strings.HasPrefix(key, guestID) {
			delete(m.alertRateLimit, key)
		}
	}

	delete(m.offlineConfirmations, guestID)

	return cleared
}

// guestHasMonitorOnlyAlerts reports whether any active alert whose ResourceID
// equals guestID is a monitor-only alert. Nil entries are ignored. The read
// lock is held for the duration of the scan.
func (m *Manager) guestHasMonitorOnlyAlerts(guestID string) bool {
	m.mu.RLock()
	defer m.mu.RUnlock()

	for _, candidate := range m.activeAlerts {
		if candidate != nil && candidate.ResourceID == guestID && isMonitorOnlyAlert(candidate) {
			return true
		}
	}
	return false
}

// checkRateLimit reports whether another notification for alertID is allowed
// within the configured hourly limit (Schedule.MaxAlertsHour).
//
// A limit of zero or less disables rate limiting. When the call is allowed,
// the current time is recorded and timestamps older than one hour are dropped
// from the alert's history; when it is denied, the stored history is left
// untouched. The method does no locking of its own.
func (m *Manager) checkRateLimit(alertID string) bool {
	limit := m.config.Schedule.MaxAlertsHour
	if limit <= 0 {
		return true // No rate limit
	}

	now := time.Now()
	windowStart := now.Add(-1 * time.Hour)

	// Collect the timestamps that still fall inside the one-hour window.
	// Ranging over a missing map entry simply yields nothing.
	var withinWindow []time.Time
	for _, stamp := range m.alertRateLimit[alertID] {
		if stamp.After(windowStart) {
			withinWindow = append(withinWindow, stamp)
		}
	}

	if len(withinWindow) >= limit {
		return false
	}

	m.alertRateLimit[alertID] = append(withinWindow, now)
	return true
}

// escalationChecker is the background loop that drives periodic maintenance:
// escalations are checked every minute and Cleanup runs every ten minutes
// with a 24-hour maximum age. The loop exits when m.escalationStop is
// signalled.
func (m *Manager) escalationChecker() {
	const (
		escalationInterval = 1 * time.Minute
		cleanupInterval    = 10 * time.Minute
		acknowledgedMaxAge = 24 * time.Hour
	)

	escalationTick := time.NewTicker(escalationInterval)
	cleanupTick := time.NewTicker(cleanupInterval)
	defer escalationTick.Stop()
	defer cleanupTick.Stop()

	for {
		select {
		case <-m.escalationStop:
			return
		case <-escalationTick.C:
			m.checkEscalations()
		case <-cleanupTick.C:
			// Clean up acknowledged alerts older than 24 hours
			m.Cleanup(acknowledgedMaxAge)
		}
	}
}

// checkEscalations checks all active alerts for escalation.
//
// Nothing happens unless alerts are enabled, the activation state is active
// and escalation is enabled in the schedule. For every unacknowledged alert,
// each configured level that has not been reached yet and whose delay
// (level.After minutes since the alert's StartTime) has elapsed is recorded on
// the alert and reported through the escalation callback. The write lock is
// held for the whole pass.
func (m *Manager) checkEscalations() {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Respect global alert and activation controls before escalating.
	// Escalations should never bypass a user disabling alerts.
	if !m.config.Enabled || m.config.ActivationState != ActivationActive {
		return
	}

	if !m.config.Schedule.Escalation.Enabled {
		return
	}

	now := time.Now()
	for _, alert := range m.activeAlerts {
		// Skip nil entries (as the other scans over activeAlerts do) and
		// acknowledged alerts.
		if alert == nil || alert.Acknowledged {
			continue
		}

		// Check each escalation level
		for i, level := range m.config.Schedule.Escalation.Levels {
			// Levels are 1-based on the alert; skip ones already reached.
			if alert.LastEscalation >= i+1 {
				continue
			}

			// Check if it's time to escalate
			escalateTime := alert.StartTime.Add(time.Duration(level.After) * time.Minute)
			if now.After(escalateTime) {
				// Update alert escalation state
				alert.LastEscalation = i + 1
				alert.EscalationTimes = append(alert.EscalationTimes, now)

				log.Info().
					Str("alertID", alert.ID).
					Str("trackingKey", canonicalTrackingKeyForAlert(alert)).
					Int("level", i+1).
					Str("notify", level.Notify).
					Msg("Alert escalated")

				// Trigger escalation callback
				m.safeCallEscalateCallback(alert, i+1)
			}
		}
	}
}

// Stop stops the alert manager and saves history.
//
// The shutdown sequence runs at most once (guarded by stopOnce): the
// escalation and cleanup stop channels are closed, the history manager is
// stopped if present, and after a short pause the active alerts are written
// to disk. A failure to save is logged, not returned.
func (m *Manager) Stop() {
	m.stopOnce.Do(func() {
		// closeSignalChannel tolerates nil and already-closed channels.
		closeSignalChannel(m.escalationStop)
		closeSignalChannel(m.cleanupStop)
		if m.historyManager != nil {
			m.historyManager.Stop()
		}

		// Give background goroutines time to exit cleanly
		time.Sleep(100 * time.Millisecond)

		// Save active alerts before stopping
		if err := m.SaveActiveAlerts(); err != nil {
			log.Error().Err(err).Msg("Failed to save active alerts on stop")
		}
	})
}

// closeSignalChannel closes ch, tolerating both a nil channel and one that
// has already been closed by another shutdown path.
func closeSignalChannel(ch chan struct{}) {
	if ch == nil {
		return
	}
	// Closing a closed channel panics; swallow that panic so shutdown stays
	// idempotent.
	defer func() { _ = recover() }()
	close(ch)
}

// SaveActiveAlerts persists active alerts to disk as a JSON array in
// <alertsDir>/active-alerts.json.
//
// The write is done via a uniquely named temp file in the same directory
// followed by a rename, and both the directory and file permissions are
// tightened to alertsDirPerm / alertsFilePerm.
//
// Locking: m.saveMu is held for the whole call so that snapshot+write pairs
// from concurrent callers are serialized and an older snapshot can never
// overwrite a newer one. m.mu is only read-locked while the alerts are
// snapshotted and marshalled; it is released before any filesystem work so
// that alert evaluation is not stalled behind disk latency.
//
// Returns a wrapped error describing the first step that failed; the temp
// file is removed on any failure after it has been created.
func (m *Manager) SaveActiveAlerts() error {
	// Serialize snapshots and writes so concurrent async saves cannot
	// overwrite newer state with an older snapshot.
	m.saveMu.Lock()
	defer m.saveMu.Unlock()

	var (
		alertsDir string
		count     int
	)

	// Snapshot under the read lock. Marshalling must also happen here because
	// the slice only holds pointers to alerts that are mutated under m.mu.
	data, err := func() ([]byte, error) {
		m.mu.RLock()
		defer m.mu.RUnlock()

		alertsDir = m.getAlertsDir()

		// Convert map to slice for JSON encoding. Nil entries are dropped so
		// the file never contains "null" elements.
		alerts := make([]*Alert, 0, len(m.activeAlerts))
		for _, alert := range m.activeAlerts {
			if alert == nil {
				continue
			}
			alerts = append(alerts, alert)
		}
		count = len(alerts)
		return json.Marshal(alerts)
	}()
	if err != nil {
		return fmt.Errorf("failed to marshal active alerts: %w", err)
	}

	// Create directory if it doesn't exist
	if err := os.MkdirAll(alertsDir, alertsDirPerm); err != nil {
		return fmt.Errorf("failed to create alerts directory: %w", err)
	}
	// MkdirAll leaves an existing directory's mode untouched, so enforce it.
	if err := os.Chmod(alertsDir, alertsDirPerm); err != nil {
		return fmt.Errorf("failed to set alerts directory permissions: %w", err)
	}

	// Write to temporary file first, then rename (atomic operation)
	// Use a unique temp file to avoid race conditions between concurrent saves (e.g., periodic vs shutdown)
	tmpFile, err := os.CreateTemp(alertsDir, "active-alerts-*.json.tmp")
	if err != nil {
		return fmt.Errorf("failed to create temp file: %w", err)
	}
	tmpName := tmpFile.Name()
	cleanupTemp := true

	// Ensure cleanup of temp file in case of failure
	defer func() {
		if !cleanupTemp {
			return
		}
		if err := os.Remove(tmpName); err != nil && !os.IsNotExist(err) {
			log.Warn().Err(err).Str("file", tmpName).Msg("Failed to remove temp active alerts file")
		}
	}()

	if _, err := tmpFile.Write(data); err != nil {
		writeErr := fmt.Errorf("failed to write active alerts temp file %s: %w", tmpName, err)
		if closeErr := tmpFile.Close(); closeErr != nil {
			closeErr = fmt.Errorf("failed to close temp file %s after write failure: %w", tmpName, closeErr)
			return fmt.Errorf("failed to persist active alerts: %w", errors.Join(writeErr, closeErr))
		}
		return writeErr
	}
	if err := tmpFile.Chmod(alertsFilePerm); err != nil {
		if closeErr := tmpFile.Close(); closeErr != nil {
			log.Warn().Err(closeErr).Str("file", tmpName).Msg("Failed to close temp file after chmod error")
		}
		return fmt.Errorf("failed to set active alerts temp file permissions: %w", err)
	}
	if err := tmpFile.Close(); err != nil {
		return fmt.Errorf("failed to close active alerts temp file %s: %w", tmpName, err)
	}

	finalFile := filepath.Join(alertsDir, "active-alerts.json")
	if err := os.Rename(tmpName, finalFile); err != nil {
		return fmt.Errorf("failed to rename active alerts file from %s to %s: %w", tmpName, finalFile, err)
	}
	if err := os.Chmod(finalFile, alertsFilePerm); err != nil {
		return fmt.Errorf("failed to set active alerts file permissions: %w", err)
	}

	log.Debug().Int("count", count).Msg("saved active alerts to disk")
	return nil
}

// saveActiveAlertsAsync runs SaveActiveAlerts on a new goroutine. Both a
// returned error and a panic are logged together with the caller-supplied
// context label; neither is propagated.
func (m *Manager) saveActiveAlertsAsync(context string) {
	persist := func() {
		defer func() {
			r := recover()
			if r == nil {
				return
			}
			log.Error().
				Interface("panic", r).
				Str("context", context).
				Msg("Panic in SaveActiveAlerts goroutine")
		}()

		err := m.SaveActiveAlerts()
		if err == nil {
			return
		}
		log.Error().
			Err(err).
			Str("context", context).
			Msg("Failed to save active alerts")
	}

	go persist()
}

// LoadActiveAlerts restores active alerts from <alertsDir>/active-alerts.json.
//
// A missing file is not an error (the manager simply starts empty). Read or
// JSON decode failures are returned wrapped. For each decoded alert the
// function:
//   - skips nil entries (a JSON "null" element would otherwise be dereferenced),
//   - backfills canonical identity and migrates legacy guest resource IDs,
//   - drops duplicates by ID and alerts that started more than 24 hours ago,
//   - drops alerts acknowledged more than an hour ago while still recording
//     their acknowledgement state,
//   - restores everything else into the active set, and
//   - schedules a delayed notification for critical alerts that started
//     within the last two hours.
//
// The manager lock is held for the duration of the restore.
func (m *Manager) LoadActiveAlerts() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	alertsFile := filepath.Join(m.getAlertsDir(), "active-alerts.json")
	data, err := readLimitedRegularFile(alertsFile, maxActiveAlertsFileSizeBytes)
	if err != nil {
		if errors.Is(err, os.ErrNotExist) {
			log.Info().Msg("No active alerts file found, starting fresh")
			return nil
		}
		return fmt.Errorf("failed to read active alerts: %w", err)
	}

	var alerts []*Alert
	if err := json.Unmarshal(data, &alerts); err != nil {
		return fmt.Errorf("failed to unmarshal active alerts: %w", err)
	}
	if err := os.Chmod(alertsFile, alertsFilePerm); err != nil && !os.IsNotExist(err) {
		log.Warn().Err(err).Str("file", alertsFile).Msg("Failed to harden active alerts file permissions")
	}

	// rememberAck records the acknowledgement carried by a restored alert.
	// The ack time falls back to the alert's start time when AckTime is unset.
	rememberAck := func(alert *Alert) {
		ackTime := alert.StartTime
		if alert.AckTime != nil {
			ackTime = *alert.AckTime
		}
		m.setAckRecordNoLock(alert, alert.ID, ackRecord{
			acknowledged: true,
			user:         alert.AckUser,
			time:         ackTime,
		})
	}

	// Restore alerts to the map with deduplication
	now := time.Now()
	restoredCount := 0
	duplicateCount := 0
	seen := make(map[string]bool)

	for _, alert := range alerts {
		// A "null" array element decodes to a nil pointer; skip it rather
		// than panic on the field accesses below.
		if alert == nil {
			log.Warn().Msg("skipping null alert entry during restore")
			continue
		}

		backfillCanonicalIdentity(alert)

		// Migrate legacy guest alert IDs (instance-node-VMID -> instance-VMID)
		// Check if this is a guest-related alert by looking at common alert types
		isGuestAlert := strings.Contains(alert.Type, "cpu") || strings.Contains(alert.Type, "memory") ||
			strings.Contains(alert.Type, "disk") || strings.Contains(alert.Type, "network") ||
			alert.Type == "guest-offline"
		if isGuestAlert {
			// Try to extract instance, node, and VMID from resource ID
			// Legacy format: instance-node-VMID or node-VMID (standalone)
			parts := strings.Split(alert.ResourceID, "-")

			// Check if this looks like a legacy format (has node in the ID)
			// We can detect this if we have Node field and it appears in the ResourceID
			if alert.Node != "" && len(parts) >= 2 {
				var newResourceID string

				// Try to extract VMID (should be last part)
				vmidStr := parts[len(parts)-1]
				if _, err := strconv.Atoi(vmidStr); err == nil {
					// VMID is valid, now check if we need to migrate
					if len(parts) == 3 && alert.Instance != "" && alert.Instance != alert.Node {
						// Format: instance-node-VMID -> instance-VMID
						newResourceID = fmt.Sprintf("%s-%s", alert.Instance, vmidStr)
					} else if len(parts) == 2 && alert.Instance == alert.Node {
						// Format: node-VMID -> instance-VMID (standalone)
						newResourceID = fmt.Sprintf("%s-%s", alert.Instance, vmidStr)
					}

					if newResourceID != "" && newResourceID != alert.ResourceID {
						log.Info().
							Str("oldID", alert.ResourceID).
							Str("newID", newResourceID).
							Str("alertType", alert.Type).
							Msg("Migrating active alert from legacy guest ID format")

						oldResourceID := alert.ResourceID

						// Update resource ID
						alert.ResourceID = newResourceID

						// Update alert ID (usually contains resource ID)
						alert.ID = strings.Replace(alert.ID, oldResourceID, newResourceID, 1)
					}
				}
			}
		}

		// Skip duplicates
		if seen[alert.ID] {
			duplicateCount++
			log.Warn().Str("alertID", alert.ID).Msg("skipping duplicate alert during restore")
			continue
		}
		seen[alert.ID] = true

		// Skip very old alerts (older than 24 hours)
		if now.Sub(alert.StartTime) > 24*time.Hour {
			log.Debug().Str("alertID", alert.ID).Msg("skipping old alert during restore")
			continue
		}

		// Skip acknowledged alerts older than 1 hour from activeAlerts,
		// but still preserve the ackState so if the same alert reappears
		// (e.g., backup-age alerts) it won't retrigger notifications.
		if alert.Acknowledged && alert.AckTime != nil && now.Sub(*alert.AckTime) > time.Hour {
			log.Debug().Str("alertID", alert.ID).Msg("skipping old acknowledged alert from activeAlerts but preserving ackState")
			rememberAck(alert)
			continue
		}

		m.setActiveAlertNoLock(alert.ID, alert)
		if alert.Acknowledged {
			rememberAck(alert)
		}
		restoredCount++

		// For critical alerts that are still active after restart, send notifications
		// This ensures users are notified about ongoing critical issues even after service restarts
		// Only notify for alerts that started recently (within last 2 hours) to avoid spam
		if alert.Level == AlertLevelCritical && now.Sub(alert.StartTime) < 2*time.Hour {
			// Use a goroutine and add a small delay to avoid notification spam on startup
			alertCopy := alert.Clone()
			go func(a *Alert) {
				delay := time.NewTimer(10 * time.Second)
				defer func() {
					if !delay.Stop() {
						select {
						case <-delay.C:
						default:
						}
					}
				}()

				// Wait for system to stabilize or cancellation
				select {
				case <-delay.C:
					log.Info().
						Str("alertID", a.ID).
						Str("resource", a.ResourceName).
						Msg("Attempting to send notification for restored critical alert")

					// Acquire lock before calling dispatchAlert (it accesses maps)
					m.mu.Lock()
					m.dispatchAlert(a, false) // Use dispatchAlert to respect activation state and quiet hours
					m.mu.Unlock()
				case <-m.escalationStop:
					log.Debug().
						Str("alertID", a.ID).
						Msg("Cancelled startup notification due to shutdown")
					return
				}
			}(alertCopy)
		}
	}

	log.Info().
		Int("restored", restoredCount).
		Int("total", len(alerts)).
		Int("duplicates", duplicateCount).
		Msg("Restored active alerts from disk")
	return nil
}

// getAlertsDir returns the directory used for alert persistence: the
// manager's configured alertsDir when it is non-blank, otherwise the
// "alerts" subdirectory of the global data directory.
func (m *Manager) getAlertsDir() string {
	if strings.TrimSpace(m.alertsDir) == "" {
		// Fallback for tests that construct Manager directly.
		return filepath.Join(utils.GetDataDir(), "alerts")
	}
	return m.alertsDir
}

// CleanupAlertsForNodes drops every active alert whose Node field is empty or
// not present (true) in existingNodes. Docker and PBS alerts are exempt
// because their node identifiers are not part of the Proxmox node list.
// When anything was removed, the new state is saved on a background goroutine.
func (m *Manager) CleanupAlertsForNodes(existingNodes map[string]bool) {
	m.mu.Lock()
	defer m.mu.Unlock()

	log.Debug().
		Int("totalAlerts", len(m.activeAlerts)).
		Int("existingNodes", len(existingNodes)).
		Interface("nodes", existingNodes).
		Msg("Starting alert cleanup for non-existent nodes")

	// exempt reports whether an alert is not tied to a Proxmox node and must
	// therefore be preserved regardless of existingNodes.
	exempt := func(id string, a *Alert) bool {
		switch {
		case strings.HasPrefix(id, "docker-"), strings.HasPrefix(a.ResourceID, "docker:"):
			return true
		case strings.HasPrefix(id, "pbs-"), a.Type == "pbs-offline":
			return true
		}
		if a.Metadata != nil {
			if kind, _ := a.Metadata["resourceType"].(string); kind == "pbs" {
				return true
			}
		}
		return a.CanonicalKind == string(alertspecs.AlertSpecKindConnectivity) &&
			strings.HasPrefix(a.ResourceID, "pbs")
	}

	removed := 0
	for key, alert := range m.activeAlerts {
		id := effectiveAlertID(alert, key)
		if alert == nil || exempt(id, alert) {
			continue
		}

		// The alert's own Node field is the source of truth for membership.
		node := alert.Node
		if node != "" && existingNodes[node] {
			continue
		}

		m.removeActiveAlertNoLock(id)
		removed++
		log.Debug().Str("alertID", id).Str("node", node).Msg("removed alert for non-existent node")
	}

	if removed == 0 {
		log.Info().Msg("no alerts needed cleanup")
		return
	}

	log.Debug().Int("removed", removed).Int("remaining", len(m.activeAlerts)).Msg("cleaned up alerts for non-existent nodes")

	// Persist the cleaned-up state off the calling goroutine; SaveActiveAlerts
	// needs m.mu, which is still held here.
	go func() {
		defer func() {
			if r := recover(); r != nil {
				log.Error().Interface("panic", r).Msg("panic in SaveActiveAlerts goroutine (cleanup)")
			}
		}()
		if err := m.SaveActiveAlerts(); err != nil {
			log.Error().Err(err).Msg("failed to save alerts after cleanup")
		}
	}()
}

// ClearActiveAlerts wipes all active and pending alerts together with the
// associated tracking, suppression, rate-limit, Docker and acknowledgement
// state, then clears the recently-resolved set and saves the (now empty)
// state on a background goroutine. It is a no-op when there are neither
// active nor pending alerts.
func (m *Manager) ClearActiveAlerts() {
	// Reset everything guarded by m.mu; report whether there was anything to do.
	reset := func() bool {
		m.mu.Lock()
		defer m.mu.Unlock()

		if len(m.activeAlerts) == 0 && len(m.pendingAlerts) == 0 {
			return false
		}

		// Alert sets and aliases.
		m.activeAlerts = make(map[string]*Alert)
		m.activeAlertAlias = make(map[string]string)
		m.pendingAlerts = make(map[string]time.Time)
		m.recentAlerts = make(map[string]*Alert)

		// Suppression and rate limiting.
		m.suppressedUntil = make(map[string]time.Time)
		m.alertRateLimit = make(map[string][]time.Time)

		// Offline confirmation counters.
		m.nodeOfflineCount = make(map[string]int)
		m.offlineConfirmations = make(map[string]int)

		// Docker tracking.
		m.dockerOfflineCount = make(map[string]int)
		m.dockerStateConfirm = make(map[string]int)
		m.dockerRestartTracking = make(map[string]*dockerRestartRecord)
		m.dockerLastExitCode = make(map[string]int)
		m.dockerUpdateFirstSeen = make(map[string]time.Time)
		m.dockerUpdateFirstSeenByIdentity = make(map[string]time.Time)

		// Acknowledgement state.
		m.ackState = make(map[string]ackRecord)
		m.ackStateByCanonical = make(map[string]ackRecord)
		return true
	}
	if !reset() {
		return
	}

	// The resolved set has its own mutex and is cleared after m.mu is released.
	m.resolvedMutex.Lock()
	m.recentlyResolved = make(map[string]*ResolvedAlert)
	m.resolvedAlias = make(map[string]string)
	m.resolvedMutex.Unlock()

	log.Info().Msg("cleared all active and pending alerts")

	persist := func() {
		defer func() {
			if r := recover(); r != nil {
				log.Error().Interface("panic", r).Msg("panic in SaveActiveAlerts goroutine (clear)")
			}
		}()
		if err := m.SaveActiveAlerts(); err != nil {
			log.Error().Err(err).Msg("failed to persist cleared alerts")
		}
	}
	go persist()
}

// periodicSaveAlerts writes the active alerts to disk once a minute until
// m.escalationStop is signalled. Save failures are logged and the loop
// continues.
func (m *Manager) periodicSaveAlerts() {
	const saveInterval = 1 * time.Minute

	tick := time.NewTicker(saveInterval)
	defer tick.Stop()

	for {
		select {
		case <-m.escalationStop:
			return
		case <-tick.C:
			err := m.SaveActiveAlerts()
			if err == nil {
				continue
			}
			log.Error().Err(err).Msg("failed to save active alerts during periodic save")
		}
	}
}

// trackingMapCleanup runs cleanupStaleMaps once an hour until m.cleanupStop
// is signalled, so tracking maps do not grow without bound as resources are
// deleted or decommissioned.
func (m *Manager) trackingMapCleanup() {
	const cleanupInterval = 1 * time.Hour

	tick := time.NewTicker(cleanupInterval)
	defer tick.Stop()

	for {
		select {
		case <-m.cleanupStop:
			return
		case <-tick.C:
			m.cleanupStaleMaps()
		}
	}
}

// cleanupStaleMaps prunes the manager's tracking maps and auto-resolves
// active alerts that have not been seen for StaleTrackingThreshold.
//
// Depending on the map, an entry is dropped when it is older than the
// threshold, when it has expired, or when no active alert ID contains its
// key. The whole pass runs under m.mu. If any stale alert was resolved, the
// active alerts are saved on a background goroutine.
func (m *Manager) cleanupStaleMaps() {
	m.mu.Lock()
	defer m.mu.Unlock()

	now := time.Now()
	maxAge := StaleTrackingThreshold
	removed := 0

	// anyAlertIDContains reports whether some active alert's effective ID
	// contains fragment as a substring.
	anyAlertIDContains := func(fragment string) bool {
		for key, alert := range m.activeAlerts {
			if strings.Contains(effectiveAlertID(alert, key), fragment) {
				return true
			}
		}
		return false
	}

	// Flapping history: drop when no active alert and the newest entry (if
	// any) is older than the threshold.
	for id, history := range m.flappingHistory {
		if m.hasActiveAlertNoLock(id) {
			continue
		}
		if n := len(history); n != 0 && now.Sub(history[n-1]) <= maxAge {
			continue
		}
		delete(m.flappingHistory, id)
		delete(m.flappingActive, id)
		removed++
	}

	// Suppression windows that have already passed.
	for id, until := range m.suppressedUntil {
		if !now.After(until) {
			continue
		}
		delete(m.suppressedUntil, id)
		removed++
	}

	// Pending alerts past the threshold that never became active.
	for id, since := range m.pendingAlerts {
		if m.hasActiveAlertNoLock(id) || now.Sub(since) <= maxAge {
			continue
		}
		delete(m.pendingAlerts, id)
		removed++
	}

	// Offline confirmation counts with no related active alert.
	for resourceID := range m.offlineConfirmations {
		if anyAlertIDContains(resourceID) {
			continue
		}
		delete(m.offlineConfirmations, resourceID)
		removed++
	}

	// Node offline counts (legacy) with no related active alert.
	for nodeID := range m.nodeOfflineCount {
		if anyAlertIDContains(nodeID) {
			continue
		}
		delete(m.nodeOfflineCount, nodeID)
		removed++
	}

	// Docker state confirmations and host offline counts with no related
	// active alert.
	for containerID := range m.dockerStateConfirm {
		if anyAlertIDContains(containerID) {
			continue
		}
		delete(m.dockerStateConfirm, containerID)
		removed++
	}
	for hostID := range m.dockerOfflineCount {
		if anyAlertIDContains(hostID) {
			continue
		}
		delete(m.dockerOfflineCount, hostID)
		removed++
	}

	// Docker restart tracking not checked within the threshold; the last
	// exit code is dropped with it. Nil records are left in place.
	for containerID, record := range m.dockerRestartTracking {
		if record == nil || now.Sub(record.lastChecked) <= maxAge {
			continue
		}
		delete(m.dockerRestartTracking, containerID)
		delete(m.dockerLastExitCode, containerID)
		removed++
	}

	// Docker update tracking first seen longer ago than the threshold.
	for containerID, firstSeen := range m.dockerUpdateFirstSeen {
		if now.Sub(firstSeen) <= maxAge {
			continue
		}
		delete(m.dockerUpdateFirstSeen, containerID)
		removed++
	}
	for identity, firstSeen := range m.dockerUpdateFirstSeenByIdentity {
		if now.Sub(firstSeen) <= maxAge {
			continue
		}
		delete(m.dockerUpdateFirstSeenByIdentity, identity)
		removed++
	}

	// Rate-limit timestamps: keep only those inside the cleanup window.
	rateWindow := RateLimitCleanupWindow
	for resourceID, stamps := range m.alertRateLimit {
		var kept []time.Time
		for _, ts := range stamps {
			if now.Sub(ts) < rateWindow {
				kept = append(kept, ts)
			}
		}
		switch {
		case len(kept) == 0:
			delete(m.alertRateLimit, resourceID)
			removed++
		case len(kept) < len(stamps):
			m.alertRateLimit[resourceID] = kept
		}
	}

	// Recent alerts older than the suppression window (5 minutes when the
	// configured window is not positive).
	recentWindow := time.Duration(m.config.SuppressionWindow) * time.Minute
	if recentWindow <= 0 {
		recentWindow = 5 * time.Minute
	}
	for id, alert := range m.recentAlerts {
		if now.Sub(alert.LastSeen) <= recentWindow {
			continue
		}
		delete(m.recentAlerts, id)
		removed++
	}

	// ackAge measures from when the alert went inactive, falling back to the
	// acknowledgement time when that was never recorded.
	ackAge := func(record ackRecord) time.Duration {
		base := record.inactiveAt
		if base.IsZero() {
			base = record.time
		}
		return now.Sub(base)
	}

	// Acknowledgement state for alerts that are no longer active.
	for id, record := range m.ackState {
		if m.hasActiveAlertNoLock(id) {
			continue
		}
		if ackAge(record) > maxAge {
			delete(m.ackState, id)
			removed++
		}
	}
	for canonicalID, record := range m.ackStateByCanonical {
		if ackAge(record) > maxAge {
			delete(m.ackStateByCanonical, canonicalID)
			removed++
		}
	}

	// Auto-resolve alerts whose LastSeen is older than the threshold, so an
	// alert does not stay active forever once its resource stops being
	// monitored. IDs are collected first, then cleared in a second pass.
	var staleIDs []string
	for key, alert := range m.activeAlerts {
		id := effectiveAlertID(alert, key)
		if alert == nil || now.Sub(alert.LastSeen) <= maxAge {
			continue
		}
		staleIDs = append(staleIDs, id)
	}

	resolved := 0
	for _, id := range staleIDs {
		alert, found := m.getActiveAlertNoLock(id)
		if !found || alert == nil {
			continue
		}
		log.Info().
			Str("alertID", id).
			Str("resourceName", alert.ResourceName).
			Time("lastSeen", alert.LastSeen).
			Dur("staleFor", now.Sub(alert.LastSeen)).
			Msg("Auto-resolving stale alert - resource no longer being monitored")
		m.clearAlertNoLock(id)
		removed++
		resolved++
	}

	// Persist changes if we resolved any stale alerts
	if resolved > 0 {
		persist := func() {
			defer func() {
				if r := recover(); r != nil {
					log.Error().Interface("panic", r).Msg("panic in SaveActiveAlerts goroutine (stale cleanup)")
				}
			}()
			if err := m.SaveActiveAlerts(); err != nil {
				log.Error().Err(err).Msg("failed to save active alerts after stale cleanup")
			}
		}
		go persist()

		log.Info().
			Int("count", resolved).
			Msg("Auto-resolved stale alerts")
	}

	if removed > 0 {
		log.Debug().
			Int("entriesCleaned", removed).
			Msg("Cleaned stale entries from alert tracking maps")
	}
}

// CheckDiskHealth evaluates one Proxmox disk and raises or clears two
// independent alerts for it:
//
//   - disk-health: raised when disk.Health (trimmed, upper-cased) is anything
//     other than empty, UNKNOWN, PASSED or OK, unless the model is known to
//     misreport health; cleared otherwise.
//   - disk-wearout: raised when disk.Wearout is in (0, 10), cleared when it
//     is 10 or more, and left untouched when it is zero or negative.
func (m *Manager) CheckDiskHealth(instance, node string, disk proxmox.Disk) {
	// Identifiers shared by both alerts for this disk.
	var (
		alertID             = fmt.Sprintf("disk-health-%s-%s-%s", instance, node, disk.DevPath)
		resourceID          = fmt.Sprintf("%s-%s", node, disk.DevPath)
		resourceName        = fmt.Sprintf("%s (%s)", disk.Model, disk.DevPath)
		canonicalResourceID = proxmoxDiskCanonicalResourceID(instance, node, disk.DevPath)
		resourceType        = unifiedresources.ResourceType("proxmox-disk")
	)

	// Decide whether the reported health warrants an alert.
	normalizedHealth := strings.ToUpper(strings.TrimSpace(disk.Health))
	var healthCheckNeeded bool
	switch normalizedHealth {
	case "", "UNKNOWN", "PASSED", "OK":
		healthCheckNeeded = false
	default:
		healthCheckNeeded = true
	}

	// Drives with known firmware bugs may report FAILED while healthy, so
	// their health status is ignored. Wearout is still evaluated below.
	if healthCheckNeeded && storagehealth.HasKnownFirmwareBug(disk.Model) {
		log.Debug().
			Str("node", node).
			Str("disk", disk.DevPath).
			Str("model", disk.Model).
			Str("health", disk.Health).
			Msg("Skipping health alert for drive with known firmware bug - health status unreliable")

		// Any existing health alert is now considered a false positive.
		m.clearAlert(buildCanonicalStateID(canonicalResourceID, canonicalResourceID+"-health"))
		healthCheckNeeded = false
	}

	if !healthCheckNeeded {
		// Disk is healthy, clear alert if it exists
		m.clearAlert(buildCanonicalStateID(canonicalResourceID, canonicalResourceID+"-health"))
	} else {
		healthSpec, specErr := buildCanonicalHealthAssessmentSpec(canonicalResourceID+"-health", canonicalResourceID, resourceName, resourceType, "proxmox-disk-health", nil, false)
		if specErr == nil {
			healthMeta := proxmoxDiskAlertMetadata(disk)
			healthMeta["disk_health"] = disk.Health

			evidence := alertspecs.AlertEvidence{
				ObservedAt: time.Now(),
				HealthAssessment: &alertspecs.HealthAssessmentEvidence{
					Signal:   "proxmox-disk-health",
					Severity: alertspecs.AlertSeverityCritical,
					Codes:    []string{normalizedHealth},
				},
			}

			// Results of the evaluation are intentionally not inspected.
			_, _ = m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
				Spec:         healthSpec,
				Evidence:     evidence,
				AlertID:      alertID,
				AlertType:    "disk-health",
				ResourceID:   resourceID,
				ResourceName: resourceName,
				Node:         node,
				Instance:     instance,
				Message:      fmt.Sprintf("Disk health check failed: %s", disk.Health),
				Metadata:     healthMeta,
				AddToRecent:  true,
				AddToHistory: true,
			})

			log.Error().
				Str("node", node).
				Str("disk", disk.DevPath).
				Str("model", disk.Model).
				Str("health", disk.Health).
				Msg("Disk health alert created")
		} else {
			log.Warn().
				Err(specErr).
				Str("node", node).
				Str("disk", disk.DevPath).
				Msg("Skipping invalid canonical proxmox disk health spec")
		}
	}

	// Wearout handling (SSD life remaining).
	switch {
	case disk.Wearout > 0 && disk.Wearout < 10:
		wearoutAlertID := fmt.Sprintf("disk-wearout-%s-%s-%s", instance, node, disk.DevPath)
		message := fmt.Sprintf("SSD has less than 10%% life remaining (%d%% wearout)", disk.Wearout)
		wearSpec, specErr := buildCanonicalSeverityThresholdSpecWithDirection(canonicalResourceID+"-wearout", canonicalResourceID, resourceName, resourceType, "wearout-remaining", alertspecs.ThresholdDirectionBelow, 10, 0, false)
		if specErr != nil {
			log.Warn().
				Err(specErr).
				Str("node", node).
				Str("disk", disk.DevPath).
				Msg("Skipping invalid canonical proxmox disk wearout spec")
			break
		}

		wearMeta := proxmoxDiskAlertMetadata(disk)
		wearMeta["disk_wearout"] = disk.Wearout

		evidence := alertspecs.AlertEvidence{
			ObservedAt: time.Now(),
			SeverityThreshold: &alertspecs.SeverityThresholdEvidence{
				Metric:    "wearout-remaining",
				Direction: alertspecs.ThresholdDirectionBelow,
				Observed:  float64(disk.Wearout),
			},
		}

		// Results of the evaluation are intentionally not inspected.
		_, _ = m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
			Spec:         wearSpec,
			Evidence:     evidence,
			AlertID:      wearoutAlertID,
			AlertType:    "disk-wearout",
			ResourceID:   resourceID,
			ResourceName: resourceName,
			Node:         node,
			Instance:     instance,
			Message:      message,
			Value:        float64(disk.Wearout),
			Threshold:    10.0,
			Metadata:     wearMeta,
			AddToRecent:  true,
			AddToHistory: true,
		})

		log.Warn().
			Str("node", node).
			Str("disk", disk.DevPath).
			Str("model", disk.Model).
			Int("wearout", disk.Wearout).
			Msg("Disk wearout alert created")

	case disk.Wearout >= 10:
		// Wearout is acceptable, clear alert if it exists
		m.clearAlert(buildCanonicalStateID(canonicalResourceID, canonicalResourceID+"-wearout"))
	}
}

// clearAlertNoLock resolves the active alert identified by alertID; it is a
// no-op when no such alert is active. The caller must already hold m.mu.
//
// In order, it records the resolution metric (when the hook is set), removes
// the alert from the active set, adds it to the recently-resolved set
// stamped with the current time, and invokes the resolved-alert callback.
func (m *Manager) clearAlertNoLock(alertID string) {
	alert, found := m.getActiveAlertNoLock(alertID)
	if !found {
		return
	}
	publicID := effectiveAlertID(alert, alertID)

	// Metrics hook is optional.
	if hook := recordAlertResolved; hook != nil {
		hook(alert)
	}

	m.removeActiveAlertNoLock(alertID)
	m.addRecentlyResolvedWithPrimaryLock(&ResolvedAlert{
		Alert:        alert,
		ResolvedTime: time.Now(),
	})

	// The final argument requests asynchronous delivery, to prevent a
	// deadlock while m.mu is held.
	m.safeCallResolvedAlertCallback(alert, publicID, true)

	log.Info().
		Str("alertID", publicID).
		Msg("Alert cleared")
}

// clearActiveAlertIfPresentNoLock clears the alert with the given ID when it
// is currently active and reports whether it was. The caller must hold m.mu.
func (m *Manager) clearActiveAlertIfPresentNoLock(alertID string) bool {
	_, present := m.getActiveAlertNoLock(alertID)
	if present {
		m.clearAlertNoLock(alertID)
	}
	return present
}

// clearSnapshotAlertsForInstance clears active "snapshot-age" alerts for the
// given instance (all instances when instance is empty) while holding m.mu.
// The unlock is deferred so the lock is released even if clearing panics.
func (m *Manager) clearSnapshotAlertsForInstance(instance string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	m.clearSnapshotAlertsForInstanceLocked(instance)
}

// clearSnapshotAlertsForInstanceLocked clears every active alert of type
// "snapshot-age" whose Instance matches instance. An empty instance matches
// all instances. The caller must hold m.mu.
func (m *Manager) clearSnapshotAlertsForInstanceLocked(instance string) {
	for key, alert := range m.activeAlerts {
		id := effectiveAlertID(alert, key)
		switch {
		case alert == nil, alert.Type != "snapshot-age":
			continue
		case instance != "" && alert.Instance != instance:
			continue
		}
		m.clearAlertNoLock(id)
	}
}

// clearBackupAlerts clears all active "backup-age" alerts while holding m.mu.
// The unlock is deferred so the lock is released even if clearing panics.
func (m *Manager) clearBackupAlerts() {
	m.mu.Lock()
	defer m.mu.Unlock()

	m.clearBackupAlertsLocked()
}

// clearBackupAlertsLocked clears every active alert of type "backup-age".
// The caller must hold m.mu.
func (m *Manager) clearBackupAlertsLocked() {
	for key, alert := range m.activeAlerts {
		id := effectiveAlertID(alert, key)
		if alert != nil && alert.Type == "backup-age" {
			m.clearAlertNoLock(id)
		}
	}
}