From baeef84c6967b19bf60a0f75e8e3ccaffc6ec73a Mon Sep 17 00:00:00 2001 From: rcourtman Date: Wed, 6 May 2026 13:54:09 +0100 Subject: [PATCH] refactor: split backup snapshot alerts Move snapshot age and size evaluation, backup rollup age evaluation, inventory readiness, namespace disambiguation, template matching, and backup/snapshot cleanup into internal/alerts/backup_snapshot.go. Keep the generic async active-alert save helper in the central package because canonical metric migration still shares it, and record backup_snapshot.go as the backup/snapshot owner in the alerts subsystem contract. Proof: go test ./internal/alerts/... --- .../v6/internal/subsystems/alerts.md | 7 + internal/alerts/alerts.go | 963 ----------------- internal/alerts/alerts_test.go | 7 + internal/alerts/backup_snapshot.go | 976 ++++++++++++++++++ 4 files changed, 990 insertions(+), 963 deletions(-) create mode 100644 internal/alerts/backup_snapshot.go diff --git a/docs/release-control/v6/internal/subsystems/alerts.md b/docs/release-control/v6/internal/subsystems/alerts.md index 71360b984..c7bbc699f 100644 --- a/docs/release-control/v6/internal/subsystems/alerts.md +++ b/docs/release-control/v6/internal/subsystems/alerts.md @@ -62,6 +62,7 @@ operator-facing alert routing behavior for live runtime alerts. 40. `internal/alerts/storage.go` 41. `internal/alerts/node.go` 42. `internal/alerts/host.go` +43. `internal/alerts/backup_snapshot.go` ## Shared Boundaries @@ -260,6 +261,12 @@ health handling, host cleanup, and host offline lifecycle handling; future host agent alert behavior should extend that resource checker owner while shared health-assessment evaluation remains package-level until all storage-health callers can be separated behind a narrower owner. +Snapshot and backup-age alert evaluation now lives in +`internal/alerts/backup_snapshot.go`. That file owns snapshot age/size +evaluation, backup rollup age evaluation, backup inventory readiness, PVE +template subject matching, namespace disambiguation, and snapshot/backup active +alert cleanup; future backup or snapshot alert behavior should extend that +owner rather than expanding the central Manager file. Commercial alert handoffs now follow the same shared navigation boundary. `frontend-modern/src/components/Alerts/InvestigateAlertButton.tsx` may resolve the canonical `ai_alerts` destination from the shared license/commercial diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go index 5fccd2b09..83bcf4ff2 100644 --- a/internal/alerts/alerts.go +++ b/internal/alerts/alerts.go @@ -4,7 +4,6 @@ import ( "encoding/json" "errors" "fmt" - "math" "os" "path/filepath" "slices" @@ -16,7 +15,6 @@ import ( alertconfig "github.com/rcourtman/pulse-go-rewrite/internal/alerts/config" alertspecs "github.com/rcourtman/pulse-go-rewrite/internal/alerts/specs" "github.com/rcourtman/pulse-go-rewrite/internal/models" - "github.com/rcourtman/pulse-go-rewrite/internal/recovery" "github.com/rcourtman/pulse-go-rewrite/internal/storagehealth" "github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources" "github.com/rcourtman/pulse-go-rewrite/internal/utils" @@ -616,29 +614,6 @@ func (m *Manager) UpdateConfig(config AlertConfig) { m.reevaluateActiveAlertsLocked() } -func backupIgnoreVMID(vmID string, ignoreList []string) bool { - if vmID == "" || len(ignoreList) == 0 { - return false - } - for _, entry := range ignoreList { - value := strings.TrimSpace(entry) - if value == "" { - continue - } - if strings.HasSuffix(value, "*") { - prefix := strings.TrimSuffix(value, "*") - if prefix != "" && strings.HasPrefix(vmID, prefix) { - return true - } - continue - } - if vmID == value { - return true - } - } - return false -} - // migrateActivationState handles backward compatibility for activation state func (m *Manager) migrateActivationState(config *AlertConfig) { if config.ActivationState == "" { @@ -1073,83 +1048,6 @@ func (m *Manager) reevaluateActiveAlertsLocked() { } } -func (m *Manager) resolvedSnapshotAlertConfigNoLock(thresholds ThresholdConfig) SnapshotAlertConfig { - cfg := m.config.SnapshotDefaults - if thresholds.Snapshot != nil { - cfg = *thresholds.Snapshot - } - return cfg -} - -func (m *Manager) resolvedBackupAlertConfigNoLock(thresholds ThresholdConfig) BackupAlertConfig { - cfg := m.config.BackupDefaults - if thresholds.Backup != nil { - cfg = *thresholds.Backup - } - if cfg.AlertOrphaned == nil { - alertOrphaned := true - cfg.AlertOrphaned = &alertOrphaned - } - return cfg -} - -func snapshotAlertStillTriggered(alert *Alert, cfg SnapshotAlertConfig) bool { - if alert == nil || !cfg.Enabled { - return false - } - - ageValue, _ := metadataFloatValue(alert.Metadata, "snapshotAgeDays") - sizeValue, _ := metadataFloatValue(alert.Metadata, "snapshotSizeGiB") - - if cfg.CriticalDays > 0 && ageValue >= float64(cfg.CriticalDays) { - return true - } - if cfg.WarningDays > 0 && ageValue >= float64(cfg.WarningDays) { - return true - } - if cfg.CriticalSizeGiB > 0 && sizeValue >= cfg.CriticalSizeGiB { - return true - } - if cfg.WarningSizeGiB > 0 && sizeValue >= cfg.WarningSizeGiB { - return true - } - - return false -} - -func backupAlertStillTriggered(alert *Alert, cfg BackupAlertConfig) bool { - if alert == nil || !cfg.Enabled { - return false - } - - vmid := metadataStringValue(alert.Metadata, "guestVmid") - if vmid == "" { - if parsed := metadataIntValue(alert.Metadata["guestVmid"]); parsed > 0 { - vmid = strconv.Itoa(parsed) - } - } - if backupIgnoreVMID(vmid, cfg.IgnoreVMIDs) { - return false - } - if metadataBoolValue(alert.Metadata, "orphaned") && cfg.AlertOrphaned != nil && !*cfg.AlertOrphaned { - return false - } - - ageValue, ok := metadataFloatValue(alert.Metadata, "ageDays") - if !ok { - ageValue = alert.Value - } - - if cfg.CriticalDays > 0 && ageValue >= float64(cfg.CriticalDays) { - return true - } - if cfg.WarningDays > 0 && ageValue >= float64(cfg.WarningDays) { - return true - } - - return false -} - // ReevaluateGuestAlert reevaluates a specific guest's alerts with full threshold resolution including custom rules // This should be called by the monitor with the current guest state func (m *Manager) ReevaluateGuestAlert(guest any, guestID string) { @@ -1773,127 +1671,6 @@ func (m *Manager) syncCanonicalHealthAssessmentAlert(params canonicalHealthAsses }) } -// BuildGuestKey constructs a unique key for a guest from instance, node, and VMID. -// Uses the canonical format: instance:node:vmid -// This matches the format used by makeGuestID in the monitoring package. -func BuildGuestKey(instance, node string, vmID int) string { - instance = strings.TrimSpace(instance) - node = strings.TrimSpace(node) - if instance == "" { - instance = node - } - return fmt.Sprintf("%s:%s:%d", instance, node, vmID) -} - -type backupRecord struct { - key string - vmID string - lookup GuestLookup - fallbackName string - instance string - node string - subjectType string - source string - rollupID string - providers []recovery.Provider - lastTime time.Time -} - -// BackupInventoryScope carries monitoring-owned inventory readiness into backup -// alert evaluation. It keeps orphan detection from racing ahead of Proxmox -// guest/template discovery while preserving the direct CheckBackups API for -// unit tests and non-monitoring callers. -type BackupInventoryScope struct { - PVEOrphanInventoryReady map[string]map[string]bool - PVETemplateSubjects map[string]struct{} -} - -func BuildBackupPVETemplateSubjectKey(instance, guestType, node string, vmid int) string { - instance = strings.TrimSpace(instance) - guestType = normalizeBackupGuestType(guestType) - node = strings.TrimSpace(node) - if instance == "" || guestType == "" || node == "" || vmid <= 0 { - return "" - } - return strings.Join([]string{instance, guestType, node, strconv.Itoa(vmid)}, "\x00") -} - -func normalizeBackupGuestType(guestType string) string { - switch strings.ToLower(strings.TrimSpace(guestType)) { - case "qemu", "vm", "proxmox-vm": - return "qemu" - case "lxc", "ct", "container", "system-container", "proxmox-lxc": - return "lxc" - default: - return strings.ToLower(strings.TrimSpace(guestType)) - } -} - -func backupOrphanInventoryReady(scope *BackupInventoryScope, record backupRecord) bool { - if scope == nil || scope.PVEOrphanInventoryReady == nil { - return true - } - if record.source != "PVE" { - return true - } - instance := strings.TrimSpace(record.instance) - guestType := normalizeBackupGuestType(record.subjectType) - if instance == "" || guestType == "" { - return false - } - return scope.PVEOrphanInventoryReady[instance][guestType] -} - -func backupMatchesKnownPVETemplate(scope *BackupInventoryScope, record backupRecord) bool { - if scope == nil || len(scope.PVETemplateSubjects) == 0 || record.source != "PVE" { - return false - } - vmid, err := strconv.Atoi(strings.TrimSpace(record.vmID)) - if err != nil || vmid <= 0 { - return false - } - key := BuildBackupPVETemplateSubjectKey(record.instance, record.subjectType, record.node, vmid) - if key == "" { - return false - } - _, exists := scope.PVETemplateSubjects[key] - return exists -} - -func canonicalGuestResourceType(guestType string) unifiedresources.ResourceType { - switch strings.ToLower(strings.TrimSpace(guestType)) { - case "lxc": - return unifiedresources.ResourceTypeSystemContainer - default: - return unifiedresources.ResourceTypeVM - } -} - -func canonicalBackupSubjectResourceType(record backupRecord) unifiedresources.ResourceType { - if record.lookup.Type != "" { - return canonicalGuestResourceType(record.lookup.Type) - } - switch normalizeBackupGuestType(record.subjectType) { - case "lxc": - return unifiedresources.ResourceTypeSystemContainer - case "qemu": - return unifiedresources.ResourceTypeVM - } - if strings.TrimSpace(record.vmID) != "" { - return unifiedresources.ResourceTypeVM - } - return unifiedresources.ResourceType("backup-subject") -} - -func canonicalBackupSubjectResourceID(alertKey string, record backupRecord) string { - if record.instance != "" && record.node != "" && record.vmID != "" { - if vmid, err := strconv.Atoi(record.vmID); err == nil && vmid > 0 { - return BuildGuestKey(record.instance, record.node, vmid) - } - } - return "backup-subject:" + sanitizeAlertKey(alertKey) -} - func asyncSaveActiveAlerts(reason string, save func() error) { go func() { defer func() { @@ -1907,662 +1684,6 @@ func asyncSaveActiveAlerts(reason string, save func() error) { }() } -// CheckSnapshotsForInstance evaluates guest snapshots for age-based alerts. -func (m *Manager) CheckSnapshotsForInstance(instanceName string, snapshots []models.GuestSnapshot, guestNames map[string]string) { - m.mu.RLock() - enabled := m.config.Enabled - snapshotCfg := m.config.SnapshotDefaults - m.mu.RUnlock() - - if !enabled { - return - } - - if !snapshotCfg.Enabled { - m.clearSnapshotAlertsForInstance(instanceName) - return - } - - now := time.Now() - validAlerts := make(map[string]struct{}) - - for _, snapshot := range snapshots { - if instanceName != "" && snapshot.Instance != "" && snapshot.Instance != instanceName { - continue - } - if snapshot.Time.IsZero() { - continue - } - - ageHours := now.Sub(snapshot.Time).Hours() - if ageHours < 0 { - continue - } - ageDays := ageHours / 24 - - const gib = 1024.0 * 1024 * 1024 - sizeGiB := 0.0 - if snapshot.SizeBytes > 0 { - sizeGiB = float64(snapshot.SizeBytes) / gib - } - - // Determine thresholds for this snapshot - resourceID := fmt.Sprintf("%s:%s:%d", snapshot.Instance, snapshot.Node, snapshot.VMID) - guestName := strings.TrimSpace(guestNames[BuildGuestKey(snapshot.Instance, snapshot.Node, snapshot.VMID)]) - guestContext := guestSnapshotFromIdentity(resourceID, guestName, snapshot.Node, snapshot.Instance, snapshot.Type, "") - m.mu.RLock() - gh := m.getGuestThresholds(guestContext, resourceID) - m.mu.RUnlock() - - if gh.Disabled { - continue - } - - currentSnapshotCfg := snapshotCfg - if gh.Snapshot != nil { - currentSnapshotCfg = *gh.Snapshot - } - - if !currentSnapshotCfg.Enabled { - continue - } - - var ageLevel AlertLevel - var ageThreshold int - var sizeLevel AlertLevel - var sizeThreshold float64 - var triggeredStats []string - - if currentSnapshotCfg.CriticalDays > 0 && ageDays >= float64(currentSnapshotCfg.CriticalDays) { - ageLevel = AlertLevelCritical - ageThreshold = currentSnapshotCfg.CriticalDays - triggeredStats = append(triggeredStats, "age") - } else if currentSnapshotCfg.WarningDays > 0 && ageDays >= float64(currentSnapshotCfg.WarningDays) { - ageLevel = AlertLevelWarning - ageThreshold = currentSnapshotCfg.WarningDays - triggeredStats = append(triggeredStats, "age") - } - - if snapshot.SizeBytes > 0 { - if currentSnapshotCfg.CriticalSizeGiB > 0 && sizeGiB >= currentSnapshotCfg.CriticalSizeGiB { - sizeLevel = AlertLevelCritical - sizeThreshold = currentSnapshotCfg.CriticalSizeGiB - triggeredStats = append(triggeredStats, "size") - } else if currentSnapshotCfg.WarningSizeGiB > 0 && sizeGiB >= currentSnapshotCfg.WarningSizeGiB { - sizeLevel = AlertLevelWarning - sizeThreshold = currentSnapshotCfg.WarningSizeGiB - triggeredStats = append(triggeredStats, "size") - } - } - - if ageLevel == "" && sizeLevel == "" { - continue - } - - useSizePrimary := false - if sizeLevel == AlertLevelCritical && ageLevel != AlertLevelCritical { - useSizePrimary = true - } else if sizeLevel != "" && ageLevel == "" { - useSizePrimary = true - } - - alertID := fmt.Sprintf("snapshot-age-%s", snapshot.ID) - - guestKey := BuildGuestKey(snapshot.Instance, snapshot.Node, snapshot.VMID) - - guestType := "VM" - if strings.EqualFold(snapshot.Type, "lxc") { - guestType = "Container" - } - - if guestName == "" { - switch guestType { - case "Container": - guestName = fmt.Sprintf("CT %d", snapshot.VMID) - default: - guestName = fmt.Sprintf("VM %d", snapshot.VMID) - } - } - - snapshotName := strings.TrimSpace(snapshot.Name) - if snapshotName == "" { - snapshotName = "(unnamed)" - } - - ageDaysRounded := math.Round(ageDays*10) / 10 - sizeGiBRounded := math.Round(sizeGiB*10) / 10 - reasons := make([]string, 0, 2) - if ageLevel != "" { - reasons = append(reasons, fmt.Sprintf("%.1f days old (threshold %d days)", ageDaysRounded, ageThreshold)) - } - if sizeLevel != "" { - reasons = append(reasons, fmt.Sprintf("%.1f GiB (threshold %.1f GiB)", sizeGiBRounded, sizeThreshold)) - } - reasonText := strings.Join(reasons, " and ") - message := fmt.Sprintf( - "%s snapshot '%s' for %s is %s on %s", - guestType, - snapshotName, - guestName, - reasonText, - snapshot.Node, - ) - - alertValue := ageDays - alertThreshold := float64(ageThreshold) - thresholdTime := now - if useSizePrimary { - alertValue = sizeGiB - alertThreshold = sizeThreshold - } else if ageThreshold > 0 { - thresholdTime = snapshot.Time.Add(time.Duration(ageThreshold) * 24 * time.Hour) - if thresholdTime.After(now) { - thresholdTime = now - } - } - - metadata := map[string]interface{}{ - "snapshotName": snapshot.Name, - "snapshotCreatedAt": snapshot.Time, - "snapshotAgeDays": ageDays, - "snapshotAgeHours": ageHours, - "snapshotSizeBytes": snapshot.SizeBytes, - "snapshotSizeGiB": sizeGiB, - "guestName": guestName, - "guestType": guestType, - "guestInstance": snapshot.Instance, - "guestNode": snapshot.Node, - "guestVmid": snapshot.VMID, - "triggeredMetrics": triggeredStats, - "primaryMetric": "age", - } - if useSizePrimary { - metadata["primaryMetric"] = "size" - } - if ageLevel != "" { - metadata["thresholdDays"] = ageThreshold - } - if sizeLevel != "" { - metadata["thresholdSizeGiB"] = sizeThreshold - } - - resourceName := fmt.Sprintf("%s snapshot '%s'", guestName, snapshotName) - guestResourceType := canonicalGuestResourceType(snapshot.Type) - guestResourceID := guestKey - sizeMetric := "" - var sizeValue *float64 - if currentSnapshotCfg.WarningSizeGiB > 0 || currentSnapshotCfg.CriticalSizeGiB > 0 { - sizeMetric = "snapshot-size-gib" - sizeValue = &sizeGiB - } - ageMetric := "" - if currentSnapshotCfg.WarningDays > 0 || currentSnapshotCfg.CriticalDays > 0 { - ageMetric = "snapshot-age-days" - } - - spec, err := buildCanonicalPostureThresholdSpec( - guestResourceID+"/snapshot:"+snapshot.ID, - guestResourceID, - resourceName, - guestResourceType, - ageMetric, - float64(currentSnapshotCfg.WarningDays), - float64(currentSnapshotCfg.CriticalDays), - sizeMetric, - currentSnapshotCfg.WarningSizeGiB, - currentSnapshotCfg.CriticalSizeGiB, - false, - ) - if err != nil { - log.Warn(). - Err(err). - Str("snapshotID", snapshot.ID). - Str("resourceID", guestResourceID). - Msg("Skipping invalid canonical snapshot posture spec") - continue - } - validAlerts[canonicalTrackingKeyForSpec(spec, alertID)] = struct{}{} - - result, _ := m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{ - Spec: spec, - Evidence: alertspecs.AlertEvidence{ - ObservedAt: now, - PostureThreshold: &alertspecs.PostureThresholdEvidence{ - AgeMetric: ageMetric, - AgeValue: ageDays, - SizeMetric: sizeMetric, - SizeValue: sizeValue, - }, - }, - AlertID: alertID, - AlertType: "snapshot-age", - ResourceID: spec.ResourceID, - ResourceName: resourceName, - Node: snapshot.Node, - Instance: snapshot.Instance, - Value: alertValue, - Threshold: alertThreshold, - StartTimeOverride: thresholdTime, - Metadata: metadata, - AddToRecent: true, - AddToHistory: true, - RateLimit: true, - DispatchAsync: true, - MessageBuilder: func(result alertspecs.EvaluationResult) (string, float64, float64) { - return message, alertValue, alertThreshold - }, - }) - if result.Transition != nil && result.Transition.Kind == alertspecs.EvaluationTransitionActivated { - asyncSaveActiveAlerts("snapshot", m.SaveActiveAlerts) - } - } - - m.mu.Lock() - for storageKey, alert := range m.activeAlerts { - if alert == nil || alert.Type != "snapshot-age" { - continue - } - if instanceName != "" && alert.Instance != instanceName { - continue - } - if _, ok := validAlerts[storageKey]; ok { - continue - } - m.clearAlertNoLock(storageKey) - } - m.mu.Unlock() -} - -// CheckBackups evaluates storage, PBS, and PMG backups for age-based alerts. -func (m *Manager) CheckBackups( - rollups []recovery.ProtectionRollup, - guestsByKey map[string]GuestLookup, - guestsByVMID map[string][]GuestLookup, -) { - m.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, nil) -} - -// CheckBackupsWithInventory evaluates backup rollups with optional monitoring -// inventory readiness for orphan detection. -func (m *Manager) CheckBackupsWithInventory( - rollups []recovery.ProtectionRollup, - guestsByKey map[string]GuestLookup, - guestsByVMID map[string][]GuestLookup, - inventoryScope *BackupInventoryScope, -) { - m.mu.RLock() - enabled := m.config.Enabled - backupCfg := m.config.BackupDefaults - m.mu.RUnlock() - - if backupCfg.AlertOrphaned == nil { - alertOrphaned := true - backupCfg.AlertOrphaned = &alertOrphaned - } - - if !enabled || !backupCfg.Enabled { - m.clearBackupAlerts() - return - } - - if backupCfg.WarningDays <= 0 && backupCfg.CriticalDays <= 0 { - m.clearBackupAlerts() - return - } - - records := make(map[string]*backupRecord) - - updateRecord := func(key string, candidate backupRecord) { - if key == "" { - return - } - if existing, ok := records[key]; ok { - if candidate.lastTime.After(existing.lastTime) { - *existing = candidate - } - return - } - record := candidate - records[key] = &record - } - - now := time.Now() - - for _, rollup := range rollups { - if rollup.LastSuccessAt == nil || rollup.LastSuccessAt.IsZero() { - continue - } - - lastTime := rollup.LastSuccessAt.UTC() - providers := append([]recovery.Provider(nil), rollup.Providers...) - - source := "Recovery" - if slicesContainsProvider(providers, recovery.ProviderProxmoxPMG) { - source = "PMG" - } else if slicesContainsProvider(providers, recovery.ProviderProxmoxPBS) { - source = "PBS" - } else if slicesContainsProvider(providers, recovery.ProviderProxmoxPVE) { - source = "PVE" - } - - var ( - info GuestLookup - key string - displayName string - instance string - node string - vmID string - subjectType string - ) - - ref := rollup.SubjectRef - if ref != nil { - subjectType = normalizeBackupGuestType(ref.Type) - } - - // Primary: subjectRef.ID is the canonical proxmox guest source ID (instance:node:vmid) when linked. - if ref != nil && strings.TrimSpace(ref.ID) != "" { - if inst, nd, vmid, ok := parseGuestID(ref.ID); ok { - key = BuildGuestKey(inst, nd, vmid) - info = guestsByKey[key] - instance = inst - node = nd - vmID = strconv.Itoa(vmid) - } - } - - // Secondary: attempt to map by VMID for orphaned/ambiguous backups. - if key == "" && ref != nil { - vmidStr := strings.TrimSpace(ref.ID) - if vmidStr == "" { - vmidStr = strings.TrimSpace(ref.Name) - } - if vmidStr != "" { - if vmid, err := strconv.Atoi(vmidStr); err == nil && vmid > 0 { - vmID = vmidStr - guests := guestsByVMID[vmidStr] - if len(guests) == 1 { - info = guests[0] - } else if len(guests) > 1 && strings.TrimSpace(ref.Namespace) != "" { - for _, g := range guests { - if namespaceMatchesInstance(ref.Namespace, g.Instance) { - info = g - break - } - } - } - if info.Instance != "" && info.Node != "" { - key = BuildGuestKey(info.Instance, info.Node, info.VMID) - instance = info.Instance - node = info.Node - } - } - } - } - - if key == "" { - // Stable fallback for non-guest subjects and orphans. - key = strings.TrimSpace(rollup.RollupID) - if key == "" { - continue - } - } - - displayName = strings.TrimSpace(info.Name) - if displayName == "" && ref != nil { - displayName = strings.TrimSpace(ref.Name) - } - if displayName == "" && vmID != "" { - displayName = fmt.Sprintf("VMID %s", vmID) - } - if displayName == "" { - displayName = "Unknown" - } - - updateRecord(key, backupRecord{ - key: key, - vmID: vmID, - lookup: info, - fallbackName: displayName, - instance: instance, - node: node, - subjectType: subjectType, - source: source, - rollupID: strings.TrimSpace(rollup.RollupID), - providers: providers, - lastTime: lastTime, - }) - } - - if len(records) == 0 { - m.clearBackupAlerts() - return - } - - validAlerts := make(map[string]struct{}) - - for key, record := range records { - age := now.Sub(record.lastTime) - if age < 0 { - continue - } - - ageDays := age.Hours() / 24 - if ageDays < 0 { - continue - } - ageDaysRounded := math.Round(ageDays*10) / 10 - - // Determine thresholds for this backup - currentBackupCfg := backupCfg - guestContext := guestSnapshotFromLookup(record.lookup, record.fallbackName) - guestResourceID := strings.TrimSpace(record.lookup.ResourceID) - if guestResourceID == "" { - guestResourceID = guestContext.ID - } - if guestResourceID != "" { - m.mu.RLock() - gh := m.getGuestThresholds(guestContext, guestResourceID) - m.mu.RUnlock() - if gh.Disabled { - continue - } - if gh.Backup != nil { - currentBackupCfg = *gh.Backup - } - } - - currentBackupCfg.AlertOrphaned = backupCfg.AlertOrphaned - currentBackupCfg.IgnoreVMIDs = backupCfg.IgnoreVMIDs - - if backupIgnoreVMID(record.vmID, currentBackupCfg.IgnoreVMIDs) { - continue - } - if record.vmID != "" && record.lookup.ResourceID == "" { - if backupMatchesKnownPVETemplate(inventoryScope, *record) { - continue - } - if !backupOrphanInventoryReady(inventoryScope, *record) { - continue - } - if currentBackupCfg.AlertOrphaned != nil && !*currentBackupCfg.AlertOrphaned { - continue - } - } - - if !currentBackupCfg.Enabled { - continue - } - - var threshold int - switch { - case currentBackupCfg.CriticalDays > 0 && ageDays >= float64(currentBackupCfg.CriticalDays): - threshold = currentBackupCfg.CriticalDays - case currentBackupCfg.WarningDays > 0 && ageDays >= float64(currentBackupCfg.WarningDays): - threshold = currentBackupCfg.WarningDays - default: - continue - } - - alertKey := sanitizeAlertKey(key) - alertID := fmt.Sprintf("backup-age-%s", alertKey) - - displayName := record.lookup.Name - if displayName == "" { - displayName = record.fallbackName - } - if displayName == "" { - displayName = "Unknown guest" - } - - node := record.node - if node == "" { - node = record.lookup.Node - } - instance := record.instance - if instance == "" { - instance = record.lookup.Instance - } - - thresholdTime := record.lastTime.Add(time.Duration(threshold) * 24 * time.Hour) - if thresholdTime.After(now) { - thresholdTime = now - } - - var sourceLabel string - sourceLabel = record.source - if len(record.providers) > 0 { - parts := make([]string, 0, len(record.providers)) - for _, p := range record.providers { - if s := strings.TrimSpace(string(p)); s != "" { - parts = append(parts, s) - } - } - if len(parts) > 0 { - sourceLabel = strings.Join(parts, ", ") - } - } - - message := fmt.Sprintf( - "%s backup via %s is %.1f days old (threshold: %d days)", - displayName, - sourceLabel, - ageDaysRounded, - threshold, - ) - - metadata := map[string]interface{}{ - "source": record.source, - "providers": record.providers, - "rollupId": record.rollupID, - "lastBackupTime": record.lastTime, - "ageDays": ageDays, - "thresholdDays": threshold, - "guestName": displayName, - "guestType": record.lookup.Type, - "guestInstance": instance, - "guestNode": node, - "guestVmid": metadataIntValue(record.vmID), - "orphaned": record.vmID != "" && guestResourceID == "", - } - specResourceID := canonicalBackupSubjectResourceID(alertKey, *record) - specResourceType := canonicalBackupSubjectResourceType(*record) - spec, err := buildCanonicalPostureThresholdSpec( - specResourceID+"-backup-age", - specResourceID, - displayName+" backup", - specResourceType, - "backup-age-days", - float64(currentBackupCfg.WarningDays), - float64(currentBackupCfg.CriticalDays), - "", - 0, - 0, - false, - ) - if err != nil { - log.Warn(). - Err(err). - Str("alertID", alertID). - Str("resourceID", specResourceID). - Msg("Skipping invalid canonical backup posture spec") - continue - } - validAlerts[canonicalTrackingKeyForSpec(spec, alertID)] = struct{}{} - - result, _ := m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{ - Spec: spec, - Evidence: alertspecs.AlertEvidence{ - ObservedAt: now, - PostureThreshold: &alertspecs.PostureThresholdEvidence{ - AgeMetric: "backup-age-days", - AgeValue: ageDays, - }, - }, - AlertID: alertID, - AlertType: "backup-age", - ResourceID: spec.ResourceID, - ResourceName: fmt.Sprintf("%s backup", displayName), - Node: node, - Instance: instance, - Value: ageDays, - Threshold: float64(threshold), - StartTimeOverride: thresholdTime, - Metadata: metadata, - AddToRecent: true, - AddToHistory: true, - RateLimit: true, - DispatchAsync: true, - MessageBuilder: func(result alertspecs.EvaluationResult) (string, float64, float64) { - return message, ageDays, float64(threshold) - }, - }) - if result.Transition != nil && result.Transition.Kind == alertspecs.EvaluationTransitionActivated { - asyncSaveActiveAlerts("backup", m.SaveActiveAlerts) - } - } - - m.mu.Lock() - for storageKey, alert := range m.activeAlerts { - if alert == nil || alert.Type != "backup-age" { - continue - } - if _, ok := validAlerts[storageKey]; ok { - continue - } - m.clearAlertNoLock(storageKey) - } - m.mu.Unlock() -} - -func slicesContainsProvider(providers []recovery.Provider, target recovery.Provider) bool { - for _, p := range providers { - if p == target { - return true - } - } - return false -} - -func parseGuestID(raw string) (instance string, node string, vmid int, ok bool) { - raw = strings.TrimSpace(raw) - if raw == "" { - return "", "", 0, false - } - parts := strings.Split(raw, ":") - if len(parts) < 3 { - return "", "", 0, false - } - last := parts[len(parts)-1] - prev := parts[len(parts)-2] - inst := strings.Join(parts[:len(parts)-2], ":") - n, err := strconv.Atoi(strings.TrimSpace(last)) - if err != nil || n <= 0 { - return "", "", 0, false - } - return strings.TrimSpace(inst), strings.TrimSpace(prev), n, true -} - // clearAlert removes an alert if it exists func (m *Manager) clearAlert(alertID string) { m.mu.Lock() @@ -3132,55 +2253,6 @@ func abs(x float64) float64 { return x } -// namespaceMatchesInstance checks if a PBS namespace likely corresponds to a PVE instance. -// This helps disambiguate backups when multiple PVE instances have VMs with the same VMID. -// Examples: namespace "pve1" matches instance "pve1", namespace "nat" matches instance "pve-nat" -func namespaceMatchesInstance(namespace, instance string) bool { - if namespace == "" || instance == "" { - return false - } - - // Normalize both strings: lowercase and keep only alphanumeric - normalize := func(s string) string { - var b strings.Builder - for _, r := range strings.ToLower(s) { - if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') { - b.WriteRune(r) - } - } - return b.String() - } - - ns := normalize(namespace) - inst := normalize(instance) - - if ns == "" || inst == "" { - return false - } - - // Exact match after normalization - if ns == inst { - return true - } - - // Check if namespace is a suffix of instance - // e.g., namespace "nat" matches instance "pvenat" (normalized from "pve-nat") - // This is more precise than substring matching because: - // - "nat" should match "pve-nat" but not "natpve" - // - "pve" should match "pve" but not "pve-nat" (handled by exact match above) - if strings.HasSuffix(inst, ns) { - return true - } - - // Check if instance is a suffix of namespace (reverse case) - // e.g., namespace "pvebackups" could match instance "pve" - if strings.HasSuffix(ns, inst) { - return true - } - - return false -} - // AcknowledgeAlert acknowledges an alert func (m *Manager) AcknowledgeAlert(alertID, user string) error { m.mu.Lock() @@ -4989,38 +4061,3 @@ func (m *Manager) clearActiveAlertIfPresentNoLock(alertID string) bool { m.clearAlertNoLock(alertID) return true } - -func (m *Manager) clearSnapshotAlertsForInstance(instance string) { - m.mu.Lock() - m.clearSnapshotAlertsForInstanceLocked(instance) - m.mu.Unlock() -} - -func (m *Manager) clearSnapshotAlertsForInstanceLocked(instance string) { - for storageKey, alert := range m.activeAlerts { - alertID := effectiveAlertID(alert, storageKey) - if alert == nil || alert.Type != "snapshot-age" { - continue - } - if instance != "" && alert.Instance != instance { - continue - } - m.clearAlertNoLock(alertID) - } -} - -func (m *Manager) clearBackupAlerts() { - m.mu.Lock() - m.clearBackupAlertsLocked() - m.mu.Unlock() -} - -func (m *Manager) clearBackupAlertsLocked() { - for storageKey, alert := range m.activeAlerts { - alertID := effectiveAlertID(alert, storageKey) - if alert == nil || alert.Type != "backup-age" { - continue - } - m.clearAlertNoLock(alertID) - } -} diff --git a/internal/alerts/alerts_test.go b/internal/alerts/alerts_test.go index 8e791ee31..e916c468d 100644 --- a/internal/alerts/alerts_test.go +++ b/internal/alerts/alerts_test.go @@ -325,6 +325,13 @@ func TestHostResourceIDTrimsID(t *testing.T) { } } +func TestBuildBackupPVETemplateSubjectKeyTrimsParts(t *testing.T) { + got := BuildBackupPVETemplateSubjectKey(" inst ", " VM ", " node ", 702) + if got != "inst\x00qemu\x00node\x00702" { + t.Fatalf("BuildBackupPVETemplateSubjectKey() = %q, want %q", got, "inst\x00qemu\x00node\x00702") + } +} + func TestQuietHoursCategoryForResourceIncidentUsesIncidentCategoryMetadata(t *testing.T) { availability := &Alert{ Type: "resource-incident", diff --git a/internal/alerts/backup_snapshot.go b/internal/alerts/backup_snapshot.go new file mode 100644 index 000000000..1e4133cc7 --- /dev/null +++ b/internal/alerts/backup_snapshot.go @@ -0,0 +1,976 @@ +package alerts + +import ( + "fmt" + "math" + "strconv" + "strings" + "time" + + alertspecs "github.com/rcourtman/pulse-go-rewrite/internal/alerts/specs" + "github.com/rcourtman/pulse-go-rewrite/internal/models" + "github.com/rcourtman/pulse-go-rewrite/internal/recovery" + "github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources" + "github.com/rs/zerolog/log" +) + +func backupIgnoreVMID(vmID string, ignoreList []string) bool { + if vmID == "" || len(ignoreList) == 0 { + return false + } + for _, entry := range ignoreList { + value := strings.TrimSpace(entry) + if value == "" { + continue + } + if strings.HasSuffix(value, "*") { + prefix := strings.TrimSuffix(value, "*") + if prefix != "" && strings.HasPrefix(vmID, prefix) { + return true + } + continue + } + if vmID == value { + return true + } + } + return false +} + +func (m *Manager) resolvedSnapshotAlertConfigNoLock(thresholds ThresholdConfig) SnapshotAlertConfig { + cfg := m.config.SnapshotDefaults + if thresholds.Snapshot != nil { + cfg = *thresholds.Snapshot + } + return cfg +} + +func (m *Manager) resolvedBackupAlertConfigNoLock(thresholds ThresholdConfig) BackupAlertConfig { + cfg := m.config.BackupDefaults + if thresholds.Backup != nil { + cfg = *thresholds.Backup + } + if cfg.AlertOrphaned == nil { + alertOrphaned := true + cfg.AlertOrphaned = &alertOrphaned + } + return cfg +} + +func snapshotAlertStillTriggered(alert *Alert, cfg SnapshotAlertConfig) bool { + if alert == nil || !cfg.Enabled { + return false + } + + ageValue, _ := metadataFloatValue(alert.Metadata, "snapshotAgeDays") + sizeValue, _ := metadataFloatValue(alert.Metadata, "snapshotSizeGiB") + + if cfg.CriticalDays > 0 && ageValue >= float64(cfg.CriticalDays) { + return true + } + if cfg.WarningDays > 0 && ageValue >= float64(cfg.WarningDays) { + return true + } + if cfg.CriticalSizeGiB > 0 && sizeValue >= cfg.CriticalSizeGiB { + return true + } + if cfg.WarningSizeGiB > 0 && sizeValue >= cfg.WarningSizeGiB { + return true + } + + return false +} + +func backupAlertStillTriggered(alert *Alert, cfg BackupAlertConfig) bool { + if alert == nil || !cfg.Enabled { + return false + } + + vmid := metadataStringValue(alert.Metadata, "guestVmid") + if vmid == "" { + if parsed := metadataIntValue(alert.Metadata["guestVmid"]); parsed > 0 { + vmid = strconv.Itoa(parsed) + } + } + if backupIgnoreVMID(vmid, cfg.IgnoreVMIDs) { + return false + } + if metadataBoolValue(alert.Metadata, "orphaned") && cfg.AlertOrphaned != nil && !*cfg.AlertOrphaned { + return false + } + + ageValue, ok := metadataFloatValue(alert.Metadata, "ageDays") + if !ok { + ageValue = alert.Value + } + + if cfg.CriticalDays > 0 && ageValue >= float64(cfg.CriticalDays) { + return true + } + if cfg.WarningDays > 0 && ageValue >= float64(cfg.WarningDays) { + return true + } + + return false +} + +// BuildGuestKey constructs a unique key for a guest from instance, node, and VMID. +// Uses the canonical format: instance:node:vmid +// This matches the format used by makeGuestID in the monitoring package. +func BuildGuestKey(instance, node string, vmID int) string { + instance = strings.TrimSpace(instance) + node = strings.TrimSpace(node) + if instance == "" { + instance = node + } + return fmt.Sprintf("%s:%s:%d", instance, node, vmID) +} + +type backupRecord struct { + key string + vmID string + lookup GuestLookup + fallbackName string + instance string + node string + subjectType string + source string + rollupID string + providers []recovery.Provider + lastTime time.Time +} + +// BackupInventoryScope carries monitoring-owned inventory readiness into backup +// alert evaluation. It keeps orphan detection from racing ahead of Proxmox +// guest/template discovery while preserving the direct CheckBackups API for +// unit tests and non-monitoring callers. +type BackupInventoryScope struct { + PVEOrphanInventoryReady map[string]map[string]bool + PVETemplateSubjects map[string]struct{} +} + +func BuildBackupPVETemplateSubjectKey(instance, guestType, node string, vmid int) string { + instance = strings.TrimSpace(instance) + guestType = normalizeBackupGuestType(guestType) + node = strings.TrimSpace(node) + if instance == "" || guestType == "" || node == "" || vmid <= 0 { + return "" + } + return strings.Join([]string{instance, guestType, node, strconv.Itoa(vmid)}, "\x00") +} + +func normalizeBackupGuestType(guestType string) string { + switch strings.ToLower(strings.TrimSpace(guestType)) { + case "qemu", "vm", "proxmox-vm": + return "qemu" + case "lxc", "ct", "container", "system-container", "proxmox-lxc": + return "lxc" + default: + return strings.ToLower(strings.TrimSpace(guestType)) + } +} + +func backupOrphanInventoryReady(scope *BackupInventoryScope, record backupRecord) bool { + if scope == nil || scope.PVEOrphanInventoryReady == nil { + return true + } + if record.source != "PVE" { + return true + } + instance := strings.TrimSpace(record.instance) + guestType := normalizeBackupGuestType(record.subjectType) + if instance == "" || guestType == "" { + return false + } + return scope.PVEOrphanInventoryReady[instance][guestType] +} + +func backupMatchesKnownPVETemplate(scope *BackupInventoryScope, record backupRecord) bool { + if scope == nil || len(scope.PVETemplateSubjects) == 0 || record.source != "PVE" { + return false + } + vmid, err := strconv.Atoi(strings.TrimSpace(record.vmID)) + if err != nil || vmid <= 0 { + return false + } + key := BuildBackupPVETemplateSubjectKey(record.instance, record.subjectType, record.node, vmid) + if key == "" { + return false + } + _, exists := scope.PVETemplateSubjects[key] + return exists +} + +func canonicalGuestResourceType(guestType string) unifiedresources.ResourceType { + switch strings.ToLower(strings.TrimSpace(guestType)) { + case "lxc": + return unifiedresources.ResourceTypeSystemContainer + default: + return unifiedresources.ResourceTypeVM + } +} + +func canonicalBackupSubjectResourceType(record backupRecord) unifiedresources.ResourceType { + if record.lookup.Type != "" { + return canonicalGuestResourceType(record.lookup.Type) + } + switch normalizeBackupGuestType(record.subjectType) { + case "lxc": + return unifiedresources.ResourceTypeSystemContainer + case "qemu": + return unifiedresources.ResourceTypeVM + } + if strings.TrimSpace(record.vmID) != "" { + return unifiedresources.ResourceTypeVM + } + return unifiedresources.ResourceType("backup-subject") +} + +func canonicalBackupSubjectResourceID(alertKey string, record backupRecord) string { + if record.instance != "" && record.node != "" && record.vmID != "" { + if vmid, err := strconv.Atoi(record.vmID); err == nil && vmid > 0 { + return BuildGuestKey(record.instance, record.node, vmid) + } + } + return "backup-subject:" + sanitizeAlertKey(alertKey) +} + +// CheckSnapshotsForInstance evaluates guest snapshots for age-based alerts. +func (m *Manager) CheckSnapshotsForInstance(instanceName string, snapshots []models.GuestSnapshot, guestNames map[string]string) { + m.mu.RLock() + enabled := m.config.Enabled + snapshotCfg := m.config.SnapshotDefaults + m.mu.RUnlock() + + if !enabled { + return + } + + if !snapshotCfg.Enabled { + m.clearSnapshotAlertsForInstance(instanceName) + return + } + + now := time.Now() + validAlerts := make(map[string]struct{}) + + for _, snapshot := range snapshots { + if instanceName != "" && snapshot.Instance != "" && snapshot.Instance != instanceName { + continue + } + if snapshot.Time.IsZero() { + continue + } + + ageHours := now.Sub(snapshot.Time).Hours() + if ageHours < 0 { + continue + } + ageDays := ageHours / 24 + + const gib = 1024.0 * 1024 * 1024 + sizeGiB := 0.0 + if snapshot.SizeBytes > 0 { + sizeGiB = float64(snapshot.SizeBytes) / gib + } + + // Determine thresholds for this snapshot + resourceID := fmt.Sprintf("%s:%s:%d", snapshot.Instance, snapshot.Node, snapshot.VMID) + guestName := strings.TrimSpace(guestNames[BuildGuestKey(snapshot.Instance, snapshot.Node, snapshot.VMID)]) + guestContext := guestSnapshotFromIdentity(resourceID, guestName, snapshot.Node, snapshot.Instance, snapshot.Type, "") + m.mu.RLock() + gh := m.getGuestThresholds(guestContext, resourceID) + m.mu.RUnlock() + + if gh.Disabled { + continue + } + + currentSnapshotCfg := snapshotCfg + if gh.Snapshot != nil { + currentSnapshotCfg = *gh.Snapshot + } + + if !currentSnapshotCfg.Enabled { + continue + } + + var ageLevel AlertLevel + var ageThreshold int + var sizeLevel AlertLevel + var sizeThreshold float64 + var triggeredStats []string + + if currentSnapshotCfg.CriticalDays > 0 && ageDays >= float64(currentSnapshotCfg.CriticalDays) { + ageLevel = AlertLevelCritical + ageThreshold = currentSnapshotCfg.CriticalDays + triggeredStats = append(triggeredStats, "age") + } else if currentSnapshotCfg.WarningDays > 0 && ageDays >= float64(currentSnapshotCfg.WarningDays) { + ageLevel = AlertLevelWarning + ageThreshold = currentSnapshotCfg.WarningDays + triggeredStats = append(triggeredStats, "age") + } + + if snapshot.SizeBytes > 0 { + if currentSnapshotCfg.CriticalSizeGiB > 0 && sizeGiB >= currentSnapshotCfg.CriticalSizeGiB { + sizeLevel = AlertLevelCritical + sizeThreshold = currentSnapshotCfg.CriticalSizeGiB + triggeredStats = append(triggeredStats, "size") + } else if currentSnapshotCfg.WarningSizeGiB > 0 && sizeGiB >= currentSnapshotCfg.WarningSizeGiB { + sizeLevel = AlertLevelWarning + sizeThreshold = currentSnapshotCfg.WarningSizeGiB + triggeredStats = append(triggeredStats, "size") + } + } + + if ageLevel == "" && sizeLevel == "" { + continue + } + + useSizePrimary := false + if sizeLevel == AlertLevelCritical && ageLevel != AlertLevelCritical { + useSizePrimary = true + } else if sizeLevel != "" && ageLevel == "" { + useSizePrimary = true + } + + alertID := fmt.Sprintf("snapshot-age-%s", snapshot.ID) + + guestKey := BuildGuestKey(snapshot.Instance, snapshot.Node, snapshot.VMID) + + guestType := "VM" + if strings.EqualFold(snapshot.Type, "lxc") { + guestType = "Container" + } + + if guestName == "" { + switch guestType { + case "Container": + guestName = fmt.Sprintf("CT %d", snapshot.VMID) + default: + guestName = fmt.Sprintf("VM %d", snapshot.VMID) + } + } + + snapshotName := strings.TrimSpace(snapshot.Name) + if snapshotName == "" { + snapshotName = "(unnamed)" + } + + ageDaysRounded := math.Round(ageDays*10) / 10 + sizeGiBRounded := math.Round(sizeGiB*10) / 10 + reasons := make([]string, 0, 2) + if ageLevel != "" { + reasons = append(reasons, fmt.Sprintf("%.1f days old (threshold %d days)", ageDaysRounded, ageThreshold)) + } + if sizeLevel != "" { + reasons = append(reasons, fmt.Sprintf("%.1f GiB (threshold %.1f GiB)", sizeGiBRounded, sizeThreshold)) + } + reasonText := strings.Join(reasons, " and ") + message := fmt.Sprintf( + "%s snapshot '%s' for %s is %s on %s", + guestType, + snapshotName, + guestName, + reasonText, + snapshot.Node, + ) + + alertValue := ageDays + alertThreshold := float64(ageThreshold) + thresholdTime := now + if useSizePrimary { + alertValue = sizeGiB + alertThreshold = sizeThreshold + } else if ageThreshold > 0 { + thresholdTime = snapshot.Time.Add(time.Duration(ageThreshold) * 24 * time.Hour) + if thresholdTime.After(now) { + thresholdTime = now + } + } + + metadata := map[string]interface{}{ + "snapshotName": snapshot.Name, + "snapshotCreatedAt": snapshot.Time, + "snapshotAgeDays": ageDays, + "snapshotAgeHours": ageHours, + "snapshotSizeBytes": snapshot.SizeBytes, + "snapshotSizeGiB": sizeGiB, + "guestName": guestName, + "guestType": guestType, + "guestInstance": snapshot.Instance, + "guestNode": snapshot.Node, + "guestVmid": snapshot.VMID, + "triggeredMetrics": triggeredStats, + "primaryMetric": "age", + } + if useSizePrimary { + metadata["primaryMetric"] = "size" + } + if ageLevel != "" { + metadata["thresholdDays"] = ageThreshold + } + if sizeLevel != "" { + metadata["thresholdSizeGiB"] = sizeThreshold + } + + resourceName := fmt.Sprintf("%s snapshot '%s'", guestName, snapshotName) + guestResourceType := canonicalGuestResourceType(snapshot.Type) + guestResourceID := guestKey + sizeMetric := "" + var sizeValue *float64 + if currentSnapshotCfg.WarningSizeGiB > 0 || currentSnapshotCfg.CriticalSizeGiB > 0 { + sizeMetric = "snapshot-size-gib" + sizeValue = &sizeGiB + } + ageMetric := "" + if currentSnapshotCfg.WarningDays > 0 || currentSnapshotCfg.CriticalDays > 0 { + ageMetric = "snapshot-age-days" + } + + spec, err := buildCanonicalPostureThresholdSpec( + guestResourceID+"/snapshot:"+snapshot.ID, + guestResourceID, + resourceName, + guestResourceType, + ageMetric, + float64(currentSnapshotCfg.WarningDays), + float64(currentSnapshotCfg.CriticalDays), + sizeMetric, + currentSnapshotCfg.WarningSizeGiB, + currentSnapshotCfg.CriticalSizeGiB, + false, + ) + if err != nil { + log.Warn(). + Err(err). + Str("snapshotID", snapshot.ID). + Str("resourceID", guestResourceID). + Msg("Skipping invalid canonical snapshot posture spec") + continue + } + validAlerts[canonicalTrackingKeyForSpec(spec, alertID)] = struct{}{} + + result, _ := m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{ + Spec: spec, + Evidence: alertspecs.AlertEvidence{ + ObservedAt: now, + PostureThreshold: &alertspecs.PostureThresholdEvidence{ + AgeMetric: ageMetric, + AgeValue: ageDays, + SizeMetric: sizeMetric, + SizeValue: sizeValue, + }, + }, + AlertID: alertID, + AlertType: "snapshot-age", + ResourceID: spec.ResourceID, + ResourceName: resourceName, + Node: snapshot.Node, + Instance: snapshot.Instance, + Value: alertValue, + Threshold: alertThreshold, + StartTimeOverride: thresholdTime, + Metadata: metadata, + AddToRecent: true, + AddToHistory: true, + RateLimit: true, + DispatchAsync: true, + MessageBuilder: func(result alertspecs.EvaluationResult) (string, float64, float64) { + return message, alertValue, alertThreshold + }, + }) + if result.Transition != nil && result.Transition.Kind == alertspecs.EvaluationTransitionActivated { + asyncSaveActiveAlerts("snapshot", m.SaveActiveAlerts) + } + } + + m.mu.Lock() + for storageKey, alert := range m.activeAlerts { + if alert == nil || alert.Type != "snapshot-age" { + continue + } + if instanceName != "" && alert.Instance != instanceName { + continue + } + if _, ok := validAlerts[storageKey]; ok { + continue + } + m.clearAlertNoLock(storageKey) + } + m.mu.Unlock() +} + +// CheckBackups evaluates storage, PBS, and PMG backups for age-based alerts. +func (m *Manager) CheckBackups( + rollups []recovery.ProtectionRollup, + guestsByKey map[string]GuestLookup, + guestsByVMID map[string][]GuestLookup, +) { + m.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, nil) +} + +// CheckBackupsWithInventory evaluates backup rollups with optional monitoring +// inventory readiness for orphan detection. +func (m *Manager) CheckBackupsWithInventory( + rollups []recovery.ProtectionRollup, + guestsByKey map[string]GuestLookup, + guestsByVMID map[string][]GuestLookup, + inventoryScope *BackupInventoryScope, +) { + m.mu.RLock() + enabled := m.config.Enabled + backupCfg := m.config.BackupDefaults + m.mu.RUnlock() + + if backupCfg.AlertOrphaned == nil { + alertOrphaned := true + backupCfg.AlertOrphaned = &alertOrphaned + } + + if !enabled || !backupCfg.Enabled { + m.clearBackupAlerts() + return + } + + if backupCfg.WarningDays <= 0 && backupCfg.CriticalDays <= 0 { + m.clearBackupAlerts() + return + } + + records := make(map[string]*backupRecord) + + updateRecord := func(key string, candidate backupRecord) { + if key == "" { + return + } + if existing, ok := records[key]; ok { + if candidate.lastTime.After(existing.lastTime) { + *existing = candidate + } + return + } + record := candidate + records[key] = &record + } + + now := time.Now() + + for _, rollup := range rollups { + if rollup.LastSuccessAt == nil || rollup.LastSuccessAt.IsZero() { + continue + } + + lastTime := rollup.LastSuccessAt.UTC() + providers := append([]recovery.Provider(nil), rollup.Providers...) + + source := "Recovery" + if slicesContainsProvider(providers, recovery.ProviderProxmoxPMG) { + source = "PMG" + } else if slicesContainsProvider(providers, recovery.ProviderProxmoxPBS) { + source = "PBS" + } else if slicesContainsProvider(providers, recovery.ProviderProxmoxPVE) { + source = "PVE" + } + + var ( + info GuestLookup + key string + displayName string + instance string + node string + vmID string + subjectType string + ) + + ref := rollup.SubjectRef + if ref != nil { + subjectType = normalizeBackupGuestType(ref.Type) + } + + // Primary: subjectRef.ID is the canonical proxmox guest source ID (instance:node:vmid) when linked. + if ref != nil && strings.TrimSpace(ref.ID) != "" { + if inst, nd, vmid, ok := parseGuestID(ref.ID); ok { + key = BuildGuestKey(inst, nd, vmid) + info = guestsByKey[key] + instance = inst + node = nd + vmID = strconv.Itoa(vmid) + } + } + + // Secondary: attempt to map by VMID for orphaned/ambiguous backups. + if key == "" && ref != nil { + vmidStr := strings.TrimSpace(ref.ID) + if vmidStr == "" { + vmidStr = strings.TrimSpace(ref.Name) + } + if vmidStr != "" { + if vmid, err := strconv.Atoi(vmidStr); err == nil && vmid > 0 { + vmID = vmidStr + guests := guestsByVMID[vmidStr] + if len(guests) == 1 { + info = guests[0] + } else if len(guests) > 1 && strings.TrimSpace(ref.Namespace) != "" { + for _, g := range guests { + if namespaceMatchesInstance(ref.Namespace, g.Instance) { + info = g + break + } + } + } + if info.Instance != "" && info.Node != "" { + key = BuildGuestKey(info.Instance, info.Node, info.VMID) + instance = info.Instance + node = info.Node + } + } + } + } + + if key == "" { + // Stable fallback for non-guest subjects and orphans. + key = strings.TrimSpace(rollup.RollupID) + if key == "" { + continue + } + } + + displayName = strings.TrimSpace(info.Name) + if displayName == "" && ref != nil { + displayName = strings.TrimSpace(ref.Name) + } + if displayName == "" && vmID != "" { + displayName = fmt.Sprintf("VMID %s", vmID) + } + if displayName == "" { + displayName = "Unknown" + } + + updateRecord(key, backupRecord{ + key: key, + vmID: vmID, + lookup: info, + fallbackName: displayName, + instance: instance, + node: node, + subjectType: subjectType, + source: source, + rollupID: strings.TrimSpace(rollup.RollupID), + providers: providers, + lastTime: lastTime, + }) + } + + if len(records) == 0 { + m.clearBackupAlerts() + return + } + + validAlerts := make(map[string]struct{}) + + for key, record := range records { + age := now.Sub(record.lastTime) + if age < 0 { + continue + } + + ageDays := age.Hours() / 24 + if ageDays < 0 { + continue + } + ageDaysRounded := math.Round(ageDays*10) / 10 + + // Determine thresholds for this backup + currentBackupCfg := backupCfg + guestContext := guestSnapshotFromLookup(record.lookup, record.fallbackName) + guestResourceID := strings.TrimSpace(record.lookup.ResourceID) + if guestResourceID == "" { + guestResourceID = guestContext.ID + } + if guestResourceID != "" { + m.mu.RLock() + gh := m.getGuestThresholds(guestContext, guestResourceID) + m.mu.RUnlock() + if gh.Disabled { + continue + } + if gh.Backup != nil { + currentBackupCfg = *gh.Backup + } + } + + currentBackupCfg.AlertOrphaned = backupCfg.AlertOrphaned + currentBackupCfg.IgnoreVMIDs = backupCfg.IgnoreVMIDs + + if backupIgnoreVMID(record.vmID, currentBackupCfg.IgnoreVMIDs) { + continue + } + if record.vmID != "" && record.lookup.ResourceID == "" { + if backupMatchesKnownPVETemplate(inventoryScope, *record) { + continue + } + if !backupOrphanInventoryReady(inventoryScope, *record) { + continue + } + if currentBackupCfg.AlertOrphaned != nil && !*currentBackupCfg.AlertOrphaned { + continue + } + } + + if !currentBackupCfg.Enabled { + continue + } + + var threshold int + switch { + case currentBackupCfg.CriticalDays > 0 && ageDays >= float64(currentBackupCfg.CriticalDays): + threshold = currentBackupCfg.CriticalDays + case currentBackupCfg.WarningDays > 0 && ageDays >= float64(currentBackupCfg.WarningDays): + threshold = currentBackupCfg.WarningDays + default: + continue + } + + alertKey := sanitizeAlertKey(key) + alertID := fmt.Sprintf("backup-age-%s", alertKey) + + displayName := record.lookup.Name + if displayName == "" { + displayName = record.fallbackName + } + if displayName == "" { + displayName = "Unknown guest" + } + + node := record.node + if node == "" { + node = record.lookup.Node + } + instance := record.instance + if instance == "" { + instance = record.lookup.Instance + } + + thresholdTime := record.lastTime.Add(time.Duration(threshold) * 24 * time.Hour) + if thresholdTime.After(now) { + thresholdTime = now + } + + var sourceLabel string + sourceLabel = record.source + if len(record.providers) > 0 { + parts := make([]string, 0, len(record.providers)) + for _, p := range record.providers { + if s := strings.TrimSpace(string(p)); s != "" { + parts = append(parts, s) + } + } + if len(parts) > 0 { + sourceLabel = strings.Join(parts, ", ") + } + } + + message := fmt.Sprintf( + "%s backup via %s is %.1f days old (threshold: %d days)", + displayName, + sourceLabel, + ageDaysRounded, + threshold, + ) + + metadata := map[string]interface{}{ + "source": record.source, + "providers": record.providers, + "rollupId": record.rollupID, + "lastBackupTime": record.lastTime, + "ageDays": ageDays, + "thresholdDays": threshold, + "guestName": displayName, + "guestType": record.lookup.Type, + "guestInstance": instance, + "guestNode": node, + "guestVmid": metadataIntValue(record.vmID), + "orphaned": record.vmID != "" && guestResourceID == "", + } + specResourceID := canonicalBackupSubjectResourceID(alertKey, *record) + specResourceType := canonicalBackupSubjectResourceType(*record) + spec, err := buildCanonicalPostureThresholdSpec( + specResourceID+"-backup-age", + specResourceID, + displayName+" backup", + specResourceType, + "backup-age-days", + float64(currentBackupCfg.WarningDays), + float64(currentBackupCfg.CriticalDays), + "", + 0, + 0, + false, + ) + if err != nil { + log.Warn(). + Err(err). + Str("alertID", alertID). + Str("resourceID", specResourceID). + Msg("Skipping invalid canonical backup posture spec") + continue + } + validAlerts[canonicalTrackingKeyForSpec(spec, alertID)] = struct{}{} + + result, _ := m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{ + Spec: spec, + Evidence: alertspecs.AlertEvidence{ + ObservedAt: now, + PostureThreshold: &alertspecs.PostureThresholdEvidence{ + AgeMetric: "backup-age-days", + AgeValue: ageDays, + }, + }, + AlertID: alertID, + AlertType: "backup-age", + ResourceID: spec.ResourceID, + ResourceName: fmt.Sprintf("%s backup", displayName), + Node: node, + Instance: instance, + Value: ageDays, + Threshold: float64(threshold), + StartTimeOverride: thresholdTime, + Metadata: metadata, + AddToRecent: true, + AddToHistory: true, + RateLimit: true, + DispatchAsync: true, + MessageBuilder: func(result alertspecs.EvaluationResult) (string, float64, float64) { + return message, ageDays, float64(threshold) + }, + }) + if result.Transition != nil && result.Transition.Kind == alertspecs.EvaluationTransitionActivated { + asyncSaveActiveAlerts("backup", m.SaveActiveAlerts) + } + } + + m.mu.Lock() + for storageKey, alert := range m.activeAlerts { + if alert == nil || alert.Type != "backup-age" { + continue + } + if _, ok := validAlerts[storageKey]; ok { + continue + } + m.clearAlertNoLock(storageKey) + } + m.mu.Unlock() +} + +func slicesContainsProvider(providers []recovery.Provider, target recovery.Provider) bool { + for _, p := range providers { + if p == target { + return true + } + } + return false +} + +func parseGuestID(raw string) (instance string, node string, vmid int, ok bool) { + raw = strings.TrimSpace(raw) + if raw == "" { + return "", "", 0, false + } + parts := strings.Split(raw, ":") + if len(parts) < 3 { + return "", "", 0, false + } + last := parts[len(parts)-1] + prev := parts[len(parts)-2] + inst := strings.Join(parts[:len(parts)-2], ":") + n, err := strconv.Atoi(strings.TrimSpace(last)) + if err != nil || n <= 0 { + return "", "", 0, false + } + return strings.TrimSpace(inst), strings.TrimSpace(prev), n, true +} + +// namespaceMatchesInstance checks if a PBS namespace likely corresponds to a PVE instance. +// This helps disambiguate backups when multiple PVE instances have VMs with the same VMID. +// Examples: namespace "pve1" matches instance "pve1", namespace "nat" matches instance "pve-nat" +func namespaceMatchesInstance(namespace, instance string) bool { + if namespace == "" || instance == "" { + return false + } + + // Normalize both strings: lowercase and keep only alphanumeric + normalize := func(s string) string { + var b strings.Builder + for _, r := range strings.ToLower(s) { + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') { + b.WriteRune(r) + } + } + return b.String() + } + + ns := normalize(namespace) + inst := normalize(instance) + + if ns == "" || inst == "" { + return false + } + + // Exact match after normalization + if ns == inst { + return true + } + + // Check if namespace is a suffix of instance + // e.g., namespace "nat" matches instance "pvenat" (normalized from "pve-nat") + // This is more precise than substring matching because: + // - "nat" should match "pve-nat" but not "natpve" + // - "pve" should match "pve" but not "pve-nat" (handled by exact match above) + if strings.HasSuffix(inst, ns) { + return true + } + + // Check if instance is a suffix of namespace (reverse case) + // e.g., namespace "pvebackups" could match instance "pve" + if strings.HasSuffix(ns, inst) { + return true + } + + return false +} + +func (m *Manager) clearSnapshotAlertsForInstance(instance string) { + m.mu.Lock() + m.clearSnapshotAlertsForInstanceLocked(instance) + m.mu.Unlock() +} + +func (m *Manager) clearSnapshotAlertsForInstanceLocked(instance string) { + for storageKey, alert := range m.activeAlerts { + alertID := effectiveAlertID(alert, storageKey) + if alert == nil || alert.Type != "snapshot-age" { + continue + } + if instance != "" && alert.Instance != instance { + continue + } + m.clearAlertNoLock(alertID) + } +} + +func (m *Manager) clearBackupAlerts() { + m.mu.Lock() + m.clearBackupAlertsLocked() + m.mu.Unlock() +} + +func (m *Manager) clearBackupAlertsLocked() { + for storageKey, alert := range m.activeAlerts { + alertID := effectiveAlertID(alert, storageKey) + if alert == nil || alert.Type != "backup-age" { + continue + } + m.clearAlertNoLock(alertID) + } +}