mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-20 01:01:20 +00:00
refactor: split backup snapshot alerts
Move snapshot age and size evaluation, backup rollup age evaluation, inventory readiness, namespace disambiguation, template matching, and backup/snapshot cleanup into internal/alerts/backup_snapshot.go. Keep the generic async active-alert save helper in the central package because canonical metric migration still shares it, and record backup_snapshot.go as the backup/snapshot owner in the alerts subsystem contract. Proof: go test ./internal/alerts/...
This commit is contained in:
parent
8a21162f35
commit
baeef84c69
4 changed files with 990 additions and 963 deletions
|
|
@ -62,6 +62,7 @@ operator-facing alert routing behavior for live runtime alerts.
|
|||
40. `internal/alerts/storage.go`
|
||||
41. `internal/alerts/node.go`
|
||||
42. `internal/alerts/host.go`
|
||||
43. `internal/alerts/backup_snapshot.go`
|
||||
|
||||
## Shared Boundaries
|
||||
|
||||
|
|
@ -260,6 +261,12 @@ health handling, host cleanup, and host offline lifecycle handling; future host
|
|||
agent alert behavior should extend that resource checker owner while shared
|
||||
health-assessment evaluation remains package-level until all storage-health
|
||||
callers can be separated behind a narrower owner.
|
||||
Snapshot and backup-age alert evaluation now lives in
|
||||
`internal/alerts/backup_snapshot.go`. That file owns snapshot age/size
|
||||
evaluation, backup rollup age evaluation, backup inventory readiness, PVE
|
||||
template subject matching, namespace disambiguation, and snapshot/backup active
|
||||
alert cleanup; future backup or snapshot alert behavior should extend that
|
||||
owner rather than expanding the central Manager file.
|
||||
Commercial alert handoffs now follow the same shared navigation boundary.
|
||||
`frontend-modern/src/components/Alerts/InvestigateAlertButton.tsx` may resolve
|
||||
the canonical `ai_alerts` destination from the shared license/commercial
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -325,6 +325,13 @@ func TestHostResourceIDTrimsID(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestBuildBackupPVETemplateSubjectKeyTrimsParts(t *testing.T) {
|
||||
got := BuildBackupPVETemplateSubjectKey(" inst ", " VM ", " node ", 702)
|
||||
if got != "inst\x00qemu\x00node\x00702" {
|
||||
t.Fatalf("BuildBackupPVETemplateSubjectKey() = %q, want %q", got, "inst\x00qemu\x00node\x00702")
|
||||
}
|
||||
}
|
||||
|
||||
func TestQuietHoursCategoryForResourceIncidentUsesIncidentCategoryMetadata(t *testing.T) {
|
||||
availability := &Alert{
|
||||
Type: "resource-incident",
|
||||
|
|
|
|||
976
internal/alerts/backup_snapshot.go
Normal file
976
internal/alerts/backup_snapshot.go
Normal file
|
|
@ -0,0 +1,976 @@
|
|||
package alerts
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
alertspecs "github.com/rcourtman/pulse-go-rewrite/internal/alerts/specs"
|
||||
"github.com/rcourtman/pulse-go-rewrite/internal/models"
|
||||
"github.com/rcourtman/pulse-go-rewrite/internal/recovery"
|
||||
"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func backupIgnoreVMID(vmID string, ignoreList []string) bool {
|
||||
if vmID == "" || len(ignoreList) == 0 {
|
||||
return false
|
||||
}
|
||||
for _, entry := range ignoreList {
|
||||
value := strings.TrimSpace(entry)
|
||||
if value == "" {
|
||||
continue
|
||||
}
|
||||
if strings.HasSuffix(value, "*") {
|
||||
prefix := strings.TrimSuffix(value, "*")
|
||||
if prefix != "" && strings.HasPrefix(vmID, prefix) {
|
||||
return true
|
||||
}
|
||||
continue
|
||||
}
|
||||
if vmID == value {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (m *Manager) resolvedSnapshotAlertConfigNoLock(thresholds ThresholdConfig) SnapshotAlertConfig {
|
||||
cfg := m.config.SnapshotDefaults
|
||||
if thresholds.Snapshot != nil {
|
||||
cfg = *thresholds.Snapshot
|
||||
}
|
||||
return cfg
|
||||
}
|
||||
|
||||
func (m *Manager) resolvedBackupAlertConfigNoLock(thresholds ThresholdConfig) BackupAlertConfig {
|
||||
cfg := m.config.BackupDefaults
|
||||
if thresholds.Backup != nil {
|
||||
cfg = *thresholds.Backup
|
||||
}
|
||||
if cfg.AlertOrphaned == nil {
|
||||
alertOrphaned := true
|
||||
cfg.AlertOrphaned = &alertOrphaned
|
||||
}
|
||||
return cfg
|
||||
}
|
||||
|
||||
func snapshotAlertStillTriggered(alert *Alert, cfg SnapshotAlertConfig) bool {
|
||||
if alert == nil || !cfg.Enabled {
|
||||
return false
|
||||
}
|
||||
|
||||
ageValue, _ := metadataFloatValue(alert.Metadata, "snapshotAgeDays")
|
||||
sizeValue, _ := metadataFloatValue(alert.Metadata, "snapshotSizeGiB")
|
||||
|
||||
if cfg.CriticalDays > 0 && ageValue >= float64(cfg.CriticalDays) {
|
||||
return true
|
||||
}
|
||||
if cfg.WarningDays > 0 && ageValue >= float64(cfg.WarningDays) {
|
||||
return true
|
||||
}
|
||||
if cfg.CriticalSizeGiB > 0 && sizeValue >= cfg.CriticalSizeGiB {
|
||||
return true
|
||||
}
|
||||
if cfg.WarningSizeGiB > 0 && sizeValue >= cfg.WarningSizeGiB {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func backupAlertStillTriggered(alert *Alert, cfg BackupAlertConfig) bool {
|
||||
if alert == nil || !cfg.Enabled {
|
||||
return false
|
||||
}
|
||||
|
||||
vmid := metadataStringValue(alert.Metadata, "guestVmid")
|
||||
if vmid == "" {
|
||||
if parsed := metadataIntValue(alert.Metadata["guestVmid"]); parsed > 0 {
|
||||
vmid = strconv.Itoa(parsed)
|
||||
}
|
||||
}
|
||||
if backupIgnoreVMID(vmid, cfg.IgnoreVMIDs) {
|
||||
return false
|
||||
}
|
||||
if metadataBoolValue(alert.Metadata, "orphaned") && cfg.AlertOrphaned != nil && !*cfg.AlertOrphaned {
|
||||
return false
|
||||
}
|
||||
|
||||
ageValue, ok := metadataFloatValue(alert.Metadata, "ageDays")
|
||||
if !ok {
|
||||
ageValue = alert.Value
|
||||
}
|
||||
|
||||
if cfg.CriticalDays > 0 && ageValue >= float64(cfg.CriticalDays) {
|
||||
return true
|
||||
}
|
||||
if cfg.WarningDays > 0 && ageValue >= float64(cfg.WarningDays) {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// BuildGuestKey constructs a unique key for a guest from instance, node, and VMID.
|
||||
// Uses the canonical format: instance:node:vmid
|
||||
// This matches the format used by makeGuestID in the monitoring package.
|
||||
func BuildGuestKey(instance, node string, vmID int) string {
|
||||
instance = strings.TrimSpace(instance)
|
||||
node = strings.TrimSpace(node)
|
||||
if instance == "" {
|
||||
instance = node
|
||||
}
|
||||
return fmt.Sprintf("%s:%s:%d", instance, node, vmID)
|
||||
}
|
||||
|
||||
type backupRecord struct {
|
||||
key string
|
||||
vmID string
|
||||
lookup GuestLookup
|
||||
fallbackName string
|
||||
instance string
|
||||
node string
|
||||
subjectType string
|
||||
source string
|
||||
rollupID string
|
||||
providers []recovery.Provider
|
||||
lastTime time.Time
|
||||
}
|
||||
|
||||
// BackupInventoryScope carries monitoring-owned inventory readiness into backup
|
||||
// alert evaluation. It keeps orphan detection from racing ahead of Proxmox
|
||||
// guest/template discovery while preserving the direct CheckBackups API for
|
||||
// unit tests and non-monitoring callers.
|
||||
type BackupInventoryScope struct {
|
||||
PVEOrphanInventoryReady map[string]map[string]bool
|
||||
PVETemplateSubjects map[string]struct{}
|
||||
}
|
||||
|
||||
func BuildBackupPVETemplateSubjectKey(instance, guestType, node string, vmid int) string {
|
||||
instance = strings.TrimSpace(instance)
|
||||
guestType = normalizeBackupGuestType(guestType)
|
||||
node = strings.TrimSpace(node)
|
||||
if instance == "" || guestType == "" || node == "" || vmid <= 0 {
|
||||
return ""
|
||||
}
|
||||
return strings.Join([]string{instance, guestType, node, strconv.Itoa(vmid)}, "\x00")
|
||||
}
|
||||
|
||||
func normalizeBackupGuestType(guestType string) string {
|
||||
switch strings.ToLower(strings.TrimSpace(guestType)) {
|
||||
case "qemu", "vm", "proxmox-vm":
|
||||
return "qemu"
|
||||
case "lxc", "ct", "container", "system-container", "proxmox-lxc":
|
||||
return "lxc"
|
||||
default:
|
||||
return strings.ToLower(strings.TrimSpace(guestType))
|
||||
}
|
||||
}
|
||||
|
||||
func backupOrphanInventoryReady(scope *BackupInventoryScope, record backupRecord) bool {
|
||||
if scope == nil || scope.PVEOrphanInventoryReady == nil {
|
||||
return true
|
||||
}
|
||||
if record.source != "PVE" {
|
||||
return true
|
||||
}
|
||||
instance := strings.TrimSpace(record.instance)
|
||||
guestType := normalizeBackupGuestType(record.subjectType)
|
||||
if instance == "" || guestType == "" {
|
||||
return false
|
||||
}
|
||||
return scope.PVEOrphanInventoryReady[instance][guestType]
|
||||
}
|
||||
|
||||
func backupMatchesKnownPVETemplate(scope *BackupInventoryScope, record backupRecord) bool {
|
||||
if scope == nil || len(scope.PVETemplateSubjects) == 0 || record.source != "PVE" {
|
||||
return false
|
||||
}
|
||||
vmid, err := strconv.Atoi(strings.TrimSpace(record.vmID))
|
||||
if err != nil || vmid <= 0 {
|
||||
return false
|
||||
}
|
||||
key := BuildBackupPVETemplateSubjectKey(record.instance, record.subjectType, record.node, vmid)
|
||||
if key == "" {
|
||||
return false
|
||||
}
|
||||
_, exists := scope.PVETemplateSubjects[key]
|
||||
return exists
|
||||
}
|
||||
|
||||
func canonicalGuestResourceType(guestType string) unifiedresources.ResourceType {
|
||||
switch strings.ToLower(strings.TrimSpace(guestType)) {
|
||||
case "lxc":
|
||||
return unifiedresources.ResourceTypeSystemContainer
|
||||
default:
|
||||
return unifiedresources.ResourceTypeVM
|
||||
}
|
||||
}
|
||||
|
||||
func canonicalBackupSubjectResourceType(record backupRecord) unifiedresources.ResourceType {
|
||||
if record.lookup.Type != "" {
|
||||
return canonicalGuestResourceType(record.lookup.Type)
|
||||
}
|
||||
switch normalizeBackupGuestType(record.subjectType) {
|
||||
case "lxc":
|
||||
return unifiedresources.ResourceTypeSystemContainer
|
||||
case "qemu":
|
||||
return unifiedresources.ResourceTypeVM
|
||||
}
|
||||
if strings.TrimSpace(record.vmID) != "" {
|
||||
return unifiedresources.ResourceTypeVM
|
||||
}
|
||||
return unifiedresources.ResourceType("backup-subject")
|
||||
}
|
||||
|
||||
func canonicalBackupSubjectResourceID(alertKey string, record backupRecord) string {
|
||||
if record.instance != "" && record.node != "" && record.vmID != "" {
|
||||
if vmid, err := strconv.Atoi(record.vmID); err == nil && vmid > 0 {
|
||||
return BuildGuestKey(record.instance, record.node, vmid)
|
||||
}
|
||||
}
|
||||
return "backup-subject:" + sanitizeAlertKey(alertKey)
|
||||
}
|
||||
|
||||
// CheckSnapshotsForInstance evaluates guest snapshots for age-based alerts.
|
||||
func (m *Manager) CheckSnapshotsForInstance(instanceName string, snapshots []models.GuestSnapshot, guestNames map[string]string) {
|
||||
m.mu.RLock()
|
||||
enabled := m.config.Enabled
|
||||
snapshotCfg := m.config.SnapshotDefaults
|
||||
m.mu.RUnlock()
|
||||
|
||||
if !enabled {
|
||||
return
|
||||
}
|
||||
|
||||
if !snapshotCfg.Enabled {
|
||||
m.clearSnapshotAlertsForInstance(instanceName)
|
||||
return
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
validAlerts := make(map[string]struct{})
|
||||
|
||||
for _, snapshot := range snapshots {
|
||||
if instanceName != "" && snapshot.Instance != "" && snapshot.Instance != instanceName {
|
||||
continue
|
||||
}
|
||||
if snapshot.Time.IsZero() {
|
||||
continue
|
||||
}
|
||||
|
||||
ageHours := now.Sub(snapshot.Time).Hours()
|
||||
if ageHours < 0 {
|
||||
continue
|
||||
}
|
||||
ageDays := ageHours / 24
|
||||
|
||||
const gib = 1024.0 * 1024 * 1024
|
||||
sizeGiB := 0.0
|
||||
if snapshot.SizeBytes > 0 {
|
||||
sizeGiB = float64(snapshot.SizeBytes) / gib
|
||||
}
|
||||
|
||||
// Determine thresholds for this snapshot
|
||||
resourceID := fmt.Sprintf("%s:%s:%d", snapshot.Instance, snapshot.Node, snapshot.VMID)
|
||||
guestName := strings.TrimSpace(guestNames[BuildGuestKey(snapshot.Instance, snapshot.Node, snapshot.VMID)])
|
||||
guestContext := guestSnapshotFromIdentity(resourceID, guestName, snapshot.Node, snapshot.Instance, snapshot.Type, "")
|
||||
m.mu.RLock()
|
||||
gh := m.getGuestThresholds(guestContext, resourceID)
|
||||
m.mu.RUnlock()
|
||||
|
||||
if gh.Disabled {
|
||||
continue
|
||||
}
|
||||
|
||||
currentSnapshotCfg := snapshotCfg
|
||||
if gh.Snapshot != nil {
|
||||
currentSnapshotCfg = *gh.Snapshot
|
||||
}
|
||||
|
||||
if !currentSnapshotCfg.Enabled {
|
||||
continue
|
||||
}
|
||||
|
||||
var ageLevel AlertLevel
|
||||
var ageThreshold int
|
||||
var sizeLevel AlertLevel
|
||||
var sizeThreshold float64
|
||||
var triggeredStats []string
|
||||
|
||||
if currentSnapshotCfg.CriticalDays > 0 && ageDays >= float64(currentSnapshotCfg.CriticalDays) {
|
||||
ageLevel = AlertLevelCritical
|
||||
ageThreshold = currentSnapshotCfg.CriticalDays
|
||||
triggeredStats = append(triggeredStats, "age")
|
||||
} else if currentSnapshotCfg.WarningDays > 0 && ageDays >= float64(currentSnapshotCfg.WarningDays) {
|
||||
ageLevel = AlertLevelWarning
|
||||
ageThreshold = currentSnapshotCfg.WarningDays
|
||||
triggeredStats = append(triggeredStats, "age")
|
||||
}
|
||||
|
||||
if snapshot.SizeBytes > 0 {
|
||||
if currentSnapshotCfg.CriticalSizeGiB > 0 && sizeGiB >= currentSnapshotCfg.CriticalSizeGiB {
|
||||
sizeLevel = AlertLevelCritical
|
||||
sizeThreshold = currentSnapshotCfg.CriticalSizeGiB
|
||||
triggeredStats = append(triggeredStats, "size")
|
||||
} else if currentSnapshotCfg.WarningSizeGiB > 0 && sizeGiB >= currentSnapshotCfg.WarningSizeGiB {
|
||||
sizeLevel = AlertLevelWarning
|
||||
sizeThreshold = currentSnapshotCfg.WarningSizeGiB
|
||||
triggeredStats = append(triggeredStats, "size")
|
||||
}
|
||||
}
|
||||
|
||||
if ageLevel == "" && sizeLevel == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
useSizePrimary := false
|
||||
if sizeLevel == AlertLevelCritical && ageLevel != AlertLevelCritical {
|
||||
useSizePrimary = true
|
||||
} else if sizeLevel != "" && ageLevel == "" {
|
||||
useSizePrimary = true
|
||||
}
|
||||
|
||||
alertID := fmt.Sprintf("snapshot-age-%s", snapshot.ID)
|
||||
|
||||
guestKey := BuildGuestKey(snapshot.Instance, snapshot.Node, snapshot.VMID)
|
||||
|
||||
guestType := "VM"
|
||||
if strings.EqualFold(snapshot.Type, "lxc") {
|
||||
guestType = "Container"
|
||||
}
|
||||
|
||||
if guestName == "" {
|
||||
switch guestType {
|
||||
case "Container":
|
||||
guestName = fmt.Sprintf("CT %d", snapshot.VMID)
|
||||
default:
|
||||
guestName = fmt.Sprintf("VM %d", snapshot.VMID)
|
||||
}
|
||||
}
|
||||
|
||||
snapshotName := strings.TrimSpace(snapshot.Name)
|
||||
if snapshotName == "" {
|
||||
snapshotName = "(unnamed)"
|
||||
}
|
||||
|
||||
ageDaysRounded := math.Round(ageDays*10) / 10
|
||||
sizeGiBRounded := math.Round(sizeGiB*10) / 10
|
||||
reasons := make([]string, 0, 2)
|
||||
if ageLevel != "" {
|
||||
reasons = append(reasons, fmt.Sprintf("%.1f days old (threshold %d days)", ageDaysRounded, ageThreshold))
|
||||
}
|
||||
if sizeLevel != "" {
|
||||
reasons = append(reasons, fmt.Sprintf("%.1f GiB (threshold %.1f GiB)", sizeGiBRounded, sizeThreshold))
|
||||
}
|
||||
reasonText := strings.Join(reasons, " and ")
|
||||
message := fmt.Sprintf(
|
||||
"%s snapshot '%s' for %s is %s on %s",
|
||||
guestType,
|
||||
snapshotName,
|
||||
guestName,
|
||||
reasonText,
|
||||
snapshot.Node,
|
||||
)
|
||||
|
||||
alertValue := ageDays
|
||||
alertThreshold := float64(ageThreshold)
|
||||
thresholdTime := now
|
||||
if useSizePrimary {
|
||||
alertValue = sizeGiB
|
||||
alertThreshold = sizeThreshold
|
||||
} else if ageThreshold > 0 {
|
||||
thresholdTime = snapshot.Time.Add(time.Duration(ageThreshold) * 24 * time.Hour)
|
||||
if thresholdTime.After(now) {
|
||||
thresholdTime = now
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"snapshotName": snapshot.Name,
|
||||
"snapshotCreatedAt": snapshot.Time,
|
||||
"snapshotAgeDays": ageDays,
|
||||
"snapshotAgeHours": ageHours,
|
||||
"snapshotSizeBytes": snapshot.SizeBytes,
|
||||
"snapshotSizeGiB": sizeGiB,
|
||||
"guestName": guestName,
|
||||
"guestType": guestType,
|
||||
"guestInstance": snapshot.Instance,
|
||||
"guestNode": snapshot.Node,
|
||||
"guestVmid": snapshot.VMID,
|
||||
"triggeredMetrics": triggeredStats,
|
||||
"primaryMetric": "age",
|
||||
}
|
||||
if useSizePrimary {
|
||||
metadata["primaryMetric"] = "size"
|
||||
}
|
||||
if ageLevel != "" {
|
||||
metadata["thresholdDays"] = ageThreshold
|
||||
}
|
||||
if sizeLevel != "" {
|
||||
metadata["thresholdSizeGiB"] = sizeThreshold
|
||||
}
|
||||
|
||||
resourceName := fmt.Sprintf("%s snapshot '%s'", guestName, snapshotName)
|
||||
guestResourceType := canonicalGuestResourceType(snapshot.Type)
|
||||
guestResourceID := guestKey
|
||||
sizeMetric := ""
|
||||
var sizeValue *float64
|
||||
if currentSnapshotCfg.WarningSizeGiB > 0 || currentSnapshotCfg.CriticalSizeGiB > 0 {
|
||||
sizeMetric = "snapshot-size-gib"
|
||||
sizeValue = &sizeGiB
|
||||
}
|
||||
ageMetric := ""
|
||||
if currentSnapshotCfg.WarningDays > 0 || currentSnapshotCfg.CriticalDays > 0 {
|
||||
ageMetric = "snapshot-age-days"
|
||||
}
|
||||
|
||||
spec, err := buildCanonicalPostureThresholdSpec(
|
||||
guestResourceID+"/snapshot:"+snapshot.ID,
|
||||
guestResourceID,
|
||||
resourceName,
|
||||
guestResourceType,
|
||||
ageMetric,
|
||||
float64(currentSnapshotCfg.WarningDays),
|
||||
float64(currentSnapshotCfg.CriticalDays),
|
||||
sizeMetric,
|
||||
currentSnapshotCfg.WarningSizeGiB,
|
||||
currentSnapshotCfg.CriticalSizeGiB,
|
||||
false,
|
||||
)
|
||||
if err != nil {
|
||||
log.Warn().
|
||||
Err(err).
|
||||
Str("snapshotID", snapshot.ID).
|
||||
Str("resourceID", guestResourceID).
|
||||
Msg("Skipping invalid canonical snapshot posture spec")
|
||||
continue
|
||||
}
|
||||
validAlerts[canonicalTrackingKeyForSpec(spec, alertID)] = struct{}{}
|
||||
|
||||
result, _ := m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
|
||||
Spec: spec,
|
||||
Evidence: alertspecs.AlertEvidence{
|
||||
ObservedAt: now,
|
||||
PostureThreshold: &alertspecs.PostureThresholdEvidence{
|
||||
AgeMetric: ageMetric,
|
||||
AgeValue: ageDays,
|
||||
SizeMetric: sizeMetric,
|
||||
SizeValue: sizeValue,
|
||||
},
|
||||
},
|
||||
AlertID: alertID,
|
||||
AlertType: "snapshot-age",
|
||||
ResourceID: spec.ResourceID,
|
||||
ResourceName: resourceName,
|
||||
Node: snapshot.Node,
|
||||
Instance: snapshot.Instance,
|
||||
Value: alertValue,
|
||||
Threshold: alertThreshold,
|
||||
StartTimeOverride: thresholdTime,
|
||||
Metadata: metadata,
|
||||
AddToRecent: true,
|
||||
AddToHistory: true,
|
||||
RateLimit: true,
|
||||
DispatchAsync: true,
|
||||
MessageBuilder: func(result alertspecs.EvaluationResult) (string, float64, float64) {
|
||||
return message, alertValue, alertThreshold
|
||||
},
|
||||
})
|
||||
if result.Transition != nil && result.Transition.Kind == alertspecs.EvaluationTransitionActivated {
|
||||
asyncSaveActiveAlerts("snapshot", m.SaveActiveAlerts)
|
||||
}
|
||||
}
|
||||
|
||||
m.mu.Lock()
|
||||
for storageKey, alert := range m.activeAlerts {
|
||||
if alert == nil || alert.Type != "snapshot-age" {
|
||||
continue
|
||||
}
|
||||
if instanceName != "" && alert.Instance != instanceName {
|
||||
continue
|
||||
}
|
||||
if _, ok := validAlerts[storageKey]; ok {
|
||||
continue
|
||||
}
|
||||
m.clearAlertNoLock(storageKey)
|
||||
}
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
// CheckBackups evaluates storage, PBS, and PMG backups for age-based alerts.
|
||||
func (m *Manager) CheckBackups(
|
||||
rollups []recovery.ProtectionRollup,
|
||||
guestsByKey map[string]GuestLookup,
|
||||
guestsByVMID map[string][]GuestLookup,
|
||||
) {
|
||||
m.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, nil)
|
||||
}
|
||||
|
||||
// CheckBackupsWithInventory evaluates backup rollups with optional monitoring
|
||||
// inventory readiness for orphan detection.
|
||||
func (m *Manager) CheckBackupsWithInventory(
|
||||
rollups []recovery.ProtectionRollup,
|
||||
guestsByKey map[string]GuestLookup,
|
||||
guestsByVMID map[string][]GuestLookup,
|
||||
inventoryScope *BackupInventoryScope,
|
||||
) {
|
||||
m.mu.RLock()
|
||||
enabled := m.config.Enabled
|
||||
backupCfg := m.config.BackupDefaults
|
||||
m.mu.RUnlock()
|
||||
|
||||
if backupCfg.AlertOrphaned == nil {
|
||||
alertOrphaned := true
|
||||
backupCfg.AlertOrphaned = &alertOrphaned
|
||||
}
|
||||
|
||||
if !enabled || !backupCfg.Enabled {
|
||||
m.clearBackupAlerts()
|
||||
return
|
||||
}
|
||||
|
||||
if backupCfg.WarningDays <= 0 && backupCfg.CriticalDays <= 0 {
|
||||
m.clearBackupAlerts()
|
||||
return
|
||||
}
|
||||
|
||||
records := make(map[string]*backupRecord)
|
||||
|
||||
updateRecord := func(key string, candidate backupRecord) {
|
||||
if key == "" {
|
||||
return
|
||||
}
|
||||
if existing, ok := records[key]; ok {
|
||||
if candidate.lastTime.After(existing.lastTime) {
|
||||
*existing = candidate
|
||||
}
|
||||
return
|
||||
}
|
||||
record := candidate
|
||||
records[key] = &record
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
|
||||
for _, rollup := range rollups {
|
||||
if rollup.LastSuccessAt == nil || rollup.LastSuccessAt.IsZero() {
|
||||
continue
|
||||
}
|
||||
|
||||
lastTime := rollup.LastSuccessAt.UTC()
|
||||
providers := append([]recovery.Provider(nil), rollup.Providers...)
|
||||
|
||||
source := "Recovery"
|
||||
if slicesContainsProvider(providers, recovery.ProviderProxmoxPMG) {
|
||||
source = "PMG"
|
||||
} else if slicesContainsProvider(providers, recovery.ProviderProxmoxPBS) {
|
||||
source = "PBS"
|
||||
} else if slicesContainsProvider(providers, recovery.ProviderProxmoxPVE) {
|
||||
source = "PVE"
|
||||
}
|
||||
|
||||
var (
|
||||
info GuestLookup
|
||||
key string
|
||||
displayName string
|
||||
instance string
|
||||
node string
|
||||
vmID string
|
||||
subjectType string
|
||||
)
|
||||
|
||||
ref := rollup.SubjectRef
|
||||
if ref != nil {
|
||||
subjectType = normalizeBackupGuestType(ref.Type)
|
||||
}
|
||||
|
||||
// Primary: subjectRef.ID is the canonical proxmox guest source ID (instance:node:vmid) when linked.
|
||||
if ref != nil && strings.TrimSpace(ref.ID) != "" {
|
||||
if inst, nd, vmid, ok := parseGuestID(ref.ID); ok {
|
||||
key = BuildGuestKey(inst, nd, vmid)
|
||||
info = guestsByKey[key]
|
||||
instance = inst
|
||||
node = nd
|
||||
vmID = strconv.Itoa(vmid)
|
||||
}
|
||||
}
|
||||
|
||||
// Secondary: attempt to map by VMID for orphaned/ambiguous backups.
|
||||
if key == "" && ref != nil {
|
||||
vmidStr := strings.TrimSpace(ref.ID)
|
||||
if vmidStr == "" {
|
||||
vmidStr = strings.TrimSpace(ref.Name)
|
||||
}
|
||||
if vmidStr != "" {
|
||||
if vmid, err := strconv.Atoi(vmidStr); err == nil && vmid > 0 {
|
||||
vmID = vmidStr
|
||||
guests := guestsByVMID[vmidStr]
|
||||
if len(guests) == 1 {
|
||||
info = guests[0]
|
||||
} else if len(guests) > 1 && strings.TrimSpace(ref.Namespace) != "" {
|
||||
for _, g := range guests {
|
||||
if namespaceMatchesInstance(ref.Namespace, g.Instance) {
|
||||
info = g
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if info.Instance != "" && info.Node != "" {
|
||||
key = BuildGuestKey(info.Instance, info.Node, info.VMID)
|
||||
instance = info.Instance
|
||||
node = info.Node
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if key == "" {
|
||||
// Stable fallback for non-guest subjects and orphans.
|
||||
key = strings.TrimSpace(rollup.RollupID)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
displayName = strings.TrimSpace(info.Name)
|
||||
if displayName == "" && ref != nil {
|
||||
displayName = strings.TrimSpace(ref.Name)
|
||||
}
|
||||
if displayName == "" && vmID != "" {
|
||||
displayName = fmt.Sprintf("VMID %s", vmID)
|
||||
}
|
||||
if displayName == "" {
|
||||
displayName = "Unknown"
|
||||
}
|
||||
|
||||
updateRecord(key, backupRecord{
|
||||
key: key,
|
||||
vmID: vmID,
|
||||
lookup: info,
|
||||
fallbackName: displayName,
|
||||
instance: instance,
|
||||
node: node,
|
||||
subjectType: subjectType,
|
||||
source: source,
|
||||
rollupID: strings.TrimSpace(rollup.RollupID),
|
||||
providers: providers,
|
||||
lastTime: lastTime,
|
||||
})
|
||||
}
|
||||
|
||||
if len(records) == 0 {
|
||||
m.clearBackupAlerts()
|
||||
return
|
||||
}
|
||||
|
||||
validAlerts := make(map[string]struct{})
|
||||
|
||||
for key, record := range records {
|
||||
age := now.Sub(record.lastTime)
|
||||
if age < 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
ageDays := age.Hours() / 24
|
||||
if ageDays < 0 {
|
||||
continue
|
||||
}
|
||||
ageDaysRounded := math.Round(ageDays*10) / 10
|
||||
|
||||
// Determine thresholds for this backup
|
||||
currentBackupCfg := backupCfg
|
||||
guestContext := guestSnapshotFromLookup(record.lookup, record.fallbackName)
|
||||
guestResourceID := strings.TrimSpace(record.lookup.ResourceID)
|
||||
if guestResourceID == "" {
|
||||
guestResourceID = guestContext.ID
|
||||
}
|
||||
if guestResourceID != "" {
|
||||
m.mu.RLock()
|
||||
gh := m.getGuestThresholds(guestContext, guestResourceID)
|
||||
m.mu.RUnlock()
|
||||
if gh.Disabled {
|
||||
continue
|
||||
}
|
||||
if gh.Backup != nil {
|
||||
currentBackupCfg = *gh.Backup
|
||||
}
|
||||
}
|
||||
|
||||
currentBackupCfg.AlertOrphaned = backupCfg.AlertOrphaned
|
||||
currentBackupCfg.IgnoreVMIDs = backupCfg.IgnoreVMIDs
|
||||
|
||||
if backupIgnoreVMID(record.vmID, currentBackupCfg.IgnoreVMIDs) {
|
||||
continue
|
||||
}
|
||||
if record.vmID != "" && record.lookup.ResourceID == "" {
|
||||
if backupMatchesKnownPVETemplate(inventoryScope, *record) {
|
||||
continue
|
||||
}
|
||||
if !backupOrphanInventoryReady(inventoryScope, *record) {
|
||||
continue
|
||||
}
|
||||
if currentBackupCfg.AlertOrphaned != nil && !*currentBackupCfg.AlertOrphaned {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if !currentBackupCfg.Enabled {
|
||||
continue
|
||||
}
|
||||
|
||||
var threshold int
|
||||
switch {
|
||||
case currentBackupCfg.CriticalDays > 0 && ageDays >= float64(currentBackupCfg.CriticalDays):
|
||||
threshold = currentBackupCfg.CriticalDays
|
||||
case currentBackupCfg.WarningDays > 0 && ageDays >= float64(currentBackupCfg.WarningDays):
|
||||
threshold = currentBackupCfg.WarningDays
|
||||
default:
|
||||
continue
|
||||
}
|
||||
|
||||
alertKey := sanitizeAlertKey(key)
|
||||
alertID := fmt.Sprintf("backup-age-%s", alertKey)
|
||||
|
||||
displayName := record.lookup.Name
|
||||
if displayName == "" {
|
||||
displayName = record.fallbackName
|
||||
}
|
||||
if displayName == "" {
|
||||
displayName = "Unknown guest"
|
||||
}
|
||||
|
||||
node := record.node
|
||||
if node == "" {
|
||||
node = record.lookup.Node
|
||||
}
|
||||
instance := record.instance
|
||||
if instance == "" {
|
||||
instance = record.lookup.Instance
|
||||
}
|
||||
|
||||
thresholdTime := record.lastTime.Add(time.Duration(threshold) * 24 * time.Hour)
|
||||
if thresholdTime.After(now) {
|
||||
thresholdTime = now
|
||||
}
|
||||
|
||||
var sourceLabel string
|
||||
sourceLabel = record.source
|
||||
if len(record.providers) > 0 {
|
||||
parts := make([]string, 0, len(record.providers))
|
||||
for _, p := range record.providers {
|
||||
if s := strings.TrimSpace(string(p)); s != "" {
|
||||
parts = append(parts, s)
|
||||
}
|
||||
}
|
||||
if len(parts) > 0 {
|
||||
sourceLabel = strings.Join(parts, ", ")
|
||||
}
|
||||
}
|
||||
|
||||
message := fmt.Sprintf(
|
||||
"%s backup via %s is %.1f days old (threshold: %d days)",
|
||||
displayName,
|
||||
sourceLabel,
|
||||
ageDaysRounded,
|
||||
threshold,
|
||||
)
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"source": record.source,
|
||||
"providers": record.providers,
|
||||
"rollupId": record.rollupID,
|
||||
"lastBackupTime": record.lastTime,
|
||||
"ageDays": ageDays,
|
||||
"thresholdDays": threshold,
|
||||
"guestName": displayName,
|
||||
"guestType": record.lookup.Type,
|
||||
"guestInstance": instance,
|
||||
"guestNode": node,
|
||||
"guestVmid": metadataIntValue(record.vmID),
|
||||
"orphaned": record.vmID != "" && guestResourceID == "",
|
||||
}
|
||||
specResourceID := canonicalBackupSubjectResourceID(alertKey, *record)
|
||||
specResourceType := canonicalBackupSubjectResourceType(*record)
|
||||
spec, err := buildCanonicalPostureThresholdSpec(
|
||||
specResourceID+"-backup-age",
|
||||
specResourceID,
|
||||
displayName+" backup",
|
||||
specResourceType,
|
||||
"backup-age-days",
|
||||
float64(currentBackupCfg.WarningDays),
|
||||
float64(currentBackupCfg.CriticalDays),
|
||||
"",
|
||||
0,
|
||||
0,
|
||||
false,
|
||||
)
|
||||
if err != nil {
|
||||
log.Warn().
|
||||
Err(err).
|
||||
Str("alertID", alertID).
|
||||
Str("resourceID", specResourceID).
|
||||
Msg("Skipping invalid canonical backup posture spec")
|
||||
continue
|
||||
}
|
||||
validAlerts[canonicalTrackingKeyForSpec(spec, alertID)] = struct{}{}
|
||||
|
||||
result, _ := m.evaluateCanonicalStatefulAlert(canonicalStatefulAlertParams{
|
||||
Spec: spec,
|
||||
Evidence: alertspecs.AlertEvidence{
|
||||
ObservedAt: now,
|
||||
PostureThreshold: &alertspecs.PostureThresholdEvidence{
|
||||
AgeMetric: "backup-age-days",
|
||||
AgeValue: ageDays,
|
||||
},
|
||||
},
|
||||
AlertID: alertID,
|
||||
AlertType: "backup-age",
|
||||
ResourceID: spec.ResourceID,
|
||||
ResourceName: fmt.Sprintf("%s backup", displayName),
|
||||
Node: node,
|
||||
Instance: instance,
|
||||
Value: ageDays,
|
||||
Threshold: float64(threshold),
|
||||
StartTimeOverride: thresholdTime,
|
||||
Metadata: metadata,
|
||||
AddToRecent: true,
|
||||
AddToHistory: true,
|
||||
RateLimit: true,
|
||||
DispatchAsync: true,
|
||||
MessageBuilder: func(result alertspecs.EvaluationResult) (string, float64, float64) {
|
||||
return message, ageDays, float64(threshold)
|
||||
},
|
||||
})
|
||||
if result.Transition != nil && result.Transition.Kind == alertspecs.EvaluationTransitionActivated {
|
||||
asyncSaveActiveAlerts("backup", m.SaveActiveAlerts)
|
||||
}
|
||||
}
|
||||
|
||||
m.mu.Lock()
|
||||
for storageKey, alert := range m.activeAlerts {
|
||||
if alert == nil || alert.Type != "backup-age" {
|
||||
continue
|
||||
}
|
||||
if _, ok := validAlerts[storageKey]; ok {
|
||||
continue
|
||||
}
|
||||
m.clearAlertNoLock(storageKey)
|
||||
}
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
func slicesContainsProvider(providers []recovery.Provider, target recovery.Provider) bool {
|
||||
for _, p := range providers {
|
||||
if p == target {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func parseGuestID(raw string) (instance string, node string, vmid int, ok bool) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
return "", "", 0, false
|
||||
}
|
||||
parts := strings.Split(raw, ":")
|
||||
if len(parts) < 3 {
|
||||
return "", "", 0, false
|
||||
}
|
||||
last := parts[len(parts)-1]
|
||||
prev := parts[len(parts)-2]
|
||||
inst := strings.Join(parts[:len(parts)-2], ":")
|
||||
n, err := strconv.Atoi(strings.TrimSpace(last))
|
||||
if err != nil || n <= 0 {
|
||||
return "", "", 0, false
|
||||
}
|
||||
return strings.TrimSpace(inst), strings.TrimSpace(prev), n, true
|
||||
}
|
||||
|
||||
// namespaceMatchesInstance checks if a PBS namespace likely corresponds to a PVE instance.
|
||||
// This helps disambiguate backups when multiple PVE instances have VMs with the same VMID.
|
||||
// Examples: namespace "pve1" matches instance "pve1", namespace "nat" matches instance "pve-nat"
|
||||
func namespaceMatchesInstance(namespace, instance string) bool {
|
||||
if namespace == "" || instance == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
// Normalize both strings: lowercase and keep only alphanumeric
|
||||
normalize := func(s string) string {
|
||||
var b strings.Builder
|
||||
for _, r := range strings.ToLower(s) {
|
||||
if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') {
|
||||
b.WriteRune(r)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
ns := normalize(namespace)
|
||||
inst := normalize(instance)
|
||||
|
||||
if ns == "" || inst == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
// Exact match after normalization
|
||||
if ns == inst {
|
||||
return true
|
||||
}
|
||||
|
||||
// Check if namespace is a suffix of instance
|
||||
// e.g., namespace "nat" matches instance "pvenat" (normalized from "pve-nat")
|
||||
// This is more precise than substring matching because:
|
||||
// - "nat" should match "pve-nat" but not "natpve"
|
||||
// - "pve" should match "pve" but not "pve-nat" (handled by exact match above)
|
||||
if strings.HasSuffix(inst, ns) {
|
||||
return true
|
||||
}
|
||||
|
||||
// Check if instance is a suffix of namespace (reverse case)
|
||||
// e.g., namespace "pvebackups" could match instance "pve"
|
||||
if strings.HasSuffix(ns, inst) {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (m *Manager) clearSnapshotAlertsForInstance(instance string) {
|
||||
m.mu.Lock()
|
||||
m.clearSnapshotAlertsForInstanceLocked(instance)
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
func (m *Manager) clearSnapshotAlertsForInstanceLocked(instance string) {
|
||||
for storageKey, alert := range m.activeAlerts {
|
||||
alertID := effectiveAlertID(alert, storageKey)
|
||||
if alert == nil || alert.Type != "snapshot-age" {
|
||||
continue
|
||||
}
|
||||
if instance != "" && alert.Instance != instance {
|
||||
continue
|
||||
}
|
||||
m.clearAlertNoLock(alertID)
|
||||
}
|
||||
}
|
||||
|
||||
func (m *Manager) clearBackupAlerts() {
|
||||
m.mu.Lock()
|
||||
m.clearBackupAlertsLocked()
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
func (m *Manager) clearBackupAlertsLocked() {
|
||||
for storageKey, alert := range m.activeAlerts {
|
||||
alertID := effectiveAlertID(alert, storageKey)
|
||||
if alert == nil || alert.Type != "backup-age" {
|
||||
continue
|
||||
}
|
||||
m.clearAlertNoLock(alertID)
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue