diff --git a/docs/release-control/v6/internal/records/known-rc-issue-closure-for-ga-backup-orphan-readiness-2026-05-01.md b/docs/release-control/v6/internal/records/known-rc-issue-closure-for-ga-backup-orphan-readiness-2026-05-01.md new file mode 100644 index 000000000..dc0532403 --- /dev/null +++ b/docs/release-control/v6/internal/records/known-rc-issue-closure-for-ga-backup-orphan-readiness-2026-05-01.md @@ -0,0 +1,49 @@ +# Known RC Issue Closure For GA Backup Orphan Readiness Record + +- Date: `2026-05-01` +- Gate: `known-rc-issue-closure-for-ga` +- Result: `passed` + +## Context + +The v5 maintenance delta audit found that `#1352` had not been carried into the +v6 backup-alert path. Pulse v5 learned not to run backup orphan detection before +Proxmox template inventory was ready, because backup polling can race ahead of +guest and template discovery during startup. + +The v6 runtime no longer evaluates raw storage backup arrays directly; it +evaluates recovery rollups. That made the v5 patch non-cherry-pickable, but the +same failure mode still applied: an old PVE backup whose VMID was not yet in +the current guest/template inventory could trigger an orphaned backup-age +alert. + +## Disposition + +The v6 candidate now carries an inventory-aware backup alert boundary: + +- `internal/alerts/alerts.go` keeps the existing `CheckBackups` API for direct + callers and adds `CheckBackupsWithInventory` for monitoring-owned runtime + evaluation. +- PVE orphaned backup alerts now require per-instance, per-guest-type inventory + readiness before unresolved PVE backup subjects can alert. +- Known Proxmox template VM/container subjects are carried as backup-valid + subjects and skipped from orphaned backup-age alert creation. +- Monitoring records Proxmox template subjects from both the efficient + `cluster/resources` poll and the traditional VM/container poll fallback, and + passes that scoped inventory into backup alert evaluation. 
+- PBS/PMG rollup behavior remains unchanged, so external backup-only subjects + can still alert when no matching local guest exists. + +## Proof + +- `go test ./internal/alerts -run 'TestCheckBackups(SkipsPVEOrphanUntilInventoryReady|CreatesPVEOrphanWhenInventoryReady|SkipsKnownPVETemplateBackupSubject|SkipsOrphanedWhenDisabled|HandlesPbsOnlyGuests|VMIDCollision)' -count=1` +- `go test ./internal/monitoring -run 'TestPVEBackupTemplateInventoryScopeFromClusterResources|TestBuildGuestLookupsFromReadState' -count=1` +- `go test ./internal/alerts -count=1` +- `go test ./internal/monitoring -count=1` + +## Outcome + +The v6 recovery-rollup alert path no longer knowingly regresses v5 `#1352`. +PVE backup orphan alerts wait for the owning Proxmox inventory signal, and +template backups do not become false orphaned backup alerts just because +templates are excluded from normal workload resources. diff --git a/docs/release-control/v6/internal/status.json b/docs/release-control/v6/internal/status.json index a1f06271a..7c5f8c09a 100644 --- a/docs/release-control/v6/internal/status.json +++ b/docs/release-control/v6/internal/status.json @@ -4403,6 +4403,12 @@ "kind": "file", "evidence_tier": "managed-runtime-exercise" }, + { + "repo": "pulse", + "path": "docs/release-control/v6/internal/records/known-rc-issue-closure-for-ga-backup-orphan-readiness-2026-05-01.md", + "kind": "file", + "evidence_tier": "managed-runtime-exercise" + }, { "repo": "pulse", "path": "docs/release-control/v6/internal/records/known-rc-issue-closure-for-ga-blocked-2026-05-01.md", diff --git a/docs/release-control/v6/internal/subsystems/alerts.md b/docs/release-control/v6/internal/subsystems/alerts.md index cc060fafa..754792817 100644 --- a/docs/release-control/v6/internal/subsystems/alerts.md +++ b/docs/release-control/v6/internal/subsystems/alerts.md @@ -156,6 +156,13 @@ and identity first-seen tracking. 
Generic threshold reevaluation must not keep or resurrect image-update alerts after their owning Docker alert configuration has disabled them. +Backup orphan evaluation is also inventory-scoped. The alerts runtime may +evaluate recovery rollups for backup age, but unresolved Proxmox PVE backup +subjects must not be treated as orphaned until monitoring has supplied the +matching per-instance guest-type inventory readiness signal. Known Proxmox +template subjects are valid backup subjects, not orphaned workload backups, +even though templates remain excluded from normal runtime workload resources. + Alert history persistence is also part of that canonical boundary. The history manager may choose the owned runtime data directory, but it must normalize that directory once and then resolve only the fixed `alert-history.json` and diff --git a/docs/release-control/v6/internal/subsystems/monitoring.md b/docs/release-control/v6/internal/subsystems/monitoring.md index 08c8247e0..13ad0ad32 100644 --- a/docs/release-control/v6/internal/subsystems/monitoring.md +++ b/docs/release-control/v6/internal/subsystems/monitoring.md @@ -865,6 +865,13 @@ record whose canonical status remains `available` whenever any reporting node still has the shared target active; node-local inactive copies may expand node affinity, but they must not downgrade the cluster record into an offline projection just because that node won the capacity sample. +That same monitoring-owned Proxmox backup boundary also owns the inventory +readiness signal used by backup orphan alerts. `internal/monitoring/` must +record when PVE VM and container inventory has successfully observed a given +instance and guest type, including template VMIDs that are intentionally +excluded from normal workload resources. Backup alert evaluation may then +receive that scoped signal from monitoring, but alert code must not infer PVE +orphan readiness from recovery rollups alone. 
That same monitoring-owned host-agent ingest boundary now also owns vendor-managed NAS RAID normalization. `internal/monitoring/monitor_agents.go` must filter vendor-managed system arrays through the shared diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go index a54c96bfb..72e99da79 100644 --- a/internal/alerts/alerts.go +++ b/internal/alerts/alerts.go @@ -6216,12 +6216,74 @@ type backupRecord struct { fallbackName string instance string node string + subjectType string source string rollupID string providers []recovery.Provider lastTime time.Time } +// BackupInventoryScope carries monitoring-owned inventory readiness into backup +// alert evaluation. It keeps orphan detection from racing ahead of Proxmox +// guest/template discovery while preserving the direct CheckBackups API for +// unit tests and non-monitoring callers. +type BackupInventoryScope struct { + PVEOrphanInventoryReady map[string]map[string]bool + PVETemplateSubjects map[string]struct{} +} + +func BuildBackupPVETemplateSubjectKey(instance, guestType, node string, vmid int) string { + instance = strings.TrimSpace(instance) + guestType = normalizeBackupGuestType(guestType) + node = strings.TrimSpace(node) + if instance == "" || guestType == "" || node == "" || vmid <= 0 { + return "" + } + return strings.Join([]string{instance, guestType, node, strconv.Itoa(vmid)}, "\x00") +} + +func normalizeBackupGuestType(guestType string) string { + switch strings.ToLower(strings.TrimSpace(guestType)) { + case "qemu", "vm", "proxmox-vm": + return "qemu" + case "lxc", "ct", "container", "system-container", "proxmox-lxc": + return "lxc" + default: + return strings.ToLower(strings.TrimSpace(guestType)) + } +} + +func backupOrphanInventoryReady(scope *BackupInventoryScope, record backupRecord) bool { + if scope == nil || scope.PVEOrphanInventoryReady == nil { + return true + } + if record.source != "PVE" { + return true + } + instance := strings.TrimSpace(record.instance) + guestType := 
normalizeBackupGuestType(record.subjectType) + if instance == "" || guestType == "" { + return false + } + return scope.PVEOrphanInventoryReady[instance][guestType] +} + +func backupMatchesKnownPVETemplate(scope *BackupInventoryScope, record backupRecord) bool { + if scope == nil || len(scope.PVETemplateSubjects) == 0 || record.source != "PVE" { + return false + } + vmid, err := strconv.Atoi(strings.TrimSpace(record.vmID)) + if err != nil || vmid <= 0 { + return false + } + key := BuildBackupPVETemplateSubjectKey(record.instance, record.subjectType, record.node, vmid) + if key == "" { + return false + } + _, exists := scope.PVETemplateSubjects[key] + return exists +} + func canonicalGuestResourceType(guestType string) unifiedresources.ResourceType { switch strings.ToLower(strings.TrimSpace(guestType)) { case "lxc": @@ -6235,6 +6297,12 @@ func canonicalBackupSubjectResourceType(record backupRecord) unifiedresources.Re if record.lookup.Type != "" { return canonicalGuestResourceType(record.lookup.Type) } + switch normalizeBackupGuestType(record.subjectType) { + case "lxc": + return unifiedresources.ResourceTypeSystemContainer + case "qemu": + return unifiedresources.ResourceTypeVM + } if strings.TrimSpace(record.vmID) != "" { return unifiedresources.ResourceTypeVM } @@ -6534,6 +6602,17 @@ func (m *Manager) CheckBackups( rollups []recovery.ProtectionRollup, guestsByKey map[string]GuestLookup, guestsByVMID map[string][]GuestLookup, +) { + m.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, nil) +} + +// CheckBackupsWithInventory evaluates backup rollups with optional monitoring +// inventory readiness for orphan detection. 
+func (m *Manager) CheckBackupsWithInventory( + rollups []recovery.ProtectionRollup, + guestsByKey map[string]GuestLookup, + guestsByVMID map[string][]GuestLookup, + inventoryScope *BackupInventoryScope, ) { m.mu.RLock() enabled := m.config.Enabled @@ -6597,9 +6676,13 @@ func (m *Manager) CheckBackups( instance string node string vmID string + subjectType string ) ref := rollup.SubjectRef + if ref != nil { + subjectType = normalizeBackupGuestType(ref.Type) + } // Primary: subjectRef.ID is the canonical proxmox guest source ID (instance:node:vmid) when linked. if ref != nil && strings.TrimSpace(ref.ID) != "" { @@ -6667,6 +6750,7 @@ func (m *Manager) CheckBackups( fallbackName: displayName, instance: instance, node: node, + subjectType: subjectType, source: source, rollupID: strings.TrimSpace(rollup.RollupID), providers: providers, @@ -6719,6 +6803,12 @@ func (m *Manager) CheckBackups( continue } if record.vmID != "" && record.lookup.ResourceID == "" { + if backupMatchesKnownPVETemplate(inventoryScope, *record) { + continue + } + if !backupOrphanInventoryReady(inventoryScope, *record) { + continue + } if currentBackupCfg.AlertOrphaned != nil && !*currentBackupCfg.AlertOrphaned { continue } diff --git a/internal/alerts/alerts_test.go b/internal/alerts/alerts_test.go index 0b9de9898..4ef4734a4 100644 --- a/internal/alerts/alerts_test.go +++ b/internal/alerts/alerts_test.go @@ -1740,6 +1740,147 @@ func TestCheckBackupsSkipsOrphanedWhenDisabled(t *testing.T) { } } +func TestCheckBackupsSkipsPVEOrphanUntilInventoryReady(t *testing.T) { + m := newTestManager(t) + m.ClearActiveAlerts() + + alertOrphaned := true + m.mu.Lock() + m.config.Enabled = true + m.config.BackupDefaults = BackupAlertConfig{ + Enabled: true, + WarningDays: 3, + CriticalDays: 5, + AlertOrphaned: &alertOrphaned, + } + m.mu.Unlock() + + now := time.Now() + rollups := []recovery.ProtectionRollup{ + { + RollupID: "res:vm:proxmox:inst:node:700", + SubjectRef: &recovery.ExternalRef{ + Type: "proxmox-vm", + 
Namespace: "inst", + Name: "700", + ID: "inst:node:700", + Class: "node", + }, + LastSuccessAt: ptrTime(now.Add(-6 * 24 * time.Hour)), + LastOutcome: recovery.OutcomeSuccess, + Providers: []recovery.Provider{recovery.ProviderProxmoxPVE}, + }, + } + + m.CheckBackupsWithInventory(rollups, map[string]GuestLookup{}, map[string][]GuestLookup{}, &BackupInventoryScope{ + PVEOrphanInventoryReady: map[string]map[string]bool{}, + }) + + m.mu.RLock() + defer m.mu.RUnlock() + for storageKey, alert := range m.activeAlerts { + if strings.HasPrefix(effectiveAlertID(alert, storageKey), "backup-age-") { + t.Fatalf("expected no PVE orphan alert before inventory readiness, found %s", effectiveAlertID(alert, storageKey)) + } + } +} + +func TestCheckBackupsCreatesPVEOrphanWhenInventoryReady(t *testing.T) { + m := newTestManager(t) + m.ClearActiveAlerts() + + alertOrphaned := true + m.mu.Lock() + m.config.Enabled = true + m.config.BackupDefaults = BackupAlertConfig{ + Enabled: true, + WarningDays: 3, + CriticalDays: 5, + AlertOrphaned: &alertOrphaned, + } + m.mu.Unlock() + + now := time.Now() + rollups := []recovery.ProtectionRollup{ + { + RollupID: "res:vm:proxmox:inst:node:701", + SubjectRef: &recovery.ExternalRef{ + Type: "proxmox-vm", + Namespace: "inst", + Name: "701", + ID: "inst:node:701", + Class: "node", + }, + LastSuccessAt: ptrTime(now.Add(-6 * 24 * time.Hour)), + LastOutcome: recovery.OutcomeSuccess, + Providers: []recovery.Provider{recovery.ProviderProxmoxPVE}, + }, + } + + m.CheckBackupsWithInventory(rollups, map[string]GuestLookup{}, map[string][]GuestLookup{}, &BackupInventoryScope{ + PVEOrphanInventoryReady: map[string]map[string]bool{ + "inst": {"qemu": true}, + }, + }) + + m.mu.RLock() + defer m.mu.RUnlock() + if !testHasActiveAlert(t, m, buildCanonicalStateID("inst:node:701", "inst:node:701-backup-age")) { + t.Fatalf("expected PVE orphan backup alert after qemu inventory is ready") + } +} + +func TestCheckBackupsSkipsKnownPVETemplateBackupSubject(t *testing.T) { + m 
:= newTestManager(t) + m.ClearActiveAlerts() + + alertOrphaned := true + m.mu.Lock() + m.config.Enabled = true + m.config.BackupDefaults = BackupAlertConfig{ + Enabled: true, + WarningDays: 3, + CriticalDays: 5, + AlertOrphaned: &alertOrphaned, + } + m.mu.Unlock() + + now := time.Now() + rollups := []recovery.ProtectionRollup{ + { + RollupID: "res:vm:proxmox:inst:node:702", + SubjectRef: &recovery.ExternalRef{ + Type: "proxmox-vm", + Namespace: "inst", + Name: "template-702", + ID: "inst:node:702", + Class: "node", + }, + LastSuccessAt: ptrTime(now.Add(-6 * 24 * time.Hour)), + LastOutcome: recovery.OutcomeSuccess, + Providers: []recovery.Provider{recovery.ProviderProxmoxPVE}, + }, + } + + templateKey := BuildBackupPVETemplateSubjectKey("inst", "qemu", "node", 702) + m.CheckBackupsWithInventory(rollups, map[string]GuestLookup{}, map[string][]GuestLookup{}, &BackupInventoryScope{ + PVEOrphanInventoryReady: map[string]map[string]bool{ + "inst": {"qemu": true}, + }, + PVETemplateSubjects: map[string]struct{}{ + templateKey: {}, + }, + }) + + m.mu.RLock() + defer m.mu.RUnlock() + for storageKey, alert := range m.activeAlerts { + if strings.HasPrefix(effectiveAlertID(alert, storageKey), "backup-age-") { + t.Fatalf("expected known template backup subject to be skipped, found %s", effectiveAlertID(alert, storageKey)) + } + } +} + func TestCheckBackupsIgnoresVMIDs(t *testing.T) { m := newTestManager(t) m.ClearActiveAlerts() diff --git a/internal/monitoring/canonical_guardrails_test.go b/internal/monitoring/canonical_guardrails_test.go index e3941db84..a9059d3b3 100644 --- a/internal/monitoring/canonical_guardrails_test.go +++ b/internal/monitoring/canonical_guardrails_test.go @@ -450,6 +450,45 @@ func TestProxmoxGuestDiskInventoryPrefersCanonicalLinkedHostAgentSource(t *testi } } +func TestBackupOrphanDetectionUsesCanonicalInventoryReadinessScope(t *testing.T) { + requiredSnippets := map[string][]string{ + "monitor.go": { + "pveBackupInventoryReady 
map[string]map[string]bool", + "pveBackupTemplateSubjects map[string]map[string]struct{}", + }, + "monitor_backups.go": { + "func (m *Monitor) updatePVEBackupTemplateSubjectsForType(instanceName, guestType string, subjects map[string]struct{}) {", + "func (m *Monitor) updatePVEBackupTemplateSubjectsFromClusterResources(instanceName string, resources []proxmox.ClusterResource) {", + "func (m *Monitor) backupInventoryScopeForAlerts() *alerts.BackupInventoryScope {", + "m.alertManager.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, m.backupInventoryScopeForAlerts())", + }, + "monitor_pve_guest_poll.go": { + "m.updatePVEBackupTemplateSubjectsFromClusterResources(instanceName, resources)", + }, + "monitor_polling_vm.go": { + `pveBackupTemplateSubjectKey(instanceName, "qemu", n.Node, vm.VMID)`, + `m.updatePVEBackupTemplateSubjectsForType(instanceName, "qemu", qemuTemplateSubjects)`, + }, + "monitor_polling_containers.go": { + `pveBackupTemplateSubjectKey(instanceName, "lxc", n.Node, int(container.VMID))`, + `m.updatePVEBackupTemplateSubjectsForType(instanceName, "lxc", lxcTemplateSubjects)`, + }, + } + + for file, snippets := range requiredSnippets { + data, err := os.ReadFile(file) + if err != nil { + t.Fatalf("failed to read %s: %v", file, err) + } + source := string(data) + for _, snippet := range snippets { + if !strings.Contains(source, snippet) { + t.Fatalf("%s must contain %q", file, snippet) + } + } + } +} + func TestStoragePollingUsesCanonicalPoolMetadataForZFSAttachment(t *testing.T) { data, err := os.ReadFile("monitor_polling_storage.go") if err != nil { diff --git a/internal/monitoring/memory_trust_characterization_test.go b/internal/monitoring/memory_trust_characterization_test.go index f717c459e..79d18102f 100644 --- a/internal/monitoring/memory_trust_characterization_test.go +++ b/internal/monitoring/memory_trust_characterization_test.go @@ -403,6 +403,62 @@ func TestPollVMsWithNodesPreservesProxmoxPool(t *testing.T) { } } +func 
TestPollVMsWithNodesRecordsQEMUTemplateBackupInventoryReadiness(t *testing.T) { + t.Setenv("PULSE_DATA_DIR", t.TempDir()) + + mon := newTestPVEMonitor("test") + defer mon.alertManager.Stop() + defer mon.notificationMgr.Stop() + + client := &vmMemoryTrustStubClient{ + stubPVEClient: &stubPVEClient{}, + vms: []proxmox.VM{ + { + VMID: 900, + Name: "tmpl-900", + Node: "node1", + Status: "stopped", + Template: 1, + MaxMem: 8 * 1024, + CPUs: 2, + }, + { + VMID: 101, + Name: "vm-101", + Node: "node1", + Status: "stopped", + MaxMem: 8 * 1024, + Mem: 2 * 1024, + CPUs: 2, + }, + }, + } + + nodes := []proxmox.Node{{Node: "node1", Status: "online"}} + nodeEffectiveStatus := map[string]string{"node1": "online"} + mon.pollVMsWithNodes(context.Background(), "test", "", false, client, nodes, nodeEffectiveStatus) + + vms := mon.state.GetSnapshot().VMs + if len(vms) != 1 { + t.Fatalf("expected only non-template VM in runtime state, got %d", len(vms)) + } + if got := vms[0].VMID; got != 101 { + t.Fatalf("runtime VMID = %d, want 101", got) + } + + scope := mon.backupInventoryScopeForAlerts() + if scope == nil { + t.Fatal("expected backup inventory scope") + } + if !scope.PVEOrphanInventoryReady["test"]["qemu"] { + t.Fatalf("expected qemu backup orphan inventory readiness for test instance") + } + templateSubject := pveBackupTemplateSubjectKey("test", "qemu", "node1", 900) + if _, ok := scope.PVETemplateSubjects[templateSubject]; !ok { + t.Fatalf("expected template subject %q in backup inventory scope", templateSubject) + } +} + func TestPollVMsWithNodesMemoryTrustCharacterization(t *testing.T) { t.Setenv("PULSE_DATA_DIR", t.TempDir()) diff --git a/internal/monitoring/monitor.go b/internal/monitoring/monitor.go index c5169678c..af0fa014e 100644 --- a/internal/monitoring/monitor.go +++ b/internal/monitoring/monitor.go @@ -941,6 +941,8 @@ type Monitor struct { lastPhysicalDiskPoll map[string]time.Time // Track last physical disk poll time per instance lastPVEBackupPoll 
map[string]time.Time // Track last PVE backup poll per instance lastPBSBackupPoll map[string]time.Time // Track last PBS backup poll per instance + pveBackupInventoryReady map[string]map[string]bool // Track PVE guest inventory readiness for backup orphan detection + pveBackupTemplateSubjects map[string]map[string]struct{} // Track template VMIDs excluded from runtime workloads but valid for backups backupPermissionWarnings map[string]string // Track backup permission issues per instance (instance -> warning message) persistence *config.ConfigPersistence // Add persistence for saving updated configs pbsBackupPollers map[string]bool // Track PBS backup polling goroutines per instance @@ -1486,6 +1488,8 @@ func New(cfg *config.Config) (*Monitor, error) { lastPhysicalDiskPoll: make(map[string]time.Time), lastPVEBackupPoll: make(map[string]time.Time), lastPBSBackupPoll: make(map[string]time.Time), + pveBackupInventoryReady: make(map[string]map[string]bool), + pveBackupTemplateSubjects: make(map[string]map[string]struct{}), backupPermissionWarnings: make(map[string]string), persistence: config.NewConfigPersistence(cfg.DataPath), pbsBackupPollers: make(map[string]bool), diff --git a/internal/monitoring/monitor_agents.go b/internal/monitoring/monitor_agents.go index 6c61e3117..092e2bc73 100644 --- a/internal/monitoring/monitor_agents.go +++ b/internal/monitoring/monitor_agents.go @@ -2600,6 +2600,8 @@ func (m *Monitor) cleanupTrackingMaps(now time.Time) { for instanceID, ts := range m.lastPVEBackupPoll { if ts.Before(cutoff) { delete(m.lastPVEBackupPoll, instanceID) + delete(m.pveBackupInventoryReady, instanceID) + delete(m.pveBackupTemplateSubjects, instanceID) cleaned++ } } diff --git a/internal/monitoring/monitor_alerts.go b/internal/monitoring/monitor_alerts.go index 5cb37d5df..e1b5283a6 100644 --- a/internal/monitoring/monitor_alerts.go +++ b/internal/monitoring/monitor_alerts.go @@ -303,7 +303,7 @@ func (m *Monitor) checkMockAlerts() { if err != nil { 
log.Warn().Err(err).Msg("Failed to list recovery rollups for backup alerts") } else { - m.alertManager.CheckBackups(rollups, guestsByKey, guestsByVMID) + m.alertManager.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, m.backupInventoryScopeForAlerts()) } // Limit how many guests we check per cycle to prevent blocking with large datasets diff --git a/internal/monitoring/monitor_backups.go b/internal/monitoring/monitor_backups.go index 657dbb93d..4649e5337 100644 --- a/internal/monitoring/monitor_backups.go +++ b/internal/monitoring/monitor_backups.go @@ -20,6 +20,100 @@ import ( "github.com/rs/zerolog/log" ) +func pveBackupTemplateSubjectKey(instance, guestType, node string, vmid int) string { + return alerts.BuildBackupPVETemplateSubjectKey(instance, guestType, node, vmid) +} + +func (m *Monitor) updatePVEBackupTemplateSubjectsForType(instanceName, guestType string, subjects map[string]struct{}) { + if m == nil { + return + } + instanceName = strings.TrimSpace(instanceName) + guestType = strings.TrimSpace(guestType) + if instanceName == "" || guestType == "" { + return + } + + m.mu.Lock() + defer m.mu.Unlock() + if m.pveBackupInventoryReady == nil { + m.pveBackupInventoryReady = make(map[string]map[string]bool) + } + if m.pveBackupInventoryReady[instanceName] == nil { + m.pveBackupInventoryReady[instanceName] = make(map[string]bool) + } + m.pveBackupInventoryReady[instanceName][guestType] = true + + if m.pveBackupTemplateSubjects == nil { + m.pveBackupTemplateSubjects = make(map[string]map[string]struct{}) + } + existing := m.pveBackupTemplateSubjects[instanceName] + if existing == nil { + existing = make(map[string]struct{}) + } + prefix := instanceName + "\x00" + guestType + "\x00" + for key := range existing { + if strings.HasPrefix(key, prefix) { + delete(existing, key) + } + } + for key := range subjects { + if key != "" { + existing[key] = struct{}{} + } + } + m.pveBackupTemplateSubjects[instanceName] = existing +} + +func (m *Monitor) 
updatePVEBackupTemplateSubjectsFromClusterResources(instanceName string, resources []proxmox.ClusterResource) { + qemuTemplates := make(map[string]struct{}) + lxcTemplates := make(map[string]struct{}) + for _, res := range resources { + if res.Template != 1 { + continue + } + switch strings.TrimSpace(res.Type) { + case "qemu": + if key := pveBackupTemplateSubjectKey(instanceName, "qemu", res.Node, res.VMID); key != "" { + qemuTemplates[key] = struct{}{} + } + case "lxc": + if key := pveBackupTemplateSubjectKey(instanceName, "lxc", res.Node, res.VMID); key != "" { + lxcTemplates[key] = struct{}{} + } + } + } + m.updatePVEBackupTemplateSubjectsForType(instanceName, "qemu", qemuTemplates) + m.updatePVEBackupTemplateSubjectsForType(instanceName, "lxc", lxcTemplates) +} + +func (m *Monitor) backupInventoryScopeForAlerts() *alerts.BackupInventoryScope { + if m == nil { + return nil + } + scope := &alerts.BackupInventoryScope{ + PVEOrphanInventoryReady: make(map[string]map[string]bool), + PVETemplateSubjects: make(map[string]struct{}), + } + m.mu.RLock() + defer m.mu.RUnlock() + for instance, readyByType := range m.pveBackupInventoryReady { + if len(readyByType) == 0 { + continue + } + scope.PVEOrphanInventoryReady[instance] = make(map[string]bool, len(readyByType)) + for guestType, ready := range readyByType { + scope.PVEOrphanInventoryReady[instance][guestType] = ready + } + } + for _, subjects := range m.pveBackupTemplateSubjects { + for key := range subjects { + scope.PVETemplateSubjects[key] = struct{}{} + } + } + return scope +} + func (m *Monitor) pollStorageBackupsWithNodes(ctx context.Context, instanceName string, client PVEClientInterface, nodes []proxmox.Node, nodeEffectiveStatus map[string]string) { var allBackups []models.StorageBackup @@ -267,7 +361,7 @@ func (m *Monitor) pollStorageBackupsWithNodes(ctx context.Context, instanceName if err != nil { log.Warn().Err(err).Msg("Failed to list recovery rollups for backup alerts") } else { - 
m.alertManager.CheckBackups(rollups, guestsByKey, guestsByVMID) + m.alertManager.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, m.backupInventoryScopeForAlerts()) } } @@ -1301,7 +1395,7 @@ func (m *Monitor) pollPBSBackups(ctx context.Context, instanceName string, clien if err != nil { log.Warn().Err(err).Msg("Failed to list recovery rollups for backup alerts") } else { - m.alertManager.CheckBackups(rollups, guestsByKey, guestsByVMID) + m.alertManager.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, m.backupInventoryScopeForAlerts()) } } diff --git a/internal/monitoring/monitor_cluster_dedupe.go b/internal/monitoring/monitor_cluster_dedupe.go index 3b30bf893..8694cebd0 100644 --- a/internal/monitoring/monitor_cluster_dedupe.go +++ b/internal/monitoring/monitor_cluster_dedupe.go @@ -87,6 +87,8 @@ func (m *Monitor) retirePVEInstanceRuntime(instanceName string) { delete(m.lastClusterCheck, instanceName) delete(m.lastPhysicalDiskPoll, instanceName) delete(m.lastPVEBackupPoll, instanceName) + delete(m.pveBackupInventoryReady, instanceName) + delete(m.pveBackupTemplateSubjects, instanceName) delete(m.backupPermissionWarnings, instanceName) delete(m.authFailures, instanceName) delete(m.authFailures, string(InstanceTypePVE)+"-"+instanceName) diff --git a/internal/monitoring/monitor_host_agents_test.go b/internal/monitoring/monitor_host_agents_test.go index 0374c135f..f724b618e 100644 --- a/internal/monitoring/monitor_host_agents_test.go +++ b/internal/monitoring/monitor_host_agents_test.go @@ -241,6 +241,50 @@ func TestEvaluateHostAgentsClearsAlertWhenHostReturns(t *testing.T) { } } +func TestCleanupTrackingMapsClearsStalePVEBackupInventoryScope(t *testing.T) { + now := time.Now() + stale := now.Add(-25 * time.Hour) + fresh := now.Add(-time.Hour) + staleSubject := pveBackupTemplateSubjectKey("pve-stale", "qemu", "node1", 900) + freshSubject := pveBackupTemplateSubjectKey("pve-fresh", "qemu", "node1", 901) + + monitor := &Monitor{ + 
lastPVEBackupPoll: map[string]time.Time{ + "pve-stale": stale, + "pve-fresh": fresh, + }, + pveBackupInventoryReady: map[string]map[string]bool{ + "pve-stale": {"qemu": true}, + "pve-fresh": {"qemu": true}, + }, + pveBackupTemplateSubjects: map[string]map[string]struct{}{ + "pve-stale": {staleSubject: {}}, + "pve-fresh": {freshSubject: {}}, + }, + } + + monitor.cleanupTrackingMaps(now) + + if _, ok := monitor.lastPVEBackupPoll["pve-stale"]; ok { + t.Fatalf("expected stale PVE backup poll marker to be removed") + } + if _, ok := monitor.pveBackupInventoryReady["pve-stale"]; ok { + t.Fatalf("expected stale PVE backup inventory readiness to be removed") + } + if _, ok := monitor.pveBackupTemplateSubjects["pve-stale"]; ok { + t.Fatalf("expected stale PVE backup template subjects to be removed") + } + if _, ok := monitor.lastPVEBackupPoll["pve-fresh"]; !ok { + t.Fatalf("expected fresh PVE backup poll marker to remain") + } + if !monitor.pveBackupInventoryReady["pve-fresh"]["qemu"] { + t.Fatalf("expected fresh PVE backup inventory readiness to remain") + } + if _, ok := monitor.pveBackupTemplateSubjects["pve-fresh"][freshSubject]; !ok { + t.Fatalf("expected fresh PVE backup template subject to remain") + } +} + func TestApplyHostReportAllowsTokenReuseAcrossHosts(t *testing.T) { t.Helper() diff --git a/internal/monitoring/monitor_metadata_test.go b/internal/monitoring/monitor_metadata_test.go index ff05f8096..12f3d3e65 100644 --- a/internal/monitoring/monitor_metadata_test.go +++ b/internal/monitoring/monitor_metadata_test.go @@ -9,6 +9,7 @@ import ( "github.com/rcourtman/pulse-go-rewrite/internal/config" "github.com/rcourtman/pulse-go-rewrite/internal/models" "github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources" + "github.com/rcourtman/pulse-go-rewrite/pkg/proxmox" ) func TestPersistGuestIdentity_Concurrent(t *testing.T) { @@ -173,3 +174,37 @@ func TestBuildGuestLookupsFromReadState_UsesPersistedMetadataWhenCanonicalStateE t.Fatalf("expected persisted 
metadata fallback, got %+v", byVMID["100"]) } } + +func TestPVEBackupTemplateInventoryScopeFromClusterResources(t *testing.T) { + m := &Monitor{} + + m.updatePVEBackupTemplateSubjectsFromClusterResources("pve-a", []proxmox.ClusterResource{ + {Type: "qemu", Node: "node-a", VMID: 700, Template: 1}, + {Type: "lxc", Node: "node-b", VMID: 701, Template: 1}, + {Type: "qemu", Node: "node-c", VMID: 702, Template: 0}, + }) + + scope := m.backupInventoryScopeForAlerts() + if scope == nil { + t.Fatalf("expected backup inventory scope") + } + if !scope.PVEOrphanInventoryReady["pve-a"]["qemu"] { + t.Fatalf("expected qemu backup inventory to be marked ready") + } + if !scope.PVEOrphanInventoryReady["pve-a"]["lxc"] { + t.Fatalf("expected lxc backup inventory to be marked ready") + } + + qemuTemplate := alerts.BuildBackupPVETemplateSubjectKey("pve-a", "qemu", "node-a", 700) + if _, exists := scope.PVETemplateSubjects[qemuTemplate]; !exists { + t.Fatalf("expected qemu template subject to be captured") + } + lxcTemplate := alerts.BuildBackupPVETemplateSubjectKey("pve-a", "lxc", "node-b", 701) + if _, exists := scope.PVETemplateSubjects[lxcTemplate]; !exists { + t.Fatalf("expected lxc template subject to be captured") + } + nonTemplate := alerts.BuildBackupPVETemplateSubjectKey("pve-a", "qemu", "node-c", 702) + if _, exists := scope.PVETemplateSubjects[nonTemplate]; exists { + t.Fatalf("did not expect non-template subject to be captured") + } +} diff --git a/internal/monitoring/monitor_polling_containers.go b/internal/monitoring/monitor_polling_containers.go index 41e421826..9924a6349 100644 --- a/internal/monitoring/monitor_polling_containers.go +++ b/internal/monitoring/monitor_polling_containers.go @@ -17,9 +17,10 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri // Channel to collect container results from each node type nodeResult struct { - node string - containers []models.Container - err error + node string + containers []models.Container + 
templateSubjects map[string]struct{} + err error } resultChan := make(chan nodeResult, len(nodes)) @@ -81,11 +82,15 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri rootUsageOverrides := m.collectContainerRootUsage(ctx, client, n.Node, vmIDs) var nodeContainers []models.Container + nodeTemplateSubjects := make(map[string]struct{}) // Process each container for _, container := range containers { // Skip templates if container.Template == 1 { + if key := pveBackupTemplateSubjectKey(instanceName, "lxc", n.Node, int(container.VMID)); key != "" { + nodeTemplateSubjects[key] = struct{}{} + } continue } @@ -269,7 +274,7 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri Dur("duration", nodeDuration). Msg("Node container polling completed") - resultChan <- nodeResult{node: n.Node, containers: nodeContainers} + resultChan <- nodeResult{node: n.Node, containers: nodeContainers, templateSubjects: nodeTemplateSubjects} }(node) } @@ -281,6 +286,7 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri // Collect results from all nodes var allContainers []models.Container + lxcTemplateSubjects := make(map[string]struct{}) successfulNodes := 0 failedNodes := 0 @@ -290,8 +296,14 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri } else { successfulNodes++ allContainers = append(allContainers, result.containers...) 
+ for key := range result.templateSubjects { + lxcTemplateSubjects[key] = struct{}{} + } } } + if failedNodes == 0 && successfulNodes > 0 { + m.updatePVEBackupTemplateSubjectsForType(instanceName, "lxc", lxcTemplateSubjects) + } // If we got ZERO containers but had containers before (likely cluster health issue), // preserve previous containers instead of clearing them diff --git a/internal/monitoring/monitor_polling_vm.go b/internal/monitoring/monitor_polling_vm.go index a04964573..6e896a8bf 100644 --- a/internal/monitoring/monitor_polling_vm.go +++ b/internal/monitoring/monitor_polling_vm.go @@ -19,9 +19,10 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu // Channel to collect VM results from each node type nodeResult struct { - node string - vms []models.VM - err error + node string + vms []models.VM + templateSubjects map[string]struct{} + err error } resultChan := make(chan nodeResult, len(nodes)) @@ -74,11 +75,15 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu } var nodeVMs []models.VM + nodeTemplateSubjects := make(map[string]struct{}) // Process each VM for _, vm := range vms { // Skip templates if vm.Template == 1 { + if key := pveBackupTemplateSubjectKey(instanceName, "qemu", n.Node, vm.VMID); key != "" { + nodeTemplateSubjects[key] = struct{}{} + } continue } @@ -601,7 +606,7 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu Dur("duration", nodeDuration). 
Msg("Node VM polling completed") - resultChan <- nodeResult{node: n.Node, vms: nodeVMs} + resultChan <- nodeResult{node: n.Node, vms: nodeVMs, templateSubjects: nodeTemplateSubjects} }(node) } @@ -613,6 +618,7 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu // Collect results from all nodes var allVMs []models.VM + qemuTemplateSubjects := make(map[string]struct{}) successfulNodes := 0 failedNodes := 0 @@ -622,8 +628,14 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu } else { successfulNodes++ allVMs = append(allVMs, result.vms...) + for key := range result.templateSubjects { + qemuTemplateSubjects[key] = struct{}{} + } } } + if failedNodes == 0 && successfulNodes > 0 { + m.updatePVEBackupTemplateSubjectsForType(instanceName, "qemu", qemuTemplateSubjects) + } // If we got ZERO VMs but had VMs before (likely cluster health issue), // preserve previous VMs instead of clearing them diff --git a/internal/monitoring/monitor_pve_guest_poll.go b/internal/monitoring/monitor_pve_guest_poll.go index 75fe1bd57..a5d0f5ca3 100644 --- a/internal/monitoring/monitor_pve_guest_poll.go +++ b/internal/monitoring/monitor_pve_guest_poll.go @@ -25,6 +25,7 @@ func (m *Monitor) pollVMsAndContainersEfficient(ctx context.Context, instanceNam log.Debug().Err(err).Str("instance", instanceName).Msg("cluster/resources not available, falling back to traditional polling") return false } + m.updatePVEBackupTemplateSubjectsFromClusterResources(instanceName, resources) // Capture previous guest state once per poll cycle so fallback and grace-period // behavior is based on a consistent pre-poll snapshot.