Fix backup orphan inventory readiness

Refs #1352
This commit is contained in:
rcourtman 2026-05-01 19:36:39 +01:00
parent 61cd902ded
commit fe597554c3
18 changed files with 612 additions and 11 deletions

View file

@ -0,0 +1,49 @@
# Known RC Issue Closure For GA Backup Orphan Readiness Record
- Date: `2026-05-01`
- Gate: `known-rc-issue-closure-for-ga`
- Result: `passed`
## Context
The v5 maintenance delta audit found that `#1352` had not been carried into the
v6 backup-alert path. Pulse v5 learned not to run backup orphan detection before
Proxmox template inventory was ready, because backup polling can race ahead of
guest and template discovery during startup.
The v6 runtime no longer evaluates raw storage backup arrays directly; it
evaluates recovery rollups. That made the v5 patch non-cherry-pickable, but the
same failure mode still applied: an old PVE backup whose VMID was not yet in
the current guest/template inventory could raise a false orphaned backup-age
alert.
## Disposition
The v6 candidate now carries an inventory-aware backup alert boundary:
- `internal/alerts/alerts.go` keeps the existing `CheckBackups` API for direct
callers and adds `CheckBackupsWithInventory` for monitoring-owned runtime
evaluation.
- PVE orphaned backup alerts now require per-instance, per-guest-type inventory
readiness before unresolved PVE backup subjects can alert.
- Known Proxmox template VM/container subjects are carried as backup-valid
subjects and skipped from orphaned backup-age alert creation.
- Monitoring records Proxmox template subjects from both the efficient
`cluster/resources` poll and the traditional VM/container poll fallback, and
passes that scoped inventory into backup alert evaluation.
- PBS/PMG rollup behavior remains unchanged, so external backup-only subjects
can still alert when no matching local guest exists.
## Proof
- `go test ./internal/alerts -run 'TestCheckBackups(SkipsPVEOrphanUntilInventoryReady|CreatesPVEOrphanWhenInventoryReady|SkipsKnownPVETemplateBackupSubject|SkipsOrphanedWhenDisabled|HandlesPbsOnlyGuests|VMIDCollision)' -count=1`
- `go test ./internal/monitoring -run 'TestPVEBackupTemplateInventoryScopeFromClusterResources|TestBuildGuestLookupsFromReadState' -count=1`
- `go test ./internal/alerts -count=1`
- `go test ./internal/monitoring -count=1`
## Outcome
The v6 recovery-rollup alert path no longer knowingly regresses v5 `#1352`.
PVE backup orphan alerts wait for the owning Proxmox inventory signal, and
template backups do not become false orphaned backup alerts just because
templates are excluded from normal workload resources.

View file

@ -4403,6 +4403,12 @@
"kind": "file",
"evidence_tier": "managed-runtime-exercise"
},
{
"repo": "pulse",
"path": "docs/release-control/v6/internal/records/known-rc-issue-closure-for-ga-backup-orphan-readiness-2026-05-01.md",
"kind": "file",
"evidence_tier": "managed-runtime-exercise"
},
{
"repo": "pulse",
"path": "docs/release-control/v6/internal/records/known-rc-issue-closure-for-ga-blocked-2026-05-01.md",

View file

@ -156,6 +156,13 @@ and identity first-seen tracking. Generic threshold reevaluation must not keep
or resurrect image-update alerts after their owning Docker alert configuration
has disabled them.
Backup orphan evaluation is also inventory-scoped. The alerts runtime may
evaluate recovery rollups for backup age, but unresolved Proxmox PVE backup
subjects must not be treated as orphaned until monitoring has supplied the
matching per-instance guest-type inventory readiness signal. Known Proxmox
template subjects are valid backup subjects, not orphaned workload backups,
even though templates remain excluded from normal runtime workload resources.
Alert history persistence is also part of that canonical boundary. The history
manager may choose the owned runtime data directory, but it must normalize that
directory once and then resolve only the fixed `alert-history.json` and

View file

@ -865,6 +865,13 @@ record whose canonical status remains `available` whenever any reporting node
still has the shared target active; node-local inactive copies may expand node
affinity, but they must not downgrade the cluster record into an offline
projection just because that node won the capacity sample.
That same monitoring-owned Proxmox backup boundary also owns the inventory
readiness signal used by backup orphan alerts. `internal/monitoring/` must
record when PVE VM and container inventory has successfully observed a given
instance and guest type, including template VMIDs that are intentionally
excluded from normal workload resources. Backup alert evaluation may then
receive that scoped signal from monitoring, but alert code must not infer PVE
orphan readiness from recovery rollups alone.
That same monitoring-owned host-agent ingest boundary now also owns
vendor-managed NAS RAID normalization. `internal/monitoring/monitor_agents.go`
must filter vendor-managed system arrays through the shared

View file

@ -6216,12 +6216,74 @@ type backupRecord struct {
fallbackName string
instance string
node string
subjectType string
source string
rollupID string
providers []recovery.Provider
lastTime time.Time
}
// BackupInventoryScope carries monitoring-owned inventory readiness into backup
// alert evaluation. It keeps orphan detection from racing ahead of Proxmox
// guest/template discovery while preserving the direct CheckBackups API for
// unit tests and non-monitoring callers.
type BackupInventoryScope struct {
// PVEOrphanInventoryReady is read as [instance][guest type], where the guest
// type has been passed through normalizeBackupGuestType (for example "qemu"
// or "lxc"). A nil map turns the readiness gate off; a non-nil map without a
// true entry for an instance/type pair holds back PVE orphan alerts for it.
PVEOrphanInventoryReady map[string]map[string]bool
// PVETemplateSubjects is the set of known Proxmox template subjects, keyed
// by the value returned from BuildBackupPVETemplateSubjectKey.
PVETemplateSubjects map[string]struct{}
}
// BuildBackupPVETemplateSubjectKey returns the lookup key that identifies a
// Proxmox template backup subject inside BackupInventoryScope.PVETemplateSubjects.
//
// The instance and node are whitespace-trimmed and the guest type is
// normalized before the four parts are joined with a NUL separator. An empty
// string is returned when any textual part is empty after normalization or
// when vmid is not positive, so callers can treat "" as "no usable key".
func BuildBackupPVETemplateSubjectKey(instance, guestType, node string, vmid int) string {
	const sep = "\x00"

	trimmedInstance := strings.TrimSpace(instance)
	kind := normalizeBackupGuestType(guestType)
	trimmedNode := strings.TrimSpace(node)

	switch {
	case trimmedInstance == "", kind == "", trimmedNode == "", vmid <= 0:
		return ""
	}
	return trimmedInstance + sep + kind + sep + trimmedNode + sep + strconv.Itoa(vmid)
}
// normalizeBackupGuestType folds the guest-type spellings seen on backup
// subjects into one token: VM aliases become "qemu" and container aliases
// become "lxc". Any other value is returned trimmed and lower-cased, so an
// empty or whitespace-only input yields "".
func normalizeBackupGuestType(guestType string) string {
	normalized := strings.ToLower(strings.TrimSpace(guestType))
	switch normalized {
	case "qemu", "vm", "proxmox-vm":
		return "qemu"
	case "lxc", "ct", "container", "system-container", "proxmox-lxc":
		return "lxc"
	}
	return normalized
}
// backupOrphanInventoryReady reports whether an unresolved backup record may
// be evaluated as an orphan.
//
// The gate only applies to records whose source is "PVE" and only when the
// caller supplied a scope with a non-nil readiness map; every other case is
// treated as ready. For gated records the instance and normalized guest type
// must both be non-empty and must have a true readiness entry.
func backupOrphanInventoryReady(scope *BackupInventoryScope, record backupRecord) bool {
	// No readiness data, or a non-PVE record: nothing to wait for.
	if scope == nil || scope.PVEOrphanInventoryReady == nil || record.source != "PVE" {
		return true
	}

	inst := strings.TrimSpace(record.instance)
	kind := normalizeBackupGuestType(record.subjectType)
	if inst == "" || kind == "" {
		// Without both coordinates the readiness entry cannot be located.
		return false
	}

	readyByType, tracked := scope.PVEOrphanInventoryReady[inst]
	if !tracked {
		return false
	}
	return readyByType[kind]
}
// backupMatchesKnownPVETemplate reports whether a PVE backup record refers to
// a template listed in scope.PVETemplateSubjects.
//
// It returns false for non-PVE records, for a nil scope or empty template
// set, when the record's vmID is not a positive integer, or when no key can
// be built from the record's instance, subject type and node.
func backupMatchesKnownPVETemplate(scope *BackupInventoryScope, record backupRecord) bool {
	if record.source != "PVE" || scope == nil || len(scope.PVETemplateSubjects) == 0 {
		return false
	}

	id, parseErr := strconv.Atoi(strings.TrimSpace(record.vmID))
	if parseErr != nil || id <= 0 {
		return false
	}

	subjectKey := BuildBackupPVETemplateSubjectKey(record.instance, record.subjectType, record.node, id)
	if subjectKey == "" {
		return false
	}

	_, known := scope.PVETemplateSubjects[subjectKey]
	return known
}
func canonicalGuestResourceType(guestType string) unifiedresources.ResourceType {
switch strings.ToLower(strings.TrimSpace(guestType)) {
case "lxc":
@ -6235,6 +6297,12 @@ func canonicalBackupSubjectResourceType(record backupRecord) unifiedresources.Re
if record.lookup.Type != "" {
return canonicalGuestResourceType(record.lookup.Type)
}
switch normalizeBackupGuestType(record.subjectType) {
case "lxc":
return unifiedresources.ResourceTypeSystemContainer
case "qemu":
return unifiedresources.ResourceTypeVM
}
if strings.TrimSpace(record.vmID) != "" {
return unifiedresources.ResourceTypeVM
}
@ -6534,6 +6602,17 @@ func (m *Manager) CheckBackups(
rollups []recovery.ProtectionRollup,
guestsByKey map[string]GuestLookup,
guestsByVMID map[string][]GuestLookup,
) {
m.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, nil)
}
// CheckBackupsWithInventory evaluates backup rollups with optional monitoring
// inventory readiness for orphan detection.
func (m *Manager) CheckBackupsWithInventory(
rollups []recovery.ProtectionRollup,
guestsByKey map[string]GuestLookup,
guestsByVMID map[string][]GuestLookup,
inventoryScope *BackupInventoryScope,
) {
m.mu.RLock()
enabled := m.config.Enabled
@ -6597,9 +6676,13 @@ func (m *Manager) CheckBackups(
instance string
node string
vmID string
subjectType string
)
ref := rollup.SubjectRef
if ref != nil {
subjectType = normalizeBackupGuestType(ref.Type)
}
// Primary: subjectRef.ID is the canonical proxmox guest source ID (instance:node:vmid) when linked.
if ref != nil && strings.TrimSpace(ref.ID) != "" {
@ -6667,6 +6750,7 @@ func (m *Manager) CheckBackups(
fallbackName: displayName,
instance: instance,
node: node,
subjectType: subjectType,
source: source,
rollupID: strings.TrimSpace(rollup.RollupID),
providers: providers,
@ -6719,6 +6803,12 @@ func (m *Manager) CheckBackups(
continue
}
if record.vmID != "" && record.lookup.ResourceID == "" {
if backupMatchesKnownPVETemplate(inventoryScope, *record) {
continue
}
if !backupOrphanInventoryReady(inventoryScope, *record) {
continue
}
if currentBackupCfg.AlertOrphaned != nil && !*currentBackupCfg.AlertOrphaned {
continue
}

View file

@ -1740,6 +1740,147 @@ func TestCheckBackupsSkipsOrphanedWhenDisabled(t *testing.T) {
}
}
// TestCheckBackupsSkipsPVEOrphanUntilInventoryReady verifies that a PVE backup
// older than the critical threshold, with no matching guest, does not raise a
// backup-age alert while the inventory scope has no readiness entry for its
// instance.
func TestCheckBackupsSkipsPVEOrphanUntilInventoryReady(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	alertOrphaned := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   3,
		CriticalDays:  5,
		AlertOrphaned: &alertOrphaned,
	}
	m.mu.Unlock()

	// Six days old: past the five-day critical threshold configured above.
	now := time.Now()
	rollups := []recovery.ProtectionRollup{
		{
			RollupID: "res:vm:proxmox:inst:node:700",
			SubjectRef: &recovery.ExternalRef{
				Type:      "proxmox-vm",
				Namespace: "inst",
				Name:      "700",
				ID:        "inst:node:700",
				Class:     "node",
			},
			LastSuccessAt: ptrTime(now.Add(-6 * 24 * time.Hour)),
			LastOutcome:   recovery.OutcomeSuccess,
			Providers:     []recovery.Provider{recovery.ProviderProxmoxPVE},
		},
	}

	// A non-nil but empty readiness map means "gate active, nothing ready yet".
	m.CheckBackupsWithInventory(rollups, map[string]GuestLookup{}, map[string][]GuestLookup{}, &BackupInventoryScope{
		PVEOrphanInventoryReady: map[string]map[string]bool{},
	})

	m.mu.RLock()
	defer m.mu.RUnlock()
	for storageKey, alert := range m.activeAlerts {
		if id := effectiveAlertID(alert, storageKey); strings.HasPrefix(id, "backup-age-") {
			t.Fatalf("expected no PVE orphan alert before inventory readiness, found %s", id)
		}
	}
	// Also probe the canonical state ID, the same lookup the positive test
	// TestCheckBackupsCreatesPVEOrphanWhenInventoryReady uses to find this kind
	// of alert, so this test cannot pass merely because the effective alert ID
	// does not start with "backup-age-".
	if testHasActiveAlert(t, m, buildCanonicalStateID("inst:node:700", "inst:node:700-backup-age")) {
		t.Fatalf("expected no PVE orphan backup alert before inventory readiness")
	}
}
// TestCheckBackupsCreatesPVEOrphanWhenInventoryReady verifies that once the
// inventory scope reports qemu readiness for the instance, an aged PVE backup
// with no matching guest raises the backup-age alert.
func TestCheckBackupsCreatesPVEOrphanWhenInventoryReady(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	orphanAlertsEnabled := true
	backupDefaults := BackupAlertConfig{
		Enabled:       true,
		WarningDays:   3,
		CriticalDays:  5,
		AlertOrphaned: &orphanAlertsEnabled,
	}
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = backupDefaults
	m.mu.Unlock()

	// Six days old: past the five-day critical threshold.
	sixDaysAgo := time.Now().Add(-6 * 24 * time.Hour)
	orphanRollup := recovery.ProtectionRollup{
		RollupID: "res:vm:proxmox:inst:node:701",
		SubjectRef: &recovery.ExternalRef{
			Type:      "proxmox-vm",
			Namespace: "inst",
			Name:      "701",
			ID:        "inst:node:701",
			Class:     "node",
		},
		LastSuccessAt: ptrTime(sixDaysAgo),
		LastOutcome:   recovery.OutcomeSuccess,
		Providers:     []recovery.Provider{recovery.ProviderProxmoxPVE},
	}
	readyScope := &BackupInventoryScope{
		PVEOrphanInventoryReady: map[string]map[string]bool{
			"inst": {"qemu": true},
		},
	}

	m.CheckBackupsWithInventory(
		[]recovery.ProtectionRollup{orphanRollup},
		map[string]GuestLookup{},
		map[string][]GuestLookup{},
		readyScope,
	)

	m.mu.RLock()
	defer m.mu.RUnlock()
	wantAlertKey := buildCanonicalStateID("inst:node:701", "inst:node:701-backup-age")
	if !testHasActiveAlert(t, m, wantAlertKey) {
		t.Fatalf("expected PVE orphan backup alert after qemu inventory is ready")
	}
}
// TestCheckBackupsSkipsKnownPVETemplateBackupSubject verifies that an aged PVE
// backup whose instance, guest type, node and VMID match a known template
// subject does not raise a backup-age alert, even though qemu inventory is
// marked ready and no guest lookup matches.
func TestCheckBackupsSkipsKnownPVETemplateBackupSubject(t *testing.T) {
	m := newTestManager(t)
	m.ClearActiveAlerts()

	alertOrphaned := true
	m.mu.Lock()
	m.config.Enabled = true
	m.config.BackupDefaults = BackupAlertConfig{
		Enabled:       true,
		WarningDays:   3,
		CriticalDays:  5,
		AlertOrphaned: &alertOrphaned,
	}
	m.mu.Unlock()

	// Six days old: past the five-day critical threshold configured above.
	now := time.Now()
	rollups := []recovery.ProtectionRollup{
		{
			RollupID: "res:vm:proxmox:inst:node:702",
			SubjectRef: &recovery.ExternalRef{
				Type:      "proxmox-vm",
				Namespace: "inst",
				Name:      "template-702",
				ID:        "inst:node:702",
				Class:     "node",
			},
			LastSuccessAt: ptrTime(now.Add(-6 * 24 * time.Hour)),
			LastOutcome:   recovery.OutcomeSuccess,
			Providers:     []recovery.Provider{recovery.ProviderProxmoxPVE},
		},
	}

	templateKey := BuildBackupPVETemplateSubjectKey("inst", "qemu", "node", 702)
	m.CheckBackupsWithInventory(rollups, map[string]GuestLookup{}, map[string][]GuestLookup{}, &BackupInventoryScope{
		PVEOrphanInventoryReady: map[string]map[string]bool{
			"inst": {"qemu": true},
		},
		PVETemplateSubjects: map[string]struct{}{
			templateKey: {},
		},
	})

	m.mu.RLock()
	defer m.mu.RUnlock()
	for storageKey, alert := range m.activeAlerts {
		if id := effectiveAlertID(alert, storageKey); strings.HasPrefix(id, "backup-age-") {
			t.Fatalf("expected known template backup subject to be skipped, found %s", id)
		}
	}
	// Also probe the canonical state ID, the same lookup the positive test
	// TestCheckBackupsCreatesPVEOrphanWhenInventoryReady uses to find this kind
	// of alert, so this test cannot pass merely because the effective alert ID
	// does not start with "backup-age-".
	if testHasActiveAlert(t, m, buildCanonicalStateID("inst:node:702", "inst:node:702-backup-age")) {
		t.Fatalf("expected known template backup subject to be skipped")
	}
}
func TestCheckBackupsIgnoresVMIDs(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()

View file

@ -450,6 +450,45 @@ func TestProxmoxGuestDiskInventoryPrefersCanonicalLinkedHostAgentSource(t *testi
}
}
// TestBackupOrphanDetectionUsesCanonicalInventoryReadinessScope is a source
// guardrail: it reads the listed files from the current directory and checks
// that each one still contains the declarations and call sites that wire
// inventory readiness into backup orphan alert evaluation.
//
// Matching is an exact substring comparison, so whitespace changes in the
// target files (for example column realignment) will fail this test.
//
// Every missing file or snippet is reported with t.Errorf rather than
// t.Fatalf. The files are visited in map order, which is random, so stopping
// at the first problem would surface a different single failure on each run
// and hide the rest.
func TestBackupOrphanDetectionUsesCanonicalInventoryReadinessScope(t *testing.T) {
	requiredSnippets := map[string][]string{
		"monitor.go": {
			"pveBackupInventoryReady map[string]map[string]bool",
			"pveBackupTemplateSubjects map[string]map[string]struct{}",
		},
		"monitor_backups.go": {
			"func (m *Monitor) updatePVEBackupTemplateSubjectsForType(instanceName, guestType string, subjects map[string]struct{}) {",
			"func (m *Monitor) updatePVEBackupTemplateSubjectsFromClusterResources(instanceName string, resources []proxmox.ClusterResource) {",
			"func (m *Monitor) backupInventoryScopeForAlerts() *alerts.BackupInventoryScope {",
			"m.alertManager.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, m.backupInventoryScopeForAlerts())",
		},
		"monitor_pve_guest_poll.go": {
			"m.updatePVEBackupTemplateSubjectsFromClusterResources(instanceName, resources)",
		},
		"monitor_polling_vm.go": {
			`pveBackupTemplateSubjectKey(instanceName, "qemu", n.Node, vm.VMID)`,
			`m.updatePVEBackupTemplateSubjectsForType(instanceName, "qemu", qemuTemplateSubjects)`,
		},
		"monitor_polling_containers.go": {
			`pveBackupTemplateSubjectKey(instanceName, "lxc", n.Node, int(container.VMID))`,
			`m.updatePVEBackupTemplateSubjectsForType(instanceName, "lxc", lxcTemplateSubjects)`,
		},
	}

	for file, snippets := range requiredSnippets {
		data, err := os.ReadFile(file)
		if err != nil {
			// Keep going so the remaining files are still checked.
			t.Errorf("failed to read %s: %v", file, err)
			continue
		}
		source := string(data)
		for _, snippet := range snippets {
			if !strings.Contains(source, snippet) {
				t.Errorf("%s must contain %q", file, snippet)
			}
		}
	}
}
func TestStoragePollingUsesCanonicalPoolMetadataForZFSAttachment(t *testing.T) {
data, err := os.ReadFile("monitor_polling_storage.go")
if err != nil {

View file

@ -403,6 +403,62 @@ func TestPollVMsWithNodesPreservesProxmoxPool(t *testing.T) {
}
}
// TestPollVMsWithNodesRecordsQEMUTemplateBackupInventoryReadiness polls one
// node that reports a template VM and a regular VM, then checks that only the
// regular VM lands in runtime state while the backup inventory scope records
// qemu readiness and the template's subject key.
func TestPollVMsWithNodesRecordsQEMUTemplateBackupInventoryReadiness(t *testing.T) {
	t.Setenv("PULSE_DATA_DIR", t.TempDir())

	mon := newTestPVEMonitor("test")
	defer mon.alertManager.Stop()
	defer mon.notificationMgr.Stop()

	templateVM := proxmox.VM{
		VMID:     900,
		Name:     "tmpl-900",
		Node:     "node1",
		Status:   "stopped",
		Template: 1,
		MaxMem:   8 * 1024,
		CPUs:     2,
	}
	regularVM := proxmox.VM{
		VMID:   101,
		Name:   "vm-101",
		Node:   "node1",
		Status: "stopped",
		MaxMem: 8 * 1024,
		Mem:    2 * 1024,
		CPUs:   2,
	}
	client := &vmMemoryTrustStubClient{
		stubPVEClient: &stubPVEClient{},
		vms:           []proxmox.VM{templateVM, regularVM},
	}
	onlineNodes := []proxmox.Node{{Node: "node1", Status: "online"}}
	effectiveStatus := map[string]string{"node1": "online"}

	mon.pollVMsWithNodes(context.Background(), "test", "", false, client, onlineNodes, effectiveStatus)

	// The template must stay out of the runtime workload list.
	runtimeVMs := mon.state.GetSnapshot().VMs
	if len(runtimeVMs) != 1 {
		t.Fatalf("expected only non-template VM in runtime state, got %d", len(runtimeVMs))
	}
	if got := runtimeVMs[0].VMID; got != 101 {
		t.Fatalf("runtime VMID = %d, want 101", got)
	}

	// The same poll must still surface the template to backup alerting.
	scope := mon.backupInventoryScopeForAlerts()
	if scope == nil {
		t.Fatal("expected backup inventory scope")
	}
	if ready := scope.PVEOrphanInventoryReady["test"]["qemu"]; !ready {
		t.Fatalf("expected qemu backup orphan inventory readiness for test instance")
	}
	templateSubject := pveBackupTemplateSubjectKey("test", "qemu", "node1", 900)
	if _, recorded := scope.PVETemplateSubjects[templateSubject]; !recorded {
		t.Fatalf("expected template subject %q in backup inventory scope", templateSubject)
	}
}
func TestPollVMsWithNodesMemoryTrustCharacterization(t *testing.T) {
t.Setenv("PULSE_DATA_DIR", t.TempDir())

View file

@ -941,6 +941,8 @@ type Monitor struct {
lastPhysicalDiskPoll map[string]time.Time // Track last physical disk poll time per instance
lastPVEBackupPoll map[string]time.Time // Track last PVE backup poll per instance
lastPBSBackupPoll map[string]time.Time // Track last PBS backup poll per instance
pveBackupInventoryReady map[string]map[string]bool // Track PVE guest inventory readiness for backup orphan detection
pveBackupTemplateSubjects map[string]map[string]struct{} // Track template VMIDs excluded from runtime workloads but valid for backups
backupPermissionWarnings map[string]string // Track backup permission issues per instance (instance -> warning message)
persistence *config.ConfigPersistence // Add persistence for saving updated configs
pbsBackupPollers map[string]bool // Track PBS backup polling goroutines per instance
@ -1486,6 +1488,8 @@ func New(cfg *config.Config) (*Monitor, error) {
lastPhysicalDiskPoll: make(map[string]time.Time),
lastPVEBackupPoll: make(map[string]time.Time),
lastPBSBackupPoll: make(map[string]time.Time),
pveBackupInventoryReady: make(map[string]map[string]bool),
pveBackupTemplateSubjects: make(map[string]map[string]struct{}),
backupPermissionWarnings: make(map[string]string),
persistence: config.NewConfigPersistence(cfg.DataPath),
pbsBackupPollers: make(map[string]bool),

View file

@ -2600,6 +2600,8 @@ func (m *Monitor) cleanupTrackingMaps(now time.Time) {
for instanceID, ts := range m.lastPVEBackupPoll {
if ts.Before(cutoff) {
delete(m.lastPVEBackupPoll, instanceID)
delete(m.pveBackupInventoryReady, instanceID)
delete(m.pveBackupTemplateSubjects, instanceID)
cleaned++
}
}

View file

@ -303,7 +303,7 @@ func (m *Monitor) checkMockAlerts() {
if err != nil {
log.Warn().Err(err).Msg("Failed to list recovery rollups for backup alerts")
} else {
m.alertManager.CheckBackups(rollups, guestsByKey, guestsByVMID)
m.alertManager.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, m.backupInventoryScopeForAlerts())
}
// Limit how many guests we check per cycle to prevent blocking with large datasets

View file

@ -20,6 +20,100 @@ import (
"github.com/rs/zerolog/log"
)
// pveBackupTemplateSubjectKey builds the template-subject key for a Proxmox
// guest by delegating to alerts.BuildBackupPVETemplateSubjectKey, so the keys
// recorded by monitoring use the same format the alerts package looks up.
func pveBackupTemplateSubjectKey(instance, guestType, node string, vmid int) string {
return alerts.BuildBackupPVETemplateSubjectKey(instance, guestType, node, vmid)
}
// updatePVEBackupTemplateSubjectsForType records that inventory for one guest
// type of one PVE instance has been observed, and replaces that type's known
// template subjects with the supplied set.
//
// It is a no-op on a nil receiver or when instanceName or guestType is blank
// after trimming. Under m.mu it marks [instanceName][guestType] ready, drops
// every stored subject key carrying the "<instance>\x00<guestType>\x00"
// prefix, and then adds all non-empty keys from subjects. Subjects stored for
// other guest types of the same instance are left in place.
func (m *Monitor) updatePVEBackupTemplateSubjectsForType(instanceName, guestType string, subjects map[string]struct{}) {
	if m == nil {
		return
	}
	instanceName, guestType = strings.TrimSpace(instanceName), strings.TrimSpace(guestType)
	if instanceName == "" || guestType == "" {
		return
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	// Mark this instance/type pair as observed, creating maps lazily so a
	// zero-value Monitor works too.
	if m.pveBackupInventoryReady == nil {
		m.pveBackupInventoryReady = make(map[string]map[string]bool)
	}
	readyByType := m.pveBackupInventoryReady[instanceName]
	if readyByType == nil {
		readyByType = make(map[string]bool)
		m.pveBackupInventoryReady[instanceName] = readyByType
	}
	readyByType[guestType] = true

	if m.pveBackupTemplateSubjects == nil {
		m.pveBackupTemplateSubjects = make(map[string]map[string]struct{})
	}
	known := m.pveBackupTemplateSubjects[instanceName]
	if known == nil {
		known = make(map[string]struct{})
		m.pveBackupTemplateSubjects[instanceName] = known
	}

	// Replace only this guest type's entries; keys of other types survive.
	typePrefix := strings.Join([]string{instanceName, guestType, ""}, "\x00")
	for key := range known {
		if strings.HasPrefix(key, typePrefix) {
			delete(known, key)
		}
	}
	for key := range subjects {
		if key == "" {
			continue
		}
		known[key] = struct{}{}
	}
}
// updatePVEBackupTemplateSubjectsFromClusterResources derives the qemu and lxc
// template subject sets from a cluster resources listing and stores them via
// updatePVEBackupTemplateSubjectsForType.
//
// Only resources with Template == 1 and a type of "qemu" or "lxc" contribute a
// key. Both guest types are always updated, qemu first, even when no template
// of that type was found, so each call marks both types as observed.
func (m *Monitor) updatePVEBackupTemplateSubjectsFromClusterResources(instanceName string, resources []proxmox.ClusterResource) {
	templatesByType := map[string]map[string]struct{}{
		"qemu": {},
		"lxc":  {},
	}

	for _, res := range resources {
		if res.Template != 1 {
			continue
		}
		guestType := strings.TrimSpace(res.Type)
		bucket, tracked := templatesByType[guestType]
		if !tracked {
			// Neither a VM nor a container template.
			continue
		}
		if key := pveBackupTemplateSubjectKey(instanceName, guestType, res.Node, res.VMID); key != "" {
			bucket[key] = struct{}{}
		}
	}

	for _, guestType := range []string{"qemu", "lxc"} {
		m.updatePVEBackupTemplateSubjectsForType(instanceName, guestType, templatesByType[guestType])
	}
}
// backupInventoryScopeForAlerts returns a copy of the monitor's PVE backup
// inventory state in the form the alerts package consumes.
//
// A nil receiver yields nil. Otherwise the result always has non-nil maps:
// readiness is copied per instance (instances with no entries are omitted),
// and the per-instance template subject sets are flattened into one set. The
// copy is taken under m.mu's read lock, so the returned maps are not shared
// with the monitor.
func (m *Monitor) backupInventoryScopeForAlerts() *alerts.BackupInventoryScope {
	if m == nil {
		return nil
	}

	m.mu.RLock()
	defer m.mu.RUnlock()

	ready := make(map[string]map[string]bool)
	for instance, byType := range m.pveBackupInventoryReady {
		if len(byType) == 0 {
			continue
		}
		copied := make(map[string]bool, len(byType))
		for guestType, isReady := range byType {
			copied[guestType] = isReady
		}
		ready[instance] = copied
	}

	templates := make(map[string]struct{})
	for _, perInstance := range m.pveBackupTemplateSubjects {
		for key := range perInstance {
			templates[key] = struct{}{}
		}
	}

	return &alerts.BackupInventoryScope{
		PVEOrphanInventoryReady: ready,
		PVETemplateSubjects:     templates,
	}
}
func (m *Monitor) pollStorageBackupsWithNodes(ctx context.Context, instanceName string, client PVEClientInterface, nodes []proxmox.Node, nodeEffectiveStatus map[string]string) {
var allBackups []models.StorageBackup
@ -267,7 +361,7 @@ func (m *Monitor) pollStorageBackupsWithNodes(ctx context.Context, instanceName
if err != nil {
log.Warn().Err(err).Msg("Failed to list recovery rollups for backup alerts")
} else {
m.alertManager.CheckBackups(rollups, guestsByKey, guestsByVMID)
m.alertManager.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, m.backupInventoryScopeForAlerts())
}
}
@ -1301,7 +1395,7 @@ func (m *Monitor) pollPBSBackups(ctx context.Context, instanceName string, clien
if err != nil {
log.Warn().Err(err).Msg("Failed to list recovery rollups for backup alerts")
} else {
m.alertManager.CheckBackups(rollups, guestsByKey, guestsByVMID)
m.alertManager.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, m.backupInventoryScopeForAlerts())
}
}

View file

@ -87,6 +87,8 @@ func (m *Monitor) retirePVEInstanceRuntime(instanceName string) {
delete(m.lastClusterCheck, instanceName)
delete(m.lastPhysicalDiskPoll, instanceName)
delete(m.lastPVEBackupPoll, instanceName)
delete(m.pveBackupInventoryReady, instanceName)
delete(m.pveBackupTemplateSubjects, instanceName)
delete(m.backupPermissionWarnings, instanceName)
delete(m.authFailures, instanceName)
delete(m.authFailures, string(InstanceTypePVE)+"-"+instanceName)

View file

@ -241,6 +241,50 @@ func TestEvaluateHostAgentsClearsAlertWhenHostReturns(t *testing.T) {
}
}
// TestCleanupTrackingMapsClearsStalePVEBackupInventoryScope checks that
// cleanupTrackingMaps drops the backup inventory readiness and template
// subjects of an instance whose last PVE backup poll is 25 hours old, and
// keeps them for an instance polled one hour ago.
func TestCleanupTrackingMapsClearsStalePVEBackupInventoryScope(t *testing.T) {
	const (
		staleInstance = "pve-stale"
		freshInstance = "pve-fresh"
	)

	now := time.Now()
	staleSubject := pveBackupTemplateSubjectKey(staleInstance, "qemu", "node1", 900)
	freshSubject := pveBackupTemplateSubjectKey(freshInstance, "qemu", "node1", 901)

	monitor := &Monitor{
		lastPVEBackupPoll: map[string]time.Time{
			staleInstance: now.Add(-25 * time.Hour),
			freshInstance: now.Add(-time.Hour),
		},
		pveBackupInventoryReady: map[string]map[string]bool{
			staleInstance: {"qemu": true},
			freshInstance: {"qemu": true},
		},
		pveBackupTemplateSubjects: map[string]map[string]struct{}{
			staleInstance: {staleSubject: {}},
			freshInstance: {freshSubject: {}},
		},
	}

	monitor.cleanupTrackingMaps(now)

	// Everything tied to the stale instance must be gone.
	if _, present := monitor.lastPVEBackupPoll[staleInstance]; present {
		t.Fatalf("expected stale PVE backup poll marker to be removed")
	}
	if _, present := monitor.pveBackupInventoryReady[staleInstance]; present {
		t.Fatalf("expected stale PVE backup inventory readiness to be removed")
	}
	if _, present := monitor.pveBackupTemplateSubjects[staleInstance]; present {
		t.Fatalf("expected stale PVE backup template subjects to be removed")
	}

	// The recently polled instance must be untouched.
	if _, present := monitor.lastPVEBackupPoll[freshInstance]; !present {
		t.Fatalf("expected fresh PVE backup poll marker to remain")
	}
	if ready := monitor.pveBackupInventoryReady[freshInstance]["qemu"]; !ready {
		t.Fatalf("expected fresh PVE backup inventory readiness to remain")
	}
	if _, present := monitor.pveBackupTemplateSubjects[freshInstance][freshSubject]; !present {
		t.Fatalf("expected fresh PVE backup template subject to remain")
	}
}
func TestApplyHostReportAllowsTokenReuseAcrossHosts(t *testing.T) {
t.Helper()

View file

@ -9,6 +9,7 @@ import (
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
)
func TestPersistGuestIdentity_Concurrent(t *testing.T) {
@ -173,3 +174,37 @@ func TestBuildGuestLookupsFromReadState_UsesPersistedMetadataWhenCanonicalStateE
t.Fatalf("expected persisted metadata fallback, got %+v", byVMID["100"])
}
}
// TestPVEBackupTemplateInventoryScopeFromClusterResources feeds a zero-value
// Monitor a cluster resources listing with one qemu template, one lxc template
// and one regular VM. It checks that the resulting alert scope marks both
// guest types ready and contains only the two template subjects.
func TestPVEBackupTemplateInventoryScopeFromClusterResources(t *testing.T) {
	const instance = "pve-a"

	m := &Monitor{}
	clusterResources := []proxmox.ClusterResource{
		{Type: "qemu", Node: "node-a", VMID: 700, Template: 1},
		{Type: "lxc", Node: "node-b", VMID: 701, Template: 1},
		{Type: "qemu", Node: "node-c", VMID: 702, Template: 0},
	}
	m.updatePVEBackupTemplateSubjectsFromClusterResources(instance, clusterResources)

	scope := m.backupInventoryScopeForAlerts()
	if scope == nil {
		t.Fatalf("expected backup inventory scope")
	}

	readyByType := scope.PVEOrphanInventoryReady[instance]
	if !readyByType["qemu"] {
		t.Fatalf("expected qemu backup inventory to be marked ready")
	}
	if !readyByType["lxc"] {
		t.Fatalf("expected lxc backup inventory to be marked ready")
	}

	// hasTemplate reports whether the scope lists the given guest as a template.
	hasTemplate := func(guestType, node string, vmid int) bool {
		key := alerts.BuildBackupPVETemplateSubjectKey(instance, guestType, node, vmid)
		_, exists := scope.PVETemplateSubjects[key]
		return exists
	}
	if !hasTemplate("qemu", "node-a", 700) {
		t.Fatalf("expected qemu template subject to be captured")
	}
	if !hasTemplate("lxc", "node-b", 701) {
		t.Fatalf("expected lxc template subject to be captured")
	}
	if hasTemplate("qemu", "node-c", 702) {
		t.Fatalf("did not expect non-template subject to be captured")
	}
}

View file

@ -17,9 +17,10 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri
// Channel to collect container results from each node
type nodeResult struct {
node string
containers []models.Container
err error
node string
containers []models.Container
templateSubjects map[string]struct{}
err error
}
resultChan := make(chan nodeResult, len(nodes))
@ -81,11 +82,15 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri
rootUsageOverrides := m.collectContainerRootUsage(ctx, client, n.Node, vmIDs)
var nodeContainers []models.Container
nodeTemplateSubjects := make(map[string]struct{})
// Process each container
for _, container := range containers {
// Skip templates
if container.Template == 1 {
if key := pveBackupTemplateSubjectKey(instanceName, "lxc", n.Node, int(container.VMID)); key != "" {
nodeTemplateSubjects[key] = struct{}{}
}
continue
}
@ -269,7 +274,7 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri
Dur("duration", nodeDuration).
Msg("Node container polling completed")
resultChan <- nodeResult{node: n.Node, containers: nodeContainers}
resultChan <- nodeResult{node: n.Node, containers: nodeContainers, templateSubjects: nodeTemplateSubjects}
}(node)
}
@ -281,6 +286,7 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri
// Collect results from all nodes
var allContainers []models.Container
lxcTemplateSubjects := make(map[string]struct{})
successfulNodes := 0
failedNodes := 0
@ -290,8 +296,14 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri
} else {
successfulNodes++
allContainers = append(allContainers, result.containers...)
for key := range result.templateSubjects {
lxcTemplateSubjects[key] = struct{}{}
}
}
}
if failedNodes == 0 && successfulNodes > 0 {
m.updatePVEBackupTemplateSubjectsForType(instanceName, "lxc", lxcTemplateSubjects)
}
// If we got ZERO containers but had containers before (likely cluster health issue),
// preserve previous containers instead of clearing them

View file

@ -19,9 +19,10 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu
// Channel to collect VM results from each node
type nodeResult struct {
node string
vms []models.VM
err error
node string
vms []models.VM
templateSubjects map[string]struct{}
err error
}
resultChan := make(chan nodeResult, len(nodes))
@ -74,11 +75,15 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu
}
var nodeVMs []models.VM
nodeTemplateSubjects := make(map[string]struct{})
// Process each VM
for _, vm := range vms {
// Skip templates
if vm.Template == 1 {
if key := pveBackupTemplateSubjectKey(instanceName, "qemu", n.Node, vm.VMID); key != "" {
nodeTemplateSubjects[key] = struct{}{}
}
continue
}
@ -601,7 +606,7 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu
Dur("duration", nodeDuration).
Msg("Node VM polling completed")
resultChan <- nodeResult{node: n.Node, vms: nodeVMs}
resultChan <- nodeResult{node: n.Node, vms: nodeVMs, templateSubjects: nodeTemplateSubjects}
}(node)
}
@ -613,6 +618,7 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu
// Collect results from all nodes
var allVMs []models.VM
qemuTemplateSubjects := make(map[string]struct{})
successfulNodes := 0
failedNodes := 0
@ -622,8 +628,14 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu
} else {
successfulNodes++
allVMs = append(allVMs, result.vms...)
for key := range result.templateSubjects {
qemuTemplateSubjects[key] = struct{}{}
}
}
}
if failedNodes == 0 && successfulNodes > 0 {
m.updatePVEBackupTemplateSubjectsForType(instanceName, "qemu", qemuTemplateSubjects)
}
// If we got ZERO VMs but had VMs before (likely cluster health issue),
// preserve previous VMs instead of clearing them

View file

@ -25,6 +25,7 @@ func (m *Monitor) pollVMsAndContainersEfficient(ctx context.Context, instanceNam
log.Debug().Err(err).Str("instance", instanceName).Msg("cluster/resources not available, falling back to traditional polling")
return false
}
m.updatePVEBackupTemplateSubjectsFromClusterResources(instanceName, resources)
// Capture previous guest state once per poll cycle so fallback and grace-period
// behavior is based on a consistent pre-poll snapshot.