Fix template backup orphan detection race (#1352)

rcourtman 2026-03-25 10:36:33 +00:00
parent b9c6f504d8
commit 40249947ed
6 changed files with 151 additions and 51 deletions
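
The fix threads a per-instance "template inventory is ready" signal from the monitor into the alert manager, so backup orphan detection no longer races ahead of template discovery. A minimal sketch of the changed call contract, assuming the types shown in the hunks below (the instance name and variables are illustrative, not from the commit):

// CheckBackups gains a sixth parameter: instance -> "template inventory
// has been polled". Passing nil falls back to the legacy live-guest
// heuristic, which keeps direct callers and tests working unchanged.
ready := map[string]bool{"instA": true}
m.CheckBackups(storageBackups, pbsBackups, pmgBackups, guestsByKey, guestsByVMID, ready)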


@ -5562,6 +5562,7 @@ func (m *Manager) CheckBackups(
pmgBackups []models.PMGBackup,
guestsByKey map[string]GuestLookup,
guestsByVMID map[string][]GuestLookup,
templateInventoryReady map[string]bool,
) {
m.mu.RLock()
enabled := m.config.Enabled
@ -5763,16 +5764,24 @@ func (m *Manager) CheckBackups(
return
}
// Build a set of instances that have at least one live guest (non-empty
// ResourceID). Orphan detection is only safe for instances whose guest
// list has been populated — if an instance hasn't been polled yet
// (startup race, auth failure, staggered polling), every backup from
// that instance would look orphaned.
instancesWithLiveGuests := make(map[string]bool)
for _, guests := range guestsByVMID {
for _, g := range guests {
if g.ResourceID != "" && g.Instance != "" {
instancesWithLiveGuests[g.Instance] = true
// Build a set of instances whose inventory is safe to use for orphan detection.
// When the monitor provides a template-inventory readiness map, prefer that
// signal because backup polling can race ahead of template discovery even when
// live guests already exist on the instance. Fall back to the legacy "has at
// least one live guest" heuristic for direct callers/tests that do not pass it.
instancesReadyForOrphanDetection := make(map[string]bool)
if templateInventoryReady != nil {
for instance, ready := range templateInventoryReady {
if ready {
instancesReadyForOrphanDetection[instance] = true
}
}
} else {
for _, guests := range guestsByVMID {
for _, g := range guests {
if g.ResourceID != "" && g.Instance != "" {
instancesReadyForOrphanDetection[g.Instance] = true
}
}
}
}
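
Read in isolation, the construction above amounts to a small helper. A sketch under the diff's own types (the name readyInstances is hypothetical, not part of this commit):

func readyInstances(templateInventoryReady map[string]bool, guestsByVMID map[string][]GuestLookup) map[string]bool {
	ready := make(map[string]bool)
	if templateInventoryReady != nil {
		// Monitor-supplied signal: only instances whose template inventory
		// poll has completed may run orphan detection.
		for instance, ok := range templateInventoryReady {
			if ok {
				ready[instance] = true
			}
		}
		return ready
	}
	// Legacy fallback for callers that pass nil: any live guest marks its
	// instance ready, matching the pre-#1352 behaviour.
	for _, guests := range guestsByVMID {
		for _, g := range guests {
			if g.ResourceID != "" && g.Instance != "" {
				ready[g.Instance] = true
			}
		}
	}
	return ready
}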
@ -5812,15 +5821,15 @@ func (m *Manager) CheckBackups(
continue
}
// Determine whether we have enough inventory to safely run orphan
// detection for this backup. For PVE storage backups the instance
// detection for this backup. For PVE storage backups the instance
// guard is strict: only check when that specific PVE instance has
// been polled. For PBS/PMG backups (which span instances) it's
// enough that *any* instance has live guests.
// completed a template-aware inventory poll. For PBS/PMG backups
// (which span instances) it's enough that any instance is ready.
inventoryReady := false
if record.source == "PVE storage" {
inventoryReady = instancesWithLiveGuests[record.instance]
inventoryReady = instancesReadyForOrphanDetection[record.instance]
} else {
inventoryReady = len(instancesWithLiveGuests) > 0
inventoryReady = len(instancesReadyForOrphanDetection) > 0
}
if record.vmid != "" && record.lookup.ResourceID == "" && inventoryReady {
// Backup has a VMID but no matching live guest in its lookup.
@ -6157,9 +6166,9 @@ func (m *Manager) CheckBackups(
if _, ok := validAlerts[alertID]; ok {
continue
}
// When no instances have live inventory, preserve existing orphan
// When no instances have inventory ready for orphan detection, preserve existing orphan
// alerts rather than clearing them — we can't confirm they're resolved.
if len(instancesWithLiveGuests) == 0 && alert.Type == "backup-orphaned" {
if len(instancesReadyForOrphanDetection) == 0 && alert.Type == "backup-orphaned" {
continue
}
m.clearAlertNoLock(alertID)
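
Taken together, the two hunks above tighten the gate per backup source while keeping the conservative clearing rule. As a hypothetical predicate (the function is illustrative; the source strings match the diff):

// PVE storage backups need their own instance to be ready, because
// guest/template inventory is per-instance. PBS/PMG backups can span
// instances, so any ready instance is enough.
func inventoryReady(source, instance string, ready map[string]bool) bool {
	if source == "PVE storage" {
		return ready[instance]
	}
	return len(ready) > 0
}

With an empty readiness map both branches return false, which is also why the clearing loop above preserves existing backup-orphaned alerts rather than treating them as resolved.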


@ -976,7 +976,7 @@ func TestCheckBackupsCreatesAndClearsAlerts(t *testing.T) {
"100": {guestsByKey[key]},
}
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
alert, exists := m.activeAlerts["backup-age-"+sanitizeAlertKey(key)]
@ -990,7 +990,7 @@ func TestCheckBackupsCreatesAndClearsAlerts(t *testing.T) {
// Recent backup clears alert
storageBackups[0].Time = now
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
_, exists = m.activeAlerts["backup-age-"+sanitizeAlertKey(key)]
@ -1044,7 +1044,7 @@ func TestCheckBackupsRespectsOverrides(t *testing.T) {
}
// 1. Verify warning alert is created with defaults
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
alert, exists := m.activeAlerts["backup-age-"+sanitizeAlertKey(key)]
m.mu.RUnlock()
@ -1064,7 +1064,7 @@ func TestCheckBackupsRespectsOverrides(t *testing.T) {
}
m.UpdateConfig(cfg)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
_, exists = m.activeAlerts["backup-age-"+sanitizeAlertKey(key)]
m.mu.RUnlock()
@ -1081,7 +1081,7 @@ func TestCheckBackupsRespectsOverrides(t *testing.T) {
},
}
m.UpdateConfig(cfg)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
_, exists = m.activeAlerts["backup-age-"+sanitizeAlertKey(key)]
m.mu.RUnlock()
@ -1095,7 +1095,7 @@ func TestCheckBackupsRespectsOverrides(t *testing.T) {
}
m.UpdateConfig(cfg)
storageBackups[0].Time = now.Add(-30 * 24 * time.Hour) // Way past defaults
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
_, exists = m.activeAlerts["backup-age-"+sanitizeAlertKey(key)]
m.mu.RUnlock()
@ -1138,7 +1138,7 @@ func TestCheckBackupsHandlesPbsOnlyGuests(t *testing.T) {
"9999": {guestsByKey[sentinelKey]},
}
m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID)
m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
found := false
@ -1217,7 +1217,7 @@ func TestCheckBackupsDisambiguatesWithNamespace(t *testing.T) {
},
}
m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID)
m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
defer m.mu.RUnlock()
@ -1299,7 +1299,7 @@ func TestCheckBackupsVMIDCollisionNonMatchingNamespace(t *testing.T) {
},
}
m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID)
m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
defer m.mu.RUnlock()
@ -1378,7 +1378,7 @@ func TestCheckBackupsVMIDCollisionNoNamespace(t *testing.T) {
},
}
m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID)
m.CheckBackups(nil, pbsBackups, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
defer m.mu.RUnlock()
@ -1426,7 +1426,7 @@ func TestCheckBackupsHandlesPmgBackups(t *testing.T) {
},
}
m.CheckBackups(nil, nil, pmgBackups, map[string]GuestLookup{}, map[string][]GuestLookup{})
m.CheckBackups(nil, nil, pmgBackups, map[string]GuestLookup{}, map[string][]GuestLookup{}, nil)
m.mu.RLock()
found := false
@ -1483,7 +1483,7 @@ func TestCheckBackupsSkipsOrphanedWhenDisabled(t *testing.T) {
"9999": {guestsByKey[sentinelKey]},
}
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
defer m.mu.RUnlock()
@ -1537,7 +1537,7 @@ func TestCheckBackupsCreatesOrphanedAlert(t *testing.T) {
"9999": {guestsByKey[sentinelKey]},
}
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
defer m.mu.RUnlock()
@ -1609,7 +1609,7 @@ func TestCheckBackupsOrphanedAlertClearsWhenGuestReappears(t *testing.T) {
}
// First cycle: guest 300 absent (only sentinel present) → orphaned alert fires.
m.CheckBackups(storageBackups, nil, nil, sentinelByKey, sentinelByVMID)
m.CheckBackups(storageBackups, nil, nil, sentinelByKey, sentinelByVMID, nil)
m.mu.RLock()
orphanedFound := false
@ -1631,7 +1631,7 @@ func TestCheckBackupsOrphanedAlertClearsWhenGuestReappears(t *testing.T) {
guestsByVMID := map[string][]GuestLookup{
"300": {guestsByKey[guestKey]},
}
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
defer m.mu.RUnlock()
@ -1690,7 +1690,7 @@ func TestCheckBackupsOrphanedIgnoresVMIDs(t *testing.T) {
}
// Both are orphaned (not in inventory), but VMID 200 matches ignore pattern "20*".
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
defer m.mu.RUnlock()
@ -1749,7 +1749,7 @@ func TestCheckBackupsOrphanedWithZeroAgeThresholds(t *testing.T) {
}
// Orphaned guest with zero age thresholds — should still fire orphaned alert.
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
defer m.mu.RUnlock()
@ -1815,7 +1815,7 @@ func TestCheckBackupsOrphanedWithPersistedMetadata(t *testing.T) {
"9999": {guestsByKey[sentinelKey]},
}
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
defer m.mu.RUnlock()
@ -1863,7 +1863,7 @@ func TestCheckBackupsOrphanedSkippedWhenNoLiveInventory(t *testing.T) {
}
// Completely empty guest maps — no live inventory.
m.CheckBackups(storageBackups, nil, nil, map[string]GuestLookup{}, map[string][]GuestLookup{})
m.CheckBackups(storageBackups, nil, nil, map[string]GuestLookup{}, map[string][]GuestLookup{}, nil)
m.mu.RLock()
defer m.mu.RUnlock()
@ -1914,7 +1914,7 @@ func TestCheckBackupsOrphanedPreservedWhenNoLiveInventory(t *testing.T) {
guestsByVMID := map[string][]GuestLookup{
"9999": {guestsByKey[sentinelKey]},
}
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
orphanFound := false
@ -1929,7 +1929,7 @@ func TestCheckBackupsOrphanedPreservedWhenNoLiveInventory(t *testing.T) {
}
// Second cycle: inventory disappears (empty maps) — orphan alert must be preserved.
m.CheckBackups(storageBackups, nil, nil, map[string]GuestLookup{}, map[string][]GuestLookup{})
m.CheckBackups(storageBackups, nil, nil, map[string]GuestLookup{}, map[string][]GuestLookup{}, nil)
m.mu.RLock()
defer m.mu.RUnlock()
@ -1990,7 +1990,7 @@ func TestCheckBackupsOrphanedCrossInstanceVMID(t *testing.T) {
"9999": {guestsByKey[sentinelA]},
}
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
defer m.mu.RUnlock()
@ -2005,6 +2005,64 @@ func TestCheckBackupsOrphanedCrossInstanceVMID(t *testing.T) {
}
}
func TestCheckBackupsSkipsPVEOrphanDetectionUntilTemplateInventoryReady(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
alertOrphaned := true
m.mu.Lock()
m.config.Enabled = true
m.config.BackupDefaults = BackupAlertConfig{
Enabled: true,
WarningDays: 7,
CriticalDays: 14,
AlertOrphaned: &alertOrphaned,
IgnoreVMIDs: []string{},
}
m.mu.Unlock()
now := time.Now()
storageBackups := []models.StorageBackup{
{
ID: "instA-node2-700-backup",
Storage: "local",
Node: "node2",
Instance: "instA",
Type: "qemu",
VMID: 700,
Time: now.Add(-1 * 24 * time.Hour),
},
}
// Simulate the startup/concurrency window where the instance has enough live
// guest data to satisfy the legacy readiness heuristic, but template inventory
// has not been populated yet. This backup must not be treated as orphaned.
sentinelKey := BuildGuestKey("instA", "node3", 9999)
guestsByKey := map[string]GuestLookup{
sentinelKey: {
ResourceID: "qemu/9999",
Name: "sentinel-vm",
Instance: "instA",
Node: "node3",
Type: "qemu",
VMID: 9999,
},
}
guestsByVMID := map[string][]GuestLookup{
"9999": {guestsByKey[sentinelKey]},
}
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, map[string]bool{})
m.mu.RLock()
defer m.mu.RUnlock()
for id := range m.activeAlerts {
if strings.HasPrefix(id, "backup-orphaned-") {
t.Fatalf("expected no orphaned alert before template inventory is ready, found %s", id)
}
}
}
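
The complementary positive case is implied but not asserted by this commit: once template inventory for instA is marked ready, the same backup, still lacking a matching live guest, should produce the orphaned alert. A hedged sketch of that follow-on call (not a test in this diff):

// Template inventory now ready for instA; VMID 700 still has no matching
// guest, so orphan detection is permitted to fire.
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID,
	map[string]bool{"instA": true})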
func TestCheckBackupsIgnoresVMIDs(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
@ -2054,7 +2112,7 @@ func TestCheckBackupsIgnoresVMIDs(t *testing.T) {
"200": {guestsByKey[keyAllowed]},
}
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID)
m.CheckBackups(storageBackups, nil, nil, guestsByKey, guestsByVMID, nil)
m.mu.RLock()
_, ignoredExists := m.activeAlerts["backup-age-"+sanitizeAlertKey(keyIgnored)]


@ -41,6 +41,8 @@ type State struct {
TemperatureMonitoringEnabled bool `json:"temperatureMonitoringEnabled"`
PVETagColors map[string]string `json:"pveTagColors,omitempty"`
// templateVMIDs tracks Proxmox template VMIDs (not shown in UI, used for backup orphan detection).
// The presence of an instance key also acts as the readiness signal that a successful
// resources poll populated template inventory for that instance, even if the map is empty.
// Unexported so it is never serialised to JSON or the API.
templateVMIDs map[string]map[int]string // instance -> VMID -> node
}
@ -1056,7 +1058,7 @@ type Memory struct {
Used int64 `json:"used"`
Free int64 `json:"free"`
Usage float64 `json:"usage"`
Cache int64 `json:"cache,omitempty"` // Reclaimable buff/cache (Available - Free); used + cache + free ≈ total
Cache int64 `json:"cache,omitempty"` // Reclaimable buff/cache (Available - Free); used + cache + free ≈ total
Balloon int64 `json:"balloon,omitempty"`
SwapUsed int64 `json:"swapUsed,omitempty"`
SwapTotal int64 `json:"swapTotal,omitempty"`
@ -3068,14 +3070,16 @@ func (s *State) GetContainers() []Container {
// UpdateTemplateVMIDsForInstance stores the VMID→node mapping for Proxmox template guests
// for a given instance. Templates are excluded from the main VM/container lists (they are not
// monitored) but must be known to the backup orphan-detection logic so their backups are not
// incorrectly flagged as orphaned. Pass nil or an empty map to clear the instance's entries.
// incorrectly flagged as orphaned. Pass nil to clear the instance's entries entirely.
// Passing an empty non-nil map records that inventory was populated and that the
// instance currently has no templates.
func (s *State) UpdateTemplateVMIDsForInstance(instance string, vmids map[int]string) {
s.mu.Lock()
defer s.mu.Unlock()
if s.templateVMIDs == nil {
s.templateVMIDs = make(map[string]map[int]string)
}
if len(vmids) == 0 {
if vmids == nil {
delete(s.templateVMIDs, instance)
} else {
s.templateVMIDs[instance] = vmids
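
The nil-versus-empty distinction gives callers three cases. Illustrative calls (the instance, VMID, and node values are made up):

state.UpdateTemplateVMIDsForInstance("instA", map[int]string{900: "node1"}) // polled; one template on node1
state.UpdateTemplateVMIDsForInstance("instA", map[int]string{})             // polled; no templates, but still counts as ready
state.UpdateTemplateVMIDsForInstance("instA", nil)                          // clear the entry; instance no longer ready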


@ -186,6 +186,31 @@ func TestStateLinkHostAgentToNode(t *testing.T) {
}
}
func TestStateSnapshotPreservesEmptyTemplateInventoryReadiness(t *testing.T) {
state := &State{}
state.UpdateTemplateVMIDsForInstance("instA", map[int]string{})
snapshot := state.GetSnapshot()
if snapshot.TemplateVMIDs == nil {
t.Fatalf("expected TemplateVMIDs map to be initialized in snapshot")
}
templates, exists := snapshot.TemplateVMIDs["instA"]
if !exists {
t.Fatalf("expected empty template inventory for instA to be preserved as readiness signal")
}
if len(templates) != 0 {
t.Fatalf("expected no template VMIDs for instA, got %v", templates)
}
state.UpdateTemplateVMIDsForInstance("instA", nil)
snapshot = state.GetSnapshot()
if _, exists := snapshot.TemplateVMIDs["instA"]; exists {
t.Fatalf("expected nil update to clear template inventory for instA")
}
}
func TestUpdateNodesForInstancePreservesLinkWhenNodeIDChanges(t *testing.T) {
state := &State{
Hosts: []Host{


@ -33,6 +33,8 @@ type StateSnapshot struct {
TemperatureMonitoringEnabled bool `json:"temperatureMonitoringEnabled"`
PVETagColors map[string]string `json:"pveTagColors,omitempty"`
// TemplateVMIDs holds Proxmox template VMID→node mappings per instance.
// The presence of an instance key also indicates that template inventory was
// successfully populated for that instance, even when the inner map is empty.
// Not serialised to JSON or the frontend API; used only for backup orphan detection.
TemplateVMIDs map[string]map[int]string `json:"-"`
}


@ -10205,7 +10205,7 @@ func (m *Monitor) pollStorageBackupsWithNodes(ctx context.Context, instanceName
if m.alertManager != nil {
snapshot := m.state.GetSnapshot()
guestsByKey, guestsByVMID := buildGuestLookups(snapshot, m.guestMetadataStore)
guestsByKey, guestsByVMID, templateInventoryReady := buildGuestLookups(snapshot, m.guestMetadataStore)
pveStorage := snapshot.Backups.PVE.StorageBackups
if len(pveStorage) == 0 && len(snapshot.PVEBackups.StorageBackups) > 0 {
pveStorage = snapshot.PVEBackups.StorageBackups
@ -10218,7 +10218,7 @@ func (m *Monitor) pollStorageBackupsWithNodes(ctx context.Context, instanceName
if len(pmgBackups) == 0 && len(snapshot.PMGBackups) > 0 {
pmgBackups = snapshot.PMGBackups
}
m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID)
m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID, templateInventoryReady)
}
// Clear permission warning if no permission errors occurred this cycle
@ -10361,9 +10361,10 @@ func preserveFailedStorageBackups(instanceName string, snapshot models.StateSnap
return current, storages
}
func buildGuestLookups(snapshot models.StateSnapshot, metadataStore *config.GuestMetadataStore) (map[string]alerts.GuestLookup, map[string][]alerts.GuestLookup) {
func buildGuestLookups(snapshot models.StateSnapshot, metadataStore *config.GuestMetadataStore) (map[string]alerts.GuestLookup, map[string][]alerts.GuestLookup, map[string]bool) {
byKey := make(map[string]alerts.GuestLookup)
byVMID := make(map[string][]alerts.GuestLookup)
templateInventoryReady := make(map[string]bool)
for _, vm := range snapshot.VMs {
info := alerts.GuestLookup{
@ -10418,6 +10419,7 @@ func buildGuestLookups(snapshot models.StateSnapshot, metadataStore *config.Gues
// existing templates as orphaned. Templates are excluded from the main VM/container
// lists (they are not monitored), so without this they would appear as unknown guests.
for instance, vmids := range snapshot.TemplateVMIDs {
templateInventoryReady[instance] = true
for vmid, node := range vmids {
info := alerts.GuestLookup{
ResourceID: makeGuestID(instance, node, vmid),
@ -10434,7 +10436,7 @@ func buildGuestLookups(snapshot models.StateSnapshot, metadataStore *config.Gues
}
}
return byKey, byVMID
return byKey, byVMID, templateInventoryReady
}
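
Readiness is therefore derived from key presence in snapshot.TemplateVMIDs, not from the size of the inner map. Illustrative (instance and VMID values are made up):

snapshot.TemplateVMIDs = map[string]map[int]string{
	"instA": {},             // polled, zero templates: ready
	"instB": {900: "node1"}, // polled, one template: ready
}
// "instC" is absent, so it is not ready and its PVE storage backups are
// skipped by orphan detection this cycle.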
// enrichWithPersistedMetadata adds entries from the metadata store for guests
@ -11282,7 +11284,7 @@ func (m *Monitor) pollPBSBackups(ctx context.Context, instanceName string, clien
if m.alertManager != nil {
snapshot := m.state.GetSnapshot()
guestsByKey, guestsByVMID := buildGuestLookups(snapshot, m.guestMetadataStore)
guestsByKey, guestsByVMID, templateInventoryReady := buildGuestLookups(snapshot, m.guestMetadataStore)
pveStorage := snapshot.Backups.PVE.StorageBackups
if len(pveStorage) == 0 && len(snapshot.PVEBackups.StorageBackups) > 0 {
pveStorage = snapshot.PVEBackups.StorageBackups
@ -11295,7 +11297,7 @@ func (m *Monitor) pollPBSBackups(ctx context.Context, instanceName string, clien
if len(pmgBackups) == 0 && len(snapshot.PMGBackups) > 0 {
pmgBackups = snapshot.PMGBackups
}
m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID)
m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID, templateInventoryReady)
}
// Immediately broadcast the updated state so frontend sees new backups
@ -11547,7 +11549,7 @@ func (m *Monitor) checkMockAlerts() {
Msg("Collecting resources for alert cleanup in mock mode")
m.alertManager.CleanupAlertsForNodes(existingNodes)
guestsByKey, guestsByVMID := buildGuestLookups(state, m.guestMetadataStore)
guestsByKey, guestsByVMID, templateInventoryReady := buildGuestLookups(state, m.guestMetadataStore)
pveStorage := state.Backups.PVE.StorageBackups
if len(pveStorage) == 0 && len(state.PVEBackups.StorageBackups) > 0 {
pveStorage = state.PVEBackups.StorageBackups
@ -11560,7 +11562,7 @@ func (m *Monitor) checkMockAlerts() {
if len(pmgBackups) == 0 && len(state.PMGBackups) > 0 {
pmgBackups = state.PMGBackups
}
m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID)
m.alertManager.CheckBackups(pveStorage, pbsBackups, pmgBackups, guestsByKey, guestsByVMID, templateInventoryReady)
// Limit how many guests we check per cycle to prevent blocking with large datasets
const maxGuestsPerCycle = 50