mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-19 16:27:37 +00:00
parent
61cd902ded
commit
fe597554c3
18 changed files with 612 additions and 11 deletions
|
|
@ -0,0 +1,49 @@
|
|||
# Known RC Issue Closure For GA Backup Orphan Readiness Record
|
||||
|
||||
- Date: `2026-05-01`
|
||||
- Gate: `known-rc-issue-closure-for-ga`
|
||||
- Result: `passed`
|
||||
|
||||
## Context
|
||||
|
||||
The v5 maintenance delta audit found that `#1352` had not been carried into the
|
||||
v6 backup-alert path. Pulse v5 learned not to run backup orphan detection before
|
||||
Proxmox template inventory was ready, because backup polling can race ahead of
|
||||
guest and template discovery during startup.
|
||||
|
||||
The v6 runtime no longer evaluates raw storage backup arrays directly; it
|
||||
evaluates recovery rollups. That made the v5 patch non-cherry-pickable, but the
|
||||
same failure mode still applied: an old PVE backup whose VMID was not yet in
|
||||
the current guest/template inventory could incorrectly raise an orphaned backup-age
|
||||
alert.
|
||||
|
||||
## Disposition
|
||||
|
||||
The v6 candidate now carries an inventory-aware backup alert boundary:
|
||||
|
||||
- `internal/alerts/alerts.go` keeps the existing `CheckBackups` API for direct
|
||||
callers and adds `CheckBackupsWithInventory` for monitoring-owned runtime
|
||||
evaluation.
|
||||
- PVE orphaned backup alerts now require per-instance, per-guest-type inventory
|
||||
readiness before unresolved PVE backup subjects can alert.
|
||||
- Known Proxmox template VM/container subjects are carried as backup-valid
|
||||
subjects and skipped from orphaned backup-age alert creation.
|
||||
- Monitoring records Proxmox template subjects from both the efficient
|
||||
`cluster/resources` poll and the traditional VM/container poll fallback, and
|
||||
passes that scoped inventory into backup alert evaluation.
|
||||
- PBS/PMG rollup behavior remains unchanged, so external backup-only subjects
|
||||
can still alert when no matching local guest exists.
|
||||
|
||||
## Proof
|
||||
|
||||
- `go test ./internal/alerts -run 'TestCheckBackups(SkipsPVEOrphanUntilInventoryReady|CreatesPVEOrphanWhenInventoryReady|SkipsKnownPVETemplateBackupSubject|SkipsOrphanedWhenDisabled|HandlesPbsOnlyGuests|VMIDCollision)' -count=1`
|
||||
- `go test ./internal/monitoring -run 'TestPVEBackupTemplateInventoryScopeFromClusterResources|TestBuildGuestLookupsFromReadState' -count=1`
|
||||
- `go test ./internal/alerts -count=1`
|
||||
- `go test ./internal/monitoring -count=1`
|
||||
|
||||
## Outcome
|
||||
|
||||
The v6 recovery-rollup alert path no longer knowingly regresses v5 `#1352`.
|
||||
PVE backup orphan alerts wait for the owning Proxmox inventory signal, and
|
||||
template backups do not become false orphaned backup alerts just because
|
||||
templates are excluded from normal workload resources.
|
||||
|
|
@ -4403,6 +4403,12 @@
|
|||
"kind": "file",
|
||||
"evidence_tier": "managed-runtime-exercise"
|
||||
},
|
||||
{
|
||||
"repo": "pulse",
|
||||
"path": "docs/release-control/v6/internal/records/known-rc-issue-closure-for-ga-backup-orphan-readiness-2026-05-01.md",
|
||||
"kind": "file",
|
||||
"evidence_tier": "managed-runtime-exercise"
|
||||
},
|
||||
{
|
||||
"repo": "pulse",
|
||||
"path": "docs/release-control/v6/internal/records/known-rc-issue-closure-for-ga-blocked-2026-05-01.md",
|
||||
|
|
|
|||
|
|
@ -156,6 +156,13 @@ and identity first-seen tracking. Generic threshold reevaluation must not keep
|
|||
or resurrect image-update alerts after their owning Docker alert configuration
|
||||
has disabled them.
|
||||
|
||||
Backup orphan evaluation is also inventory-scoped. The alerts runtime may
|
||||
evaluate recovery rollups for backup age, but unresolved Proxmox PVE backup
|
||||
subjects must not be treated as orphaned until monitoring has supplied the
|
||||
matching per-instance guest-type inventory readiness signal. Known Proxmox
|
||||
template subjects are valid backup subjects, not orphaned workload backups,
|
||||
even though templates remain excluded from normal runtime workload resources.
|
||||
|
||||
Alert history persistence is also part of that canonical boundary. The history
|
||||
manager may choose the owned runtime data directory, but it must normalize that
|
||||
directory once and then resolve only the fixed `alert-history.json` and
|
||||
|
|
|
|||
|
|
@ -865,6 +865,13 @@ record whose canonical status remains `available` whenever any reporting node
|
|||
still has the shared target active; node-local inactive copies may expand node
|
||||
affinity, but they must not downgrade the cluster record into an offline
|
||||
projection just because that node won the capacity sample.
|
||||
That same monitoring-owned Proxmox backup boundary also owns the inventory
|
||||
readiness signal used by backup orphan alerts. `internal/monitoring/` must
|
||||
record when PVE VM and container inventory has successfully observed a given
|
||||
instance and guest type, including template VMIDs that are intentionally
|
||||
excluded from normal workload resources. Backup alert evaluation may then
|
||||
receive that scoped signal from monitoring, but alert code must not infer PVE
|
||||
orphan readiness from recovery rollups alone.
|
||||
That same monitoring-owned host-agent ingest boundary now also owns
|
||||
vendor-managed NAS RAID normalization. `internal/monitoring/monitor_agents.go`
|
||||
must filter vendor-managed system arrays through the shared
|
||||
|
|
|
|||
|
|
@ -6216,12 +6216,74 @@ type backupRecord struct {
|
|||
fallbackName string
|
||||
instance string
|
||||
node string
|
||||
subjectType string
|
||||
source string
|
||||
rollupID string
|
||||
providers []recovery.Provider
|
||||
lastTime time.Time
|
||||
}
|
||||
|
||||
// BackupInventoryScope is the inventory snapshot that monitoring hands to
// backup alert evaluation so that orphan detection does not race ahead of
// Proxmox guest/template discovery. Callers that have no inventory (unit tests,
// non-monitoring callers of CheckBackups) pass a nil scope instead.
type BackupInventoryScope struct {
	// PVEOrphanInventoryReady is keyed by instance name, then by normalized
	// guest type ("qemu" or "lxc"). A true value means unresolved PVE backup
	// subjects of that type on that instance may be treated as orphaned.
	PVEOrphanInventoryReady map[string]map[string]bool
	// PVETemplateSubjects is the set of known template subjects, keyed by
	// BuildBackupPVETemplateSubjectKey.
	PVETemplateSubjects map[string]struct{}
}
|
||||
|
||||
func BuildBackupPVETemplateSubjectKey(instance, guestType, node string, vmid int) string {
|
||||
instance = strings.TrimSpace(instance)
|
||||
guestType = normalizeBackupGuestType(guestType)
|
||||
node = strings.TrimSpace(node)
|
||||
if instance == "" || guestType == "" || node == "" || vmid <= 0 {
|
||||
return ""
|
||||
}
|
||||
return strings.Join([]string{instance, guestType, node, strconv.Itoa(vmid)}, "\x00")
|
||||
}
|
||||
|
||||
// normalizeBackupGuestType maps the guest-type spellings seen on backup
// subjects onto the two Proxmox guest kinds: "qemu" for VM aliases and "lxc"
// for container aliases. Matching ignores case and surrounding whitespace.
// Any other value is returned trimmed and lower-cased (possibly empty).
func normalizeBackupGuestType(guestType string) string {
	normalized := strings.ToLower(strings.TrimSpace(guestType))
	switch normalized {
	case "qemu", "vm", "proxmox-vm":
		return "qemu"
	case "lxc", "ct", "container", "system-container", "proxmox-lxc":
		return "lxc"
	}
	return normalized
}
|
||||
|
||||
func backupOrphanInventoryReady(scope *BackupInventoryScope, record backupRecord) bool {
|
||||
if scope == nil || scope.PVEOrphanInventoryReady == nil {
|
||||
return true
|
||||
}
|
||||
if record.source != "PVE" {
|
||||
return true
|
||||
}
|
||||
instance := strings.TrimSpace(record.instance)
|
||||
guestType := normalizeBackupGuestType(record.subjectType)
|
||||
if instance == "" || guestType == "" {
|
||||
return false
|
||||
}
|
||||
return scope.PVEOrphanInventoryReady[instance][guestType]
|
||||
}
|
||||
|
||||
func backupMatchesKnownPVETemplate(scope *BackupInventoryScope, record backupRecord) bool {
|
||||
if scope == nil || len(scope.PVETemplateSubjects) == 0 || record.source != "PVE" {
|
||||
return false
|
||||
}
|
||||
vmid, err := strconv.Atoi(strings.TrimSpace(record.vmID))
|
||||
if err != nil || vmid <= 0 {
|
||||
return false
|
||||
}
|
||||
key := BuildBackupPVETemplateSubjectKey(record.instance, record.subjectType, record.node, vmid)
|
||||
if key == "" {
|
||||
return false
|
||||
}
|
||||
_, exists := scope.PVETemplateSubjects[key]
|
||||
return exists
|
||||
}
|
||||
|
||||
func canonicalGuestResourceType(guestType string) unifiedresources.ResourceType {
|
||||
switch strings.ToLower(strings.TrimSpace(guestType)) {
|
||||
case "lxc":
|
||||
|
|
@ -6235,6 +6297,12 @@ func canonicalBackupSubjectResourceType(record backupRecord) unifiedresources.Re
|
|||
if record.lookup.Type != "" {
|
||||
return canonicalGuestResourceType(record.lookup.Type)
|
||||
}
|
||||
switch normalizeBackupGuestType(record.subjectType) {
|
||||
case "lxc":
|
||||
return unifiedresources.ResourceTypeSystemContainer
|
||||
case "qemu":
|
||||
return unifiedresources.ResourceTypeVM
|
||||
}
|
||||
if strings.TrimSpace(record.vmID) != "" {
|
||||
return unifiedresources.ResourceTypeVM
|
||||
}
|
||||
|
|
@ -6534,6 +6602,17 @@ func (m *Manager) CheckBackups(
|
|||
rollups []recovery.ProtectionRollup,
|
||||
guestsByKey map[string]GuestLookup,
|
||||
guestsByVMID map[string][]GuestLookup,
|
||||
) {
|
||||
m.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, nil)
|
||||
}
|
||||
|
||||
// CheckBackupsWithInventory evaluates backup rollups with optional monitoring
|
||||
// inventory readiness for orphan detection.
|
||||
func (m *Manager) CheckBackupsWithInventory(
|
||||
rollups []recovery.ProtectionRollup,
|
||||
guestsByKey map[string]GuestLookup,
|
||||
guestsByVMID map[string][]GuestLookup,
|
||||
inventoryScope *BackupInventoryScope,
|
||||
) {
|
||||
m.mu.RLock()
|
||||
enabled := m.config.Enabled
|
||||
|
|
@ -6597,9 +6676,13 @@ func (m *Manager) CheckBackups(
|
|||
instance string
|
||||
node string
|
||||
vmID string
|
||||
subjectType string
|
||||
)
|
||||
|
||||
ref := rollup.SubjectRef
|
||||
if ref != nil {
|
||||
subjectType = normalizeBackupGuestType(ref.Type)
|
||||
}
|
||||
|
||||
// Primary: subjectRef.ID is the canonical proxmox guest source ID (instance:node:vmid) when linked.
|
||||
if ref != nil && strings.TrimSpace(ref.ID) != "" {
|
||||
|
|
@ -6667,6 +6750,7 @@ func (m *Manager) CheckBackups(
|
|||
fallbackName: displayName,
|
||||
instance: instance,
|
||||
node: node,
|
||||
subjectType: subjectType,
|
||||
source: source,
|
||||
rollupID: strings.TrimSpace(rollup.RollupID),
|
||||
providers: providers,
|
||||
|
|
@ -6719,6 +6803,12 @@ func (m *Manager) CheckBackups(
|
|||
continue
|
||||
}
|
||||
if record.vmID != "" && record.lookup.ResourceID == "" {
|
||||
if backupMatchesKnownPVETemplate(inventoryScope, *record) {
|
||||
continue
|
||||
}
|
||||
if !backupOrphanInventoryReady(inventoryScope, *record) {
|
||||
continue
|
||||
}
|
||||
if currentBackupCfg.AlertOrphaned != nil && !*currentBackupCfg.AlertOrphaned {
|
||||
continue
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1740,6 +1740,147 @@ func TestCheckBackupsSkipsOrphanedWhenDisabled(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestCheckBackupsSkipsPVEOrphanUntilInventoryReady(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
m.ClearActiveAlerts()
|
||||
|
||||
alertOrphaned := true
|
||||
m.mu.Lock()
|
||||
m.config.Enabled = true
|
||||
m.config.BackupDefaults = BackupAlertConfig{
|
||||
Enabled: true,
|
||||
WarningDays: 3,
|
||||
CriticalDays: 5,
|
||||
AlertOrphaned: &alertOrphaned,
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
rollups := []recovery.ProtectionRollup{
|
||||
{
|
||||
RollupID: "res:vm:proxmox:inst:node:700",
|
||||
SubjectRef: &recovery.ExternalRef{
|
||||
Type: "proxmox-vm",
|
||||
Namespace: "inst",
|
||||
Name: "700",
|
||||
ID: "inst:node:700",
|
||||
Class: "node",
|
||||
},
|
||||
LastSuccessAt: ptrTime(now.Add(-6 * 24 * time.Hour)),
|
||||
LastOutcome: recovery.OutcomeSuccess,
|
||||
Providers: []recovery.Provider{recovery.ProviderProxmoxPVE},
|
||||
},
|
||||
}
|
||||
|
||||
m.CheckBackupsWithInventory(rollups, map[string]GuestLookup{}, map[string][]GuestLookup{}, &BackupInventoryScope{
|
||||
PVEOrphanInventoryReady: map[string]map[string]bool{},
|
||||
})
|
||||
|
||||
m.mu.RLock()
|
||||
defer m.mu.RUnlock()
|
||||
for storageKey, alert := range m.activeAlerts {
|
||||
if strings.HasPrefix(effectiveAlertID(alert, storageKey), "backup-age-") {
|
||||
t.Fatalf("expected no PVE orphan alert before inventory readiness, found %s", effectiveAlertID(alert, storageKey))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckBackupsCreatesPVEOrphanWhenInventoryReady(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
m.ClearActiveAlerts()
|
||||
|
||||
alertOrphaned := true
|
||||
m.mu.Lock()
|
||||
m.config.Enabled = true
|
||||
m.config.BackupDefaults = BackupAlertConfig{
|
||||
Enabled: true,
|
||||
WarningDays: 3,
|
||||
CriticalDays: 5,
|
||||
AlertOrphaned: &alertOrphaned,
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
rollups := []recovery.ProtectionRollup{
|
||||
{
|
||||
RollupID: "res:vm:proxmox:inst:node:701",
|
||||
SubjectRef: &recovery.ExternalRef{
|
||||
Type: "proxmox-vm",
|
||||
Namespace: "inst",
|
||||
Name: "701",
|
||||
ID: "inst:node:701",
|
||||
Class: "node",
|
||||
},
|
||||
LastSuccessAt: ptrTime(now.Add(-6 * 24 * time.Hour)),
|
||||
LastOutcome: recovery.OutcomeSuccess,
|
||||
Providers: []recovery.Provider{recovery.ProviderProxmoxPVE},
|
||||
},
|
||||
}
|
||||
|
||||
m.CheckBackupsWithInventory(rollups, map[string]GuestLookup{}, map[string][]GuestLookup{}, &BackupInventoryScope{
|
||||
PVEOrphanInventoryReady: map[string]map[string]bool{
|
||||
"inst": {"qemu": true},
|
||||
},
|
||||
})
|
||||
|
||||
m.mu.RLock()
|
||||
defer m.mu.RUnlock()
|
||||
if !testHasActiveAlert(t, m, buildCanonicalStateID("inst:node:701", "inst:node:701-backup-age")) {
|
||||
t.Fatalf("expected PVE orphan backup alert after qemu inventory is ready")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckBackupsSkipsKnownPVETemplateBackupSubject(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
m.ClearActiveAlerts()
|
||||
|
||||
alertOrphaned := true
|
||||
m.mu.Lock()
|
||||
m.config.Enabled = true
|
||||
m.config.BackupDefaults = BackupAlertConfig{
|
||||
Enabled: true,
|
||||
WarningDays: 3,
|
||||
CriticalDays: 5,
|
||||
AlertOrphaned: &alertOrphaned,
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
rollups := []recovery.ProtectionRollup{
|
||||
{
|
||||
RollupID: "res:vm:proxmox:inst:node:702",
|
||||
SubjectRef: &recovery.ExternalRef{
|
||||
Type: "proxmox-vm",
|
||||
Namespace: "inst",
|
||||
Name: "template-702",
|
||||
ID: "inst:node:702",
|
||||
Class: "node",
|
||||
},
|
||||
LastSuccessAt: ptrTime(now.Add(-6 * 24 * time.Hour)),
|
||||
LastOutcome: recovery.OutcomeSuccess,
|
||||
Providers: []recovery.Provider{recovery.ProviderProxmoxPVE},
|
||||
},
|
||||
}
|
||||
|
||||
templateKey := BuildBackupPVETemplateSubjectKey("inst", "qemu", "node", 702)
|
||||
m.CheckBackupsWithInventory(rollups, map[string]GuestLookup{}, map[string][]GuestLookup{}, &BackupInventoryScope{
|
||||
PVEOrphanInventoryReady: map[string]map[string]bool{
|
||||
"inst": {"qemu": true},
|
||||
},
|
||||
PVETemplateSubjects: map[string]struct{}{
|
||||
templateKey: {},
|
||||
},
|
||||
})
|
||||
|
||||
m.mu.RLock()
|
||||
defer m.mu.RUnlock()
|
||||
for storageKey, alert := range m.activeAlerts {
|
||||
if strings.HasPrefix(effectiveAlertID(alert, storageKey), "backup-age-") {
|
||||
t.Fatalf("expected known template backup subject to be skipped, found %s", effectiveAlertID(alert, storageKey))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckBackupsIgnoresVMIDs(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
m.ClearActiveAlerts()
|
||||
|
|
|
|||
|
|
@ -450,6 +450,45 @@ func TestProxmoxGuestDiskInventoryPrefersCanonicalLinkedHostAgentSource(t *testi
|
|||
}
|
||||
}
|
||||
|
||||
// TestBackupOrphanDetectionUsesCanonicalInventoryReadinessScope is a source
// guard: it reads each listed file from the current working directory and
// fails if any of the required code snippets is missing from it.
func TestBackupOrphanDetectionUsesCanonicalInventoryReadinessScope(t *testing.T) {
	wantByFile := map[string][]string{
		"monitor.go": {
			"pveBackupInventoryReady map[string]map[string]bool",
			"pveBackupTemplateSubjects map[string]map[string]struct{}",
		},
		"monitor_backups.go": {
			"func (m *Monitor) updatePVEBackupTemplateSubjectsForType(instanceName, guestType string, subjects map[string]struct{}) {",
			"func (m *Monitor) updatePVEBackupTemplateSubjectsFromClusterResources(instanceName string, resources []proxmox.ClusterResource) {",
			"func (m *Monitor) backupInventoryScopeForAlerts() *alerts.BackupInventoryScope {",
			"m.alertManager.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, m.backupInventoryScopeForAlerts())",
		},
		"monitor_pve_guest_poll.go": {
			"m.updatePVEBackupTemplateSubjectsFromClusterResources(instanceName, resources)",
		},
		"monitor_polling_vm.go": {
			`pveBackupTemplateSubjectKey(instanceName, "qemu", n.Node, vm.VMID)`,
			`m.updatePVEBackupTemplateSubjectsForType(instanceName, "qemu", qemuTemplateSubjects)`,
		},
		"monitor_polling_containers.go": {
			`pveBackupTemplateSubjectKey(instanceName, "lxc", n.Node, int(container.VMID))`,
			`m.updatePVEBackupTemplateSubjectsForType(instanceName, "lxc", lxcTemplateSubjects)`,
		},
	}

	for path, wanted := range wantByFile {
		raw, err := os.ReadFile(path)
		if err != nil {
			t.Fatalf("failed to read %s: %v", path, err)
		}
		contents := string(raw)
		for _, want := range wanted {
			if strings.Contains(contents, want) {
				continue
			}
			t.Fatalf("%s must contain %q", path, want)
		}
	}
}
|
||||
|
||||
func TestStoragePollingUsesCanonicalPoolMetadataForZFSAttachment(t *testing.T) {
|
||||
data, err := os.ReadFile("monitor_polling_storage.go")
|
||||
if err != nil {
|
||||
|
|
|
|||
|
|
@ -403,6 +403,62 @@ func TestPollVMsWithNodesPreservesProxmoxPool(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestPollVMsWithNodesRecordsQEMUTemplateBackupInventoryReadiness(t *testing.T) {
|
||||
t.Setenv("PULSE_DATA_DIR", t.TempDir())
|
||||
|
||||
mon := newTestPVEMonitor("test")
|
||||
defer mon.alertManager.Stop()
|
||||
defer mon.notificationMgr.Stop()
|
||||
|
||||
client := &vmMemoryTrustStubClient{
|
||||
stubPVEClient: &stubPVEClient{},
|
||||
vms: []proxmox.VM{
|
||||
{
|
||||
VMID: 900,
|
||||
Name: "tmpl-900",
|
||||
Node: "node1",
|
||||
Status: "stopped",
|
||||
Template: 1,
|
||||
MaxMem: 8 * 1024,
|
||||
CPUs: 2,
|
||||
},
|
||||
{
|
||||
VMID: 101,
|
||||
Name: "vm-101",
|
||||
Node: "node1",
|
||||
Status: "stopped",
|
||||
MaxMem: 8 * 1024,
|
||||
Mem: 2 * 1024,
|
||||
CPUs: 2,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
nodes := []proxmox.Node{{Node: "node1", Status: "online"}}
|
||||
nodeEffectiveStatus := map[string]string{"node1": "online"}
|
||||
mon.pollVMsWithNodes(context.Background(), "test", "", false, client, nodes, nodeEffectiveStatus)
|
||||
|
||||
vms := mon.state.GetSnapshot().VMs
|
||||
if len(vms) != 1 {
|
||||
t.Fatalf("expected only non-template VM in runtime state, got %d", len(vms))
|
||||
}
|
||||
if got := vms[0].VMID; got != 101 {
|
||||
t.Fatalf("runtime VMID = %d, want 101", got)
|
||||
}
|
||||
|
||||
scope := mon.backupInventoryScopeForAlerts()
|
||||
if scope == nil {
|
||||
t.Fatal("expected backup inventory scope")
|
||||
}
|
||||
if !scope.PVEOrphanInventoryReady["test"]["qemu"] {
|
||||
t.Fatalf("expected qemu backup orphan inventory readiness for test instance")
|
||||
}
|
||||
templateSubject := pveBackupTemplateSubjectKey("test", "qemu", "node1", 900)
|
||||
if _, ok := scope.PVETemplateSubjects[templateSubject]; !ok {
|
||||
t.Fatalf("expected template subject %q in backup inventory scope", templateSubject)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPollVMsWithNodesMemoryTrustCharacterization(t *testing.T) {
|
||||
t.Setenv("PULSE_DATA_DIR", t.TempDir())
|
||||
|
||||
|
|
|
|||
|
|
@ -941,6 +941,8 @@ type Monitor struct {
|
|||
lastPhysicalDiskPoll map[string]time.Time // Track last physical disk poll time per instance
|
||||
lastPVEBackupPoll map[string]time.Time // Track last PVE backup poll per instance
|
||||
lastPBSBackupPoll map[string]time.Time // Track last PBS backup poll per instance
|
||||
pveBackupInventoryReady map[string]map[string]bool // Track PVE guest inventory readiness for backup orphan detection
|
||||
pveBackupTemplateSubjects map[string]map[string]struct{} // Track template VMIDs excluded from runtime workloads but valid for backups
|
||||
backupPermissionWarnings map[string]string // Track backup permission issues per instance (instance -> warning message)
|
||||
persistence *config.ConfigPersistence // Add persistence for saving updated configs
|
||||
pbsBackupPollers map[string]bool // Track PBS backup polling goroutines per instance
|
||||
|
|
@ -1486,6 +1488,8 @@ func New(cfg *config.Config) (*Monitor, error) {
|
|||
lastPhysicalDiskPoll: make(map[string]time.Time),
|
||||
lastPVEBackupPoll: make(map[string]time.Time),
|
||||
lastPBSBackupPoll: make(map[string]time.Time),
|
||||
pveBackupInventoryReady: make(map[string]map[string]bool),
|
||||
pveBackupTemplateSubjects: make(map[string]map[string]struct{}),
|
||||
backupPermissionWarnings: make(map[string]string),
|
||||
persistence: config.NewConfigPersistence(cfg.DataPath),
|
||||
pbsBackupPollers: make(map[string]bool),
|
||||
|
|
|
|||
|
|
@ -2600,6 +2600,8 @@ func (m *Monitor) cleanupTrackingMaps(now time.Time) {
|
|||
for instanceID, ts := range m.lastPVEBackupPoll {
|
||||
if ts.Before(cutoff) {
|
||||
delete(m.lastPVEBackupPoll, instanceID)
|
||||
delete(m.pveBackupInventoryReady, instanceID)
|
||||
delete(m.pveBackupTemplateSubjects, instanceID)
|
||||
cleaned++
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -303,7 +303,7 @@ func (m *Monitor) checkMockAlerts() {
|
|||
if err != nil {
|
||||
log.Warn().Err(err).Msg("Failed to list recovery rollups for backup alerts")
|
||||
} else {
|
||||
m.alertManager.CheckBackups(rollups, guestsByKey, guestsByVMID)
|
||||
m.alertManager.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, m.backupInventoryScopeForAlerts())
|
||||
}
|
||||
|
||||
// Limit how many guests we check per cycle to prevent blocking with large datasets
|
||||
|
|
|
|||
|
|
@ -20,6 +20,100 @@ import (
|
|||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func pveBackupTemplateSubjectKey(instance, guestType, node string, vmid int) string {
|
||||
return alerts.BuildBackupPVETemplateSubjectKey(instance, guestType, node, vmid)
|
||||
}
|
||||
|
||||
func (m *Monitor) updatePVEBackupTemplateSubjectsForType(instanceName, guestType string, subjects map[string]struct{}) {
|
||||
if m == nil {
|
||||
return
|
||||
}
|
||||
instanceName = strings.TrimSpace(instanceName)
|
||||
guestType = strings.TrimSpace(guestType)
|
||||
if instanceName == "" || guestType == "" {
|
||||
return
|
||||
}
|
||||
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
if m.pveBackupInventoryReady == nil {
|
||||
m.pveBackupInventoryReady = make(map[string]map[string]bool)
|
||||
}
|
||||
if m.pveBackupInventoryReady[instanceName] == nil {
|
||||
m.pveBackupInventoryReady[instanceName] = make(map[string]bool)
|
||||
}
|
||||
m.pveBackupInventoryReady[instanceName][guestType] = true
|
||||
|
||||
if m.pveBackupTemplateSubjects == nil {
|
||||
m.pveBackupTemplateSubjects = make(map[string]map[string]struct{})
|
||||
}
|
||||
existing := m.pveBackupTemplateSubjects[instanceName]
|
||||
if existing == nil {
|
||||
existing = make(map[string]struct{})
|
||||
}
|
||||
prefix := instanceName + "\x00" + guestType + "\x00"
|
||||
for key := range existing {
|
||||
if strings.HasPrefix(key, prefix) {
|
||||
delete(existing, key)
|
||||
}
|
||||
}
|
||||
for key := range subjects {
|
||||
if key != "" {
|
||||
existing[key] = struct{}{}
|
||||
}
|
||||
}
|
||||
m.pveBackupTemplateSubjects[instanceName] = existing
|
||||
}
|
||||
|
||||
func (m *Monitor) updatePVEBackupTemplateSubjectsFromClusterResources(instanceName string, resources []proxmox.ClusterResource) {
|
||||
qemuTemplates := make(map[string]struct{})
|
||||
lxcTemplates := make(map[string]struct{})
|
||||
for _, res := range resources {
|
||||
if res.Template != 1 {
|
||||
continue
|
||||
}
|
||||
switch strings.TrimSpace(res.Type) {
|
||||
case "qemu":
|
||||
if key := pveBackupTemplateSubjectKey(instanceName, "qemu", res.Node, res.VMID); key != "" {
|
||||
qemuTemplates[key] = struct{}{}
|
||||
}
|
||||
case "lxc":
|
||||
if key := pveBackupTemplateSubjectKey(instanceName, "lxc", res.Node, res.VMID); key != "" {
|
||||
lxcTemplates[key] = struct{}{}
|
||||
}
|
||||
}
|
||||
}
|
||||
m.updatePVEBackupTemplateSubjectsForType(instanceName, "qemu", qemuTemplates)
|
||||
m.updatePVEBackupTemplateSubjectsForType(instanceName, "lxc", lxcTemplates)
|
||||
}
|
||||
|
||||
func (m *Monitor) backupInventoryScopeForAlerts() *alerts.BackupInventoryScope {
|
||||
if m == nil {
|
||||
return nil
|
||||
}
|
||||
scope := &alerts.BackupInventoryScope{
|
||||
PVEOrphanInventoryReady: make(map[string]map[string]bool),
|
||||
PVETemplateSubjects: make(map[string]struct{}),
|
||||
}
|
||||
m.mu.RLock()
|
||||
defer m.mu.RUnlock()
|
||||
for instance, readyByType := range m.pveBackupInventoryReady {
|
||||
if len(readyByType) == 0 {
|
||||
continue
|
||||
}
|
||||
scope.PVEOrphanInventoryReady[instance] = make(map[string]bool, len(readyByType))
|
||||
for guestType, ready := range readyByType {
|
||||
scope.PVEOrphanInventoryReady[instance][guestType] = ready
|
||||
}
|
||||
}
|
||||
for _, subjects := range m.pveBackupTemplateSubjects {
|
||||
for key := range subjects {
|
||||
scope.PVETemplateSubjects[key] = struct{}{}
|
||||
}
|
||||
}
|
||||
return scope
|
||||
}
|
||||
|
||||
func (m *Monitor) pollStorageBackupsWithNodes(ctx context.Context, instanceName string, client PVEClientInterface, nodes []proxmox.Node, nodeEffectiveStatus map[string]string) {
|
||||
|
||||
var allBackups []models.StorageBackup
|
||||
|
|
@ -267,7 +361,7 @@ func (m *Monitor) pollStorageBackupsWithNodes(ctx context.Context, instanceName
|
|||
if err != nil {
|
||||
log.Warn().Err(err).Msg("Failed to list recovery rollups for backup alerts")
|
||||
} else {
|
||||
m.alertManager.CheckBackups(rollups, guestsByKey, guestsByVMID)
|
||||
m.alertManager.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, m.backupInventoryScopeForAlerts())
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1301,7 +1395,7 @@ func (m *Monitor) pollPBSBackups(ctx context.Context, instanceName string, clien
|
|||
if err != nil {
|
||||
log.Warn().Err(err).Msg("Failed to list recovery rollups for backup alerts")
|
||||
} else {
|
||||
m.alertManager.CheckBackups(rollups, guestsByKey, guestsByVMID)
|
||||
m.alertManager.CheckBackupsWithInventory(rollups, guestsByKey, guestsByVMID, m.backupInventoryScopeForAlerts())
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -87,6 +87,8 @@ func (m *Monitor) retirePVEInstanceRuntime(instanceName string) {
|
|||
delete(m.lastClusterCheck, instanceName)
|
||||
delete(m.lastPhysicalDiskPoll, instanceName)
|
||||
delete(m.lastPVEBackupPoll, instanceName)
|
||||
delete(m.pveBackupInventoryReady, instanceName)
|
||||
delete(m.pveBackupTemplateSubjects, instanceName)
|
||||
delete(m.backupPermissionWarnings, instanceName)
|
||||
delete(m.authFailures, instanceName)
|
||||
delete(m.authFailures, string(InstanceTypePVE)+"-"+instanceName)
|
||||
|
|
|
|||
|
|
@ -241,6 +241,50 @@ func TestEvaluateHostAgentsClearsAlertWhenHostReturns(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestCleanupTrackingMapsClearsStalePVEBackupInventoryScope(t *testing.T) {
|
||||
now := time.Now()
|
||||
stale := now.Add(-25 * time.Hour)
|
||||
fresh := now.Add(-time.Hour)
|
||||
staleSubject := pveBackupTemplateSubjectKey("pve-stale", "qemu", "node1", 900)
|
||||
freshSubject := pveBackupTemplateSubjectKey("pve-fresh", "qemu", "node1", 901)
|
||||
|
||||
monitor := &Monitor{
|
||||
lastPVEBackupPoll: map[string]time.Time{
|
||||
"pve-stale": stale,
|
||||
"pve-fresh": fresh,
|
||||
},
|
||||
pveBackupInventoryReady: map[string]map[string]bool{
|
||||
"pve-stale": {"qemu": true},
|
||||
"pve-fresh": {"qemu": true},
|
||||
},
|
||||
pveBackupTemplateSubjects: map[string]map[string]struct{}{
|
||||
"pve-stale": {staleSubject: {}},
|
||||
"pve-fresh": {freshSubject: {}},
|
||||
},
|
||||
}
|
||||
|
||||
monitor.cleanupTrackingMaps(now)
|
||||
|
||||
if _, ok := monitor.lastPVEBackupPoll["pve-stale"]; ok {
|
||||
t.Fatalf("expected stale PVE backup poll marker to be removed")
|
||||
}
|
||||
if _, ok := monitor.pveBackupInventoryReady["pve-stale"]; ok {
|
||||
t.Fatalf("expected stale PVE backup inventory readiness to be removed")
|
||||
}
|
||||
if _, ok := monitor.pveBackupTemplateSubjects["pve-stale"]; ok {
|
||||
t.Fatalf("expected stale PVE backup template subjects to be removed")
|
||||
}
|
||||
if _, ok := monitor.lastPVEBackupPoll["pve-fresh"]; !ok {
|
||||
t.Fatalf("expected fresh PVE backup poll marker to remain")
|
||||
}
|
||||
if !monitor.pveBackupInventoryReady["pve-fresh"]["qemu"] {
|
||||
t.Fatalf("expected fresh PVE backup inventory readiness to remain")
|
||||
}
|
||||
if _, ok := monitor.pveBackupTemplateSubjects["pve-fresh"][freshSubject]; !ok {
|
||||
t.Fatalf("expected fresh PVE backup template subject to remain")
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyHostReportAllowsTokenReuseAcrossHosts(t *testing.T) {
|
||||
t.Helper()
|
||||
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import (
|
|||
"github.com/rcourtman/pulse-go-rewrite/internal/config"
|
||||
"github.com/rcourtman/pulse-go-rewrite/internal/models"
|
||||
"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
|
||||
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
|
||||
)
|
||||
|
||||
func TestPersistGuestIdentity_Concurrent(t *testing.T) {
|
||||
|
|
@ -173,3 +174,37 @@ func TestBuildGuestLookupsFromReadState_UsesPersistedMetadataWhenCanonicalStateE
|
|||
t.Fatalf("expected persisted metadata fallback, got %+v", byVMID["100"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestPVEBackupTemplateInventoryScopeFromClusterResources(t *testing.T) {
|
||||
m := &Monitor{}
|
||||
|
||||
m.updatePVEBackupTemplateSubjectsFromClusterResources("pve-a", []proxmox.ClusterResource{
|
||||
{Type: "qemu", Node: "node-a", VMID: 700, Template: 1},
|
||||
{Type: "lxc", Node: "node-b", VMID: 701, Template: 1},
|
||||
{Type: "qemu", Node: "node-c", VMID: 702, Template: 0},
|
||||
})
|
||||
|
||||
scope := m.backupInventoryScopeForAlerts()
|
||||
if scope == nil {
|
||||
t.Fatalf("expected backup inventory scope")
|
||||
}
|
||||
if !scope.PVEOrphanInventoryReady["pve-a"]["qemu"] {
|
||||
t.Fatalf("expected qemu backup inventory to be marked ready")
|
||||
}
|
||||
if !scope.PVEOrphanInventoryReady["pve-a"]["lxc"] {
|
||||
t.Fatalf("expected lxc backup inventory to be marked ready")
|
||||
}
|
||||
|
||||
qemuTemplate := alerts.BuildBackupPVETemplateSubjectKey("pve-a", "qemu", "node-a", 700)
|
||||
if _, exists := scope.PVETemplateSubjects[qemuTemplate]; !exists {
|
||||
t.Fatalf("expected qemu template subject to be captured")
|
||||
}
|
||||
lxcTemplate := alerts.BuildBackupPVETemplateSubjectKey("pve-a", "lxc", "node-b", 701)
|
||||
if _, exists := scope.PVETemplateSubjects[lxcTemplate]; !exists {
|
||||
t.Fatalf("expected lxc template subject to be captured")
|
||||
}
|
||||
nonTemplate := alerts.BuildBackupPVETemplateSubjectKey("pve-a", "qemu", "node-c", 702)
|
||||
if _, exists := scope.PVETemplateSubjects[nonTemplate]; exists {
|
||||
t.Fatalf("did not expect non-template subject to be captured")
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,9 +17,10 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri
|
|||
|
||||
// Channel to collect container results from each node
|
||||
type nodeResult struct {
|
||||
node string
|
||||
containers []models.Container
|
||||
err error
|
||||
node string
|
||||
containers []models.Container
|
||||
templateSubjects map[string]struct{}
|
||||
err error
|
||||
}
|
||||
|
||||
resultChan := make(chan nodeResult, len(nodes))
|
||||
|
|
@ -81,11 +82,15 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri
|
|||
rootUsageOverrides := m.collectContainerRootUsage(ctx, client, n.Node, vmIDs)
|
||||
|
||||
var nodeContainers []models.Container
|
||||
nodeTemplateSubjects := make(map[string]struct{})
|
||||
|
||||
// Process each container
|
||||
for _, container := range containers {
|
||||
// Skip templates
|
||||
if container.Template == 1 {
|
||||
if key := pveBackupTemplateSubjectKey(instanceName, "lxc", n.Node, int(container.VMID)); key != "" {
|
||||
nodeTemplateSubjects[key] = struct{}{}
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
|
|
@ -269,7 +274,7 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri
|
|||
Dur("duration", nodeDuration).
|
||||
Msg("Node container polling completed")
|
||||
|
||||
resultChan <- nodeResult{node: n.Node, containers: nodeContainers}
|
||||
resultChan <- nodeResult{node: n.Node, containers: nodeContainers, templateSubjects: nodeTemplateSubjects}
|
||||
}(node)
|
||||
}
|
||||
|
||||
|
|
@ -281,6 +286,7 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri
|
|||
|
||||
// Collect results from all nodes
|
||||
var allContainers []models.Container
|
||||
lxcTemplateSubjects := make(map[string]struct{})
|
||||
successfulNodes := 0
|
||||
failedNodes := 0
|
||||
|
||||
|
|
@ -290,8 +296,14 @@ func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName stri
|
|||
} else {
|
||||
successfulNodes++
|
||||
allContainers = append(allContainers, result.containers...)
|
||||
for key := range result.templateSubjects {
|
||||
lxcTemplateSubjects[key] = struct{}{}
|
||||
}
|
||||
}
|
||||
}
|
||||
if failedNodes == 0 && successfulNodes > 0 {
|
||||
m.updatePVEBackupTemplateSubjectsForType(instanceName, "lxc", lxcTemplateSubjects)
|
||||
}
|
||||
|
||||
// If we got ZERO containers but had containers before (likely cluster health issue),
|
||||
// preserve previous containers instead of clearing them
|
||||
|
|
|
|||
|
|
@ -19,9 +19,10 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu
|
|||
|
||||
// Channel to collect VM results from each node
|
||||
type nodeResult struct {
|
||||
node string
|
||||
vms []models.VM
|
||||
err error
|
||||
node string
|
||||
vms []models.VM
|
||||
templateSubjects map[string]struct{}
|
||||
err error
|
||||
}
|
||||
|
||||
resultChan := make(chan nodeResult, len(nodes))
|
||||
|
|
@ -74,11 +75,15 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu
|
|||
}
|
||||
|
||||
var nodeVMs []models.VM
|
||||
nodeTemplateSubjects := make(map[string]struct{})
|
||||
|
||||
// Process each VM
|
||||
for _, vm := range vms {
|
||||
// Skip templates
|
||||
if vm.Template == 1 {
|
||||
if key := pveBackupTemplateSubjectKey(instanceName, "qemu", n.Node, vm.VMID); key != "" {
|
||||
nodeTemplateSubjects[key] = struct{}{}
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
|
|
@ -601,7 +606,7 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu
|
|||
Dur("duration", nodeDuration).
|
||||
Msg("Node VM polling completed")
|
||||
|
||||
resultChan <- nodeResult{node: n.Node, vms: nodeVMs}
|
||||
resultChan <- nodeResult{node: n.Node, vms: nodeVMs, templateSubjects: nodeTemplateSubjects}
|
||||
}(node)
|
||||
}
|
||||
|
||||
|
|
@ -613,6 +618,7 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu
|
|||
|
||||
// Collect results from all nodes
|
||||
var allVMs []models.VM
|
||||
qemuTemplateSubjects := make(map[string]struct{})
|
||||
successfulNodes := 0
|
||||
failedNodes := 0
|
||||
|
||||
|
|
@ -622,8 +628,14 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu
|
|||
} else {
|
||||
successfulNodes++
|
||||
allVMs = append(allVMs, result.vms...)
|
||||
for key := range result.templateSubjects {
|
||||
qemuTemplateSubjects[key] = struct{}{}
|
||||
}
|
||||
}
|
||||
}
|
||||
if failedNodes == 0 && successfulNodes > 0 {
|
||||
m.updatePVEBackupTemplateSubjectsForType(instanceName, "qemu", qemuTemplateSubjects)
|
||||
}
|
||||
|
||||
// If we got ZERO VMs but had VMs before (likely cluster health issue),
|
||||
// preserve previous VMs instead of clearing them
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ func (m *Monitor) pollVMsAndContainersEfficient(ctx context.Context, instanceNam
|
|||
log.Debug().Err(err).Str("instance", instanceName).Msg("cluster/resources not available, falling back to traditional polling")
|
||||
return false
|
||||
}
|
||||
m.updatePVEBackupTemplateSubjectsFromClusterResources(instanceName, resources)
|
||||
|
||||
// Capture previous guest state once per poll cycle so fallback and grace-period
|
||||
// behavior is based on a consistent pre-poll snapshot.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue