// Pulse/internal/monitoring/monitor_backups.go
package monitoring

import (
"context"
"fmt"
"sort"
"strconv"
"strings"
"sync"
"time"

"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/internal/monitoring/errors"
proxmoxrecoverymapper "github.com/rcourtman/pulse-go-rewrite/internal/recovery/mapper/proxmox"
"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
"github.com/rcourtman/pulse-go-rewrite/pkg/pbs"
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
"github.com/rs/zerolog/log"
)
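// pollStorageBackupsWithNodes enumerates backup-capable storages on each
// online node, deduplicates shared-storage volumes by volid, and replaces the
// instance's storage-backup state. Storages on offline or failing nodes are
// marked for preservation so a partial outage never wipes known backups, and
// PBS-type storages are skipped when a direct PBS connection exists, since
// that connection is authoritative for PBS data.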
func (m *Monitor) pollStorageBackupsWithNodes(ctx context.Context, instanceName string, client PVEClientInterface, nodes []proxmox.Node, nodeEffectiveStatus map[string]string) {
var allBackups []models.StorageBackup
hasPBSDirectConnection := m.config != nil && len(m.config.PBSInstances) > 0
seenVolids := make(map[string]bool) // Track seen volume IDs to avoid duplicates
hadSuccessfulNode := false // Track if at least one node responded successfully
storagesWithBackup := 0 // Number of storages that should contain backups
contentSuccess := 0 // Number of successful storage content fetches
contentFailures := 0 // Number of failed storage content fetches
storageQueryErrors := 0 // Number of nodes where storage list could not be queried
hadPermissionError := false // Track if any permission errors occurred this cycle
storagePreserveNeeded := map[string]struct{}{}
storageSuccess := map[string]struct{}{}
readState := m.GetUnifiedReadStateOrSnapshot()
// Build guest lookup map to find actual node for each VMID
snapshot := m.state.GetSnapshot()
guestNodeMap := make(map[int]string) // VMID -> actual node name
populateGuestNodeMapFromReadState(readState, instanceName, guestNodeMap)
// For each node, get storage and check content
for _, node := range nodes {
if nodeEffectiveStatus[node.Node] != "online" {
for _, storageName := range storageNamesForNode(readState, instanceName, node.Node) {
storagePreserveNeeded[storageName] = struct{}{}
}
continue
}
// Get storage for this node - retry once on timeout
var storages []proxmox.Storage
var err error
for attempt := 1; attempt <= 2; attempt++ {
storages, err = client.GetStorage(ctx, node.Node)
if err == nil {
break // Success
}
// Check if it's a timeout error
errStr := err.Error()
if strings.Contains(errStr, "timeout") || strings.Contains(errStr, "deadline exceeded") {
if attempt == 1 {
log.Warn().
Str("node", node.Node).
Str("instance", instanceName).
Msg("Storage query timed out, retrying with extended timeout...")
// Give it a bit more time on retry
time.Sleep(2 * time.Second)
continue
}
}
// Non-timeout error or second attempt failed
break
}
if err != nil {
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_storage_for_backups", instanceName, err).WithNode(node.Node)
log.Warn().Err(monErr).Str("node", node.Node).Msg("failed to get storage for backups - skipping node")
for _, storageName := range storageNamesForNode(readState, instanceName, node.Node) {
storagePreserveNeeded[storageName] = struct{}{}
}
storageQueryErrors++
continue
}
hadSuccessfulNode = true
// For each storage that can contain backups
for _, storage := range storages {
// Check if storage supports backup content
if !strings.Contains(storage.Content, "backup") {
continue
}
if !storageContentQueryable(storage) {
continue
}
storagesWithBackup++
// Get storage content
contents, err := client.GetStorageContent(ctx, node.Node, storage.Storage)
if err != nil {
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_storage_content", instanceName, err).WithNode(node.Node)
errStr := strings.ToLower(err.Error())
// Check if this is a permission error
if strings.Contains(errStr, "403") || strings.Contains(errStr, "401") ||
strings.Contains(errStr, "permission") || strings.Contains(errStr, "forbidden") {
hadPermissionError = true
m.mu.Lock()
m.backupPermissionWarnings[instanceName] = "Missing PVEDatastoreAdmin permission on /storage. Run: pveum aclmod /storage -user pulse-monitor@pve -role PVEDatastoreAdmin"
m.mu.Unlock()
log.Warn().
Str("instance", instanceName).
Str("node", node.Node).
Str("storage", storage.Storage).
Msg("Backup permission denied - PVEDatastoreAdmin role may be missing on /storage")
} else {
log.Debug().Err(monErr).
Str("node", node.Node).
Str("storage", storage.Storage).
Msg("Failed to get storage content")
}
if _, ok := storageSuccess[storage.Storage]; !ok {
storagePreserveNeeded[storage.Storage] = struct{}{}
}
contentFailures++
continue
}
contentSuccess++
storageSuccess[storage.Storage] = struct{}{}
delete(storagePreserveNeeded, storage.Storage)
// Convert to models
for _, content := range contents {
// Skip if we've already seen this item (shared storage duplicate)
if seenVolids[content.Volid] {
continue
}
seenVolids[content.Volid] = true
// Skip templates and ISOs - they're not backups
if content.Content == "vztmpl" || content.Content == "iso" {
continue
}
// Determine type from content type and VMID
backupType := "unknown"
if content.VMID == 0 {
backupType = "host"
} else if strings.Contains(content.Volid, "/vm/") || strings.Contains(content.Volid, "qemu") {
backupType = "qemu"
} else if strings.Contains(content.Volid, "/ct/") || strings.Contains(content.Volid, "lxc") {
backupType = "lxc"
} else if strings.Contains(content.Format, "pbs-ct") {
// PBS format check as fallback
backupType = "lxc"
} else if strings.Contains(content.Format, "pbs-vm") {
// PBS format check as fallback
backupType = "qemu"
}
// Determine the correct node: for guest backups (VMID > 0), use the actual guest's node
// For host backups (VMID == 0), use the node where the backup was found
backupNode := node.Node
if content.VMID > 0 {
if actualNode, found := guestNodeMap[content.VMID]; found {
backupNode = actualNode
}
// If not found in map, fall back to queried node (shouldn't happen normally)
}
isPBSStorage := strings.HasPrefix(storage.Storage, "pbs-") || storage.Type == "pbs"
if isPBSStorage && hasPBSDirectConnection {
log.Debug().
Str("instance", instanceName).
Str("node", node.Node).
Str("storage", storage.Storage).
Str("volid", content.Volid).
Msg("Skipping PBS backup from PVE storage - PBS direct connection is authoritative")
continue
}
// Check verification status for PBS backups
verified := false
verificationInfo := ""
if isPBSStorage {
// Check if verified flag is set
if content.Verified > 0 {
verified = true
}
// Also check verification map if available
if content.Verification != nil {
if state, ok := content.Verification["state"].(string); ok {
verified = (state == "ok")
verificationInfo = state
}
}
}
backup := models.StorageBackup{
ID: fmt.Sprintf("%s-%s", instanceName, content.Volid),
Storage: storage.Storage,
Node: backupNode,
Instance: instanceName,
Type: backupType,
VMID: content.VMID,
Time: time.Unix(content.CTime, 0),
CTime: content.CTime,
Size: int64(content.Size),
Format: content.Format,
Notes: content.Notes,
Protected: content.Protected > 0,
Volid: content.Volid,
IsPBS: isPBSStorage,
Verified: verified,
Verification: verificationInfo,
}
allBackups = append(allBackups, backup)
}
}
}
allBackups, preservedStorages := preserveFailedStorageBackups(instanceName, snapshot, storagePreserveNeeded, allBackups)
if len(preservedStorages) > 0 {
log.Warn().
Str("instance", instanceName).
Strs("storages", preservedStorages).
Msg("Preserving previous storage backup data due to partial failures")
}
// Decide whether to keep existing backups when every query failed
if shouldPreserveBackups(len(nodes), hadSuccessfulNode, storagesWithBackup, contentSuccess) {
if len(nodes) > 0 && !hadSuccessfulNode {
log.Warn().
Str("instance", instanceName).
Int("nodes", len(nodes)).
Int("errors", storageQueryErrors).
Msg("Failed to query storage on all nodes; keeping previous backup list")
} else if storagesWithBackup > 0 && contentSuccess == 0 {
log.Warn().
Str("instance", instanceName).
Int("storages", storagesWithBackup).
Int("failures", contentFailures).
Msg("All storage content queries failed; keeping previous backup list")
}
return
}
// Update state with storage backups for this instance
m.state.UpdateStorageBackupsForInstance(instanceName, allBackups)
// Best-effort ingestion into recovery store (for rollups / unified backups UX).
guestInfo := buildProxmoxGuestInfoIndex(readState)
m.ingestRecoveryPointsAsync(proxmoxrecoverymapper.FromPVEStorageBackups(allBackups, guestInfo))
// Sync backup times to VMs/Containers for backup status indicators
m.state.SyncGuestBackupTimes()
if m.alertManager != nil {
guestsByKey, guestsByVMID := buildGuestLookupsFromReadState(m.GetUnifiedReadStateOrSnapshot(), m.guestMetadataStore)
rollups, err := m.listBackupRollupsForAlerts(ctx)
if err != nil {
log.Warn().Err(err).Msg("Failed to list recovery rollups for backup alerts")
} else {
m.alertManager.CheckBackups(rollups, guestsByKey, guestsByVMID)
}
}
// Clear permission warning if no permission errors occurred this cycle
if !hadPermissionError {
m.mu.Lock()
delete(m.backupPermissionWarnings, instanceName)
m.mu.Unlock()
}
log.Debug().
Str("instance", instanceName).
Int("count", len(allBackups)).
Msg("Storage backups polled")
// Immediately broadcast the updated state so frontend sees new backups
m.broadcastStateUpdate()
}
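// shouldPreserveBackups reports whether the previous backup list should be
// kept instead of being replaced by this cycle's (possibly empty) results.
// Illustrative cases:
//
//	shouldPreserveBackups(3, false, 0, 0) // true:  no node answered the storage query
//	shouldPreserveBackups(3, true, 2, 0)  // true:  backup storages exist, every content fetch failed
//	shouldPreserveBackups(3, true, 2, 1)  // false: at least one content fetch succeeded
//	shouldPreserveBackups(0, false, 0, 0) // false: nothing was queried, nothing to preserve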
func shouldPreserveBackups(nodeCount int, hadSuccessfulNode bool, storagesWithBackup, contentSuccess int) bool {
if nodeCount > 0 && !hadSuccessfulNode {
return true
}
if storagesWithBackup > 0 && contentSuccess == 0 {
return true
}
return false
}
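// shouldPreservePBSBackups reports whether cached PBS backups should be kept,
// e.g. shouldPreservePBSBackups(2, 0) == true (both datastores failed), while
// shouldPreservePBSBackups(2, 1) == false (one datastore still succeeded).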
func shouldPreservePBSBackups(datastoreCount, datastoreFetches int) bool {
// If there are datastores but all fetches failed, preserve existing backups
if datastoreCount > 0 && datastoreFetches == 0 {
return true
}
return false
}
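// storageNamesForNode returns the names of backup-capable storages visible
// from the given node, either because the storage lives on that node or
// because the node appears in the storage's accessible-node list.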
func storageNamesForNode(readState unifiedresources.ReadState, instanceName, nodeName string) []string {
if readState == nil || nodeName == "" {
return nil
}
var storages []string
for _, storage := range readState.StoragePools() {
if storage == nil || storage.Instance() != instanceName {
continue
}
if storage.Name() == "" {
continue
}
if !strings.Contains(storage.Content(), "backup") {
continue
}
if storage.Node() == nodeName {
storages = append(storages, storage.Name())
continue
}
for _, node := range storage.AccessibleNodes() {
if node == nodeName {
storages = append(storages, storage.Name())
break
}
}
}
return storages
}
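// preserveFailedStorageBackups re-adds backups from the previous snapshot for
// storages whose queries failed this cycle, skipping IDs already present in
// the current list. It returns the merged list plus the sorted names of the
// storages that actually contributed preserved entries (for logging).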
func preserveFailedStorageBackups(instanceName string, snapshot models.StateSnapshot, storagesToPreserve map[string]struct{}, current []models.StorageBackup) ([]models.StorageBackup, []string) {
if len(storagesToPreserve) == 0 {
return current, nil
}
existing := make(map[string]struct{}, len(current))
for _, backup := range current {
existing[backup.ID] = struct{}{}
}
preserved := make(map[string]struct{})
for _, backup := range snapshot.PVEBackups.StorageBackups {
if backup.Instance != instanceName {
continue
}
if _, ok := storagesToPreserve[backup.Storage]; !ok {
continue
}
if _, duplicate := existing[backup.ID]; duplicate {
continue
}
current = append(current, backup)
existing[backup.ID] = struct{}{}
preserved[backup.Storage] = struct{}{}
}
if len(preserved) == 0 {
return current, nil
}
storages := make([]string, 0, len(preserved))
for storage := range preserved {
storages = append(storages, storage)
}
sort.Strings(storages)
return current, storages
}
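// buildGuestLookupsFromReadState builds two alert lookup indexes: byKey maps
// the composite guest key (instance/node/vmid) to a single guest, with VMs
// taking precedence over containers on key collisions, while byVMID maps the
// bare VMID string to every guest sharing it across instances and nodes.
// Persisted metadata fills in guests that have since been deleted.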
func buildGuestLookupsFromReadState(readState unifiedresources.ReadState, metadataStore *config.GuestMetadataStore) (map[string]alerts.GuestLookup, map[string][]alerts.GuestLookup) {
byKey := make(map[string]alerts.GuestLookup)
byVMID := make(map[string][]alerts.GuestLookup)
if readState == nil {
if metadataStore != nil {
enrichWithPersistedMetadata(metadataStore, byVMID)
}
return byKey, byVMID
}
for _, vm := range readState.VMs() {
if vm == nil {
continue
}
info := alerts.GuestLookup{
ResourceID: makeGuestID(vm.Instance(), vm.Node(), vm.VMID()),
Name: vm.Name(),
Instance: vm.Instance(),
Node: vm.Node(),
Type: "qemu",
VMID: vm.VMID(),
}
key := alerts.BuildGuestKey(vm.Instance(), vm.Node(), vm.VMID())
byKey[key] = info
vmidKey := strconv.Itoa(vm.VMID())
byVMID[vmidKey] = append(byVMID[vmidKey], info)
// Persist last-known name and type for this guest
if metadataStore != nil && vm.Name() != "" {
persistGuestIdentity(metadataStore, key, vm.Name(), info.Type)
}
}
for _, ct := range readState.Containers() {
if ct == nil {
continue
}
guestType := firstNonEmptyString(ct.ContainerType(), "lxc")
info := alerts.GuestLookup{
ResourceID: makeGuestID(ct.Instance(), ct.Node(), ct.VMID()),
Name: ct.Name(),
Instance: ct.Instance(),
Node: ct.Node(),
Type: guestType,
VMID: ct.VMID(),
}
key := alerts.BuildGuestKey(ct.Instance(), ct.Node(), ct.VMID())
if _, exists := byKey[key]; !exists {
byKey[key] = info
}
vmidKey := strconv.Itoa(ct.VMID())
byVMID[vmidKey] = append(byVMID[vmidKey], info)
// Persist last-known name and type for this guest
if metadataStore != nil && ct.Name() != "" {
persistGuestIdentity(metadataStore, key, ct.Name(), guestType)
}
}
// Augment byVMID with persisted metadata for deleted guests
if metadataStore != nil {
enrichWithPersistedMetadata(metadataStore, byVMID)
}
return byKey, byVMID
}
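// populateGuestNodeMapFromReadState fills guestNodeMap with VMID -> node for
// every VM and container belonging to the given instance.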
func populateGuestNodeMapFromReadState(readState unifiedresources.ReadState, instanceName string, guestNodeMap map[int]string) {
if readState == nil {
return
}
for _, vm := range readState.VMs() {
if vm == nil || vm.Instance() != instanceName {
continue
}
guestNodeMap[vm.VMID()] = vm.Node()
}
for _, ct := range readState.Containers() {
if ct == nil || ct.Instance() != instanceName {
continue
}
guestNodeMap[ct.VMID()] = ct.Node()
}
}
// enrichWithPersistedMetadata adds entries from the metadata store for guests
// that no longer exist in the live inventory but have persisted identity data
func enrichWithPersistedMetadata(metadataStore *config.GuestMetadataStore, byVMID map[string][]alerts.GuestLookup) {
allMetadata := metadataStore.GetAll()
for guestKey, meta := range allMetadata {
if meta.LastKnownName == "" {
continue // No name persisted, skip
}
// Parse the guest key (format: instance:node:vmid)
var instance, node string
parts := strings.Split(guestKey, ":")
if len(parts) != 3 {
continue
}
instance, node = parts[0], parts[1]
vmid, err := strconv.Atoi(parts[2])
if err != nil {
continue
}
vmidKey := strconv.Itoa(vmid)
// Check if we already have a live entry for this exact guest
hasLiveEntry := false
for _, existing := range byVMID[vmidKey] {
if existing.Instance == instance && existing.Node == node && existing.VMID == vmid {
hasLiveEntry = true
break
}
}
// Only add persisted metadata if no live entry exists
if !hasLiveEntry {
byVMID[vmidKey] = append(byVMID[vmidKey], alerts.GuestLookup{
Name: meta.LastKnownName,
Instance: instance,
Node: node,
Type: meta.LastKnownType,
VMID: vmid,
})
}
}
}
// persistGuestIdentity updates the metadata store with the last-known name and type for a guest
func persistGuestIdentity(metadataStore *config.GuestMetadataStore, guestKey, name, guestType string) {
existing := metadataStore.Get(guestKey)
if existing == nil {
existing = &config.GuestMetadata{
ID: guestKey,
Tags: []string{},
}
}
guestType = strings.TrimSpace(guestType)
if guestType == "" {
return
}
// Never "downgrade" OCI containers back to LXC. OCI classification can be transiently
// unavailable if Proxmox config reads fail due to permissions or transient API errors.
if existing.LastKnownType == "oci" && guestType != "oci" {
guestType = existing.LastKnownType
}
// Only update if the name or type has changed
if existing.LastKnownName != name || existing.LastKnownType != guestType {
existing.LastKnownName = name
existing.LastKnownType = guestType
// Save asynchronously to avoid blocking the monitor
go func() {
if err := metadataStore.Set(guestKey, existing); err != nil {
log.Error().Err(err).Str("guestKey", guestKey).Msg("failed to persist guest identity")
}
}()
}
}
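// calculateBackupOperationTimeout scales the backup polling budget with the
// number of non-template guests on the instance: 2s per guest, clamped to the
// [2m, 5m] range. For example, 10 guests -> 2m (floor), 90 guests -> 3m, and
// 200 guests -> 5m (cap).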
func (m *Monitor) calculateBackupOperationTimeout(instanceName string) time.Duration {
const (
minTimeout = 2 * time.Minute
maxTimeout = 5 * time.Minute
timeoutPerGuest = 2 * time.Second
)
timeout := minTimeout
readState := m.GetUnifiedReadStateOrSnapshot()
guestCount := 0
for _, vm := range readState.VMs() {
if vm != nil && vm.Instance() == instanceName && !vm.Template() {
guestCount++
}
}
for _, ct := range readState.Containers() {
if ct != nil && ct.Instance() == instanceName && !ct.Template() {
guestCount++
}
}
if guestCount > 0 {
dynamic := time.Duration(guestCount) * timeoutPerGuest
if dynamic > timeout {
timeout = dynamic
}
}
if timeout > maxTimeout {
return maxTimeout
}
return timeout
}
// pollGuestSnapshots polls snapshots for all VMs and containers
func (m *Monitor) pollGuestSnapshots(ctx context.Context, instanceName string, client PVEClientInterface) {
log.Debug().Str("instance", instanceName).Msg("polling guest snapshots")
readState := m.GetUnifiedReadStateOrSnapshot()
var vms []models.VM
for _, vm := range readState.VMs() {
if vm == nil || vm.Instance() != instanceName {
continue
}
vms = append(vms, vmFromReadStateView(vm))
}
var containers []models.Container
for _, ct := range readState.Containers() {
if ct == nil || ct.Instance() != instanceName {
continue
}
containers = append(containers, containerFromReadStateView(ct))
}
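// guestKey builds the lookup key used for guest display names:
// "node-vmid" for standalone hosts (instance == node), otherwise
// "instance-node-vmid".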
guestKey := func(instance, node string, vmid int) string {
if instance == node {
return fmt.Sprintf("%s-%d", node, vmid)
}
return fmt.Sprintf("%s-%s-%d", instance, node, vmid)
}
guestNames := make(map[string]string, len(vms)+len(containers))
for _, vm := range vms {
guestNames[guestKey(instanceName, vm.Node, vm.VMID)] = vm.Name
}
for _, ct := range containers {
guestNames[guestKey(instanceName, ct.Node, ct.VMID)] = ct.Name
}
activeGuests := 0
for _, vm := range vms {
if !vm.Template {
activeGuests++
}
}
for _, ct := range containers {
if !ct.Template {
activeGuests++
}
}
const (
minSnapshotTimeout = 60 * time.Second
maxSnapshotTimeout = 4 * time.Minute
snapshotTimeoutPerGuest = 2 * time.Second
)
timeout := minSnapshotTimeout
if activeGuests > 0 {
dynamic := time.Duration(activeGuests) * snapshotTimeoutPerGuest
if dynamic > timeout {
timeout = dynamic
}
}
if timeout > maxSnapshotTimeout {
timeout = maxSnapshotTimeout
}
if deadline, ok := ctx.Deadline(); ok {
remaining := time.Until(deadline)
if remaining <= 0 {
log.Warn().
Str("instance", instanceName).
Msg("Skipping guest snapshot polling; backup context deadline exceeded")
return
}
if timeout > remaining {
timeout = remaining
}
}
snapshotCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
log.Debug().
Str("instance", instanceName).
Int("guestCount", activeGuests).
Dur("timeout", timeout).
Msg("Guest snapshot polling budget established")
var allSnapshots []models.GuestSnapshot
deadlineExceeded := false
// Poll VM snapshots
for _, vm := range vms {
// Skip templates
if vm.Template {
continue
}
snapshots, err := client.GetVMSnapshots(snapshotCtx, vm.Node, vm.VMID)
if err != nil {
if snapshotCtx.Err() != nil {
log.Warn().
Str("instance", instanceName).
Str("node", vm.Node).
Int("vmid", vm.VMID).
Err(snapshotCtx.Err()).
Msg("Aborting guest snapshot polling due to context cancellation while fetching VM snapshots")
deadlineExceeded = true
break
}
// This is common for VMs without snapshots, so use debug level
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_vm_snapshots", instanceName, err).WithNode(vm.Node)
log.Debug().
Err(monErr).
Str("node", vm.Node).
Int("vmid", vm.VMID).
Msg("Failed to get VM snapshots")
continue
}
for _, snap := range snapshots {
snapshot := models.GuestSnapshot{
ID: fmt.Sprintf("%s-%s-%d-%s", instanceName, vm.Node, vm.VMID, snap.Name),
Name: snap.Name,
Node: vm.Node,
Instance: instanceName,
Type: "qemu",
VMID: vm.VMID,
Time: time.Unix(snap.SnapTime, 0),
Description: snap.Description,
Parent: snap.Parent,
VMState: true, // VM state support enabled
}
allSnapshots = append(allSnapshots, snapshot)
}
}
if deadlineExceeded {
log.Warn().
Str("instance", instanceName).
Msg("Guest snapshot polling timed out before completing VM collection; retaining previous snapshots")
return
}
// Poll container snapshots
for _, ct := range containers {
// Skip templates
if ct.Template {
continue
}
snapshots, err := client.GetContainerSnapshots(snapshotCtx, ct.Node, ct.VMID)
if err != nil {
if snapshotCtx.Err() != nil {
log.Warn().
Str("instance", instanceName).
Str("node", ct.Node).
Int("vmid", ct.VMID).
Err(snapshotCtx.Err()).
Msg("Aborting guest snapshot polling due to context cancellation while fetching container snapshots")
deadlineExceeded = true
break
}
// API error 596 means snapshots not supported/available - this is expected for many containers
errStr := err.Error()
if strings.Contains(errStr, "596") || strings.Contains(errStr, "not available") {
// Silently skip containers without snapshot support
continue
}
// Log other errors at debug level
monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_container_snapshots", instanceName, err).WithNode(ct.Node)
log.Debug().
Err(monErr).
Str("node", ct.Node).
Int("vmid", ct.VMID).
Msg("Failed to get container snapshots")
continue
}
for _, snap := range snapshots {
snapshot := models.GuestSnapshot{
ID: fmt.Sprintf("%s-%s-%d-%s", instanceName, ct.Node, ct.VMID, snap.Name),
Name: snap.Name,
Node: ct.Node,
Instance: instanceName,
Type: "lxc",
VMID: ct.VMID,
Time: time.Unix(snap.SnapTime, 0),
Description: snap.Description,
Parent: snap.Parent,
VMState: false,
}
allSnapshots = append(allSnapshots, snapshot)
}
}
if deadlineExceeded || snapshotCtx.Err() != nil {
log.Warn().
Str("instance", instanceName).
Msg("Guest snapshot polling timed out before completion; retaining previous snapshots")
return
}
if len(allSnapshots) > 0 {
sizeMap := m.collectSnapshotSizes(snapshotCtx, instanceName, client, allSnapshots)
if len(sizeMap) > 0 {
for i := range allSnapshots {
if size, ok := sizeMap[allSnapshots[i].ID]; ok && size > 0 {
allSnapshots[i].SizeBytes = size
}
}
}
}
// Update state with guest snapshots for this instance
m.state.UpdateGuestSnapshotsForInstance(instanceName, allSnapshots)
// Best-effort ingestion into recovery store (for rollups / unified backups UX).
guestInfo := buildProxmoxGuestInfoIndex(readState)
m.ingestRecoveryPointsAsync(proxmoxrecoverymapper.FromPVEGuestSnapshots(allSnapshots, guestInfo))
if m.alertManager != nil {
m.alertManager.CheckSnapshotsForInstance(instanceName, allSnapshots, guestNames)
}
log.Debug().
Str("instance", instanceName).
Int("count", len(allSnapshots)).
Msg("Guest snapshots polled")
// Immediately broadcast the updated state so frontend sees new snapshots
m.broadcastStateUpdate()
}
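// collectSnapshotSizes queries images/rootdir storage content on every node
// that hosts one of the given snapshots, derives each volume's snapshot name
// from its volid (via extractSnapshotName, defined elsewhere), and sums the
// per-volume sizes under the matching snapshot ID. Volumes are deduplicated
// by volid so shared storage is only counted once.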
func (m *Monitor) collectSnapshotSizes(ctx context.Context, instanceName string, client PVEClientInterface, snapshots []models.GuestSnapshot) map[string]int64 {
sizes := make(map[string]int64, len(snapshots))
if len(snapshots) == 0 {
return sizes
}
validSnapshots := make(map[string]struct{}, len(snapshots))
nodes := make(map[string]struct{})
for _, snap := range snapshots {
validSnapshots[snap.ID] = struct{}{}
if snap.Node != "" {
nodes[snap.Node] = struct{}{}
}
}
if len(nodes) == 0 {
return sizes
}
seenVolids := make(map[string]struct{})
for nodeName := range nodes {
if ctx.Err() != nil {
break
}
storages, err := client.GetStorage(ctx, nodeName)
if err != nil {
log.Debug().
Err(err).
Str("node", nodeName).
Str("instance", instanceName).
Msg("Failed to get storage list for snapshot sizing")
continue
}
for _, storage := range storages {
if ctx.Err() != nil {
break
}
contentTypes := strings.ToLower(storage.Content)
if !strings.Contains(contentTypes, "images") && !strings.Contains(contentTypes, "rootdir") {
continue
}
if !storageContentQueryable(storage) {
continue
}
contents, err := client.GetStorageContent(ctx, nodeName, storage.Storage)
if err != nil {
log.Debug().
Err(err).
Str("node", nodeName).
Str("storage", storage.Storage).
Str("instance", instanceName).
Msg("Failed to get storage content for snapshot sizing")
continue
}
for _, item := range contents {
if item.VMID <= 0 {
continue
}
if _, seen := seenVolids[item.Volid]; seen {
continue
}
snapName := extractSnapshotName(item.Volid)
if snapName == "" {
continue
}
key := fmt.Sprintf("%s-%s-%d-%s", instanceName, nodeName, item.VMID, snapName)
if _, ok := validSnapshots[key]; !ok {
continue
}
seenVolids[item.Volid] = struct{}{}
size := int64(item.Size)
if size < 0 {
size = 0
}
sizes[key] += size
}
}
}
return sizes
}
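// recordAuthFailure increments the per-node authentication failure counter.
// Once maxAuthFailures (5) is reached, the counter is cleared and the node's
// data is removed from state, so the UI shows an explicit failure rather than
// stale metrics.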
func (m *Monitor) recordAuthFailure(instanceName string, nodeType string) {
nodeID := instanceName
if nodeType != "" {
nodeID = nodeType + "-" + instanceName
}
m.mu.Lock()
m.authFailures[nodeID]++
failures := m.authFailures[nodeID]
m.lastAuthAttempt[nodeID] = time.Now()
m.mu.Unlock()
log.Warn().
Str("node", nodeID).
Int("failures", failures).
Msg("Authentication failure recorded")
const maxAuthFailures = 5
if failures >= maxAuthFailures {
// Clear tracking first, then perform removal outside the monitor lock.
// Removal updates state/health and may need to acquire monitor locks internally.
m.mu.Lock()
delete(m.authFailures, nodeID)
delete(m.lastAuthAttempt, nodeID)
m.mu.Unlock()
log.Error().
Str("node", nodeID).
Int("failures", failures).
Msg("Maximum authentication failures reached, removing node from state")
// Remove from state based on type
if nodeType == "pve" {
m.removeFailedPVENode(instanceName)
} else if nodeType == "pbs" {
m.removeFailedPBSNode(instanceName)
} else if nodeType == "pmg" {
m.removeFailedPMGInstance(instanceName)
}
}
}
// resetAuthFailures resets the failure count for a node after successful auth
func (m *Monitor) resetAuthFailures(instanceName string, nodeType string) {
m.mu.Lock()
defer m.mu.Unlock()
nodeID := instanceName
if nodeType != "" {
nodeID = nodeType + "-" + instanceName
}
if count, exists := m.authFailures[nodeID]; exists && count > 0 {
log.Info().
Str("node", nodeID).
Int("previousFailures", count).
Msg("Authentication succeeded, resetting failure count")
delete(m.authFailures, nodeID)
delete(m.lastAuthAttempt, nodeID)
}
}
// removeFailedPVENode updates a PVE node to show failed authentication status
func (m *Monitor) removeFailedPVENode(instanceName string) {
// Get instance config to get host URL
var hostURL string
m.mu.RLock()
if m.config != nil {
for _, cfg := range m.config.PVEInstances {
if cfg.Name == instanceName {
hostURL = cfg.Host
break
}
}
}
m.mu.RUnlock()
// Create a failed node entry to show in UI with error status
failedNode := models.Node{
ID: instanceName + "-failed",
Name: instanceName,
DisplayName: instanceName,
Instance: instanceName,
Host: hostURL, // Include host URL even for failed nodes
Status: "offline",
Type: "node",
ConnectionHealth: "error",
LastSeen: time.Now(),
// Set other fields to zero values to indicate no data
CPU: 0,
Memory: models.Memory{},
Disk: models.Disk{},
}
// Update with just the failed node
m.state.UpdateNodesForInstance(instanceName, []models.Node{failedNode})
// Remove all other resources associated with this instance
m.state.UpdateVMsForInstance(instanceName, []models.VM{})
m.state.UpdateContainersForInstance(instanceName, []models.Container{})
m.state.UpdateStorageForInstance(instanceName, []models.Storage{})
m.state.UpdateCephClustersForInstance(instanceName, []models.CephCluster{})
m.state.UpdateBackupTasksForInstance(instanceName, []models.BackupTask{})
m.state.UpdateStorageBackupsForInstance(instanceName, []models.StorageBackup{})
m.state.UpdateGuestSnapshotsForInstance(instanceName, []models.GuestSnapshot{})
// Set connection health to false
m.setProviderConnectionHealth(InstanceTypePVE, instanceName, false)
}
// removeFailedPBSNode removes a PBS node and all its resources from state
func (m *Monitor) removeFailedPBSNode(instanceName string) {
// Remove PBS instance by passing empty array
currentInstances := m.state.PBSInstances
var updatedInstances []models.PBSInstance
for _, inst := range currentInstances {
if inst.Name != instanceName {
updatedInstances = append(updatedInstances, inst)
}
}
m.state.UpdatePBSInstances(updatedInstances)
// Remove PBS backups
m.state.UpdatePBSBackups(instanceName, []models.PBSBackup{})
// Set connection health to false
m.setProviderConnectionHealth(InstanceTypePBS, instanceName, false)
}
// removeFailedPMGInstance removes PMG data from state when authentication fails repeatedly
func (m *Monitor) removeFailedPMGInstance(instanceName string) {
currentInstances := m.state.PMGInstances
updated := make([]models.PMGInstance, 0, len(currentInstances))
for _, inst := range currentInstances {
if inst.Name != instanceName {
updated = append(updated, inst)
}
}
m.state.UpdatePMGInstances(updated)
m.state.UpdatePMGBackups(instanceName, nil)
m.setProviderConnectionHealth(InstanceTypePMG, instanceName, false)
}
// pbsBackupCacheTTL controls how long cached PBS backup snapshots are reused
// before forcing a re-fetch. This ensures verification status changes (which
// don't alter backup count or timestamp) are picked up periodically.
const pbsBackupCacheTTL = 10 * time.Minute
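// pbsBackupGroupKey uniquely identifies a PBS backup group within an instance
// by datastore, namespace, backup type, and backup ID.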
type pbsBackupGroupKey struct {
datastore string
namespace string
backupType string
backupID string
}
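// cachedPBSGroup holds the snapshots previously fetched for one backup group
// along with the newest backup time seen. pollPBSBackups reuses a non-empty
// cache only while all of the following hold (a restatement of the check in
// that loop):
//
//	len(cached.snapshots) == group.BackupCount // group membership unchanged
//	!lastBackupTime.After(cached.latest)       // no newer backup appeared
//	cacheAge < pbsBackupCacheTTL               // TTL has not expired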
type cachedPBSGroup struct {
snapshots []models.PBSBackup
latest time.Time
}
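// pbsBackupFetchRequest describes one backup group that must be (re-)fetched,
// carrying the cached snapshots as a fallback if the fetch fails.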
type pbsBackupFetchRequest struct {
datastore string
namespace string
group pbs.BackupGroup
cached cachedPBSGroup
}
// pollPBSBackups fetches all backups from PBS datastores
func (m *Monitor) pollPBSBackups(ctx context.Context, instanceName string, client *pbs.Client, datastores []models.PBSDatastore) {
log.Debug().Str("instance", instanceName).Msg("polling PBS backups")
// Cache existing PBS backups so we can avoid redundant API calls when no changes occurred.
existingGroups := m.buildPBSBackupCache(instanceName)
var allBackups []models.PBSBackup
datastoreCount := len(datastores) // Number of datastores to query
datastoreFetches := 0 // Number of successful datastore fetches
datastoreErrors := 0 // Number of failed datastore fetches
datastoreTerminalFailures := 0 // Number of datastores that failed only with terminal errors
// Process each datastore
for _, ds := range datastores {
if ctx.Err() != nil {
log.Warn().
Str("instance", instanceName).
Msg("PBS backup polling cancelled before completion")
return
}
namespacePaths := namespacePathsForDatastore(ds)
log.Info().
Str("instance", instanceName).
Str("datastore", ds.Name).
Int("namespaces", len(namespacePaths)).
Strs("namespace_paths", namespacePaths).
Msg("Processing datastore namespaces")
datastoreHadSuccess := false
datastoreNamespaceErrors := 0
datastoreTerminalNamespaceErrors := 0
groupsReused := 0
groupsRequested := 0
for _, namespace := range namespacePaths {
if ctx.Err() != nil {
log.Warn().
Str("instance", instanceName).
Msg("PBS backup polling cancelled mid-datastore")
return
}
groups, err := client.ListBackupGroups(ctx, ds.Name, namespace)
if err != nil {
datastoreNamespaceErrors++
if !shouldReuseCachedPBSBackups(err) {
datastoreTerminalNamespaceErrors++
}
log.Error().
Err(err).
Str("instance", instanceName).
Str("datastore", ds.Name).
Str("namespace", namespace).
Msg("Failed to list PBS backup groups")
continue
}
datastoreHadSuccess = true
requests := make([]pbsBackupFetchRequest, 0, len(groups))
for _, group := range groups {
key := pbsBackupGroupKey{
datastore: ds.Name,
namespace: namespace,
backupType: group.BackupType,
backupID: group.BackupID,
}
cached := existingGroups[key]
// Group deleted (no backups left) - ensure cached data is dropped.
if group.BackupCount == 0 {
continue
}
lastBackupTime := time.Unix(group.LastBackup, 0)
hasCachedData := len(cached.snapshots) > 0
// Check if the cached data is still within its TTL.
cacheAge := time.Since(m.pbsBackupCacheTimeFor(instanceName, key))
cacheStillFresh := cacheAge < pbsBackupCacheTTL
// Only re-fetch when the backup count changes, the most recent backup
// is newer, or the cache TTL has expired (to pick up verification changes).
if hasCachedData &&
cacheStillFresh &&
len(cached.snapshots) == group.BackupCount &&
!lastBackupTime.After(cached.latest) {
allBackups = append(allBackups, cached.snapshots...)
groupsReused++
continue
}
requests = append(requests, pbsBackupFetchRequest{
datastore: ds.Name,
namespace: namespace,
group: group,
cached: cached,
})
}
if len(requests) == 0 {
continue
}
groupsRequested += len(requests)
fetched := m.fetchPBSBackupSnapshots(ctx, client, instanceName, requests)
if len(fetched) > 0 {
allBackups = append(allBackups, fetched...)
}
// Record fetch time for each requested group so the TTL tracks freshness.
// We record for all requested groups — on fetch failure, fetchPBSBackupSnapshots
// falls back to cached data, so the timestamp prevents hammering a failing
// endpoint. The TTL ensures we retry within a bounded window.
fetchedAt := time.Now()
for _, req := range requests {
reqKey := pbsBackupGroupKey{
datastore: req.datastore,
namespace: req.namespace,
backupType: req.group.BackupType,
backupID: req.group.BackupID,
}
m.setPBSBackupCacheTime(instanceName, reqKey, fetchedAt)
}
}
if datastoreHadSuccess {
datastoreFetches++
log.Info().
Str("instance", instanceName).
Str("datastore", ds.Name).
Int("namespaces", len(namespacePaths)).
Int("groups_reused", groupsReused).
Int("groups_refreshed", groupsRequested).
Msg("PBS datastore processed")
} else {
allNamespaceErrorsTerminal := datastoreNamespaceErrors > 0 &&
datastoreTerminalNamespaceErrors == datastoreNamespaceErrors
if allNamespaceErrorsTerminal {
datastoreTerminalFailures++
log.Warn().
Str("instance", instanceName).
Str("datastore", ds.Name).
Int("namespace_errors", datastoreNamespaceErrors).
Msg("No namespaces succeeded for PBS datastore due to terminal errors; clearing cached backups")
} else {
// Preserve cached data for this datastore when failures are transient.
log.Warn().
Str("instance", instanceName).
Str("datastore", ds.Name).
Msg("No namespaces succeeded for PBS datastore; using cached backups")
for key, entry := range existingGroups {
if key.datastore != ds.Name || len(entry.snapshots) == 0 {
continue
}
allBackups = append(allBackups, entry.snapshots...)
}
}
datastoreErrors++
}
}
log.Info().
Str("instance", instanceName).
Int("count", len(allBackups)).
Msg("PBS backups fetched")
// Decide whether to keep existing backups when all queries failed
if shouldPreservePBSBackupsWithTerminal(datastoreCount, datastoreFetches, datastoreTerminalFailures) {
log.Warn().
Str("instance", instanceName).
Int("datastores", datastoreCount).
Int("errors", datastoreErrors).
Int("terminal_failures", datastoreTerminalFailures).
Msg("All PBS datastore queries failed; keeping previous backup list")
return
}
// Update state
m.state.UpdatePBSBackups(instanceName, allBackups)
// Best-effort ingestion into recovery store (for rollups / unified backups UX).
candidates := buildPBSGuestCandidates(m.GetUnifiedReadStateOrSnapshot())
m.ingestRecoveryPointsAsync(proxmoxrecoverymapper.FromPBSBackups(allBackups, candidates))
// Sync backup times to VMs/Containers for backup status indicators
m.state.SyncGuestBackupTimes()
if m.alertManager != nil {
guestsByKey, guestsByVMID := buildGuestLookupsFromReadState(m.GetUnifiedReadStateOrSnapshot(), m.guestMetadataStore)
rollups, err := m.listBackupRollupsForAlerts(context.Background())
if err != nil {
log.Warn().Err(err).Msg("Failed to list recovery rollups for backup alerts")
} else {
m.alertManager.CheckBackups(rollups, guestsByKey, guestsByVMID)
}
}
// Immediately broadcast the updated state so frontend sees new backups
m.broadcastStateUpdate()
}
func (m *Monitor) buildPBSBackupCache(instanceName string) map[pbsBackupGroupKey]cachedPBSGroup {
snapshot := m.state.GetSnapshot()
cache := make(map[pbsBackupGroupKey]cachedPBSGroup)
for _, backup := range snapshot.PBSBackups {
if backup.Instance != instanceName {
continue
}
key := pbsBackupGroupKey{
datastore: backup.Datastore,
namespace: normalizePBSNamespacePath(backup.Namespace),
backupType: backup.BackupType,
backupID: backup.VMID,
}
entry := cache[key]
entry.snapshots = append(entry.snapshots, backup)
if backup.BackupTime.After(entry.latest) {
entry.latest = backup.BackupTime
}
cache[key] = entry
}
return cache
}
// pbsBackupCacheTimeFor returns the last fetch time for a PBS backup group.
func (m *Monitor) pbsBackupCacheTimeFor(instanceName string, key pbsBackupGroupKey) time.Time {
m.mu.RLock()
defer m.mu.RUnlock()
if perGroup, ok := m.pbsBackupCacheTime[instanceName]; ok {
return perGroup[key]
}
return time.Time{}
}
// setPBSBackupCacheTime records when a PBS backup group was last fetched.
func (m *Monitor) setPBSBackupCacheTime(instanceName string, key pbsBackupGroupKey, t time.Time) {
m.mu.Lock()
defer m.mu.Unlock()
if m.pbsBackupCacheTime == nil {
m.pbsBackupCacheTime = make(map[string]map[pbsBackupGroupKey]time.Time)
}
if m.pbsBackupCacheTime[instanceName] == nil {
m.pbsBackupCacheTime[instanceName] = make(map[pbsBackupGroupKey]time.Time)
}
m.pbsBackupCacheTime[instanceName][key] = t
}
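// normalizePBSNamespacePath maps the PBS root namespace ("/") to the empty
// string so cache keys and API calls agree: "/" -> "", "ns1" -> "ns1".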
func normalizePBSNamespacePath(ns string) string {
if ns == "/" {
return ""
}
return ns
}
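// namespacePathsForDatastore returns the datastore's namespace paths,
// normalized and deduplicated, defaulting to the root namespace ("") when
// none are configured.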
func namespacePathsForDatastore(ds models.PBSDatastore) []string {
if len(ds.Namespaces) == 0 {
return []string{""}
}
seen := make(map[string]struct{}, len(ds.Namespaces))
var paths []string
for _, ns := range ds.Namespaces {
path := normalizePBSNamespacePath(ns.Path)
if _, ok := seen[path]; ok {
continue
}
seen[path] = struct{}{}
paths = append(paths, path)
}
return paths
}
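// fetchPBSBackupSnapshots refreshes the given groups concurrently, bounded to
// five in-flight requests by a semaphore channel; on a per-group error it
// falls back to that group's cached snapshots, so transient failures never
// drop data. The bounded-worker pattern in each goroutine:
//
//	select {
//	case sem <- struct{}{}: // acquire a slot
//	case <-ctx.Done(): // or bail out if the poll was cancelled
//		return
//	}
//	defer func() { <-sem }() // release the slot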
func (m *Monitor) fetchPBSBackupSnapshots(ctx context.Context, client *pbs.Client, instanceName string, requests []pbsBackupFetchRequest) []models.PBSBackup {
if len(requests) == 0 {
return nil
}
results := make(chan []models.PBSBackup, len(requests))
var wg sync.WaitGroup
sem := make(chan struct{}, 5)
for _, req := range requests {
req := req
wg.Add(1)
go func() {
defer wg.Done()
select {
case sem <- struct{}{}:
case <-ctx.Done():
return
}
defer func() { <-sem }()
log.Debug().
Str("instance", instanceName).
Str("datastore", req.datastore).
Str("namespace", req.namespace).
Str("type", req.group.BackupType).
Str("id", req.group.BackupID).
Msg("Refreshing PBS backup group")
snapshots, err := client.ListBackupSnapshots(ctx, req.datastore, req.namespace, req.group.BackupType, req.group.BackupID)
if err != nil {
log.Error().
Err(err).
Str("instance", instanceName).
Str("datastore", req.datastore).
Str("namespace", req.namespace).
Str("type", req.group.BackupType).
Str("id", req.group.BackupID).
Msg("Failed to list PBS backup snapshots")
if len(req.cached.snapshots) > 0 {
results <- req.cached.snapshots
}
return
}
results <- convertPBSSnapshots(instanceName, req.datastore, req.namespace, snapshots)
}()
}
go func() {
wg.Wait()
close(results)
}()
var combined []models.PBSBackup
for backups := range results {
if len(backups) == 0 {
continue
}
combined = append(combined, backups...)
}
return combined
}
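// convertPBSSnapshots maps raw PBS API snapshots to models.PBSBackup. Two PBS
// fields are polymorphic and handled with type switches below: "files"
// entries may be plain strings or objects with a "filename" key, and
// "verification" may be the bare state string ("ok") or an object carrying a
// "state" field.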
func convertPBSSnapshots(instanceName, datastore, namespace string, snapshots []pbs.BackupSnapshot) []models.PBSBackup {
backups := make([]models.PBSBackup, 0, len(snapshots))
for _, snapshot := range snapshots {
backupTime := time.Unix(snapshot.BackupTime, 0)
backupID := fmt.Sprintf("pbs-%s-%s-%s-%s-%s-%d",
instanceName, datastore, namespace,
snapshot.BackupType, snapshot.BackupID,
snapshot.BackupTime)
var fileNames []string
for _, file := range snapshot.Files {
switch f := file.(type) {
case string:
fileNames = append(fileNames, f)
case map[string]interface{}:
if filename, ok := f["filename"].(string); ok {
fileNames = append(fileNames, filename)
}
}
}
verified := false
if snapshot.Verification != nil {
switch v := snapshot.Verification.(type) {
case string:
verified = v == "ok"
case map[string]interface{}:
if state, ok := v["state"].(string); ok {
verified = state == "ok"
}
}
log.Debug().
Str("vmid", snapshot.BackupID).
Int64("time", snapshot.BackupTime).
Interface("verification", snapshot.Verification).
Bool("verified", verified).
Msg("PBS backup verification status")
}
backups = append(backups, models.PBSBackup{
ID: backupID,
Instance: instanceName,
Datastore: datastore,
Namespace: namespace,
BackupType: snapshot.BackupType,
VMID: snapshot.BackupID,
BackupTime: backupTime,
Size: snapshot.Size,
Protected: snapshot.Protected,
Verified: verified,
VerificationRaw: snapshot.Verification,
Comment: snapshot.Comment,
Files: fileNames,
Owner: snapshot.Owner,
})
}
return backups
}
// pollBackupTasks polls backup tasks from a PVE instance
func (m *Monitor) pollBackupTasks(ctx context.Context, instanceName string, client PVEClientInterface) {
log.Debug().Str("instance", instanceName).Msg("polling backup tasks")
tasks, err := client.GetBackupTasks(ctx)
if err != nil {
monErr := errors.WrapAPIError("get_backup_tasks", instanceName, err, 0)
log.Error().Err(monErr).Str("instance", instanceName).Msg("failed to get backup tasks")
return
}
var backupTasks []models.BackupTask
for _, task := range tasks {
// task.ID holds the UPID's worker-ID field (full UPID format:
// "UPID:node:pid:pstart:starttime:type:id:user@realm:"); for vzdump
// tasks this is the guest VMID, so non-numeric IDs leave vmid at 0.
vmid := 0
if task.ID != "" {
if vmidInt, err := strconv.Atoi(task.ID); err == nil {
vmid = vmidInt
}
}
taskID := fmt.Sprintf("%s-%s", instanceName, task.UPID)
backupTask := models.BackupTask{
ID: taskID,
Node: task.Node,
Instance: instanceName,
Type: task.Type,
VMID: vmid,
Status: task.Status,
StartTime: time.Unix(task.StartTime, 0),
}
if task.EndTime > 0 {
backupTask.EndTime = time.Unix(task.EndTime, 0)
}
backupTasks = append(backupTasks, backupTask)
}
// Update state with new backup tasks for this instance
m.state.UpdateBackupTasksForInstance(instanceName, backupTasks)
// Best-effort ingestion into recovery store (for rollups / unified backups UX).
guestInfo := buildProxmoxGuestInfoIndex(m.GetUnifiedReadStateOrSnapshot())
m.ingestRecoveryPointsAsync(proxmoxrecoverymapper.FromPVEBackupTasks(backupTasks, guestInfo))
}
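// pollPVEBackupsAsync gates backup polling on per-instance configuration, the
// global toggle, and the configured interval, then runs the backup-task,
// storage-backup, and guest-snapshot polls in a background goroutine so the
// real-time stats loop is never blocked. The last-poll timestamp is written
// before the goroutine starts to prevent concurrent runs, and refreshed again
// on completion for accurate interval scheduling.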
func (m *Monitor) pollPVEBackupsAsync(
ctx context.Context,
instanceName string,
instanceCfg *config.PVEInstance,
client PVEClientInterface,
nodes []proxmox.Node,
nodeEffectiveStatus map[string]string,
) error {
// Poll backups if enabled - respect configured interval or cycle gating
if !instanceCfg.MonitorBackups {
return nil
}
if !m.config.EnableBackupPolling {
log.Debug().
Str("instance", instanceName).
Msg("Skipping backup polling - globally disabled")
return nil
}
now := time.Now()
m.mu.RLock()
lastPoll := m.lastPVEBackupPoll[instanceName]
m.mu.RUnlock()
shouldPoll, reason, newLast := m.shouldRunBackupPoll(lastPoll, now)
if !shouldPoll {
if reason != "" {
log.Debug().
Str("instance", instanceName).
Str("reason", reason).
Msg("Skipping PVE backup polling this cycle")
}
return nil
}
select {
case <-ctx.Done():
return ctx.Err()
default:
// Set initial timestamp before starting goroutine (prevents concurrent starts)
m.mu.Lock()
m.lastPVEBackupPoll[instanceName] = newLast
m.mu.Unlock()
// Run backup polling in a separate goroutine to avoid blocking real-time stats
go func(startTime time.Time, inst string, pveClient PVEClientInterface) {
defer recoverFromPanic(fmt.Sprintf("pollPVEBackups-%s", inst))
timeout := m.calculateBackupOperationTimeout(inst)
log.Info().
Str("instance", inst).
Dur("timeout", timeout).
Msg("Starting background backup/snapshot polling")
// The per-cycle ctx is canceled as soon as the main polling loop finishes,
// so derive the backup poll context from the long-lived runtime context instead.
parentCtx := m.runtimeCtx
if parentCtx == nil {
parentCtx = context.Background()
}
backupCtx, cancel := context.WithTimeout(parentCtx, timeout)
defer cancel()
// Poll backup tasks
m.pollBackupTasks(backupCtx, inst, pveClient)
// Poll storage backups - pass nodes to avoid duplicate API calls
m.pollStorageBackupsWithNodes(backupCtx, inst, pveClient, nodes, nodeEffectiveStatus)
// Poll guest snapshots
m.pollGuestSnapshots(backupCtx, inst, pveClient)
duration := time.Since(startTime)
log.Info().
Str("instance", inst).
Dur("duration", duration).
Msg("Completed background backup/snapshot polling")
// Update timestamp after completion for accurate interval scheduling
m.mu.Lock()
m.lastPVEBackupPoll[inst] = time.Now()
m.mu.Unlock()
}(now, instanceName, client)
}
return nil
}
// checkMockAlerts checks alerts for mock data