package monitoring

import (
	"context"
	stderrors "errors"
	"fmt"
	"math"
	"os"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/config"
	"github.com/rcourtman/pulse-go-rewrite/internal/errors"
	"github.com/rcourtman/pulse-go-rewrite/internal/logging"
	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

func (m *Monitor) describeInstancesForScheduler() []InstanceDescriptor {
	total := len(m.pveClients) + len(m.pbsClients) + len(m.pmgClients)
	if total == 0 {
		return nil
	}

	descriptors := make([]InstanceDescriptor, 0, total)

	if len(m.pveClients) > 0 {
		names := make([]string, 0, len(m.pveClients))
		for name := range m.pveClients {
			names = append(names, name)
		}
		sort.Strings(names)
		for _, name := range names {
			desc := InstanceDescriptor{
				Name: name,
				Type: InstanceTypePVE,
			}
			if m.scheduler != nil {
				if last, ok := m.scheduler.LastScheduled(InstanceTypePVE, name); ok {
					desc.LastScheduled = last.NextRun
					desc.LastInterval = last.Interval
				}
			}
			if m.stalenessTracker != nil {
				if snap, ok := m.stalenessTracker.snapshot(InstanceTypePVE, name); ok {
					desc.LastSuccess = snap.LastSuccess
					desc.LastFailure = snap.LastError
					desc.Metadata = map[string]any{"changeHash": snap.ChangeHash}
				}
			}
			descriptors = append(descriptors, desc)
		}
	}

	if len(m.pbsClients) > 0 {
		names := make([]string, 0, len(m.pbsClients))
		for name := range m.pbsClients {
			names = append(names, name)
		}
		sort.Strings(names)
		for _, name := range names {
			desc := InstanceDescriptor{
				Name: name,
				Type: InstanceTypePBS,
			}
			if m.scheduler != nil {
				if last, ok := m.scheduler.LastScheduled(InstanceTypePBS, name); ok {
					desc.LastScheduled = last.NextRun
					desc.LastInterval = last.Interval
				}
			}
			if m.stalenessTracker != nil {
				if snap, ok := m.stalenessTracker.snapshot(InstanceTypePBS, name); ok {
					desc.LastSuccess = snap.LastSuccess
					desc.LastFailure = snap.LastError
					desc.Metadata = map[string]any{"changeHash": snap.ChangeHash}
				}
			}
			descriptors = append(descriptors, desc)
		}
	}

	if len(m.pmgClients) > 0 {
		names := make([]string, 0, len(m.pmgClients))
		for name := range m.pmgClients {
			names = append(names, name)
		}
		sort.Strings(names)
		for _, name := range names {
			desc := InstanceDescriptor{
				Name: name,
				Type: InstanceTypePMG,
			}
			if m.scheduler != nil {
				if last, ok := m.scheduler.LastScheduled(InstanceTypePMG, name); ok {
					desc.LastScheduled = last.NextRun
					desc.LastInterval = last.Interval
				}
			}
			if m.stalenessTracker != nil {
				if snap, ok := m.stalenessTracker.snapshot(InstanceTypePMG, name); ok {
					desc.LastSuccess = snap.LastSuccess
					desc.LastFailure = snap.LastError
					desc.Metadata = map[string]any{"changeHash": snap.ChangeHash}
				}
			}
			descriptors = append(descriptors, desc)
		}
	}

	return descriptors
}

func (m *Monitor) buildScheduledTasks(now time.Time) []ScheduledTask {
	descriptors := m.describeInstancesForScheduler()
	if len(descriptors) == 0 {
		return nil
	}

	queueDepth := 0
	if m.taskQueue != nil {
		queueDepth = m.taskQueue.Size()
	}

	if m.scheduler == nil {
		tasks := make([]ScheduledTask, 0, len(descriptors))
		for _, desc := range descriptors {
			interval := m.baseIntervalForInstanceType(desc.Type)
			if interval <= 0 {
				interval = DefaultSchedulerConfig().BaseInterval
			}
			tasks = append(tasks, ScheduledTask{
				InstanceName: desc.Name,
				InstanceType: desc.Type,
				NextRun:      now,
				Interval:     interval,
			})
		}
		return tasks
	}

	return m.scheduler.BuildPlan(now, descriptors, queueDepth)
}

// convertPoolInfoToModel converts Proxmox ZFS pool info to our model
func convertPoolInfoToModel(poolInfo *proxmox.ZFSPoolInfo) *models.ZFSPool {
	if poolInfo == nil {
		return nil
	}

	// Use the converter from the proxmox package
	proxmoxPool := poolInfo.ConvertToModelZFSPool()

	// Convert to our internal model
	modelPool := &models.ZFSPool{
		Name:           proxmoxPool.Name,
		State:          proxmoxPool.State,
		Status:         proxmoxPool.Status,
		Scan:           proxmoxPool.Scan,
		ReadErrors:     proxmoxPool.ReadErrors,
		WriteErrors:    proxmoxPool.WriteErrors,
		ChecksumErrors: proxmoxPool.ChecksumErrors,
		Devices:        make([]models.ZFSDevice, 0, len(proxmoxPool.Devices)),
	}

	// Convert devices
	for _, dev := range proxmoxPool.Devices {
		modelPool.Devices = append(modelPool.Devices, models.ZFSDevice{
			Name:           dev.Name,
			Type:           dev.Type,
			State:          dev.State,
			ReadErrors:     dev.ReadErrors,
			WriteErrors:    dev.WriteErrors,
			ChecksumErrors: dev.ChecksumErrors,
			Message:        dev.Message,
		})
	}

	return modelPool
}

func matchZFSPoolForStorage(storage models.Storage, zfsPoolMap map[string]*models.ZFSPool) *models.ZFSPool {
	if len(zfsPoolMap) == 0 {
		return nil
	}

	normalizedPools := make(map[string]*models.ZFSPool, len(zfsPoolMap))
	var solePool *models.ZFSPool
	for name, pool := range zfsPoolMap {
		normalizedName := strings.ToLower(strings.Trim(strings.TrimSpace(name), "/"))
		if normalizedName == "" || pool == nil {
			continue
		}
		normalizedPools[normalizedName] = pool
		solePool = pool
	}

	lookupCandidate := func(candidate string) *models.ZFSPool {
		normalized := strings.ToLower(strings.Trim(strings.TrimSpace(candidate), "/"))
		if normalized == "" {
			return nil
		}
		if pool, ok := normalizedPools[normalized]; ok {
			return pool
		}
		if idx := strings.Index(normalized, "/"); idx > 0 {
			if pool, ok := normalizedPools[normalized[:idx]]; ok {
				return pool
			}
		}
		return nil
	}

	candidates := []string{
		storage.Pool,
		storage.Name,
		storage.Path,
	}

	trimmedPath := strings.Trim(strings.TrimSpace(storage.Path), "/")
	if trimmedPath != "" {
		candidates = append(candidates, trimmedPath)
		if idx := strings.Index(trimmedPath, "/"); idx > 0 {
			candidates = append(candidates, trimmedPath[:idx])
		}
	}

	normalizedName := strings.TrimSpace(storage.Name)
	if strings.HasSuffix(strings.ToLower(normalizedName), "-zfs") {
		candidates = append(candidates, strings.TrimSuffix(normalizedName, "-zfs"))
		candidates = append(candidates, strings.TrimSuffix(normalizedName, "-ZFS"))
	}

	for _, candidate := range candidates {
		if pool := lookupCandidate(candidate); pool != nil {
			return pool
		}
	}

	if len(normalizedPools) == 1 {
		return solePool
	}

	return nil
}

type indexedLegacyVM struct {
	order int
	vm    proxmox.VM
}

type orderedNodeVMResult struct {
	order   int
	vm      models.VM
	alertVM models.VM
	snap    GuestMemorySnapshot
}

func rotateIndexedLegacyVMs(vms []indexedLegacyVM, offset int) []indexedLegacyVM {
	count := len(vms)
	if count <= 1 {
		return append([]indexedLegacyVM(nil), vms...)
	}

	offset %= count
	if offset < 0 {
		offset += count
	}
	if offset == 0 {
		return append([]indexedLegacyVM(nil), vms...)
	}

	rotated := make([]indexedLegacyVM, 0, count)
	rotated = append(rotated, vms[offset:]...)
	rotated = append(rotated, vms[:offset]...)
	return rotated
}

// pollVMsWithNodes polls VMs from all nodes in parallel using goroutines
// When the instance is part of a cluster, the cluster name is used for guest IDs to prevent duplicates
// when multiple cluster nodes are configured as separate PVE instances.
func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clusterName string, isCluster bool, client PVEClientInterface, nodes []proxmox.Node, nodeEffectiveStatus map[string]string) {
	startTime := time.Now()

	// Channel to collect VM results from each node
	type nodeResult struct {
		node     string
		vms      []models.VM
		alertVMs []models.VM
		snaps    []GuestMemorySnapshot
		err      error
	}

	resultChan := make(chan nodeResult, len(nodes))
	var wg sync.WaitGroup

	// Count online nodes for logging
	onlineNodes := 0
	for _, node := range nodes {
		if nodeEffectiveStatus[node.Node] == "online" {
			onlineNodes++
		}
	}

	log.Debug().
		Str("instance", instanceName).
		Int("totalNodes", len(nodes)).
		Int("onlineNodes", onlineNodes).
		Msg("Starting parallel VM polling")

	// Build a lookup map from VM guest ID → linked host agent.
	// When a Pulse agent runs inside a VM, it reads /proc/meminfo directly
	// and gets accurate MemAvailable (excluding page cache). We use this as
	// a memory fallback before the inflated status.Mem value. Refs: #1270
	prevState := m.GetState()
	prevInstanceVMs := filterVMsByInstance(prevState.VMs, instanceName)
	// Build a lookup for previous disk data so we can carry it forward when the
	// guest agent call fails (prevents disk usage flickering 57% → 0% → 57%).
	prevDiskByGuestID := make(map[string]models.Disk, len(prevInstanceVMs))
	prevVMByGuestID := make(map[string]models.VM, len(prevInstanceVMs))
	for _, pvm := range prevInstanceVMs {
		prevVMByGuestID[pvm.ID] = pvm
		if pvm.Disk.Usage > 0 {
			prevDiskByGuestID[pvm.ID] = pvm.Disk
		}
	}
	vmIDToHostAgent := buildLinkedVMHostAgentMap(prevState.Hosts)

	// Launch a goroutine for each online node
	for _, node := range nodes {
		// Skip offline nodes
		if nodeEffectiveStatus[node.Node] != "online" {
			log.Debug().
				Str("node", node.Node).
				Str("status", node.Status).
				Msg("Skipping offline node for VM polling")
			continue
		}

		wg.Add(1)
		go func(n proxmox.Node) {
			defer wg.Done()

			nodeStart := time.Now()

			// Fetch VMs for this node
			vms, err := client.GetVMs(ctx, n.Node)
			if err != nil {
				monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_vms", instanceName, err).WithNode(n.Node)
				log.Error().Err(monErr).Str("node", n.Node).Msg("Failed to get VMs; deferring node poll until next cycle")
				resultChan <- nodeResult{node: n.Node, err: err}
				return
			}

			indexedVMs := make([]indexedLegacyVM, 0, len(vms))
			for idx, vm := range vms {
				if vm.Template == 1 {
					continue
				}
				indexedVMs = append(indexedVMs, indexedLegacyVM{
					order: idx,
					vm:    vm,
				})
			}
			scheduledVMs := rotateIndexedLegacyVMs(
				indexedVMs,
				m.nextGuestAgentPollOffset(instanceName+":"+n.Node, len(indexedVMs)),
			)
			pollVM := func(indexedVM indexedLegacyVM) orderedNodeVMResult {
				vm := indexedVM.vm

				// Parse tags
				var tags []string
				if vm.Tags != "" {
					tags = strings.Split(vm.Tags, ";")
				}

				// Generate canonical guest ID: instance:node:vmid
				guestID := makeGuestID(instanceName, n.Node, vm.VMID)

				guestRaw := VMMemoryRaw{
					ListingMem:    vm.Mem,
					ListingMaxMem: vm.MaxMem,
					Agent:         vm.Agent,
				}
				memorySource := "listing-mem"

				// Initialize metrics from VM listing (may be 0 for disk I/O)
				diskReadBytes := int64(vm.DiskRead)
				diskWriteBytes := int64(vm.DiskWrite)
				networkInBytes := int64(vm.NetIn)
				networkOutBytes := int64(vm.NetOut)

				// Get memory info for running VMs (and agent status for disk)
				memUsed := vm.Mem
				memTotal := vm.MaxMem
				var vmStatus *proxmox.VMStatus
				memAvailable := uint64(0)
				memRawFree := uint64(0) // Truly free memory (MemFree), for cache segment calculation
				memInfoTotalMinusUsed := uint64(0)
				rrdUsed := uint64(0)
				var ipAddresses []string
				var networkInterfaces []models.GuestNetworkInterface
				var osName, osVersion, guestAgentVersion string
				var prevVM *models.VM
				if prev, ok := prevVMByGuestID[guestID]; ok {
					prevVM = &prev
				}
				guestAgentAvailable := false

				if vm.Status == "running" {
					// Try to get detailed VM status (but don't wait too long)
					statusCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
					if status, err := client.GetVMStatus(statusCtx, n.Node, vm.VMID); err == nil {
						vmStatus = status
						guestRaw.StatusMaxMem = status.MaxMem
						guestRaw.StatusMem = status.Mem
						guestRaw.StatusFreeMem = status.FreeMem
						guestRaw.Balloon = status.Balloon
						guestRaw.BalloonMin = status.BalloonMin
						guestRaw.Agent = status.Agent.Value
						if status.MemInfo != nil {
							guestRaw.MemInfoUsed = status.MemInfo.Used
							guestRaw.MemInfoFree = status.MemInfo.Free
							guestRaw.MemInfoTotal = status.MemInfo.Total
							guestRaw.MemInfoAvailable = status.MemInfo.Available
							guestRaw.MemInfoBuffers = status.MemInfo.Buffers
							guestRaw.MemInfoCached = status.MemInfo.Cached
							guestRaw.MemInfoShared = status.MemInfo.Shared

							if status.MemInfo.Free > 0 {
								memRawFree = status.MemInfo.Free
							}

							selection := selectVMAvailableFromMemInfo(status.MemInfo)
							memInfoTotalMinusUsed = selection.TotalMinusUsed
							guestRaw.MemInfoTotalMinusUsed = memInfoTotalMinusUsed
							if selection.Available > 0 {
								memAvailable = selection.Available
								memorySource = selection.Source
							}
						}
						// Note: do NOT fall back to vmStatus.FreeMem for memRawFree.
						// FreeMem is relative to the balloon allocation (guest-visible total),
						// while memFree is derived from MaxMem. Mixing reference frames would
						// inflate the cache segment by the balloon gap. Only MemInfo.Free is
						// safe because it shares the same reference frame as MemInfo.Available.
						// Note: We intentionally do NOT override memTotal with balloon.
						// The balloon value is tracked separately in memory.balloon for
						// visualization purposes. Using balloon as total causes user
						// confusion (showing 1GB/1GB at 100% when VM is configured for 4GB)
						// and makes the frontend's balloon marker logic ineffective.
						// Refs: #1070
						// Use actual disk I/O values from detailed status
						diskReadBytes = int64(vmStatus.DiskRead)
						diskWriteBytes = int64(vmStatus.DiskWrite)
						networkInBytes = int64(vmStatus.NetIn)
						networkOutBytes = int64(vmStatus.NetOut)
					}
					cancel()
				}
				guestAgentAvailable = vm.Status == "running" && shouldQueryGuestAgent(vmStatus, prevVM, time.Now())

				// Preferred fallback: read /proc/meminfo directly via the QEMU guest
				// agent's file-read endpoint. This gives real-time MemAvailable which
				// correctly excludes reclaimable buff/cache. Results are cached (60s
				// positive, 5min negative) so this is cheap after the first call.
				// Refs: #1270
				if guestAgentAvailable && memAvailable == 0 {
					m.runGuestAgentVMWork(ctx, instanceName, n.Node, vm.Name, vm.VMID, func(agentCtx context.Context) {
						if agentAvail, agentErr := m.getVMAgentMemAvailable(agentCtx, client, instanceName, n.Node, vm.VMID); agentErr == nil && agentAvail > 0 {
							memAvailable = agentAvail
							memorySource = "guest-agent-meminfo"
							guestRaw.MemInfoAvailable = memAvailable
							log.Debug().
								Str("vm", vm.Name).
								Str("node", n.Node).
								Int("vmid", vm.VMID).
								Uint64("total", memTotal).
								Uint64("available", memAvailable).
								Msg("QEMU memory: using guest agent /proc/meminfo (excludes reclaimable cache)")
						}
					})
				}

				// Fallback: try Proxmox RRD memory when guest agent file-read
				// didn't work or isn't available.
				if vm.Status == "running" && memAvailable == 0 {
					if rrdEntry, rrdErr := m.getVMRRDMetrics(ctx, client, instanceName, n.Node, vm.VMID); rrdErr == nil {
						if rrdEntry.total > 0 {
							memTotal = rrdEntry.total
						}
						if rrdEntry.available > 0 {
							memAvailable = rrdEntry.available
							memorySource = "rrd-memavailable"
							guestRaw.RRDAvailable = rrdEntry.available
							guestRaw.MemInfoAvailable = rrdEntry.available
							log.Debug().
								Str("vm", vm.Name).
								Str("node", n.Node).
								Int("vmid", vm.VMID).
								Uint64("total", memTotal).
								Uint64("available", memAvailable).
								Msg("QEMU memory: using RRD memavailable fallback (excludes reclaimable cache)")
						} else if rrdEntry.used > 0 {
							rrdUsed = rrdEntry.used
							memorySource = "rrd-memused"
							guestRaw.RRDUsed = rrdEntry.used
							log.Debug().
								Str("vm", vm.Name).
								Str("node", n.Node).
								Int("vmid", vm.VMID).
								Uint64("total", memTotal).
								Uint64("used", rrdUsed).
								Msg("QEMU memory: using RRD memused fallback")
						}
					}
				}

				if vm.Status == "running" && memAvailable == 0 {
					if agentHost, ok := vmIDToHostAgent[guestID]; ok && agentHost.Memory.Total > 0 {
						agentAvailable := agentHost.Memory.Total - agentHost.Memory.Used
						if agentAvailable > 0 {
							memAvailable = uint64(agentAvailable)
							memorySource = "host-agent"
							guestRaw.HostAgentTotal = uint64(agentHost.Memory.Total)
							guestRaw.HostAgentUsed = uint64(agentHost.Memory.Used)
							log.Debug().
								Str("vm", vm.Name).
								Str("node", n.Node).
								Int("vmid", vm.VMID).
								Uint64("total", memTotal).
								Uint64("available", memAvailable).
								Int64("agentTotal", agentHost.Memory.Total).
								Int64("agentUsed", agentHost.Memory.Used).
								Msg("QEMU memory: using linked Pulse host agent memory (excludes page cache)")
						}
					}
				}

				if vm.Status == "running" && memAvailable == 0 && memInfoTotalMinusUsed > 0 {
					memAvailable = memInfoTotalMinusUsed
					memorySource = "meminfo-total-minus-used"
				}

				switch {
				case vm.Status != "running":
					memUsed = 0
					memorySource = "powered-off"
				case memAvailable > 0:
					if memAvailable > memTotal {
						memAvailable = memTotal
					}
					memUsed = memTotal - memAvailable
				case rrdUsed > 0:
					memUsed = rrdUsed
					memorySource = "rrd-memused"
				case vmStatus != nil:
					if selection := selectVMLowTrustUsedMemory(memTotal, vmStatus); selection.Source != "" {
						memUsed = selection.Used
						memorySource = selection.Source
					} else {
						memorySource = "status-unavailable"
					}
				case vmStatus == nil:
					memorySource = "listing-mem"
				}
				if memUsed > memTotal {
					memUsed = memTotal
				}

				// Calculate disk usage - start with allocated disk size
				// NOTE: The Proxmox cluster/resources API always returns 0 for VM disk usage
				// We must query the guest agent to get actual disk usage
				diskUsed := vm.Disk
				diskTotal := vm.MaxDisk
				diskFree := diskTotal - diskUsed
				diskUsage := safePercentage(float64(diskUsed), float64(diskTotal))
				diskFromAgent := false
				diskStatusReason := ""
				var individualDisks []models.Disk

				// For stopped VMs, we can't get guest agent data
				if vm.Status != "running" {
					// Show allocated disk size for stopped VMs
					if diskTotal > 0 {
						diskUsage = -1 // Indicates "allocated size only"
						diskStatusReason = "vm-stopped"
					}
				}

				// For running VMs, ALWAYS try to get filesystem info from guest agent
				// The cluster/resources endpoint always returns 0 for disk usage
				if vm.Status == "running" && guestAgentAvailable && diskTotal > 0 {
					// Log the initial state
					agentValue := 0
					if vmStatus != nil {
						agentValue = vmStatus.Agent.Value
					}
					if logging.IsLevelEnabled(zerolog.DebugLevel) {
						log.Debug().
							Str("instance", instanceName).
							Str("vm", vm.Name).
							Int("vmid", vm.VMID).
							Int("agent", agentValue).
							Uint64("diskUsed", diskUsed).
							Uint64("diskTotal", diskTotal).
							Msg("VM has 0 disk usage, checking guest agent")
					}

					// Check if agent is enabled
					if vmStatus != nil && vmStatus.Agent.Value == 0 {
						diskStatusReason = "agent-disabled"
						if logging.IsLevelEnabled(zerolog.DebugLevel) {
							log.Debug().
								Str("instance", instanceName).
								Str("vm", vm.Name).
								Msg("Guest agent disabled in VM config")
						}
					} else {
						m.runGuestAgentVMWork(ctx, instanceName, n.Node, vm.Name, vm.VMID, func(agentCtx context.Context) {
							if logging.IsLevelEnabled(zerolog.DebugLevel) {
								log.Debug().
									Str("instance", instanceName).
									Str("vm", vm.Name).
									Int("vmid", vm.VMID).
									Msg("Guest agent enabled, fetching filesystem info")
							}

							// Filesystem info with configurable timeout and retry (refs #592)
							fsInfoRaw, err := m.retryGuestAgentCall(agentCtx, m.guestAgentFSInfoTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
								return client.GetVMFSInfo(ctx, n.Node, vm.VMID)
							})
							var fsInfo []proxmox.VMFileSystem
							if err == nil {
								if fs, ok := fsInfoRaw.([]proxmox.VMFileSystem); ok {
									fsInfo = fs
								}
							}
							if err != nil {
								// Handle errors
								errStr := err.Error()
								errStrLower := strings.ToLower(errStr)
								log.Warn().
									Str("instance", instanceName).
									Str("vm", vm.Name).
									Int("vmid", vm.VMID).
									Str("error", errStr).
									Msg("Failed to get VM filesystem info from guest agent")

								// Classify the error type for better user messaging
								// Order matters: check most specific patterns first
								if strings.Contains(errStr, "QEMU guest agent is not running") {
									diskStatusReason = "agent-not-running"
									log.Info().
										Str("instance", instanceName).
										Str("vm", vm.Name).
										Int("vmid", vm.VMID).
										Msg("Guest agent enabled in VM config but not running inside guest OS. Install and start qemu-guest-agent in the VM")
								} else if strings.Contains(errStr, "timeout") || strings.Contains(errStr, "deadline exceeded") {
									diskStatusReason = "agent-timeout"
								} else if strings.Contains(errStr, "500") && (strings.Contains(errStr, "not running") || strings.Contains(errStr, "not available")) {
									// Proxmox API error 500 with "not running"/"not available" indicates guest agent issue, not permissions
									// This commonly happens when guest agent is not installed or not running
									diskStatusReason = "agent-not-running"
									log.Info().
										Str("instance", instanceName).
										Str("vm", vm.Name).
										Int("vmid", vm.VMID).
										Msg("Guest agent communication failed (API error 500). Install and start qemu-guest-agent in the VM")
								} else if (strings.Contains(errStr, "403") || strings.Contains(errStr, "401")) &&
									(strings.Contains(errStrLower, "permission") || strings.Contains(errStrLower, "forbidden") || strings.Contains(errStrLower, "not allowed")) {
									// Only treat as permission-denied if we get explicit auth/permission error codes (401/403)
									// This distinguishes actual permission issues from guest agent unavailability
									diskStatusReason = "permission-denied"
									log.Warn().
										Str("instance", instanceName).
										Str("vm", vm.Name).
										Int("vmid", vm.VMID).
										Msg("Permission denied accessing guest agent. Verify Pulse user has VM.Monitor (PVE 8) or VM.GuestAgent.Audit+VM.GuestAgent.FileRead (PVE 9) permissions")
								} else if strings.Contains(errStr, "500") {
									// Generic 500 error without clear indicators - likely agent unavailable
									// Refs #596: Proxmox returns 500 errors when guest agent isn't installed/running
									diskStatusReason = "agent-not-running"
									log.Info().
										Str("instance", instanceName).
										Str("vm", vm.Name).
										Int("vmid", vm.VMID).
										Msg("Failed to communicate with guest agent (API error 500). This usually means qemu-guest-agent is not installed or not running in the VM")
								} else {
									diskStatusReason = "agent-error"
								}
							} else if len(fsInfo) == 0 {
								diskStatusReason = "no-filesystems"
								log.Warn().
									Str("instance", instanceName).
									Str("vm", vm.Name).
									Int("vmid", vm.VMID).
									Msg("Guest agent returned empty filesystem list")
							} else {
								log.Info().
									Str("instance", instanceName).
									Str("vm", vm.Name).
									Int("vmid", vm.VMID).
									Int("filesystems", len(fsInfo)).
									Msg("Got filesystem info from guest agent")
								// Aggregate disk usage from all filesystems
								// Fix for #425: Track seen devices to avoid counting duplicates
								var totalBytes, usedBytes uint64
								seenDevices := make(map[string]bool)

								for _, fs := range fsInfo {
									// Log each filesystem for debugging
									log.Debug().
										Str("vm", vm.Name).
										Str("mountpoint", fs.Mountpoint).
										Str("type", fs.Type).
										Str("disk", fs.Disk).
										Uint64("total", fs.TotalBytes).
										Uint64("used", fs.UsedBytes).
										Msg("Processing filesystem from guest agent")

									// Skip special filesystems and Windows System Reserved.
									// Treat normalized drive roots like "C:" the same as "C:\\".
									isWindowsDrive := isWindowsGuestFilesystemMountpoint(fs.Mountpoint)

									if !isWindowsDrive {
										if reason, skip := readOnlyFilesystemReason(fs.Type, fs.TotalBytes, fs.UsedBytes); skip {
											log.Debug().
												Str("vm", vm.Name).
												Str("mountpoint", fs.Mountpoint).
												Str("type", fs.Type).
												Str("skipReason", reason).
												Uint64("total", fs.TotalBytes).
												Uint64("used", fs.UsedBytes).
												Msg("Skipping read-only filesystem from guest agent")
											continue
										}

										if fs.Type == "tmpfs" || fs.Type == "devtmpfs" ||
											strings.HasPrefix(fs.Mountpoint, "/dev") ||
											strings.HasPrefix(fs.Mountpoint, "/proc") ||
											strings.HasPrefix(fs.Mountpoint, "/sys") ||
											strings.HasPrefix(fs.Mountpoint, "/run") ||
											fs.Mountpoint == "/boot/efi" ||
											fs.Mountpoint == "System Reserved" ||
											strings.Contains(fs.Mountpoint, "System Reserved") ||
											strings.HasPrefix(fs.Mountpoint, "/snap") { // Skip snap mounts
											log.Debug().
												Str("vm", vm.Name).
												Str("mountpoint", fs.Mountpoint).
												Str("type", fs.Type).
												Msg("Skipping special filesystem")
											continue
										}
									}

									// Skip if we've already seen this device (duplicate mount point)
									if fs.Disk != "" && seenDevices[fs.Disk] {
										log.Debug().
											Str("vm", vm.Name).
											Str("mountpoint", fs.Mountpoint).
											Str("disk", fs.Disk).
											Msg("Skipping duplicate mount of same device")
										continue
									}

									// Only count real filesystems with valid data
									if fs.TotalBytes > 0 {
										// Mark this device as seen
										if fs.Disk != "" {
											seenDevices[fs.Disk] = true
										}

										totalBytes += fs.TotalBytes
										usedBytes += fs.UsedBytes
										individualDisks = append(individualDisks, models.Disk{
											Total:      int64(fs.TotalBytes),
											Used:       int64(fs.UsedBytes),
											Free:       int64(fs.TotalBytes - fs.UsedBytes),
											Usage:      safePercentage(float64(fs.UsedBytes), float64(fs.TotalBytes)),
											Mountpoint: fs.Mountpoint,
											Type:       fs.Type,
											Device:     fs.Disk,
										})
										log.Debug().
											Str("vm", vm.Name).
											Str("mountpoint", fs.Mountpoint).
											Str("disk", fs.Disk).
											Uint64("added_total", fs.TotalBytes).
											Uint64("added_used", fs.UsedBytes).
											Msg("Adding filesystem to total")
									} else {
										log.Debug().
											Str("vm", vm.Name).
											Str("mountpoint", fs.Mountpoint).
											Msg("Skipping filesystem with 0 total bytes")
									}
								}

								// If we got valid data from guest agent, use it
								if totalBytes > 0 {
									diskTotal = totalBytes
									diskUsed = usedBytes
									diskFree = totalBytes - usedBytes
									diskUsage = safePercentage(float64(usedBytes), float64(totalBytes))
									diskFromAgent = true
									diskStatusReason = "" // Clear reason on success

									log.Info().
										Str("instance", instanceName).
										Str("vm", vm.Name).
										Int("vmid", vm.VMID).
										Uint64("totalBytes", totalBytes).
										Uint64("usedBytes", usedBytes).
										Float64("usage", diskUsage).
										Msg("✓ Successfully retrieved disk usage from guest agent")
								} else {
									// Only special filesystems found - show allocated disk size instead
									diskStatusReason = "special-filesystems-only"
									if diskTotal > 0 {
										diskUsage = -1 // Show as allocated size
									}
									log.Info().
										Str("instance", instanceName).
										Str("vm", vm.Name).
										Int("filesystems_found", len(fsInfo)).
										Msg("Guest agent provided filesystem info but no usable filesystems found (all were special mounts)")
								}
							}
						})
					}
				} else if vm.Status == "running" && diskTotal > 0 {
					// Running VM but no vmStatus - show allocated disk
					diskUsage = -1
					if guestAgentAvailable {
						diskStatusReason = "agent-unavailable"
					} else {
						diskStatusReason = "no-status"
					}
				}

				if vm.Status == "running" {
					m.runGuestAgentVMWork(ctx, instanceName, n.Node, vm.Name, vm.VMID, func(agentCtx context.Context) {
						guestIPs, guestIfaces, guestOSName, guestOSVersion, agentVersion := m.fetchGuestAgentMetadata(agentCtx, client, instanceName, n.Node, vm.Name, vm.VMID, vmStatus, guestAgentAvailable)
						if len(guestIPs) > 0 {
							ipAddresses = guestIPs
						}
						if len(guestIfaces) > 0 {
							networkInterfaces = guestIfaces
						}
						if guestOSName != "" {
							osName = guestOSName
						}
						if guestOSVersion != "" {
							osVersion = guestOSVersion
						}
						if agentVersion != "" {
							guestAgentVersion = agentVersion
						}
					})
				}

				if vm.Status == "running" && !diskFromAgent {
					if hostDisk, hostDisks, ok := resolveGuestDiskFromLinkedHostAgent(guestID, vmIDToHostAgent); ok && hostDisk.Total > 0 {
						diskTotal = uint64(hostDisk.Total)
						diskUsed = uint64(hostDisk.Used)
						diskFree = uint64(hostDisk.Free)
						diskUsage = hostDisk.Usage
						individualDisks = hostDisks
						diskFromAgent = true
						diskStatusReason = ""
						log.Debug().
							Str("vm", vm.Name).
							Str("node", n.Node).
							Int("vmid", vm.VMID).
							Float64("usage", hostDisk.Usage).
							Msg("QEMU disk: using linked Pulse host agent disk summary")
					}
				}

				// Carry forward previous disk data when the guest agent failed this cycle.
				// Proxmox cluster/resources always returns 0 for disk usage, so without
				// the guest agent we'd show 0% or "allocated only", causing chart spikes
				// when the agent intermittently times out. Refs: #1319
				// Only carry forward for transient failures — not when the agent is
				// permanently disabled or absent, as those won't self-resolve.
				if vm.Status == "running" && !diskFromAgent && shouldCarryForwardQEMUDisk(diskStatusReason) {
					if prev, ok := prevDiskByGuestID[guestID]; ok && prev.Usage > 0 && prev.Total > 0 && prev.Used >= 0 && prev.Used <= prev.Total {
						diskTotal = uint64(prev.Total)
						diskUsed = uint64(prev.Used)
						diskFree = diskTotal - diskUsed
						diskUsage = prev.Usage
						if prevVM, ok := prevVMByGuestID[guestID]; ok {
							individualDisks = cloneGuestDisks(prevVM.Disks)
						}
						if logging.IsLevelEnabled(zerolog.DebugLevel) {
							log.Debug().
								Str("instance", instanceName).
								Str("vm", vm.Name).
								Int("vmid", vm.VMID).
								Str("reason", diskStatusReason).
								Float64("prevUsage", prev.Usage).
								Msg("Guest agent disk query failed; carrying forward previous disk data")
						}
						diskStatusReason = "prev-" + diskStatusReason
					}
				}

				// Calculate I/O rates after we have the actual values
				sampleTime := time.Now()
				currentMetrics := IOMetrics{
					DiskRead:   diskReadBytes,
					DiskWrite:  diskWriteBytes,
					NetworkIn:  networkInBytes,
					NetworkOut: networkOutBytes,
					Timestamp:  sampleTime,
				}
				diskReadRate, diskWriteRate, netInRate, netOutRate := m.rateTracker.CalculateRates(guestID, currentMetrics)

				// Debug log disk I/O rates
				if diskReadRate > 0 || diskWriteRate > 0 {
					log.Debug().
						Str("vm", vm.Name).
						Int("vmid", vm.VMID).
						Float64("diskReadRate", diskReadRate).
						Float64("diskWriteRate", diskWriteRate).
						Int64("diskReadBytes", diskReadBytes).
						Int64("diskWriteBytes", diskWriteBytes).
						Msg("VM disk I/O rates calculated")
				}

				// Set CPU to 0 for non-running VMs
				cpuUsage := safeFloat(vm.CPU)
				if vm.Status != "running" {
					cpuUsage = 0
				}

				memTotalBytes := clampToInt64(memTotal)
				memUsedBytes := clampToInt64(memUsed)
				if memTotalBytes > 0 && memUsedBytes > memTotalBytes {
					memUsedBytes = memTotalBytes
				}
				memFreeBytes := memTotalBytes - memUsedBytes
				if memFreeBytes < 0 {
					memFreeBytes = 0
				}
				memory := models.Memory{
					Total: memTotalBytes,
					Used:  memUsedBytes,
					Free:  memFreeBytes,
					Usage: safePercentage(float64(memUsed), float64(memTotal)),
				}
				// Derive reclaimable cache: the difference between "available" memory
				// (what the OS can reclaim) and "truly free" memory (unused pages).
				if memRawFree > 0 && memFreeBytes > clampToInt64(memRawFree) {
					memory.Cache = memFreeBytes - clampToInt64(memRawFree)
					memory.Free = clampToInt64(memRawFree)
				}
				if guestRaw.Balloon > 0 {
					memory.Balloon = clampToInt64(guestRaw.Balloon)
				}

				// Create VM model
				modelVM := models.VM{
					ID:           guestID,
					VMID:         vm.VMID,
					Name:         vm.Name,
					Node:         n.Node,
					Instance:     instanceName,
					Status:       vm.Status,
					Type:         "qemu",
					CPU:          cpuUsage,
					CPUs:         vm.CPUs,
					Memory:       memory,
					MemorySource: memorySource,
					Disk: models.Disk{
						Total: int64(diskTotal),
						Used:  int64(diskUsed),
						Free:  int64(diskFree),
						Usage: diskUsage,
					},
					Disks:             individualDisks,
					DiskStatusReason:  diskStatusReason,
					NetworkIn:         max(0, int64(netInRate)),
					NetworkOut:        max(0, int64(netOutRate)),
					DiskRead:          max(0, int64(diskReadRate)),
					DiskWrite:         max(0, int64(diskWriteRate)),
					Uptime:            int64(vm.Uptime),
					Template:          vm.Template == 1,
					LastSeen:          sampleTime,
					Tags:              tags,
					IPAddresses:       ipAddresses,
					OSName:            osName,
					OSVersion:         osVersion,
					AgentVersion:      guestAgentVersion,
					NetworkInterfaces: networkInterfaces,
				}

				// Zero out metrics for non-running VMs
				if vm.Status != "running" {
					modelVM.CPU = 0
					modelVM.Memory.Usage = 0
					modelVM.Disk.Usage = 0
					modelVM.NetworkIn = 0
					modelVM.NetworkOut = 0
					modelVM.DiskRead = 0
					modelVM.DiskWrite = 0
				}

				// Trigger guest metadata migration if old format exists
				if m.guestMetadataStore != nil {
					m.guestMetadataStore.GetWithLegacyMigration(guestID, instanceName, n.Node, vm.VMID)
				}

				return orderedNodeVMResult{
					order:   indexedVM.order,
					vm:      modelVM,
					alertVM: modelVM,
					snap: GuestMemorySnapshot{
						Name:         vm.Name,
						Status:       vm.Status,
						RetrievedAt:  sampleTime,
						MemorySource: memorySource,
						Memory:       modelVM.Memory,
						Raw:          guestRaw,
					},
				}
			}

			// Process each VM while rotating guest-agent priority across polls and
			// allowing per-VM work to run in parallel. Without this, the legacy
			// per-node path serializes every guest-agent timeout behind the first
			// affected VM, which is exactly the "higher VMIDs starve first" pattern
			// reported in clustered environments.
			nodeResults := make([]orderedNodeVMResult, 0, len(scheduledVMs))
			if len(scheduledVMs) > 0 {
				resultCh := make(chan orderedNodeVMResult, len(scheduledVMs))
				jobCh := make(chan indexedLegacyVM, len(scheduledVMs))
				var vmWG sync.WaitGroup

				workerCount := m.efficientQEMUWorkerCount(len(scheduledVMs))
				for i := 0; i < workerCount; i++ {
					vmWG.Add(1)
					go func() {
						defer vmWG.Done()
						for entry := range jobCh {
							resultCh <- pollVM(entry)
						}
					}()
				}

				for _, entry := range scheduledVMs {
					jobCh <- entry
				}
				close(jobCh)

				go func() {
					vmWG.Wait()
					close(resultCh)
				}()

				for result := range resultCh {
					nodeResults = append(nodeResults, result)
				}
			}

			sort.Slice(nodeResults, func(i, j int) bool {
				return nodeResults[i].order < nodeResults[j].order
			})
			nodeVMs := make([]models.VM, 0, len(nodeResults))
			nodeAlertVMs := make([]models.VM, 0, len(nodeResults))
			nodeSnapshots := make([]GuestMemorySnapshot, 0, len(nodeResults))
			for _, result := range nodeResults {
				nodeVMs = append(nodeVMs, result.vm)
				nodeAlertVMs = append(nodeAlertVMs, result.alertVM)
				nodeSnapshots = append(nodeSnapshots, result.snap)
			}

			nodeDuration := time.Since(nodeStart)
			log.Debug().
				Str("node", n.Node).
				Int("vms", len(nodeVMs)).
				Dur("duration", nodeDuration).
				Msg("Node VM polling completed")

			resultChan <- nodeResult{node: n.Node, vms: nodeVMs, alertVMs: nodeAlertVMs, snaps: nodeSnapshots}
		}(node)
	}

	// Close channel when all goroutines complete
	go func() {
		wg.Wait()
		close(resultChan)
	}()

	// Collect results from all nodes
	var allVMs []models.VM
	var allAlertVMs []models.VM
	var allSnapshots []GuestMemorySnapshot
	successfulNodes := 0
	failedNodes := 0

	for result := range resultChan {
		if result.err != nil {
			failedNodes++
		} else {
			successfulNodes++
			allVMs = append(allVMs, result.vms...)
			allAlertVMs = append(allAlertVMs, result.alertVMs...)
			allSnapshots = append(allSnapshots, result.snaps...)
		}
	}

	// If we got ZERO VMs but had VMs before (likely cluster health issue),
	// preserve previous VMs instead of clearing them
	if len(allVMs) == 0 && len(nodes) > 0 {
		for _, vm := range prevInstanceVMs {
			allVMs = append(allVMs, vm)
		}
		prevVMCount := len(prevInstanceVMs)
		if prevVMCount > 0 {
			log.Warn().
				Str("instance", instanceName).
				Int("prevVMs", prevVMCount).
				Int("successfulNodes", successfulNodes).
				Int("totalNodes", len(nodes)).
				Msg("Traditional polling returned zero VMs but had VMs before - preserving previous VMs")
		}
	}

	stabilizeSuspiciousRepeatedVMMemory(allVMs, allAlertVMs, allSnapshots, prevInstanceVMs, time.Now())
	m.logSuspiciousRepeatedVMMemoryUsage(instanceName, allVMs, prevInstanceVMs)

	// Update state with all VMs
	m.state.UpdateVMsForInstance(instanceName, allVMs)

	for i, vm := range allVMs {
		if m.guestMetadataStore != nil {
			m.guestMetadataStore.GetWithLegacyMigration(vm.ID, instanceName, vm.Node, vm.VMID)
		}
		if i < len(allSnapshots) {
			m.recordGuestSnapshot(instanceName, vm.Type, vm.Node, vm.VMID, allSnapshots[i])
		}
		if i < len(allAlertVMs) {
			m.alertManager.CheckGuest(allAlertVMs[i], instanceName)
		}
	}

	// Record guest metrics history for running VMs (enables sparkline/trends view)
	now := time.Now()
	for _, vm := range allVMs {
		if vm.Status == "running" {
			m.metricsHistory.AddGuestMetric(vm.ID, "cpu", vm.CPU*100, now)
			m.metricsHistory.AddGuestMetric(vm.ID, "memory", vm.Memory.Usage, now)
			if vm.Disk.Usage >= 0 {
				m.metricsHistory.AddGuestMetric(vm.ID, "disk", vm.Disk.Usage, now)
			}
			// Also write to persistent store
			if m.metricsStore != nil {
				m.metricsStore.Write("vm", vm.ID, "cpu", vm.CPU*100, now)
				m.metricsStore.Write("vm", vm.ID, "memory", vm.Memory.Usage, now)
				if vm.Disk.Usage >= 0 {
					m.metricsStore.Write("vm", vm.ID, "disk", vm.Disk.Usage, now)
				}
			}
		}
	}

	duration := time.Since(startTime)
	log.Debug().
		Str("instance", instanceName).
		Int("totalVMs", len(allVMs)).
		Int("successfulNodes", successfulNodes).
		Int("failedNodes", failedNodes).
		Dur("duration", duration).
		Msg("Parallel VM polling completed")
}

// pollContainersWithNodes polls containers from all nodes in parallel using goroutines
// When the instance is part of a cluster, the cluster name is used for guest IDs to prevent duplicates
// when multiple cluster nodes are configured as separate PVE instances.
func (m *Monitor) pollContainersWithNodes(ctx context.Context, instanceName string, clusterName string, isCluster bool, client PVEClientInterface, nodes []proxmox.Node, nodeEffectiveStatus map[string]string) {
	startTime := time.Now()

	// Channel to collect container results from each node
	type nodeResult struct {
		node       string
		containers []models.Container
		err        error
	}

	resultChan := make(chan nodeResult, len(nodes))
	var wg sync.WaitGroup

	// Count online nodes for logging
	onlineNodes := 0
	for _, node := range nodes {
		if nodeEffectiveStatus[node.Node] == "online" {
			onlineNodes++
		}
	}

	// Seed OCI classification from previous state so we never "downgrade" to LXC
	// if container config fetching intermittently fails (permissions or transient API errors).
	prevState := m.GetState()
	prevContainerIsOCI := make(map[int]bool)
	for _, ct := range prevState.Containers {
		if ct.Instance != instanceName {
			continue
		}
		if ct.VMID <= 0 {
			continue
		}
		if ct.Type == "oci" || ct.IsOCI {
			prevContainerIsOCI[ct.VMID] = true
		}
	}

	log.Debug().
		Str("instance", instanceName).
		Int("totalNodes", len(nodes)).
		Int("onlineNodes", onlineNodes).
		Msg("Starting parallel container polling")

	// Launch a goroutine for each online node
	for _, node := range nodes {
		// Skip offline nodes
		if nodeEffectiveStatus[node.Node] != "online" {
			log.Debug().
				Str("node", node.Node).
				Str("status", node.Status).
				Msg("Skipping offline node for container polling")
			continue
		}

		wg.Add(1)
		go func(n proxmox.Node) {
			defer wg.Done()

			nodeStart := time.Now()

			// Fetch containers for this node
			containers, err := client.GetContainers(ctx, n.Node)
			if err != nil {
				monErr := errors.NewMonitorError(errors.ErrorTypeAPI, "get_containers", instanceName, err).WithNode(n.Node)
				log.Error().Err(monErr).Str("node", n.Node).Msg("Failed to get containers")
				resultChan <- nodeResult{node: n.Node, err: err}
				return
			}

			vmIDs := make([]int, 0, len(containers))
			for _, ct := range containers {
				if ct.Template == 1 {
					continue
				}
				vmIDs = append(vmIDs, int(ct.VMID))
			}

			rootUsageOverrides := m.collectContainerRootUsage(ctx, client, n.Node, vmIDs)

			var nodeContainers []models.Container

			// Process each container
			for _, container := range containers {
				// Skip templates
				if container.Template == 1 {
					continue
				}

				// Parse tags
				var tags []string
				if container.Tags != "" {
					tags = strings.Split(container.Tags, ";")
				}

				// Generate canonical guest ID: instance:node:vmid
				guestID := makeGuestID(instanceName, n.Node, int(container.VMID))

				// Calculate I/O rates
				currentMetrics := IOMetrics{
					DiskRead:   int64(container.DiskRead),
					DiskWrite:  int64(container.DiskWrite),
					NetworkIn:  int64(container.NetIn),
					NetworkOut: int64(container.NetOut),
					Timestamp:  time.Now(),
				}
				diskReadRate, diskWriteRate, netInRate, netOutRate := m.rateTracker.CalculateRates(guestID, currentMetrics)

				// Set CPU to 0 for non-running containers
				cpuUsage := safeFloat(container.CPU)
				if container.Status != "running" {
					cpuUsage = 0
				}

				memTotalBytes := clampToInt64(container.MaxMem)
				memUsedBytes := clampToInt64(container.Mem)
				if memTotalBytes > 0 && memUsedBytes > memTotalBytes {
					memUsedBytes = memTotalBytes
				}
				memFreeBytes := memTotalBytes - memUsedBytes
				if memFreeBytes < 0 {
					memFreeBytes = 0
				}
				memUsagePercent := safePercentage(float64(memUsedBytes), float64(memTotalBytes))

				diskTotalBytes := clampToInt64(container.MaxDisk)
				diskUsedBytes := clampToInt64(container.Disk)
				if diskTotalBytes > 0 && diskUsedBytes > diskTotalBytes {
					diskUsedBytes = diskTotalBytes
				}
				diskFreeBytes := diskTotalBytes - diskUsedBytes
				if diskFreeBytes < 0 {
					diskFreeBytes = 0
				}
				diskUsagePercent := safePercentage(float64(diskUsedBytes), float64(diskTotalBytes))

				// Create container model
				modelContainer := models.Container{
					ID:       guestID,
					VMID:     int(container.VMID),
					Name:     container.Name,
					Node:     n.Node,
					Instance: instanceName,
					Status:   container.Status,
					Type:     "lxc",
					CPU:      cpuUsage,
					CPUs:     int(container.CPUs),
					Memory: models.Memory{
						Total: memTotalBytes,
						Used:  memUsedBytes,
						Free:  memFreeBytes,
						Usage: memUsagePercent,
					},
					Disk: models.Disk{
						Total: diskTotalBytes,
						Used:  diskUsedBytes,
						Free:  diskFreeBytes,
						Usage: diskUsagePercent,
					},
					NetworkIn:  max(0, int64(netInRate)),
					NetworkOut: max(0, int64(netOutRate)),
					DiskRead:   max(0, int64(diskReadRate)),
					DiskWrite:  max(0, int64(diskWriteRate)),
					Uptime:     int64(container.Uptime),
					Template:   container.Template == 1,
					LastSeen:   time.Now(),
					Tags:       tags,
				}

				if prevContainerIsOCI[modelContainer.VMID] {
					modelContainer.IsOCI = true
					modelContainer.Type = "oci"
				}

				if override, ok := rootUsageOverrides[int(container.VMID)]; ok {
					overrideUsed := clampToInt64(override.Used)
					overrideTotal := clampToInt64(override.Total)

					if overrideUsed > 0 && (modelContainer.Disk.Used == 0 || overrideUsed < modelContainer.Disk.Used) {
						modelContainer.Disk.Used = overrideUsed
					}

					if overrideTotal > 0 {
						modelContainer.Disk.Total = overrideTotal
					}

					if modelContainer.Disk.Total > 0 && modelContainer.Disk.Used > modelContainer.Disk.Total {
						modelContainer.Disk.Used = modelContainer.Disk.Total
					}

					modelContainer.Disk.Free = modelContainer.Disk.Total - modelContainer.Disk.Used
					if modelContainer.Disk.Free < 0 {
						modelContainer.Disk.Free = 0
					}

					modelContainer.Disk.Usage = safePercentage(float64(modelContainer.Disk.Used), float64(modelContainer.Disk.Total))
				}

				m.enrichContainerMetadata(ctx, client, instanceName, n.Node, &modelContainer)

				// Zero out metrics for non-running containers
				if container.Status != "running" {
					modelContainer.CPU = 0
					modelContainer.Memory.Usage = 0
					modelContainer.Disk.Usage = 0
					modelContainer.NetworkIn = 0
					modelContainer.NetworkOut = 0
					modelContainer.DiskRead = 0
					modelContainer.DiskWrite = 0
				}

				// Trigger guest metadata migration if old format exists
				if m.guestMetadataStore != nil {
					m.guestMetadataStore.GetWithLegacyMigration(guestID, instanceName, n.Node, int(container.VMID))
				}

				nodeContainers = append(nodeContainers, modelContainer)

				// Check alerts
				m.alertManager.CheckGuest(modelContainer, instanceName)
			}

			nodeDuration := time.Since(nodeStart)
			log.Debug().
				Str("node", n.Node).
				Int("containers", len(nodeContainers)).
				Dur("duration", nodeDuration).
				Msg("Node container polling completed")

			resultChan <- nodeResult{node: n.Node, containers: nodeContainers}
		}(node)
	}

	// Close channel when all goroutines complete
	go func() {
		wg.Wait()
		close(resultChan)
	}()

	// Collect results from all nodes
	var allContainers []models.Container
	successfulNodes := 0
	failedNodes := 0

	for result := range resultChan {
		if result.err != nil {
			failedNodes++
		} else {
			successfulNodes++
			allContainers = append(allContainers, result.containers...)
		}
	}

	// If we got ZERO containers but had containers before (likely cluster health issue),
	// preserve previous containers instead of clearing them
	if len(allContainers) == 0 && len(nodes) > 0 {
		prevState := m.GetState()
		prevContainerCount := 0
		for _, container := range prevState.Containers {
			if container.Instance == instanceName {
				allContainers = append(allContainers, container)
				prevContainerCount++
			}
		}
		if prevContainerCount > 0 {
			log.Warn().
				Str("instance", instanceName).
				Int("prevContainers", prevContainerCount).
				Int("successfulNodes", successfulNodes).
				Int("totalNodes", len(nodes)).
				Msg("Traditional polling returned zero containers but had containers before - preserving previous containers")
		}
	}

	// Check Docker presence for containers that need it (new, restarted, started)
	allContainers = m.CheckContainersForDocker(ctx, allContainers)

	// Update state with all containers
	m.state.UpdateContainersForInstance(instanceName, allContainers)

	// Record guest metrics history for running containers (enables sparkline/trends view)
	now := time.Now()
	for _, ct := range allContainers {
		if ct.Status == "running" {
			m.metricsHistory.AddGuestMetric(ct.ID, "cpu", ct.CPU*100, now)
			m.metricsHistory.AddGuestMetric(ct.ID, "memory", ct.Memory.Usage, now)
			if ct.Disk.Usage >= 0 {
				m.metricsHistory.AddGuestMetric(ct.ID, "disk", ct.Disk.Usage, now)
			}
			// Also write to persistent store
			if m.metricsStore != nil {
				m.metricsStore.Write("container", ct.ID, "cpu", ct.CPU*100, now)
				m.metricsStore.Write("container", ct.ID, "memory", ct.Memory.Usage, now)
				if ct.Disk.Usage >= 0 {
					m.metricsStore.Write("container", ct.ID, "disk", ct.Disk.Usage, now)
				}
			}
		}
	}

	duration := time.Since(startTime)
	log.Debug().
		Str("instance", instanceName).
		Int("totalContainers", len(allContainers)).
		Int("successfulNodes", successfulNodes).
		Int("failedNodes", failedNodes).
		Dur("duration", duration).
		Msg("Parallel container polling completed")
}

// pollStorageWithNodes polls storage from all nodes in parallel using goroutines
func (m *Monitor) pollStorageWithNodes(ctx context.Context, instanceName string, client PVEClientInterface, nodes []proxmox.Node) {
	startTime := time.Now()

	instanceCfg := m.getInstanceConfig(instanceName)

	// Determine the storage instance name - use cluster name for clustered setups
	// This must match what is set in each storage item's Instance field
	storageInstanceName := instanceName
	if instanceCfg != nil && instanceCfg.IsCluster && instanceCfg.ClusterName != "" {
		storageInstanceName = instanceCfg.ClusterName
	}

	// Get cluster storage configuration first (single call)
	clusterStorages, err := client.GetAllStorage(ctx)
	clusterStorageAvailable := err == nil
	if err != nil {
		// Provide detailed context about cluster health issues
		if strings.Contains(err.Error(), "no healthy nodes available") {
			log.Warn().
				Err(err).
				Str("instance", instanceName).
				Msg("Cluster health check shows no healthy endpoints - continuing with direct node storage polling. Check network connectivity and API accessibility from Pulse to each cluster node.")
		} else {
			log.Warn().
				Err(err).
				Str("instance", instanceName).
				Msg("Failed to get cluster storage config - will continue with node storage only")
		}
	}

	// Create a map for quick lookup of cluster storage config
	clusterStorageMap := make(map[string]proxmox.Storage)
	cephDetected := false
	if clusterStorageAvailable {
		for _, cs := range clusterStorages {
			clusterStorageMap[cs.Storage] = cs
			if !cephDetected && isCephStorageType(cs.Type) {
				cephDetected = true
			}
		}
	}

	// Channel to collect storage results from each node
	type nodeResult struct {
		node    string
		storage []models.Storage
		err     error
	}

	resultChan := make(chan nodeResult, len(nodes))
	var wg sync.WaitGroup

	// Count online nodes for logging
	onlineNodes := 0
	for _, node := range nodes {
		if node.Status == "online" {
			onlineNodes++
		}
	}

	log.Debug().
		Str("instance", instanceName).
		Int("totalNodes", len(nodes)).
		Int("onlineNodes", onlineNodes).
		Msg("Starting parallel storage polling")

	// Get existing storage from state to preserve data for offline nodes
	currentState := m.state.GetSnapshot()
	existingStorageMap := make(map[string]models.Storage)
	for _, storage := range currentState.Storage {
		if storage.Instance == instanceName {
			existingStorageMap[storage.ID] = storage
		}
	}

	// Track which nodes we successfully polled
	polledNodes := make(map[string]bool)

	// Launch a goroutine for each online node
	for _, node := range nodes {
		// Skip offline nodes but preserve their existing storage data
		if node.Status != "online" {
			log.Debug().
				Str("node", node.Node).
				Str("status", node.Status).
				Msg("Skipping offline node for storage polling - preserving existing data")
			continue
		}

		wg.Add(1)
		go func(n proxmox.Node) {
			defer wg.Done()

			nodeStart := time.Now()

			// Fetch storage for this node
			nodeStorage, err := client.GetStorage(ctx, n.Node)
			if err != nil {
				if shouldAttemptFallback(err) {
					if fallbackStorage, ferr := m.fetchNodeStorageFallback(ctx, instanceCfg, n.Node); ferr == nil {
						log.Warn().
							Str("instance", instanceName).
							Str("node", n.Node).
							Err(err).
							Msg("Primary storage query failed; using direct node fallback")
						nodeStorage = fallbackStorage
						err = nil
					} else {
						log.Warn().
							Str("instance", instanceName).
							Str("node", n.Node).
							Err(ferr).
							Msg("Storage fallback to direct node query failed")
					}
				}
			}
			if err != nil {
				// Handle timeout gracefully - unavailable storage (e.g., NFS mounts) can cause this
				errStr := err.Error()
				if strings.Contains(errStr, "timeout") || strings.Contains(errStr, "deadline exceeded") {
					log.Warn().
						Str("node", n.Node).
						Str("instance", instanceName).
						Msg("Storage query timed out - likely due to unavailable storage mounts. Preserving existing storage data for this node.")
					// Send an error result so the node is marked as failed and preservation logic works
					resultChan <- nodeResult{node: n.Node, err: err}
					return
				}
				// For other errors, log as error
				log.Error().
					Err(err).
					Str("node", n.Node).
					Str("instance", instanceName).
					Msg("Failed to get node storage - check API permissions")
				resultChan <- nodeResult{node: n.Node, err: err}
				return
			}

			var nodeStorageList []models.Storage

			// Get ZFS pool status for this node if any storage is ZFS
			// This is now production-ready with proper API integration
			var zfsPoolMap = make(map[string]*models.ZFSPool)
			enableZFSMonitoring := os.Getenv("PULSE_DISABLE_ZFS_MONITORING") != "true" // Enabled by default

			if enableZFSMonitoring {
				if poolInfos, err := client.GetZFSPoolsWithDetails(ctx, n.Node); err == nil {
					log.Debug().
						Str("node", n.Node).
						Int("pools", len(poolInfos)).
						Msg("Successfully fetched ZFS pool details")

					// Convert to our model format
					for _, poolInfo := range poolInfos {
						modelPool := convertPoolInfoToModel(&poolInfo)
						if modelPool != nil {
							zfsPoolMap[poolInfo.Name] = modelPool
						}
					}
				} else {
					// Log but don't fail - ZFS monitoring is optional
					log.Debug().
						Err(err).
						Str("node", n.Node).
						Str("instance", instanceName).
						Msg("Could not get ZFS pool status (may require additional permissions)")
				}
			}

			// Process each storage
			for _, storage := range nodeStorage {
				if reason, skip := readOnlyFilesystemReason(storage.Type, storage.Total, storage.Used); skip {
					log.Debug().
						Str("node", n.Node).
						Str("storage", storage.Storage).
						Str("type", storage.Type).
						Str("skipReason", reason).
						Uint64("total", storage.Total).
						Uint64("used", storage.Used).
						Msg("Skipping read-only storage mount")
					continue
				}

				// Create storage ID
				var storageID string
				if instanceName == n.Node {
					storageID = fmt.Sprintf("%s-%s", n.Node, storage.Storage)
				} else {
					storageID = fmt.Sprintf("%s-%s-%s", instanceName, n.Node, storage.Storage)
				}

				// Get cluster config for this storage
				clusterConfig, hasClusterConfig := clusterStorageMap[storage.Storage]

				// Determine if shared - check multiple sources:
				// 1. Per-node API returns shared flag directly
				// 2. Cluster config API also has shared flag (if available)
				// 3. Some storage types are inherently cluster-wide even if flags aren't set
				shared := storage.Shared == 1 ||
					(hasClusterConfig && clusterConfig.Shared == 1) ||
					isInherentlySharedStorageType(storage.Type)

				// Create storage model
				// Initialize Enabled/Active from per-node API response
				// Use storageInstanceName (cluster name when clustered) to match node ID format
				modelStorage := models.Storage{
					ID:       storageID,
					Name:     storage.Storage,
					Node:     n.Node,
					Instance: storageInstanceName,
					Type:     storage.Type,
					Status:   "available",
					Pool:     storage.Pool,
					Path:     storage.Path,
					Total:    int64(storage.Total),
					Used:     int64(storage.Used),
					Free:     int64(storage.Available),
					Usage:    safePercentage(float64(storage.Used), float64(storage.Total)),
					Content:  sortContent(storage.Content),
					Shared:   shared,
					Enabled:  storage.Enabled == 1,
					Active:   storage.Active == 1,
				}

				if hasClusterConfig {
					if nodes := parseClusterStorageNodes(clusterConfig.Nodes); len(nodes) > 0 {
						modelStorage.Nodes = nodes
					}
					if modelStorage.Pool == "" && clusterConfig.Pool != "" {
						modelStorage.Pool = clusterConfig.Pool
					}
					if modelStorage.Path == "" && clusterConfig.Path != "" {
						modelStorage.Path = clusterConfig.Path
					}
				}

				// Attach ZFS pool status information whenever the storage name or
				// dataset path resolves to a known pool, even for dir storages
				// backed by ZFS datasets.
				modelStorage.ZFSPool = matchZFSPoolForStorage(modelStorage, zfsPoolMap)

				// Override with cluster config if available, but only when the
				// cluster metadata explicitly carries those flags. Some storage
				// types (notably PBS) omit enabled/active, and forcing them to 0
				// would make us skip backup polling even though the node reports
				// the storage as available.
				if hasClusterConfig {
					// Cluster metadata is inconsistent across storage types; PBS storages often omit
					// enabled/active entirely (decode as zero). To avoid marking usable storages as
					// disabled, only override when the cluster explicitly sets the flag to 1.
					if clusterConfig.Enabled == 1 {
						modelStorage.Enabled = true
					}
					if clusterConfig.Active == 1 {
						modelStorage.Active = true
					}
				}

				// Determine status based on enabled/active flags
				// Priority: disabled storage always shows as "disabled", regardless of active state
				if !modelStorage.Enabled {
					modelStorage.Status = "disabled"
				} else if modelStorage.Active {
					modelStorage.Status = "available"
				} else {
					modelStorage.Status = "inactive"
				}

				nodeStorageList = append(nodeStorageList, modelStorage)
			}

			nodeDuration := time.Since(nodeStart)
			log.Debug().
				Str("node", n.Node).
				Int("storage", len(nodeStorageList)).
				Dur("duration", nodeDuration).
				Msg("Node storage polling completed")

			// If we got empty storage but have existing storage for this node, don't mark as successfully polled
			// This allows preservation logic to keep the existing storage
			if len(nodeStorageList) == 0 {
				// Check if we have existing storage for this node
				hasExisting := false
				for _, existing := range existingStorageMap {
					if existing.Node == n.Node {
						hasExisting = true
						break
					}
				}
				if hasExisting {
					log.Warn().
						Str("node", n.Node).
						Str("instance", instanceName).
						Msg("Node returned empty storage but has existing storage - preserving existing data")
					// Don't send result, allowing preservation logic to work
					return
				}
			}

			resultChan <- nodeResult{node: n.Node, storage: nodeStorageList}
		}(node)
	}

	// Close channel when all goroutines complete
	go func() {
		wg.Wait()
		close(resultChan)
	}()

	// Collect results from all nodes
	var allStorage []models.Storage
	type sharedStorageAggregation struct {
		storage models.Storage
		nodes   map[string]struct{}
		nodeIDs map[string]struct{}
	}
	sharedStorageMap := make(map[string]*sharedStorageAggregation) // Map to keep shared storage entries with node affiliations

	toSortedSlice := func(set map[string]struct{}) []string {
		slice := make([]string, 0, len(set))
		for value := range set {
			slice = append(slice, value)
		}
		sort.Strings(slice)
		return slice
	}
	successfulNodes := 0
	failedNodes := 0

	for result := range resultChan {
		if result.err != nil {
			failedNodes++
		} else {
			successfulNodes++
			polledNodes[result.node] = true // Mark this node as successfully polled
			for _, storage := range result.storage {
				if storage.Shared {
					// For shared storage, aggregate by instance+name so we retain the
					// reporting nodes but never merge across different Proxmox instances.
					key := storage.Instance + "/" + storage.Name
					nodeIdentifier := fmt.Sprintf("%s-%s", storage.Instance, storage.Node)

					if entry, exists := sharedStorageMap[key]; exists {
						entry.nodes[storage.Node] = struct{}{}
						entry.nodeIDs[nodeIdentifier] = struct{}{}

						// Prefer the entry with the most up-to-date utilization data
						if storage.Used > entry.storage.Used || (storage.Total > entry.storage.Total && storage.Used == entry.storage.Used) {
							entry.storage.Total = storage.Total
							entry.storage.Used = storage.Used
							entry.storage.Free = storage.Free
							entry.storage.Usage = storage.Usage
							entry.storage.ZFSPool = storage.ZFSPool
							entry.storage.Status = storage.Status
							entry.storage.Enabled = storage.Enabled
							entry.storage.Active = storage.Active
							entry.storage.Content = storage.Content
							entry.storage.Type = storage.Type
						}
					} else {
						sharedStorageMap[key] = &sharedStorageAggregation{
							storage: storage,
							nodes:   map[string]struct{}{storage.Node: {}},
							nodeIDs: map[string]struct{}{nodeIdentifier: {}},
						}
					}
				} else {
					// Non-shared storage goes directly to results
					allStorage = append(allStorage, storage)
				}
			}
		}
	}

	// Add deduplicated shared storage to results
	for _, entry := range sharedStorageMap {
		entry.storage.Node = "cluster"
		entry.storage.Nodes = toSortedSlice(entry.nodes)
		entry.storage.NodeIDs = toSortedSlice(entry.nodeIDs)
		entry.storage.NodeCount = len(entry.storage.Nodes)
		// Fix for #1049: Use a consistent ID for shared storage that doesn't
		// include the node name, preventing duplicates when different nodes
		// report the same shared storage across polling cycles.
		entry.storage.ID = fmt.Sprintf("%s-cluster-%s", entry.storage.Instance, entry.storage.Name)
		allStorage = append(allStorage, entry.storage)
	}

	// Some shared storages exist only in the cluster-wide storage config and do
	// not show up in per-node storage responses. Synthesize those entries so the
	// UI can still surface them for alert configuration.
	existingSharedStorage := make(map[string]struct{}, len(allStorage))
	for _, storage := range allStorage {
		if storage.Shared {
			existingSharedStorage[storage.Instance+"/"+storage.Name] = struct{}{}
		}
	}

	for _, clusterStorage := range clusterStorages {
		storageName := strings.TrimSpace(clusterStorage.Storage)
		if storageName == "" {
			continue
		}

		shared := clusterStorage.Shared == 1 || isInherentlySharedStorageType(clusterStorage.Type)
		if !shared {
			continue
		}

		key := storageInstanceName + "/" + storageName
		if _, exists := existingSharedStorage[key]; exists {
			continue
		}

		nodesForStorage := parseClusterStorageNodes(clusterStorage.Nodes)
		if len(nodesForStorage) == 0 {
			nodesForStorage = make([]string, 0, len(nodes))
			for _, node := range nodes {
				nodeName := strings.TrimSpace(node.Node)
				if nodeName == "" {
					continue
				}
				nodesForStorage = append(nodesForStorage, nodeName)
			}
		}

		nodeIDs := make([]string, 0, len(nodesForStorage))
		for _, nodeName := range nodesForStorage {
			nodeIDs = append(nodeIDs, fmt.Sprintf("%s-%s", storageInstanceName, nodeName))
		}

		synthetic := models.Storage{
			ID:        fmt.Sprintf("%s-cluster-%s", storageInstanceName, storageName),
			Name:      storageName,
			Node:      "cluster",
			Instance:  storageInstanceName,
			Nodes:     nodesForStorage,
			NodeIDs:   nodeIDs,
			NodeCount: len(nodesForStorage),
			Type:      clusterStorage.Type,
			Status:    "available",
			Pool:      clusterStorage.Pool,
			Path:      clusterStorage.Path,
			Total:     int64(clusterStorage.Total),
			Used:      int64(clusterStorage.Used),
			Free:      int64(clusterStorage.Available),
			Usage:     safePercentage(float64(clusterStorage.Used), float64(clusterStorage.Total)),
			Content:   sortContent(clusterStorage.Content),
			Shared:    true,
			Enabled:   true,
			Active:    true,
		}

		allStorage = append(allStorage, synthetic)
		existingSharedStorage[key] = struct{}{}
	}

	// Preserve existing storage data for nodes that weren't polled (offline or error)
	preservedCount := 0
	for _, existingStorage := range existingStorageMap {
		// Only preserve if we didn't poll this node
		if !polledNodes[existingStorage.Node] && existingStorage.Node != "cluster" {
			allStorage = append(allStorage, existingStorage)
			preservedCount++
			log.Debug().
				Str("node", existingStorage.Node).
				Str("storage", existingStorage.Name).
				Msg("Preserving existing storage data for unpolled node")
		}
	}

	// Record metrics and check alerts for all storage devices
	for _, storage := range allStorage {
		if m.metricsHistory != nil {
			timestamp := time.Now()
			m.metricsHistory.AddStorageMetric(storage.ID, "usage", storage.Usage, timestamp)
			m.metricsHistory.AddStorageMetric(storage.ID, "used", float64(storage.Used), timestamp)
			m.metricsHistory.AddStorageMetric(storage.ID, "total", float64(storage.Total), timestamp)
			m.metricsHistory.AddStorageMetric(storage.ID, "avail", float64(storage.Free), timestamp)

			// Also write to persistent store for enterprise reporting
			if m.metricsStore != nil {
				m.metricsStore.Write("storage", storage.ID, "usage", storage.Usage, timestamp)
				m.metricsStore.Write("storage", storage.ID, "used", float64(storage.Used), timestamp)
				m.metricsStore.Write("storage", storage.ID, "total", float64(storage.Total), timestamp)
				m.metricsStore.Write("storage", storage.ID, "avail", float64(storage.Free), timestamp)
			}
		}

		if m.alertManager != nil {
			m.alertManager.CheckStorage(storage)
		}
	}

	if !cephDetected {
		for _, storage := range allStorage {
			if isCephStorageType(storage.Type) {
				cephDetected = true
				break
			}
		}
	}

	// Update state with all storage
	m.state.UpdateStorageForInstance(storageInstanceName, allStorage)

	// Poll Ceph cluster data after refreshing storage information
	if instanceCfg == nil || !instanceCfg.DisableCeph {
		m.pollCephCluster(ctx, instanceName, client, cephDetected)
	}

	duration := time.Since(startTime)

	// Warn if all nodes failed to get storage
	if successfulNodes == 0 && failedNodes > 0 {
		log.Error().
			Str("instance", instanceName).
			Int("failedNodes", failedNodes).
			Msg("All nodes failed to retrieve storage - check Proxmox API permissions for Datastore.Audit on all storage")
	} else {
		log.Debug().
			Str("instance", instanceName).
			Int("totalStorage", len(allStorage)).
			Int("successfulNodes", successfulNodes).
			Int("failedNodes", failedNodes).
			Int("preservedStorage", preservedCount).
			Dur("duration", duration).
			Msg("Parallel storage polling completed")
	}
}

func shouldAttemptFallback(err error) bool {
	if err == nil {
		return false
	}
	msg := strings.ToLower(err.Error())
	return strings.Contains(msg, "timeout") || strings.Contains(msg, "deadline exceeded") || strings.Contains(msg, "context canceled")
}

func (m *Monitor) fetchNodeStorageFallback(ctx context.Context, instanceCfg *config.PVEInstance, nodeName string) ([]proxmox.Storage, error) {
	if m == nil || instanceCfg == nil || !instanceCfg.IsCluster || len(instanceCfg.ClusterEndpoints) == 0 {
		return nil, fmt.Errorf("fallback unavailable")
	}

	var target string
	for _, ep := range instanceCfg.ClusterEndpoints {
		if !strings.EqualFold(ep.NodeName, nodeName) {
			continue
		}
		target = clusterEndpointEffectiveURL(ep, instanceCfg.VerifySSL, instanceCfg.Fingerprint)
		if target != "" {
			break
		}
	}
	if strings.TrimSpace(target) == "" {
		return nil, fmt.Errorf("fallback endpoint missing for node %s", nodeName)
	}

	cfg := proxmox.ClientConfig{
		Host:        target,
		VerifySSL:   instanceCfg.VerifySSL,
		Fingerprint: instanceCfg.Fingerprint,
		Timeout:     m.pollTimeout,
	}
	if instanceCfg.TokenName != "" && instanceCfg.TokenValue != "" {
		cfg.TokenName = instanceCfg.TokenName
		cfg.TokenValue = instanceCfg.TokenValue
	} else {
		cfg.User = instanceCfg.User
		cfg.Password = instanceCfg.Password
	}

	directClient, err := proxmox.NewClient(cfg)
	if err != nil {
		return nil, err
	}

	return directClient.GetStorage(ctx, nodeName)
}

// pollPVENode polls a single PVE node and returns the result
func (m *Monitor) pollPVENode(
	ctx context.Context,
	instanceName string,
	instanceCfg *config.PVEInstance,
	client PVEClientInterface,
	node proxmox.Node,
	connectionHealthStr string,
	prevNodeMemory map[string]models.Memory,
	prevInstanceNodes []models.Node,
) (models.Node, string, error) {
	nodeStart := time.Now()
	displayName := getNodeDisplayName(instanceCfg, node.Node)
	connectionHost := instanceCfg.Host
	guestURL := instanceCfg.GuestURL
	if instanceCfg.IsCluster && len(instanceCfg.ClusterEndpoints) > 0 {
		for _, ep := range instanceCfg.ClusterEndpoints {
			if strings.EqualFold(ep.NodeName, node.Node) {
				if effective := clusterEndpointEffectiveURL(ep, instanceCfg.VerifySSL, instanceCfg.Fingerprint); effective != "" {
					connectionHost = effective
				}
				if ep.GuestURL != "" {
					guestURL = ep.GuestURL
				}
				break
			}
		}
	}

	// Apply grace period for node status to prevent flapping
	// For clustered nodes, use clusterName-nodeName as the ID to deduplicate
	// when the same cluster is registered via multiple entry points
	// (e.g., agent installed with --enable-proxmox on multiple cluster nodes)
	var nodeID string
	if instanceCfg.IsCluster && instanceCfg.ClusterName != "" {
		nodeID = instanceCfg.ClusterName + "-" + node.Node
	} else {
		nodeID = instanceName + "-" + node.Node
	}
	effectiveStatus := node.Status
	now := time.Now()

	m.mu.Lock()
	if strings.ToLower(node.Status) == "online" {
		// Node is online - update last-online timestamp
		m.nodeLastOnline[nodeID] = now
	} else {
		// Node is reported as offline - check grace period
		lastOnline, exists := m.nodeLastOnline[nodeID]
		if exists && now.Sub(lastOnline) < nodeOfflineGracePeriod {
			// Still within grace period - preserve online status
			effectiveStatus = "online"
			log.Debug().
				Str("instance", instanceName).
				Str("node", node.Node).
				Dur("timeSinceOnline", now.Sub(lastOnline)).
				Dur("gracePeriod", nodeOfflineGracePeriod).
				Msg("Node offline but within grace period - preserving online status")
		} else {
			// Grace period expired or never seen online - mark as offline
			if exists {
				log.Info().
					Str("instance", instanceName).
					Str("node", node.Node).
					Dur("timeSinceOnline", now.Sub(lastOnline)).
					Msg("Node offline and grace period expired - marking as offline")
			}
		}
	}
	m.mu.Unlock()

	modelNode := models.Node{
		ID:          nodeID,
		Name:        node.Node,
		DisplayName: displayName,
		Instance:    instanceName,
		Host:        connectionHost,
		GuestURL:    guestURL,
		Status:      effectiveStatus,
		Type:        "node",
		CPU:         safeFloat(node.CPU), // Proxmox returns 0-1 ratio (e.g., 0.15 = 15%)
		Memory: models.Memory{
			Total: int64(node.MaxMem),
			Used:  int64(node.Mem),
			Free:  int64(node.MaxMem - node.Mem),
			Usage: safePercentage(float64(node.Mem), float64(node.MaxMem)),
		},
		Uptime:                       int64(node.Uptime),
		LoadAverage:                  []float64{},
		LastSeen:                     time.Now(),
		ConnectionHealth:             connectionHealthStr, // Use the determined health status
		IsClusterMember:              instanceCfg.IsCluster,
		ClusterName:                  instanceCfg.ClusterName,
		TemperatureMonitoringEnabled: instanceCfg.TemperatureMonitoringEnabled,
	}
	modelNode.Disk, _ = m.resolveNodeDisk(instanceName, nodeID, node.Node, node, nil)

	nodeSnapshotRaw := NodeMemoryRaw{
		Total:               node.MaxMem,
		Used:                node.Mem,
		Free:                node.MaxMem - node.Mem,
		FallbackTotal:       node.MaxMem,
		FallbackUsed:        node.Mem,
		FallbackFree:        node.MaxMem - node.Mem,
		FallbackCalculated:  true,
		ProxmoxMemorySource: "nodes-endpoint",
	}
	nodeMemorySource := "nodes-endpoint"
	var nodeFallbackReason string

	// Debug logging for disk metrics - note that these values can fluctuate
	// due to thin provisioning and dynamic allocation
	if node.Disk > 0 && node.MaxDisk > 0 {
		log.Debug().
			Str("node", node.Node).
			Uint64("disk", node.Disk).
			Uint64("maxDisk", node.MaxDisk).
			Float64("diskUsage", safePercentage(float64(node.Disk), float64(node.MaxDisk))).
			Msg("Node disk metrics from /nodes endpoint")
	}

	// Track whether we successfully replaced memory metrics with detailed status data
	memoryUpdated := false

	// Get detailed node info if available (skip for offline nodes)
	if effectiveStatus == "online" {
		nodeInfo, nodeErr := client.GetNodeStatus(ctx, node.Node)
		if nodeErr != nil {
			nodeFallbackReason = "node-status-unavailable"
			// If we can't get node status, log but continue with data from /nodes endpoint
			if node.Disk > 0 && node.MaxDisk > 0 {
				log.Warn().
					Str("instance", instanceName).
					Str("node", node.Node).
					Err(nodeErr).
					Uint64("usingDisk", node.Disk).
					Uint64("usingMaxDisk", node.MaxDisk).
					Msg("Could not get node status - using fallback metrics (memory will include cache/buffers)")
			} else {
				log.Warn().
					Str("instance", instanceName).
					Str("node", node.Node).
					Err(nodeErr).
					Uint64("disk", node.Disk).
					Uint64("maxDisk", node.MaxDisk).
					Msg("Could not get node status - no fallback metrics available (memory will include cache/buffers)")
			}
		} else if nodeInfo != nil {
			if nodeInfo.Memory != nil {
				nodeSnapshotRaw.Total = nodeInfo.Memory.Total
				nodeSnapshotRaw.Used = nodeInfo.Memory.Used
				nodeSnapshotRaw.Free = nodeInfo.Memory.Free
				nodeSnapshotRaw.Available = nodeInfo.Memory.Available
				nodeSnapshotRaw.Avail = nodeInfo.Memory.Avail
				nodeSnapshotRaw.Buffers = nodeInfo.Memory.Buffers
				nodeSnapshotRaw.Cached = nodeInfo.Memory.Cached
				nodeSnapshotRaw.Shared = nodeInfo.Memory.Shared
				nodeSnapshotRaw.EffectiveAvailable = nodeInfo.Memory.EffectiveAvailable()
				nodeSnapshotRaw.ProxmoxMemorySource = "node-status"
				nodeSnapshotRaw.FallbackCalculated = false
			}

			// Convert LoadAvg from interface{} to float64
			loadAvg := make([]float64, 0, len(nodeInfo.LoadAvg))
			for _, val := range nodeInfo.LoadAvg {
				switch v := val.(type) {
				case float64:
					loadAvg = append(loadAvg, v)
				case string:
					if f, err := strconv.ParseFloat(v, 64); err == nil {
						loadAvg = append(loadAvg, f)
					}
				}
			}
			modelNode.LoadAverage = loadAvg
			modelNode.KernelVersion = nodeInfo.KernelVersion
			modelNode.PVEVersion = nodeInfo.PVEVersion

			if resolvedDisk, diskSource := m.resolveNodeDisk(instanceName, nodeID, node.Node, node, nodeInfo); diskSource != "" {
				modelNode.Disk = resolvedDisk
			} else {
				log.Warn().
					Str("node", node.Node).
					Bool("rootfsNil", nodeInfo.RootFS == nil).
					Uint64("nodeDisk", node.Disk).
					Uint64("nodeMaxDisk", node.MaxDisk).
					Msg("No valid disk metrics available for node")
			}

			// Update memory metrics to use Available field for more accurate usage
			if nodeInfo.Memory != nil && nodeInfo.Memory.Total > 0 {
				var actualUsed uint64
				effectiveAvailable := nodeInfo.Memory.EffectiveAvailable()
				componentAvailable := nodeInfo.Memory.Free
				if nodeInfo.Memory.Buffers > 0 {
					if math.MaxUint64-componentAvailable < nodeInfo.Memory.Buffers {
						componentAvailable = math.MaxUint64
					} else {
						componentAvailable += nodeInfo.Memory.Buffers
					}
				}
				if nodeInfo.Memory.Cached > 0 {
					if math.MaxUint64-componentAvailable < nodeInfo.Memory.Cached {
						componentAvailable = math.MaxUint64
					} else {
						componentAvailable += nodeInfo.Memory.Cached
					}
				}
				if nodeInfo.Memory.Total > 0 && componentAvailable > nodeInfo.Memory.Total {
					componentAvailable = nodeInfo.Memory.Total
				}

				availableFromUsed := uint64(0)
				if nodeInfo.Memory.Total > 0 && nodeInfo.Memory.Used > 0 && nodeInfo.Memory.Total >= nodeInfo.Memory.Used {
					availableFromUsed = nodeInfo.Memory.Total - nodeInfo.Memory.Used
				}
				nodeSnapshotRaw.TotalMinusUsed = availableFromUsed

				missingCacheMetrics := nodeInfo.Memory.Available == 0 &&
					nodeInfo.Memory.Avail == 0 &&
					nodeInfo.Memory.Buffers == 0 &&
					nodeInfo.Memory.Cached == 0

				var rrdMetrics rrdMemCacheEntry
				haveRRDMetrics := false
				usedRRDAvailableFallback := false
				rrdMemUsedFallback := false

				if missingCacheMetrics {
					if metrics, err := m.getNodeRRDMetrics(ctx, client, node.Node); err == nil {
						haveRRDMetrics = true
						rrdMetrics = metrics
						if metrics.available > 0 {
							effectiveAvailable = metrics.available
							usedRRDAvailableFallback = true
						}
						if metrics.used > 0 {
							rrdMemUsedFallback = true
						}
					} else if err != nil {
						log.Debug().
							Err(err).
							Str("instance", instanceName).
							Str("node", node.Node).
							Msg("RRD memavailable fallback unavailable")
					}
				}

				const totalMinusUsedGapTolerance uint64 = 16 * 1024 * 1024
				gapGreaterThanComponents := false
				if availableFromUsed > componentAvailable {
					gap := availableFromUsed - componentAvailable
					if componentAvailable == 0 || gap >= totalMinusUsedGapTolerance {
						gapGreaterThanComponents = true
					}
				}

				derivedFromTotalMinusUsed := !usedRRDAvailableFallback &&
					missingCacheMetrics &&
					availableFromUsed > 0 &&
					gapGreaterThanComponents &&
					effectiveAvailable == availableFromUsed

				switch {
				case effectiveAvailable > 0 && effectiveAvailable <= nodeInfo.Memory.Total:
					// Prefer available/avail fields or derived buffers+cache values when present.
					actualUsed = nodeInfo.Memory.Total - effectiveAvailable
					if actualUsed > nodeInfo.Memory.Total {
						actualUsed = nodeInfo.Memory.Total
					}

					logCtx := log.Debug().
						Str("node", node.Node).
						Uint64("total", nodeInfo.Memory.Total).
						Uint64("effectiveAvailable", effectiveAvailable).
						Uint64("actualUsed", actualUsed).
						Float64("usage", safePercentage(float64(actualUsed), float64(nodeInfo.Memory.Total)))
					if usedRRDAvailableFallback {
						if haveRRDMetrics && rrdMetrics.available > 0 {
							logCtx = logCtx.Uint64("rrdAvailable", rrdMetrics.available)
						}
						logCtx.Msg("Node memory: using RRD memavailable fallback (excludes reclaimable cache)")
						nodeMemorySource = "rrd-memavailable"
						nodeFallbackReason = "rrd-memavailable"
						nodeSnapshotRaw.FallbackCalculated = true
						nodeSnapshotRaw.ProxmoxMemorySource = "rrd-memavailable"
					} else if nodeInfo.Memory.Available > 0 {
						logCtx.Msg("Node memory: using available field (excludes reclaimable cache)")
						nodeMemorySource = "available-field"
					} else if nodeInfo.Memory.Avail > 0 {
						logCtx.Msg("Node memory: using avail field (excludes reclaimable cache)")
						nodeMemorySource = "avail-field"
					} else if derivedFromTotalMinusUsed {
						logCtx.
							Uint64("availableFromUsed", availableFromUsed).
							Uint64("reportedFree", nodeInfo.Memory.Free).
							Msg("Node memory: derived available from total-used gap (cache fields missing)")
						nodeMemorySource = "derived-total-minus-used"
						if nodeFallbackReason == "" {
							nodeFallbackReason = "node-status-total-minus-used"
						}
						nodeSnapshotRaw.FallbackCalculated = true
						nodeSnapshotRaw.ProxmoxMemorySource = "node-status-total-minus-used"
					} else {
						logCtx.
							Uint64("free", nodeInfo.Memory.Free).
							Uint64("buffers", nodeInfo.Memory.Buffers).
							Uint64("cached", nodeInfo.Memory.Cached).
							Msg("Node memory: derived available from free+buffers+cached (excludes reclaimable cache)")
						nodeMemorySource = "derived-free-buffers-cached"
					}
				default:
					switch {
					case rrdMemUsedFallback && haveRRDMetrics && rrdMetrics.used > 0:
						actualUsed = rrdMetrics.used
						if actualUsed > nodeInfo.Memory.Total {
							actualUsed = nodeInfo.Memory.Total
						}
						log.Debug().
							Str("node", node.Node).
							Uint64("total", nodeInfo.Memory.Total).
							Uint64("rrdUsed", rrdMetrics.used).
							Msg("Node memory: using RRD memused fallback (excludes reclaimable cache)")
						nodeMemorySource = "rrd-memused"
						if nodeFallbackReason == "" {
							nodeFallbackReason = "rrd-memused"
						}
						nodeSnapshotRaw.FallbackCalculated = true
						nodeSnapshotRaw.ProxmoxMemorySource = "rrd-memused"
					default:
						// Fallback to traditional used memory if no cache-aware data is exposed
						actualUsed = nodeInfo.Memory.Used
						if actualUsed > nodeInfo.Memory.Total {
							actualUsed = nodeInfo.Memory.Total
						}
						log.Debug().
							Str("node", node.Node).
							Uint64("total", nodeInfo.Memory.Total).
							Uint64("used", actualUsed).
							Msg("Node memory: no cache-aware metrics - using traditional calculation (includes cache)")
						nodeMemorySource = "node-status-used"
					}
				}

				nodeSnapshotRaw.EffectiveAvailable = effectiveAvailable
				if haveRRDMetrics {
					nodeSnapshotRaw.RRDAvailable = rrdMetrics.available
					nodeSnapshotRaw.RRDUsed = rrdMetrics.used
					nodeSnapshotRaw.RRDTotal = rrdMetrics.total
				}

				free := int64(nodeInfo.Memory.Total - actualUsed)
				if free < 0 {
					free = 0
				}

				modelNode.Memory = models.Memory{
					Total: int64(nodeInfo.Memory.Total),
					Used:  int64(actualUsed),
					Free:  free,
					Usage: safePercentage(float64(actualUsed), float64(nodeInfo.Memory.Total)),
				}
				memoryUpdated = true
			}

			if nodeInfo.CPUInfo != nil {
				// Use MaxCPU from node data for logical CPU count (includes hyperthreading)
				// If MaxCPU is not available or 0, fall back to physical cores
				logicalCores := node.MaxCPU
				if logicalCores == 0 {
					logicalCores = nodeInfo.CPUInfo.Cores
				}

				mhzStr := nodeInfo.CPUInfo.GetMHzString()
				log.Debug().
					Str("node", node.Node).
					Str("model", nodeInfo.CPUInfo.Model).
					Int("cores", nodeInfo.CPUInfo.Cores).
					Int("logicalCores", logicalCores).
					Int("sockets", nodeInfo.CPUInfo.Sockets).
					Str("mhz", mhzStr).
					Msg("Node CPU info from Proxmox")
				modelNode.CPUInfo = models.CPUInfo{
					Model:   nodeInfo.CPUInfo.Model,
					Cores:   logicalCores, // Use logical cores for display
					Sockets: nodeInfo.CPUInfo.Sockets,
					MHz:     mhzStr,
				}
			}
		}
	}

	// If we couldn't update memory metrics using detailed status, preserve previous accurate values if available
	if !memoryUpdated && effectiveStatus == "online" {
		if prevMem, exists := prevNodeMemory[modelNode.ID]; exists && prevMem.Total > 0 {
			total := int64(node.MaxMem)
			if total == 0 {
				total = prevMem.Total
			}
			used := prevMem.Used
			if total > 0 && used > total {
				used = total
			}
			free := total - used
			if free < 0 {
				free = 0
			}

			preserved := prevMem
			preserved.Total = total
			preserved.Used = used
			preserved.Free = free
			preserved.Usage = safePercentage(float64(used), float64(total))

			modelNode.Memory = preserved
			log.Debug().
				Str("instance", instanceName).
				Str("node", node.Node).
				Msg("Preserving previous memory metrics - node status unavailable this cycle")

			if nodeFallbackReason == "" {
				nodeFallbackReason = "preserved-previous-snapshot"
			}
			nodeMemorySource = "previous-snapshot"
			if nodeSnapshotRaw.ProxmoxMemorySource == "node-status" && nodeSnapshotRaw.Total == 0 {
				nodeSnapshotRaw.ProxmoxMemorySource = "previous-snapshot"
			}
		}
	}

	m.recordNodeSnapshot(instanceName, node.Node, NodeMemorySnapshot{
		RetrievedAt:    time.Now(),
		MemorySource:   nodeMemorySource,
		FallbackReason: nodeFallbackReason,
		Memory:         modelNode.Memory,
		Raw:            nodeSnapshotRaw,
	})

	// Collect temperature data via SSH (non-blocking, best effort)
	// Only attempt for online nodes when temperature monitoring is enabled
	// Check per-node setting first, fall back to global setting
	tempMonitoringEnabled := m.config.TemperatureMonitoringEnabled
	if instanceCfg.TemperatureMonitoringEnabled != nil {
		tempMonitoringEnabled = *instanceCfg.TemperatureMonitoringEnabled
	}
	if effectiveStatus == "online" && tempMonitoringEnabled {
		// First, check if there's a matching host agent with temperature data.
		// Host agent temperatures are preferred because they don't require SSH access.
		// Use getHostAgentTemperatureByID with the unique node ID to correctly handle
		// duplicate hostname scenarios (e.g., two "px1" nodes on different IPs).
		hostAgentTemp := m.getHostAgentTemperatureByID(modelNode.ID, node.Node)
		if hostAgentTemp != nil {
			log.Debug().
				Str("node", node.Node).
				Float64("cpuPackage", hostAgentTemp.CPUPackage).
				Float64("cpuMax", hostAgentTemp.CPUMax).
				Int("nvmeCount", len(hostAgentTemp.NVMe)).
				Msg("Using temperature data from host agent")
		}

		// If no host agent temp or we need additional data (SMART), try SSH/proxy collection
		var sshTemp *models.Temperature
		var err error
		if m.tempCollector != nil {
			// Temperature collection is best-effort - use a short timeout to avoid blocking node polling
			// Use context.Background() so the timeout is truly independent of the parent polling context
			// If SSH is slow or unresponsive, we'll preserve previous temperature data
			tempCtx, tempCancel := context.WithTimeout(context.Background(), 10*time.Second)
			defer tempCancel()

			// Determine SSH hostname to use (most robust approach):
			// Prefer the resolved host for this node, with cluster overrides when available.
			sshHost := modelNode.Host
			foundNodeEndpoint := false

			if modelNode.IsClusterMember && instanceCfg.IsCluster {
				// Try to find specific endpoint configuration for this node
				if len(instanceCfg.ClusterEndpoints) > 0 {
					for _, ep := range instanceCfg.ClusterEndpoints {
						if strings.EqualFold(ep.NodeName, node.Node) {
							if effective := clusterEndpointEffectiveURL(ep, instanceCfg.VerifySSL, instanceCfg.Fingerprint); effective != "" {
								sshHost = effective
								foundNodeEndpoint = true
							}
							break
						}
					}
				}

				// If no specific endpoint found, fall back to node name
				if !foundNodeEndpoint {
					sshHost = node.Node
					log.Debug().
						Str("node", node.Node).
						Str("instance", instanceCfg.Name).
						Msg("Node endpoint not found in cluster metadata - falling back to node name for temperature collection")
				}
			}

			if strings.TrimSpace(sshHost) == "" {
				sshHost = node.Node
			}

			// Skip SSH only when the host agent already has SMART data too.
			// If the host agent only has CPU/NVMe readings, SSH can still
			// augment the node with SMART disk temperatures.
			skipSSHCollection := shouldSkipTemperatureSSHCollection(hostAgentTemp)

			if !skipSSHCollection {
				sshTemp, err = m.tempCollector.CollectTemperature(tempCtx, sshHost, node.Node)
				if err != nil && hostAgentTemp == nil {
					log.Debug().
						Str("node", node.Node).
						Str("sshHost", sshHost).
						Bool("isCluster", modelNode.IsClusterMember).
						Int("endpointCount", len(instanceCfg.ClusterEndpoints)).
						Msg("Temperature collection failed - check SSH access")
				}
			}

			// Debug: log SSH temp details before merge
			if sshTemp != nil {
				log.Debug().
					Str("node", node.Node).
					Bool("sshTempAvailable", sshTemp.Available).
					Bool("sshHasSMART", sshTemp.HasSMART).
					Int("sshSMARTCount", len(sshTemp.SMART)).
					Bool("sshHasNVMe", sshTemp.HasNVMe).
					Int("sshNVMeCount", len(sshTemp.NVMe)).
					Msg("SSH temperature data before merge")
			} else {
				log.Debug().
					Str("node", node.Node).
					Msg("SSH temperature data is nil")
			}
		}

		// Merge host agent and SSH temperatures
		temp := mergeTemperatureData(hostAgentTemp, sshTemp)

		if temp != nil && temp.Available {
			// Get the current CPU temperature (prefer package, fall back to max)
			currentTemp := temp.CPUPackage
			if currentTemp == 0 && temp.CPUMax > 0 {
				currentTemp = temp.CPUMax
			}

			// Find previous temperature data for this node to preserve min/max
			var prevTemp *models.Temperature
			for _, prevNode := range prevInstanceNodes {
				if prevNode.ID == modelNode.ID && prevNode.Temperature != nil {
					prevTemp = prevNode.Temperature
					break
				}
			}

			// Initialize or update min/max tracking
			if prevTemp != nil && prevTemp.CPUMin > 0 {
				// Preserve existing min/max and update if necessary
				temp.CPUMin = prevTemp.CPUMin
				temp.CPUMaxRecord = prevTemp.CPUMaxRecord
				temp.MinRecorded = prevTemp.MinRecorded
				temp.MaxRecorded = prevTemp.MaxRecorded

				// Update min if current is lower
				if currentTemp > 0 && currentTemp < temp.CPUMin {
					temp.CPUMin = currentTemp
					temp.MinRecorded = time.Now()
				}

				// Update max if current is higher
				if currentTemp > temp.CPUMaxRecord {
					temp.CPUMaxRecord = currentTemp
					temp.MaxRecorded = time.Now()
				}
			} else if currentTemp > 0 {
				// First reading - initialize min/max to current value
				temp.CPUMin = currentTemp
				temp.CPUMaxRecord = currentTemp
				temp.MinRecorded = time.Now()
				temp.MaxRecorded = time.Now()
			}

			modelNode.Temperature = temp

			// Determine source for logging
			tempSource := "ssh"
			if hostAgentTemp != nil && sshTemp == nil {
				tempSource = "host-agent"
			} else if hostAgentTemp != nil && sshTemp != nil {
				tempSource = "host-agent+ssh"
			}

			log.Debug().
				Str("node", node.Node).
				Str("source", tempSource).
				Float64("cpuPackage", temp.CPUPackage).
				Float64("cpuMax", temp.CPUMax).
				Float64("cpuMin", temp.CPUMin).
				Float64("cpuMaxRecord", temp.CPUMaxRecord).
				Int("nvmeCount", len(temp.NVMe)).
				Msg("Collected temperature data")
		} else {
			// Temperature data returned but not available (temp != nil && !temp.Available)
			// OR no temperature data from any source - preserve previous temperature if available
			// This prevents the temperature column from flickering when collection temporarily fails
			var prevTemp *models.Temperature
			for _, prevNode := range prevInstanceNodes {
				if prevNode.ID == modelNode.ID && prevNode.Temperature != nil && prevNode.Temperature.Available {
					prevTemp = prevNode.Temperature
					break
				}
			}

			if prevTemp != nil {
				// Clone the previous temperature to avoid modifying historical data
				preserved := *prevTemp
				preserved.LastUpdate = prevTemp.LastUpdate // Keep original update time to indicate staleness
				modelNode.Temperature = &preserved
				log.Debug().
					Str("node", node.Node).
					Bool("isCluster", modelNode.IsClusterMember).
					Float64("cpuPackage", preserved.CPUPackage).
					Time("lastUpdate", preserved.LastUpdate).
					Msg("Preserved previous temperature data (current collection failed or unavailable)")
			} else {
				log.Debug().
					Str("node", node.Node).
					Bool("isCluster", modelNode.IsClusterMember).
					Msg("No temperature data available (collection failed, no previous data to preserve)")
			}
		}
	}

	// Poll pending apt updates (less frequently - every 30 minutes)
	// Only for online nodes to avoid wasting API calls on offline nodes
	if effectiveStatus == "online" {
		now := time.Now()
		m.mu.RLock()
		if m.nodePendingUpdatesCache == nil {
			m.mu.RUnlock()
			m.mu.Lock()
			if m.nodePendingUpdatesCache == nil {
				m.nodePendingUpdatesCache = make(map[string]pendingUpdatesCache)
			}
			m.mu.Unlock()
			m.mu.RLock()
		}
		cached, hasCached := m.nodePendingUpdatesCache[nodeID]
		m.mu.RUnlock()

		if !hasCached || now.Sub(cached.checkedAt) >= pendingUpdatesCacheTTL {
			// Time to check for updates
			pendingPkgs, err := client.GetNodePendingUpdates(ctx, node.Node)
			if err != nil {
				// API call failed - preserve cached value if available, don't spam logs
				log.Debug().
					Err(err).
					Str("node", node.Node).
					Str("instance", instanceName).
					Msg("Could not check pending apt updates (may require Sys.Audit permission)")
				if hasCached {
					modelNode.PendingUpdates = cached.count
					modelNode.PendingUpdatesCheckedAt = cached.checkedAt
				}
			} else {
				updateCount := len(pendingPkgs)
				modelNode.PendingUpdates = updateCount
				modelNode.PendingUpdatesCheckedAt = now

				// Cache the result
				m.mu.Lock()
				m.nodePendingUpdatesCache[nodeID] = pendingUpdatesCache{
					count:     updateCount,
					checkedAt: now,
				}
				m.mu.Unlock()

				log.Debug().
					Str("node", node.Node).
					Str("instance", instanceName).
					Int("pendingUpdates", updateCount).
					Msg("Checked pending apt updates")
			}
		} else {
			// Use cached value
			modelNode.PendingUpdates = cached.count
			modelNode.PendingUpdatesCheckedAt = cached.checkedAt
		}
	}

	if m.pollMetrics != nil {
		nodeNameLabel := strings.TrimSpace(node.Node)
		if nodeNameLabel == "" {
			nodeNameLabel = strings.TrimSpace(modelNode.DisplayName)
		}
		if nodeNameLabel == "" {
			nodeNameLabel = "unknown-node"
		}

		success := true
		nodeErrReason := ""
		health := strings.ToLower(strings.TrimSpace(modelNode.ConnectionHealth))
		if health != "" && health != "healthy" {
			success = false
			nodeErrReason = fmt.Sprintf("connection health %s", health)
		}

		status := strings.ToLower(strings.TrimSpace(modelNode.Status))
		if success && status != "" && status != "online" {
			success = false
			nodeErrReason = fmt.Sprintf("status %s", status)
		}

		var nodeErr error
		if !success {
			if nodeErrReason == "" {
				nodeErrReason = "unknown node error"
			}
			nodeErr = stderrors.New(nodeErrReason)
		}

		m.pollMetrics.RecordNodeResult(NodePollResult{
			InstanceName: instanceName,
			InstanceType: "pve",
			NodeName:     nodeNameLabel,
			Success:      success,
			Error:        nodeErr,
			StartTime:    nodeStart,
			EndTime:      time.Now(),
		})
	}

	return modelNode, effectiveStatus, nil
}

func parseClusterStorageNodes(raw string) []string {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return nil
	}

	parts := strings.FieldsFunc(raw, func(r rune) bool {
		return r == ',' || r == ';' || r == ' ' || r == '\t' || r == '\n'
	})
	if len(parts) == 0 {
		return nil
	}

	seen := make(map[string]struct{}, len(parts))
	result := make([]string, 0, len(parts))
	for _, part := range parts {
		part = strings.TrimSpace(part)
		if part == "" {
			continue
		}
		if _, exists := seen[part]; exists {
			continue
		}
		seen[part] = struct{}{}
		result = append(result, part)
	}
	if len(result) == 0 {
		return nil
	}
	return result
}