Improve guest agent timeout handling for high-load environments (refs #592)

This change addresses intermittent "Guest details unavailable" and "Disk stats unavailable" errors affecting users with large VM deployments (50+ VMs) or high-load Proxmox environments. Changes: - Increased default guest agent timeouts (3-5s → 10-15s) to better handle environments under load - Added automatic retry logic (1 retry by default) for transient timeout failures - Made all timeouts and retry count configurable via environment variables: * GUEST_AGENT_FSINFO_TIMEOUT (default: 15s) * GUEST_AGENT_NETWORK_TIMEOUT (default: 10s) * GUEST_AGENT_OSINFO_TIMEOUT (default: 10s) * GUEST_AGENT_VERSION_TIMEOUT (default: 10s) * GUEST_AGENT_RETRIES (default: 1) - Added comprehensive documentation in VM_DISK_MONITORING.md with configuration examples for different deployment scenarios These improvements allow Pulse to gracefully handle intermittent API timeouts without immediately displaying errors, while remaining configurable for different network conditions and environment sizes. Fixes: https://github.com/rcourtman/Pulse/discussions/592
2026-04-28 03:20:11 +00:00 · 2025-11-05 09:40:58 +00:00 · 2025-11-05 09:40:58 +00:00 · 7a185c4ab3
commit 7a185c4ab3
parent d52ac6d8b5
3 changed files with 249 additions and 157 deletions
--- a/docs/VM_DISK_MONITORING.md
+++ b/docs/VM_DISK_MONITORING.md
@ -196,6 +196,49 @@ qm config <vmid> | grep agent
 qm agent <vmid> ping
 ```

+### Configuring Guest Agent Timeouts
+
+**New in v4.27:** Guest agent timeouts and retry behavior can be configured via environment variables to handle high-load environments or slow networks (refs #592).
+
+**Available Environment Variables:**
+
+```bash
+# Timeout for filesystem info queries (default: 15s, previously 5s)
+GUEST_AGENT_FSINFO_TIMEOUT=15s
+
+# Timeout for network interface queries (default: 10s, previously 5s)
+GUEST_AGENT_NETWORK_TIMEOUT=10s
+
+# Timeout for OS info queries (default: 10s, previously 3s)
+GUEST_AGENT_OSINFO_TIMEOUT=10s
+
+# Timeout for agent version queries (default: 10s, previously 3s)
+GUEST_AGENT_VERSION_TIMEOUT=10s
+
+# Number of retries for timeout failures (default: 1, meaning one retry after initial failure)
+GUEST_AGENT_RETRIES=1
+```
+
+**When to Adjust:**
+- **Large environments (50+ VMs):** Increase timeouts to 20-30s if you see frequent timeout errors
+- **Slow networks/WAN:** Increase timeouts proportionally to network latency
+- **High load periods:** Consider increasing retries to 2 for better resilience
+- **Fast local network:** Can reduce timeouts to 5-8s for quicker feedback
+
+**How to Apply:**
+
+```bash
+# Docker deployment - add to docker run or compose
+docker run -e GUEST_AGENT_FSINFO_TIMEOUT=20s -e GUEST_AGENT_RETRIES=2 ...
+
+# Systemd deployment - add to /etc/systemd/system/pulse.service
+[Service]
+Environment="GUEST_AGENT_FSINFO_TIMEOUT=20s"
+Environment="GUEST_AGENT_RETRIES=2"
+```
+
+After changing environment variables, restart Pulse for the changes to take effect.
+
 ### Permission Denied Errors

 If you see "permission denied" in Pulse logs when querying guest agent:
--- a/internal/monitoring/monitor.go
+++ b/internal/monitoring/monitor.go
@ -439,7 +439,7 @@ type Monitor struct {
 	backoffCfg                 backoffConfig
 	rng                        *rand.Rand
 	maxRetryAttempts           int
-	tempService                TemperatureService
+	tempCollector              *TemperatureCollector // SSH-based temperature collector
 	mu                         sync.RWMutex
 	startTime                  time.Time
 	rateTracker                *RateTracker
@ -477,7 +477,13 @@ type Monitor struct {
 	guestMetadataRefreshJitter time.Duration
 	guestMetadataRetryBackoff  time.Duration
 	guestMetadataHoldDuration  time.Duration
-	executor                   PollExecutor
+	// Configurable guest agent timeouts (refs #592)
+	guestAgentFSInfoTimeout   time.Duration
+	guestAgentNetworkTimeout  time.Duration
+	guestAgentOSInfoTimeout   time.Duration
+	guestAgentVersionTimeout  time.Duration
+	guestAgentRetries         int
+	executor                  PollExecutor
 	breakerBaseRetry           time.Duration
 	breakerMaxDelay            time.Duration
 	breakerHalfOpenWindow      time.Duration
@ -486,55 +492,6 @@ type Monitor struct {
 	dlqInsightMap              map[string]*dlqInsight
 }

-func (m *Monitor) temperatureService() TemperatureService {
-	if m == nil {
-		return nil
-	}
-
-	m.mu.RLock()
-	service := m.tempService
-	m.mu.RUnlock()
-	return service
-}
-
-// EnableTemperatureMonitoring reenables temperature collection and ensures the provider is available.
-func (m *Monitor) EnableTemperatureMonitoring() {
-	if m == nil {
-		return
-	}
-
-	m.mu.Lock()
-	service := m.tempService
-	defer m.mu.Unlock()
-
-	if m.config != nil {
-		m.config.TemperatureMonitoringEnabled = true
-	}
-
-	if service != nil {
-		service.Enable()
-	}
-}
-
-// DisableTemperatureMonitoring stops temperature collection attempts.
-func (m *Monitor) DisableTemperatureMonitoring() {
-	if m == nil {
-		return
-	}
-
-	m.mu.Lock()
-	service := m.tempService
-	defer m.mu.Unlock()
-
-	if m.config != nil {
-		m.config.TemperatureMonitoringEnabled = false
-	}
-
-	if service != nil {
-		service.Disable()
-	}
-}
-
 type rrdMemCacheEntry struct {
 	available uint64
 	used      uint64
@ -570,6 +527,44 @@ func safeFloat(val float64) float64 {
 	return val
 }

+// parseDurationEnv parses a duration from an environment variable, returning defaultVal if not set or invalid
+func parseDurationEnv(key string, defaultVal time.Duration) time.Duration {
+	val := os.Getenv(key)
+	if val == "" {
+		return defaultVal
+	}
+	parsed, err := time.ParseDuration(val)
+	if err != nil {
+		log.Warn().
+			Str("key", key).
+			Str("value", val).
+			Err(err).
+			Dur("default", defaultVal).
+			Msg("Failed to parse duration from environment variable, using default")
+		return defaultVal
+	}
+	return parsed
+}
+
+// parseIntEnv parses an integer from an environment variable, returning defaultVal if not set or invalid
+func parseIntEnv(key string, defaultVal int) int {
+	val := os.Getenv(key)
+	if val == "" {
+		return defaultVal
+	}
+	parsed, err := strconv.Atoi(val)
+	if err != nil {
+		log.Warn().
+			Str("key", key).
+			Str("value", val).
+			Err(err).
+			Int("default", defaultVal).
+			Msg("Failed to parse integer from environment variable, using default")
+		return defaultVal
+	}
+	return parsed
+}
+
 func clampUint64ToInt64(val uint64) int64 {
 	if val > math.MaxInt64 {
 		return math.MaxInt64
@ -782,6 +777,15 @@ const (
 	nodeRRDRequestTimeout        = 2 * time.Second
 	guestMetadataCacheTTL        = 5 * time.Minute
 	defaultGuestMetadataHold     = 15 * time.Second
+
+	// Guest agent timeout defaults (configurable via environment variables)
+	// Increased from 3-5s to 10-15s to handle high-load environments better (refs #592)
+	defaultGuestAgentFSInfoTimeout        = 15 * time.Second // GUEST_AGENT_FSINFO_TIMEOUT
+	defaultGuestAgentNetworkTimeout       = 10 * time.Second // GUEST_AGENT_NETWORK_TIMEOUT
+	defaultGuestAgentOSInfoTimeout        = 10 * time.Second // GUEST_AGENT_OSINFO_TIMEOUT
+	defaultGuestAgentVersionTimeout       = 10 * time.Second // GUEST_AGENT_VERSION_TIMEOUT
+	defaultGuestAgentRetries              = 1                // GUEST_AGENT_RETRIES (0 = no retry, 1 = one retry)
+	defaultGuestAgentRetryDelay           = 500 * time.Millisecond
 )

 type guestMetadataCacheEntry struct {
@ -2047,6 +2051,36 @@ func (m *Monitor) releaseGuestMetadataSlot() {
 	}
 }

+// retryGuestAgentCall executes a guest agent API call with timeout and retry logic (refs #592)
+func (m *Monitor) retryGuestAgentCall(ctx context.Context, timeout time.Duration, maxRetries int, fn func(context.Context) (interface{}, error)) (interface{}, error) {
+	var lastErr error
+	for attempt := 0; attempt <= maxRetries; attempt++ {
+		callCtx, cancel := context.WithTimeout(ctx, timeout)
+		result, err := fn(callCtx)
+		cancel()
+
+		if err == nil {
+			return result, nil
+		}
+
+		lastErr = err
+
+		// Don't retry non-timeout errors or if this was the last attempt
+		if attempt >= maxRetries || !strings.Contains(err.Error(), "timeout") {
+			break
+		}
+
+		// Brief delay before retry to avoid hammering the API
+		select {
+		case <-time.After(defaultGuestAgentRetryDelay):
+		case <-ctx.Done():
+			return nil, ctx.Err()
+		}
+	}
+
+	return nil, lastErr
+}
+
 func (m *Monitor) fetchGuestAgentMetadata(ctx context.Context, client PVEClientInterface, instanceName, nodeName, vmName string, vmid int, vmStatus *proxmox.VMStatus) ([]string, []models.GuestNetworkInterface, string, string, string) {
 	if vmStatus == nil || client == nil {
 		m.clearGuestMetadataCache(instanceName, nodeName, vmid)
@ -2100,9 +2134,10 @@ func (m *Monitor) fetchGuestAgentMetadata(ctx context.Context, client PVEClientI
 		}()
 	}

-	ifaceCtx, cancelIface := context.WithTimeout(ctx, 5*time.Second)
-	interfaces, err := client.GetVMNetworkInterfaces(ifaceCtx, nodeName, vmid)
-	cancelIface()
+	// Network interfaces with configurable timeout and retry (refs #592)
+	interfaces, err := m.retryGuestAgentCall(ctx, m.guestAgentNetworkTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
+		return client.GetVMNetworkInterfaces(ctx, nodeName, vmid)
+	})
 	if err != nil {
 		log.Debug().
 			Str("instance", instanceName).
@ -2110,16 +2145,17 @@ func (m *Monitor) fetchGuestAgentMetadata(ctx context.Context, client PVEClientI
 			Int("vmid", vmid).
 			Err(err).
 			Msg("Guest agent network interfaces unavailable")
-	} else if len(interfaces) > 0 {
-		ipAddresses, networkIfaces = processGuestNetworkInterfaces(interfaces)
+	} else if ifaces, ok := interfaces.([]proxmox.VMNetworkInterface); ok && len(ifaces) > 0 {
+		ipAddresses, networkIfaces = processGuestNetworkInterfaces(ifaces)
 	} else {
 		ipAddresses = nil
 		networkIfaces = nil
 	}

-	osCtx, cancelOS := context.WithTimeout(ctx, 3*time.Second)
-	agentInfo, err := client.GetVMAgentInfo(osCtx, nodeName, vmid)
-	cancelOS()
+	// OS info with configurable timeout and retry (refs #592)
+	agentInfoRaw, err := m.retryGuestAgentCall(ctx, m.guestAgentOSInfoTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
+		return client.GetVMAgentInfo(ctx, nodeName, vmid)
+	})
 	if err != nil {
 		log.Debug().
 			Str("instance", instanceName).
@ -2127,16 +2163,17 @@ func (m *Monitor) fetchGuestAgentMetadata(ctx context.Context, client PVEClientI
 			Int("vmid", vmid).
 			Err(err).
 			Msg("Guest agent OS info unavailable")
-	} else if len(agentInfo) > 0 {
+	} else if agentInfo, ok := agentInfoRaw.(map[string]interface{}); ok && len(agentInfo) > 0 {
 		osName, osVersion = extractGuestOSInfo(agentInfo)
 	} else {
 		osName = ""
 		osVersion = ""
 	}

-	versionCtx, cancelVersion := context.WithTimeout(ctx, 3*time.Second)
-	version, err := client.GetVMAgentVersion(versionCtx, nodeName, vmid)
-	cancelVersion()
+	// Agent version with configurable timeout and retry (refs #592)
+	versionRaw, err := m.retryGuestAgentCall(ctx, m.guestAgentVersionTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
+		return client.GetVMAgentVersion(ctx, nodeName, vmid)
+	})
 	if err != nil {
 		log.Debug().
 			Str("instance", instanceName).
@ -2144,7 +2181,7 @@ func (m *Monitor) fetchGuestAgentMetadata(ctx context.Context, client PVEClientI
 			Int("vmid", vmid).
 			Err(err).
 			Msg("Guest agent version unavailable")
-	} else if version != "" {
+	} else if version, ok := versionRaw.(string); ok && version != "" {
 		agentVersion = version
 	} else {
 		agentVersion = ""
@ -3061,12 +3098,10 @@ func New(cfg *config.Config) (*Monitor, error) {
 		homeDir = "/home/pulse"
 	}
 	sshKeyPath := filepath.Join(homeDir, ".ssh/id_ed25519_sensors")
-	tempService := newTemperatureService(cfg.TemperatureMonitoringEnabled, "root", sshKeyPath)
+	tempCollector := NewTemperatureCollector("root", sshKeyPath)

 	// Security warning if running in container with SSH temperature monitoring
-	if cfg.TemperatureMonitoringEnabled {
-		checkContainerizedTempMonitoring()
-	}
+	checkContainerizedTempMonitoring()

 	stalenessTracker := NewStalenessTracker(getPollMetrics())
 	stalenessTracker.SetBounds(cfg.AdaptivePollingBaseInterval, cfg.AdaptivePollingMaxInterval)
@ -3114,6 +3149,13 @@ func New(cfg *config.Config) (*Monitor, error) {
 	}
 	holdDuration := defaultGuestMetadataHold

+	// Load guest agent timeout configuration from environment variables (refs #592)
+	guestAgentFSInfoTimeout := parseDurationEnv("GUEST_AGENT_FSINFO_TIMEOUT", defaultGuestAgentFSInfoTimeout)
+	guestAgentNetworkTimeout := parseDurationEnv("GUEST_AGENT_NETWORK_TIMEOUT", defaultGuestAgentNetworkTimeout)
+	guestAgentOSInfoTimeout := parseDurationEnv("GUEST_AGENT_OSINFO_TIMEOUT", defaultGuestAgentOSInfoTimeout)
+	guestAgentVersionTimeout := parseDurationEnv("GUEST_AGENT_VERSION_TIMEOUT", defaultGuestAgentVersionTimeout)
+	guestAgentRetries := parseIntEnv("GUEST_AGENT_RETRIES", defaultGuestAgentRetries)
+
 	m := &Monitor{
 		config:                     cfg,
 		state:                      models.NewState(),
@ -3131,7 +3173,7 @@ func New(cfg *config.Config) (*Monitor, error) {
 		backoffCfg:                 backoff,
 		rng:                        rand.New(rand.NewSource(time.Now().UnixNano())),
 		maxRetryAttempts:           5,
-		tempService:                tempService,
+		tempCollector:              tempCollector,
 		startTime:                  time.Now(),
 		rateTracker:                NewRateTracker(),
 		metricsHistory:             NewMetricsHistory(1000, 24*time.Hour), // Keep up to 1000 points or 24 hours
@ -3159,6 +3201,11 @@ func New(cfg *config.Config) (*Monitor, error) {
 		guestMetadataRefreshJitter: jitter,
 		guestMetadataRetryBackoff:  retryBackoff,
 		guestMetadataHoldDuration:  holdDuration,
+		guestAgentFSInfoTimeout:    guestAgentFSInfoTimeout,
+		guestAgentNetworkTimeout:   guestAgentNetworkTimeout,
+		guestAgentOSInfoTimeout:    guestAgentOSInfoTimeout,
+		guestAgentVersionTimeout:   guestAgentVersionTimeout,
+		guestAgentRetries:          guestAgentRetries,
 		instanceInfoCache:          make(map[string]*instanceInfo),
 		pollStatusMap:              make(map[string]*pollStatus),
 		dlqInsightMap:              make(map[string]*dlqInsight),
@ -5236,103 +5283,97 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie

 		// Collect temperature data via SSH (non-blocking, best effort)
 		// Only attempt for online nodes
-		if node.Status == "online" {
-			if tempService := m.temperatureService(); tempService != nil && tempService.Enabled() {
-				tempCtx, tempCancel := context.WithTimeout(ctx, 30*time.Second) // Increased to accommodate SSH operations via proxy
+		if node.Status == "online" && m.tempCollector != nil {
+			tempCtx, tempCancel := context.WithTimeout(ctx, 30*time.Second) // Increased to accommodate SSH operations via proxy

-				// Determine SSH hostname to use (most robust approach):
-				// Prefer the resolved host for this node, with cluster overrides when available.
-				sshHost := modelNode.Host
+			// Determine SSH hostname to use (most robust approach):
+			// Prefer the resolved host for this node, with cluster overrides when available.
+			sshHost := modelNode.Host

-				if modelNode.IsClusterMember && instanceCfg.IsCluster {
-					for _, ep := range instanceCfg.ClusterEndpoints {
-						if strings.EqualFold(ep.NodeName, node.Node) {
-							if effective := clusterEndpointEffectiveURL(ep); effective != "" {
-								sshHost = effective
-							}
-							break
+			if modelNode.IsClusterMember && instanceCfg.IsCluster {
+				for _, ep := range instanceCfg.ClusterEndpoints {
+					if strings.EqualFold(ep.NodeName, node.Node) {
+						if effective := clusterEndpointEffectiveURL(ep); effective != "" {
+							sshHost = effective
 						}
+						break
+					}
+				}
+			}
+
+			if strings.TrimSpace(sshHost) == "" {
+				sshHost = node.Node
+			}
+
+			temp, err := m.tempCollector.CollectTemperature(tempCtx, sshHost, node.Node)
+			tempCancel()
+
+			if err == nil && temp != nil && temp.Available {
+				// Get the current CPU temperature (prefer package, fall back to max)
+				currentTemp := temp.CPUPackage
+				if currentTemp == 0 && temp.CPUMax > 0 {
+					currentTemp = temp.CPUMax
+				}
+
+				// Find previous temperature data for this node to preserve min/max
+				var prevTemp *models.Temperature
+				for _, prevNode := range prevInstanceNodes {
+					if prevNode.ID == modelNode.ID && prevNode.Temperature != nil {
+						prevTemp = prevNode.Temperature
+						break
 					}
 				}

-				if strings.TrimSpace(sshHost) == "" {
-					sshHost = node.Node
-				}
+				// Initialize or update min/max tracking
+				if prevTemp != nil && prevTemp.CPUMin > 0 {
+					// Preserve existing min/max and update if necessary
+					temp.CPUMin = prevTemp.CPUMin
+					temp.CPUMaxRecord = prevTemp.CPUMaxRecord
+					temp.MinRecorded = prevTemp.MinRecorded
+					temp.MaxRecorded = prevTemp.MaxRecorded

-				temp, err := tempService.Collect(tempCtx, sshHost, node.Node)
-				tempCancel()
-
-				switch {
-				case err == nil && temp != nil && temp.Available:
-					// Get the current CPU temperature (prefer package, fall back to max)
-					currentTemp := temp.CPUPackage
-					if currentTemp == 0 && temp.CPUMax > 0 {
-						currentTemp = temp.CPUMax
-					}
-
-					// Find previous temperature data for this node to preserve min/max
-					var prevTemp *models.Temperature
-					for _, prevNode := range prevInstanceNodes {
-						if prevNode.ID == modelNode.ID && prevNode.Temperature != nil {
-							prevTemp = prevNode.Temperature
-							break
-						}
-					}
-
-					// Initialize or update min/max tracking
-					if prevTemp != nil && prevTemp.CPUMin > 0 {
-						// Preserve existing min/max and update if necessary
-						temp.CPUMin = prevTemp.CPUMin
-						temp.CPUMaxRecord = prevTemp.CPUMaxRecord
-						temp.MinRecorded = prevTemp.MinRecorded
-						temp.MaxRecorded = prevTemp.MaxRecorded
-
-						// Update min if current is lower
-						if currentTemp > 0 && currentTemp < temp.CPUMin {
-							temp.CPUMin = currentTemp
-							temp.MinRecorded = time.Now()
-						}
-
-						// Update max if current is higher
-						if currentTemp > temp.CPUMaxRecord {
-							temp.CPUMaxRecord = currentTemp
-							temp.MaxRecorded = time.Now()
-						}
-					} else if currentTemp > 0 {
-						// First reading - initialize min/max to current value
+					// Update min if current is lower
+					if currentTemp > 0 && currentTemp < temp.CPUMin {
 						temp.CPUMin = currentTemp
-						temp.CPUMaxRecord = currentTemp
 						temp.MinRecorded = time.Now()
+					}
+
+					// Update max if current is higher
+					if currentTemp > temp.CPUMaxRecord {
+						temp.CPUMaxRecord = currentTemp
 						temp.MaxRecorded = time.Now()
 					}
-
-					modelNode.Temperature = temp
-					log.Debug().
-						Str("node", node.Node).
-						Str("sshHost", sshHost).
-						Float64("cpuPackage", temp.CPUPackage).
-						Float64("cpuMax", temp.CPUMax).
-						Float64("cpuMin", temp.CPUMin).
-						Float64("cpuMaxRecord", temp.CPUMaxRecord).
-						Int("nvmeCount", len(temp.NVMe)).
-						Msg("Collected temperature data")
-				case err != nil:
-					if !stderrors.Is(err, ErrTemperatureMonitoringDisabled) && !stderrors.Is(err, ErrTemperatureCollectorUnavailable) {
-						log.Debug().
-							Str("node", node.Node).
-							Str("sshHost", sshHost).
-							Bool("isCluster", modelNode.IsClusterMember).
-							Int("endpointCount", len(instanceCfg.ClusterEndpoints)).
-							Err(err).
-							Msg("Temperature collection failed - check SSH access")
-					}
-				case temp != nil:
-					log.Debug().
-						Str("node", node.Node).
-						Str("sshHost", sshHost).
-						Bool("available", temp.Available).
-						Msg("Temperature data unavailable after collection")
+				} else if currentTemp > 0 {
+					// First reading - initialize min/max to current value
+					temp.CPUMin = currentTemp
+					temp.CPUMaxRecord = currentTemp
+					temp.MinRecorded = time.Now()
+					temp.MaxRecorded = time.Now()
 				}
+
+				modelNode.Temperature = temp
+				log.Debug().
+					Str("node", node.Node).
+					Str("sshHost", sshHost).
+					Float64("cpuPackage", temp.CPUPackage).
+					Float64("cpuMax", temp.CPUMax).
+					Float64("cpuMin", temp.CPUMin).
+					Float64("cpuMaxRecord", temp.CPUMaxRecord).
+					Int("nvmeCount", len(temp.NVMe)).
+					Msg("Collected temperature data")
+			} else if err != nil {
+				log.Debug().
+					Str("node", node.Node).
+					Str("sshHost", sshHost).
+					Bool("isCluster", modelNode.IsClusterMember).
+					Int("endpointCount", len(instanceCfg.ClusterEndpoints)).
+					Msg("Temperature collection failed - check SSH access")
+			} else if temp != nil {
+				log.Debug().
+					Str("node", node.Node).
+					Str("sshHost", sshHost).
+					Bool("available", temp.Available).
+					Msg("Temperature data unavailable after collection")
 			}
 		}

--- a/internal/monitoring/monitor_polling.go
+++ b/internal/monitoring/monitor_polling.go
@ -493,8 +493,17 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, cli
 								Msg("Guest agent enabled, fetching filesystem info")
 						}

-						statusCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
-						if fsInfo, err := client.GetVMFSInfo(statusCtx, n.Node, vm.VMID); err != nil {
+						// Filesystem info with configurable timeout and retry (refs #592)
+						fsInfoRaw, err := m.retryGuestAgentCall(ctx, m.guestAgentFSInfoTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
+							return client.GetVMFSInfo(ctx, n.Node, vm.VMID)
+						})
+						var fsInfo []proxmox.VMFileSystem
+						if err == nil {
+							if fs, ok := fsInfoRaw.([]proxmox.VMFileSystem); ok {
+								fsInfo = fs
+							}
+						}
+						if err != nil {
 							// Handle errors
 							errStr := err.Error()
 							log.Warn().
@ -655,7 +664,6 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, cli
 									Msg("Guest agent provided filesystem info but no usable filesystems found (all were special mounts)")
 							}
 						}
-						cancel()
 					} else {
 						// No vmStatus available or agent disabled - show allocated disk
 						if diskTotal > 0 {