Improve guest agent timeout handling for high-load environments (refs #592)

This change addresses intermittent "Guest details unavailable" and "Disk stats
unavailable" errors affecting users with large VM deployments (50+ VMs) or
high-load Proxmox environments.

Changes:
- Increased default guest agent timeouts (3-5s → 10-15s) to better handle
  environments under load
- Added automatic retry logic (1 retry by default) for transient timeout failures
- Made all timeouts and retry count configurable via environment variables:
  * GUEST_AGENT_FSINFO_TIMEOUT (default: 15s)
  * GUEST_AGENT_NETWORK_TIMEOUT (default: 10s)
  * GUEST_AGENT_OSINFO_TIMEOUT (default: 10s)
  * GUEST_AGENT_VERSION_TIMEOUT (default: 10s)
  * GUEST_AGENT_RETRIES (default: 1)
- Added comprehensive documentation in VM_DISK_MONITORING.md with configuration
  examples for different deployment scenarios

These improvements allow Pulse to gracefully handle intermittent API timeouts
without immediately displaying errors, while remaining configurable for
different network conditions and environment sizes.

Fixes: https://github.com/rcourtman/Pulse/discussions/592
This commit is contained in:
rcourtman 2025-11-05 09:40:58 +00:00
parent d52ac6d8b5
commit 7a185c4ab3
3 changed files with 249 additions and 157 deletions

View file

@ -196,6 +196,49 @@ qm config <vmid> | grep agent
qm agent <vmid> ping
```
### Configuring Guest Agent Timeouts
**New in v4.27:** Guest agent timeouts and retry behavior can be configured via environment variables to handle high-load environments or slow networks (refs #592).
**Available Environment Variables:**
```bash
# Timeout for filesystem info queries (default: 15s, previously 5s)
GUEST_AGENT_FSINFO_TIMEOUT=15s
# Timeout for network interface queries (default: 10s, previously 5s)
GUEST_AGENT_NETWORK_TIMEOUT=10s
# Timeout for OS info queries (default: 10s, previously 3s)
GUEST_AGENT_OSINFO_TIMEOUT=10s
# Timeout for agent version queries (default: 10s, previously 3s)
GUEST_AGENT_VERSION_TIMEOUT=10s
# Number of retries for timeout failures (default: 1, meaning one retry after initial failure)
GUEST_AGENT_RETRIES=1
```
**When to Adjust:**
- **Large environments (50+ VMs):** Increase timeouts to 20-30s if you see frequent timeout errors
- **Slow networks/WAN:** Increase timeouts proportionally to network latency
- **High load periods:** Consider increasing retries to 2 for better resilience
- **Fast local network:** Can reduce timeouts to 5-8s for quicker feedback
**How to Apply:**
```bash
# Docker deployment - add to docker run or compose
docker run -e GUEST_AGENT_FSINFO_TIMEOUT=20s -e GUEST_AGENT_RETRIES=2 ...
# Systemd deployment - add to /etc/systemd/system/pulse.service
[Service]
Environment="GUEST_AGENT_FSINFO_TIMEOUT=20s"
Environment="GUEST_AGENT_RETRIES=2"
```
After changing environment variables, restart Pulse for the changes to take effect.
### Permission Denied Errors
If you see "permission denied" in Pulse logs when querying guest agent:

View file

@ -439,7 +439,7 @@ type Monitor struct {
backoffCfg backoffConfig
rng *rand.Rand
maxRetryAttempts int
tempService TemperatureService
tempCollector *TemperatureCollector // SSH-based temperature collector
mu sync.RWMutex
startTime time.Time
rateTracker *RateTracker
@ -477,7 +477,13 @@ type Monitor struct {
guestMetadataRefreshJitter time.Duration
guestMetadataRetryBackoff time.Duration
guestMetadataHoldDuration time.Duration
executor PollExecutor
// Configurable guest agent timeouts (refs #592)
guestAgentFSInfoTimeout time.Duration
guestAgentNetworkTimeout time.Duration
guestAgentOSInfoTimeout time.Duration
guestAgentVersionTimeout time.Duration
guestAgentRetries int
executor PollExecutor
breakerBaseRetry time.Duration
breakerMaxDelay time.Duration
breakerHalfOpenWindow time.Duration
@ -486,55 +492,6 @@ type Monitor struct {
dlqInsightMap map[string]*dlqInsight
}
func (m *Monitor) temperatureService() TemperatureService {
if m == nil {
return nil
}
m.mu.RLock()
service := m.tempService
m.mu.RUnlock()
return service
}
// EnableTemperatureMonitoring reenables temperature collection and ensures the provider is available.
func (m *Monitor) EnableTemperatureMonitoring() {
if m == nil {
return
}
m.mu.Lock()
service := m.tempService
defer m.mu.Unlock()
if m.config != nil {
m.config.TemperatureMonitoringEnabled = true
}
if service != nil {
service.Enable()
}
}
// DisableTemperatureMonitoring stops temperature collection attempts.
func (m *Monitor) DisableTemperatureMonitoring() {
if m == nil {
return
}
m.mu.Lock()
service := m.tempService
defer m.mu.Unlock()
if m.config != nil {
m.config.TemperatureMonitoringEnabled = false
}
if service != nil {
service.Disable()
}
}
type rrdMemCacheEntry struct {
available uint64
used uint64
@ -570,6 +527,44 @@ func safeFloat(val float64) float64 {
return val
}
// parseDurationEnv parses a duration from an environment variable, returning defaultVal if not set or invalid
func parseDurationEnv(key string, defaultVal time.Duration) time.Duration {
val := os.Getenv(key)
if val == "" {
return defaultVal
}
parsed, err := time.ParseDuration(val)
if err != nil {
log.Warn().
Str("key", key).
Str("value", val).
Err(err).
Dur("default", defaultVal).
Msg("Failed to parse duration from environment variable, using default")
return defaultVal
}
return parsed
}
// parseIntEnv parses an integer from an environment variable, returning defaultVal if not set or invalid
func parseIntEnv(key string, defaultVal int) int {
val := os.Getenv(key)
if val == "" {
return defaultVal
}
parsed, err := strconv.Atoi(val)
if err != nil {
log.Warn().
Str("key", key).
Str("value", val).
Err(err).
Int("default", defaultVal).
Msg("Failed to parse integer from environment variable, using default")
return defaultVal
}
return parsed
}
func clampUint64ToInt64(val uint64) int64 {
if val > math.MaxInt64 {
return math.MaxInt64
@ -782,6 +777,15 @@ const (
nodeRRDRequestTimeout = 2 * time.Second
guestMetadataCacheTTL = 5 * time.Minute
defaultGuestMetadataHold = 15 * time.Second
// Guest agent timeout defaults (configurable via environment variables)
// Increased from 3-5s to 10-15s to handle high-load environments better (refs #592)
defaultGuestAgentFSInfoTimeout = 15 * time.Second // GUEST_AGENT_FSINFO_TIMEOUT
defaultGuestAgentNetworkTimeout = 10 * time.Second // GUEST_AGENT_NETWORK_TIMEOUT
defaultGuestAgentOSInfoTimeout = 10 * time.Second // GUEST_AGENT_OSINFO_TIMEOUT
defaultGuestAgentVersionTimeout = 10 * time.Second // GUEST_AGENT_VERSION_TIMEOUT
defaultGuestAgentRetries = 1 // GUEST_AGENT_RETRIES (0 = no retry, 1 = one retry)
defaultGuestAgentRetryDelay = 500 * time.Millisecond
)
type guestMetadataCacheEntry struct {
@ -2047,6 +2051,36 @@ func (m *Monitor) releaseGuestMetadataSlot() {
}
}
// retryGuestAgentCall executes a guest agent API call with timeout and retry logic (refs #592)
func (m *Monitor) retryGuestAgentCall(ctx context.Context, timeout time.Duration, maxRetries int, fn func(context.Context) (interface{}, error)) (interface{}, error) {
var lastErr error
for attempt := 0; attempt <= maxRetries; attempt++ {
callCtx, cancel := context.WithTimeout(ctx, timeout)
result, err := fn(callCtx)
cancel()
if err == nil {
return result, nil
}
lastErr = err
// Don't retry non-timeout errors or if this was the last attempt
if attempt >= maxRetries || !strings.Contains(err.Error(), "timeout") {
break
}
// Brief delay before retry to avoid hammering the API
select {
case <-time.After(defaultGuestAgentRetryDelay):
case <-ctx.Done():
return nil, ctx.Err()
}
}
return nil, lastErr
}
func (m *Monitor) fetchGuestAgentMetadata(ctx context.Context, client PVEClientInterface, instanceName, nodeName, vmName string, vmid int, vmStatus *proxmox.VMStatus) ([]string, []models.GuestNetworkInterface, string, string, string) {
if vmStatus == nil || client == nil {
m.clearGuestMetadataCache(instanceName, nodeName, vmid)
@ -2100,9 +2134,10 @@ func (m *Monitor) fetchGuestAgentMetadata(ctx context.Context, client PVEClientI
}()
}
ifaceCtx, cancelIface := context.WithTimeout(ctx, 5*time.Second)
interfaces, err := client.GetVMNetworkInterfaces(ifaceCtx, nodeName, vmid)
cancelIface()
// Network interfaces with configurable timeout and retry (refs #592)
interfaces, err := m.retryGuestAgentCall(ctx, m.guestAgentNetworkTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
return client.GetVMNetworkInterfaces(ctx, nodeName, vmid)
})
if err != nil {
log.Debug().
Str("instance", instanceName).
@ -2110,16 +2145,17 @@ func (m *Monitor) fetchGuestAgentMetadata(ctx context.Context, client PVEClientI
Int("vmid", vmid).
Err(err).
Msg("Guest agent network interfaces unavailable")
} else if len(interfaces) > 0 {
ipAddresses, networkIfaces = processGuestNetworkInterfaces(interfaces)
} else if ifaces, ok := interfaces.([]proxmox.VMNetworkInterface); ok && len(ifaces) > 0 {
ipAddresses, networkIfaces = processGuestNetworkInterfaces(ifaces)
} else {
ipAddresses = nil
networkIfaces = nil
}
osCtx, cancelOS := context.WithTimeout(ctx, 3*time.Second)
agentInfo, err := client.GetVMAgentInfo(osCtx, nodeName, vmid)
cancelOS()
// OS info with configurable timeout and retry (refs #592)
agentInfoRaw, err := m.retryGuestAgentCall(ctx, m.guestAgentOSInfoTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
return client.GetVMAgentInfo(ctx, nodeName, vmid)
})
if err != nil {
log.Debug().
Str("instance", instanceName).
@ -2127,16 +2163,17 @@ func (m *Monitor) fetchGuestAgentMetadata(ctx context.Context, client PVEClientI
Int("vmid", vmid).
Err(err).
Msg("Guest agent OS info unavailable")
} else if len(agentInfo) > 0 {
} else if agentInfo, ok := agentInfoRaw.(map[string]interface{}); ok && len(agentInfo) > 0 {
osName, osVersion = extractGuestOSInfo(agentInfo)
} else {
osName = ""
osVersion = ""
}
versionCtx, cancelVersion := context.WithTimeout(ctx, 3*time.Second)
version, err := client.GetVMAgentVersion(versionCtx, nodeName, vmid)
cancelVersion()
// Agent version with configurable timeout and retry (refs #592)
versionRaw, err := m.retryGuestAgentCall(ctx, m.guestAgentVersionTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
return client.GetVMAgentVersion(ctx, nodeName, vmid)
})
if err != nil {
log.Debug().
Str("instance", instanceName).
@ -2144,7 +2181,7 @@ func (m *Monitor) fetchGuestAgentMetadata(ctx context.Context, client PVEClientI
Int("vmid", vmid).
Err(err).
Msg("Guest agent version unavailable")
} else if version != "" {
} else if version, ok := versionRaw.(string); ok && version != "" {
agentVersion = version
} else {
agentVersion = ""
@ -3061,12 +3098,10 @@ func New(cfg *config.Config) (*Monitor, error) {
homeDir = "/home/pulse"
}
sshKeyPath := filepath.Join(homeDir, ".ssh/id_ed25519_sensors")
tempService := newTemperatureService(cfg.TemperatureMonitoringEnabled, "root", sshKeyPath)
tempCollector := NewTemperatureCollector("root", sshKeyPath)
// Security warning if running in container with SSH temperature monitoring
if cfg.TemperatureMonitoringEnabled {
checkContainerizedTempMonitoring()
}
checkContainerizedTempMonitoring()
stalenessTracker := NewStalenessTracker(getPollMetrics())
stalenessTracker.SetBounds(cfg.AdaptivePollingBaseInterval, cfg.AdaptivePollingMaxInterval)
@ -3114,6 +3149,13 @@ func New(cfg *config.Config) (*Monitor, error) {
}
holdDuration := defaultGuestMetadataHold
// Load guest agent timeout configuration from environment variables (refs #592)
guestAgentFSInfoTimeout := parseDurationEnv("GUEST_AGENT_FSINFO_TIMEOUT", defaultGuestAgentFSInfoTimeout)
guestAgentNetworkTimeout := parseDurationEnv("GUEST_AGENT_NETWORK_TIMEOUT", defaultGuestAgentNetworkTimeout)
guestAgentOSInfoTimeout := parseDurationEnv("GUEST_AGENT_OSINFO_TIMEOUT", defaultGuestAgentOSInfoTimeout)
guestAgentVersionTimeout := parseDurationEnv("GUEST_AGENT_VERSION_TIMEOUT", defaultGuestAgentVersionTimeout)
guestAgentRetries := parseIntEnv("GUEST_AGENT_RETRIES", defaultGuestAgentRetries)
m := &Monitor{
config: cfg,
state: models.NewState(),
@ -3131,7 +3173,7 @@ func New(cfg *config.Config) (*Monitor, error) {
backoffCfg: backoff,
rng: rand.New(rand.NewSource(time.Now().UnixNano())),
maxRetryAttempts: 5,
tempService: tempService,
tempCollector: tempCollector,
startTime: time.Now(),
rateTracker: NewRateTracker(),
metricsHistory: NewMetricsHistory(1000, 24*time.Hour), // Keep up to 1000 points or 24 hours
@ -3159,6 +3201,11 @@ func New(cfg *config.Config) (*Monitor, error) {
guestMetadataRefreshJitter: jitter,
guestMetadataRetryBackoff: retryBackoff,
guestMetadataHoldDuration: holdDuration,
guestAgentFSInfoTimeout: guestAgentFSInfoTimeout,
guestAgentNetworkTimeout: guestAgentNetworkTimeout,
guestAgentOSInfoTimeout: guestAgentOSInfoTimeout,
guestAgentVersionTimeout: guestAgentVersionTimeout,
guestAgentRetries: guestAgentRetries,
instanceInfoCache: make(map[string]*instanceInfo),
pollStatusMap: make(map[string]*pollStatus),
dlqInsightMap: make(map[string]*dlqInsight),
@ -5236,103 +5283,97 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
// Collect temperature data via SSH (non-blocking, best effort)
// Only attempt for online nodes
if node.Status == "online" {
if tempService := m.temperatureService(); tempService != nil && tempService.Enabled() {
tempCtx, tempCancel := context.WithTimeout(ctx, 30*time.Second) // Increased to accommodate SSH operations via proxy
if node.Status == "online" && m.tempCollector != nil {
tempCtx, tempCancel := context.WithTimeout(ctx, 30*time.Second) // Increased to accommodate SSH operations via proxy
// Determine SSH hostname to use (most robust approach):
// Prefer the resolved host for this node, with cluster overrides when available.
sshHost := modelNode.Host
// Determine SSH hostname to use (most robust approach):
// Prefer the resolved host for this node, with cluster overrides when available.
sshHost := modelNode.Host
if modelNode.IsClusterMember && instanceCfg.IsCluster {
for _, ep := range instanceCfg.ClusterEndpoints {
if strings.EqualFold(ep.NodeName, node.Node) {
if effective := clusterEndpointEffectiveURL(ep); effective != "" {
sshHost = effective
}
break
if modelNode.IsClusterMember && instanceCfg.IsCluster {
for _, ep := range instanceCfg.ClusterEndpoints {
if strings.EqualFold(ep.NodeName, node.Node) {
if effective := clusterEndpointEffectiveURL(ep); effective != "" {
sshHost = effective
}
break
}
}
}
if strings.TrimSpace(sshHost) == "" {
sshHost = node.Node
}
temp, err := m.tempCollector.CollectTemperature(tempCtx, sshHost, node.Node)
tempCancel()
if err == nil && temp != nil && temp.Available {
// Get the current CPU temperature (prefer package, fall back to max)
currentTemp := temp.CPUPackage
if currentTemp == 0 && temp.CPUMax > 0 {
currentTemp = temp.CPUMax
}
// Find previous temperature data for this node to preserve min/max
var prevTemp *models.Temperature
for _, prevNode := range prevInstanceNodes {
if prevNode.ID == modelNode.ID && prevNode.Temperature != nil {
prevTemp = prevNode.Temperature
break
}
}
if strings.TrimSpace(sshHost) == "" {
sshHost = node.Node
}
// Initialize or update min/max tracking
if prevTemp != nil && prevTemp.CPUMin > 0 {
// Preserve existing min/max and update if necessary
temp.CPUMin = prevTemp.CPUMin
temp.CPUMaxRecord = prevTemp.CPUMaxRecord
temp.MinRecorded = prevTemp.MinRecorded
temp.MaxRecorded = prevTemp.MaxRecorded
temp, err := tempService.Collect(tempCtx, sshHost, node.Node)
tempCancel()
switch {
case err == nil && temp != nil && temp.Available:
// Get the current CPU temperature (prefer package, fall back to max)
currentTemp := temp.CPUPackage
if currentTemp == 0 && temp.CPUMax > 0 {
currentTemp = temp.CPUMax
}
// Find previous temperature data for this node to preserve min/max
var prevTemp *models.Temperature
for _, prevNode := range prevInstanceNodes {
if prevNode.ID == modelNode.ID && prevNode.Temperature != nil {
prevTemp = prevNode.Temperature
break
}
}
// Initialize or update min/max tracking
if prevTemp != nil && prevTemp.CPUMin > 0 {
// Preserve existing min/max and update if necessary
temp.CPUMin = prevTemp.CPUMin
temp.CPUMaxRecord = prevTemp.CPUMaxRecord
temp.MinRecorded = prevTemp.MinRecorded
temp.MaxRecorded = prevTemp.MaxRecorded
// Update min if current is lower
if currentTemp > 0 && currentTemp < temp.CPUMin {
temp.CPUMin = currentTemp
temp.MinRecorded = time.Now()
}
// Update max if current is higher
if currentTemp > temp.CPUMaxRecord {
temp.CPUMaxRecord = currentTemp
temp.MaxRecorded = time.Now()
}
} else if currentTemp > 0 {
// First reading - initialize min/max to current value
// Update min if current is lower
if currentTemp > 0 && currentTemp < temp.CPUMin {
temp.CPUMin = currentTemp
temp.CPUMaxRecord = currentTemp
temp.MinRecorded = time.Now()
}
// Update max if current is higher
if currentTemp > temp.CPUMaxRecord {
temp.CPUMaxRecord = currentTemp
temp.MaxRecorded = time.Now()
}
modelNode.Temperature = temp
log.Debug().
Str("node", node.Node).
Str("sshHost", sshHost).
Float64("cpuPackage", temp.CPUPackage).
Float64("cpuMax", temp.CPUMax).
Float64("cpuMin", temp.CPUMin).
Float64("cpuMaxRecord", temp.CPUMaxRecord).
Int("nvmeCount", len(temp.NVMe)).
Msg("Collected temperature data")
case err != nil:
if !stderrors.Is(err, ErrTemperatureMonitoringDisabled) && !stderrors.Is(err, ErrTemperatureCollectorUnavailable) {
log.Debug().
Str("node", node.Node).
Str("sshHost", sshHost).
Bool("isCluster", modelNode.IsClusterMember).
Int("endpointCount", len(instanceCfg.ClusterEndpoints)).
Err(err).
Msg("Temperature collection failed - check SSH access")
}
case temp != nil:
log.Debug().
Str("node", node.Node).
Str("sshHost", sshHost).
Bool("available", temp.Available).
Msg("Temperature data unavailable after collection")
} else if currentTemp > 0 {
// First reading - initialize min/max to current value
temp.CPUMin = currentTemp
temp.CPUMaxRecord = currentTemp
temp.MinRecorded = time.Now()
temp.MaxRecorded = time.Now()
}
modelNode.Temperature = temp
log.Debug().
Str("node", node.Node).
Str("sshHost", sshHost).
Float64("cpuPackage", temp.CPUPackage).
Float64("cpuMax", temp.CPUMax).
Float64("cpuMin", temp.CPUMin).
Float64("cpuMaxRecord", temp.CPUMaxRecord).
Int("nvmeCount", len(temp.NVMe)).
Msg("Collected temperature data")
} else if err != nil {
log.Debug().
Str("node", node.Node).
Str("sshHost", sshHost).
Bool("isCluster", modelNode.IsClusterMember).
Int("endpointCount", len(instanceCfg.ClusterEndpoints)).
Msg("Temperature collection failed - check SSH access")
} else if temp != nil {
log.Debug().
Str("node", node.Node).
Str("sshHost", sshHost).
Bool("available", temp.Available).
Msg("Temperature data unavailable after collection")
}
}

View file

@ -493,8 +493,17 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, cli
Msg("Guest agent enabled, fetching filesystem info")
}
statusCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
if fsInfo, err := client.GetVMFSInfo(statusCtx, n.Node, vm.VMID); err != nil {
// Filesystem info with configurable timeout and retry (refs #592)
fsInfoRaw, err := m.retryGuestAgentCall(ctx, m.guestAgentFSInfoTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
return client.GetVMFSInfo(ctx, n.Node, vm.VMID)
})
var fsInfo []proxmox.VMFileSystem
if err == nil {
if fs, ok := fsInfoRaw.([]proxmox.VMFileSystem); ok {
fsInfo = fs
}
}
if err != nil {
// Handle errors
errStr := err.Error()
log.Warn().
@ -655,7 +664,6 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, cli
Msg("Guest agent provided filesystem info but no usable filesystems found (all were special mounts)")
}
}
cancel()
} else {
// No vmStatus available or agent disabled - show allocated disk
if diskTotal > 0 {