diff --git a/pkg/proxmox/cluster_client.go b/pkg/proxmox/cluster_client.go
index 61350f5a1..8fb8cd315 100644
--- a/pkg/proxmox/cluster_client.go
+++ b/pkg/proxmox/cluster_client.go
@@ -363,6 +363,9 @@ func (cc *ClusterClient) getHealthyClient(ctx context.Context) (*Client, error)
 			Int("nodes", len(testNodes)).
 			Msg("Cluster endpoint passed connectivity test")
 
+		// Connectivity test passed: drop any stale error recorded for this endpoint. NOTE(review): direct map write, presumably cc.mu is already held here (clearEndpointError would re-lock) - confirm.
+		delete(cc.lastError, selectedEndpoint)
+
 		// Create the actual client with full timeout
 		newClient, err := NewClient(cfg)
 		if err != nil {
@@ -397,6 +400,13 @@ func (cc *ClusterClient) markUnhealthyWithError(endpoint string, errMsg string)
 	cc.lastHealthCheck[endpoint] = time.Now()
 }
 
+// clearEndpointError deletes endpoint's entry from cc.lastError while holding cc.mu's write lock; it locks internally, so callers must not already hold cc.mu (assuming a non-reentrant sync mutex - confirm).
+func (cc *ClusterClient) clearEndpointError(endpoint string) {
+	cc.mu.Lock()
+	defer cc.mu.Unlock()
+	delete(cc.lastError, endpoint)
+}
+
 // recoverUnhealthyNodes attempts to recover unhealthy nodes
 func (cc *ClusterClient) recoverUnhealthyNodes(ctx context.Context) {
 	cc.mu.RLock()
@@ -570,6 +580,8 @@ func (cc *ClusterClient) executeWithFailover(ctx context.Context, fn func(*Clien
 		// Execute the function
 		err = fn(client)
 		if err == nil {
+			// fn succeeded against this endpoint, so remove any error still cached for it before returning
+			cc.clearEndpointError(clientEndpoint)
 			return nil
 		}
 		lastErr = err