fix: clear stale errors after successful cluster operations

Previously, errors stored in ClusterClient.lastError were only cleared
during initial health checks or when recovering unhealthy nodes. This
caused stale error messages to persist in the UI even after the
underlying issues were resolved.

The fix clears cached errors in two places:
1. After the connectivity test passes in getHealthyClient()
2. After an operation succeeds in executeWithFailover()

This ensures that once an endpoint starts working again, any previous
error messages are cleared from the UI without requiring a restart.

Related to #659, #754
This commit is contained in:
rcourtman 2025-11-27 16:22:16 +00:00
parent 6ff5ae4200
commit 6db4ee7a3b

View file

@ -363,6 +363,9 @@ func (cc *ClusterClient) getHealthyClient(ctx context.Context) (*Client, error)
Int("nodes", len(testNodes)).
Msg("Cluster endpoint passed connectivity test")
// Clear any stale error from previous failures now that connectivity succeeded
delete(cc.lastError, selectedEndpoint)
// Create the actual client with full timeout
newClient, err := NewClient(cfg)
if err != nil {
@ -397,6 +400,13 @@ func (cc *ClusterClient) markUnhealthyWithError(endpoint string, errMsg string)
cc.lastHealthCheck[endpoint] = time.Now()
}
// clearEndpointError drops the cached error entry for endpoint, if one exists.
// It is intended to be called once an operation against that endpoint has
// succeeded. The write lock on cc.mu is held while the map is updated.
func (cc *ClusterClient) clearEndpointError(endpoint string) {
	cc.mu.Lock()
	delete(cc.lastError, endpoint) // no-op when the endpoint has no entry
	cc.mu.Unlock()
}
// recoverUnhealthyNodes attempts to recover unhealthy nodes
func (cc *ClusterClient) recoverUnhealthyNodes(ctx context.Context) {
cc.mu.RLock()
@ -570,6 +580,8 @@ func (cc *ClusterClient) executeWithFailover(ctx context.Context, fn func(*Clien
// Execute the function
err = fn(client)
if err == nil {
// Clear any stale error for this endpoint on success
cc.clearEndpointError(clientEndpoint)
return nil
}
lastErr = err