// Pulse/pkg/proxmox/cluster_client.go
package proxmox

import (
"context"
"fmt"
"math/rand"
"regexp"
"strconv"
"strings"
"sync"
"time"
"github.com/rs/zerolog/log"
)
// ClusterClient wraps multiple Proxmox clients for cluster-aware operations
type ClusterClient struct {
mu sync.RWMutex
name string
clients map[string]*Client // Key is node name
endpoints []string // All available endpoints
endpointFingerprints map[string]string // Per-endpoint TLS fingerprints (TOFU)
nodeHealth map[string]bool // Track node health
lastHealthCheck map[string]time.Time // Track last health check time
lastError map[string]string // Track last error per endpoint
config ClientConfig // Base config (auth info)
rateLimitUntil map[string]time.Time // Cooldown window for rate-limited endpoints
}
const (
rateLimitBaseDelay = 150 * time.Millisecond
rateLimitMaxJitter = 200 * time.Millisecond
rateLimitRetryBudget = 2
)
var statusCodePattern = regexp.MustCompile(`(?i)(?:api error|status)\s+(\d{3})`)
var transientRateLimitStatusCodes = map[int]struct{}{
408: {},
425: {}, // Too Early
429: {},
502: {},
503: {},
504: {},
}
// isVMSpecificError reports whether an error string is scoped to a single VM/guest agent
// and should not be treated as a node connectivity failure.
func isVMSpecificError(errStr string) bool {
if errStr == "" {
return false
}
lower := strings.ToLower(errStr)
if strings.Contains(lower, "no qemu guest agent") ||
strings.Contains(lower, "qemu guest agent is not running") ||
strings.Contains(lower, "guest agent") {
return true
}
// QMP guest agent operations can time out or fail per-VM (e.g. guest-get-fsinfo).
// These aren't node connectivity issues and should not mark endpoints unhealthy.
if strings.Contains(lower, "qmp command") {
return true
}
if strings.Contains(lower, "guest-get-") {
return true
}
return false
}
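
// A minimal illustration of the classification above (inputs are
// representative Proxmox error strings, not an exhaustive list):
//
//	isVMSpecificError("No QEMU guest agent configured")            // true
//	isVMSpecificError("QMP command 'guest-get-fsinfo' timed out")  // true
//	isVMSpecificError("connection refused")                        // false
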
// isEndpointConnectivityError reports whether an error indicates the endpoint
// itself is unreachable (TCP/DNS/TLS failure). Any error that carries an HTTP
// response — even a 500 — proves the endpoint is reachable, so those are NOT
// connectivity errors.
func isEndpointConnectivityError(err error) bool {
if err == nil {
return false
}
errStr := strings.ToLower(err.Error())
// If we received an HTTP response from Proxmox (any status code),
// the endpoint is reachable.
if strings.Contains(errStr, "api error") {
return false
}
// TCP/DNS connectivity failures
if strings.Contains(errStr, "connection refused") ||
strings.Contains(errStr, "connection reset") ||
strings.Contains(errStr, "no such host") ||
strings.Contains(errStr, "network is unreachable") ||
strings.Contains(errStr, "no route to host") ||
strings.Contains(errStr, "i/o timeout") ||
strings.Contains(errStr, "dial tcp") ||
strings.Contains(errStr, "dial:") {
return true
}
// TLS failures
if strings.Contains(errStr, "tls handshake") ||
strings.Contains(errStr, "tls:") ||
strings.Contains(errStr, "certificate") ||
strings.Contains(errStr, "fingerprint mismatch") {
return true
}
return false
}
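
// Illustrative classification, assuming typical Go net/http error strings:
//
//	isEndpointConnectivityError(errors.New("dial tcp 10.0.0.5:8006: connection refused")) // true  (TCP failure)
//	isEndpointConnectivityError(errors.New("api error 500: internal server error"))       // false (endpoint responded)
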
// sanitizeEndpointError transforms raw Go errors into user-friendly messages
// for display in the UI. The original error is preserved in logs.
func sanitizeEndpointError(errMsg string) string {
if errMsg == "" {
return errMsg
}
lower := strings.ToLower(errMsg)
// Context deadline exceeded - usually means slow API response
if strings.Contains(lower, "context deadline exceeded") {
// Check for specific causes
if strings.Contains(lower, "/storage") {
return "Request timed out - storage API slow (check for unreachable PBS/NFS/Ceph backends)"
}
if strings.Contains(lower, "pbs-") || strings.Contains(lower, ":8007") {
return "Request timed out - PBS storage backend unreachable"
}
return "Request timed out - Proxmox API may be slow or waiting on unreachable backend services"
}
// Client timeout - similar to context deadline
if strings.Contains(lower, "client.timeout exceeded") {
return "Connection timed out - Proxmox API not responding in time"
}
// Connection refused
if strings.Contains(lower, "connection refused") {
return "Connection refused - Proxmox API not running or firewall blocking"
}
// No route to host
if strings.Contains(lower, "no route to host") {
return "Network unreachable - check network connectivity to Proxmox host"
}
// TLS/certificate errors
if strings.Contains(lower, "certificate") || strings.Contains(lower, "x509") {
return "TLS certificate error - check SSL settings or add fingerprint"
}
// Auth errors - keep these specific
if strings.Contains(lower, "authentication") || strings.Contains(lower, "401") || strings.Contains(lower, "403") {
return "Authentication failed - check API token or credentials"
}
// PBS-specific errors
if strings.Contains(lower, "can't connect to") && strings.Contains(lower, ":8007") {
return "PBS storage unreachable - check Proxmox Backup Server connectivity"
}
// Return original if no transformation applies
return errMsg
}
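
// Example transformations (the raw inputs are representative, not captured
// from a real system):
//
//	"Get \"https://pve:8006/...\": context deadline exceeded"
//	    -> "Request timed out - Proxmox API may be slow or waiting on unreachable backend services"
//	"connection refused"
//	    -> "Connection refused - Proxmox API not running or firewall blocking"
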
// NewClusterClient creates a new cluster-aware client.
// endpointFingerprints is an optional map of endpoint URL -> TLS fingerprint for per-node certificate verification.
// This enables TOFU (Trust On First Use) for clusters with unique self-signed certs per node.
func NewClusterClient(name string, config ClientConfig, endpoints []string, endpointFingerprints map[string]string) *ClusterClient {
if endpointFingerprints == nil {
endpointFingerprints = make(map[string]string)
}
cc := &ClusterClient{
name: name,
clients: make(map[string]*Client),
endpoints: endpoints,
endpointFingerprints: endpointFingerprints,
nodeHealth: make(map[string]bool),
lastHealthCheck: make(map[string]time.Time),
lastError: make(map[string]string),
config: config,
rateLimitUntil: make(map[string]time.Time),
}
// Start optimistically: assume every endpoint is healthy until proven otherwise.
// This allows operations to be attempted even if the initial health check fails.
for _, endpoint := range endpoints {
cc.nodeHealth[endpoint] = true // Start optimistic, will be marked unhealthy if operations fail
}
// Do a quick parallel health check on initialization (synchronous to avoid race)
// This will mark unhealthy nodes but won't prevent trying them later
cc.initialHealthCheck()
return cc
}
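
// A usage sketch. The endpoint URLs and fingerprint below are illustrative,
// and ClientConfig auth fields are elided since they are defined elsewhere in
// this package:
//
//	cfg := ClientConfig{Timeout: 30 * time.Second} // plus auth fields as needed
//	cc := NewClusterClient("homelab", cfg,
//		[]string{"https://pve1:8006", "https://pve2:8006"},
//		map[string]string{"https://pve1:8006": "aa:bb:cc:..."})
//	nodes, err := cc.GetNodes(context.Background())
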
// getEndpointFingerprint returns the TLS fingerprint to use for a specific endpoint.
// It prefers the per-endpoint fingerprint (TOFU) over the base config fingerprint.
func (cc *ClusterClient) getEndpointFingerprint(endpoint string) string {
if fp, ok := cc.endpointFingerprints[endpoint]; ok && fp != "" {
return fp
}
return cc.config.Fingerprint
}
// initialHealthCheck performs a quick parallel health check on all endpoints
func (cc *ClusterClient) initialHealthCheck() {
// Skip initial health check if there's only one endpoint
// For single-endpoint clusters (using main host for routing), assume healthy
if len(cc.endpoints) == 1 {
log.Info().
Str("cluster", cc.name).
Str("endpoint", cc.endpoints[0]).
Msg("Single endpoint cluster - skipping initial health check")
return
}
// For multi-node clusters, do a very quick check but don't mark unhealthy immediately
// This prevents nodes from being marked unhealthy due to temporary startup conditions
var wg sync.WaitGroup
for _, endpoint := range cc.endpoints {
wg.Add(1)
go func(ep string) {
defer wg.Done()
// Try a quick connection test with slightly longer timeout for initial check
cfg := cc.config
cfg.Host = ep
cfg.Fingerprint = cc.getEndpointFingerprint(ep)
cfg.Timeout = 5 * time.Second
testClient, err := NewClient(cfg)
if err != nil {
cc.mu.Lock()
cc.nodeHealth[ep] = false
cc.lastError[ep] = sanitizeEndpointError(err.Error())
cc.lastHealthCheck[ep] = time.Now()
cc.mu.Unlock()
log.Info().
Str("cluster", cc.name).
Str("endpoint", ep).
Err(err).
Msg("Cluster endpoint marked unhealthy on initialization")
return
}
// Quick test with slightly longer timeout for initial check
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
_, err = testClient.GetNodes(ctx)
cancel()
cc.mu.Lock()
// Check if error is VM-specific (shouldn't affect health)
vmSpecificErr := err != nil && isVMSpecificError(err.Error())
if err == nil || vmSpecificErr {
// Node is healthy - create a proper client with full timeout for actual use
fullCfg := cc.config
fullCfg.Host = ep
fullCfg.Fingerprint = cc.getEndpointFingerprint(ep)
fullClient, clientErr := NewClient(fullCfg)
if clientErr != nil {
cc.nodeHealth[ep] = false
cc.lastError[ep] = sanitizeEndpointError(clientErr.Error())
cc.lastHealthCheck[ep] = time.Now()
log.Warn().
Str("cluster", cc.name).
Str("endpoint", ep).
Err(clientErr).
Msg("Failed to create full client after successful health check")
} else {
cc.nodeHealth[ep] = true
delete(cc.lastError, ep)
cc.lastHealthCheck[ep] = time.Now()
cc.clients[ep] = fullClient // Store the full client, not test client
if vmSpecificErr {
log.Debug().
Str("cluster", cc.name).
Str("endpoint", ep).
Msg("Cluster endpoint healthy despite VM-specific errors")
} else {
log.Info().
Str("cluster", cc.name).
Str("endpoint", ep).
Msg("Cluster endpoint passed initial health check")
}
}
} else {
// Real connectivity issue
cc.nodeHealth[ep] = false
cc.lastError[ep] = sanitizeEndpointError(err.Error())
cc.lastHealthCheck[ep] = time.Now()
log.Info().
Str("cluster", cc.name).
Str("endpoint", ep).
Err(err).
Msg("Cluster endpoint failed initial health check")
}
cc.mu.Unlock()
}(endpoint)
}
// Wait for all checks to complete
wg.Wait()
log.Info().
Str("cluster", cc.name).
Int("total", len(cc.endpoints)).
Msg("Initial cluster health check completed")
}
// getHealthyClient returns a client for a randomly selected healthy endpoint
func (cc *ClusterClient) getHealthyClient(ctx context.Context) (*Client, error) {
cc.mu.Lock()
defer cc.mu.Unlock()
// Get list of healthy endpoints
var healthyEndpoints []string
var coolingEndpoints []string
now := time.Now()
for endpoint, healthy := range cc.nodeHealth {
if healthy {
if cooldown, exists := cc.rateLimitUntil[endpoint]; exists {
if now.Before(cooldown) {
coolingEndpoints = append(coolingEndpoints, endpoint)
continue
}
delete(cc.rateLimitUntil, endpoint)
}
healthyEndpoints = append(healthyEndpoints, endpoint)
}
}
if len(healthyEndpoints) == 0 && len(coolingEndpoints) > 0 {
// Nothing is immediately available, fall back to endpoints that are in cooldown
healthyEndpoints = append(healthyEndpoints, coolingEndpoints...)
}
// Count unhealthy endpoints for logging and recovery decisions
unhealthyCount := 0
for _, healthy := range cc.nodeHealth {
if !healthy {
unhealthyCount++
}
}
// Log at warn level if no healthy endpoints to aid troubleshooting
if len(healthyEndpoints) == 0 && len(coolingEndpoints) == 0 {
log.Warn().
Str("cluster", cc.name).
Int("healthy", len(healthyEndpoints)).
Int("total", len(cc.nodeHealth)).
Interface("nodeHealth", cc.nodeHealth).
Msg("No healthy endpoints available - attempting recovery")
} else {
log.Debug().
Str("cluster", cc.name).
Int("healthy", len(healthyEndpoints)).
Int("cooling", len(coolingEndpoints)).
Int("total", len(cc.nodeHealth)).
Interface("nodeHealth", cc.nodeHealth).
Msg("Checking for healthy endpoints")
}
// Trigger recovery if we have any unhealthy endpoints
// This ensures degraded clusters recover individual nodes over time,
// not just when all nodes are down
if unhealthyCount > 0 {
// Use an anonymous function to ensure the lock is re-acquired even if
// recoverUnhealthyNodes panics, preventing double-unlock from defer
func() {
cc.mu.Unlock()
defer cc.mu.Lock()
cc.recoverUnhealthyNodes(ctx)
}()
// Refresh the healthy/cooling endpoints lists after recovery attempt
// since cluster state may have changed while lock was released
healthyEndpoints = nil
coolingEndpoints = nil
now = time.Now() // Refresh time for accurate cooldown checks
for endpoint, healthy := range cc.nodeHealth {
if healthy {
if cooldown, exists := cc.rateLimitUntil[endpoint]; exists && now.Before(cooldown) {
coolingEndpoints = append(coolingEndpoints, endpoint)
continue
}
healthyEndpoints = append(healthyEndpoints, endpoint)
}
}
// Re-apply cooldown fallback if no healthy endpoints but some cooling
if len(healthyEndpoints) == 0 && len(coolingEndpoints) > 0 {
healthyEndpoints = append(healthyEndpoints, coolingEndpoints...)
}
}
if len(healthyEndpoints) == 0 {
// If still no healthy endpoints and we only have one endpoint,
// try to use it anyway (could be temporarily unreachable)
if len(cc.endpoints) == 1 {
log.Warn().
Str("cluster", cc.name).
Str("endpoint", cc.endpoints[0]).
Msg("Single endpoint appears unhealthy but attempting to use it anyway")
healthyEndpoints = cc.endpoints
// Mark it as healthy optimistically
cc.nodeHealth[cc.endpoints[0]] = true
} else {
// Provide detailed error with endpoint status
unhealthyList := make([]string, 0, len(cc.endpoints))
for _, ep := range cc.endpoints {
if !cc.nodeHealth[ep] {
unhealthyList = append(unhealthyList, ep)
}
}
log.Error().
Str("cluster", cc.name).
Strs("unhealthyEndpoints", unhealthyList).
Int("totalEndpoints", len(cc.endpoints)).
Msg("All cluster endpoints are unhealthy - verify network connectivity and API accessibility from Pulse server")
return nil, fmt.Errorf("no healthy nodes available in cluster %s (all %d endpoints unreachable: %v)", cc.name, len(cc.endpoints), unhealthyList)
}
}
// Use random selection for better load distribution
selectedEndpoint := healthyEndpoints[rand.Intn(len(healthyEndpoints))]
// Get or create client for this endpoint
client, exists := cc.clients[selectedEndpoint]
if !exists {
// Create new client with shorter timeout for initial test
cfg := cc.config
cfg.Host = selectedEndpoint
cfg.Fingerprint = cc.getEndpointFingerprint(selectedEndpoint)
// First try with a short timeout to quickly detect offline nodes
testCfg := cfg
testCfg.Timeout = 3 * time.Second
testClient, err := NewClient(testCfg)
if err != nil {
// Mark as unhealthy
cc.nodeHealth[selectedEndpoint] = false
log.Debug().
Str("cluster", cc.name).
Str("endpoint", selectedEndpoint).
Err(err).
Msg("Failed to create client for cluster endpoint")
return nil, fmt.Errorf("failed to create client for %s: %w", selectedEndpoint, err)
}
// Connectivity test - 5 seconds to allow for TLS handshake (~3s typical)
testCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
testNodes, testErr := testClient.GetNodes(testCtx)
cancel()
if testErr != nil {
// Check if this is a transient rate limit error that shouldn't mark the node unhealthy
if isRateLimited, _ := isTransientRateLimitError(testErr); isRateLimited {
log.Debug().
Str("cluster", cc.name).
Str("endpoint", selectedEndpoint).
Err(testErr).
Msg("Ignoring transient rate limit error during connectivity test")
// Continue with client creation since the node is accessible, just rate limited
} else {
// Check if this is a VM-specific error that shouldn't mark the node unhealthy
// Use the shared helper so matching is case-insensitive and covers QMP
// and guest-get-* errors, not just the guest-agent substrings.
if isVMSpecificError(testErr.Error()) {
// This is a VM-specific issue, not a connectivity problem
// The node is actually healthy, so don't mark it unhealthy
log.Debug().
Str("cluster", cc.name).
Str("endpoint", selectedEndpoint).
Err(testErr).
Msg("Ignoring VM-specific error during connectivity test")
// Continue with client creation since the node is actually accessible
} else {
// Mark as unhealthy for real connectivity issues
cc.nodeHealth[selectedEndpoint] = false
log.Warn().
Str("cluster", cc.name).
Str("endpoint", selectedEndpoint).
Err(testErr).
Msg("Failed to connect to Proxmox endpoint; endpoint removed from rotation until next refresh")
return nil, fmt.Errorf("endpoint %s failed connectivity test: %w", selectedEndpoint, testErr)
}
}
}
// Only log a clean pass when the test actually succeeded; rate-limited or
// VM-specific errors fall through here with an empty testNodes slice.
if testErr == nil {
log.Debug().
Str("cluster", cc.name).
Str("endpoint", selectedEndpoint).
Int("nodes", len(testNodes)).
Msg("Cluster endpoint passed connectivity test")
}
// Clear any stale error from previous failures now that connectivity succeeded
delete(cc.lastError, selectedEndpoint)
// Create the actual client with full timeout
newClient, err := NewClient(cfg)
if err != nil {
// This shouldn't happen since we just tested it
cc.nodeHealth[selectedEndpoint] = false
return nil, fmt.Errorf("failed to create client for %s: %w", selectedEndpoint, err)
}
cc.clients[selectedEndpoint] = newClient
client = newClient
}
return client, nil
}
// markUnhealthyWithError marks an endpoint as unhealthy and captures the error
func (cc *ClusterClient) markUnhealthyWithError(endpoint string, errMsg string) {
if endpoint == "" {
return
}
cc.mu.Lock()
defer cc.mu.Unlock()
if cc.nodeHealth[endpoint] {
log.Warn().
Str("cluster", cc.name).
Str("endpoint", endpoint).
Str("error", errMsg).
Msg("Marking cluster node as unhealthy")
cc.nodeHealth[endpoint] = false
}
if errMsg != "" {
cc.lastError[endpoint] = sanitizeEndpointError(errMsg)
}
cc.lastHealthCheck[endpoint] = time.Now()
}
// clearEndpointError removes any cached error for an endpoint after successful operations
// and marks the endpoint as healthy since the operation succeeded
func (cc *ClusterClient) clearEndpointError(endpoint string) {
if endpoint == "" {
return // avoid recording a phantom healthy endpoint under an empty key
}
cc.mu.Lock()
defer cc.mu.Unlock()
delete(cc.lastError, endpoint)
// Mark endpoint healthy since operation succeeded - this ensures degraded
// clusters recover once endpoints start responding again
cc.nodeHealth[endpoint] = true
}
// recoverUnhealthyNodes attempts to recover unhealthy nodes
func (cc *ClusterClient) recoverUnhealthyNodes(ctx context.Context) {
cc.mu.RLock()
unhealthyEndpoints := make([]string, 0)
throttledEndpoints := make([]string, 0)
now := time.Now()
for endpoint, healthy := range cc.nodeHealth {
if !healthy {
// Skip if we checked this endpoint recently (within 10 seconds)
// Balance between recovery speed and avoiding excessive checks
if lastCheck, exists := cc.lastHealthCheck[endpoint]; exists {
if now.Sub(lastCheck) < 10*time.Second {
throttledEndpoints = append(throttledEndpoints, endpoint)
continue
}
}
unhealthyEndpoints = append(unhealthyEndpoints, endpoint)
}
}
cc.mu.RUnlock()
if len(unhealthyEndpoints) == 0 {
if len(throttledEndpoints) > 0 {
log.Debug().
Str("cluster", cc.name).
Strs("throttledEndpoints", throttledEndpoints).
Msg("Skipping recovery check - endpoints checked recently")
}
return
}
log.Info().
Str("cluster", cc.name).
Strs("unhealthyEndpoints", unhealthyEndpoints).
Int("count", len(unhealthyEndpoints)).
Msg("Attempting to recover unhealthy cluster endpoints")
// Test all unhealthy endpoints concurrently with a short timeout
var wg sync.WaitGroup
recoveredEndpoints := make(chan string, len(unhealthyEndpoints))
for _, endpoint := range unhealthyEndpoints {
wg.Add(1)
go func(ep string) {
defer wg.Done()
// Update last check time
cc.mu.Lock()
cc.lastHealthCheck[ep] = now
cc.mu.Unlock()
// Try to create a client and test connection
// Note: 5-second timeout needed because TLS handshake to Proxmox API
// typically takes ~3 seconds on local networks
cfg := cc.config
cfg.Host = ep
cfg.Fingerprint = cc.getEndpointFingerprint(ep)
cfg.Timeout = 5 * time.Second
testClient, err := NewClient(cfg)
if err != nil {
log.Debug().
Str("cluster", cc.name).
Str("endpoint", ep).
Err(err).
Msg("Failed to create client during recovery attempt")
return
}
// Try a simple API call
testCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
_, err = testClient.GetNodes(testCtx)
cancel()
// Check if error is VM-specific (shouldn't prevent recovery)
vmSpecificErr := err != nil && isVMSpecificError(err.Error())
if err == nil || vmSpecificErr {
// Rebuild the client with the original (full) timeout before storing it.
// Don't discard the error: a failed NewClient would otherwise store a nil
// client that gets dereferenced later.
cfg.Timeout = cc.config.Timeout
fullClient, clientErr := NewClient(cfg)
if clientErr != nil {
log.Warn().
Str("cluster", cc.name).
Str("endpoint", ep).
Err(clientErr).
Msg("Failed to create full client after successful recovery check")
return
}
recoveredEndpoints <- ep
cc.mu.Lock()
cc.nodeHealth[ep] = true
delete(cc.lastError, ep)
cc.lastHealthCheck[ep] = time.Now()
cc.clients[ep] = fullClient
cc.mu.Unlock()
if vmSpecificErr {
log.Info().
Str("cluster", cc.name).
Str("endpoint", ep).
Msg("Recovered unhealthy cluster node (ignoring VM-specific errors)")
} else {
log.Info().
Str("cluster", cc.name).
Str("endpoint", ep).
Msg("Recovered unhealthy cluster node")
}
} else {
log.Debug().
Str("cluster", cc.name).
Str("endpoint", ep).
Err(err).
Msg("Recovery attempt failed - endpoint still unhealthy")
}
}(endpoint)
}
// Wait for all recovery attempts to complete
go func() {
wg.Wait()
close(recoveredEndpoints)
}()
// Count recovered endpoints
recoveredCount := 0
for range recoveredEndpoints {
recoveredCount++
}
// Log recovery summary
if recoveredCount > 0 {
log.Info().
Str("cluster", cc.name).
Int("recovered", recoveredCount).
Int("attempted", len(unhealthyEndpoints)).
Msg("Cluster endpoint recovery completed")
} else if len(unhealthyEndpoints) > 0 {
log.Warn().
Str("cluster", cc.name).
Int("attempted", len(unhealthyEndpoints)).
Strs("failedEndpoints", unhealthyEndpoints).
Msg("No endpoints recovered - cluster may be unreachable from Pulse server")
}
}
// executeWithFailover executes a function with automatic failover
func (cc *ClusterClient) executeWithFailover(ctx context.Context, fn func(*Client) error) error {
baseRetries := len(cc.endpoints)
maxRetries := baseRetries + rateLimitRetryBudget
var lastErr error
log.Debug().
Str("cluster", cc.name).
Int("maxRetries", maxRetries).
Msg("Starting executeWithFailover")
for i := 0; i < maxRetries; i++ {
client, err := cc.getHealthyClient(ctx)
if err != nil {
log.Debug().
Str("cluster", cc.name).
Err(err).
Int("attempt", i+1).
Msg("Failed to get healthy client")
return err
}
// Get the endpoint for this client
var clientEndpoint string
cc.mu.RLock()
for endpoint, c := range cc.clients {
if c == client {
clientEndpoint = endpoint
break
}
}
cc.mu.RUnlock()
// Execute the function
err = fn(client)
if err == nil {
// Clear any stale error for this endpoint on success
cc.clearEndpointError(clientEndpoint)
return nil
}
lastErr = err
// Rate limit - retry with backoff (check before connectivity classification)
if isRateLimited, statusCode := isTransientRateLimitError(err); isRateLimited {
backoff := calculateRateLimitBackoff(i)
cc.applyRateLimitCooldown(clientEndpoint, backoff)
event := log.Warn().
Str("cluster", cc.name).
Str("endpoint", clientEndpoint).
Err(err).
Dur("backoff", backoff).
Int("attempt", i+1)
if statusCode != 0 {
event = event.Int("status", statusCode)
}
event.Msg("Rate limited by cluster node, retrying with backoff")
timer := time.NewTimer(backoff)
select {
case <-ctx.Done():
if !timer.Stop() {
<-timer.C
}
return fmt.Errorf("context canceled while backing off after rate limit: %w", ctx.Err())
case <-timer.C:
}
continue
}
// Auth errors - return immediately without marking endpoint unhealthy
if isAuthError(err) {
return err
}
// Only mark endpoint unhealthy for actual connectivity failures (TCP/DNS/TLS).
// Any HTTP response — even 500 — proves the endpoint is reachable.
if isEndpointConnectivityError(err) {
cc.markUnhealthyWithError(clientEndpoint, err.Error())
log.Warn().
Str("cluster", cc.name).
Str("endpoint", clientEndpoint).
Err(err).
Int("attempt", i+1).
Msg("Connectivity failure on cluster node, trying next")
continue
}
// Endpoint is reachable but this specific request failed (API error, permission
// issue, VM-specific error, etc.). Return without marking endpoint unhealthy.
log.Debug().
Str("cluster", cc.name).
Str("endpoint", clientEndpoint).
Err(err).
Msg("Request-level error, endpoint reachable - not marking unhealthy")
return err
}
if lastErr != nil {
return fmt.Errorf("all cluster nodes failed for %s: %w", cc.name, lastErr)
}
return fmt.Errorf("all cluster nodes failed for %s", cc.name)
}
func (cc *ClusterClient) applyRateLimitCooldown(endpoint string, backoff time.Duration) {
if endpoint == "" {
return
}
cc.mu.Lock()
defer cc.mu.Unlock()
if cc.rateLimitUntil == nil {
cc.rateLimitUntil = make(map[string]time.Time)
}
cc.rateLimitUntil[endpoint] = time.Now().Add(backoff)
}
func calculateRateLimitBackoff(attempt int) time.Duration {
// Linear backoff with jitter keeps retries gentle while avoiding thundering herd
base := rateLimitBaseDelay * time.Duration(attempt+1)
if rateLimitMaxJitter <= 0 {
return base
}
jitter := time.Duration(rand.Int63n(rateLimitMaxJitter.Nanoseconds()+1)) * time.Nanosecond
return base + jitter
}
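
// Worked example with the constants above (jitter is drawn uniformly from
// [0, rateLimitMaxJitter]):
//
//	attempt 0: 150ms base -> 150-350ms total
//	attempt 1: 300ms base -> 300-500ms total
//	attempt 2: 450ms base -> 450-650ms total
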
func isTransientRateLimitError(err error) (bool, int) {
if err == nil {
return false, 0
}
errStr := err.Error()
statusCode := extractStatusCode(errStr)
if statusCode != 0 {
if _, ok := transientRateLimitStatusCodes[statusCode]; ok {
return true, statusCode
}
}
lowerErr := strings.ToLower(errStr)
if strings.Contains(lowerErr, "rate limit") || strings.Contains(lowerErr, "too many requests") {
if statusCode == 0 {
statusCode = 429
}
return true, statusCode
}
return false, statusCode
}
func extractStatusCode(errStr string) int {
matches := statusCodePattern.FindStringSubmatch(errStr)
if len(matches) != 2 {
return 0
}
code, err := strconv.Atoi(matches[1])
if err != nil {
return 0
}
return code
}
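
// Examples of what statusCodePattern extracts (matching is case-insensitive):
//
//	extractStatusCode("api error 429: too many requests") // 429
//	extractStatusCode("unexpected status 503")            // 503
//	extractStatusCode("connection refused")               // 0 (no status present)
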
func isNotImplementedError(errStr string) bool {
lower := strings.ToLower(errStr)
if !strings.Contains(lower, "not implemented") {
return false
}
// Common formatting: "status 501", "error 501", "api error 501" - all contain " 501"
if strings.Contains(lower, " 501") {
return true
}
// Fallback to explicit HTTP status detection
if extractStatusCode(errStr) == 501 {
return true
}
return false
}
// GetHealthStatus returns the health status of all nodes
func (cc *ClusterClient) GetHealthStatus() map[string]bool {
cc.mu.RLock()
defer cc.mu.RUnlock()
status := make(map[string]bool)
for endpoint, healthy := range cc.nodeHealth {
status[endpoint] = healthy
}
return status
}
// EndpointHealth contains health information for a single endpoint
type EndpointHealth struct {
Healthy bool
LastCheck time.Time
LastError string
}
// GetHealthStatusWithErrors returns detailed health status including error messages
func (cc *ClusterClient) GetHealthStatusWithErrors() map[string]EndpointHealth {
cc.mu.RLock()
defer cc.mu.RUnlock()
status := make(map[string]EndpointHealth)
for endpoint, healthy := range cc.nodeHealth {
status[endpoint] = EndpointHealth{
Healthy: healthy,
LastCheck: cc.lastHealthCheck[endpoint],
LastError: cc.lastError[endpoint],
}
}
return status
}
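
// A sketch of how a caller might surface this data; the loop itself is
// illustrative, not part of this package:
//
//	for ep, h := range cc.GetHealthStatusWithErrors() {
//		if !h.Healthy {
//			fmt.Printf("%s unhealthy (last checked %s): %s\n", ep, h.LastCheck, h.LastError)
//		}
//	}
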
// Implement all the Client methods with failover
func (cc *ClusterClient) GetNodes(ctx context.Context) ([]Node, error) {
log.Debug().
Str("cluster", cc.name).
Msg("ClusterClient.GetNodes called")
var result []Node
err := cc.executeWithFailover(ctx, func(client *Client) error {
nodes, err := client.GetNodes(ctx)
if err != nil {
return err
}
result = nodes
return nil
})
if err != nil {
log.Warn().
Str("cluster", cc.name).
Err(err).
Msg("ClusterClient.GetNodes failed")
} else {
log.Info().
Str("cluster", cc.name).
Int("count", len(result)).
Msg("ClusterClient.GetNodes succeeded")
}
return result, err
}
func (cc *ClusterClient) GetNodeStatus(ctx context.Context, node string) (*NodeStatus, error) {
var result *NodeStatus
err := cc.executeWithFailover(ctx, func(client *Client) error {
status, err := client.GetNodeStatus(ctx, node)
if err != nil {
return err
}
result = status
return nil
})
return result, err
}
func (cc *ClusterClient) GetNodeRRDData(ctx context.Context, node, timeframe, cf string, ds []string) ([]NodeRRDPoint, error) {
var result []NodeRRDPoint
err := cc.executeWithFailover(ctx, func(client *Client) error {
points, err := client.GetNodeRRDData(ctx, node, timeframe, cf, ds)
if err != nil {
return err
}
result = points
return nil
})
return result, err
}
func (cc *ClusterClient) GetLXCRRDData(ctx context.Context, node string, vmid int, timeframe, cf string, ds []string) ([]GuestRRDPoint, error) {
var result []GuestRRDPoint
err := cc.executeWithFailover(ctx, func(client *Client) error {
points, err := client.GetLXCRRDData(ctx, node, vmid, timeframe, cf, ds)
if err != nil {
return err
}
result = points
return nil
})
return result, err
}
func (cc *ClusterClient) GetVMRRDData(ctx context.Context, node string, vmid int, timeframe, cf string, ds []string) ([]GuestRRDPoint, error) {
var result []GuestRRDPoint
err := cc.executeWithFailover(ctx, func(client *Client) error {
points, err := client.GetVMRRDData(ctx, node, vmid, timeframe, cf, ds)
if err != nil {
return err
}
result = points
return nil
})
return result, err
}
func (cc *ClusterClient) GetVMs(ctx context.Context, node string) ([]VM, error) {
var result []VM
err := cc.executeWithFailover(ctx, func(client *Client) error {
vms, err := client.GetVMs(ctx, node)
if err != nil {
return err
}
result = vms
return nil
})
// Don't return error for transient connectivity issues - preserve UI state
if err != nil && strings.Contains(err.Error(), "no healthy nodes available") {
log.Debug().
Str("cluster", cc.name).
Str("node", node).
Err(err).
Msg("No healthy nodes for GetVMs - returning empty list to preserve UI state")
return []VM{}, nil
}
return result, err
}
func (cc *ClusterClient) GetContainers(ctx context.Context, node string) ([]Container, error) {
var result []Container
err := cc.executeWithFailover(ctx, func(client *Client) error {
containers, err := client.GetContainers(ctx, node)
if err != nil {
return err
}
result = containers
return nil
})
// Don't return error for transient connectivity issues - preserve UI state
if err != nil && strings.Contains(err.Error(), "no healthy nodes available") {
log.Debug().
Str("cluster", cc.name).
Str("node", node).
Err(err).
Msg("No healthy nodes for GetContainers - returning empty list to preserve UI state")
return []Container{}, nil
}
return result, err
}
func (cc *ClusterClient) GetStorage(ctx context.Context, node string) ([]Storage, error) {
var result []Storage
err := cc.executeWithFailover(ctx, func(client *Client) error {
storage, err := client.GetStorage(ctx, node)
if err != nil {
return err
}
result = storage
return nil
})
return result, err
}
func (cc *ClusterClient) GetAllStorage(ctx context.Context) ([]Storage, error) {
var result []Storage
err := cc.executeWithFailover(ctx, func(client *Client) error {
storage, err := client.GetAllStorage(ctx)
if err != nil {
return err
}
result = storage
return nil
})
return result, err
}
func (cc *ClusterClient) GetBackupTasks(ctx context.Context) ([]Task, error) {
var result []Task
err := cc.executeWithFailover(ctx, func(client *Client) error {
tasks, err := client.GetBackupTasks(ctx)
if err != nil {
return err
}
result = tasks
return nil
})
return result, err
}
func (cc *ClusterClient) GetReplicationStatus(ctx context.Context) ([]ReplicationJob, error) {
var result []ReplicationJob
err := cc.executeWithFailover(ctx, func(client *Client) error {
jobs, err := client.GetReplicationStatus(ctx)
if err != nil {
return err
}
result = jobs
return nil
})
return result, err
}
func (cc *ClusterClient) GetStorageContent(ctx context.Context, node, storage string) ([]StorageContent, error) {
var result []StorageContent
err := cc.executeWithFailover(ctx, func(client *Client) error {
content, err := client.GetStorageContent(ctx, node, storage)
if err != nil {
return err
}
result = content
return nil
})
return result, err
}
// GetCephStatus returns Ceph cluster status information with failover support.
func (cc *ClusterClient) GetCephStatus(ctx context.Context) (*CephStatus, error) {
var result *CephStatus
err := cc.executeWithFailover(ctx, func(client *Client) error {
status, err := client.GetCephStatus(ctx)
if err != nil {
return err
}
result = status
return nil
})
return result, err
}
// GetCephDF returns Ceph capacity information with failover support.
func (cc *ClusterClient) GetCephDF(ctx context.Context) (*CephDF, error) {
var result *CephDF
err := cc.executeWithFailover(ctx, func(client *Client) error {
df, err := client.GetCephDF(ctx)
if err != nil {
return err
}
result = df
return nil
})
return result, err
}
func (cc *ClusterClient) GetVMSnapshots(ctx context.Context, node string, vmid int) ([]Snapshot, error) {
var result []Snapshot
err := cc.executeWithFailover(ctx, func(client *Client) error {
snapshots, err := client.GetVMSnapshots(ctx, node, vmid)
if err != nil {
return err
}
result = snapshots
return nil
})
return result, err
}
func (cc *ClusterClient) GetContainerSnapshots(ctx context.Context, node string, vmid int) ([]Snapshot, error) {
var result []Snapshot
err := cc.executeWithFailover(ctx, func(client *Client) error {
snapshots, err := client.GetContainerSnapshots(ctx, node, vmid)
if err != nil {
return err
}
result = snapshots
return nil
})
return result, err
}
func (cc *ClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*VMStatus, error) {
var result *VMStatus
err := cc.executeWithFailover(ctx, func(client *Client) error {
status, err := client.GetVMStatus(ctx, node, vmid)
if err != nil {
return err
}
result = status
return nil
})
return result, err
}
func (cc *ClusterClient) GetVMConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
var result map[string]interface{}
err := cc.executeWithFailover(ctx, func(client *Client) error {
config, err := client.GetVMConfig(ctx, node, vmid)
if err != nil {
return err
}
result = config
return nil
})
return result, err
}
func (cc *ClusterClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
var result map[string]interface{}
err := cc.executeWithFailover(ctx, func(client *Client) error {
info, err := client.GetVMAgentInfo(ctx, node, vmid)
if err != nil {
return err
}
result = info
return nil
})
return result, err
}
// GetVMAgentVersion returns the guest agent version for the VM.
func (cc *ClusterClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
var version string
err := cc.executeWithFailover(ctx, func(client *Client) error {
v, err := client.GetVMAgentVersion(ctx, node, vmid)
if err != nil {
return err
}
version = v
return nil
})
return version, err
}
// GetVMFSInfo returns filesystem information from QEMU guest agent
func (cc *ClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]VMFileSystem, error) {
var result []VMFileSystem
err := cc.executeWithFailover(ctx, func(client *Client) error {
info, err := client.GetVMFSInfo(ctx, node, vmid)
if err != nil {
return err
}
result = info
return nil
})
return result, err
}
// GetVMNetworkInterfaces returns guest network interfaces from the QEMU agent
func (cc *ClusterClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]VMNetworkInterface, error) {
var result []VMNetworkInterface
err := cc.executeWithFailover(ctx, func(client *Client) error {
interfaces, err := client.GetVMNetworkInterfaces(ctx, node, vmid)
if err != nil {
return err
}
result = interfaces
return nil
})
return result, err
}
// GetVMMemAvailableFromAgent reads /proc/meminfo via the QEMU guest agent to get MemAvailable.
func (cc *ClusterClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
var result uint64
err := cc.executeWithFailover(ctx, func(client *Client) error {
available, err := client.GetVMMemAvailableFromAgent(ctx, node, vmid)
if err != nil {
return err
}
result = available
return nil
})
return result, err
}
// GetClusterResources returns all resources (VMs, containers) across the cluster in a single call
func (cc *ClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]ClusterResource, error) {
var result []ClusterResource
err := cc.executeWithFailover(ctx, func(client *Client) error {
resources, err := client.GetClusterResources(ctx, resourceType)
if err != nil {
return err
}
result = resources
return nil
})
return result, err
}
// GetContainerStatus returns the status of a specific container
func (cc *ClusterClient) GetContainerStatus(ctx context.Context, node string, vmid int) (*Container, error) {
var result *Container
err := cc.executeWithFailover(ctx, func(client *Client) error {
status, err := client.GetContainerStatus(ctx, node, vmid)
if err != nil {
return err
}
result = status
return nil
})
return result, err
}
// GetContainerConfig returns the configuration of a specific container
func (cc *ClusterClient) GetContainerConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
var result map[string]interface{}
err := cc.executeWithFailover(ctx, func(client *Client) error {
config, err := client.GetContainerConfig(ctx, node, vmid)
if err != nil {
return err
}
result = config
return nil
})
return result, err
}
// GetContainerInterfaces returns interface details for a container
func (cc *ClusterClient) GetContainerInterfaces(ctx context.Context, node string, vmid int) ([]ContainerInterface, error) {
var result []ContainerInterface
err := cc.executeWithFailover(ctx, func(client *Client) error {
interfaces, err := client.GetContainerInterfaces(ctx, node, vmid)
if err != nil {
return err
}
result = interfaces
return nil
})
return result, err
}
// IsClusterMember checks if this node is part of a cluster
func (cc *ClusterClient) IsClusterMember(ctx context.Context) (bool, error) {
var result bool
err := cc.executeWithFailover(ctx, func(client *Client) error {
isMember, err := client.IsClusterMember(ctx)
if err != nil {
return err
}
result = isMember
return nil
})
return result, err
}
// GetZFSPoolStatus returns ZFS pool status for a node
func (cc *ClusterClient) GetZFSPoolStatus(ctx context.Context, node string) ([]ZFSPoolStatus, error) {
var result []ZFSPoolStatus
err := cc.executeWithFailover(ctx, func(client *Client) error {
pools, err := client.GetZFSPoolStatus(ctx, node)
if err != nil {
return err
}
result = pools
return nil
})
return result, err
}
// GetZFSPoolsWithDetails returns ZFS pools with full details for a node
func (cc *ClusterClient) GetZFSPoolsWithDetails(ctx context.Context, node string) ([]ZFSPoolInfo, error) {
var result []ZFSPoolInfo
err := cc.executeWithFailover(ctx, func(client *Client) error {
pools, err := client.GetZFSPoolsWithDetails(ctx, node)
if err != nil {
return err
}
result = pools
return nil
})
return result, err
}
// GetDisks returns physical disk information for a node with failover support
func (cc *ClusterClient) GetDisks(ctx context.Context, node string) ([]Disk, error) {
var result []Disk
err := cc.executeWithFailover(ctx, func(client *Client) error {
disks, err := client.GetDisks(ctx, node)
if err != nil {
return err
}
result = disks
return nil
})
// Don't return error for transient connectivity issues
if err != nil && strings.Contains(err.Error(), "no healthy nodes available") {
log.Debug().
Str("cluster", cc.name).
Str("node", node).
Err(err).
Msg("No healthy nodes for GetDisks - returning empty list")
return []Disk{}, nil
}
return result, err
}
// GetNodePendingUpdates returns pending apt updates for a node with failover support
func (cc *ClusterClient) GetNodePendingUpdates(ctx context.Context, node string) ([]AptPackage, error) {
var result []AptPackage
err := cc.executeWithFailover(ctx, func(client *Client) error {
pkgs, err := client.GetNodePendingUpdates(ctx, node)
if err != nil {
return err
}
result = pkgs
return nil
})
// Don't return error for transient connectivity issues or permission issues
if err != nil && (strings.Contains(err.Error(), "no healthy nodes available") ||
strings.Contains(err.Error(), "403") || strings.Contains(err.Error(), "permission")) {
log.Debug().
Str("cluster", cc.name).
Str("node", node).
Err(err).
Msg("Could not get pending updates - returning empty list")
return []AptPackage{}, nil
}
return result, err
}
// GetClusterStatus returns the cluster status including all nodes with failover support.
func (cc *ClusterClient) GetClusterStatus(ctx context.Context) ([]ClusterStatus, error) {
var result []ClusterStatus
err := cc.executeWithFailover(ctx, func(client *Client) error {
status, err := client.GetClusterStatus(ctx)
if err != nil {
return err
}
result = status
return nil
})
return result, err
}
// IsQuorate checks if the cluster has quorum by querying the Proxmox cluster status.
// Returns true if the cluster is quorate (has enough votes for consensus), false otherwise.
// This is the authoritative check for cluster health - a cluster with quorum is healthy
// even if some nodes are intentionally offline (e.g., backup nodes not running).
func (cc *ClusterClient) IsQuorate(ctx context.Context) (bool, error) {
status, err := cc.GetClusterStatus(ctx)
if err != nil {
return false, err
}
// Look for the cluster entry which has the quorate field
for _, s := range status {
if s.Type == "cluster" {
return s.Quorate == 1, nil
}
}
// If no cluster entry found, this might be a standalone node - consider it healthy
return true, nil
}
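
// Illustrative use for health gating; the policy shown is an assumption, not
// part of this package:
//
//	quorate, err := cc.IsQuorate(ctx)
//	if err == nil && !quorate {
//		// treat the cluster as degraded even though individual nodes respond
//	}
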
// GetClusterOptions fetches datacenter options (e.g. tag colour map) via the first healthy node.
func (cc *ClusterClient) GetClusterOptions(ctx context.Context) (*ClusterOptions, error) {
var result *ClusterOptions
err := cc.executeWithFailover(ctx, func(client *Client) error {
opts, err := client.GetClusterOptions(ctx)
if err != nil {
return err
}
result = opts
return nil
})
return result, err
}
// isAuthError checks if an error is an authentication error
func isAuthError(err error) bool {
if err == nil {
return false
}
errStr := err.Error()
return strings.Contains(errStr, "authentication") ||
strings.Contains(errStr, "401") ||
strings.Contains(errStr, "403")
}