// Pulse/pkg/proxmox/cluster_client.go
package proxmox

import (
"context"
"fmt"
"math/rand"
"regexp"
"strconv"
"strings"
"sync"
"time"
"github.com/rs/zerolog/log"
)
// ClusterClient wraps multiple Proxmox clients for cluster-aware operations
type ClusterClient struct {
mu sync.RWMutex
name string
clients map[string]*Client // Key is node name
endpoints []string // All available endpoints
endpointFingerprints map[string]string // Per-endpoint TLS fingerprints (TOFU)
nodeHealth map[string]bool // Track node health
lastHealthCheck map[string]time.Time // Track last health check time
lastError map[string]string // Track last error per endpoint
config ClientConfig // Base config (auth info)
rateLimitUntil map[string]time.Time // Cooldown window for rate-limited endpoints
}
const (
rateLimitBaseDelay = 150 * time.Millisecond
rateLimitMaxJitter = 200 * time.Millisecond
rateLimitRetryBudget = 2
)
var statusCodePattern = regexp.MustCompile(`(?i)(?:api error|status)\s+(\d{3})`)
var transientRateLimitStatusCodes = map[int]struct{}{
408: {},
425: {}, // Too Early
429: {},
502: {},
503: {},
504: {},
}
// isVMSpecificError reports whether an error string is scoped to a single VM/guest agent
// and should not be treated as a node connectivity failure.
func isVMSpecificError(errStr string) bool {
if errStr == "" {
return false
}
lower := strings.ToLower(errStr)
if strings.Contains(lower, "no qemu guest agent") ||
strings.Contains(lower, "qemu guest agent is not running") ||
strings.Contains(lower, "guest agent") {
return true
}
// QMP guest agent operations can time out or fail per-VM (e.g. guest-get-fsinfo).
// These aren't node connectivity issues and should not mark endpoints unhealthy.
if strings.Contains(lower, "qmp command") {
return true
}
if strings.Contains(lower, "guest-get-") {
return true
}
return false
}
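
// A minimal illustration of the classification above (inputs are
// representative Proxmox error strings, not an exhaustive list):
//
//	isVMSpecificError("No QEMU guest agent configured")            // true
//	isVMSpecificError("QMP command 'guest-get-fsinfo' timed out")  // true
//	isVMSpecificError("connection refused")                        // false
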
// isEndpointConnectivityError reports whether an error indicates the endpoint
// itself is unreachable (TCP/DNS/TLS failure). Any error that carries an HTTP
// response — even a 500 — proves the endpoint is reachable, so those are NOT
// connectivity errors.
func isEndpointConnectivityError(err error) bool {
if err == nil {
return false
}
errStr := strings.ToLower(err.Error())
// If we received an HTTP response from Proxmox (any status code),
// the endpoint is reachable.
if strings.Contains(errStr, "api error") {
return false
}
// TCP/DNS connectivity failures
if strings.Contains(errStr, "connection refused") ||
strings.Contains(errStr, "connection reset") ||
strings.Contains(errStr, "no such host") ||
strings.Contains(errStr, "network is unreachable") ||
strings.Contains(errStr, "no route to host") ||
strings.Contains(errStr, "i/o timeout") ||
strings.Contains(errStr, "dial tcp") ||
strings.Contains(errStr, "dial:") {
return true
}
// TLS failures
if strings.Contains(errStr, "tls handshake") ||
strings.Contains(errStr, "tls:") ||
strings.Contains(errStr, "certificate") ||
strings.Contains(errStr, "fingerprint mismatch") {
return true
}
return false
}
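
// Illustrative classification, assuming typical Go net/http error strings:
//
//	isEndpointConnectivityError(errors.New("dial tcp 10.0.0.5:8006: connection refused")) // true  (TCP failure)
//	isEndpointConnectivityError(errors.New("api error 500: internal server error"))       // false (endpoint responded)
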
// sanitizeEndpointError transforms raw Go errors into user-friendly messages
// for display in the UI. The original error is preserved in logs.
func sanitizeEndpointError(errMsg string) string {
if errMsg == "" {
return errMsg
}
lower := strings.ToLower(errMsg)
// Context deadline exceeded - usually means slow API response
if strings.Contains(lower, "context deadline exceeded") {
// Check for specific causes
if strings.Contains(lower, "/storage") {
return "Request timed out - storage API slow (check for unreachable PBS/NFS/Ceph backends)"
}
if strings.Contains(lower, "pbs-") || strings.Contains(lower, ":8007") {
return "Request timed out - PBS storage backend unreachable"
}
return "Request timed out - Proxmox API may be slow or waiting on unreachable backend services"
}
// Client timeout - similar to context deadline
if strings.Contains(lower, "client.timeout exceeded") {
return "Connection timed out - Proxmox API not responding in time"
}
// Connection refused
if strings.Contains(lower, "connection refused") {
return "Connection refused - Proxmox API not running or firewall blocking"
}
// No route to host
if strings.Contains(lower, "no route to host") {
return "Network unreachable - check network connectivity to Proxmox host"
}
// TLS/certificate errors
if strings.Contains(lower, "certificate") || strings.Contains(lower, "x509") {
return "TLS certificate error - check SSL settings or add fingerprint"
}
// Auth errors - keep these specific
if strings.Contains(lower, "authentication") || strings.Contains(lower, "401") || strings.Contains(lower, "403") {
return "Authentication failed - check API token or credentials"
}
// PBS-specific errors
if strings.Contains(lower, "can't connect to") && strings.Contains(lower, ":8007") {
return "PBS storage unreachable - check Proxmox Backup Server connectivity"
}
// Return original if no transformation applies
return errMsg
}
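
// Example transformations (the raw inputs are representative, not captured
// from a real system):
//
//	"Get \"https://pve:8006/...\": context deadline exceeded"
//	    -> "Request timed out - Proxmox API may be slow or waiting on unreachable backend services"
//	"connection refused"
//	    -> "Connection refused - Proxmox API not running or firewall blocking"
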
// NewClusterClient creates a new cluster-aware client.
// endpointFingerprints is an optional map of endpoint URL -> TLS fingerprint for per-node certificate verification.
// This enables TOFU (Trust On First Use) for clusters with unique self-signed certs per node.
func NewClusterClient(name string, config ClientConfig, endpoints []string, endpointFingerprints map[string]string) *ClusterClient {
if endpointFingerprints == nil {
endpointFingerprints = make(map[string]string)
}
cc := &ClusterClient{
name: name,
clients: make(map[string]*Client),
endpoints: endpoints,
endpointFingerprints: endpointFingerprints,
nodeHealth: make(map[string]bool),
lastHealthCheck: make(map[string]time.Time),
lastError: make(map[string]string),
config: config,
rateLimitUntil: make(map[string]time.Time),
}
// Start optimistically: assume every endpoint is healthy until proven otherwise.
// This allows operations to be attempted even if the initial health check fails.
for _, endpoint := range endpoints {
cc.nodeHealth[endpoint] = true // Start optimistic, will be marked unhealthy if operations fail
}
// Do a quick parallel health check on initialization (synchronous to avoid race)
// This will mark unhealthy nodes but won't prevent trying them later
cc.initialHealthCheck()
return cc
}
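
// A usage sketch. The endpoint URLs and fingerprint below are illustrative,
// and ClientConfig auth fields are elided since they are defined elsewhere in
// this package:
//
//	cfg := ClientConfig{Timeout: 30 * time.Second} // plus auth fields as needed
//	cc := NewClusterClient("homelab", cfg,
//		[]string{"https://pve1:8006", "https://pve2:8006"},
//		map[string]string{"https://pve1:8006": "aa:bb:cc:..."})
//	nodes, err := cc.GetNodes(context.Background())
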
// getEndpointFingerprint returns the TLS fingerprint to use for a specific endpoint.
// It prefers the per-endpoint fingerprint (TOFU) over the base config fingerprint.
func (cc *ClusterClient) getEndpointFingerprint(endpoint string) string {
if fp, ok := cc.endpointFingerprints[endpoint]; ok && fp != "" {
return fp
}
return cc.config.Fingerprint
}
// initialHealthCheck performs a quick parallel health check on all endpoints
func (cc *ClusterClient) initialHealthCheck() {
// Skip initial health check if there's only one endpoint
// For single-endpoint clusters (using main host for routing), assume healthy
if len(cc.endpoints) == 1 {
log.Info().
Str("cluster", cc.name).
Str("endpoint", cc.endpoints[0]).
Msg("Single endpoint cluster - skipping initial health check")
return
}
// For multi-node clusters, do a very quick check but don't mark unhealthy immediately
// This prevents nodes from being marked unhealthy due to temporary startup conditions
var wg sync.WaitGroup
for _, endpoint := range cc.endpoints {
wg.Add(1)
go func(ep string) {
defer wg.Done()
// Try a quick connection test with slightly longer timeout for initial check
cfg := cc.config
cfg.Host = ep
cfg.Fingerprint = cc.getEndpointFingerprint(ep)
cfg.Timeout = 5 * time.Second
testClient, err := NewClient(cfg)
if err != nil {
cc.mu.Lock()
cc.nodeHealth[ep] = false
cc.lastError[ep] = sanitizeEndpointError(err.Error())
cc.lastHealthCheck[ep] = time.Now()
cc.mu.Unlock()
log.Info().
Str("cluster", cc.name).
Str("endpoint", ep).
Err(err).
Msg("Cluster endpoint marked unhealthy on initialization")
return
}
// Quick test with slightly longer timeout for initial check
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
_, err = testClient.GetNodes(ctx)
cancel()
cc.mu.Lock()
// Check if error is VM-specific (shouldn't affect health)
vmSpecificErr := err != nil && isVMSpecificError(err.Error())
if err == nil || vmSpecificErr {
// Node is healthy - create a proper client with full timeout for actual use
fullCfg := cc.config
fullCfg.Host = ep
fullCfg.Fingerprint = cc.getEndpointFingerprint(ep)
fullClient, clientErr := NewClient(fullCfg)
if clientErr != nil {
cc.nodeHealth[ep] = false
cc.lastError[ep] = sanitizeEndpointError(clientErr.Error())
cc.lastHealthCheck[ep] = time.Now()
log.Warn().
Str("cluster", cc.name).
Str("endpoint", ep).
Err(clientErr).
Msg("Failed to create full client after successful health check")
} else {
cc.nodeHealth[ep] = true
delete(cc.lastError, ep)
cc.lastHealthCheck[ep] = time.Now()
cc.clients[ep] = fullClient // Store the full client, not test client
if vmSpecificErr {
log.Debug().
Str("cluster", cc.name).
Str("endpoint", ep).
Msg("Cluster endpoint healthy despite VM-specific errors")
} else {
log.Info().
Str("cluster", cc.name).
Str("endpoint", ep).
Msg("Cluster endpoint passed initial health check")
}
}
} else {
// Real connectivity issue
cc.nodeHealth[ep] = false
cc.lastError[ep] = sanitizeEndpointError(err.Error())
cc.lastHealthCheck[ep] = time.Now()
log.Info().
Str("cluster", cc.name).
Str("endpoint", ep).
Err(err).
Msg("Cluster endpoint failed initial health check")
}
cc.mu.Unlock()
}(endpoint)
}
// Wait for all checks to complete
wg.Wait()
log.Info().
Str("cluster", cc.name).
Int("total", len(cc.endpoints)).
Msg("Initial cluster health check completed")
}
// getHealthyClient returns a client for a randomly selected healthy endpoint
func (cc *ClusterClient) getHealthyClient(ctx context.Context) (*Client, error) {
cc.mu.Lock()
defer cc.mu.Unlock()
// Get list of healthy endpoints
var healthyEndpoints []string
var coolingEndpoints []string
now := time.Now()
for endpoint, healthy := range cc.nodeHealth {
if healthy {
if cooldown, exists := cc.rateLimitUntil[endpoint]; exists {
if now.Before(cooldown) {
coolingEndpoints = append(coolingEndpoints, endpoint)
continue
}
delete(cc.rateLimitUntil, endpoint)
}
healthyEndpoints = append(healthyEndpoints, endpoint)
}
}
if len(healthyEndpoints) == 0 && len(coolingEndpoints) > 0 {
// Nothing is immediately available, fall back to endpoints that are in cooldown
healthyEndpoints = append(healthyEndpoints, coolingEndpoints...)
}
// Count unhealthy endpoints for logging and recovery decisions
unhealthyCount := 0
for _, healthy := range cc.nodeHealth {
if !healthy {
unhealthyCount++
}
}
// Log at warn level if no healthy endpoints to aid troubleshooting
if len(healthyEndpoints) == 0 && len(coolingEndpoints) == 0 {
log.Warn().
Str("cluster", cc.name).
Int("healthy", len(healthyEndpoints)).
Int("total", len(cc.nodeHealth)).
Interface("nodeHealth", cc.nodeHealth).
Msg("No healthy endpoints available - attempting recovery")
} else {
log.Debug().
Str("cluster", cc.name).
Int("healthy", len(healthyEndpoints)).
Int("cooling", len(coolingEndpoints)).
Int("total", len(cc.nodeHealth)).
Interface("nodeHealth", cc.nodeHealth).
Msg("Checking for healthy endpoints")
}
// Trigger recovery if we have any unhealthy endpoints
// This ensures degraded clusters recover individual nodes over time,
// not just when all nodes are down
if unhealthyCount > 0 {
// Use an anonymous function to ensure the lock is re-acquired even if
// recoverUnhealthyNodes panics, preventing double-unlock from defer
func() {
cc.mu.Unlock()
defer cc.mu.Lock()
cc.recoverUnhealthyNodes(ctx)
}()
// Refresh the healthy/cooling endpoints lists after recovery attempt
// since cluster state may have changed while lock was released
healthyEndpoints = nil
coolingEndpoints = nil
now = time.Now() // Refresh time for accurate cooldown checks
for endpoint, healthy := range cc.nodeHealth {
if healthy {
if cooldown, exists := cc.rateLimitUntil[endpoint]; exists && now.Before(cooldown) {
coolingEndpoints = append(coolingEndpoints, endpoint)
continue
}
healthyEndpoints = append(healthyEndpoints, endpoint)
}
}
// Re-apply cooldown fallback if no healthy endpoints but some cooling
if len(healthyEndpoints) == 0 && len(coolingEndpoints) > 0 {
healthyEndpoints = append(healthyEndpoints, coolingEndpoints...)
}
}
if len(healthyEndpoints) == 0 {
// If still no healthy endpoints and we only have one endpoint,
// try to use it anyway (could be temporarily unreachable)
if len(cc.endpoints) == 1 {
log.Warn().
Str("cluster", cc.name).
Str("endpoint", cc.endpoints[0]).
Msg("Single endpoint appears unhealthy but attempting to use it anyway")
healthyEndpoints = cc.endpoints
// Mark it as healthy optimistically
cc.nodeHealth[cc.endpoints[0]] = true
} else {
// Provide detailed error with endpoint status
unhealthyList := make([]string, 0, len(cc.endpoints))
for _, ep := range cc.endpoints {
if !cc.nodeHealth[ep] {
unhealthyList = append(unhealthyList, ep)
}
}
log.Error().
Str("cluster", cc.name).
Strs("unhealthyEndpoints", unhealthyList).
Int("totalEndpoints", len(cc.endpoints)).
Msg("All cluster endpoints are unhealthy - verify network connectivity and API accessibility from Pulse server")
return nil, fmt.Errorf("no healthy nodes available in cluster %s (all %d endpoints unreachable: %v)", cc.name, len(cc.endpoints), unhealthyList)
}
}
// Use random selection for better load distribution
selectedEndpoint := healthyEndpoints[rand.Intn(len(healthyEndpoints))]
// Get or create client for this endpoint
client, exists := cc.clients[selectedEndpoint]
if !exists {
// Create new client with shorter timeout for initial test
cfg := cc.config
cfg.Host = selectedEndpoint
cfg.Fingerprint = cc.getEndpointFingerprint(selectedEndpoint)
// First try with a short timeout to quickly detect offline nodes
testCfg := cfg
testCfg.Timeout = 3 * time.Second
testClient, err := NewClient(testCfg)
if err != nil {
// Mark as unhealthy
cc.nodeHealth[selectedEndpoint] = false
log.Debug().
Str("cluster", cc.name).
Str("endpoint", selectedEndpoint).
Err(err).
Msg("Failed to create client for cluster endpoint")
return nil, fmt.Errorf("failed to create client for %s: %w", selectedEndpoint, err)
}
// Connectivity test - 5 seconds to allow for TLS handshake (~3s typical)
testCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
testNodes, testErr := testClient.GetNodes(testCtx)
cancel()
if testErr != nil {
// Check if this is a transient rate limit error that shouldn't mark the node unhealthy
if isRateLimited, _ := isTransientRateLimitError(testErr); isRateLimited {
log.Debug().
Str("cluster", cc.name).
Str("endpoint", selectedEndpoint).
Err(testErr).
Msg("Ignoring transient rate limit error during connectivity test")
// Continue with client creation since the node is accessible, just rate limited
} else {
// Check if this is a VM-specific error that shouldn't mark the node unhealthy
// Use the shared helper so matching is case-insensitive and covers QMP
// and guest-get-* errors, not just the guest-agent substrings.
if isVMSpecificError(testErr.Error()) {
// This is a VM-specific issue, not a connectivity problem
// The node is actually healthy, so don't mark it unhealthy
log.Debug().
Str("cluster", cc.name).
Str("endpoint", selectedEndpoint).
Err(testErr).
Msg("Ignoring VM-specific error during connectivity test")
// Continue with client creation since the node is actually accessible
} else {
// Mark as unhealthy for real connectivity issues
cc.nodeHealth[selectedEndpoint] = false
log.Warn().
Str("cluster", cc.name).
Str("endpoint", selectedEndpoint).
Err(testErr).
Msg("Failed to connect to Proxmox endpoint; endpoint removed from rotation until next refresh")
return nil, fmt.Errorf("endpoint %s failed connectivity test: %w", selectedEndpoint, testErr)
}
}
}
// Only log a clean pass when the test actually succeeded; rate-limited or
// VM-specific errors fall through here with an empty testNodes slice.
if testErr == nil {
log.Debug().
Str("cluster", cc.name).
Str("endpoint", selectedEndpoint).
Int("nodes", len(testNodes)).
Msg("Cluster endpoint passed connectivity test")
}
// Clear any stale error from previous failures now that connectivity succeeded
delete(cc.lastError, selectedEndpoint)
// Create the actual client with full timeout
newClient, err := NewClient(cfg)
if err != nil {
// This shouldn't happen since we just tested it
cc.nodeHealth[selectedEndpoint] = false
return nil, fmt.Errorf("failed to create client for %s: %w", selectedEndpoint, err)
}
cc.clients[selectedEndpoint] = newClient
client = newClient
}
return client, nil
}
// markUnhealthyWithError marks an endpoint as unhealthy and captures the error
func (cc *ClusterClient) markUnhealthyWithError(endpoint string, errMsg string) {
if endpoint == "" {
return
}
cc.mu.Lock()
defer cc.mu.Unlock()
if cc.nodeHealth[endpoint] {
log.Warn().
Str("cluster", cc.name).
Str("endpoint", endpoint).
Str("error", errMsg).
Msg("Marking cluster node as unhealthy")
cc.nodeHealth[endpoint] = false
}
if errMsg != "" {
cc.lastError[endpoint] = sanitizeEndpointError(errMsg)
}
cc.lastHealthCheck[endpoint] = time.Now()
}
// clearEndpointError removes any cached error for an endpoint after successful operations
// and marks the endpoint as healthy since the operation succeeded
func (cc *ClusterClient) clearEndpointError(endpoint string) {
if endpoint == "" {
return // avoid recording a phantom healthy endpoint under an empty key
}
cc.mu.Lock()
defer cc.mu.Unlock()
delete(cc.lastError, endpoint)
// Mark endpoint healthy since operation succeeded - this ensures degraded
// clusters recover once endpoints start responding again
cc.nodeHealth[endpoint] = true
}
// recoverUnhealthyNodes attempts to recover unhealthy nodes
func (cc *ClusterClient) recoverUnhealthyNodes(ctx context.Context) {
cc.mu.RLock()
unhealthyEndpoints := make([]string, 0)
throttledEndpoints := make([]string, 0)
now := time.Now()
for endpoint, healthy := range cc.nodeHealth {
if !healthy {
// Skip if we checked this endpoint recently (within 10 seconds)
// Balance between recovery speed and avoiding excessive checks
if lastCheck, exists := cc.lastHealthCheck[endpoint]; exists {
if now.Sub(lastCheck) < 10*time.Second {
throttledEndpoints = append(throttledEndpoints, endpoint)
continue
}
}
unhealthyEndpoints = append(unhealthyEndpoints, endpoint)
}
}
cc.mu.RUnlock()
if len(unhealthyEndpoints) == 0 {
if len(throttledEndpoints) > 0 {
log.Debug().
Str("cluster", cc.name).
Strs("throttledEndpoints", throttledEndpoints).
Msg("Skipping recovery check - endpoints checked recently")
}
return
}
log.Info().
Str("cluster", cc.name).
Strs("unhealthyEndpoints", unhealthyEndpoints).
Int("count", len(unhealthyEndpoints)).
Msg("Attempting to recover unhealthy cluster endpoints")
// Test all unhealthy endpoints concurrently with a short timeout
var wg sync.WaitGroup
recoveredEndpoints := make(chan string, len(unhealthyEndpoints))
for _, endpoint := range unhealthyEndpoints {
wg.Add(1)
go func(ep string) {
defer wg.Done()
// Update last check time
cc.mu.Lock()
cc.lastHealthCheck[ep] = now
cc.mu.Unlock()
// Try to create a client and test connection
// Note: 5-second timeout needed because TLS handshake to Proxmox API
// typically takes ~3 seconds on local networks
cfg := cc.config
cfg.Host = ep
cfg.Fingerprint = cc.getEndpointFingerprint(ep)
cfg.Timeout = 5 * time.Second
testClient, err := NewClient(cfg)
if err != nil {
log.Debug().
Str("cluster", cc.name).
Str("endpoint", ep).
Err(err).
Msg("Failed to create client during recovery attempt")
return
}
// Try a simple API call
testCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
_, err = testClient.GetNodes(testCtx)
cancel()
// Check if error is VM-specific (shouldn't prevent recovery)
vmSpecificErr := err != nil && isVMSpecificError(err.Error())
if err == nil || vmSpecificErr {
// Rebuild the client with the original (full) timeout before storing it.
// Don't discard the error: a failed NewClient would otherwise store a nil
// client that gets dereferenced later.
cfg.Timeout = cc.config.Timeout
fullClient, clientErr := NewClient(cfg)
if clientErr != nil {
log.Warn().
Str("cluster", cc.name).
Str("endpoint", ep).
Err(clientErr).
Msg("Failed to create full client after successful recovery check")
return
}
recoveredEndpoints <- ep
cc.mu.Lock()
cc.nodeHealth[ep] = true
delete(cc.lastError, ep)
cc.lastHealthCheck[ep] = time.Now()
cc.clients[ep] = fullClient
cc.mu.Unlock()
if vmSpecificErr {
log.Info().
Str("cluster", cc.name).
Str("endpoint", ep).
Msg("Recovered unhealthy cluster node (ignoring VM-specific errors)")
} else {
log.Info().
Str("cluster", cc.name).
Str("endpoint", ep).
Msg("Recovered unhealthy cluster node")
}
} else {
log.Debug().
Str("cluster", cc.name).
Str("endpoint", ep).
Err(err).
Msg("Recovery attempt failed - endpoint still unhealthy")
}
}(endpoint)
}
// Wait for all recovery attempts to complete
go func() {
wg.Wait()
close(recoveredEndpoints)
}()
// Count recovered endpoints
recoveredCount := 0
for range recoveredEndpoints {
recoveredCount++
}
// Log recovery summary
if recoveredCount > 0 {
log.Info().
Str("cluster", cc.name).
Int("recovered", recoveredCount).
Int("attempted", len(unhealthyEndpoints)).
Msg("Cluster endpoint recovery completed")
} else if len(unhealthyEndpoints) > 0 {
log.Warn().
Str("cluster", cc.name).
Int("attempted", len(unhealthyEndpoints)).
Strs("failedEndpoints", unhealthyEndpoints).
Msg("No endpoints recovered - cluster may be unreachable from Pulse server")
}
}
// executeWithFailover executes a function with automatic failover
func (cc *ClusterClient) executeWithFailover(ctx context.Context, fn func(*Client) error) error {
baseRetries := len(cc.endpoints)
maxRetries := baseRetries + rateLimitRetryBudget
var lastErr error
log.Debug().
Str("cluster", cc.name).
Int("maxRetries", maxRetries).
Msg("Starting executeWithFailover")
for i := 0; i < maxRetries; i++ {
client, err := cc.getHealthyClient(ctx)
if err != nil {
log.Debug().
Str("cluster", cc.name).
Err(err).
Int("attempt", i+1).
Msg("Failed to get healthy client")
return err
}
// Get the endpoint for this client
var clientEndpoint string
cc.mu.RLock()
for endpoint, c := range cc.clients {
if c == client {
clientEndpoint = endpoint
break
}
}
cc.mu.RUnlock()
// Execute the function
err = fn(client)
if err == nil {
// Clear any stale error for this endpoint on success
cc.clearEndpointError(clientEndpoint)
return nil
}
lastErr = err
// Rate limit - retry with backoff (check before connectivity classification)
if isRateLimited, statusCode := isTransientRateLimitError(err); isRateLimited {
backoff := calculateRateLimitBackoff(i)
cc.applyRateLimitCooldown(clientEndpoint, backoff)
event := log.Warn().
Str("cluster", cc.name).
Str("endpoint", clientEndpoint).
Err(err).
Dur("backoff", backoff).
Int("attempt", i+1)
if statusCode != 0 {
event = event.Int("status", statusCode)
}
event.Msg("Rate limited by cluster node, retrying with backoff")
timer := time.NewTimer(backoff)
select {
case <-ctx.Done():
if !timer.Stop() {
<-timer.C
}
return fmt.Errorf("context canceled while backing off after rate limit: %w", ctx.Err())
case <-timer.C:
}
continue
}
// Auth errors - return immediately without marking endpoint unhealthy
if isAuthError(err) {
return err
}
// Only mark endpoint unhealthy for actual connectivity failures (TCP/DNS/TLS).
// Any HTTP response — even 500 — proves the endpoint is reachable.
if isEndpointConnectivityError(err) {
cc.markUnhealthyWithError(clientEndpoint, err.Error())
log.Warn().
Str("cluster", cc.name).
Str("endpoint", clientEndpoint).
Err(err).
Int("attempt", i+1).
Msg("Connectivity failure on cluster node, trying next")
continue
}
// Endpoint is reachable but this specific request failed (API error, permission
// issue, VM-specific error, etc.). Return without marking endpoint unhealthy.
log.Debug().
Str("cluster", cc.name).
Str("endpoint", clientEndpoint).
Err(err).
Msg("Request-level error, endpoint reachable - not marking unhealthy")
return err
}
if lastErr != nil {
return fmt.Errorf("all cluster nodes failed for %s: %w", cc.name, lastErr)
}
return fmt.Errorf("all cluster nodes failed for %s", cc.name)
}
func (cc *ClusterClient) applyRateLimitCooldown(endpoint string, backoff time.Duration) {
if endpoint == "" {
return
}
cc.mu.Lock()
defer cc.mu.Unlock()
if cc.rateLimitUntil == nil {
cc.rateLimitUntil = make(map[string]time.Time)
}
cc.rateLimitUntil[endpoint] = time.Now().Add(backoff)
}
func calculateRateLimitBackoff(attempt int) time.Duration {
// Linear backoff with jitter keeps retries gentle while avoiding thundering herd
base := rateLimitBaseDelay * time.Duration(attempt+1)
if rateLimitMaxJitter <= 0 {
return base
}
jitter := time.Duration(rand.Int63n(rateLimitMaxJitter.Nanoseconds()+1)) * time.Nanosecond
return base + jitter
}
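
// Worked example with the constants above (jitter is drawn uniformly from
// [0, rateLimitMaxJitter]):
//
//	attempt 0: 150ms base -> 150-350ms total
//	attempt 1: 300ms base -> 300-500ms total
//	attempt 2: 450ms base -> 450-650ms total
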
func isTransientRateLimitError(err error) (bool, int) {
if err == nil {
return false, 0
}
errStr := err.Error()
statusCode := extractStatusCode(errStr)
if statusCode != 0 {
if _, ok := transientRateLimitStatusCodes[statusCode]; ok {
return true, statusCode
}
}
lowerErr := strings.ToLower(errStr)
if strings.Contains(lowerErr, "rate limit") || strings.Contains(lowerErr, "too many requests") {
if statusCode == 0 {
statusCode = 429
}
return true, statusCode
}
return false, statusCode
}
func extractStatusCode(errStr string) int {
matches := statusCodePattern.FindStringSubmatch(errStr)
if len(matches) != 2 {
return 0
}
code, err := strconv.Atoi(matches[1])
if err != nil {
return 0
}
return code
}
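
// Examples of what statusCodePattern extracts (matching is case-insensitive):
//
//	extractStatusCode("api error 429: too many requests") // 429
//	extractStatusCode("unexpected status 503")            // 503
//	extractStatusCode("connection refused")               // 0 (no status present)
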
func isNotImplementedError(errStr string) bool {
lower := strings.ToLower(errStr)
if !strings.Contains(lower, "not implemented") {
return false
}
// Common formatting: "status 501", "error 501", "api error 501" - all contain " 501"
if strings.Contains(lower, " 501") {
return true
}
// Fallback to explicit HTTP status detection
if extractStatusCode(errStr) == 501 {
return true
}
return false
}
// GetHealthStatus returns the health status of all nodes
func (cc *ClusterClient) GetHealthStatus() map[string]bool {
cc.mu.RLock()
defer cc.mu.RUnlock()
status := make(map[string]bool)
for endpoint, healthy := range cc.nodeHealth {
status[endpoint] = healthy
}
return status
}
// EndpointHealth contains health information for a single endpoint
type EndpointHealth struct {
Healthy bool
LastCheck time.Time
LastError string
}
// GetHealthStatusWithErrors returns detailed health status including error messages
func (cc *ClusterClient) GetHealthStatusWithErrors() map[string]EndpointHealth {
cc.mu.RLock()
defer cc.mu.RUnlock()
status := make(map[string]EndpointHealth)
for endpoint, healthy := range cc.nodeHealth {
status[endpoint] = EndpointHealth{
Healthy: healthy,
LastCheck: cc.lastHealthCheck[endpoint],
LastError: cc.lastError[endpoint],
}
}
return status
}
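
// A sketch of how a caller might surface this data; the loop itself is
// illustrative, not part of this package:
//
//	for ep, h := range cc.GetHealthStatusWithErrors() {
//		if !h.Healthy {
//			fmt.Printf("%s unhealthy (last checked %s): %s\n", ep, h.LastCheck, h.LastError)
//		}
//	}
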
// Implement all the Client methods with failover
func (cc *ClusterClient) GetNodes(ctx context.Context) ([]Node, error) {
log.Debug().
Str("cluster", cc.name).
Msg("ClusterClient.GetNodes called")
var result []Node
err := cc.executeWithFailover(ctx, func(client *Client) error {
nodes, err := client.GetNodes(ctx)
if err != nil {
return err
}
result = nodes
return nil
})
if err != nil {
log.Warn().
Str("cluster", cc.name).
Err(err).
Msg("ClusterClient.GetNodes failed")
} else {
log.Info().
Str("cluster", cc.name).
Int("count", len(result)).
Msg("ClusterClient.GetNodes succeeded")
}
return result, err
}
func (cc *ClusterClient) GetNodeStatus(ctx context.Context, node string) (*NodeStatus, error) {
var result *NodeStatus
err := cc.executeWithFailover(ctx, func(client *Client) error {
status, err := client.GetNodeStatus(ctx, node)
if err != nil {
return err
}
result = status
return nil
})
return result, err
}
func (cc *ClusterClient) GetNodeRRDData(ctx context.Context, node, timeframe, cf string, ds []string) ([]NodeRRDPoint, error) {
var result []NodeRRDPoint
err := cc.executeWithFailover(ctx, func(client *Client) error {
points, err := client.GetNodeRRDData(ctx, node, timeframe, cf, ds)
if err != nil {
return err
}
result = points
return nil
})
return result, err
}
func (cc *ClusterClient) GetLXCRRDData(ctx context.Context, node string, vmid int, timeframe, cf string, ds []string) ([]GuestRRDPoint, error) {
var result []GuestRRDPoint
err := cc.executeWithFailover(ctx, func(client *Client) error {
points, err := client.GetLXCRRDData(ctx, node, vmid, timeframe, cf, ds)
if err != nil {
return err
}
result = points
return nil
})
return result, err
}
func (cc *ClusterClient) GetVMRRDData(ctx context.Context, node string, vmid int, timeframe, cf string, ds []string) ([]GuestRRDPoint, error) {
var result []GuestRRDPoint
err := cc.executeWithFailover(ctx, func(client *Client) error {
points, err := client.GetVMRRDData(ctx, node, vmid, timeframe, cf, ds)
if err != nil {
return err
}
result = points
return nil
})
return result, err
}
func (cc *ClusterClient) GetVMs(ctx context.Context, node string) ([]VM, error) {
var result []VM
err := cc.executeWithFailover(ctx, func(client *Client) error {
vms, err := client.GetVMs(ctx, node)
if err != nil {
return err
}
result = vms
return nil
})
// Don't return error for transient connectivity issues - preserve UI state
if err != nil && strings.Contains(err.Error(), "no healthy nodes available") {
log.Debug().
Str("cluster", cc.name).
Str("node", node).
Err(err).
Msg("No healthy nodes for GetVMs - returning empty list to preserve UI state")
return []VM{}, nil
}
return result, err
}
func (cc *ClusterClient) GetContainers(ctx context.Context, node string) ([]Container, error) {
var result []Container
err := cc.executeWithFailover(ctx, func(client *Client) error {
containers, err := client.GetContainers(ctx, node)
if err != nil {
return err
}
result = containers
return nil
})
// Don't return error for transient connectivity issues - preserve UI state
if err != nil && strings.Contains(err.Error(), "no healthy nodes available") {
log.Debug().
Str("cluster", cc.name).
Str("node", node).
Err(err).
Msg("No healthy nodes for GetContainers - returning empty list to preserve UI state")
return []Container{}, nil
}
return result, err
}
func (cc *ClusterClient) GetStorage(ctx context.Context, node string) ([]Storage, error) {
var result []Storage
err := cc.executeWithFailover(ctx, func(client *Client) error {
storage, err := client.GetStorage(ctx, node)
if err != nil {
return err
}
result = storage
return nil
})
return result, err
}
func (cc *ClusterClient) GetAllStorage(ctx context.Context) ([]Storage, error) {
var result []Storage
err := cc.executeWithFailover(ctx, func(client *Client) error {
storage, err := client.GetAllStorage(ctx)
if err != nil {
return err
}
result = storage
return nil
})
return result, err
}
func (cc *ClusterClient) GetBackupTasks(ctx context.Context) ([]Task, error) {
var result []Task
err := cc.executeWithFailover(ctx, func(client *Client) error {
tasks, err := client.GetBackupTasks(ctx)
if err != nil {
return err
}
result = tasks
return nil
})
return result, err
}
func (cc *ClusterClient) GetReplicationStatus(ctx context.Context) ([]ReplicationJob, error) {
var result []ReplicationJob
err := cc.executeWithFailover(ctx, func(client *Client) error {
jobs, err := client.GetReplicationStatus(ctx)
if err != nil {
return err
}
result = jobs
return nil
})
return result, err
}
func (cc *ClusterClient) GetStorageContent(ctx context.Context, node, storage string) ([]StorageContent, error) {
var result []StorageContent
err := cc.executeWithFailover(ctx, func(client *Client) error {
content, err := client.GetStorageContent(ctx, node, storage)
if err != nil {
return err
}
result = content
return nil
})
return result, err
}
// GetCephStatus returns Ceph cluster status information with failover support.
func (cc *ClusterClient) GetCephStatus(ctx context.Context) (*CephStatus, error) {
var result *CephStatus
err := cc.executeWithFailover(ctx, func(client *Client) error {
status, err := client.GetCephStatus(ctx)
if err != nil {
return err
}
result = status
return nil
})
return result, err
}
// GetCephDF returns Ceph capacity information with failover support.
func (cc *ClusterClient) GetCephDF(ctx context.Context) (*CephDF, error) {
var result *CephDF
err := cc.executeWithFailover(ctx, func(client *Client) error {
df, err := client.GetCephDF(ctx)
if err != nil {
return err
}
result = df
return nil
})
return result, err
}
func (cc *ClusterClient) GetVMSnapshots(ctx context.Context, node string, vmid int) ([]Snapshot, error) {
var result []Snapshot
err := cc.executeWithFailover(ctx, func(client *Client) error {
snapshots, err := client.GetVMSnapshots(ctx, node, vmid)
if err != nil {
return err
}
result = snapshots
return nil
})
return result, err
}
func (cc *ClusterClient) GetContainerSnapshots(ctx context.Context, node string, vmid int) ([]Snapshot, error) {
var result []Snapshot
err := cc.executeWithFailover(ctx, func(client *Client) error {
snapshots, err := client.GetContainerSnapshots(ctx, node, vmid)
if err != nil {
return err
}
result = snapshots
return nil
})
return result, err
}
func (cc *ClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*VMStatus, error) {
var result *VMStatus
err := cc.executeWithFailover(ctx, func(client *Client) error {
status, err := client.GetVMStatus(ctx, node, vmid)
if err != nil {
return err
}
result = status
return nil
})
return result, err
}
func (cc *ClusterClient) GetVMConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
var result map[string]interface{}
err := cc.executeWithFailover(ctx, func(client *Client) error {
config, err := client.GetVMConfig(ctx, node, vmid)
if err != nil {
return err
}
result = config
return nil
})
return result, err
}
func (cc *ClusterClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
var result map[string]interface{}
err := cc.executeWithFailover(ctx, func(client *Client) error {
info, err := client.GetVMAgentInfo(ctx, node, vmid)
if err != nil {
return err
}
result = info
return nil
})
return result, err
}
// GetVMAgentVersion returns the guest agent version for the VM.
func (cc *ClusterClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
var version string
err := cc.executeWithFailover(ctx, func(client *Client) error {
v, err := client.GetVMAgentVersion(ctx, node, vmid)
if err != nil {
return err
}
version = v
return nil
})
return version, err
}
// GetVMFSInfo returns filesystem information from QEMU guest agent
func (cc *ClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]VMFileSystem, error) {
var result []VMFileSystem
err := cc.executeWithFailover(ctx, func(client *Client) error {
info, err := client.GetVMFSInfo(ctx, node, vmid)
if err != nil {
return err
}
result = info
return nil
})
return result, err
}
// GetVMNetworkInterfaces returns guest network interfaces from the QEMU agent
func (cc *ClusterClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]VMNetworkInterface, error) {
var result []VMNetworkInterface
err := cc.executeWithFailover(ctx, func(client *Client) error {
interfaces, err := client.GetVMNetworkInterfaces(ctx, node, vmid)
if err != nil {
return err
}
result = interfaces
return nil
})
return result, err
}
// GetVMMemAvailableFromAgent reads /proc/meminfo via the QEMU guest agent to get MemAvailable.
func (cc *ClusterClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
var result uint64
err := cc.executeWithFailover(ctx, func(client *Client) error {
available, err := client.GetVMMemAvailableFromAgent(ctx, node, vmid)
if err != nil {
return err
}
result = available
return nil
})
return result, err
}
// GetClusterResources returns all resources (VMs, containers) across the cluster in a single call
func (cc *ClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]ClusterResource, error) {
var result []ClusterResource
err := cc.executeWithFailover(ctx, func(client *Client) error {
resources, err := client.GetClusterResources(ctx, resourceType)
if err != nil {
return err
}
result = resources
return nil
})
return result, err
}
// GetContainerStatus returns the status of a specific container
func (cc *ClusterClient) GetContainerStatus(ctx context.Context, node string, vmid int) (*Container, error) {
var result *Container
err := cc.executeWithFailover(ctx, func(client *Client) error {
status, err := client.GetContainerStatus(ctx, node, vmid)
if err != nil {
return err
}
result = status
return nil
})
return result, err
}
// GetContainerConfig returns the configuration of a specific container
func (cc *ClusterClient) GetContainerConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
var result map[string]interface{}
err := cc.executeWithFailover(ctx, func(client *Client) error {
config, err := client.GetContainerConfig(ctx, node, vmid)
if err != nil {
return err
}
result = config
return nil
})
return result, err
}
// GetContainerInterfaces returns interface details for a container
func (cc *ClusterClient) GetContainerInterfaces(ctx context.Context, node string, vmid int) ([]ContainerInterface, error) {
var result []ContainerInterface
err := cc.executeWithFailover(ctx, func(client *Client) error {
interfaces, err := client.GetContainerInterfaces(ctx, node, vmid)
if err != nil {
return err
}
result = interfaces
return nil
})
return result, err
}
// IsClusterMember checks if this node is part of a cluster
func (cc *ClusterClient) IsClusterMember(ctx context.Context) (bool, error) {
var result bool
err := cc.executeWithFailover(ctx, func(client *Client) error {
isMember, err := client.IsClusterMember(ctx)
if err != nil {
return err
}
result = isMember
return nil
})
return result, err
}
// GetZFSPoolStatus returns ZFS pool status for a node
func (cc *ClusterClient) GetZFSPoolStatus(ctx context.Context, node string) ([]ZFSPoolStatus, error) {
var result []ZFSPoolStatus
err := cc.executeWithFailover(ctx, func(client *Client) error {
pools, err := client.GetZFSPoolStatus(ctx, node)
if err != nil {
return err
}
result = pools
return nil
})
return result, err
}
// GetZFSPoolsWithDetails returns ZFS pools with full details for a node
func (cc *ClusterClient) GetZFSPoolsWithDetails(ctx context.Context, node string) ([]ZFSPoolInfo, error) {
var result []ZFSPoolInfo
err := cc.executeWithFailover(ctx, func(client *Client) error {
pools, err := client.GetZFSPoolsWithDetails(ctx, node)
if err != nil {
return err
}
result = pools
return nil
})
return result, err
}
// GetDisks returns physical disk information for a node with failover support
func (cc *ClusterClient) GetDisks(ctx context.Context, node string) ([]Disk, error) {
var result []Disk
err := cc.executeWithFailover(ctx, func(client *Client) error {
disks, err := client.GetDisks(ctx, node)
if err != nil {
return err
}
result = disks
return nil
})
// Don't return error for transient connectivity issues
if err != nil && strings.Contains(err.Error(), "no healthy nodes available") {
log.Debug().
Str("cluster", cc.name).
Str("node", node).
Err(err).
Msg("No healthy nodes for GetDisks - returning empty list")
return []Disk{}, nil
}
return result, err
}
// GetNodePendingUpdates returns pending apt updates for a node with failover support
func (cc *ClusterClient) GetNodePendingUpdates(ctx context.Context, node string) ([]AptPackage, error) {
var result []AptPackage
err := cc.executeWithFailover(ctx, func(client *Client) error {
pkgs, err := client.GetNodePendingUpdates(ctx, node)
if err != nil {
return err
}
result = pkgs
return nil
})
// Don't return error for transient connectivity issues or permission issues
if err != nil && (strings.Contains(err.Error(), "no healthy nodes available") ||
strings.Contains(err.Error(), "403") || strings.Contains(err.Error(), "permission")) {
log.Debug().
Str("cluster", cc.name).
Str("node", node).
Err(err).
Msg("Could not get pending updates - returning empty list")
return []AptPackage{}, nil
}
return result, err
}
// GetClusterStatus returns the cluster status including all nodes with failover support.
func (cc *ClusterClient) GetClusterStatus(ctx context.Context) ([]ClusterStatus, error) {
var result []ClusterStatus
err := cc.executeWithFailover(ctx, func(client *Client) error {
status, err := client.GetClusterStatus(ctx)
if err != nil {
return err
}
result = status
return nil
})
return result, err
}
// IsQuorate checks if the cluster has quorum by querying the Proxmox cluster status.
// Returns true if the cluster is quorate (has enough votes for consensus), false otherwise.
// This is the authoritative check for cluster health - a cluster with quorum is healthy
// even if some nodes are intentionally offline (e.g., backup nodes not running).
func (cc *ClusterClient) IsQuorate(ctx context.Context) (bool, error) {
status, err := cc.GetClusterStatus(ctx)
if err != nil {
return false, err
}
// Look for the cluster entry which has the quorate field
for _, s := range status {
if s.Type == "cluster" {
return s.Quorate == 1, nil
}
}
// If no cluster entry found, this might be a standalone node - consider it healthy
return true, nil
}
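
// Illustrative use for health gating; the policy shown is an assumption, not
// part of this package:
//
//	quorate, err := cc.IsQuorate(ctx)
//	if err == nil && !quorate {
//		// treat the cluster as degraded even though individual nodes respond
//	}
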
// GetClusterOptions fetches datacenter options (e.g. tag colour map) via the first healthy node.
func (cc *ClusterClient) GetClusterOptions(ctx context.Context) (*ClusterOptions, error) {
var result *ClusterOptions
err := cc.executeWithFailover(ctx, func(client *Client) error {
opts, err := client.GetClusterOptions(ctx)
if err != nil {
return err
}
result = opts
return nil
})
return result, err
}
// isAuthError checks if an error is an authentication error
func isAuthError(err error) bool {
if err == nil {
return false
}
errStr := err.Error()
return strings.Contains(errStr, "authentication") ||
strings.Contains(errStr, "401") ||
strings.Contains(errStr, "403")
}