package proxmox import ( "context" "fmt" "math/rand" "regexp" "strconv" "strings" "sync" "time" "github.com/rs/zerolog/log" ) // ClusterClient wraps multiple Proxmox clients for cluster-aware operations type ClusterClient struct { mu sync.RWMutex name string clients map[string]*Client // Key is node name endpoints []string // All available endpoints endpointFingerprints map[string]string // Per-endpoint TLS fingerprints (TOFU) nodeHealth map[string]bool // Track node health lastHealthCheck map[string]time.Time // Track last health check time lastError map[string]string // Track last error per endpoint config ClientConfig // Base config (auth info) rateLimitUntil map[string]time.Time // Cooldown window for rate-limited endpoints } const ( rateLimitBaseDelay = 150 * time.Millisecond rateLimitMaxJitter = 200 * time.Millisecond rateLimitRetryBudget = 2 ) var statusCodePattern = regexp.MustCompile(`(?i)(?:api error|status)\s+(\d{3})`) var transientRateLimitStatusCodes = map[int]struct{}{ 408: {}, 425: {}, // Too Early 429: {}, 502: {}, 503: {}, 504: {}, } // isVMSpecificError reports whether an error string is scoped to a single VM/guest agent // and should not be treated as a node connectivity failure. func isVMSpecificError(errStr string) bool { if errStr == "" { return false } lower := strings.ToLower(errStr) if strings.Contains(lower, "no qemu guest agent") || strings.Contains(lower, "qemu guest agent is not running") || strings.Contains(lower, "guest agent") { return true } // QMP guest agent operations can time out or fail per-VM (e.g. guest-get-fsinfo). // These aren't node connectivity issues and should not mark endpoints unhealthy. if strings.Contains(lower, "qmp command") { return true } if strings.Contains(lower, "guest-get-") { return true } return false } // isEndpointConnectivityError reports whether an error indicates the endpoint // itself is unreachable (TCP/DNS/TLS failure). Any error that carries an HTTP // response — even a 500 — proves the endpoint is reachable, so those are NOT // connectivity errors. func isEndpointConnectivityError(err error) bool { if err == nil { return false } errStr := strings.ToLower(err.Error()) // If we received an HTTP response from Proxmox (any status code), // the endpoint is reachable. if strings.Contains(errStr, "api error") { return false } // TCP/DNS connectivity failures if strings.Contains(errStr, "connection refused") || strings.Contains(errStr, "connection reset") || strings.Contains(errStr, "no such host") || strings.Contains(errStr, "network is unreachable") || strings.Contains(errStr, "no route to host") || strings.Contains(errStr, "i/o timeout") || strings.Contains(errStr, "dial tcp") || strings.Contains(errStr, "dial:") { return true } // TLS failures if strings.Contains(errStr, "tls handshake") || strings.Contains(errStr, "tls:") || strings.Contains(errStr, "certificate") || strings.Contains(errStr, "fingerprint mismatch") { return true } return false } // sanitizeEndpointError transforms raw Go errors into user-friendly messages // for display in the UI. The original error is preserved in logs. func sanitizeEndpointError(errMsg string) string { if errMsg == "" { return errMsg } lower := strings.ToLower(errMsg) // Context deadline exceeded - usually means slow API response if strings.Contains(lower, "context deadline exceeded") { // Check for specific causes if strings.Contains(lower, "/storage") { return "Request timed out - storage API slow (check for unreachable PBS/NFS/Ceph backends)" } if strings.Contains(lower, "pbs-") || strings.Contains(lower, ":8007") { return "Request timed out - PBS storage backend unreachable" } return "Request timed out - Proxmox API may be slow or waiting on unreachable backend services" } // Client timeout - similar to context deadline if strings.Contains(lower, "client.timeout exceeded") { return "Connection timed out - Proxmox API not responding in time" } // Connection refused if strings.Contains(lower, "connection refused") { return "Connection refused - Proxmox API not running or firewall blocking" } // No route to host if strings.Contains(lower, "no route to host") { return "Network unreachable - check network connectivity to Proxmox host" } // TLS/certificate errors if strings.Contains(lower, "certificate") || strings.Contains(lower, "x509") { return "TLS certificate error - check SSL settings or add fingerprint" } // Auth errors - keep these specific if strings.Contains(lower, "authentication") || strings.Contains(lower, "401") || strings.Contains(lower, "403") { return "Authentication failed - check API token or credentials" } // PBS-specific errors if strings.Contains(lower, "can't connect to") && strings.Contains(lower, ":8007") { return "PBS storage unreachable - check Proxmox Backup Server connectivity" } // Return original if no transformation applies return errMsg } // NewClusterClient creates a new cluster-aware client. // endpointFingerprints is an optional map of endpoint URL -> TLS fingerprint for per-node certificate verification. // This enables TOFU (Trust On First Use) for clusters with unique self-signed certs per node. func NewClusterClient(name string, config ClientConfig, endpoints []string, endpointFingerprints map[string]string) *ClusterClient { if endpointFingerprints == nil { endpointFingerprints = make(map[string]string) } cc := &ClusterClient{ name: name, clients: make(map[string]*Client), endpoints: endpoints, endpointFingerprints: endpointFingerprints, nodeHealth: make(map[string]bool), lastHealthCheck: make(map[string]time.Time), lastError: make(map[string]string), config: config, rateLimitUntil: make(map[string]time.Time), } // Initialize all endpoints as unknown (will be tested on first use) // Start optimistically - assume healthy until proven otherwise // This allows operations to be attempted even if initial health check fails for _, endpoint := range endpoints { cc.nodeHealth[endpoint] = true // Start optimistic, will be marked unhealthy if operations fail } // Do a quick parallel health check on initialization (synchronous to avoid race) // This will mark unhealthy nodes but won't prevent trying them later cc.initialHealthCheck() return cc } // getEndpointFingerprint returns the TLS fingerprint to use for a specific endpoint. // It prefers the per-endpoint fingerprint (TOFU) over the base config fingerprint. func (cc *ClusterClient) getEndpointFingerprint(endpoint string) string { if fp, ok := cc.endpointFingerprints[endpoint]; ok && fp != "" { return fp } return cc.config.Fingerprint } // initialHealthCheck performs a quick parallel health check on all endpoints func (cc *ClusterClient) initialHealthCheck() { // Skip initial health check if there's only one endpoint // For single-endpoint clusters (using main host for routing), assume healthy if len(cc.endpoints) == 1 { log.Info(). Str("cluster", cc.name). Str("endpoint", cc.endpoints[0]). Msg("Single endpoint cluster - skipping initial health check") return } // For multi-node clusters, do a very quick check but don't mark unhealthy immediately // This prevents nodes from being marked unhealthy due to temporary startup conditions var wg sync.WaitGroup for _, endpoint := range cc.endpoints { wg.Add(1) go func(ep string) { defer wg.Done() // Try a quick connection test with slightly longer timeout for initial check cfg := cc.config cfg.Host = ep cfg.Fingerprint = cc.getEndpointFingerprint(ep) cfg.Timeout = 5 * time.Second testClient, err := NewClient(cfg) if err != nil { cc.mu.Lock() cc.nodeHealth[ep] = false cc.lastError[ep] = sanitizeEndpointError(err.Error()) cc.lastHealthCheck[ep] = time.Now() cc.mu.Unlock() log.Info(). Str("cluster", cc.name). Str("endpoint", ep). Err(err). Msg("Cluster endpoint marked unhealthy on initialization") return } // Quick test with slightly longer timeout for initial check ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) _, err = testClient.GetNodes(ctx) cancel() cc.mu.Lock() // Check if error is VM-specific (shouldn't affect health) vmSpecificErr := err != nil && isVMSpecificError(err.Error()) if err == nil || vmSpecificErr { // Node is healthy - create a proper client with full timeout for actual use fullCfg := cc.config fullCfg.Host = ep fullCfg.Fingerprint = cc.getEndpointFingerprint(ep) fullClient, clientErr := NewClient(fullCfg) if clientErr != nil { cc.nodeHealth[ep] = false cc.lastError[ep] = sanitizeEndpointError(clientErr.Error()) cc.lastHealthCheck[ep] = time.Now() log.Warn(). Str("cluster", cc.name). Str("endpoint", ep). Err(clientErr). Msg("Failed to create full client after successful health check") } else { cc.nodeHealth[ep] = true delete(cc.lastError, ep) cc.lastHealthCheck[ep] = time.Now() cc.clients[ep] = fullClient // Store the full client, not test client if vmSpecificErr { log.Debug(). Str("cluster", cc.name). Str("endpoint", ep). Msg("Cluster endpoint healthy despite VM-specific errors") } else { log.Info(). Str("cluster", cc.name). Str("endpoint", ep). Msg("Cluster endpoint passed initial health check") } } } else { // Real connectivity issue cc.nodeHealth[ep] = false cc.lastError[ep] = sanitizeEndpointError(err.Error()) cc.lastHealthCheck[ep] = time.Now() log.Info(). Str("cluster", cc.name). Str("endpoint", ep). Err(err). Msg("Cluster endpoint failed initial health check") } cc.mu.Unlock() }(endpoint) } // Wait for all checks to complete wg.Wait() log.Info(). Str("cluster", cc.name). Int("total", len(cc.endpoints)). Msg("Initial cluster health check completed") } // getHealthyClient returns a healthy client using round-robin selection func (cc *ClusterClient) getHealthyClient(ctx context.Context) (*Client, error) { cc.mu.Lock() defer cc.mu.Unlock() // Get list of healthy endpoints var healthyEndpoints []string var coolingEndpoints []string now := time.Now() for endpoint, healthy := range cc.nodeHealth { if healthy { if cooldown, exists := cc.rateLimitUntil[endpoint]; exists { if now.Before(cooldown) { coolingEndpoints = append(coolingEndpoints, endpoint) continue } delete(cc.rateLimitUntil, endpoint) } healthyEndpoints = append(healthyEndpoints, endpoint) } } if len(healthyEndpoints) == 0 && len(coolingEndpoints) > 0 { // Nothing is immediately available, fall back to endpoints that are in cooldown healthyEndpoints = append(healthyEndpoints, coolingEndpoints...) } // Count unhealthy endpoints for logging and recovery decisions unhealthyCount := 0 for _, healthy := range cc.nodeHealth { if !healthy { unhealthyCount++ } } // Log at warn level if no healthy endpoints to aid troubleshooting if len(healthyEndpoints) == 0 && len(coolingEndpoints) == 0 { log.Warn(). Str("cluster", cc.name). Int("healthy", len(healthyEndpoints)). Int("total", len(cc.nodeHealth)). Interface("nodeHealth", cc.nodeHealth). Msg("No healthy endpoints available - attempting recovery") } else { log.Debug(). Str("cluster", cc.name). Int("healthy", len(healthyEndpoints)). Int("cooling", len(coolingEndpoints)). Int("total", len(cc.nodeHealth)). Interface("nodeHealth", cc.nodeHealth). Msg("Checking for healthy endpoints") } // Trigger recovery if we have any unhealthy endpoints // This ensures degraded clusters recover individual nodes over time, // not just when all nodes are down if unhealthyCount > 0 { // Use an anonymous function to ensure the lock is re-acquired even if // recoverUnhealthyNodes panics, preventing double-unlock from defer func() { cc.mu.Unlock() defer cc.mu.Lock() cc.recoverUnhealthyNodes(ctx) }() // Refresh the healthy/cooling endpoints lists after recovery attempt // since cluster state may have changed while lock was released healthyEndpoints = nil coolingEndpoints = nil now = time.Now() // Refresh time for accurate cooldown checks for endpoint, healthy := range cc.nodeHealth { if healthy { if cooldown, exists := cc.rateLimitUntil[endpoint]; exists && now.Before(cooldown) { coolingEndpoints = append(coolingEndpoints, endpoint) continue } healthyEndpoints = append(healthyEndpoints, endpoint) } } // Re-apply cooldown fallback if no healthy endpoints but some cooling if len(healthyEndpoints) == 0 && len(coolingEndpoints) > 0 { healthyEndpoints = append(healthyEndpoints, coolingEndpoints...) } } if len(healthyEndpoints) == 0 { // If still no healthy endpoints and we only have one endpoint, // try to use it anyway (could be temporarily unreachable) if len(cc.endpoints) == 1 { log.Warn(). Str("cluster", cc.name). Str("endpoint", cc.endpoints[0]). Msg("Single endpoint appears unhealthy but attempting to use it anyway") healthyEndpoints = cc.endpoints // Mark it as healthy optimistically cc.nodeHealth[cc.endpoints[0]] = true } else { // Provide detailed error with endpoint status unhealthyList := make([]string, 0, len(cc.endpoints)) for _, ep := range cc.endpoints { if !cc.nodeHealth[ep] { unhealthyList = append(unhealthyList, ep) } } log.Error(). Str("cluster", cc.name). Strs("unhealthyEndpoints", unhealthyList). Int("totalEndpoints", len(cc.endpoints)). Msg("All cluster endpoints are unhealthy - verify network connectivity and API accessibility from Pulse server") return nil, fmt.Errorf("no healthy nodes available in cluster %s (all %d endpoints unreachable: %v)", cc.name, len(cc.endpoints), unhealthyList) } } // Use random selection for better load distribution selectedEndpoint := healthyEndpoints[rand.Intn(len(healthyEndpoints))] // Get or create client for this endpoint client, exists := cc.clients[selectedEndpoint] if !exists { // Create new client with shorter timeout for initial test cfg := cc.config cfg.Host = selectedEndpoint cfg.Fingerprint = cc.getEndpointFingerprint(selectedEndpoint) // First try with a short timeout to quickly detect offline nodes testCfg := cfg testCfg.Timeout = 3 * time.Second testClient, err := NewClient(testCfg) if err != nil { // Mark as unhealthy cc.nodeHealth[selectedEndpoint] = false log.Debug(). Str("cluster", cc.name). Str("endpoint", selectedEndpoint). Err(err). Msg("Failed to create client for cluster endpoint") return nil, fmt.Errorf("failed to create client for %s: %w", selectedEndpoint, err) } // Connectivity test - 5 seconds to allow for TLS handshake (~3s typical) testCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) testNodes, testErr := testClient.GetNodes(testCtx) cancel() if testErr != nil { // Check if this is a transient rate limit error that shouldn't mark the node unhealthy if isRateLimited, _ := isTransientRateLimitError(testErr); isRateLimited { log.Debug(). Str("cluster", cc.name). Str("endpoint", selectedEndpoint). Err(testErr). Msg("Ignoring transient rate limit error during connectivity test") // Continue with client creation since the node is accessible, just rate limited } else { // Check if this is a VM-specific error that shouldn't mark the node unhealthy testErrStr := testErr.Error() if strings.Contains(testErrStr, "No QEMU guest agent") || strings.Contains(testErrStr, "QEMU guest agent is not running") || strings.Contains(testErrStr, "guest agent") { // This is a VM-specific issue, not a connectivity problem // The node is actually healthy, so don't mark it unhealthy log.Debug(). Str("cluster", cc.name). Str("endpoint", selectedEndpoint). Err(testErr). Msg("Ignoring VM-specific error during connectivity test") // Continue with client creation since the node is actually accessible } else { // Mark as unhealthy for real connectivity issues cc.nodeHealth[selectedEndpoint] = false log.Warn(). Str("cluster", cc.name). Str("endpoint", selectedEndpoint). Err(testErr). Msg("Failed to connect to Proxmox endpoint; endpoint removed from rotation until next refresh") return nil, fmt.Errorf("endpoint %s failed connectivity test: %w", selectedEndpoint, testErr) } } } log.Debug(). Str("cluster", cc.name). Str("endpoint", selectedEndpoint). Int("nodes", len(testNodes)). Msg("Cluster endpoint passed connectivity test") // Clear any stale error from previous failures now that connectivity succeeded delete(cc.lastError, selectedEndpoint) // Create the actual client with full timeout newClient, err := NewClient(cfg) if err != nil { // This shouldn't happen since we just tested it cc.nodeHealth[selectedEndpoint] = false return nil, fmt.Errorf("failed to create client for %s: %w", selectedEndpoint, err) } cc.clients[selectedEndpoint] = newClient client = newClient } return client, nil } // markUnhealthyWithError marks an endpoint as unhealthy and captures the error func (cc *ClusterClient) markUnhealthyWithError(endpoint string, errMsg string) { cc.mu.Lock() defer cc.mu.Unlock() if cc.nodeHealth[endpoint] { log.Warn(). Str("cluster", cc.name). Str("endpoint", endpoint). Str("error", errMsg). Msg("Marking cluster node as unhealthy") cc.nodeHealth[endpoint] = false } if errMsg != "" { cc.lastError[endpoint] = sanitizeEndpointError(errMsg) } cc.lastHealthCheck[endpoint] = time.Now() } // clearEndpointError removes any cached error for an endpoint after successful operations // and marks the endpoint as healthy since the operation succeeded func (cc *ClusterClient) clearEndpointError(endpoint string) { cc.mu.Lock() defer cc.mu.Unlock() delete(cc.lastError, endpoint) // Mark endpoint healthy since operation succeeded - this ensures degraded // clusters recover once endpoints start responding again cc.nodeHealth[endpoint] = true } // recoverUnhealthyNodes attempts to recover unhealthy nodes func (cc *ClusterClient) recoverUnhealthyNodes(ctx context.Context) { cc.mu.RLock() unhealthyEndpoints := make([]string, 0) throttledEndpoints := make([]string, 0) now := time.Now() for endpoint, healthy := range cc.nodeHealth { if !healthy { // Skip if we checked this endpoint recently (within 10 seconds) // Balance between recovery speed and avoiding excessive checks if lastCheck, exists := cc.lastHealthCheck[endpoint]; exists { if now.Sub(lastCheck) < 10*time.Second { throttledEndpoints = append(throttledEndpoints, endpoint) continue } } unhealthyEndpoints = append(unhealthyEndpoints, endpoint) } } cc.mu.RUnlock() if len(unhealthyEndpoints) == 0 { if len(throttledEndpoints) > 0 { log.Debug(). Str("cluster", cc.name). Strs("throttledEndpoints", throttledEndpoints). Msg("Skipping recovery check - endpoints checked recently") } return } log.Info(). Str("cluster", cc.name). Strs("unhealthyEndpoints", unhealthyEndpoints). Int("count", len(unhealthyEndpoints)). Msg("Attempting to recover unhealthy cluster endpoints") // Test all unhealthy endpoints concurrently with a short timeout var wg sync.WaitGroup recoveredEndpoints := make(chan string, len(unhealthyEndpoints)) for _, endpoint := range unhealthyEndpoints { wg.Add(1) go func(ep string) { defer wg.Done() // Update last check time cc.mu.Lock() cc.lastHealthCheck[ep] = now cc.mu.Unlock() // Try to create a client and test connection // Note: 5-second timeout needed because TLS handshake to Proxmox API // typically takes ~3 seconds on local networks cfg := cc.config cfg.Host = ep cfg.Fingerprint = cc.getEndpointFingerprint(ep) cfg.Timeout = 5 * time.Second testClient, err := NewClient(cfg) if err != nil { log.Debug(). Str("cluster", cc.name). Str("endpoint", ep). Err(err). Msg("Failed to create client during recovery attempt") return } // Try a simple API call testCtx, cancel := context.WithTimeout(ctx, 5*time.Second) _, err = testClient.GetNodes(testCtx) cancel() // Check if error is VM-specific (shouldn't prevent recovery) vmSpecificErr := err != nil && isVMSpecificError(err.Error()) if err == nil || vmSpecificErr { recoveredEndpoints <- ep // Store the client with original timeout cfg.Timeout = cc.config.Timeout fullClient, _ := NewClient(cfg) cc.mu.Lock() cc.nodeHealth[ep] = true delete(cc.lastError, ep) cc.lastHealthCheck[ep] = time.Now() cc.clients[ep] = fullClient cc.mu.Unlock() if vmSpecificErr { log.Info(). Str("cluster", cc.name). Str("endpoint", ep). Msg("Recovered unhealthy cluster node (ignoring VM-specific errors)") } else { log.Info(). Str("cluster", cc.name). Str("endpoint", ep). Msg("Recovered unhealthy cluster node") } } else { log.Debug(). Str("cluster", cc.name). Str("endpoint", ep). Err(err). Msg("Recovery attempt failed - endpoint still unhealthy") } }(endpoint) } // Wait for all recovery attempts to complete go func() { wg.Wait() close(recoveredEndpoints) }() // Count recovered endpoints recoveredCount := 0 for range recoveredEndpoints { recoveredCount++ } // Log recovery summary if recoveredCount > 0 { log.Info(). Str("cluster", cc.name). Int("recovered", recoveredCount). Int("attempted", len(unhealthyEndpoints)). Msg("Cluster endpoint recovery completed") } else if len(unhealthyEndpoints) > 0 { log.Warn(). Str("cluster", cc.name). Int("attempted", len(unhealthyEndpoints)). Strs("failedEndpoints", unhealthyEndpoints). Msg("No endpoints recovered - cluster may be unreachable from Pulse server") } } // executeWithFailover executes a function with automatic failover func (cc *ClusterClient) executeWithFailover(ctx context.Context, fn func(*Client) error) error { baseRetries := len(cc.endpoints) maxRetries := baseRetries + rateLimitRetryBudget var lastErr error log.Debug(). Str("cluster", cc.name). Int("maxRetries", maxRetries). Msg("Starting executeWithFailover") for i := 0; i < maxRetries; i++ { client, err := cc.getHealthyClient(ctx) if err != nil { log.Debug(). Str("cluster", cc.name). Err(err). Int("attempt", i+1). Msg("Failed to get healthy client") return err } // Get the endpoint for this client var clientEndpoint string cc.mu.RLock() for endpoint, c := range cc.clients { if c == client { clientEndpoint = endpoint break } } cc.mu.RUnlock() // Execute the function err = fn(client) if err == nil { // Clear any stale error for this endpoint on success cc.clearEndpointError(clientEndpoint) return nil } lastErr = err // Rate limit - retry with backoff (check before connectivity classification) if isRateLimited, statusCode := isTransientRateLimitError(err); isRateLimited { backoff := calculateRateLimitBackoff(i) cc.applyRateLimitCooldown(clientEndpoint, backoff) event := log.Warn(). Str("cluster", cc.name). Str("endpoint", clientEndpoint). Err(err). Dur("backoff", backoff). Int("attempt", i+1) if statusCode != 0 { event = event.Int("status", statusCode) } event.Msg("Rate limited by cluster node, retrying with backoff") timer := time.NewTimer(backoff) select { case <-ctx.Done(): if !timer.Stop() { <-timer.C } return fmt.Errorf("context canceled while backing off after rate limit: %w", ctx.Err()) case <-timer.C: } continue } // Auth errors - return immediately without marking endpoint unhealthy if isAuthError(err) { return err } // Only mark endpoint unhealthy for actual connectivity failures (TCP/DNS/TLS). // Any HTTP response — even 500 — proves the endpoint is reachable. if isEndpointConnectivityError(err) { cc.markUnhealthyWithError(clientEndpoint, err.Error()) log.Warn(). Str("cluster", cc.name). Str("endpoint", clientEndpoint). Err(err). Int("attempt", i+1). Msg("Connectivity failure on cluster node, trying next") continue } // Endpoint is reachable but this specific request failed (API error, permission // issue, VM-specific error, etc.). Return without marking endpoint unhealthy. log.Debug(). Str("cluster", cc.name). Str("endpoint", clientEndpoint). Err(err). Msg("Request-level error, endpoint reachable - not marking unhealthy") return err } if lastErr != nil { return fmt.Errorf("all cluster nodes failed for %s: %w", cc.name, lastErr) } return fmt.Errorf("all cluster nodes failed for %s", cc.name) } func (cc *ClusterClient) applyRateLimitCooldown(endpoint string, backoff time.Duration) { if endpoint == "" { return } cc.mu.Lock() defer cc.mu.Unlock() if cc.rateLimitUntil == nil { cc.rateLimitUntil = make(map[string]time.Time) } cc.rateLimitUntil[endpoint] = time.Now().Add(backoff) } func calculateRateLimitBackoff(attempt int) time.Duration { // Linear backoff with jitter keeps retries gentle while avoiding thundering herd base := rateLimitBaseDelay * time.Duration(attempt+1) if rateLimitMaxJitter <= 0 { return base } jitter := time.Duration(rand.Int63n(rateLimitMaxJitter.Nanoseconds()+1)) * time.Nanosecond return base + jitter } func isTransientRateLimitError(err error) (bool, int) { if err == nil { return false, 0 } errStr := err.Error() statusCode := extractStatusCode(errStr) if statusCode != 0 { if _, ok := transientRateLimitStatusCodes[statusCode]; ok { return true, statusCode } } lowerErr := strings.ToLower(errStr) if strings.Contains(lowerErr, "rate limit") || strings.Contains(lowerErr, "too many requests") { if statusCode == 0 { statusCode = 429 } return true, statusCode } return false, statusCode } func extractStatusCode(errStr string) int { matches := statusCodePattern.FindStringSubmatch(errStr) if len(matches) != 2 { return 0 } code, err := strconv.Atoi(matches[1]) if err != nil { return 0 } return code } func isNotImplementedError(errStr string) bool { lower := strings.ToLower(errStr) if !strings.Contains(lower, "not implemented") { return false } // Common formatting: "status 501", "error 501", "api error 501" if strings.Contains(lower, " 501") || strings.Contains(lower, "status 501") || strings.Contains(lower, "error 501") { return true } // Fallback to explicit HTTP status detection if extractStatusCode(errStr) == 501 { return true } return false } // GetHealthStatus returns the health status of all nodes func (cc *ClusterClient) GetHealthStatus() map[string]bool { cc.mu.RLock() defer cc.mu.RUnlock() status := make(map[string]bool) for endpoint, healthy := range cc.nodeHealth { status[endpoint] = healthy } return status } // EndpointHealth contains health information for a single endpoint type EndpointHealth struct { Healthy bool LastCheck time.Time LastError string } // GetHealthStatusWithErrors returns detailed health status including error messages func (cc *ClusterClient) GetHealthStatusWithErrors() map[string]EndpointHealth { cc.mu.RLock() defer cc.mu.RUnlock() status := make(map[string]EndpointHealth) for endpoint, healthy := range cc.nodeHealth { status[endpoint] = EndpointHealth{ Healthy: healthy, LastCheck: cc.lastHealthCheck[endpoint], LastError: cc.lastError[endpoint], } } return status } // Implement all the Client methods with failover func (cc *ClusterClient) GetNodes(ctx context.Context) ([]Node, error) { log.Debug(). Str("cluster", cc.name). Msg("ClusterClient.GetNodes called") var result []Node err := cc.executeWithFailover(ctx, func(client *Client) error { nodes, err := client.GetNodes(ctx) if err != nil { return err } result = nodes return nil }) if err != nil { log.Warn(). Str("cluster", cc.name). Err(err). Msg("ClusterClient.GetNodes failed") } else { log.Info(). Str("cluster", cc.name). Int("count", len(result)). Msg("ClusterClient.GetNodes succeeded") } return result, err } func (cc *ClusterClient) GetNodeStatus(ctx context.Context, node string) (*NodeStatus, error) { var result *NodeStatus err := cc.executeWithFailover(ctx, func(client *Client) error { status, err := client.GetNodeStatus(ctx, node) if err != nil { return err } result = status return nil }) return result, err } func (cc *ClusterClient) GetNodeRRDData(ctx context.Context, node, timeframe, cf string, ds []string) ([]NodeRRDPoint, error) { var result []NodeRRDPoint err := cc.executeWithFailover(ctx, func(client *Client) error { points, err := client.GetNodeRRDData(ctx, node, timeframe, cf, ds) if err != nil { return err } result = points return nil }) return result, err } func (cc *ClusterClient) GetLXCRRDData(ctx context.Context, node string, vmid int, timeframe, cf string, ds []string) ([]GuestRRDPoint, error) { var result []GuestRRDPoint err := cc.executeWithFailover(ctx, func(client *Client) error { points, err := client.GetLXCRRDData(ctx, node, vmid, timeframe, cf, ds) if err != nil { return err } result = points return nil }) return result, err } func (cc *ClusterClient) GetVMRRDData(ctx context.Context, node string, vmid int, timeframe, cf string, ds []string) ([]GuestRRDPoint, error) { var result []GuestRRDPoint err := cc.executeWithFailover(ctx, func(client *Client) error { points, err := client.GetVMRRDData(ctx, node, vmid, timeframe, cf, ds) if err != nil { return err } result = points return nil }) return result, err } func (cc *ClusterClient) GetVMs(ctx context.Context, node string) ([]VM, error) { var result []VM err := cc.executeWithFailover(ctx, func(client *Client) error { vms, err := client.GetVMs(ctx, node) if err != nil { return err } result = vms return nil }) // Don't return error for transient connectivity issues - preserve UI state if err != nil && strings.Contains(err.Error(), "no healthy nodes available") { log.Debug(). Str("cluster", cc.name). Str("node", node). Err(err). Msg("No healthy nodes for GetVMs - returning empty list to preserve UI state") return []VM{}, nil } return result, err } func (cc *ClusterClient) GetContainers(ctx context.Context, node string) ([]Container, error) { var result []Container err := cc.executeWithFailover(ctx, func(client *Client) error { containers, err := client.GetContainers(ctx, node) if err != nil { return err } result = containers return nil }) // Don't return error for transient connectivity issues - preserve UI state if err != nil && strings.Contains(err.Error(), "no healthy nodes available") { log.Debug(). Str("cluster", cc.name). Str("node", node). Err(err). Msg("No healthy nodes for GetContainers - returning empty list to preserve UI state") return []Container{}, nil } return result, err } func (cc *ClusterClient) GetStorage(ctx context.Context, node string) ([]Storage, error) { var result []Storage err := cc.executeWithFailover(ctx, func(client *Client) error { storage, err := client.GetStorage(ctx, node) if err != nil { return err } result = storage return nil }) return result, err } func (cc *ClusterClient) GetAllStorage(ctx context.Context) ([]Storage, error) { var result []Storage err := cc.executeWithFailover(ctx, func(client *Client) error { storage, err := client.GetAllStorage(ctx) if err != nil { return err } result = storage return nil }) return result, err } func (cc *ClusterClient) GetBackupTasks(ctx context.Context) ([]Task, error) { var result []Task err := cc.executeWithFailover(ctx, func(client *Client) error { tasks, err := client.GetBackupTasks(ctx) if err != nil { return err } result = tasks return nil }) return result, err } func (cc *ClusterClient) GetReplicationStatus(ctx context.Context) ([]ReplicationJob, error) { var result []ReplicationJob err := cc.executeWithFailover(ctx, func(client *Client) error { jobs, err := client.GetReplicationStatus(ctx) if err != nil { return err } result = jobs return nil }) return result, err } func (cc *ClusterClient) GetStorageContent(ctx context.Context, node, storage string) ([]StorageContent, error) { var result []StorageContent err := cc.executeWithFailover(ctx, func(client *Client) error { content, err := client.GetStorageContent(ctx, node, storage) if err != nil { return err } result = content return nil }) return result, err } // GetCephStatus returns Ceph cluster status information with failover support. func (cc *ClusterClient) GetCephStatus(ctx context.Context) (*CephStatus, error) { var result *CephStatus err := cc.executeWithFailover(ctx, func(client *Client) error { status, err := client.GetCephStatus(ctx) if err != nil { return err } result = status return nil }) return result, err } // GetCephDF returns Ceph capacity information with failover support. func (cc *ClusterClient) GetCephDF(ctx context.Context) (*CephDF, error) { var result *CephDF err := cc.executeWithFailover(ctx, func(client *Client) error { df, err := client.GetCephDF(ctx) if err != nil { return err } result = df return nil }) return result, err } func (cc *ClusterClient) GetVMSnapshots(ctx context.Context, node string, vmid int) ([]Snapshot, error) { var result []Snapshot err := cc.executeWithFailover(ctx, func(client *Client) error { snapshots, err := client.GetVMSnapshots(ctx, node, vmid) if err != nil { return err } result = snapshots return nil }) return result, err } func (cc *ClusterClient) GetContainerSnapshots(ctx context.Context, node string, vmid int) ([]Snapshot, error) { var result []Snapshot err := cc.executeWithFailover(ctx, func(client *Client) error { snapshots, err := client.GetContainerSnapshots(ctx, node, vmid) if err != nil { return err } result = snapshots return nil }) return result, err } func (cc *ClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*VMStatus, error) { var result *VMStatus err := cc.executeWithFailover(ctx, func(client *Client) error { status, err := client.GetVMStatus(ctx, node, vmid) if err != nil { return err } result = status return nil }) return result, err } func (cc *ClusterClient) GetVMConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error) { var result map[string]interface{} err := cc.executeWithFailover(ctx, func(client *Client) error { config, err := client.GetVMConfig(ctx, node, vmid) if err != nil { return err } result = config return nil }) return result, err } func (cc *ClusterClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) { var result map[string]interface{} err := cc.executeWithFailover(ctx, func(client *Client) error { info, err := client.GetVMAgentInfo(ctx, node, vmid) if err != nil { return err } result = info return nil }) return result, err } // GetVMAgentVersion returns the guest agent version for the VM. func (cc *ClusterClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) { var version string err := cc.executeWithFailover(ctx, func(client *Client) error { v, err := client.GetVMAgentVersion(ctx, node, vmid) if err != nil { return err } version = v return nil }) return version, err } // GetVMFSInfo returns filesystem information from QEMU guest agent func (cc *ClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]VMFileSystem, error) { var result []VMFileSystem err := cc.executeWithFailover(ctx, func(client *Client) error { info, err := client.GetVMFSInfo(ctx, node, vmid) if err != nil { return err } result = info return nil }) return result, err } // GetVMNetworkInterfaces returns guest network interfaces from the QEMU agent func (cc *ClusterClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]VMNetworkInterface, error) { var result []VMNetworkInterface err := cc.executeWithFailover(ctx, func(client *Client) error { interfaces, err := client.GetVMNetworkInterfaces(ctx, node, vmid) if err != nil { return err } result = interfaces return nil }) return result, err } // GetVMMemAvailableFromAgent reads /proc/meminfo via the QEMU guest agent to get MemAvailable. func (cc *ClusterClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) { var result uint64 err := cc.executeWithFailover(ctx, func(client *Client) error { available, err := client.GetVMMemAvailableFromAgent(ctx, node, vmid) if err != nil { return err } result = available return nil }) return result, err } // GetClusterResources returns all resources (VMs, containers) across the cluster in a single call func (cc *ClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]ClusterResource, error) { var result []ClusterResource err := cc.executeWithFailover(ctx, func(client *Client) error { resources, err := client.GetClusterResources(ctx, resourceType) if err != nil { return err } result = resources return nil }) return result, err } // GetContainerStatus returns the status of a specific container func (cc *ClusterClient) GetContainerStatus(ctx context.Context, node string, vmid int) (*Container, error) { var result *Container err := cc.executeWithFailover(ctx, func(client *Client) error { status, err := client.GetContainerStatus(ctx, node, vmid) if err != nil { return err } result = status return nil }) return result, err } // GetContainerConfig returns the configuration of a specific container func (cc *ClusterClient) GetContainerConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error) { var result map[string]interface{} err := cc.executeWithFailover(ctx, func(client *Client) error { config, err := client.GetContainerConfig(ctx, node, vmid) if err != nil { return err } result = config return nil }) return result, err } // GetContainerInterfaces returns interface details for a container func (cc *ClusterClient) GetContainerInterfaces(ctx context.Context, node string, vmid int) ([]ContainerInterface, error) { var result []ContainerInterface err := cc.executeWithFailover(ctx, func(client *Client) error { interfaces, err := client.GetContainerInterfaces(ctx, node, vmid) if err != nil { return err } result = interfaces return nil }) return result, err } // IsClusterMember checks if this node is part of a cluster func (cc *ClusterClient) IsClusterMember(ctx context.Context) (bool, error) { var result bool err := cc.executeWithFailover(ctx, func(client *Client) error { isMember, err := client.IsClusterMember(ctx) if err != nil { return err } result = isMember return nil }) return result, err } // GetZFSPoolStatus returns ZFS pool status for a node func (cc *ClusterClient) GetZFSPoolStatus(ctx context.Context, node string) ([]ZFSPoolStatus, error) { var result []ZFSPoolStatus err := cc.executeWithFailover(ctx, func(client *Client) error { pools, err := client.GetZFSPoolStatus(ctx, node) if err != nil { return err } result = pools return nil }) return result, err } // GetZFSPoolsWithDetails returns ZFS pools with full details for a node func (cc *ClusterClient) GetZFSPoolsWithDetails(ctx context.Context, node string) ([]ZFSPoolInfo, error) { var result []ZFSPoolInfo err := cc.executeWithFailover(ctx, func(client *Client) error { pools, err := client.GetZFSPoolsWithDetails(ctx, node) if err != nil { return err } result = pools return nil }) return result, err } // Helper to check if error is auth-related func (cc *ClusterClient) GetDisks(ctx context.Context, node string) ([]Disk, error) { var result []Disk err := cc.executeWithFailover(ctx, func(client *Client) error { disks, err := client.GetDisks(ctx, node) if err != nil { return err } result = disks return nil }) // Don't return error for transient connectivity issues if err != nil && strings.Contains(err.Error(), "no healthy nodes available") { log.Debug(). Str("cluster", cc.name). Str("node", node). Err(err). Msg("No healthy nodes for GetDisks - returning empty list") return []Disk{}, nil } return result, err } // GetNodePendingUpdates returns pending apt updates for a node with failover support func (cc *ClusterClient) GetNodePendingUpdates(ctx context.Context, node string) ([]AptPackage, error) { var result []AptPackage err := cc.executeWithFailover(ctx, func(client *Client) error { pkgs, err := client.GetNodePendingUpdates(ctx, node) if err != nil { return err } result = pkgs return nil }) // Don't return error for transient connectivity issues or permission issues if err != nil && (strings.Contains(err.Error(), "no healthy nodes available") || strings.Contains(err.Error(), "403") || strings.Contains(err.Error(), "permission")) { log.Debug(). Str("cluster", cc.name). Str("node", node). Err(err). Msg("Could not get pending updates - returning empty list") return []AptPackage{}, nil } return result, err } // GetClusterStatus returns the cluster status including all nodes with failover support. func (cc *ClusterClient) GetClusterStatus(ctx context.Context) ([]ClusterStatus, error) { var result []ClusterStatus err := cc.executeWithFailover(ctx, func(client *Client) error { status, err := client.GetClusterStatus(ctx) if err != nil { return err } result = status return nil }) return result, err } // IsQuorate checks if the cluster has quorum by querying the Proxmox cluster status. // Returns true if the cluster is quorate (has enough votes for consensus), false otherwise. // This is the authoritative check for cluster health - a cluster with quorum is healthy // even if some nodes are intentionally offline (e.g., backup nodes not running). func (cc *ClusterClient) IsQuorate(ctx context.Context) (bool, error) { status, err := cc.GetClusterStatus(ctx) if err != nil { return false, err } // Look for the cluster entry which has the quorate field for _, s := range status { if s.Type == "cluster" { return s.Quorate == 1, nil } } // If no cluster entry found, this might be a standalone node - consider it healthy return true, nil } // GetClusterOptions fetches datacenter options (e.g. tag colour map) via the first healthy node. func (cc *ClusterClient) GetClusterOptions(ctx context.Context) (*ClusterOptions, error) { var result *ClusterOptions err := cc.executeWithFailover(ctx, func(client *Client) error { opts, err := client.GetClusterOptions(ctx) if err != nil { return err } result = opts return nil }) return result, err } // isAuthError checks if an error is an authentication error func isAuthError(err error) bool { if err == nil { return false } errStr := err.Error() return strings.Contains(errStr, "authentication") || strings.Contains(errStr, "401") || strings.Contains(errStr, "403") }