package proxmox

import (
	"context"
	"fmt"
	"math/rand"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/rs/zerolog/log"
)

// ClusterClient wraps multiple Proxmox clients for cluster-aware operations
type ClusterClient struct {
	mu                   sync.RWMutex
	name                 string
	clients              map[string]*Client   // Key is node name
	endpoints            []string             // All available endpoints
	endpointFingerprints map[string]string    // Per-endpoint TLS fingerprints (TOFU)
	nodeHealth           map[string]bool      // Track node health
	lastHealthCheck      map[string]time.Time // Track last health check time
	lastError            map[string]string    // Track last error per endpoint
	config               ClientConfig         // Base config (auth info)
	rateLimitUntil       map[string]time.Time // Cooldown window for rate-limited endpoints
}

const (
	rateLimitBaseDelay   = 150 * time.Millisecond
	rateLimitMaxJitter   = 200 * time.Millisecond
	rateLimitRetryBudget = 2
)
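
// statusCodePattern extracts a three-digit HTTP status code from error strings
// such as "api error 429" or "status 503" (case-insensitive).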
var statusCodePattern = regexp.MustCompile(`(?i)(?:api error|status)\s+(\d{3})`)
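
// transientRateLimitStatusCodes lists HTTP status codes treated as transient
// (rate limiting or a temporarily unavailable upstream) and retried with
// backoff instead of marking the endpoint unhealthy.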
var transientRateLimitStatusCodes = map[int]struct{}{
	408: {},
	425: {}, // Too Early
	429: {},
	502: {},
	503: {},
	504: {},
}

// isVMSpecificError reports whether an error string is scoped to a single VM/guest agent
// and should not be treated as a node connectivity failure.
func isVMSpecificError(errStr string) bool {
	if errStr == "" {
		return false
	}
	lower := strings.ToLower(errStr)

	if strings.Contains(lower, "no qemu guest agent") ||
		strings.Contains(lower, "qemu guest agent is not running") ||
		strings.Contains(lower, "guest agent") {
		return true
	}

	// QMP guest agent operations can time out or fail per-VM (e.g. guest-get-fsinfo).
	// These aren't node connectivity issues and should not mark endpoints unhealthy.
	if strings.Contains(lower, "qmp command") {
		return true
	}

	if strings.Contains(lower, "guest-get-") {
		return true
	}

	return false
}

// isEndpointConnectivityError reports whether an error indicates the endpoint
// itself is unreachable (TCP/DNS/TLS failure). Any error that carries an HTTP
// response — even a 500 — proves the endpoint is reachable, so those are NOT
// connectivity errors.
func isEndpointConnectivityError(err error) bool {
	if err == nil {
		return false
	}
	errStr := strings.ToLower(err.Error())

	// If we received an HTTP response from Proxmox (any status code),
	// the endpoint is reachable.
	if strings.Contains(errStr, "api error") {
		return false
	}

	// TCP/DNS connectivity failures
	if strings.Contains(errStr, "connection refused") ||
		strings.Contains(errStr, "connection reset") ||
		strings.Contains(errStr, "no such host") ||
		strings.Contains(errStr, "network is unreachable") ||
		strings.Contains(errStr, "no route to host") ||
		strings.Contains(errStr, "i/o timeout") ||
		strings.Contains(errStr, "dial tcp") ||
		strings.Contains(errStr, "dial:") {
		return true
	}

	// TLS failures
	if strings.Contains(errStr, "tls handshake") ||
		strings.Contains(errStr, "tls:") ||
		strings.Contains(errStr, "certificate") ||
		strings.Contains(errStr, "fingerprint mismatch") {
		return true
	}

	return false
}

// sanitizeEndpointError transforms raw Go errors into user-friendly messages
// for display in the UI. The original error is preserved in logs.
func sanitizeEndpointError(errMsg string) string {
	if errMsg == "" {
		return errMsg
	}

	lower := strings.ToLower(errMsg)

	// Context deadline exceeded - usually means slow API response
	if strings.Contains(lower, "context deadline exceeded") {
		// Check for specific causes
		if strings.Contains(lower, "/storage") {
			return "Request timed out - storage API slow (check for unreachable PBS/NFS/Ceph backends)"
		}
		if strings.Contains(lower, "pbs-") || strings.Contains(lower, ":8007") {
			return "Request timed out - PBS storage backend unreachable"
		}
		return "Request timed out - Proxmox API may be slow or waiting on unreachable backend services"
	}

	// Client timeout - similar to context deadline
	if strings.Contains(lower, "client.timeout exceeded") {
		return "Connection timed out - Proxmox API not responding in time"
	}

	// Connection refused
	if strings.Contains(lower, "connection refused") {
		return "Connection refused - Proxmox API not running or firewall blocking"
	}

	// No route to host
	if strings.Contains(lower, "no route to host") {
		return "Network unreachable - check network connectivity to Proxmox host"
	}

	// TLS/certificate errors
	if strings.Contains(lower, "certificate") || strings.Contains(lower, "x509") {
		return "TLS certificate error - check SSL settings or add fingerprint"
	}

	// Auth errors - keep these specific
	if strings.Contains(lower, "authentication") || strings.Contains(lower, "401") || strings.Contains(lower, "403") {
		return "Authentication failed - check API token or credentials"
	}

	// PBS-specific errors
	if strings.Contains(lower, "can't connect to") && strings.Contains(lower, ":8007") {
		return "PBS storage unreachable - check Proxmox Backup Server connectivity"
	}

	// Return original if no transformation applies
	return errMsg
}

// NewClusterClient creates a new cluster-aware client.
// endpointFingerprints is an optional map of endpoint URL -> TLS fingerprint for per-node certificate verification.
// This enables TOFU (Trust On First Use) for clusters with unique self-signed certs per node.
func NewClusterClient(name string, config ClientConfig, endpoints []string, endpointFingerprints map[string]string) *ClusterClient {
	if endpointFingerprints == nil {
		endpointFingerprints = make(map[string]string)
	}
	cc := &ClusterClient{
		name:                 name,
		clients:              make(map[string]*Client),
		endpoints:            endpoints,
		endpointFingerprints: endpointFingerprints,
		nodeHealth:           make(map[string]bool),
		lastHealthCheck:      make(map[string]time.Time),
		lastError:            make(map[string]string),
		config:               config,
		rateLimitUntil:       make(map[string]time.Time),
	}

	// Initialize all endpoints as unknown (will be tested on first use)
	// Start optimistically - assume healthy until proven otherwise
	// This allows operations to be attempted even if initial health check fails
	for _, endpoint := range endpoints {
		cc.nodeHealth[endpoint] = true // Start optimistic, will be marked unhealthy if operations fail
	}

	// Do a quick parallel health check on initialization (synchronous to avoid race)
	// This will mark unhealthy nodes but won't prevent trying them later
	cc.initialHealthCheck()

	return cc
}

// getEndpointFingerprint returns the TLS fingerprint to use for a specific endpoint.
// It prefers the per-endpoint fingerprint (TOFU) over the base config fingerprint.
func (cc *ClusterClient) getEndpointFingerprint(endpoint string) string {
	if fp, ok := cc.endpointFingerprints[endpoint]; ok && fp != "" {
		return fp
	}
	return cc.config.Fingerprint
}

// initialHealthCheck performs a quick parallel health check on all endpoints
func (cc *ClusterClient) initialHealthCheck() {
	// Skip initial health check if there's only one endpoint
	// For single-endpoint clusters (using main host for routing), assume healthy
	if len(cc.endpoints) == 1 {
		log.Info().
			Str("cluster", cc.name).
			Str("endpoint", cc.endpoints[0]).
			Msg("Single endpoint cluster - skipping initial health check")
		return
	}

	// For multi-node clusters, do a very quick check but don't mark unhealthy immediately
	// This prevents nodes from being marked unhealthy due to temporary startup conditions

	var wg sync.WaitGroup
	for _, endpoint := range cc.endpoints {
		wg.Add(1)
		go func(ep string) {
			defer wg.Done()

			// Try a quick connection test with slightly longer timeout for initial check
			cfg := cc.config
			cfg.Host = ep
			cfg.Fingerprint = cc.getEndpointFingerprint(ep)
			cfg.Timeout = 5 * time.Second

			testClient, err := NewClient(cfg)
			if err != nil {
				cc.mu.Lock()
				cc.nodeHealth[ep] = false
				cc.lastError[ep] = sanitizeEndpointError(err.Error())
				cc.lastHealthCheck[ep] = time.Now()
				cc.mu.Unlock()
				log.Info().
					Str("cluster", cc.name).
					Str("endpoint", ep).
					Err(err).
					Msg("Cluster endpoint marked unhealthy on initialization")
				return
			}

			// Quick test with slightly longer timeout for initial check
			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
			_, err = testClient.GetNodes(ctx)
			cancel()

			cc.mu.Lock()

			// Check if error is VM-specific (shouldn't affect health)
			vmSpecificErr := err != nil && isVMSpecificError(err.Error())

			if err == nil || vmSpecificErr {
				// Node is healthy - create a proper client with full timeout for actual use
				fullCfg := cc.config
				fullCfg.Host = ep
				fullCfg.Fingerprint = cc.getEndpointFingerprint(ep)
				fullClient, clientErr := NewClient(fullCfg)
				if clientErr != nil {
					cc.nodeHealth[ep] = false
					cc.lastError[ep] = sanitizeEndpointError(clientErr.Error())
					cc.lastHealthCheck[ep] = time.Now()
					log.Warn().
						Str("cluster", cc.name).
						Str("endpoint", ep).
						Err(clientErr).
						Msg("Failed to create full client after successful health check")
				} else {
					cc.nodeHealth[ep] = true
					delete(cc.lastError, ep)
					cc.lastHealthCheck[ep] = time.Now()
					cc.clients[ep] = fullClient // Store the full client, not test client
					if vmSpecificErr {
						log.Debug().
							Str("cluster", cc.name).
							Str("endpoint", ep).
							Msg("Cluster endpoint healthy despite VM-specific errors")
					} else {
						log.Info().
							Str("cluster", cc.name).
							Str("endpoint", ep).
							Msg("Cluster endpoint passed initial health check")
					}
				}
			} else {
				// Real connectivity issue
				cc.nodeHealth[ep] = false
				cc.lastError[ep] = sanitizeEndpointError(err.Error())
				cc.lastHealthCheck[ep] = time.Now()
				log.Info().
					Str("cluster", cc.name).
					Str("endpoint", ep).
					Err(err).
					Msg("Cluster endpoint failed initial health check")
			}
			cc.mu.Unlock()
		}(endpoint)
	}

	// Wait for all checks to complete
	wg.Wait()

	log.Info().
		Str("cluster", cc.name).
		Int("total", len(cc.endpoints)).
		Msg("Initial cluster health check completed")
}

// getHealthyClient returns a healthy client, selecting randomly among the
// available endpoints to spread load
func (cc *ClusterClient) getHealthyClient(ctx context.Context) (*Client, error) {
	cc.mu.Lock()
	defer cc.mu.Unlock()

	// Get list of healthy endpoints
	var healthyEndpoints []string
	var coolingEndpoints []string
	now := time.Now()
	for endpoint, healthy := range cc.nodeHealth {
		if healthy {
			if cooldown, exists := cc.rateLimitUntil[endpoint]; exists {
				if now.Before(cooldown) {
					coolingEndpoints = append(coolingEndpoints, endpoint)
					continue
				}
				delete(cc.rateLimitUntil, endpoint)
			}
			healthyEndpoints = append(healthyEndpoints, endpoint)
		}
	}

	if len(healthyEndpoints) == 0 && len(coolingEndpoints) > 0 {
		// Nothing is immediately available, fall back to endpoints that are in cooldown
		healthyEndpoints = append(healthyEndpoints, coolingEndpoints...)
	}

	// Count unhealthy endpoints for logging and recovery decisions
	unhealthyCount := 0
	for _, healthy := range cc.nodeHealth {
		if !healthy {
			unhealthyCount++
		}
	}

	// Log at warn level if no healthy endpoints to aid troubleshooting
	if len(healthyEndpoints) == 0 && len(coolingEndpoints) == 0 {
		log.Warn().
			Str("cluster", cc.name).
			Int("healthy", len(healthyEndpoints)).
			Int("total", len(cc.nodeHealth)).
			Interface("nodeHealth", cc.nodeHealth).
			Msg("No healthy endpoints available - attempting recovery")
	} else {
		log.Debug().
			Str("cluster", cc.name).
			Int("healthy", len(healthyEndpoints)).
			Int("cooling", len(coolingEndpoints)).
			Int("total", len(cc.nodeHealth)).
			Interface("nodeHealth", cc.nodeHealth).
			Msg("Checking for healthy endpoints")
	}

	// Trigger recovery if we have any unhealthy endpoints
	// This ensures degraded clusters recover individual nodes over time,
	// not just when all nodes are down
	if unhealthyCount > 0 {
		// Use an anonymous function to ensure the lock is re-acquired even if
		// recoverUnhealthyNodes panics, preventing double-unlock from defer
		func() {
			cc.mu.Unlock()
			defer cc.mu.Lock()
			cc.recoverUnhealthyNodes(ctx)
		}()

		// Refresh the healthy/cooling endpoints lists after recovery attempt
		// since cluster state may have changed while lock was released
		healthyEndpoints = nil
		coolingEndpoints = nil
		now = time.Now() // Refresh time for accurate cooldown checks
		for endpoint, healthy := range cc.nodeHealth {
			if healthy {
				if cooldown, exists := cc.rateLimitUntil[endpoint]; exists && now.Before(cooldown) {
					coolingEndpoints = append(coolingEndpoints, endpoint)
					continue
				}
				healthyEndpoints = append(healthyEndpoints, endpoint)
			}
		}

		// Re-apply cooldown fallback if no healthy endpoints but some cooling
		if len(healthyEndpoints) == 0 && len(coolingEndpoints) > 0 {
			healthyEndpoints = append(healthyEndpoints, coolingEndpoints...)
		}
	}

	if len(healthyEndpoints) == 0 {
		// If still no healthy endpoints and we only have one endpoint,
		// try to use it anyway (could be temporarily unreachable)
		if len(cc.endpoints) == 1 {
			log.Warn().
				Str("cluster", cc.name).
				Str("endpoint", cc.endpoints[0]).
				Msg("Single endpoint appears unhealthy but attempting to use it anyway")
			healthyEndpoints = cc.endpoints
			// Mark it as healthy optimistically
			cc.nodeHealth[cc.endpoints[0]] = true
		} else {
			// Provide detailed error with endpoint status
			unhealthyList := make([]string, 0, len(cc.endpoints))
			for _, ep := range cc.endpoints {
				if !cc.nodeHealth[ep] {
					unhealthyList = append(unhealthyList, ep)
				}
			}
			log.Error().
				Str("cluster", cc.name).
				Strs("unhealthyEndpoints", unhealthyList).
				Int("totalEndpoints", len(cc.endpoints)).
				Msg("All cluster endpoints are unhealthy - verify network connectivity and API accessibility from Pulse server")
			return nil, fmt.Errorf("no healthy nodes available in cluster %s (all %d endpoints unreachable: %v)", cc.name, len(cc.endpoints), unhealthyList)
		}
	}

	// Use random selection for better load distribution
	selectedEndpoint := healthyEndpoints[rand.Intn(len(healthyEndpoints))]

	// Get or create client for this endpoint
	client, exists := cc.clients[selectedEndpoint]
	if !exists {
		// Create new client with shorter timeout for initial test
		cfg := cc.config
		cfg.Host = selectedEndpoint
		cfg.Fingerprint = cc.getEndpointFingerprint(selectedEndpoint)

		// First try with a short timeout to quickly detect offline nodes
		testCfg := cfg
		testCfg.Timeout = 3 * time.Second

		testClient, err := NewClient(testCfg)
		if err != nil {
			// Mark as unhealthy
			cc.nodeHealth[selectedEndpoint] = false
			log.Debug().
				Str("cluster", cc.name).
				Str("endpoint", selectedEndpoint).
				Err(err).
				Msg("Failed to create client for cluster endpoint")
			return nil, fmt.Errorf("failed to create client for %s: %w", selectedEndpoint, err)
		}

		// Connectivity test - 5 seconds to allow for TLS handshake (~3s typical)
		testCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		testNodes, testErr := testClient.GetNodes(testCtx)
		cancel()

		if testErr != nil {
			// Check if this is a transient rate limit error that shouldn't mark the node unhealthy
			if isRateLimited, _ := isTransientRateLimitError(testErr); isRateLimited {
				log.Debug().
					Str("cluster", cc.name).
					Str("endpoint", selectedEndpoint).
					Err(testErr).
					Msg("Ignoring transient rate limit error during connectivity test")
				// Continue with client creation since the node is accessible, just rate limited
			} else {
				// Check if this is a VM-specific error that shouldn't mark the node unhealthy
				testErrStr := testErr.Error()
				if strings.Contains(testErrStr, "No QEMU guest agent") ||
					strings.Contains(testErrStr, "QEMU guest agent is not running") ||
					strings.Contains(testErrStr, "guest agent") {
					// This is a VM-specific issue, not a connectivity problem
					// The node is actually healthy, so don't mark it unhealthy
					log.Debug().
						Str("cluster", cc.name).
						Str("endpoint", selectedEndpoint).
						Err(testErr).
						Msg("Ignoring VM-specific error during connectivity test")
					// Continue with client creation since the node is actually accessible
				} else {
					// Mark as unhealthy for real connectivity issues
					cc.nodeHealth[selectedEndpoint] = false
					log.Warn().
						Str("cluster", cc.name).
						Str("endpoint", selectedEndpoint).
						Err(testErr).
						Msg("Failed to connect to Proxmox endpoint; endpoint removed from rotation until next refresh")
					return nil, fmt.Errorf("endpoint %s failed connectivity test: %w", selectedEndpoint, testErr)
				}
			}
		}

		log.Debug().
			Str("cluster", cc.name).
			Str("endpoint", selectedEndpoint).
			Int("nodes", len(testNodes)).
			Msg("Cluster endpoint passed connectivity test")

		// Clear any stale error from previous failures now that connectivity succeeded
		delete(cc.lastError, selectedEndpoint)

		// Create the actual client with full timeout
		newClient, err := NewClient(cfg)
		if err != nil {
			// This shouldn't happen since we just tested it
			cc.nodeHealth[selectedEndpoint] = false
			return nil, fmt.Errorf("failed to create client for %s: %w", selectedEndpoint, err)
		}

		cc.clients[selectedEndpoint] = newClient
		client = newClient
	}

	return client, nil
}

// markUnhealthyWithError marks an endpoint as unhealthy and captures the error
func (cc *ClusterClient) markUnhealthyWithError(endpoint string, errMsg string) {
	cc.mu.Lock()
	defer cc.mu.Unlock()

	if cc.nodeHealth[endpoint] {
		log.Warn().
			Str("cluster", cc.name).
			Str("endpoint", endpoint).
			Str("error", errMsg).
			Msg("Marking cluster node as unhealthy")
		cc.nodeHealth[endpoint] = false
	}
	if errMsg != "" {
		cc.lastError[endpoint] = sanitizeEndpointError(errMsg)
	}
	cc.lastHealthCheck[endpoint] = time.Now()
}

// clearEndpointError removes any cached error for an endpoint after successful operations
// and marks the endpoint as healthy since the operation succeeded
func (cc *ClusterClient) clearEndpointError(endpoint string) {
	cc.mu.Lock()
	defer cc.mu.Unlock()
	delete(cc.lastError, endpoint)
	// Mark endpoint healthy since operation succeeded - this ensures degraded
	// clusters recover once endpoints start responding again
	cc.nodeHealth[endpoint] = true
}

// recoverUnhealthyNodes attempts to recover unhealthy nodes
func (cc *ClusterClient) recoverUnhealthyNodes(ctx context.Context) {
	cc.mu.RLock()
	unhealthyEndpoints := make([]string, 0)
	throttledEndpoints := make([]string, 0)
	now := time.Now()
	for endpoint, healthy := range cc.nodeHealth {
		if !healthy {
			// Skip if we checked this endpoint recently (within 10 seconds)
			// Balance between recovery speed and avoiding excessive checks
			if lastCheck, exists := cc.lastHealthCheck[endpoint]; exists {
				if now.Sub(lastCheck) < 10*time.Second {
					throttledEndpoints = append(throttledEndpoints, endpoint)
					continue
				}
			}
			unhealthyEndpoints = append(unhealthyEndpoints, endpoint)
		}
	}
	cc.mu.RUnlock()

	if len(unhealthyEndpoints) == 0 {
		if len(throttledEndpoints) > 0 {
			log.Debug().
				Str("cluster", cc.name).
				Strs("throttledEndpoints", throttledEndpoints).
				Msg("Skipping recovery check - endpoints checked recently")
		}
		return
	}

	log.Info().
		Str("cluster", cc.name).
		Strs("unhealthyEndpoints", unhealthyEndpoints).
		Int("count", len(unhealthyEndpoints)).
		Msg("Attempting to recover unhealthy cluster endpoints")

	// Test all unhealthy endpoints concurrently with a short timeout
	var wg sync.WaitGroup
	recoveredEndpoints := make(chan string, len(unhealthyEndpoints))

	for _, endpoint := range unhealthyEndpoints {
		wg.Add(1)
		go func(ep string) {
			defer wg.Done()

			// Update last check time
			cc.mu.Lock()
			cc.lastHealthCheck[ep] = now
			cc.mu.Unlock()

			// Try to create a client and test connection
			// Note: 5-second timeout needed because TLS handshake to Proxmox API
			// typically takes ~3 seconds on local networks
			cfg := cc.config
			cfg.Host = ep
			cfg.Fingerprint = cc.getEndpointFingerprint(ep)
			cfg.Timeout = 5 * time.Second

			testClient, err := NewClient(cfg)
			if err != nil {
				log.Debug().
					Str("cluster", cc.name).
					Str("endpoint", ep).
					Err(err).
					Msg("Failed to create client during recovery attempt")
				return
			}

			// Try a simple API call
			testCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
			_, err = testClient.GetNodes(testCtx)
			cancel()

			// Check if error is VM-specific (shouldn't prevent recovery)
			vmSpecificErr := err != nil && isVMSpecificError(err.Error())

			if err == nil || vmSpecificErr {
				recoveredEndpoints <- ep

				// Store the client with original timeout
				cfg.Timeout = cc.config.Timeout
				fullClient, clientErr := NewClient(cfg)

				cc.mu.Lock()
				cc.nodeHealth[ep] = true
				delete(cc.lastError, ep)
				cc.lastHealthCheck[ep] = time.Now()
				if clientErr == nil {
					// Only cache the client when creation succeeded; storing a
					// nil client here would panic the next time it is used.
					cc.clients[ep] = fullClient
				}
				cc.mu.Unlock()

				if vmSpecificErr {
					log.Info().
						Str("cluster", cc.name).
						Str("endpoint", ep).
						Msg("Recovered unhealthy cluster node (ignoring VM-specific errors)")
				} else {
					log.Info().
						Str("cluster", cc.name).
						Str("endpoint", ep).
						Msg("Recovered unhealthy cluster node")
				}
			} else {
				log.Debug().
					Str("cluster", cc.name).
					Str("endpoint", ep).
					Err(err).
					Msg("Recovery attempt failed - endpoint still unhealthy")
			}
		}(endpoint)
	}

	// Wait for all recovery attempts to complete
	go func() {
		wg.Wait()
		close(recoveredEndpoints)
	}()

	// Count recovered endpoints
	recoveredCount := 0
	for range recoveredEndpoints {
		recoveredCount++
	}

	// Log recovery summary
	if recoveredCount > 0 {
		log.Info().
			Str("cluster", cc.name).
			Int("recovered", recoveredCount).
			Int("attempted", len(unhealthyEndpoints)).
			Msg("Cluster endpoint recovery completed")
	} else if len(unhealthyEndpoints) > 0 {
		log.Warn().
			Str("cluster", cc.name).
			Int("attempted", len(unhealthyEndpoints)).
			Strs("failedEndpoints", unhealthyEndpoints).
			Msg("No endpoints recovered - cluster may be unreachable from Pulse server")
	}
}

// executeWithFailover executes a function with automatic failover
func (cc *ClusterClient) executeWithFailover(ctx context.Context, fn func(*Client) error) error {
	baseRetries := len(cc.endpoints)
	maxRetries := baseRetries + rateLimitRetryBudget
	var lastErr error

	log.Debug().
		Str("cluster", cc.name).
		Int("maxRetries", maxRetries).
		Msg("Starting executeWithFailover")

	for i := 0; i < maxRetries; i++ {
		client, err := cc.getHealthyClient(ctx)
		if err != nil {
			log.Debug().
				Str("cluster", cc.name).
				Err(err).
				Int("attempt", i+1).
				Msg("Failed to get healthy client")
			return err
		}

		// Get the endpoint for this client
		var clientEndpoint string
		cc.mu.RLock()
		for endpoint, c := range cc.clients {
			if c == client {
				clientEndpoint = endpoint
				break
			}
		}
		cc.mu.RUnlock()

		// Execute the function
		err = fn(client)
		if err == nil {
			// Clear any stale error for this endpoint on success
			cc.clearEndpointError(clientEndpoint)
			return nil
		}
		lastErr = err

		// Rate limit - retry with backoff (check before connectivity classification)
		if isRateLimited, statusCode := isTransientRateLimitError(err); isRateLimited {
			backoff := calculateRateLimitBackoff(i)
			cc.applyRateLimitCooldown(clientEndpoint, backoff)

			event := log.Warn().
				Str("cluster", cc.name).
				Str("endpoint", clientEndpoint).
				Err(err).
				Dur("backoff", backoff).
				Int("attempt", i+1)
			if statusCode != 0 {
				event = event.Int("status", statusCode)
			}
			event.Msg("Rate limited by cluster node, retrying with backoff")

			timer := time.NewTimer(backoff)
			select {
			case <-ctx.Done():
				if !timer.Stop() {
					<-timer.C
				}
				return fmt.Errorf("context canceled while backing off after rate limit: %w", ctx.Err())
			case <-timer.C:
			}

			continue
		}

		// Auth errors - return immediately without marking endpoint unhealthy
		if isAuthError(err) {
			return err
		}

		// Only mark endpoint unhealthy for actual connectivity failures (TCP/DNS/TLS).
		// Any HTTP response — even 500 — proves the endpoint is reachable.
		if isEndpointConnectivityError(err) {
			cc.markUnhealthyWithError(clientEndpoint, err.Error())
			log.Warn().
				Str("cluster", cc.name).
				Str("endpoint", clientEndpoint).
				Err(err).
				Int("attempt", i+1).
				Msg("Connectivity failure on cluster node, trying next")
			continue
		}

		// Endpoint is reachable but this specific request failed (API error, permission
		// issue, VM-specific error, etc.). Return without marking endpoint unhealthy.
		log.Debug().
			Str("cluster", cc.name).
			Str("endpoint", clientEndpoint).
			Err(err).
			Msg("Request-level error, endpoint reachable - not marking unhealthy")
		return err
	}

	if lastErr != nil {
		return fmt.Errorf("all cluster nodes failed for %s: %w", cc.name, lastErr)
	}

	return fmt.Errorf("all cluster nodes failed for %s", cc.name)
}
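
// applyRateLimitCooldown records a cooldown deadline for an endpoint after a
// rate-limit response so getHealthyClient deprioritizes it until the backoff
// window has passed.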
func (cc *ClusterClient) applyRateLimitCooldown(endpoint string, backoff time.Duration) {
	if endpoint == "" {
		return
	}

	cc.mu.Lock()
	defer cc.mu.Unlock()
	if cc.rateLimitUntil == nil {
		cc.rateLimitUntil = make(map[string]time.Time)
	}
	cc.rateLimitUntil[endpoint] = time.Now().Add(backoff)
}
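
// calculateRateLimitBackoff returns the retry delay for the given attempt:
// a linear base that grows with the attempt number plus random jitter.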
func calculateRateLimitBackoff(attempt int) time.Duration {
	// Linear backoff with jitter keeps retries gentle while avoiding thundering herd
	base := rateLimitBaseDelay * time.Duration(attempt+1)
	if rateLimitMaxJitter <= 0 {
		return base
	}

	jitter := time.Duration(rand.Int63n(rateLimitMaxJitter.Nanoseconds()+1)) * time.Nanosecond
	return base + jitter
}
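
// isTransientRateLimitError reports whether an error looks like a transient
// rate-limit or temporary upstream failure, and returns the HTTP status code
// when one can be extracted (0 otherwise).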
func isTransientRateLimitError(err error) (bool, int) {
	if err == nil {
		return false, 0
	}

	errStr := err.Error()
	statusCode := extractStatusCode(errStr)
	if statusCode != 0 {
		if _, ok := transientRateLimitStatusCodes[statusCode]; ok {
			return true, statusCode
		}
	}

	lowerErr := strings.ToLower(errStr)
	if strings.Contains(lowerErr, "rate limit") || strings.Contains(lowerErr, "too many requests") {
		if statusCode == 0 {
			statusCode = 429
		}
		return true, statusCode
	}

	return false, statusCode
}
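
// extractStatusCode pulls an HTTP status code out of an error string using
// statusCodePattern, returning 0 when no code is present.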
func extractStatusCode(errStr string) int {
	matches := statusCodePattern.FindStringSubmatch(errStr)
	if len(matches) != 2 {
		return 0
	}

	code, err := strconv.Atoi(matches[1])
	if err != nil {
		return 0
	}

	return code
}
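
// isNotImplementedError reports whether an error string indicates an
// HTTP 501 Not Implemented response.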
func isNotImplementedError(errStr string) bool {
	lower := strings.ToLower(errStr)
	if !strings.Contains(lower, "not implemented") {
		return false
	}

	// Common formatting: "status 501", "error 501", "api error 501"
	if strings.Contains(lower, " 501") || strings.Contains(lower, "status 501") || strings.Contains(lower, "error 501") {
		return true
	}

	// Fallback to explicit HTTP status detection
	if extractStatusCode(errStr) == 501 {
		return true
	}

	return false
}

// GetHealthStatus returns the health status of all nodes
func (cc *ClusterClient) GetHealthStatus() map[string]bool {
	cc.mu.RLock()
	defer cc.mu.RUnlock()

	status := make(map[string]bool)
	for endpoint, healthy := range cc.nodeHealth {
		status[endpoint] = healthy
	}
	return status
}

// EndpointHealth contains health information for a single endpoint
type EndpointHealth struct {
	Healthy   bool
	LastCheck time.Time
	LastError string
}

// GetHealthStatusWithErrors returns detailed health status including error messages
func (cc *ClusterClient) GetHealthStatusWithErrors() map[string]EndpointHealth {
	cc.mu.RLock()
	defer cc.mu.RUnlock()

	status := make(map[string]EndpointHealth)
	for endpoint, healthy := range cc.nodeHealth {
		status[endpoint] = EndpointHealth{
			Healthy:   healthy,
			LastCheck: cc.lastHealthCheck[endpoint],
			LastError: cc.lastError[endpoint],
		}
	}
	return status
}

// Implement all the Client methods with failover
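
// GetNodes returns the nodes visible to the cluster, routed through any
// healthy endpoint via executeWithFailover.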
func (cc *ClusterClient) GetNodes(ctx context.Context) ([]Node, error) {
	log.Debug().
		Str("cluster", cc.name).
		Msg("ClusterClient.GetNodes called")

	var result []Node
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		nodes, err := client.GetNodes(ctx)
		if err != nil {
			return err
		}
		result = nodes
		return nil
	})

	if err != nil {
		log.Warn().
			Str("cluster", cc.name).
			Err(err).
			Msg("ClusterClient.GetNodes failed")
	} else {
		log.Info().
			Str("cluster", cc.name).
			Int("count", len(result)).
			Msg("ClusterClient.GetNodes succeeded")
	}

	return result, err
}

func (cc *ClusterClient) GetNodeStatus(ctx context.Context, node string) (*NodeStatus, error) {
	var result *NodeStatus
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		status, err := client.GetNodeStatus(ctx, node)
		if err != nil {
			return err
		}
		result = status
		return nil
	})
	return result, err
}

func (cc *ClusterClient) GetNodeRRDData(ctx context.Context, node, timeframe, cf string, ds []string) ([]NodeRRDPoint, error) {
	var result []NodeRRDPoint
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		points, err := client.GetNodeRRDData(ctx, node, timeframe, cf, ds)
		if err != nil {
			return err
		}
		result = points
		return nil
	})
	return result, err
}

func (cc *ClusterClient) GetLXCRRDData(ctx context.Context, node string, vmid int, timeframe, cf string, ds []string) ([]GuestRRDPoint, error) {
	var result []GuestRRDPoint
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		points, err := client.GetLXCRRDData(ctx, node, vmid, timeframe, cf, ds)
		if err != nil {
			return err
		}
		result = points
		return nil
	})
	return result, err
}

func (cc *ClusterClient) GetVMRRDData(ctx context.Context, node string, vmid int, timeframe, cf string, ds []string) ([]GuestRRDPoint, error) {
	var result []GuestRRDPoint
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		points, err := client.GetVMRRDData(ctx, node, vmid, timeframe, cf, ds)
		if err != nil {
			return err
		}
		result = points
		return nil
	})
	return result, err
}
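
// GetVMs returns the VMs on a node with failover support. When no healthy
// endpoints are available it returns an empty slice instead of an error so
// the UI state is preserved.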
func (cc *ClusterClient) GetVMs(ctx context.Context, node string) ([]VM, error) {
	var result []VM
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		vms, err := client.GetVMs(ctx, node)
		if err != nil {
			return err
		}
		result = vms
		return nil
	})

	// Don't return error for transient connectivity issues - preserve UI state
	if err != nil && strings.Contains(err.Error(), "no healthy nodes available") {
		log.Debug().
			Str("cluster", cc.name).
			Str("node", node).
			Err(err).
			Msg("No healthy nodes for GetVMs - returning empty list to preserve UI state")
		return []VM{}, nil
	}

	return result, err
}
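
// GetContainers returns the containers on a node with failover support,
// returning an empty slice when no healthy endpoints are available (same
// behaviour as GetVMs).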
func (cc *ClusterClient) GetContainers(ctx context.Context, node string) ([]Container, error) {
	var result []Container
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		containers, err := client.GetContainers(ctx, node)
		if err != nil {
			return err
		}
		result = containers
		return nil
	})

	// Don't return error for transient connectivity issues - preserve UI state
	if err != nil && strings.Contains(err.Error(), "no healthy nodes available") {
		log.Debug().
			Str("cluster", cc.name).
			Str("node", node).
			Err(err).
			Msg("No healthy nodes for GetContainers - returning empty list to preserve UI state")
		return []Container{}, nil
	}

	return result, err
}

func (cc *ClusterClient) GetStorage(ctx context.Context, node string) ([]Storage, error) {
	var result []Storage
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		storage, err := client.GetStorage(ctx, node)
		if err != nil {
			return err
		}
		result = storage
		return nil
	})
	return result, err
}

func (cc *ClusterClient) GetAllStorage(ctx context.Context) ([]Storage, error) {
	var result []Storage
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		storage, err := client.GetAllStorage(ctx)
		if err != nil {
			return err
		}
		result = storage
		return nil
	})
	return result, err
}

func (cc *ClusterClient) GetBackupTasks(ctx context.Context) ([]Task, error) {
	var result []Task
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		tasks, err := client.GetBackupTasks(ctx)
		if err != nil {
			return err
		}
		result = tasks
		return nil
	})
	return result, err
}

func (cc *ClusterClient) GetReplicationStatus(ctx context.Context) ([]ReplicationJob, error) {
	var result []ReplicationJob
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		jobs, err := client.GetReplicationStatus(ctx)
		if err != nil {
			return err
		}
		result = jobs
		return nil
	})
	return result, err
}

func (cc *ClusterClient) GetStorageContent(ctx context.Context, node, storage string) ([]StorageContent, error) {
	var result []StorageContent
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		content, err := client.GetStorageContent(ctx, node, storage)
		if err != nil {
			return err
		}
		result = content
		return nil
	})
	return result, err
}

// GetCephStatus returns Ceph cluster status information with failover support.
func (cc *ClusterClient) GetCephStatus(ctx context.Context) (*CephStatus, error) {
	var result *CephStatus
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		status, err := client.GetCephStatus(ctx)
		if err != nil {
			return err
		}
		result = status
		return nil
	})
	return result, err
}

// GetCephDF returns Ceph capacity information with failover support.
func (cc *ClusterClient) GetCephDF(ctx context.Context) (*CephDF, error) {
	var result *CephDF
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		df, err := client.GetCephDF(ctx)
		if err != nil {
			return err
		}
		result = df
		return nil
	})
	return result, err
}

func (cc *ClusterClient) GetVMSnapshots(ctx context.Context, node string, vmid int) ([]Snapshot, error) {
	var result []Snapshot
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		snapshots, err := client.GetVMSnapshots(ctx, node, vmid)
		if err != nil {
			return err
		}
		result = snapshots
		return nil
	})
	return result, err
}

func (cc *ClusterClient) GetContainerSnapshots(ctx context.Context, node string, vmid int) ([]Snapshot, error) {
	var result []Snapshot
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		snapshots, err := client.GetContainerSnapshots(ctx, node, vmid)
		if err != nil {
			return err
		}
		result = snapshots
		return nil
	})
	return result, err
}

func (cc *ClusterClient) GetVMStatus(ctx context.Context, node string, vmid int) (*VMStatus, error) {
	var result *VMStatus
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		status, err := client.GetVMStatus(ctx, node, vmid)
		if err != nil {
			return err
		}
		result = status
		return nil
	})
	return result, err
}

func (cc *ClusterClient) GetVMConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	var result map[string]interface{}
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		config, err := client.GetVMConfig(ctx, node, vmid)
		if err != nil {
			return err
		}
		result = config
		return nil
	})
	return result, err
}

func (cc *ClusterClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	var result map[string]interface{}
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		info, err := client.GetVMAgentInfo(ctx, node, vmid)
		if err != nil {
			return err
		}
		result = info
		return nil
	})
	return result, err
}

// GetVMAgentVersion returns the guest agent version for the VM.
func (cc *ClusterClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
	var version string
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		v, err := client.GetVMAgentVersion(ctx, node, vmid)
		if err != nil {
			return err
		}
		version = v
		return nil
	})
	return version, err
}

// GetVMFSInfo returns filesystem information from QEMU guest agent
func (cc *ClusterClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]VMFileSystem, error) {
	var result []VMFileSystem
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		info, err := client.GetVMFSInfo(ctx, node, vmid)
		if err != nil {
			return err
		}
		result = info
		return nil
	})
	return result, err
}

// GetVMNetworkInterfaces returns guest network interfaces from the QEMU agent
func (cc *ClusterClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]VMNetworkInterface, error) {
	var result []VMNetworkInterface
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		interfaces, err := client.GetVMNetworkInterfaces(ctx, node, vmid)
		if err != nil {
			return err
		}
		result = interfaces
		return nil
	})
	return result, err
}

// GetVMMemAvailableFromAgent reads /proc/meminfo via the QEMU guest agent to get MemAvailable.
func (cc *ClusterClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
	var result uint64
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		available, err := client.GetVMMemAvailableFromAgent(ctx, node, vmid)
		if err != nil {
			return err
		}
		result = available
		return nil
	})
	return result, err
}

// GetClusterResources returns all resources (VMs, containers) across the cluster in a single call
func (cc *ClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]ClusterResource, error) {
	var result []ClusterResource
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		resources, err := client.GetClusterResources(ctx, resourceType)
		if err != nil {
			return err
		}
		result = resources
		return nil
	})
	return result, err
}

// GetContainerStatus returns the status of a specific container
func (cc *ClusterClient) GetContainerStatus(ctx context.Context, node string, vmid int) (*Container, error) {
	var result *Container
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		status, err := client.GetContainerStatus(ctx, node, vmid)
		if err != nil {
			return err
		}
		result = status
		return nil
	})
	return result, err
}

// GetContainerConfig returns the configuration of a specific container
func (cc *ClusterClient) GetContainerConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	var result map[string]interface{}
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		config, err := client.GetContainerConfig(ctx, node, vmid)
		if err != nil {
			return err
		}
		result = config
		return nil
	})
	return result, err
}

// GetContainerInterfaces returns interface details for a container
func (cc *ClusterClient) GetContainerInterfaces(ctx context.Context, node string, vmid int) ([]ContainerInterface, error) {
	var result []ContainerInterface
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		interfaces, err := client.GetContainerInterfaces(ctx, node, vmid)
		if err != nil {
			return err
		}
		result = interfaces
		return nil
	})
	return result, err
}

// IsClusterMember checks if this node is part of a cluster
func (cc *ClusterClient) IsClusterMember(ctx context.Context) (bool, error) {
	var result bool
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		isMember, err := client.IsClusterMember(ctx)
		if err != nil {
			return err
		}
		result = isMember
		return nil
	})
	return result, err
}

// GetZFSPoolStatus returns ZFS pool status for a node
func (cc *ClusterClient) GetZFSPoolStatus(ctx context.Context, node string) ([]ZFSPoolStatus, error) {
	var result []ZFSPoolStatus
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		pools, err := client.GetZFSPoolStatus(ctx, node)
		if err != nil {
			return err
		}
		result = pools
		return nil
	})
	return result, err
}

// GetZFSPoolsWithDetails returns ZFS pools with full details for a node
func (cc *ClusterClient) GetZFSPoolsWithDetails(ctx context.Context, node string) ([]ZFSPoolInfo, error) {
	var result []ZFSPoolInfo
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		pools, err := client.GetZFSPoolsWithDetails(ctx, node)
		if err != nil {
			return err
		}
		result = pools
		return nil
	})
	return result, err
}

// GetDisks returns physical disk information for a node with failover support
func (cc *ClusterClient) GetDisks(ctx context.Context, node string) ([]Disk, error) {
	var result []Disk
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		disks, err := client.GetDisks(ctx, node)
		if err != nil {
			return err
		}
		result = disks
		return nil
	})

	// Don't return error for transient connectivity issues
	if err != nil && strings.Contains(err.Error(), "no healthy nodes available") {
		log.Debug().
			Str("cluster", cc.name).
			Str("node", node).
			Err(err).
			Msg("No healthy nodes for GetDisks - returning empty list")
		return []Disk{}, nil
	}

	return result, err
}

// GetNodePendingUpdates returns pending apt updates for a node with failover support
func (cc *ClusterClient) GetNodePendingUpdates(ctx context.Context, node string) ([]AptPackage, error) {
	var result []AptPackage
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		pkgs, err := client.GetNodePendingUpdates(ctx, node)
		if err != nil {
			return err
		}
		result = pkgs
		return nil
	})

	// Don't return error for transient connectivity issues or permission issues
	if err != nil && (strings.Contains(err.Error(), "no healthy nodes available") ||
		strings.Contains(err.Error(), "403") || strings.Contains(err.Error(), "permission")) {
		log.Debug().
			Str("cluster", cc.name).
			Str("node", node).
			Err(err).
			Msg("Could not get pending updates - returning empty list")
		return []AptPackage{}, nil
	}

	return result, err
}

// GetClusterStatus returns the cluster status including all nodes with failover support.
func (cc *ClusterClient) GetClusterStatus(ctx context.Context) ([]ClusterStatus, error) {
	var result []ClusterStatus
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		status, err := client.GetClusterStatus(ctx)
		if err != nil {
			return err
		}
		result = status
		return nil
	})

	return result, err
}

// IsQuorate checks if the cluster has quorum by querying the Proxmox cluster status.
// Returns true if the cluster is quorate (has enough votes for consensus), false otherwise.
// This is the authoritative check for cluster health - a cluster with quorum is healthy
// even if some nodes are intentionally offline (e.g., backup nodes not running).
func (cc *ClusterClient) IsQuorate(ctx context.Context) (bool, error) {
	status, err := cc.GetClusterStatus(ctx)
	if err != nil {
		return false, err
	}

	// Look for the cluster entry which has the quorate field
	for _, s := range status {
		if s.Type == "cluster" {
			return s.Quorate == 1, nil
		}
	}

	// If no cluster entry found, this might be a standalone node - consider it healthy
	return true, nil
}

// GetClusterOptions fetches datacenter options (e.g. tag colour map) via the first healthy node.
func (cc *ClusterClient) GetClusterOptions(ctx context.Context) (*ClusterOptions, error) {
	var result *ClusterOptions
	err := cc.executeWithFailover(ctx, func(client *Client) error {
		opts, err := client.GetClusterOptions(ctx)
		if err != nil {
			return err
		}
		result = opts
		return nil
	})
	return result, err
}

// isAuthError checks if an error is an authentication error
func isAuthError(err error) bool {
	if err == nil {
		return false
	}
	errStr := err.Error()
	return strings.Contains(errStr, "authentication") ||
		strings.Contains(errStr, "401") ||
		strings.Contains(errStr, "403")
}