Fix storage disappearing after upgrade by preserving TLS validation

Fixes #657

Between v4.25.0 and v4.26.4, commit 72865ff62 changed cluster endpoint
resolution to prefer IP addresses over hostnames to reduce DNS lookups
(refs #620). However, this caused TLS certificate validation to fail for
installations with VerifySSL=true, because Proxmox certificates typically
contain hostnames (e.g., pve01.example.com), not IP addresses.

When all cluster endpoints failed TLS validation during the initial health
check, the ClusterClient marked all nodes as unhealthy. Subsequent calls
to GetAllStorage() would fail with "no healthy nodes available in cluster",
causing storage data to disappear from the UI despite the cluster being
fully operational.

**Root Cause:**
The IP-first approach breaks TLS hostname verification when:
- VerifySSL is enabled (common for production environments)
- Certificates are issued with hostnames, not IPs (standard practice)
- Result: x509 certificate validation fails (e.g., "certificate is valid
  for pve01.example.com, not 10.0.0.44")

**Solution:**
Conditionally prefer hostnames vs IPs based on TLS validation requirements:

1. When TLS hostname verification is required (VerifySSL=true AND no
   fingerprint override), prefer hostname to ensure certificate CN/SAN
   validation succeeds.

2. When TLS hostname verification is not required (VerifySSL=false OR a
   fingerprint is provided, which bypasses hostname checks), prefer IP to
   reduce DNS lookups.

This approach:
- Fixes the regression for users with VerifySSL enabled
- Preserves the DNS optimization for self-signed/fingerprint configs
- Maintains backwards compatibility with v4.25.0 behavior
- Does not compromise TLS security

**Testing:**
Users reported that rolling back to v4.25.0 fixed their storage visibility.
This fix should restore storage for v4.26.4+ while maintaining the DNS
optimization for appropriate scenarios.
This commit is contained in:
rcourtman 2025-11-07 15:36:52 +00:00
parent 19091d47c9
commit 59a97f2e3e

View file

@ -351,13 +351,29 @@ func ensureClusterEndpointURL(raw string) string {
return "https://" + net.JoinHostPort(value, "8006")
}
func clusterEndpointEffectiveURL(endpoint config.ClusterEndpoint) string {
// Prefer IP address to avoid excessive DNS lookups
if endpoint.IP != "" {
return ensureClusterEndpointURL(endpoint.IP)
}
if endpoint.Host != "" {
return ensureClusterEndpointURL(endpoint.Host)
// clusterEndpointEffectiveURL picks the address used to reach a cluster
// endpoint and returns it normalized by ensureClusterEndpointURL.
//
// The order of preference depends on whether the TLS certificate's hostname
// has to match the address being dialed:
//   - verifySSL is true and hasFingerprint is false: the hostname is tried
//     first so certificate CN/SAN validation can succeed, with the IP as a
//     fallback.
//   - otherwise (verification disabled, or a fingerprint is supplied, which
//     bypasses hostname checks): the IP is tried first to reduce DNS lookups
//     (refs #620), with the hostname as a fallback.
//
// An empty string is returned when the endpoint has neither a Host nor an IP.
func clusterEndpointEffectiveURL(endpoint config.ClusterEndpoint, verifySSL bool, hasFingerprint bool) string {
	// Default ordering: IP first, hostname second.
	preferred, fallback := endpoint.IP, endpoint.Host
	if verifySSL && !hasFingerprint {
		// Hostname verification is in effect, so flip the ordering.
		preferred, fallback = endpoint.Host, endpoint.IP
	}

	for _, candidate := range [...]string{preferred, fallback} {
		if candidate != "" {
			return ensureClusterEndpointURL(candidate)
		}
	}
	return ""
}
@ -3546,7 +3562,8 @@ func New(cfg *config.Config) (*Monitor, error) {
endpoints := make([]string, 0, len(pve.ClusterEndpoints))
for _, ep := range pve.ClusterEndpoints {
effectiveURL := clusterEndpointEffectiveURL(ep)
hasFingerprint := pve.Fingerprint != ""
effectiveURL := clusterEndpointEffectiveURL(ep, pve.VerifySSL, hasFingerprint)
if effectiveURL == "" {
log.Warn().
Str("node", ep.NodeName).
@ -5139,9 +5156,10 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
connectionHost := instanceCfg.Host
guestURL := instanceCfg.GuestURL
if instanceCfg.IsCluster && len(instanceCfg.ClusterEndpoints) > 0 {
hasFingerprint := instanceCfg.Fingerprint != ""
for _, ep := range instanceCfg.ClusterEndpoints {
if strings.EqualFold(ep.NodeName, node.Node) {
if effective := clusterEndpointEffectiveURL(ep); effective != "" {
if effective := clusterEndpointEffectiveURL(ep, instanceCfg.VerifySSL, hasFingerprint); effective != "" {
connectionHost = effective
}
if ep.GuestURL != "" {
@ -5598,9 +5616,10 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
sshHost := modelNode.Host
if modelNode.IsClusterMember && instanceCfg.IsCluster {
hasFingerprint := instanceCfg.Fingerprint != ""
for _, ep := range instanceCfg.ClusterEndpoints {
if strings.EqualFold(ep.NodeName, node.Node) {
if effective := clusterEndpointEffectiveURL(ep); effective != "" {
if effective := clusterEndpointEffectiveURL(ep, instanceCfg.VerifySSL, hasFingerprint); effective != "" {
sshHost = effective
}
break
@ -6074,6 +6093,7 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
}
// Update the online status for each cluster endpoint
hasFingerprint := instanceCfg.Fingerprint != ""
for i := range instanceCfg.ClusterEndpoints {
if online, exists := onlineNodes[instanceCfg.ClusterEndpoints[i].NodeName]; exists {
instanceCfg.ClusterEndpoints[i].Online = online
@ -6085,7 +6105,7 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
// Update Pulse connectivity status
if pulseHealth != nil {
// Try to find the endpoint in the health map by matching the effective URL
endpointURL := clusterEndpointEffectiveURL(instanceCfg.ClusterEndpoints[i])
endpointURL := clusterEndpointEffectiveURL(instanceCfg.ClusterEndpoints[i], instanceCfg.VerifySSL, hasFingerprint)
if health, exists := pulseHealth[endpointURL]; exists {
reachable := health.Healthy
instanceCfg.ClusterEndpoints[i].PulseReachable = &reachable