From 59a97f2e3e9560a4fbe24b083039669acf9fcbf0 Mon Sep 17 00:00:00 2001 From: rcourtman Date: Fri, 7 Nov 2025 15:36:52 +0000 Subject: [PATCH] Fix storage disappearing after upgrade by preserving TLS validation Fixes #657 Between v4.25.0 and v4.26.4, commit 72865ff62 changed cluster endpoint resolution to prefer IP addresses over hostnames to reduce DNS lookups (refs #620). However, this caused TLS certificate validation to fail for installations with VerifySSL=true, because Proxmox certificates typically contain hostnames (e.g., pve01.example.com), not IP addresses. When all cluster endpoints failed TLS validation during the initial health check, the ClusterClient marked all nodes as unhealthy. Subsequent calls to GetAllStorage() would fail with "no healthy nodes available in cluster", causing storage data to disappear from the UI despite the cluster being fully operational. **Root Cause:** The IP-first approach breaks TLS hostname verification when both of the following hold: - VerifySSL is enabled (common for production environments) - Certificates are issued with hostnames, not IPs (standard practice) In that case x509 certificate validation fails (e.g., "certificate is valid for pve01.example.com, not 10.0.0.44"). **Solution:** Prefer hostnames or IPs depending on TLS validation requirements: 1. When TLS hostname verification is required (VerifySSL=true AND no fingerprint override), prefer hostname to ensure certificate CN/SAN validation succeeds. 2. When TLS verification is bypassed (VerifySSL=false OR fingerprint provided), prefer IP to reduce DNS lookups. This approach: - Fixes the regression for users with VerifySSL enabled - Preserves the DNS optimization for self-signed/fingerprint configs - Maintains backwards compatibility with v4.25.0 behavior - Does not compromise TLS security **Testing:** Users reported that rolling back to v4.25.0 fixed their storage visibility. This fix should restore storage for v4.26.4+ while maintaining the DNS optimization for appropriate scenarios. 
--- internal/monitoring/monitor.go | 42 +++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/internal/monitoring/monitor.go b/internal/monitoring/monitor.go index 9699cfb53..648725fa8 100644 --- a/internal/monitoring/monitor.go +++ b/internal/monitoring/monitor.go @@ -351,13 +351,29 @@ func ensureClusterEndpointURL(raw string) string { return "https://" + net.JoinHostPort(value, "8006") } -func clusterEndpointEffectiveURL(endpoint config.ClusterEndpoint) string { - // Prefer IP address to avoid excessive DNS lookups - if endpoint.IP != "" { - return ensureClusterEndpointURL(endpoint.IP) - } - if endpoint.Host != "" { - return ensureClusterEndpointURL(endpoint.Host) +func clusterEndpointEffectiveURL(endpoint config.ClusterEndpoint, verifySSL bool, hasFingerprint bool) string { + // When TLS hostname verification is required (VerifySSL=true and no fingerprint), + // prefer hostname over IP to ensure certificate CN/SAN validation works correctly. + // When TLS is not verified (VerifySSL=false) or a fingerprint is provided (which + // bypasses hostname checks), prefer IP to reduce DNS lookups (refs #620). 
+ requiresHostnameForTLS := verifySSL && !hasFingerprint + + if requiresHostnameForTLS { + // Prefer hostname for proper TLS certificate validation + if endpoint.Host != "" { + return ensureClusterEndpointURL(endpoint.Host) + } + if endpoint.IP != "" { + return ensureClusterEndpointURL(endpoint.IP) + } + } else { + // Prefer IP address to avoid excessive DNS lookups + if endpoint.IP != "" { + return ensureClusterEndpointURL(endpoint.IP) + } + if endpoint.Host != "" { + return ensureClusterEndpointURL(endpoint.Host) + } } return "" } @@ -3546,7 +3562,8 @@ func New(cfg *config.Config) (*Monitor, error) { endpoints := make([]string, 0, len(pve.ClusterEndpoints)) for _, ep := range pve.ClusterEndpoints { - effectiveURL := clusterEndpointEffectiveURL(ep) + hasFingerprint := pve.Fingerprint != "" + effectiveURL := clusterEndpointEffectiveURL(ep, pve.VerifySSL, hasFingerprint) if effectiveURL == "" { log.Warn(). Str("node", ep.NodeName). @@ -5139,9 +5156,10 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie connectionHost := instanceCfg.Host guestURL := instanceCfg.GuestURL if instanceCfg.IsCluster && len(instanceCfg.ClusterEndpoints) > 0 { + hasFingerprint := instanceCfg.Fingerprint != "" for _, ep := range instanceCfg.ClusterEndpoints { if strings.EqualFold(ep.NodeName, node.Node) { - if effective := clusterEndpointEffectiveURL(ep); effective != "" { + if effective := clusterEndpointEffectiveURL(ep, instanceCfg.VerifySSL, hasFingerprint); effective != "" { connectionHost = effective } if ep.GuestURL != "" { @@ -5598,9 +5616,10 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie sshHost := modelNode.Host if modelNode.IsClusterMember && instanceCfg.IsCluster { + hasFingerprint := instanceCfg.Fingerprint != "" for _, ep := range instanceCfg.ClusterEndpoints { if strings.EqualFold(ep.NodeName, node.Node) { - if effective := clusterEndpointEffectiveURL(ep); effective != "" { + if effective := 
clusterEndpointEffectiveURL(ep, instanceCfg.VerifySSL, hasFingerprint); effective != "" { sshHost = effective } break @@ -6074,6 +6093,7 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie } // Update the online status for each cluster endpoint + hasFingerprint := instanceCfg.Fingerprint != "" for i := range instanceCfg.ClusterEndpoints { if online, exists := onlineNodes[instanceCfg.ClusterEndpoints[i].NodeName]; exists { instanceCfg.ClusterEndpoints[i].Online = online @@ -6085,7 +6105,7 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie // Update Pulse connectivity status if pulseHealth != nil { // Try to find the endpoint in the health map by matching the effective URL - endpointURL := clusterEndpointEffectiveURL(instanceCfg.ClusterEndpoints[i]) + endpointURL := clusterEndpointEffectiveURL(instanceCfg.ClusterEndpoints[i], instanceCfg.VerifySSL, hasFingerprint) if health, exists := pulseHealth[endpointURL]; exists { reachable := health.Healthy instanceCfg.ClusterEndpoints[i].PulseReachable = &reachable