From cc756f5d2ff40327ca336dbca500cc73367d5ec7 Mon Sep 17 00:00:00 2001 From: rcourtman Date: Wed, 1 Apr 2026 13:17:21 +0100 Subject: [PATCH] Select reachable Proxmox auto-register hosts --- internal/api/config_handlers.go | 134 ++++++++++++++---- .../api/config_handlers_auto_register_test.go | 76 ++++++++++ internal/hostagent/proxmox_setup.go | 119 +++++++++++----- internal/hostagent/proxmox_setup_test.go | 15 ++ 4 files changed, 280 insertions(+), 64 deletions(-) diff --git a/internal/api/config_handlers.go b/internal/api/config_handlers.go index ef92bd131..5a9b5e095 100644 --- a/internal/api/config_handlers.go +++ b/internal/api/config_handlers.go @@ -5322,14 +5322,15 @@ func (h *ConfigHandlers) HandleUpdateMockMode(w http.ResponseWriter, r *http.Req // AutoRegisterRequest represents a request from the setup script or agent to auto-register a node type AutoRegisterRequest struct { - Type string `json:"type"` // "pve" or "pbs" - Host string `json:"host"` // The host URL - TokenID string `json:"tokenId"` // Full token ID like pulse-monitor@pam!pulse-token - TokenValue string `json:"tokenValue,omitempty"` // The token value for the node - ServerName string `json:"serverName"` // Hostname or IP - SetupCode string `json:"setupCode,omitempty"` // One-time setup code for authentication (deprecated) - AuthToken string `json:"authToken,omitempty"` // Direct auth token from URL (new approach) - Source string `json:"source,omitempty"` // "agent" or "script" - indicates how the node was registered + Type string `json:"type"` // "pve" or "pbs" + Host string `json:"host"` // The preferred host URL + CandidateHosts []string `json:"candidateHosts,omitempty"` // Alternate host URLs the server can try from its own network view + TokenID string `json:"tokenId"` // Full token ID like pulse-monitor@pam!pulse-token + TokenValue string `json:"tokenValue,omitempty"` // The token value for the node + ServerName string `json:"serverName"` // Hostname or IP + SetupCode string `json:"setupCode,omitempty"` // One-time setup code for authentication (deprecated) + AuthToken string `json:"authToken,omitempty"` // Direct auth token from URL (new approach) + Source string `json:"source,omitempty"` // "agent" or "script" - indicates how the node was registered // CheckRegistration asks Pulse whether this Proxmox type already exists in config. // Used by agents to validate stale local registration marker files after server reinstalls. CheckRegistration bool `json:"checkRegistration,omitempty"` @@ -5339,6 +5340,73 @@ type AutoRegisterRequest struct { Password string `json:"password,omitempty"` // Password for authentication (never stored) } +func normalizeAutoRegisterHostCandidates(nodeType, primary string, alternates []string) ([]string, error) { + candidates := make([]string, 0, len(alternates)+1) + seen := make(map[string]struct{}, len(alternates)+1) + addCandidate := func(raw string) error { + if strings.TrimSpace(raw) == "" { + return nil + } + normalized, err := normalizeNodeHost(raw, nodeType) + if err != nil { + return err + } + if _, exists := seen[normalized]; exists { + return nil + } + seen[normalized] = struct{}{} + candidates = append(candidates, normalized) + return nil + } + + if err := addCandidate(primary); err != nil { + return nil, err + } + for _, candidate := range alternates { + if err := addCandidate(candidate); err != nil { + log.Debug(). + Str("type", nodeType). + Str("candidate", candidate). + Err(err). + Msg("Ignoring invalid auto-register host candidate") + } + } + + if len(candidates) == 0 { + return nil, fmt.Errorf("host is required") + } + return candidates, nil +} + +func selectAutoRegisterHost(nodeType, primary string, alternates []string) (string, string, bool, []string, error) { + candidates, err := normalizeAutoRegisterHostCandidates(nodeType, primary, alternates) + if err != nil { + return "", "", false, nil, err + } + + for idx, candidate := range candidates { + fingerprint, err := tlsutil.FetchFingerprint(candidate) + if err != nil { + log.Debug(). + Str("type", nodeType). + Str("candidate", candidate). + Err(err). + Msg("Auto-register candidate not reachable for fingerprint capture") + continue + } + if idx > 0 { + log.Info(). + Str("type", nodeType). + Str("selectedHost", candidate). + Str("requestedHost", candidates[0]). + Msg("Auto-register switched to fallback host candidate reachable from Pulse") + } + return candidate, fingerprint, true, candidates, nil + } + + return candidates[0], "", false, candidates, nil +} + func autoRegisterNodeMatchesHost(nodeHost, requestedHost string) bool { nodeHost = strings.TrimSpace(nodeHost) requestedHost = strings.TrimSpace(requestedHost) @@ -5369,16 +5437,30 @@ func (h *ConfigHandlers) autoRegisteredNodeExists(ctx context.Context, req *Auto return false } + candidates, err := normalizeAutoRegisterHostCandidates(req.Type, req.Host, req.CandidateHosts) + if err != nil { + return false + } + + matchesAny := func(nodeHost string) bool { + for _, candidate := range candidates { + if autoRegisterNodeMatchesHost(nodeHost, candidate) { + return true + } + } + return false + } + switch strings.ToLower(strings.TrimSpace(req.Type)) { case "pve": for _, node := range h.getConfig(ctx).PVEInstances { - if autoRegisterNodeMatchesHost(node.Host, req.Host) { + if matchesAny(node.Host) { return true } } case "pbs": for _, node := range h.getConfig(ctx).PBSInstances { - if autoRegisterNodeMatchesHost(node.Host, req.Host) { + if matchesAny(node.Host) { return true } } @@ -5649,23 +5731,15 @@ func (h *ConfigHandlers) HandleAutoRegister(w http.ResponseWriter, r *http.Reque return } - host, err := normalizeNodeHost(req.Host, req.Type) + host, fingerprint, verifySSL, candidateHosts, err := selectAutoRegisterHost(req.Type, req.Host, req.CandidateHosts) if err != nil { http.Error(w, err.Error(), http.StatusBadRequest) return } - fingerprint := "" - if fp, err := tlsutil.FetchFingerprint(host); err != nil { - log.Warn().Err(err).Str("host", host).Msg("Failed to fetch TLS fingerprint for auto-register") - } else { - fingerprint = fp - } - // Create a node configuration boolFalse := false boolTrue := true - verifySSL := fingerprint != "" // Only enforce strict TLS when we have a fingerprint to verify against nodeConfig := NodeConfigRequest{ Type: req.Type, Name: req.ServerName, @@ -5685,6 +5759,13 @@ func (h *ConfigHandlers) HandleAutoRegister(w http.ResponseWriter, r *http.Reque MonitorGarbageJobs: &boolFalse, } + log.Info(). + Str("type", req.Type). + Str("selectedHost", host). + Strs("candidateHosts", candidateHosts). + Bool("verifySSL", verifySSL). + Msg("Resolved auto-register host candidates") + // Check if a node with this host already exists // IMPORTANT: Match by Host URL primarily. // Also match by name+tokenID for DHCP scenarios where IP changed but it's the same host. @@ -6200,7 +6281,7 @@ func (h *ConfigHandlers) handleSecureAutoRegister(w http.ResponseWriter, r *http Str("username", req.Username). Msg("Processing secure auto-register request") - host, err := normalizeNodeHost(req.Host, req.Type) + host, fingerprint, verifySSL, candidateHosts, err := selectAutoRegisterHost(req.Type, req.Host, req.CandidateHosts) if err != nil { http.Error(w, err.Error(), http.StatusBadRequest) return @@ -6219,13 +6300,12 @@ func (h *ConfigHandlers) handleSecureAutoRegister(w http.ResponseWriter, r *http tokenValue := fmt.Sprintf("%x-%x-%x-%x-%x", tokenBytes[0:4], tokenBytes[4:6], tokenBytes[6:8], tokenBytes[8:10], tokenBytes[10:16]) - fingerprint := "" - if fp, err := tlsutil.FetchFingerprint(host); err != nil { - log.Warn().Err(err).Str("host", host).Msg("Failed to fetch TLS fingerprint for auto-register") - } else { - fingerprint = fp - } - verifySSL := fingerprint != "" // Only enforce strict TLS when we have a fingerprint to verify against + log.Info(). + Str("type", req.Type). + Str("selectedHost", host). + Strs("candidateHosts", candidateHosts). + Bool("verifySSL", verifySSL). + Msg("Resolved secure auto-register host candidates") existingTokenID := "" existingTokenValue := "" diff --git a/internal/api/config_handlers_auto_register_test.go b/internal/api/config_handlers_auto_register_test.go index 60cbaca6c..7b8892f83 100644 --- a/internal/api/config_handlers_auto_register_test.go +++ b/internal/api/config_handlers_auto_register_test.go @@ -1277,3 +1277,79 @@ func TestHandleAutoRegisterUpdateDoesNotClearFingerprintWhenFetchFails(t *testin t.Fatalf("expected existing fingerprint to be preserved, got %q", cfg.PVEInstances[0].Fingerprint) } } + +func TestHandleAutoRegisterSelectsReachableFallbackCandidateHost(t *testing.T) { + tempDir := t.TempDir() + t.Setenv("PULSE_DATA_DIR", tempDir) + + originalDetectPVECluster := detectPVECluster + detectPVECluster = func(clientConfig proxmox.ClientConfig, nodeName string, existingEndpoints []config.ClusterEndpoint) (bool, string, []config.ClusterEndpoint) { + return false, "", nil + } + defer func() { detectPVECluster = originalDetectPVECluster }() + + pveServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer pveServer.Close() + + cfg := &config.Config{ + DataPath: tempDir, + ConfigPath: tempDir, + } + handler := newTestConfigHandlers(t, cfg) + + const authToken = "TEMP-TOKEN-FALLBACK" + tokenHash := internalauth.HashAPIToken(authToken) + handler.codeMutex.Lock() + handler.setupCodes[tokenHash] = &SetupCode{ + ExpiresAt: time.Now().Add(5 * time.Minute), + NodeType: "pve", + } + handler.codeMutex.Unlock() + + reqBody := AutoRegisterRequest{ + Type: "pve", + Host: "https://127.0.0.1:1", + CandidateHosts: []string{"https://127.0.0.1:1", pveServer.URL}, + TokenID: "pulse-monitor@pam!pulse-fallback", + TokenValue: "secret-token", + ServerName: "pve01", + Source: "agent", + AuthToken: authToken, + } + body, err := json.Marshal(reqBody) + if err != nil { + t.Fatalf("failed to marshal request: %v", err) + } + + req := httptest.NewRequest(http.MethodPost, "/api/auto-register", bytes.NewReader(body)) + rec := httptest.NewRecorder() + handler.HandleAutoRegister(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("registration failed: status=%d, body=%s", rec.Code, rec.Body.String()) + } + + if len(cfg.PVEInstances) != 1 { + t.Fatalf("expected 1 PVE instance, got %d", len(cfg.PVEInstances)) + } + + expectedHost, err := normalizeNodeHost(pveServer.URL, "pve") + if err != nil { + t.Fatalf("failed to normalize expected host: %v", err) + } + if cfg.PVEInstances[0].Host != expectedHost { + t.Fatalf("selected host = %q, want reachable fallback %q", cfg.PVEInstances[0].Host, expectedHost) + } + + expectedFP, err := tlsutil.FetchFingerprint(pveServer.URL) + if err != nil { + t.Fatalf("failed to fetch expected fingerprint: %v", err) + } + if cfg.PVEInstances[0].Fingerprint != expectedFP { + t.Fatalf("fingerprint = %q, want %q", cfg.PVEInstances[0].Fingerprint, expectedFP) + } + if !cfg.PVEInstances[0].VerifySSL { + t.Fatal("expected verifySSL to be enabled when fallback fingerprint capture succeeds") + } +} diff --git a/internal/hostagent/proxmox_setup.go b/internal/hostagent/proxmox_setup.go index 991975a7e..a1f2c7014 100644 --- a/internal/hostagent/proxmox_setup.go +++ b/internal/hostagent/proxmox_setup.go @@ -611,69 +611,99 @@ func (p *ProxmoxSetup) parsePBSTokenValue(output string) string { return "" } -// getHostURL constructs the host URL for this Proxmox node. -// Uses the local IP that can reach Pulse, falling back to intelligent IP selection. -func (p *ProxmoxSetup) getHostURL(ptype string) string { +func appendUniqueHostCandidate(candidates []string, seen map[string]struct{}, candidate string) []string { + normalized := strings.TrimSpace(candidate) + if normalized == "" { + return candidates + } + if _, exists := seen[normalized]; exists { + return candidates + } + seen[normalized] = struct{}{} + return append(candidates, normalized) +} + +func (p *ProxmoxSetup) candidateHostURLs(ptype string) []string { port := "8006" if ptype == "pbs" { port = "8007" } - // Priority 1: User-specified ReportIP override from configuration. - // This allows users to manually specify which IP should be used for Proxmox API - // connections when auto-detection picks the wrong one (e.g., Issue #1061). - if p.reportIP != "" { - p.logger.Info().Str("ip", p.reportIP).Msg("Using user-specified ReportIP for Proxmox registration") - return fmt.Sprintf("https://%s:%s", p.reportIP, port) + candidates := make([]string, 0, 4) + seen := make(map[string]struct{}, 4) + buildURL := func(host string) string { + if strings.TrimSpace(host) == "" { + return "" + } + return fmt.Sprintf("https://%s:%s", host, port) } - // Priority 2: Prefer the system hostname when it resolves to a non-loopback address. - // This preserves admin-managed DNS names and matching TLS certificates instead of - // silently downgrading to an inferred IP address, which can pick the wrong interface - // on multi-NIC Proxmox hosts and break internal-CA deployments. + // Respect an explicit operator override as authoritative. + if p.reportIP != "" { + return append(candidates, buildURL(p.reportIP)) + } + + if p.collector == nil { + hostname := strings.TrimSpace(p.hostname) + if hostname == "" { + hostname = "localhost" + } + return append(candidates, buildURL(hostname)) + } + + // Keep the hostname candidate first for environments where Pulse can resolve + // the managed DNS name and should preserve the node certificate identity. if hostname := strings.TrimSpace(p.hostname); hostname != "" { if hostnameIP := p.getIPForHostname(); hostnameIP != "" && !isLoopbackOrLinkLocalIP(hostnameIP) { - p.logger.Debug(). - Str("hostname", hostname). - Str("ip", hostnameIP). - Msg("Using resolvable hostname for Proxmox registration") - return fmt.Sprintf("https://%s:%s", hostname, port) + candidates = appendUniqueHostCandidate(candidates, seen, buildURL(hostname)) } } - // Priority 3: Try to determine which local IP is used to connect to Pulse. - // This remains a fallback for environments without usable hostname resolution. + // Also provide the exact local IP that can reach Pulse so the server can fall + // back to a proven-routable endpoint when the hostname is not reachable from + // the Pulse host (for example inside a Docker bridge without internal DNS). if reachableIP := p.getIPThatReachesPulse(); reachableIP != "" { - p.logger.Debug().Str("ip", reachableIP).Msg("Using IP that can reach Pulse server") - return fmt.Sprintf("https://%s:%s", reachableIP, port) + candidates = appendUniqueHostCandidate(candidates, seen, buildURL(reachableIP)) } - // Priority 4: Get all IPs and select the best one based on heuristics. if out, err := p.collector.CommandCombinedOutput(context.Background(), "hostname", "-I"); err == nil { ips := strings.Fields(out) if len(ips) > 0 { - // Get the IP that the system hostname currently resolves to hostnameIP := p.getIPForHostname() - - bestIP := selectBestIP(ips, hostnameIP) - if bestIP != "" { - return fmt.Sprintf("https://%s:%s", bestIP, port) + if bestIP := selectBestIP(ips, hostnameIP); bestIP != "" { + candidates = appendUniqueHostCandidate(candidates, seen, buildURL(bestIP)) } } } - // Final fallback to hostname if IP detection failed - hostname := p.hostname + if len(candidates) > 0 { + return candidates + } + + hostname := strings.TrimSpace(p.hostname) if hostname == "" { hostname = "localhost" } - return fmt.Sprintf("https://%s:%s", hostname, port) + return append(candidates, buildURL(hostname)) +} + +// getHostURL constructs the host URL for this Proxmox node. +// Uses the local IP that can reach Pulse, falling back to intelligent IP selection. +func (p *ProxmoxSetup) getHostURL(ptype string) string { + candidates := p.candidateHostURLs(ptype) + if len(candidates) == 0 { + return "" + } + return candidates[0] } // getIPThatReachesPulse determines which local IP is used to connect to the Pulse server. // This handles cases where multiple network interfaces exist (e.g., management, Ceph, cluster ring) // and ensures we pick the one that can actually reach Pulse. Related to #929. func (p *ProxmoxSetup) getIPThatReachesPulse() string { + if p.collector == nil { + return "" + } if p.pulseURL == "" { return "" } @@ -726,6 +756,9 @@ func (p *ProxmoxSetup) getIPThatReachesPulse() string { // getIPForHostname resolves the system hostname to an IP address. func (p *ProxmoxSetup) getIPForHostname() string { + if p.collector == nil { + return "" + } hostname := p.hostname if hostname == "" { hostname, _ = p.collector.Hostname() @@ -898,13 +931,25 @@ func (p *ProxmoxSetup) doRegisterRequest(ctx context.Context, body []byte) error // registerWithPulse calls the auto-register endpoint to add the node. func (p *ProxmoxSetup) registerWithPulse(ctx context.Context, ptype, hostURL, tokenID, tokenValue string) error { + candidateHosts := p.candidateHostURLs(ptype) + if len(candidateHosts) == 0 || candidateHosts[0] != hostURL { + seen := make(map[string]struct{}, len(candidateHosts)+1) + reordered := make([]string, 0, len(candidateHosts)+1) + reordered = appendUniqueHostCandidate(reordered, seen, hostURL) + for _, candidate := range candidateHosts { + reordered = appendUniqueHostCandidate(reordered, seen, candidate) + } + candidateHosts = reordered + } + payload := map[string]interface{}{ - "type": ptype, - "host": hostURL, - "serverName": p.hostname, - "tokenId": tokenID, - "tokenValue": tokenValue, - "source": "agent", // Indicates this was registered via agent + "type": ptype, + "host": hostURL, + "candidateHosts": candidateHosts, + "serverName": p.hostname, + "tokenId": tokenID, + "tokenValue": tokenValue, + "source": "agent", // Indicates this was registered via agent } if token := strings.TrimSpace(p.apiToken); token != "" { // Auto-register accepts one-time setup tokens via the JSON body. The diff --git a/internal/hostagent/proxmox_setup_test.go b/internal/hostagent/proxmox_setup_test.go index 85897a542..7b3d19099 100644 --- a/internal/hostagent/proxmox_setup_test.go +++ b/internal/hostagent/proxmox_setup_test.go @@ -370,6 +370,7 @@ func TestRegisterWithPulseRetry(t *testing.T) { var gotAuth string var gotAPIToken string var gotAuthToken string + var gotCandidateHosts []string server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { gotAuth = r.Header.Get("Authorization") gotAPIToken = r.Header.Get("X-API-Token") @@ -380,6 +381,14 @@ func TestRegisterWithPulseRetry(t *testing.T) { if token, _ := payload["authToken"].(string); token != "" { gotAuthToken = token } + if rawCandidates, ok := payload["candidateHosts"].([]interface{}); ok { + gotCandidateHosts = gotCandidateHosts[:0] + for _, raw := range rawCandidates { + if candidate, ok := raw.(string); ok { + gotCandidateHosts = append(gotCandidateHosts, candidate) + } + } + } n := atomic.AddInt32(&attempt, 1) if n <= 2 { w.WriteHeader(http.StatusServiceUnavailable) @@ -414,6 +423,12 @@ func TestRegisterWithPulseRetry(t *testing.T) { if gotAuthToken != "test-token" { t.Fatalf("authToken payload = %q, want %q", gotAuthToken, "test-token") } + if len(gotCandidateHosts) == 0 { + t.Fatal("expected candidateHosts payload to be present") + } + if gotCandidateHosts[0] != "https://10.0.0.1:8006" { + t.Fatalf("candidateHosts[0] = %q, want %q", gotCandidateHosts[0], "https://10.0.0.1:8006") + } } func TestRegisterWithPulseNoRetryOn4xx(t *testing.T) {