diff --git a/README.md b/README.md index 115e3b3cc..a4da0ea55 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,12 @@ Your infrastructure data is yours alone. | Kubernetes/Helm | Clusters needing HA, ingress, GitOps | Kubernetes cluster with storage class + Helm 3 | [docs/KUBERNETES.md](docs/KUBERNETES.md) | | Bare metal/systemd | Minimal installs or environments without containers | Go-supported Linux host, systemd access | [docs/INSTALL.md](docs/INSTALL.md) and `scripts/build-release.sh` | +### Bootstrap vs Node Setup + +- Run `install.sh` on the Proxmox host to create the Pulse LXC and (optionally) install `pulse-sensor-proxy`. The installer records the decision in `/etc/pulse/install_summary.json`. +- After Pulse is running, head to **Settings → Nodes** and run the Quick Setup script per PVE/PBS/PMG instance. The script reads the summary so it can skip redundant prompts when the host proxy already exists, and it only asks you to deploy HTTPS proxies for remote/standalone nodes. +- If you skipped the proxy during bootstrap, the Quick Setup script (and the Settings UI) now remind you and provide a copy/paste HTTPS installer command so you can enable temperatures later. + ## Quick Start ### Install diff --git a/docs/TEMPERATURE_MONITORING.md b/docs/TEMPERATURE_MONITORING.md index 4420864e4..8e167875e 100644 --- a/docs/TEMPERATURE_MONITORING.md +++ b/docs/TEMPERATURE_MONITORING.md @@ -181,6 +181,10 @@ When Pulse cannot share the `/run/pulse-sensor-proxy` socket (for example, you r This HTTP path complements the socket path—you can run both simultaneously. Containerised Pulse stacks still need the socket for their own host, while HTTP mode covers every additional Proxmox node on the LAN or across sites. +Pulse now isolates transport failures per node: when a proxy reports that a node is invalid or unreachable, Pulse cools down polling for that node only instead of tearing down the shared socket. 
You will see a cooldown note in the diagnostics card if a node keeps failing; fix the proxy to resume collection, or disable temperature monitoring for that node. + +> **Tip:** When Pulse is running inside a container and temperatures are blocked, open **Settings → Nodes → Edit node → Temperature monitoring**. The UI now offers a one-click “Generate HTTPS proxy command” button that produces the exact `install-sensor-proxy.sh --standalone --http-mode --pulse-server …` command for that node, so you can copy it straight to the host shell without rebuilding the instructions manually. + --- ## Disable Temperature Monitoring @@ -359,6 +363,8 @@ When run on a Proxmox host with Pulse in an LXC container: 5. Sets up SSH keys and cluster discovery 6. **Fully turnkey - no manual steps required!** +> **Note:** The main `install.sh` already installs the host-side proxy when you opt in during bootstrap, so the Quick Setup script simply verifies it and moves on—you won’t be prompted a second time. Remote/standalone nodes still prompt to deploy their own HTTPS proxy. 
+ ### For Docker Deployments (Manual Steps Required) When Pulse runs in Docker, the setup script will show you manual steps: diff --git a/frontend-modern/src/components/Settings/ConfiguredNodeTables.tsx b/frontend-modern/src/components/Settings/ConfiguredNodeTables.tsx index be656f25b..cfaca1daf 100644 --- a/frontend-modern/src/components/Settings/ConfiguredNodeTables.tsx +++ b/frontend-modern/src/components/Settings/ConfiguredNodeTables.tsx @@ -10,8 +10,15 @@ type NodeConfigWithStatus = NodeConfig & { export interface TemperatureTransportInfo { httpMap: Record; socketStatus: 'healthy' | 'error' | 'missing'; + socketCooldowns?: Record; } +type TemperatureSocketCooldownInfo = { + secondsRemaining?: number; + until?: string; + lastError?: string; +}; + interface PveNodesTableProps { nodes: NodeConfigWithStatus[]; stateNodes: { instance: string; status?: string; connectionHealth?: string }[]; @@ -30,6 +37,39 @@ type TemperatureTransportBadge = { description?: string; }; +const normalizeHostKey = (value?: string) => { + if (!value) { + return ''; + } + let result = value.trim().toLowerCase(); + if (!result) { + return ''; + } + result = result.replace(/^https?:\/\//, ''); + const slashIndex = result.indexOf('/'); + if (slashIndex !== -1) { + result = result.slice(0, slashIndex); + } + const colonIndex = result.indexOf(':'); + if (colonIndex !== -1) { + result = result.slice(0, colonIndex); + } + return result; +}; + +const formatCooldown = (seconds?: number) => { + if (!seconds || seconds <= 0) { + return '0s'; + } + if (seconds >= 3600) { + return `${Math.round(seconds / 3600)}h`; + } + if (seconds >= 60) { + return `${Math.round(seconds / 60)}m`; + } + return `${Math.round(seconds)}s`; +}; + const STATUS_META: Record = { online: { dotClass: 'bg-green-500', @@ -65,6 +105,11 @@ const resolveTemperatureTransport = ( ): TemperatureTransportBadge => { const monitoringEnabled = isTemperatureMonitoringEnabled(node, globalEnabled); const normalizedTransport = 
(node.temperatureTransport || '').toLowerCase(); + const nodeKey = normalizeHostKey(node.name); + const hostKey = normalizeHostKey(node.host); + const socketCooldownEntry = + (nodeKey && info?.socketCooldowns?.[nodeKey]) || + (hostKey && info?.socketCooldowns?.[hostKey]); if (!monitoringEnabled) { return { label: 'Temp disabled', @@ -78,11 +123,21 @@ const resolveTemperatureTransport = ( }; } - const key = (node.name || '').toLowerCase(); - const httpEntry = info?.httpMap?.[key]; + const key = nodeKey; + const httpEntry = key ? info?.httpMap?.[key] : undefined; const socketStatus = info?.socketStatus; const buildSocketBadge = (): TemperatureTransportBadge => { + if (socketCooldownEntry) { + const retryText = `Retrying in ${formatCooldown(socketCooldownEntry.secondsRemaining)}`; + return { + label: 'Socket cooldown', + badgeClass: 'bg-amber-100 dark:bg-amber-900 text-amber-700 dark:text-amber-300', + description: socketCooldownEntry.lastError + ? `${socketCooldownEntry.lastError} (${retryText})` + : retryText, + }; + } if (socketStatus === 'error') { return { label: 'Socket error', diff --git a/frontend-modern/src/components/Settings/NodeModal.tsx b/frontend-modern/src/components/Settings/NodeModal.tsx index e54cc7795..68405125a 100644 --- a/frontend-modern/src/components/Settings/NodeModal.tsx +++ b/frontend-modern/src/components/Settings/NodeModal.tsx @@ -1,4 +1,4 @@ -import { Component, Show, For, createSignal, createEffect } from 'solid-js'; +import { Component, Show, For, createSignal, createEffect, createMemo } from 'solid-js'; import { Portal } from 'solid-js/web'; import type { NodeConfig } from '@/types/nodes'; import type { SecurityStatus } from '@/types/config'; @@ -6,6 +6,7 @@ import { copyToClipboard } from '@/utils/clipboard'; import { showSuccess, showError } from '@/utils/toast'; import { getPulseBaseUrl } from '@/utils/url'; import { NodesAPI } from '@/api/nodes'; +import { apiFetchJSON } from '@/utils/apiClient'; import { SectionHeader } from 
'@/components/shared/SectionHeader'; import { formField, @@ -33,6 +34,12 @@ interface NodeModalProps { onToggleTemperatureMonitoring?: (enabled: boolean) => Promise | void; } +type TemperatureTransportDetail = { + tone: 'info' | 'success' | 'warning' | 'danger'; + message: string; + disable?: boolean; +}; + const deriveNameFromHost = (host: string): string => { let value = host.trim(); if (!value) { @@ -85,9 +92,97 @@ export const NodeModal: Component = (props) => { const [quickSetupCommand, setQuickSetupCommand] = createSignal(''); const [quickSetupToken, setQuickSetupToken] = createSignal(''); const [quickSetupExpiry, setQuickSetupExpiry] = createSignal(null); + const [proxyInstallCommand, setProxyInstallCommand] = createSignal(''); + const [loadingProxyCommand, setLoadingProxyCommand] = createSignal(false); + const [proxyCommandError, setProxyCommandError] = createSignal(null); const showTemperatureMonitoringSection = () => typeof props.temperatureMonitoringEnabled === 'boolean'; const temperatureMonitoringEnabledValue = () => props.temperatureMonitoringEnabled ?? true; + const temperatureTransportDetail = createMemo(() => { + const transport = props.editingNode?.temperatureTransport; + if (!transport) { + return null; + } + + switch (transport.toLowerCase()) { + case 'socket-proxy': + return { + tone: 'success', + message: 'Temperatures flow through the host sensor proxy mounted at /run/pulse-sensor-proxy.', + }; + case 'https-proxy': + return { + tone: 'success', + message: 'Temperatures are collected via the HTTPS proxy registered for this node.', + }; + case 'ssh-blocked': + return { + tone: 'danger', + disable: true, + message: + 'Pulse is running in a container without the pulse-sensor-proxy bind mount. 
Install the proxy on the host or register an HTTPS proxy before enabling temperatures.', + }; + case 'ssh': + return { + tone: 'info', + message: 'Pulse will SSH directly into this node for temperature collection.', + }; + default: + return null; + } + }); + const temperatureToggleDisabled = () => + props.temperatureMonitoringLocked || + props.savingTemperatureSetting || + Boolean(temperatureTransportDetail()?.disable); + const temperatureTransportMessageClass = () => { + const tone = temperatureTransportDetail()?.tone ?? 'info'; + switch (tone) { + case 'success': + return 'text-green-600 dark:text-green-300'; + case 'warning': + return 'text-amber-600 dark:text-amber-300'; + case 'danger': + return 'text-red-600 dark:text-red-300'; + default: + return 'text-gray-600 dark:text-gray-400'; + } + }; + const temperatureToggleTitle = () => { + const detail = temperatureTransportDetail(); + if (detail?.disable) { + return detail.message; + } + return undefined; + }; + const shouldOfferProxyCommand = () => + props.nodeType === 'pve' && Boolean(props.editingNode?.id) && Boolean(temperatureTransportDetail()?.disable); + const fetchProxyInstallCommand = async () => { + if (loadingProxyCommand()) { + return; + } + setLoadingProxyCommand(true); + setProxyCommandError(null); + setProxyInstallCommand(''); + try { + const nodeName = props.editingNode?.name ? encodeURIComponent(props.editingNode!.name) : ''; + const query = nodeName ? `?node=${nodeName}` : ''; + const response = await apiFetchJSON(`/api/temperature-proxy/install-command${query}`); + if (!response || typeof response.command !== 'string') { + throw new Error('Proxy installer command unavailable'); + } + setProxyInstallCommand(response.command); + showSuccess('HTTPS proxy command ready', 2000); + } catch (error) { + const message = + error instanceof Error ? 
error.message : 'Failed to generate HTTPS proxy command'; + setProxyCommandError(message); + showError(message); + logger.error('Failed to load proxy install command', error); + } finally { + setLoadingProxyCommand(false); + } + }; const quickSetupExpiryLabel = () => { const expiry = quickSetupExpiry(); if (!expiry) { @@ -1781,7 +1876,8 @@ export const NodeModal: Component = (props) => { onChange={(event) => { props.onToggleTemperatureMonitoring?.(event.currentTarget.checked); }} - disabled={props.temperatureMonitoringLocked || props.savingTemperatureSetting} + disabled={temperatureToggleDisabled()} + title={temperatureToggleTitle()} ariaLabel={ temperatureMonitoringEnabledValue() ? 'Disable temperature monitoring' @@ -1789,6 +1885,58 @@ export const NodeModal: Component = (props) => { } /> + +

+ {temperatureTransportDetail()?.message} +

+
+ +
+
Install HTTPS proxy on this host
+
Generate a one-line installer command to run on the Proxmox host:
+
+ + + Download installer script + +
+ +

+ {proxyCommandError()} +

+
+ +
+                                  {proxyInstallCommand()}
+                                
+ +
+
+

Pulse will skip SSH temperature polling for this node. Existing dashboard readings will stop refreshing. diff --git a/frontend-modern/src/components/Settings/Settings.tsx b/frontend-modern/src/components/Settings/Settings.tsx index fb9ef2d40..43bd7a83e 100644 --- a/frontend-modern/src/components/Settings/Settings.tsx +++ b/frontend-modern/src/components/Settings/Settings.tsx @@ -17,6 +17,7 @@ import { getPulsePort, getPulseWebSocketUrl } from '@/utils/url'; import { logger } from '@/utils/logger'; import { apiFetch, + apiFetchJSON, clearApiToken as clearApiClientToken, getApiToken as getApiClientToken, setApiToken as setApiClientToken, @@ -147,6 +148,38 @@ interface TemperatureProxyControlPlaneState { status?: string; } +interface TemperatureProxySocketHost { + node?: string; + host?: string; + cooldownUntil?: string; + secondsRemaining?: number; + lastError?: string; +} + +type TemperatureSocketCooldownInfo = { + secondsRemaining?: number; + until?: string; + lastError?: string; +}; + +interface HostProxySummary { + requested?: boolean; + installed?: boolean; + hostSocketPresent?: boolean; + containerSocketPresent?: boolean | null; + lastUpdated?: string; + ctid?: string; +} + +interface HostProxyStatusResponse { + hostSocketPresent?: boolean; + containerSocketPresent?: boolean; + summary?: HostProxySummary | null; + reinstallCommand?: string; + installerURL?: string; + lastChecked?: string; +} + interface TemperatureProxyDiagnostic { legacySSHDetected: boolean; recommendProxyUpgrade: boolean; @@ -165,6 +198,7 @@ interface TemperatureProxyDiagnostic { httpProxies?: TemperatureProxyHTTPStatus[]; controlPlaneEnabled?: boolean; controlPlaneStates?: TemperatureProxyControlPlaneState[]; + socketHostCooldowns?: TemperatureProxySocketHost[]; } interface APITokenSummary { @@ -588,6 +622,7 @@ const Settings: Component = (props) => { const [envOverrides, setEnvOverrides] = createSignal>({}); const [temperatureMonitoringEnabled, setTemperatureMonitoringEnabled] = 
createSignal(true); const [savingTemperatureSetting, setSavingTemperatureSetting] = createSignal(false); + const [hostProxyStatus, setHostProxyStatus] = createSignal(null); const temperatureMonitoringLocked = () => Boolean( envOverrides().temperatureMonitoringEnabled || envOverrides()['ENABLE_TEMPERATURE_MONITORING'], @@ -850,6 +885,26 @@ const Settings: Component = (props) => { return `${Math.floor(seconds)}s`; }; + const normalizeHostKey = (value?: string | null) => { + if (!value) { + return ''; + } + let result = value.trim().toLowerCase(); + if (!result) { + return ''; + } + result = result.replace(/^https?:\/\//, ''); + const slashIndex = result.indexOf('/'); + if (slashIndex !== -1) { + result = result.slice(0, slashIndex); + } + const colonIndex = result.indexOf(':'); + if (colonIndex !== -1) { + result = result.slice(0, colonIndex); + } + return result; + }; + const emitTemperatureProxyWarnings = (diag: DiagnosticsData | null) => { if (!diag?.temperatureProxy) { return; @@ -872,6 +927,15 @@ const Settings: Component = (props) => { showWarning(`Temperature proxy control plane is behind on: ${names}`); } } + if (diag.temperatureProxy.socketHostCooldowns) { + const cooling = (diag.temperatureProxy.socketHostCooldowns as TemperatureProxySocketHost[]).filter( + (entry) => entry && (entry.node || entry.host), + ); + if (cooling.length > 0) { + const hosts = cooling.map((entry) => entry.node || entry.host || 'proxy').join(', '); + showWarning(`Temperature proxy is cooling down the following hosts: ${hosts}`); + } + } }; const temperatureTransportInfo = createMemo(() => { @@ -901,7 +965,20 @@ const Settings: Component = (props) => { : diag.temperatureProxy.socketFound ? 
'error' : 'missing'; - return { httpMap, socketStatus }; + const cooldowns: Record = {}; + const socketHosts = diag.temperatureProxy.socketHostCooldowns || []; + (socketHosts as TemperatureProxySocketHost[]).forEach((entry) => { + const key = normalizeHostKey(entry.node) || normalizeHostKey(entry.host); + if (!key) { + return; + } + cooldowns[key] = { + secondsRemaining: entry.secondsRemaining, + until: entry.cooldownUntil, + lastError: entry.lastError || undefined, + }; + }); + return { httpMap, socketStatus, socketCooldowns: cooldowns }; }); const proxyNodeChecksSupported = createMemo(() => { @@ -921,6 +998,15 @@ const Settings: Component = (props) => { const diag = await response.json(); setDiagnosticsData(diag); emitTemperatureProxyWarnings(diag); + if (diag?.temperatureProxy?.hostProxySummary) { + setHostProxyStatus({ + hostSocketPresent: Boolean(diag.temperatureProxy?.socketFound), + containerSocketPresent: Boolean( + diag.temperatureProxy?.hostProxySummary?.containerSocketPresent ?? false, + ), + summary: diag.temperatureProxy?.hostProxySummary ?? 
undefined, + }); + } } catch (err) { logger.error('Failed to fetch diagnostics', err); showError('Failed to run diagnostics'); @@ -929,6 +1015,40 @@ const Settings: Component = (props) => { } }; + const refreshHostProxyStatus = async (notify = false) => { + try { + const status = (await apiFetchJSON( + '/api/temperature-proxy/host-status', + )) as HostProxyStatusResponse; + setHostProxyStatus(status); + if (notify) { + showSuccess('Host proxy status refreshed', 2000); + } + } catch (err) { + logger.error('Failed to refresh host proxy status', err); + showError('Failed to refresh host proxy status'); + } + }; + + createEffect(() => { + if (typeof window === 'undefined') { + return; + } + const shouldPoll = currentTab() === 'proxmox' || currentTab() === 'diagnostics'; + if (!shouldPoll) { + return; + } + void runDiagnostics(); + void refreshHostProxyStatus(false); + const intervalId = window.setInterval(() => { + void runDiagnostics(); + void refreshHostProxyStatus(false); + }, 60000); + onCleanup(() => { + window.clearInterval(intervalId); + }); + }); + const handleRegisterProxyNodes = async () => { if (proxyActionLoading()) return; setProxyActionLoading('register-nodes'); @@ -1548,7 +1668,11 @@ const Settings: Component = (props) => { } } catch (error) { logger.error('Failed to update temperature monitoring setting', error); - notificationStore.error('Failed to update temperature monitoring setting'); + notificationStore.error( + error instanceof Error + ? error.message + : 'Failed to update temperature monitoring setting', + ); setTemperatureMonitoringEnabled(previous); } finally { setSavingTemperatureSetting(false); @@ -1589,7 +1713,11 @@ const Settings: Component = (props) => { } } catch (error) { logger.error('Failed to update node temperature monitoring setting', error); - notificationStore.error('Failed to update temperature monitoring setting'); + notificationStore.error( + error instanceof Error + ? 
error.message + : 'Failed to update temperature monitoring setting', + ); // Revert on error setNodes( nodes().map((n) => (n.id === nodeId ? { ...n, temperatureMonitoringEnabled: previous } : n)), @@ -5318,11 +5446,11 @@ const Settings: Component = (props) => { - 0 - } - > + 0 + } + >

HTTPS proxies @@ -5358,7 +5486,61 @@ const Settings: Component = (props) => { )}
+
+ Learn more:{" "} + + Temperature Monitoring docs + +
+ 0 + } + > +
+
+ Socket cooldowns +
+ + {(entry) => ( +
+
+
+
+ {entry.node || entry.host || 'Host'} +
+ +
+ Until {entry.cooldownUntil} +
+
+
+ + Cooling + +
+ +
+ Retrying in ~{formatUptime(entry.secondsRemaining || 0)} +
+
+ +
+ {entry.lastError} +
+
+
+ )} +
+
+
= (props) => { : 'Check proxy nodes'}
+ + {(status) => ( +
+
+
+ Pulse host proxy +
+ +
+
+
Requested
+
{status().summary?.requested ? 'Yes' : 'No'}
+
Installed
+
{status().summary?.installed ? 'Yes' : 'No'}
+
Host socket
+
{status().hostSocketPresent ? 'Present' : 'Missing'}
+
Container socket
+
{status().containerSocketPresent ? 'Present' : 'Missing'}
+
+ +
+ + + {(url) => ( + + Download installer script + + )} + +
+
+ +
+ Summary updated {status().summary?.lastUpdated} +
+
+
+ )} +
= (props) => { return sanitizedEntry; }); } + if (Array.isArray(proxyDiag.socketHostCooldowns)) { + proxyDiag.socketHostCooldowns = ( + proxyDiag.socketHostCooldowns as Array> + ).map((entry) => ({ + node: sanitizeHostname( + typeof entry.node === 'string' ? (entry.node as string) : undefined, + ) as string, + host: sanitizeHostname( + typeof entry.host === 'string' ? (entry.host as string) : undefined, + ) as string, + cooldownUntil: + typeof entry.cooldownUntil === 'string' + ? (entry.cooldownUntil as string) + : undefined, + secondsRemaining: + typeof entry.secondsRemaining === 'number' + ? (entry.secondsRemaining as number) + : undefined, + lastError: + typeof entry.lastError === 'string' + ? sanitizeText(entry.lastError as string) ?? (entry.lastError as string) + : undefined, + })); + } } if (sanitized.apiTokens && typeof sanitized.apiTokens === 'object') { diff --git a/install.sh b/install.sh index ac72d0cfc..ee85e7f5d 100755 --- a/install.sh +++ b/install.sh @@ -32,6 +32,9 @@ CURRENT_INSTALL_CTID="" CONTAINER_CREATED_FOR_CLEANUP=false BUILD_FROM_SOURCE_MARKER="$INSTALL_DIR/BUILD_FROM_SOURCE" DETECTED_CTID="" +INSTALL_SUMMARY_FILE="/etc/pulse/install_summary.json" +HOST_PROXY_REQUESTED=false +HOST_PROXY_INSTALLED=false DEBIAN_TEMPLATE_FALLBACK="debian-12-standard_12.12-1_amd64.tar.zst" DEBIAN_TEMPLATE="" @@ -1513,6 +1516,7 @@ fi'; then case "$PROXY_MODE" in yes) install_proxy=true + HOST_PROXY_REQUESTED=true ;; no) install_proxy=false @@ -1521,12 +1525,14 @@ fi'; then # Auto-detect: install if Docker is present if [[ "$docker_in_container" == "true" ]]; then install_proxy=true + HOST_PROXY_REQUESTED=true fi ;; *) # Empty/unset - reuse earlier user choice (defaults handled already) if [[ "$PROXY_USER_CHOICE" == "yes" ]]; then install_proxy=true + HOST_PROXY_REQUESTED=true fi ;; esac @@ -1676,6 +1682,7 @@ fi'; then fi print_success "Temperature proxy is healthy and ready" + HOST_PROXY_INSTALLED=true fi # End of health checks # Clean up temporary binary if it 
was copied @@ -2879,6 +2886,43 @@ create_marker_file() { touch ~/.pulse 2>/dev/null || true } +write_install_summary() { + local summary_dir="/etc/pulse" + mkdir -p "$summary_dir" + + local host_socket="false" + if [[ -S /run/pulse-sensor-proxy/pulse-sensor-proxy.sock ]]; then + host_socket="true" + fi + + local container_socket="null" + if [[ -n "${CTID:-}" ]] && command -v pct >/dev/null 2>&1; then + if pct exec "$CTID" -- test -S /mnt/pulse-proxy/pulse-sensor-proxy.sock >/dev/null 2>&1; then + container_socket="true" + else + container_socket="false" + fi + fi + + local timestamp="" + if command -v date >/dev/null 2>&1; then + timestamp=$(date -Is 2>/dev/null || date) + fi + + cat > "$INSTALL_SUMMARY_FILE" </dev/null 2>&1; then @@ -4064,7 +4111,47 @@ fi # Determine if this node is standalone (not joined to a cluster) IS_STANDALONE_NODE=false if ! command -v pvecm >/dev/null 2>&1 || ! pvecm status >/dev/null 2>&1; then - IS_STANDALONE_NODE=true + IS_STANDALONE_NODE=true +fi + +INSTALL_SUMMARY_FILE="/etc/pulse/install_summary.json" +SUMMARY_PROXY_REQUESTED="false" +SUMMARY_PROXY_INSTALLED="false" +SUMMARY_PROXY_SOCKET="false" +if [ -f "$INSTALL_SUMMARY_FILE" ]; then + if command -v python3 >/dev/null 2>&1; then + if SUMMARY_EVAL=$(python3 <<'PY' +import json +from pathlib import Path +path = Path("/etc/pulse/install_summary.json") +try: + data = json.loads(path.read_text()) +except Exception: + raise SystemExit(1) +proxy = data.get("proxy") or {} +def emit(key, value): + print(f"{key}={'true' if value else 'false'}") +emit("SUMMARY_PROXY_REQUESTED", proxy.get("requested")) +emit("SUMMARY_PROXY_INSTALLED", proxy.get("installed")) +emit("SUMMARY_PROXY_SOCKET", proxy.get("hostSocketPresent")) +PY + ); then + eval "$SUMMARY_EVAL" + fi + elif command -v jq >/dev/null 2>&1; then + if SUMMARY_EVAL=$(jq -r ' + [ + "\(.proxy.requested // false)", + "\(.proxy.installed // false)", + "\(.proxy.hostSocketPresent // false)" + ] | @tsv + ' "$INSTALL_SUMMARY_FILE" 2>/dev/null); 
then + read -r requested installed host_socket <<<"$SUMMARY_EVAL" + SUMMARY_PROXY_REQUESTED=$requested + SUMMARY_PROXY_INSTALLED=$installed + SUMMARY_PROXY_SOCKET=$host_socket + fi + fi fi # Track whether temperature monitoring can work (may be disabled by checks above) @@ -4095,8 +4182,8 @@ if [ "$PULSE_IS_CONTAINERIZED" = true ]; then fi fi -# If Pulse is containerized, try to install proxy automatically -if [ "$TEMP_MONITORING_AVAILABLE" = true ] && [ "$PULSE_IS_CONTAINERIZED" = true ] && [ -n "$PULSE_CTID" ]; then +# If Pulse is containerized, try to install proxy automatically (unless already present) +if [ "$TEMP_MONITORING_AVAILABLE" = true ] && [ "$PULSE_IS_CONTAINERIZED" = true ] && [ -n "$PULSE_CTID" ] && [ "$SKIP_TEMPERATURE_PROMPT" != true ]; then # Try automatic installation - proxy keeps SSH credentials on the host for security if true; then # Download installer script from Pulse server @@ -4154,6 +4241,10 @@ if [ "$TEMP_MONITORING_AVAILABLE" = true ] && [ "$PULSE_IS_CONTAINERIZED" = true fi # Note: Mount configuration and container restart are handled by the installer + if [ "$TEMP_MONITORING_AVAILABLE" = true ]; then + TEMPERATURE_ENABLED=true + SKIP_TEMPERATURE_PROMPT=true + fi else echo "" echo "⚠️ Proxy installation had issues - you may need to configure manually" @@ -4196,7 +4287,11 @@ if [ -n "$SSH_PUBLIC_KEY" ] && [ -f /root/.ssh/authorized_keys ]; then fi # Single temperature monitoring prompt -if [ "$SSH_ALREADY_CONFIGURED" = true ]; then +if [ "$SKIP_TEMPERATURE_PROMPT" = true ]; then + echo "Temperature monitoring is already configured via pulse-sensor-proxy on this host." + echo "Pulse will collect temperatures as soon as you finish the setup wizard." 
+ echo "" +elif [ "$SSH_ALREADY_CONFIGURED" = true ]; then TEMPERATURE_ENABLED=true echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" echo "Temperature monitoring is currently ENABLED" diff --git a/internal/api/config_handlers_temperature_test.go b/internal/api/config_handlers_temperature_test.go index b15a847e5..c18ed0869 100644 --- a/internal/api/config_handlers_temperature_test.go +++ b/internal/api/config_handlers_temperature_test.go @@ -56,3 +56,28 @@ func TestDetermineTemperatureTransport(t *testing.T) { }) } } + +func TestEnsureTemperatureTransportAvailable(t *testing.T) { + t.Parallel() + + t.Run("allows socket transport", func(t *testing.T) { + t.Parallel() + if err := ensureTemperatureTransportAvailable(true, "", "", true, true); err != nil { + t.Fatalf("unexpected error: %v", err) + } + }) + + t.Run("blocks container without proxy", func(t *testing.T) { + t.Parallel() + if err := ensureTemperatureTransportAvailable(true, "", "", false, true); err == nil { + t.Fatal("expected error when no transport is available") + } + }) + + t.Run("ignores disabled state", func(t *testing.T) { + t.Parallel() + if err := ensureTemperatureTransportAvailable(false, "", "", false, true); err != nil { + t.Fatalf("expected nil error when not enabling transport, got %v", err) + } + }) +} diff --git a/internal/api/config_handlers_transport_guard_test.go b/internal/api/config_handlers_transport_guard_test.go new file mode 100644 index 000000000..4e2489cb5 --- /dev/null +++ b/internal/api/config_handlers_transport_guard_test.go @@ -0,0 +1,55 @@ +package api + +import ( + "bytes" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/rcourtman/pulse-go-rewrite/internal/config" +) + +func TestHandleAddNodeRejectsTempsWithoutTransport(t *testing.T) { + tempDir := t.TempDir() + t.Setenv("PULSE_DOCKER", "true") + cfg := &config.Config{DataPath: tempDir, ConfigPath: tempDir} + handler := newTestConfigHandlers(t, cfg) + + body := 
bytes.NewBufferString(`{"type":"pve","name":"node-a","host":"pve-a.local","user":"root@pam","password":"secret","temperatureMonitoringEnabled":true}`) + req := httptest.NewRequest(http.MethodPost, "/api/config/nodes", body) + rec := httptest.NewRecorder() + + handler.HandleAddNode(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Fatalf("expected status 400, got %d", rec.Code) + } + if !strings.Contains(rec.Body.String(), "proxy") { + t.Fatalf("expected proxy error, got %s", rec.Body.String()) + } +} + +func TestHandleUpdateNodeRejectsTempsWithoutTransport(t *testing.T) { + tempDir := t.TempDir() + t.Setenv("PULSE_DOCKER", "true") + cfg := &config.Config{DataPath: tempDir, ConfigPath: tempDir} + cfg.PVEInstances = []config.PVEInstance{{ + Name: "pve-a", + Host: "https://pve-a.local:8006", + }} + handler := newTestConfigHandlers(t, cfg) + + body := bytes.NewBufferString(`{"temperatureMonitoringEnabled":true}`) + req := httptest.NewRequest(http.MethodPut, "/api/config/nodes/pve-0", body) + rec := httptest.NewRecorder() + + handler.HandleUpdateNode(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Fatalf("expected status 400, got %d", rec.Code) + } + if !strings.Contains(rec.Body.String(), "proxy") { + t.Fatalf("expected proxy error, got %s", rec.Body.String()) + } +} diff --git a/internal/api/diagnostics.go b/internal/api/diagnostics.go index 8d7ce66b9..972070bf6 100644 --- a/internal/api/diagnostics.go +++ b/internal/api/diagnostics.go @@ -249,6 +249,8 @@ type TemperatureProxyDiagnostic struct { HTTPProxies []TemperatureProxyHTTPStatus `json:"httpProxies,omitempty"` ControlPlaneEnabled bool `json:"controlPlaneEnabled"` ControlPlaneStates []TemperatureProxyControlPlaneState `json:"controlPlaneStates,omitempty"` + SocketHostCooldowns []TemperatureProxySocketHost `json:"socketHostCooldowns,omitempty"` + HostProxySummary *HostProxySummary `json:"hostProxySummary,omitempty"` } type TemperatureProxyControlPlaneState struct { @@ -266,6 +268,23 @@ type 
TemperatureProxyHTTPStatus struct { Error string `json:"error,omitempty"` } +type TemperatureProxySocketHost struct { + Node string `json:"node,omitempty"` + Host string `json:"host,omitempty"` + CooldownUntil string `json:"cooldownUntil,omitempty"` + SecondsRemaining int `json:"secondsRemaining,omitempty"` + LastError string `json:"lastError,omitempty"` +} + +type HostProxySummary struct { + Requested bool `json:"requested"` + Installed bool `json:"installed"` + HostSocketPresent bool `json:"hostSocketPresent"` + ContainerSocketPresent *bool `json:"containerSocketPresent,omitempty"` + LastUpdated string `json:"lastUpdated,omitempty"` + CTID string `json:"ctid,omitempty"` +} + // APITokenDiagnostic reports on the state of the multi-token authentication system. type APITokenDiagnostic struct { Enabled bool `json:"enabled"` @@ -413,12 +432,18 @@ func (r *Router) computeDiagnostics(ctx context.Context) DiagnosticsInfo { MemoryMB: memStats.Alloc / 1024 / 1024, } - var proxySync map[string]proxySyncState + var ( + proxySync map[string]proxySyncState + socketHostState []monitoring.ProxyHostDiagnostics + ) if r.temperatureProxyHandlers != nil { proxySync = r.temperatureProxyHandlers.SnapshotSyncStatus() } + if r.monitor != nil { + socketHostState = r.monitor.SocketProxyHostDiagnostics() + } - diag.TemperatureProxy = buildTemperatureProxyDiagnostic(r.config, proxySync) + diag.TemperatureProxy = buildTemperatureProxyDiagnostic(r.config, proxySync, socketHostState) diag.APITokens = buildAPITokenDiagnostic(r.config, r.monitor) // Test each configured node @@ -674,7 +699,7 @@ func buildDiscoveryDiagnostic(cfg *config.Config, monitor *monitoring.Monitor) * return discovery } -func buildTemperatureProxyDiagnostic(cfg *config.Config, syncStates map[string]proxySyncState) *TemperatureProxyDiagnostic { +func buildTemperatureProxyDiagnostic(cfg *config.Config, syncStates map[string]proxySyncState, hostStates []monitoring.ProxyHostDiagnostics) *TemperatureProxyDiagnostic { diag := 
&TemperatureProxyDiagnostic{} appendNote := func(note string) { @@ -716,6 +741,9 @@ func buildTemperatureProxyDiagnostic(cfg *config.Config, syncStates map[string]p if !diag.SocketFound { appendNote("No proxy socket detected inside the container. Remove the affected node in Pulse, then re-add it using the installer script from Settings → Nodes to regenerate the mount (or rerun the host installer script if you prefer).") + if cfg != nil && cfg.TemperatureMonitoringEnabled { + appendNote("Global temperature monitoring is enabled but the host proxy socket is missing; reinstall the proxy or disable temperatures until it is restored.") + } } else if diag.SocketPath == "/run/pulse-sensor-proxy/pulse-sensor-proxy.sock" { // Only warn about /run mount in LXC containers where /mnt/pulse-proxy is preferred // Docker deployments correctly use /run/pulse-sensor-proxy per docker-compose.yml @@ -867,9 +895,108 @@ func buildTemperatureProxyDiagnostic(cfg *config.Config, syncStates map[string]p } } + if len(hostStates) > 0 && cfg != nil { + now := time.Now() + cooldowns := make([]TemperatureProxySocketHost, 0, len(hostStates)) + for _, state := range hostStates { + if state.Host == "" || state.CooldownUntil.IsZero() { + continue + } + if now.After(state.CooldownUntil) { + continue + } + entry := TemperatureProxySocketHost{ + Host: state.Host, + CooldownUntil: state.CooldownUntil.UTC().Format(time.RFC3339), + SecondsRemaining: int(time.Until(state.CooldownUntil).Seconds()), + LastError: state.LastError, + } + if entry.SecondsRemaining < 0 { + entry.SecondsRemaining = 0 + } + if name := matchInstanceNameByHost(cfg, state.Host); name != "" { + entry.Node = name + } + cooldowns = append(cooldowns, entry) + } + if len(cooldowns) > 0 { + diag.SocketHostCooldowns = cooldowns + } + } + + if summary, err := loadHostProxySummary(); err == nil { + diag.HostProxySummary = summary + } + return diag } +func loadHostProxySummary() (*HostProxySummary, error) { + const summaryPath = 
"/etc/pulse/install_summary.json" + data, err := os.ReadFile(summaryPath) + if err != nil { + return nil, err + } + var raw struct { + GeneratedAt string `json:"generatedAt"` + CTID string `json:"ctid"` + Proxy struct { + Requested bool `json:"requested"` + Installed bool `json:"installed"` + HostSocketPresent bool `json:"hostSocketPresent"` + ContainerSocketPresent *bool `json:"containerSocketPresent"` + } `json:"proxy"` + } + if err := json.Unmarshal(data, &raw); err != nil { + return nil, err + } + summary := &HostProxySummary{ + Requested: raw.Proxy.Requested, + Installed: raw.Proxy.Installed, + HostSocketPresent: raw.Proxy.HostSocketPresent, + LastUpdated: strings.TrimSpace(raw.GeneratedAt), + CTID: strings.TrimSpace(raw.CTID), + } + if raw.Proxy.ContainerSocketPresent != nil { + value := *raw.Proxy.ContainerSocketPresent + summary.ContainerSocketPresent = &value + } + return summary, nil +} + +func matchInstanceNameByHost(cfg *config.Config, host string) string { + if cfg == nil { + return "" + } + needle := normalizeHostForComparison(host) + if needle == "" { + return "" + } + for _, inst := range cfg.PVEInstances { + candidate := normalizeHostForComparison(inst.Host) + if candidate != "" && strings.EqualFold(candidate, needle) { + return strings.TrimSpace(inst.Name) + } + } + return "" +} + +func normalizeHostForComparison(raw string) string { + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + return "" + } + trimmed = strings.TrimPrefix(trimmed, "https://") + trimmed = strings.TrimPrefix(trimmed, "http://") + if idx := strings.IndexByte(trimmed, '/'); idx != -1 { + trimmed = trimmed[:idx] + } + if idx := strings.IndexByte(trimmed, ':'); idx != -1 { + trimmed = trimmed[:idx] + } + return strings.ToLower(strings.TrimSpace(trimmed)) +} + func buildAPITokenDiagnostic(cfg *config.Config, monitor *monitoring.Monitor) *APITokenDiagnostic { if cfg == nil { return nil diff --git a/internal/api/router.go b/internal/api/router.go index c3d12bc5f..0df7b5c6d 
100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -186,6 +186,8 @@ func (r *Router) setupRoutes() { r.mux.HandleFunc("/api/temperature-proxy/register", r.temperatureProxyHandlers.HandleRegister) r.mux.HandleFunc("/api/temperature-proxy/authorized-nodes", r.temperatureProxyHandlers.HandleAuthorizedNodes) r.mux.HandleFunc("/api/temperature-proxy/unregister", RequireAdmin(r.config, r.temperatureProxyHandlers.HandleUnregister)) + r.mux.HandleFunc("/api/temperature-proxy/install-command", RequireAdmin(r.config, RequireScope(config.ScopeSettingsWrite, r.handleTemperatureProxyInstallCommand))) + r.mux.HandleFunc("/api/temperature-proxy/host-status", RequireAdmin(r.config, RequireScope(config.ScopeSettingsRead, r.handleHostProxyStatus))) r.mux.HandleFunc("/api/agents/docker/commands/", RequireAuth(r.config, RequireScope(config.ScopeDockerReport, r.dockerAgentHandlers.HandleCommandAck))) r.mux.HandleFunc("/api/agents/docker/hosts/", RequireAdmin(r.config, RequireScope(config.ScopeDockerManage, r.dockerAgentHandlers.HandleDockerHostActions))) r.mux.HandleFunc("/api/version", r.handleVersion) @@ -3823,6 +3825,75 @@ func (r *Router) handleDownloadTemperatureProxyMigrationScript(w http.ResponseWr } } +func (r *Router) handleTemperatureProxyInstallCommand(w http.ResponseWriter, req *http.Request) { + if req.Method != http.MethodGet { + writeErrorResponse(w, http.StatusMethodNotAllowed, "method_not_allowed", "Only GET is allowed", nil) + return + } + + baseURL := strings.TrimSpace(r.resolvePublicURL(req)) + if baseURL == "" { + http.Error(w, "Pulse public URL is not configured", http.StatusBadRequest) + return + } + baseURL = strings.TrimRight(baseURL, "/") + + node := strings.TrimSpace(req.URL.Query().Get("node")) + command := fmt.Sprintf( + "curl -fsSL https://raw.githubusercontent.com/rcourtman/Pulse/main/scripts/install-sensor-proxy.sh | sudo bash -s -- --standalone --http-mode --pulse-server %s", + baseURL, + ) + + response := map[string]string{ + 
"command": command, + "pulseURL": baseURL, + } + if node != "" { + response["node"] = node + } + + if err := utils.WriteJSONResponse(w, response); err != nil { + log.Error().Err(err).Msg("Failed to serialize proxy install command response") + } +} + +func (r *Router) handleHostProxyStatus(w http.ResponseWriter, req *http.Request) { + if req.Method != http.MethodGet { + writeErrorResponse(w, http.StatusMethodNotAllowed, "method_not_allowed", "Only GET is allowed", nil) + return + } + + hostSocket := fileExists("/run/pulse-sensor-proxy/pulse-sensor-proxy.sock") + containerSocket := fileExists("/mnt/pulse-proxy/pulse-sensor-proxy.sock") + + resp := map[string]interface{}{ + "hostSocketPresent": hostSocket, + "containerSocketPresent": containerSocket, + "lastChecked": time.Now().UTC().Format(time.RFC3339), + } + + if summary, err := loadHostProxySummary(); err == nil && summary != nil { + resp["summary"] = summary + } + + baseURL := strings.TrimRight(r.resolvePublicURL(req), "/") + if baseURL == "" { + baseURL = "http://localhost:7655" + } + + ctid := "" + if summary, ok := resp["summary"].(*HostProxySummary); ok && summary != nil && summary.CTID != "" { + ctid = summary.CTID + } + + resp["reinstallCommand"] = fmt.Sprintf("curl -fsSL https://raw.githubusercontent.com/rcourtman/Pulse/main/scripts/install-sensor-proxy.sh | sudo bash -s -- --ctid %s --pulse-server %s", ctid, baseURL) + resp["installerURL"] = fmt.Sprintf("%s/api/install/install-sensor-proxy.sh", baseURL) + + if err := utils.WriteJSONResponse(w, resp); err != nil { + log.Error().Err(err).Msg("Failed to serialize host proxy status response") + } +} + func (r *Router) resolvePublicURL(req *http.Request) string { if publicURL := strings.TrimSpace(r.config.PublicURL); publicURL != "" { return strings.TrimRight(publicURL, "/") @@ -3852,6 +3923,11 @@ func (r *Router) resolvePublicURL(req *http.Request) string { return fmt.Sprintf("%s://%s", scheme, host) } +func fileExists(path string) bool { + _, err := 
os.Stat(path) + return err == nil +} + func normalizeDockerAgentArch(arch string) string { if arch == "" { return "" diff --git a/internal/api/system_settings.go b/internal/api/system_settings.go index ef2cbcf55..c87d8272f 100644 --- a/internal/api/system_settings.go +++ b/internal/api/system_settings.go @@ -35,6 +35,7 @@ type SystemSettingsHandler struct { EnableTemperatureMonitoring() DisableTemperatureMonitoring() GetNotificationManager() *notifications.NotificationManager + HasSocketTemperatureProxy() bool } } @@ -46,6 +47,7 @@ func NewSystemSettingsHandler(cfg *config.Config, persistence *config.ConfigPers EnableTemperatureMonitoring() DisableTemperatureMonitoring() GetNotificationManager() *notifications.NotificationManager + HasSocketTemperatureProxy() bool }, reloadSystemSettingsFunc func()) *SystemSettingsHandler { return &SystemSettingsHandler{ config: cfg, @@ -64,6 +66,7 @@ func (h *SystemSettingsHandler) SetMonitor(m interface { EnableTemperatureMonitoring() DisableTemperatureMonitoring() GetNotificationManager() *notifications.NotificationManager + HasSocketTemperatureProxy() bool }) { h.monitor = m } @@ -572,6 +575,34 @@ func (h *SystemSettingsHandler) HandleUpdateSystemSettings(w http.ResponseWriter settings.BackupPollingEnabled = updates.BackupPollingEnabled } if _, ok := rawRequest["temperatureMonitoringEnabled"]; ok { + if updates.TemperatureMonitoringEnabled { + socketAvailable := false + if h.monitor != nil { + socketAvailable = h.monitor.HasSocketTemperatureProxy() + } + if !socketAvailable { + missing := make([]string, 0) + if h.config != nil { + for _, inst := range h.config.PVEInstances { + if strings.TrimSpace(inst.TemperatureProxyURL) == "" || strings.TrimSpace(inst.TemperatureProxyToken) == "" { + name := strings.TrimSpace(inst.Name) + if name == "" { + name = strings.TrimSpace(inst.Host) + } + if name == "" { + name = "unnamed node" + } + missing = append(missing, name) + } + } + } + if len(missing) > 0 { + message := fmt.Sprintf("Cannot 
enable temperature monitoring: proxy socket is not available and the following nodes do not have HTTPS proxies configured: %s", strings.Join(missing, ", ")) + http.Error(w, message, http.StatusBadRequest) + return + } + } + } settings.TemperatureMonitoringEnabled = updates.TemperatureMonitoringEnabled tempToggleRequested = true } diff --git a/internal/api/system_settings_temperature_test.go b/internal/api/system_settings_temperature_test.go new file mode 100644 index 000000000..50363d5a5 --- /dev/null +++ b/internal/api/system_settings_temperature_test.go @@ -0,0 +1,27 @@ +package api + +import ( + "bytes" + "net/http" + "net/http/httptest" + "testing" + + "github.com/rcourtman/pulse-go-rewrite/internal/config" +) + +func TestHandleUpdateSystemSettingsRejectsTempsWithoutTransport(t *testing.T) { + tempDir := t.TempDir() + t.Setenv("PULSE_DOCKER", "true") + cfg := &config.Config{DataPath: tempDir, ConfigPath: tempDir, PVEInstances: []config.PVEInstance{{Name: "pve-a"}}} + persistence := config.NewConfigPersistence(tempDir) + handler := NewSystemSettingsHandler(cfg, persistence, nil, nil, nil) + + req := httptest.NewRequest(http.MethodPost, "/api/system/settings/update", bytes.NewBufferString(`{"temperatureMonitoringEnabled":true}`)) + rec := httptest.NewRecorder() + + handler.HandleUpdateSystemSettings(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Fatalf("expected status 400, got %d", rec.Code) + } +} diff --git a/internal/api/temperature_proxy_command_test.go b/internal/api/temperature_proxy_command_test.go new file mode 100644 index 000000000..ddcb0a680 --- /dev/null +++ b/internal/api/temperature_proxy_command_test.go @@ -0,0 +1,45 @@ +package api + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/rcourtman/pulse-go-rewrite/internal/config" +) + +func TestHandleTemperatureProxyInstallCommand(t *testing.T) { + cfg := &config.Config{PublicURL: "https://pulse.example:7655"} + router := &Router{config: 
cfg} + + req := httptest.NewRequest(http.MethodGet, "/api/temperature-proxy/install-command?node=pve-a", nil) + rec := httptest.NewRecorder() + + router.handleTemperatureProxyInstallCommand(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("expected status 200, got %d", rec.Code) + } + + var resp map[string]string + if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + + if resp["node"] != "pve-a" { + t.Fatalf("expected node pve-a, got %s", resp["node"]) + } + + command := resp["command"] + if command == "" { + t.Fatalf("command missing in response") + } + if !strings.Contains(command, cfg.PublicURL) { + t.Fatalf("command does not include pulse URL: %s", command) + } + if !strings.Contains(command, "--standalone --http-mode") { + t.Fatalf("command missing expected flags: %s", command) + } +} diff --git a/internal/monitoring/monitor.go b/internal/monitoring/monitor.go index cfb32e4ac..8c1dc61cd 100644 --- a/internal/monitoring/monitor.go +++ b/internal/monitoring/monitor.go @@ -3472,6 +3472,19 @@ func (m *Monitor) HasSocketTemperatureProxy() bool { return m.tempCollector.SocketProxyAvailable() } +// SocketProxyHostDiagnostics exposes per-host proxy cooldown state for diagnostics. 
+func (m *Monitor) SocketProxyHostDiagnostics() []ProxyHostDiagnostics { + m.mu.RLock() + collector := m.tempCollector + m.mu.RUnlock() + + if collector == nil { + return nil + } + + return collector.ProxyHostDiagnostics() +} + // checkContainerizedTempMonitoring logs a security warning if Pulse is running // in a container with SSH-based temperature monitoring enabled func checkContainerizedTempMonitoring() { @@ -3514,6 +3527,13 @@ func New(cfg *config.Config) (*Monitor, error) { // Security warning if running in container with SSH temperature monitoring checkContainerizedTempMonitoring() + if cfg != nil && cfg.TemperatureMonitoringEnabled { + isContainer := os.Getenv("PULSE_DOCKER") == "true" || system.InContainer() + if isContainer && tempCollector != nil && !tempCollector.SocketProxyAvailable() { + log.Warn().Msg("Temperature monitoring is enabled but the container does not have access to pulse-sensor-proxy. Install the proxy on the host or disable temperatures until it is available.") + } + } + stalenessTracker := NewStalenessTracker(getPollMetrics()) stalenessTracker.SetBounds(cfg.AdaptivePollingBaseInterval, cfg.AdaptivePollingMaxInterval) taskQueue := NewTaskQueue() diff --git a/internal/monitoring/temperature.go b/internal/monitoring/temperature.go index 86b7ab278..de87b304d 100644 --- a/internal/monitoring/temperature.go +++ b/internal/monitoring/temperature.go @@ -43,10 +43,25 @@ type TemperatureCollector struct { proxyMu sync.Mutex proxyFailures int proxyCooldownUntil time.Time + proxyHostStates map[string]*proxyHostState missingKeyWarned atomic.Bool legacySSHDisabled atomic.Bool } +type proxyHostState struct { + failures int + cooldownUntil time.Time + lastError string +} + +// ProxyHostDiagnostics describes the proxy transport state for a host. 
+type ProxyHostDiagnostics struct { + Host string + Failures int + CooldownUntil time.Time + LastError string +} + // NewTemperatureCollector creates a new temperature collector with default SSH port (22) func NewTemperatureCollector(sshUser, sshKeyPath string) *TemperatureCollector { return NewTemperatureCollectorWithPort(sshUser, sshKeyPath, 22) @@ -59,9 +74,10 @@ func NewTemperatureCollectorWithPort(sshUser, sshKeyPath string, sshPort int) *T } tc := &TemperatureCollector{ - sshUser: sshUser, - sshKeyPath: sshKeyPath, - sshPort: sshPort, + sshUser: sshUser, + sshKeyPath: sshKeyPath, + sshPort: sshPort, + proxyHostStates: make(map[string]*proxyHostState), } homeDir := os.Getenv("HOME") @@ -126,9 +142,17 @@ func (tc *TemperatureCollector) CollectTemperatureWithProxy(ctx context.Context, // Use Unix socket proxy if available (local deployment) if tc.isProxyEnabled() { + if tc.shouldSkipProxyHost(host) { + log.Debug(). + Str("node", nodeName). + Str("host", host). + Msg("Skipping temperature proxy request while host is in cooldown") + return &models.Temperature{Available: false}, nil + } + output, err = tc.proxyClient.GetTemperature(host) if err != nil { - tc.handleProxyFailure(err) + tc.handleProxyFailure(host, err) log.Debug(). Str("node", nodeName). Str("host", host). 
@@ -137,6 +161,7 @@ func (tc *TemperatureCollector) CollectTemperatureWithProxy(ctx context.Context, return &models.Temperature{Available: false}, nil } tc.handleProxySuccess() + tc.handleProxyHostSuccess(host) } else { // SECURITY: Block SSH fallback when running in containers (unless dev mode) // Container compromise = SSH key compromise = root access to infrastructure @@ -817,6 +842,33 @@ func (tc *TemperatureCollector) isProxyEnabled() bool { return useProxy } +func (tc *TemperatureCollector) shouldSkipProxyHost(host string) bool { + host = strings.TrimSpace(host) + if host == "" { + return false + } + + tc.proxyMu.Lock() + defer tc.proxyMu.Unlock() + state, ok := tc.proxyHostStates[host] + if !ok || state == nil { + return false + } + + now := time.Now() + if state.cooldownUntil.IsZero() || now.After(state.cooldownUntil) { + // Cooldown expired; reset state so we can retry this host. + state.cooldownUntil = time.Time{} + state.failures = 0 + if state.cooldownUntil.IsZero() && state.failures == 0 { + delete(tc.proxyHostStates, host) + } + return false + } + + return true +} + // SocketProxyAvailable reports whether the unix socket proxy can currently be used. 
func (tc *TemperatureCollector) SocketProxyAvailable() bool { return tc != nil && tc.isProxyEnabled() @@ -831,26 +883,71 @@ func (tc *TemperatureCollector) handleProxySuccess() { tc.proxyMu.Unlock() } -func (tc *TemperatureCollector) handleProxyFailure(err error) { - if tc.proxyClient == nil || !tc.shouldDisableProxy(err) { +func (tc *TemperatureCollector) handleProxyHostSuccess(host string) { + host = strings.TrimSpace(host) + if host == "" { + return + } + tc.proxyMu.Lock() + delete(tc.proxyHostStates, host) + tc.proxyMu.Unlock() +} + +func (tc *TemperatureCollector) handleProxyFailure(host string, err error) { + if tc.proxyClient == nil { + return + } + + if tc.shouldDisableProxy(err) { + tc.proxyMu.Lock() + tc.proxyFailures++ + disable := tc.proxyFailures >= proxyFailureThreshold && tc.useProxy + if disable { + tc.useProxy = false + tc.proxyCooldownUntil = time.Now().Add(proxyRetryInterval) + tc.proxyFailures = 0 + } + tc.proxyMu.Unlock() + + if disable { + log.Warn(). + Err(err). + Dur("cooldown", proxyRetryInterval). + Msg("Temperature proxy disabled after repeated failures; will retry later") + } + return + } + + tc.handleProxyHostFailure(host, err) +} + +func (tc *TemperatureCollector) handleProxyHostFailure(host string, err error) { + host = strings.TrimSpace(host) + if host == "" { return } tc.proxyMu.Lock() - tc.proxyFailures++ - disable := tc.proxyFailures >= proxyFailureThreshold && tc.useProxy - if disable { - tc.useProxy = false - tc.proxyCooldownUntil = time.Now().Add(proxyRetryInterval) - tc.proxyFailures = 0 + state, ok := tc.proxyHostStates[host] + if !ok || state == nil { + state = &proxyHostState{} + tc.proxyHostStates[host] = state + } + state.failures++ + state.lastError = strings.TrimSpace(err.Error()) + trip := state.failures >= proxyFailureThreshold + if trip { + state.failures = 0 + state.cooldownUntil = time.Now().Add(proxyRetryInterval) } tc.proxyMu.Unlock() - if disable { + if trip { log.Warn(). Err(err). + Str("host", host). 
Dur("cooldown", proxyRetryInterval). - Msg("Temperature proxy disabled after repeated failures; will retry later") + Msg("Temperature proxy host in cooldown after repeated failures") } } @@ -866,3 +963,31 @@ func (tc *TemperatureCollector) shouldDisableProxy(err error) bool { } return true } + +// ProxyHostDiagnostics returns a snapshot of per-host proxy error state. +func (tc *TemperatureCollector) ProxyHostDiagnostics() []ProxyHostDiagnostics { + if tc == nil { + return nil + } + + tc.proxyMu.Lock() + defer tc.proxyMu.Unlock() + + if len(tc.proxyHostStates) == 0 { + return nil + } + + result := make([]ProxyHostDiagnostics, 0, len(tc.proxyHostStates)) + for host, state := range tc.proxyHostStates { + if state == nil { + continue + } + result = append(result, ProxyHostDiagnostics{ + Host: host, + Failures: state.failures, + CooldownUntil: state.cooldownUntil, + LastError: state.lastError, + }) + } + return result +} diff --git a/internal/tempproxy/client.go b/internal/tempproxy/client.go index f4abbbfef..b4dd1b4a4 100644 --- a/internal/tempproxy/client.go +++ b/internal/tempproxy/client.go @@ -34,6 +34,7 @@ const ( ErrorTypeSSH // SSH connectivity issues ErrorTypeSensor // Sensor command failures ErrorTypeTimeout // Operation timeout + ErrorTypeNode // Node allowlist or validation failures ) // ProxyError wraps errors with classification @@ -125,6 +126,16 @@ func classifyError(err error, respError string) *ProxyError { // Check response error messages first (even if err is nil) // This handles cases where the socket succeeds but the proxy returns an application error if respError != "" { + // Node validator/allowlist rejections should not disable the proxy globally + if contains(respError, "rejected by validator", "not in allowlist", "node \"") { + return &ProxyError{ + Type: ErrorTypeNode, + Message: respError, + Retryable: false, + Wrapped: fmt.Errorf("%s", respError), + } + } + // Rate limiting - never retry if contains(respError, "rate limit") { return 
&ProxyError{ @@ -412,7 +423,15 @@ func (c *Client) GetTemperature(nodeHost string) (string, error) { } if !resp.Success { - return "", fmt.Errorf("proxy error: %s", resp.Error) + if proxyErr := classifyError(nil, resp.Error); proxyErr != nil { + return "", proxyErr + } + return "", &ProxyError{ + Type: ErrorTypeUnknown, + Message: resp.Error, + Retryable: false, + Wrapped: fmt.Errorf("%s", resp.Error), + } } // Extract temperature JSON string