diff --git a/docs/TEMPERATURE_MONITORING.md b/docs/TEMPERATURE_MONITORING.md index bfdf81198..20fb99ca0 100644 --- a/docs/TEMPERATURE_MONITORING.md +++ b/docs/TEMPERATURE_MONITORING.md @@ -4,9 +4,9 @@ Pulse can display real-time CPU and NVMe temperatures directly in your dashboard ## Features -- **CPU Package Temperature**: Shows the overall CPU temperature +- **CPU Package Temperature**: Shows the overall CPU temperature when available - **Individual Core Temperatures**: Tracks each CPU core -- **NVMe Drive Temperatures**: Monitors NVMe SSD temperatures +- **NVMe Drive Temperatures**: Monitors NVMe SSD temperatures (visible in the Storage tab's disk list) - **Color-Coded Display**: - Green: < 60°C (normal) - Yellow: 60-80°C (warm) @@ -40,6 +40,8 @@ The auto-setup script (Settings → Nodes → Setup Script) will prompt you to c - Install `lm-sensors` - Run `sensors-detect --auto` +If the node is part of a Proxmox cluster, the script will now detect the other members and offer to configure the same SSH/lm-sensors setup on each of them automatically—confirm when prompted to roll it out cluster-wide. + ## Setup (Manual) If you skipped SSH setup during auto-setup, you can configure it manually: @@ -88,7 +90,7 @@ You should see JSON output with temperature data. 2. Runs `sensors -j` to get temperature data in JSON format 3. Parses CPU temperatures (coretemp/k10temp) 4. Parses NVMe temperatures (nvme-pci-*) -5. Displays the data in node cards with color coding +5. Displays CPU temperatures on the overview dashboard and lists NVMe drive temperatures in the Storage tab's disk table when available ## Troubleshooting diff --git a/frontend-modern/src/components/Dashboard/NodeCard.tsx b/frontend-modern/src/components/Dashboard/NodeCard.tsx index 379686971..61ad8a87d 100644 --- a/frontend-modern/src/components/Dashboard/NodeCard.tsx +++ b/frontend-modern/src/components/Dashboard/NodeCard.tsx @@ -6,7 +6,7 @@ import { AlertIndicator, AlertCountBadge } from '@/components/shared/AlertIndica import { useWebSocket } from '@/App'; import { Card } from '@/components/shared/Card'; import { getNodeDisplayName, hasAlternateDisplayName } from '@/utils/nodes'; -import { getPrimaryTemperature } from '@/utils/temperature'; +import { getCpuTemperature } from '@/utils/temperature'; interface NodeCardProps { node: Node; @@ -119,33 +119,20 @@ const NodeCard: Component = (props) => { ); const unacknowledgedNodeAlerts = createMemo(() => nodeAlerts().filter((alert) => !alert.acknowledged)); - const primaryTemperature = createMemo(() => getPrimaryTemperature(props.node.temperature)); - const primaryTemperatureValue = createMemo(() => { - const reading = primaryTemperature(); - return reading ? Math.round(reading.value) : null; - }); - const primaryTemperatureLabel = createMemo(() => { - const reading = primaryTemperature(); - if (!reading) return null; - if (reading.source === 'nvme') { - return reading.device ?? 'NVMe'; - } - return 'CPU'; + const cpuTemperature = createMemo(() => getCpuTemperature(props.node.temperature)); + const cpuTemperatureValue = createMemo(() => { + const value = cpuTemperature(); + return value !== null ? Math.round(value) : null; }); const temperatureTooltip = createMemo(() => { - const temp = props.node.temperature; - const rounded = primaryTemperatureValue(); - if (!temp?.available || rounded === null) { + if (!props.node.temperature?.available) { return ''; } - const label = primaryTemperatureLabel(); - const primaryLabel = - label && label !== 'CPU' ? `${label}: ${rounded}°C` : `CPU: ${rounded}°C`; - const nvmeDetails = - temp.nvme && temp.nvme.length > 0 - ? ` | NVMe: ${temp.nvme.map((n) => `${n.device}: ${Math.round(n.temp)}°C`).join(', ')}` - : ''; - return `${primaryLabel}${nvmeDetails}`; + const value = cpuTemperatureValue(); + if (value === null) { + return 'CPU sensor unavailable'; + } + return `CPU: ${value}°C`; }); // Determine border/ring style based on status and alerts @@ -249,25 +236,31 @@ const NodeCard: Component = (props) => { ↑{formatUptime(props.node.uptime)} ⚡{normalizedLoad()}} + when={props.node.temperature?.available && cpuTemperatureValue() !== null} + fallback={ + props.node.temperature?.available ? ( + + 🌡-- + + ) : ( + ⚡{normalizedLoad()} + ) + } > 80 + (cpuTemperatureValue() ?? 0) > 80 ? 'text-red-500' - : (primaryTemperatureValue() ?? 0) > 60 + : (cpuTemperatureValue() ?? 0) > 60 ? 'text-yellow-500' : 'text-green-500' }`} title={temperatureTooltip() || undefined} > - 🌡{primaryTemperatureValue()}°C - - - {primaryTemperatureLabel()} - - + 🌡{cpuTemperatureValue()}°C diff --git a/frontend-modern/src/components/Storage/DiskList.tsx b/frontend-modern/src/components/Storage/DiskList.tsx index 550ae8d6a..27b61b006 100644 --- a/frontend-modern/src/components/Storage/DiskList.tsx +++ b/frontend-modern/src/components/Storage/DiskList.tsx @@ -224,8 +224,8 @@ export const DiskList: Component = (props) => { 0} - fallback={-} + when={typeof disk.temperature === 'number' && disk.temperature !== 0} + fallback={-} > = (props) => { ? 'text-red-600 dark:text-red-400' : disk.temperature > 60 ? 'text-yellow-600 dark:text-yellow-400' - : 'text-gray-600 dark:text-gray-400' + : 'text-green-600 dark:text-green-400' }`} > {disk.temperature}°C diff --git a/frontend-modern/src/components/shared/NodeSummaryTable.tsx b/frontend-modern/src/components/shared/NodeSummaryTable.tsx index 9de5e38e5..b089c2e07 100644 --- a/frontend-modern/src/components/shared/NodeSummaryTable.tsx +++ b/frontend-modern/src/components/shared/NodeSummaryTable.tsx @@ -6,7 +6,7 @@ import { useWebSocket } from '@/App'; import { getAlertStyles } from '@/utils/alerts'; import { Card } from '@/components/shared/Card'; import { getNodeDisplayName, hasAlternateDisplayName } from '@/utils/nodes'; -import { getPrimaryTemperature, type PrimaryTemperatureReading } from '@/utils/temperature'; +import { getCpuTemperature } from '@/utils/temperature'; interface NodeSummaryTableProps { nodes: Node[]; @@ -107,13 +107,11 @@ export const NodeSummaryTable: Component = (props) => { return counts; }); - const roundTemperature = (reading: PrimaryTemperatureReading | null) => - reading ? Math.round(reading.value) : null; - - const getTemperatureReading = (item: SortableItem): PrimaryTemperatureReading | null => { + const getCpuTemperatureValue = (item: SortableItem) => { if (item.type !== 'pve') return null; const node = item.data as Node; - return getPrimaryTemperature(node.temperature); + const value = getCpuTemperature(node.temperature); + return value !== null ? Math.round(value) : null; }; const getDefaultSortDirection = (key: Exclude) => { @@ -203,10 +201,7 @@ export const NodeSummaryTable: Component = (props) => { }; const getTemperatureValue = (item: SortableItem) => { - if (item.type === 'pve') { - return roundTemperature(getTemperatureReading(item)); - } - return null; + return getCpuTemperatureValue(item); }; const getCountValue = (item: SortableItem, key: CountSortKey): number | null => { @@ -385,7 +380,8 @@ export const NodeSummaryTable: Component = (props) => { class="px-2 py-1.5 text-left text-[11px] sm:text-xs font-medium uppercase tracking-wider min-w-20 cursor-pointer hover:bg-gray-200 dark:hover:bg-gray-600" onClick={() => handleSort('temperature')} > - Temp {sortKey() === 'temperature' && (sortDirection() === 'asc' ? '▲' : '▼')} + Temp{' '} + {sortKey() === 'temperature' && (sortDirection() === 'asc' ? '▲' : '▼')} @@ -414,8 +410,7 @@ export const NodeSummaryTable: Component = (props) => { const memoryPercentValue = getMemoryPercent(item); const diskPercentValue = getDiskPercent(item); const diskSublabel = getDiskSublabel(item); - const temperatureReading = getTemperatureReading(item); - const temperatureValue = roundTemperature(temperatureReading); + const cpuTemperatureValue = getCpuTemperatureValue(item); const uptimeValue = isPVE ? node?.uptime ?? 0 : isPBS ? pbs?.uptime ?? 0 : 0; const displayName = () => { if (isPVE) return getNodeDisplayName(node as Node); @@ -610,33 +605,26 @@ export const NodeSummaryTable: Component = (props) => { online && isPVE && node!.temperature?.available && - temperatureValue !== null + cpuTemperatureValue !== null + } + fallback={ + + {online && isPVE && node!.temperature?.available + ? 'No CPU sensor' + : '-'} + } - fallback={-} > -
- {(() => { - const value = temperatureValue as number; - const severityClass = - value >= 80 - ? 'text-red-600 dark:text-red-400' - : value >= 70 - ? 'text-yellow-600 dark:text-yellow-400' - : 'text-green-600 dark:text-green-400'; - return ( - <> - - {value}°C - - - - {temperatureReading?.device ?? 'NVMe'} - - - - ); - })()} -
+ {(() => { + const value = cpuTemperatureValue as number; + const severityClass = + value >= 80 + ? 'text-red-600 dark:text-red-400' + : value >= 70 + ? 'text-yellow-600 dark:text-yellow-400' + : 'text-green-600 dark:text-green-400'; + return {value}°C; + })()} diff --git a/frontend-modern/src/utils/temperature.ts b/frontend-modern/src/utils/temperature.ts index ce3a61b99..dc8a11029 100644 --- a/frontend-modern/src/utils/temperature.ts +++ b/frontend-modern/src/utils/temperature.ts @@ -1,53 +1,53 @@ import type { Temperature } from '@/types/api'; -export type PrimaryTemperatureReading = { - value: number; - source: 'cpu' | 'nvme'; - device?: string; -}; - const isValidTemperature = (value: unknown): value is number => typeof value === 'number' && Number.isFinite(value); -export const getPrimaryTemperature = ( - temperature?: Temperature | null, -): PrimaryTemperatureReading | null => { +export const getCpuTemperature = (temperature?: Temperature | null): number | null => { if (!temperature?.available) return null; - const cpuCandidates: number[] = []; + const candidates: number[] = []; if (isValidTemperature(temperature.cpuPackage)) { - cpuCandidates.push(temperature.cpuPackage); + candidates.push(temperature.cpuPackage); } if (isValidTemperature(temperature.cpuMax)) { - cpuCandidates.push(temperature.cpuMax); + candidates.push(temperature.cpuMax); + } + if (Array.isArray(temperature.cores)) { + temperature.cores.forEach((core) => { + if (isValidTemperature(core.temp)) { + candidates.push(core.temp); + } + }); } - if (cpuCandidates.length > 0) { - return { - value: Math.max(...cpuCandidates), - source: 'cpu', - }; + if (candidates.length === 0) { + return null; } - const nvmeCandidates = (temperature.nvme ?? []) - .filter((nvme) => isValidTemperature(nvme.temp)) - .map((nvme) => ({ - device: nvme.device, - temp: nvme.temp, - })); - - if (nvmeCandidates.length > 0) { - const hottest = nvmeCandidates.reduce((max, current) => - current.temp > max.temp ? current : max, - ); - - return { - value: hottest.temp, - source: 'nvme', - device: hottest.device, - }; - } - - return null; + return Math.max(...candidates); +}; + +export type NvmeTemperatureReading = { + value: number; + device?: string; +}; + +export const getHottestNvmeTemperature = ( + temperature?: Temperature | null, +): NvmeTemperatureReading | null => { + if (!temperature?.available || !Array.isArray(temperature.nvme)) { + return null; + } + + const readings = temperature.nvme + .filter((nvme) => isValidTemperature(nvme.temp)) + .map((nvme) => ({ value: nvme.temp, device: nvme.device })); + + if (readings.length === 0) { + return null; + } + + return readings.reduce((max, current) => (current.value > max.value ? current : max)); }; diff --git a/internal/api/config_handlers.go b/internal/api/config_handlers.go index 7b1c31c44..003dacd60 100644 --- a/internal/api/config_handlers.go +++ b/internal/api/config_handlers.go @@ -3072,6 +3072,7 @@ echo "" # SSH public key embedded from Pulse server SSH_PUBLIC_KEY="%s" SSH_RESTRICTED_KEY_ENTRY="command=\"sensors -j\",no-port-forwarding,no-X11-forwarding,no-agent-forwarding,no-pty $SSH_PUBLIC_KEY" +TEMPERATURE_ENABLED=false # Check if SSH key is already configured and whether it needs upgrading SSH_ALREADY_CONFIGURED=false @@ -3087,6 +3088,7 @@ if [ -n "$SSH_PUBLIC_KEY" ] && [ -f /root/.ssh/authorized_keys ]; then fi if [ "$SSH_ALREADY_CONFIGURED" = true ]; then + TEMPERATURE_ENABLED=true echo "Temperature monitoring is currently ENABLED on this node." echo "" echo "What would you like to do?" @@ -3127,6 +3129,7 @@ if [ "$SSH_ALREADY_CONFIGURED" = true ]; then echo "" echo "To completely remove lm-sensors (optional):" echo " apt-get remove --purge lm-sensors" + TEMPERATURE_ENABLED=false elif [[ $SSH_ACTION =~ ^[Ss]$ ]]; then echo "Temperature monitoring configuration unchanged." else @@ -3207,6 +3210,7 @@ else echo "" echo "To disable later, re-run this setup script or manually remove the key:" echo " grep -v 'pulse' /root/.ssh/authorized_keys > /tmp/ak && mv /tmp/ak /root/.ssh/authorized_keys" + TEMPERATURE_ENABLED=true else echo "" echo "Warning: SSH key not available from Pulse server." @@ -3218,6 +3222,126 @@ else fi fi +# Offer to configure other Proxmox cluster nodes if temperature monitoring is enabled here +if [ "$TEMPERATURE_ENABLED" = true ] && command -v pvecm >/dev/null 2>&1 && command -v ssh >/dev/null 2>&1; then + CLUSTER_OUTPUT=$(pvecm nodes 2>/dev/null || true) + if [ -n "$CLUSTER_OUTPUT" ]; then + LOCAL_NODE=$(hostname -s 2>/dev/null || hostname) + CLUSTER_NODES=$(echo "$CLUSTER_OUTPUT" | awk 'NR>1 && $1 ~ /^[0-9]+$/ {print $3}') + + if [ -n "$CLUSTER_NODES" ]; then + OTHER_NODES_LIST=() + while read -r NODE_NAME; do + if [ -n "$NODE_NAME" ] && [ "$NODE_NAME" != "$LOCAL_NODE" ]; then + # Avoid duplicates + SKIP_NODE=false + for EXISTING in "${OTHER_NODES_LIST[@]}"; do + if [ "$EXISTING" = "$NODE_NAME" ]; then + SKIP_NODE=true + break + fi + done + if [ "$SKIP_NODE" = false ]; then + OTHER_NODES_LIST+=("$NODE_NAME") + fi + fi + done <<< "$CLUSTER_NODES" + + if [ ${#OTHER_NODES_LIST[@]} -gt 0 ]; then + echo "" + echo "Detected additional Proxmox nodes in cluster:" + for NODE in "${OTHER_NODES_LIST[@]}"; do + echo " • $NODE" + done + echo "" + echo "Configure temperature monitoring on these nodes as well?" + echo -n "[y/N]: " + + if [ -t 0 ]; then + read -p "> " -n 1 -r REMOTE_REPLY + else + if read -p "> " -n 1 -r REMOTE_REPLY /dev/null; then + : + else + echo "(No terminal available - skipping remote configuration)" + REMOTE_REPLY="n" + fi + fi + echo "" + echo "" + + if [[ $REMOTE_REPLY =~ ^[Yy]$ ]]; then + for NODE in "${OTHER_NODES_LIST[@]}"; do + echo "Configuring temperature monitoring on $NODE..." + if ssh -o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=5 -o LogLevel=ERROR root@"$NODE" "bash -s" <<'EOF' +set -e +SSH_PUBLIC_KEY='$SSH_PUBLIC_KEY' +SSH_RESTRICTED_KEY_ENTRY='$SSH_RESTRICTED_KEY_ENTRY' +mkdir -p /root/.ssh +chmod 700 /root/.ssh +AUTH_KEYS=/root/.ssh/authorized_keys +if [ -f "\$AUTH_KEYS" ] && grep -qF "\$SSH_PUBLIC_KEY" "\$AUTH_KEYS" 2>/dev/null; then + grep -vF "\$SSH_PUBLIC_KEY" "\$AUTH_KEYS" > "\$AUTH_KEYS.tmp" + mv "\$AUTH_KEYS.tmp" "\$AUTH_KEYS" +fi +if [ ! -f "\$AUTH_KEYS" ] || ! grep -qF "\$SSH_RESTRICTED_KEY_ENTRY" "\$AUTH_KEYS" 2>/dev/null; then + echo "\$SSH_RESTRICTED_KEY_ENTRY" >> "\$AUTH_KEYS" +fi +chmod 600 "\$AUTH_KEYS" +if ! command -v sensors >/dev/null 2>&1; then + echo " - Installing lm-sensors..." + export DEBIAN_FRONTEND=noninteractive + APT_LOG=$(mktemp) + if ! apt-get update -qq >"$APT_LOG" 2>&1; then + echo " ! apt-get update failed." + if grep -qi "enterprise.proxmox.com" "$APT_LOG"; then + echo " - Detected Proxmox enterprise repository without subscription; switching to no-subscription repository." + if [ -f /etc/apt/sources.list.d/pve-enterprise.list ]; then + cp /etc/apt/sources.list.d/pve-enterprise.list /etc/apt/sources.list.d/pve-enterprise.list.pulsebak 2>/dev/null || true + if grep -q "^[[:space:]]*deb" /etc/apt/sources.list.d/pve-enterprise.list; then + sed -i 's|^[[:space:]]*deb|# Pulse auto-disabled: deb|' /etc/apt/sources.list.d/pve-enterprise.list + fi + fi + if [ ! -f /etc/apt/sources.list.d/pve-no-subscription.list ]; then + CODENAME=$(. /etc/os-release 2>/dev/null && echo "$VERSION_CODENAME") + if [ -z "$CODENAME" ]; then + CODENAME=$(lsb_release -cs 2>/dev/null || echo "bookworm") + fi + echo "deb http://download.proxmox.com/debian/pve $CODENAME pve-no-subscription" > /etc/apt/sources.list.d/pve-no-subscription.list + fi + if apt-get update -qq >>"$APT_LOG" 2>&1; then + echo " ✓ Switched to no-subscription repository." + else + echo " ! apt-get update still failed after switching repositories." + fi + else + echo " ! apt-get update error was not recognized. Please review apt configuration on this node." + fi + fi + if apt-get install -y -qq lm-sensors >/dev/null 2>&1; then + sensors-detect --auto >/dev/null 2>&1 || true + echo " ✓ lm-sensors installed" + else + echo " ! Failed to install lm-sensors automatically. Please resolve apt issues and rerun this script." + fi + rm -f "$APT_LOG" +else + echo " ✓ lm-sensors package verified" +fi +EOF + then + echo " ✓ Temperature monitoring enabled on $NODE" + else + echo " ✗ Failed to configure $NODE (check SSH/cluster connectivity)" + fi + echo "" + done + fi + fi + fi + fi +fi + echo "" echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" echo "Setup Complete" diff --git a/internal/monitoring/monitor.go b/internal/monitoring/monitor.go index 9b3ec4185..69b52cc80 100644 --- a/internal/monitoring/monitor.go +++ b/internal/monitoring/monitor.go @@ -98,6 +98,74 @@ func getNodeDisplayName(instance *config.PVEInstance, nodeName string) string { return baseName } +func mergeNVMeTempsIntoDisks(disks []models.PhysicalDisk, nodes []models.Node) []models.PhysicalDisk { + if len(disks) == 0 || len(nodes) == 0 { + return disks + } + + nvmeTempsByNode := make(map[string][]models.NVMeTemp) + for _, node := range nodes { + if node.Temperature == nil || !node.Temperature.Available || len(node.Temperature.NVMe) == 0 { + continue + } + + temps := make([]models.NVMeTemp, len(node.Temperature.NVMe)) + copy(temps, node.Temperature.NVMe) + sort.Slice(temps, func(i, j int) bool { + return temps[i].Device < temps[j].Device + }) + + nvmeTempsByNode[node.Name] = temps + } + + if len(nvmeTempsByNode) == 0 { + return disks + } + + updated := make([]models.PhysicalDisk, len(disks)) + copy(updated, disks) + + disksByNode := make(map[string][]int) + for i := range updated { + if strings.EqualFold(updated[i].Type, "nvme") { + disksByNode[updated[i].Node] = append(disksByNode[updated[i].Node], i) + } + } + + for nodeName, diskIndexes := range disksByNode { + temps, ok := nvmeTempsByNode[nodeName] + if !ok || len(temps) == 0 { + for _, idx := range diskIndexes { + updated[idx].Temperature = 0 + } + continue + } + + sort.Slice(diskIndexes, func(i, j int) bool { + return updated[diskIndexes[i]].DevPath < updated[diskIndexes[j]].DevPath + }) + + for _, idx := range diskIndexes { + updated[idx].Temperature = 0 + } + + for idx, diskIdx := range diskIndexes { + if idx >= len(temps) { + break + } + + tempVal := temps[idx].Temp + if tempVal <= 0 || math.IsNaN(tempVal) { + continue + } + + updated[diskIdx].Temperature = int(math.Round(tempVal)) + } + } + + return updated +} + func lookupClusterEndpointLabel(instance *config.PVEInstance, nodeName string) string { if instance == nil { return "" @@ -2100,7 +2168,18 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie Dur("sinceLastPoll", time.Since(lastPoll)). Dur("interval", pollingInterval). Msg("Skipping physical disk poll - interval not elapsed") - // Don't clear existing data, just skip the poll + // Refresh NVMe temperatures using the latest sensor data even when we skip the disk poll + currentState := m.state.GetSnapshot() + existing := make([]models.PhysicalDisk, 0) + for _, disk := range currentState.PhysicalDisks { + if disk.Instance == instanceName { + existing = append(existing, disk) + } + } + if len(existing) > 0 { + updated := mergeNVMeTempsIntoDisks(existing, modelNodes) + m.state.UpdatePhysicalDisks(instanceName, updated) + } } else { log.Debug(). Int("nodeCount", len(nodes)). @@ -2228,6 +2307,8 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie } } + allDisks = mergeNVMeTempsIntoDisks(allDisks, modelNodes) + // Update physical disks in state log.Debug(). Str("instance", instanceName).