Improve NVMe temperature handling

This commit is contained in:
rcourtman 2025-10-12 16:06:55 +00:00
parent 2163d6f5a8
commit 18a88cb4cc
7 changed files with 304 additions and 116 deletions

View file

@ -4,9 +4,9 @@ Pulse can display real-time CPU and NVMe temperatures directly in your dashboard
## Features
- **CPU Package Temperature**: Shows the overall CPU temperature
- **CPU Package Temperature**: Shows the overall CPU temperature when available
- **Individual Core Temperatures**: Tracks each CPU core
- **NVMe Drive Temperatures**: Monitors NVMe SSD temperatures
- **NVMe Drive Temperatures**: Monitors NVMe SSD temperatures (visible in the Storage tab's disk list)
- **Color-Coded Display**:
- Green: < 60°C (normal)
- Yellow: 60-80°C (warm)
@ -40,6 +40,8 @@ The auto-setup script (Settings → Nodes → Setup Script) will prompt you to c
- Install `lm-sensors`
- Run `sensors-detect --auto`
If the node is part of a Proxmox cluster, the script will now detect the other members and offer to configure the same SSH/lm-sensors setup on each of them automatically—confirm when prompted to roll it out cluster-wide.
## Setup (Manual)
If you skipped SSH setup during auto-setup, you can configure it manually:
@ -88,7 +90,7 @@ You should see JSON output with temperature data.
2. Runs `sensors -j` to get temperature data in JSON format
3. Parses CPU temperatures (coretemp/k10temp)
4. Parses NVMe temperatures (nvme-pci-*)
5. Displays the data in node cards with color coding
5. Displays CPU temperatures on the overview dashboard and lists NVMe drive temperatures in the Storage tab's disk table when available
## Troubleshooting

View file

@ -6,7 +6,7 @@ import { AlertIndicator, AlertCountBadge } from '@/components/shared/AlertIndica
import { useWebSocket } from '@/App';
import { Card } from '@/components/shared/Card';
import { getNodeDisplayName, hasAlternateDisplayName } from '@/utils/nodes';
import { getPrimaryTemperature } from '@/utils/temperature';
import { getCpuTemperature } from '@/utils/temperature';
interface NodeCardProps {
node: Node;
@ -119,33 +119,20 @@ const NodeCard: Component<NodeCardProps> = (props) => {
);
const unacknowledgedNodeAlerts = createMemo(() => nodeAlerts().filter((alert) => !alert.acknowledged));
const primaryTemperature = createMemo(() => getPrimaryTemperature(props.node.temperature));
const primaryTemperatureValue = createMemo(() => {
const reading = primaryTemperature();
return reading ? Math.round(reading.value) : null;
});
const primaryTemperatureLabel = createMemo(() => {
const reading = primaryTemperature();
if (!reading) return null;
if (reading.source === 'nvme') {
return reading.device ?? 'NVMe';
}
return 'CPU';
const cpuTemperature = createMemo(() => getCpuTemperature(props.node.temperature));
const cpuTemperatureValue = createMemo(() => {
const value = cpuTemperature();
return value !== null ? Math.round(value) : null;
});
const temperatureTooltip = createMemo(() => {
const temp = props.node.temperature;
const rounded = primaryTemperatureValue();
if (!temp?.available || rounded === null) {
if (!props.node.temperature?.available) {
return '';
}
const label = primaryTemperatureLabel();
const primaryLabel =
label && label !== 'CPU' ? `${label}: ${rounded}°C` : `CPU: ${rounded}°C`;
const nvmeDetails =
temp.nvme && temp.nvme.length > 0
? ` | NVMe: ${temp.nvme.map((n) => `${n.device}: ${Math.round(n.temp)}°C`).join(', ')}`
: '';
return `${primaryLabel}${nvmeDetails}`;
const value = cpuTemperatureValue();
if (value === null) {
return 'CPU sensor unavailable';
}
return `CPU: ${value}°C`;
});
// Determine border/ring style based on status and alerts
@ -249,25 +236,31 @@ const NodeCard: Component<NodeCardProps> = (props) => {
{formatUptime(props.node.uptime)}
</span>
<Show
when={props.node.temperature?.available && primaryTemperatureValue() !== null}
fallback={<span title={`Load: ${normalizedLoad()}`}>{normalizedLoad()}</span>}
when={props.node.temperature?.available && cpuTemperatureValue() !== null}
fallback={
props.node.temperature?.available ? (
<span
class="font-medium text-gray-500 dark:text-gray-400"
title="CPU sensor unavailable"
>
🌡--
</span>
) : (
<span title={`Load: ${normalizedLoad()}`}>{normalizedLoad()}</span>
)
}
>
<span
class={`font-medium ${
(primaryTemperatureValue() ?? 0) > 80
(cpuTemperatureValue() ?? 0) > 80
? 'text-red-500'
: (primaryTemperatureValue() ?? 0) > 60
: (cpuTemperatureValue() ?? 0) > 60
? 'text-yellow-500'
: 'text-green-500'
}`}
title={temperatureTooltip() || undefined}
>
🌡{primaryTemperatureValue()}°C
<Show when={primaryTemperatureLabel() && primaryTemperatureLabel() !== 'CPU'}>
<span class="ml-1 text-[9px] uppercase text-gray-500 dark:text-gray-400">
{primaryTemperatureLabel()}
</span>
</Show>
🌡{cpuTemperatureValue()}°C
</span>
</Show>
</div>

View file

@ -224,8 +224,8 @@ export const DiskList: Component<DiskListProps> = (props) => {
</td>
<td class="px-2 py-1.5 text-xs hidden sm:table-cell">
<Show
when={disk.temperature > 0}
fallback={<span class="text-gray-400">-</span>}
when={typeof disk.temperature === 'number' && disk.temperature !== 0}
fallback={<span class="font-medium text-gray-400">-</span>}
>
<span
class={`font-medium ${
@ -233,7 +233,7 @@ export const DiskList: Component<DiskListProps> = (props) => {
? 'text-red-600 dark:text-red-400'
: disk.temperature > 60
? 'text-yellow-600 dark:text-yellow-400'
: 'text-gray-600 dark:text-gray-400'
: 'text-green-600 dark:text-green-400'
}`}
>
{disk.temperature}°C

View file

@ -6,7 +6,7 @@ import { useWebSocket } from '@/App';
import { getAlertStyles } from '@/utils/alerts';
import { Card } from '@/components/shared/Card';
import { getNodeDisplayName, hasAlternateDisplayName } from '@/utils/nodes';
import { getPrimaryTemperature, type PrimaryTemperatureReading } from '@/utils/temperature';
import { getCpuTemperature } from '@/utils/temperature';
interface NodeSummaryTableProps {
nodes: Node[];
@ -107,13 +107,11 @@ export const NodeSummaryTable: Component<NodeSummaryTableProps> = (props) => {
return counts;
});
const roundTemperature = (reading: PrimaryTemperatureReading | null) =>
reading ? Math.round(reading.value) : null;
const getTemperatureReading = (item: SortableItem): PrimaryTemperatureReading | null => {
const getCpuTemperatureValue = (item: SortableItem) => {
if (item.type !== 'pve') return null;
const node = item.data as Node;
return getPrimaryTemperature(node.temperature);
const value = getCpuTemperature(node.temperature);
return value !== null ? Math.round(value) : null;
};
const getDefaultSortDirection = (key: Exclude<SortKey, 'default'>) => {
@ -203,10 +201,7 @@ export const NodeSummaryTable: Component<NodeSummaryTableProps> = (props) => {
};
const getTemperatureValue = (item: SortableItem) => {
if (item.type === 'pve') {
return roundTemperature(getTemperatureReading(item));
}
return null;
return getCpuTemperatureValue(item);
};
const getCountValue = (item: SortableItem, key: CountSortKey): number | null => {
@ -385,7 +380,8 @@ export const NodeSummaryTable: Component<NodeSummaryTableProps> = (props) => {
class="px-2 py-1.5 text-left text-[11px] sm:text-xs font-medium uppercase tracking-wider min-w-20 cursor-pointer hover:bg-gray-200 dark:hover:bg-gray-600"
onClick={() => handleSort('temperature')}
>
Temp {sortKey() === 'temperature' && (sortDirection() === 'asc' ? '▲' : '▼')}
Temp{' '}
{sortKey() === 'temperature' && (sortDirection() === 'asc' ? '▲' : '▼')}
</th>
</Show>
<For each={countColumns()}>
@ -414,8 +410,7 @@ export const NodeSummaryTable: Component<NodeSummaryTableProps> = (props) => {
const memoryPercentValue = getMemoryPercent(item);
const diskPercentValue = getDiskPercent(item);
const diskSublabel = getDiskSublabel(item);
const temperatureReading = getTemperatureReading(item);
const temperatureValue = roundTemperature(temperatureReading);
const cpuTemperatureValue = getCpuTemperatureValue(item);
const uptimeValue = isPVE ? node?.uptime ?? 0 : isPBS ? pbs?.uptime ?? 0 : 0;
const displayName = () => {
if (isPVE) return getNodeDisplayName(node as Node);
@ -610,33 +605,26 @@ export const NodeSummaryTable: Component<NodeSummaryTableProps> = (props) => {
online &&
isPVE &&
node!.temperature?.available &&
temperatureValue !== null
cpuTemperatureValue !== null
}
fallback={
<span class="text-xs text-gray-400 dark:text-gray-500">
{online && isPVE && node!.temperature?.available
? 'No CPU sensor'
: '-'}
</span>
}
fallback={<span class="text-xs text-gray-400 dark:text-gray-500">-</span>}
>
<div class="flex items-center justify-center gap-1">
{(() => {
const value = temperatureValue as number;
const severityClass =
value >= 80
? 'text-red-600 dark:text-red-400'
: value >= 70
? 'text-yellow-600 dark:text-yellow-400'
: 'text-green-600 dark:text-green-400';
return (
<>
<span class={`text-xs font-medium ${severityClass}`}>
{value}°C
</span>
<Show when={temperatureReading?.source === 'nvme'}>
<span class="text-[9px] uppercase text-gray-500 dark:text-gray-400">
{temperatureReading?.device ?? 'NVMe'}
</span>
</Show>
</>
);
})()}
</div>
{(() => {
const value = cpuTemperatureValue as number;
const severityClass =
value >= 80
? 'text-red-600 dark:text-red-400'
: value >= 70
? 'text-yellow-600 dark:text-yellow-400'
: 'text-green-600 dark:text-green-400';
return <span class={`text-xs font-medium ${severityClass}`}>{value}°C</span>;
})()}
</Show>
</td>
</Show>

View file

@ -1,53 +1,53 @@
import type { Temperature } from '@/types/api';
export type PrimaryTemperatureReading = {
value: number;
source: 'cpu' | 'nvme';
device?: string;
};
/**
 * Type guard for usable temperature readings.
 *
 * Accepts only finite numbers; `NaN`, `±Infinity`, and every non-number
 * value (including `null` and `undefined`) are rejected.
 */
const isValidTemperature = (value: unknown): value is number => {
  if (typeof value !== 'number') {
    return false;
  }
  return Number.isFinite(value);
};
export const getPrimaryTemperature = (
temperature?: Temperature | null,
): PrimaryTemperatureReading | null => {
export const getCpuTemperature = (temperature?: Temperature | null): number | null => {
if (!temperature?.available) return null;
const cpuCandidates: number[] = [];
const candidates: number[] = [];
if (isValidTemperature(temperature.cpuPackage)) {
cpuCandidates.push(temperature.cpuPackage);
candidates.push(temperature.cpuPackage);
}
if (isValidTemperature(temperature.cpuMax)) {
cpuCandidates.push(temperature.cpuMax);
candidates.push(temperature.cpuMax);
}
if (Array.isArray(temperature.cores)) {
temperature.cores.forEach((core) => {
if (isValidTemperature(core.temp)) {
candidates.push(core.temp);
}
});
}
if (cpuCandidates.length > 0) {
return {
value: Math.max(...cpuCandidates),
source: 'cpu',
};
if (candidates.length === 0) {
return null;
}
const nvmeCandidates = (temperature.nvme ?? [])
.filter((nvme) => isValidTemperature(nvme.temp))
.map((nvme) => ({
device: nvme.device,
temp: nvme.temp,
}));
if (nvmeCandidates.length > 0) {
const hottest = nvmeCandidates.reduce((max, current) =>
current.temp > max.temp ? current : max,
);
return {
value: hottest.temp,
source: 'nvme',
device: hottest.device,
};
}
return null;
return Math.max(...candidates);
};
/**
 * A single NVMe temperature reading.
 *
 * `value` is the numeric reading; `device` is the optional identifier of the
 * drive the reading belongs to.
 */
export type NvmeTemperatureReading = {
value: number;
device?: string;
};
/**
 * Finds the highest valid NVMe reading in a node's temperature payload.
 *
 * @param temperature - Temperature payload for a node; may be absent or null.
 * @returns The reading with the largest value (the earliest one wins on ties),
 *   or `null` when the payload is missing, not flagged as available, has no
 *   NVMe array, or contains no finite NVMe temperatures.
 */
export const getHottestNvmeTemperature = (
  temperature?: Temperature | null,
): NvmeTemperatureReading | null => {
  const drives = temperature?.available ? temperature.nvme : undefined;
  if (!Array.isArray(drives)) {
    return null;
  }

  let hottest: NvmeTemperatureReading | null = null;
  for (const drive of drives) {
    // Ignore entries whose reading is not a finite number.
    if (!isValidTemperature(drive.temp)) {
      continue;
    }
    // Strictly-greater comparison keeps the first reading when values tie.
    if (hottest === null || drive.temp > hottest.value) {
      hottest = { value: drive.temp, device: drive.device };
    }
  }
  return hottest;
};

View file

@ -3072,6 +3072,7 @@ echo ""
# SSH public key embedded from Pulse server
SSH_PUBLIC_KEY="%s"
SSH_RESTRICTED_KEY_ENTRY="command=\"sensors -j\",no-port-forwarding,no-X11-forwarding,no-agent-forwarding,no-pty $SSH_PUBLIC_KEY"
TEMPERATURE_ENABLED=false
# Check if SSH key is already configured and whether it needs upgrading
SSH_ALREADY_CONFIGURED=false
@ -3087,6 +3088,7 @@ if [ -n "$SSH_PUBLIC_KEY" ] && [ -f /root/.ssh/authorized_keys ]; then
fi
if [ "$SSH_ALREADY_CONFIGURED" = true ]; then
TEMPERATURE_ENABLED=true
echo "Temperature monitoring is currently ENABLED on this node."
echo ""
echo "What would you like to do?"
@ -3127,6 +3129,7 @@ if [ "$SSH_ALREADY_CONFIGURED" = true ]; then
echo ""
echo "To completely remove lm-sensors (optional):"
echo " apt-get remove --purge lm-sensors"
TEMPERATURE_ENABLED=false
elif [[ $SSH_ACTION =~ ^[Ss]$ ]]; then
echo "Temperature monitoring configuration unchanged."
else
@ -3207,6 +3210,7 @@ else
echo ""
echo "To disable later, re-run this setup script or manually remove the key:"
echo " grep -v 'pulse' /root/.ssh/authorized_keys > /tmp/ak && mv /tmp/ak /root/.ssh/authorized_keys"
TEMPERATURE_ENABLED=true
else
echo ""
echo "Warning: SSH key not available from Pulse server."
@ -3218,6 +3222,126 @@ else
fi
fi
# Offer to configure other Proxmox cluster nodes if temperature monitoring is enabled here.
#
# Flow: list cluster members with pvecm, drop the local node and duplicates, ask the
# operator for confirmation, then push a small setup script to each remaining node
# over SSH. The remote script installs the restricted Pulse key entry into
# /root/.ssh/authorized_keys and makes sure lm-sensors is present.
if [ "$TEMPERATURE_ENABLED" = true ] && command -v pvecm >/dev/null 2>&1 && command -v ssh >/dev/null 2>&1; then
    CLUSTER_OUTPUT=$(pvecm nodes 2>/dev/null || true)
    if [ -n "$CLUSTER_OUTPUT" ]; then
        LOCAL_NODE=$(hostname -s 2>/dev/null || hostname)
        # Data rows start with a numeric node id; the node name is the third column.
        CLUSTER_NODES=$(echo "$CLUSTER_OUTPUT" | awk 'NR>1 && $1 ~ /^[0-9]+$/ {print $3}')
        if [ -n "$CLUSTER_NODES" ]; then
            OTHER_NODES_LIST=()
            while read -r NODE_NAME; do
                if [ -n "$NODE_NAME" ] && [ "$NODE_NAME" != "$LOCAL_NODE" ]; then
                    # Avoid duplicates
                    SKIP_NODE=false
                    for EXISTING in "${OTHER_NODES_LIST[@]}"; do
                        if [ "$EXISTING" = "$NODE_NAME" ]; then
                            SKIP_NODE=true
                            break
                        fi
                    done
                    if [ "$SKIP_NODE" = false ]; then
                        OTHER_NODES_LIST+=("$NODE_NAME")
                    fi
                fi
            done <<< "$CLUSTER_NODES"
            if [ ${#OTHER_NODES_LIST[@]} -gt 0 ]; then
                echo ""
                echo "Detected additional Proxmox nodes in cluster:"
                for NODE in "${OTHER_NODES_LIST[@]}"; do
                    echo " • $NODE"
                done
                echo ""
                echo "Configure temperature monitoring on these nodes as well?"
                echo -n "[y/N]: "
                # When stdin is not a terminal (e.g. the script is piped into bash),
                # fall back to the controlling terminal; skip if there is none.
                if [ -t 0 ]; then
                    read -p "> " -n 1 -r REMOTE_REPLY
                else
                    if read -p "> " -n 1 -r REMOTE_REPLY </dev/tty 2>/dev/null; then
                        :
                    else
                        echo "(No terminal available - skipping remote configuration)"
                        REMOTE_REPLY="n"
                    fi
                fi
                echo ""
                echo ""
                if [[ $REMOTE_REPLY =~ ^[Yy]$ ]]; then
                    for NODE in "${OTHER_NODES_LIST[@]}"; do
                        echo "Configuring temperature monitoring on $NODE..."
                        # The heredoc delimiter is intentionally UNQUOTED: the two key
                        # variables below must expand on this machine so their values are
                        # embedded in the script sent to the remote node. Every other
                        # dollar sign is backslash-escaped so it is evaluated remotely.
                        # (A quoted delimiter would send the literal variable names and the
                        # remote side would never install the real key.)
                        if ssh -o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=5 -o LogLevel=ERROR root@"$NODE" "bash -s" <<EOF
set -e
SSH_PUBLIC_KEY='$SSH_PUBLIC_KEY'
SSH_RESTRICTED_KEY_ENTRY='$SSH_RESTRICTED_KEY_ENTRY'
mkdir -p /root/.ssh
chmod 700 /root/.ssh
AUTH_KEYS=/root/.ssh/authorized_keys
if [ -f "\$AUTH_KEYS" ] && grep -qF "\$SSH_PUBLIC_KEY" "\$AUTH_KEYS" 2>/dev/null; then
    # grep -v exits 1 when no lines remain (the file held only the Pulse key);
    # that is not an error here, so keep it from tripping set -e.
    grep -vF "\$SSH_PUBLIC_KEY" "\$AUTH_KEYS" > "\$AUTH_KEYS.tmp" || true
    mv "\$AUTH_KEYS.tmp" "\$AUTH_KEYS"
fi
if [ ! -f "\$AUTH_KEYS" ] || ! grep -qF "\$SSH_RESTRICTED_KEY_ENTRY" "\$AUTH_KEYS" 2>/dev/null; then
    echo "\$SSH_RESTRICTED_KEY_ENTRY" >> "\$AUTH_KEYS"
fi
chmod 600 "\$AUTH_KEYS"
if ! command -v sensors >/dev/null 2>&1; then
    echo " - Installing lm-sensors..."
    export DEBIAN_FRONTEND=noninteractive
    APT_LOG=\$(mktemp)
    if ! apt-get update -qq >"\$APT_LOG" 2>&1; then
        echo " ! apt-get update failed."
        if grep -qi "enterprise.proxmox.com" "\$APT_LOG"; then
            echo " - Detected Proxmox enterprise repository without subscription; switching to no-subscription repository."
            if [ -f /etc/apt/sources.list.d/pve-enterprise.list ]; then
                cp /etc/apt/sources.list.d/pve-enterprise.list /etc/apt/sources.list.d/pve-enterprise.list.pulsebak 2>/dev/null || true
                if grep -q "^[[:space:]]*deb" /etc/apt/sources.list.d/pve-enterprise.list; then
                    sed -i 's|^[[:space:]]*deb|# Pulse auto-disabled: deb|' /etc/apt/sources.list.d/pve-enterprise.list
                fi
            fi
            if [ ! -f /etc/apt/sources.list.d/pve-no-subscription.list ]; then
                CODENAME=\$(. /etc/os-release 2>/dev/null && echo "\$VERSION_CODENAME")
                if [ -z "\$CODENAME" ]; then
                    CODENAME=\$(lsb_release -cs 2>/dev/null || echo "bookworm")
                fi
                echo "deb http://download.proxmox.com/debian/pve \$CODENAME pve-no-subscription" > /etc/apt/sources.list.d/pve-no-subscription.list
            fi
            if apt-get update -qq >>"\$APT_LOG" 2>&1; then
                echo " ✓ Switched to no-subscription repository."
            else
                echo " ! apt-get update still failed after switching repositories."
            fi
        else
            echo " ! apt-get update error was not recognized. Please review apt configuration on this node."
        fi
    fi
    if apt-get install -y -qq lm-sensors >/dev/null 2>&1; then
        sensors-detect --auto >/dev/null 2>&1 || true
        echo " ✓ lm-sensors installed"
    else
        echo " ! Failed to install lm-sensors automatically. Please resolve apt issues and rerun this script."
    fi
    rm -f "\$APT_LOG"
else
    echo " ✓ lm-sensors package verified"
fi
EOF
                        then
                            echo " ✓ Temperature monitoring enabled on $NODE"
                        else
                            echo " ✗ Failed to configure $NODE (check SSH/cluster connectivity)"
                        fi
                        echo ""
                    done
                fi
            fi
        fi
    fi
fi
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Setup Complete"

View file

@ -98,6 +98,74 @@ func getNodeDisplayName(instance *config.PVEInstance, nodeName string) string {
return baseName
}
// mergeNVMeTempsIntoDisks returns a copy of disks in which the Temperature of
// NVMe-type disks is replaced by readings taken from the nodes' NVMe sensors.
//
// Disks and sensor readings carry no shared identifier here, so they are paired
// by position: for each node, sensor readings sorted by Device are matched with
// that node's NVMe disks sorted by DevPath. Every NVMe disk on a node is first
// reset to 0; a disk then receives the rounded reading at its position only if
// that reading is a positive, non-NaN number.
//
// The input slice is returned unmodified when either argument is empty or when
// no node reports any NVMe readings. Non-NVMe disks are never changed.
func mergeNVMeTempsIntoDisks(disks []models.PhysicalDisk, nodes []models.Node) []models.PhysicalDisk {
	if len(nodes) == 0 || len(disks) == 0 {
		return disks
	}

	// Gather a private, device-sorted copy of each node's NVMe sensor readings.
	sensorsByNode := make(map[string][]models.NVMeTemp)
	for _, n := range nodes {
		t := n.Temperature
		if t == nil || !t.Available || len(t.NVMe) == 0 {
			continue
		}

		readings := make([]models.NVMeTemp, len(t.NVMe))
		copy(readings, t.NVMe)
		sort.Slice(readings, func(a, b int) bool {
			return readings[a].Device < readings[b].Device
		})
		sensorsByNode[n.Name] = readings
	}
	if len(sensorsByNode) == 0 {
		return disks
	}

	// Work on a copy so the caller's slice is left untouched.
	result := make([]models.PhysicalDisk, len(disks))
	copy(result, disks)

	// Group the positions of NVMe disks by the node that owns them.
	nvmeByNode := make(map[string][]int)
	for pos := range result {
		if !strings.EqualFold(result[pos].Type, "nvme") {
			continue
		}
		owner := result[pos].Node
		nvmeByNode[owner] = append(nvmeByNode[owner], pos)
	}

	for owner, positions := range nvmeByNode {
		// Reset first so disks without a usable reading end up at 0.
		for _, pos := range positions {
			result[pos].Temperature = 0
		}

		readings := sensorsByNode[owner]
		if len(readings) == 0 {
			continue
		}

		sort.Slice(positions, func(a, b int) bool {
			return result[positions[a]].DevPath < result[positions[b]].DevPath
		})

		// Only as many disks as there are readings can be paired.
		pairs := len(positions)
		if len(readings) < pairs {
			pairs = len(readings)
		}
		for k := 0; k < pairs; k++ {
			reading := readings[k].Temp
			if reading > 0 && !math.IsNaN(reading) {
				result[positions[k]].Temperature = int(math.Round(reading))
			}
		}
	}

	return result
}
func lookupClusterEndpointLabel(instance *config.PVEInstance, nodeName string) string {
if instance == nil {
return ""
@ -2100,7 +2168,18 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
Dur("sinceLastPoll", time.Since(lastPoll)).
Dur("interval", pollingInterval).
Msg("Skipping physical disk poll - interval not elapsed")
// Don't clear existing data, just skip the poll
// Refresh NVMe temperatures using the latest sensor data even when we skip the disk poll
currentState := m.state.GetSnapshot()
existing := make([]models.PhysicalDisk, 0)
for _, disk := range currentState.PhysicalDisks {
if disk.Instance == instanceName {
existing = append(existing, disk)
}
}
if len(existing) > 0 {
updated := mergeNVMeTempsIntoDisks(existing, modelNodes)
m.state.UpdatePhysicalDisks(instanceName, updated)
}
} else {
log.Debug().
Int("nodeCount", len(nodes)).
@ -2228,6 +2307,8 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
}
}
allDisks = mergeNVMeTempsIntoDisks(allDisks, modelNodes)
// Update physical disks in state
log.Debug().
Str("instance", instanceName).