mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 03:20:11 +00:00
Improve NVMe temperature handling
This commit is contained in:
parent
2163d6f5a8
commit
18a88cb4cc
7 changed files with 304 additions and 116 deletions
|
|
@ -4,9 +4,9 @@ Pulse can display real-time CPU and NVMe temperatures directly in your dashboard
|
|||
|
||||
## Features
|
||||
|
||||
- **CPU Package Temperature**: Shows the overall CPU temperature
|
||||
- **CPU Package Temperature**: Shows the overall CPU temperature when available
|
||||
- **Individual Core Temperatures**: Tracks each CPU core
|
||||
- **NVMe Drive Temperatures**: Monitors NVMe SSD temperatures
|
||||
- **NVMe Drive Temperatures**: Monitors NVMe SSD temperatures (visible in the Storage tab's disk list)
|
||||
- **Color-Coded Display**:
|
||||
- Green: < 60°C (normal)
|
||||
- Yellow: 60-80°C (warm)
|
||||
|
|
@ -40,6 +40,8 @@ The auto-setup script (Settings → Nodes → Setup Script) will prompt you to c
|
|||
- Install `lm-sensors`
|
||||
- Run `sensors-detect --auto`
|
||||
|
||||
If the node is part of a Proxmox cluster, the script will now detect the other members and offer to configure the same SSH/lm-sensors setup on each of them automatically—confirm when prompted to roll it out cluster-wide.
|
||||
|
||||
## Setup (Manual)
|
||||
|
||||
If you skipped SSH setup during auto-setup, you can configure it manually:
|
||||
|
|
@ -88,7 +90,7 @@ You should see JSON output with temperature data.
|
|||
2. Runs `sensors -j` to get temperature data in JSON format
|
||||
3. Parses CPU temperatures (coretemp/k10temp)
|
||||
4. Parses NVMe temperatures (nvme-pci-*)
|
||||
5. Displays the data in node cards with color coding
|
||||
5. Displays CPU temperatures on the overview dashboard and lists NVMe drive temperatures in the Storage tab's disk table when available
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ import { AlertIndicator, AlertCountBadge } from '@/components/shared/AlertIndica
|
|||
import { useWebSocket } from '@/App';
|
||||
import { Card } from '@/components/shared/Card';
|
||||
import { getNodeDisplayName, hasAlternateDisplayName } from '@/utils/nodes';
|
||||
import { getPrimaryTemperature } from '@/utils/temperature';
|
||||
import { getCpuTemperature } from '@/utils/temperature';
|
||||
|
||||
interface NodeCardProps {
|
||||
node: Node;
|
||||
|
|
@ -119,33 +119,20 @@ const NodeCard: Component<NodeCardProps> = (props) => {
|
|||
);
|
||||
const unacknowledgedNodeAlerts = createMemo(() => nodeAlerts().filter((alert) => !alert.acknowledged));
|
||||
|
||||
const primaryTemperature = createMemo(() => getPrimaryTemperature(props.node.temperature));
|
||||
const primaryTemperatureValue = createMemo(() => {
|
||||
const reading = primaryTemperature();
|
||||
return reading ? Math.round(reading.value) : null;
|
||||
});
|
||||
const primaryTemperatureLabel = createMemo(() => {
|
||||
const reading = primaryTemperature();
|
||||
if (!reading) return null;
|
||||
if (reading.source === 'nvme') {
|
||||
return reading.device ?? 'NVMe';
|
||||
}
|
||||
return 'CPU';
|
||||
const cpuTemperature = createMemo(() => getCpuTemperature(props.node.temperature));
|
||||
const cpuTemperatureValue = createMemo(() => {
|
||||
const value = cpuTemperature();
|
||||
return value !== null ? Math.round(value) : null;
|
||||
});
|
||||
const temperatureTooltip = createMemo(() => {
|
||||
const temp = props.node.temperature;
|
||||
const rounded = primaryTemperatureValue();
|
||||
if (!temp?.available || rounded === null) {
|
||||
if (!props.node.temperature?.available) {
|
||||
return '';
|
||||
}
|
||||
const label = primaryTemperatureLabel();
|
||||
const primaryLabel =
|
||||
label && label !== 'CPU' ? `${label}: ${rounded}°C` : `CPU: ${rounded}°C`;
|
||||
const nvmeDetails =
|
||||
temp.nvme && temp.nvme.length > 0
|
||||
? ` | NVMe: ${temp.nvme.map((n) => `${n.device}: ${Math.round(n.temp)}°C`).join(', ')}`
|
||||
: '';
|
||||
return `${primaryLabel}${nvmeDetails}`;
|
||||
const value = cpuTemperatureValue();
|
||||
if (value === null) {
|
||||
return 'CPU sensor unavailable';
|
||||
}
|
||||
return `CPU: ${value}°C`;
|
||||
});
|
||||
|
||||
// Determine border/ring style based on status and alerts
|
||||
|
|
@ -249,25 +236,31 @@ const NodeCard: Component<NodeCardProps> = (props) => {
|
|||
↑{formatUptime(props.node.uptime)}
|
||||
</span>
|
||||
<Show
|
||||
when={props.node.temperature?.available && primaryTemperatureValue() !== null}
|
||||
fallback={<span title={`Load: ${normalizedLoad()}`}>⚡{normalizedLoad()}</span>}
|
||||
when={props.node.temperature?.available && cpuTemperatureValue() !== null}
|
||||
fallback={
|
||||
props.node.temperature?.available ? (
|
||||
<span
|
||||
class="font-medium text-gray-500 dark:text-gray-400"
|
||||
title="CPU sensor unavailable"
|
||||
>
|
||||
🌡--
|
||||
</span>
|
||||
) : (
|
||||
<span title={`Load: ${normalizedLoad()}`}>⚡{normalizedLoad()}</span>
|
||||
)
|
||||
}
|
||||
>
|
||||
<span
|
||||
class={`font-medium ${
|
||||
(primaryTemperatureValue() ?? 0) > 80
|
||||
(cpuTemperatureValue() ?? 0) > 80
|
||||
? 'text-red-500'
|
||||
: (primaryTemperatureValue() ?? 0) > 60
|
||||
: (cpuTemperatureValue() ?? 0) > 60
|
||||
? 'text-yellow-500'
|
||||
: 'text-green-500'
|
||||
}`}
|
||||
title={temperatureTooltip() || undefined}
|
||||
>
|
||||
🌡{primaryTemperatureValue()}°C
|
||||
<Show when={primaryTemperatureLabel() && primaryTemperatureLabel() !== 'CPU'}>
|
||||
<span class="ml-1 text-[9px] uppercase text-gray-500 dark:text-gray-400">
|
||||
{primaryTemperatureLabel()}
|
||||
</span>
|
||||
</Show>
|
||||
🌡{cpuTemperatureValue()}°C
|
||||
</span>
|
||||
</Show>
|
||||
</div>
|
||||
|
|
|
|||
|
|
@ -224,8 +224,8 @@ export const DiskList: Component<DiskListProps> = (props) => {
|
|||
</td>
|
||||
<td class="px-2 py-1.5 text-xs hidden sm:table-cell">
|
||||
<Show
|
||||
when={disk.temperature > 0}
|
||||
fallback={<span class="text-gray-400">-</span>}
|
||||
when={typeof disk.temperature === 'number' && disk.temperature !== 0}
|
||||
fallback={<span class="font-medium text-gray-400">-</span>}
|
||||
>
|
||||
<span
|
||||
class={`font-medium ${
|
||||
|
|
@ -233,7 +233,7 @@ export const DiskList: Component<DiskListProps> = (props) => {
|
|||
? 'text-red-600 dark:text-red-400'
|
||||
: disk.temperature > 60
|
||||
? 'text-yellow-600 dark:text-yellow-400'
|
||||
: 'text-gray-600 dark:text-gray-400'
|
||||
: 'text-green-600 dark:text-green-400'
|
||||
}`}
|
||||
>
|
||||
{disk.temperature}°C
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ import { useWebSocket } from '@/App';
|
|||
import { getAlertStyles } from '@/utils/alerts';
|
||||
import { Card } from '@/components/shared/Card';
|
||||
import { getNodeDisplayName, hasAlternateDisplayName } from '@/utils/nodes';
|
||||
import { getPrimaryTemperature, type PrimaryTemperatureReading } from '@/utils/temperature';
|
||||
import { getCpuTemperature } from '@/utils/temperature';
|
||||
|
||||
interface NodeSummaryTableProps {
|
||||
nodes: Node[];
|
||||
|
|
@ -107,13 +107,11 @@ export const NodeSummaryTable: Component<NodeSummaryTableProps> = (props) => {
|
|||
return counts;
|
||||
});
|
||||
|
||||
const roundTemperature = (reading: PrimaryTemperatureReading | null) =>
|
||||
reading ? Math.round(reading.value) : null;
|
||||
|
||||
const getTemperatureReading = (item: SortableItem): PrimaryTemperatureReading | null => {
|
||||
const getCpuTemperatureValue = (item: SortableItem) => {
|
||||
if (item.type !== 'pve') return null;
|
||||
const node = item.data as Node;
|
||||
return getPrimaryTemperature(node.temperature);
|
||||
const value = getCpuTemperature(node.temperature);
|
||||
return value !== null ? Math.round(value) : null;
|
||||
};
|
||||
|
||||
const getDefaultSortDirection = (key: Exclude<SortKey, 'default'>) => {
|
||||
|
|
@ -203,10 +201,7 @@ export const NodeSummaryTable: Component<NodeSummaryTableProps> = (props) => {
|
|||
};
|
||||
|
||||
const getTemperatureValue = (item: SortableItem) => {
|
||||
if (item.type === 'pve') {
|
||||
return roundTemperature(getTemperatureReading(item));
|
||||
}
|
||||
return null;
|
||||
return getCpuTemperatureValue(item);
|
||||
};
|
||||
|
||||
const getCountValue = (item: SortableItem, key: CountSortKey): number | null => {
|
||||
|
|
@ -385,7 +380,8 @@ export const NodeSummaryTable: Component<NodeSummaryTableProps> = (props) => {
|
|||
class="px-2 py-1.5 text-left text-[11px] sm:text-xs font-medium uppercase tracking-wider min-w-20 cursor-pointer hover:bg-gray-200 dark:hover:bg-gray-600"
|
||||
onClick={() => handleSort('temperature')}
|
||||
>
|
||||
Temp {sortKey() === 'temperature' && (sortDirection() === 'asc' ? '▲' : '▼')}
|
||||
Temp{' '}
|
||||
{sortKey() === 'temperature' && (sortDirection() === 'asc' ? '▲' : '▼')}
|
||||
</th>
|
||||
</Show>
|
||||
<For each={countColumns()}>
|
||||
|
|
@ -414,8 +410,7 @@ export const NodeSummaryTable: Component<NodeSummaryTableProps> = (props) => {
|
|||
const memoryPercentValue = getMemoryPercent(item);
|
||||
const diskPercentValue = getDiskPercent(item);
|
||||
const diskSublabel = getDiskSublabel(item);
|
||||
const temperatureReading = getTemperatureReading(item);
|
||||
const temperatureValue = roundTemperature(temperatureReading);
|
||||
const cpuTemperatureValue = getCpuTemperatureValue(item);
|
||||
const uptimeValue = isPVE ? node?.uptime ?? 0 : isPBS ? pbs?.uptime ?? 0 : 0;
|
||||
const displayName = () => {
|
||||
if (isPVE) return getNodeDisplayName(node as Node);
|
||||
|
|
@ -610,33 +605,26 @@ export const NodeSummaryTable: Component<NodeSummaryTableProps> = (props) => {
|
|||
online &&
|
||||
isPVE &&
|
||||
node!.temperature?.available &&
|
||||
temperatureValue !== null
|
||||
cpuTemperatureValue !== null
|
||||
}
|
||||
fallback={
|
||||
<span class="text-xs text-gray-400 dark:text-gray-500">
|
||||
{online && isPVE && node!.temperature?.available
|
||||
? 'No CPU sensor'
|
||||
: '-'}
|
||||
</span>
|
||||
}
|
||||
fallback={<span class="text-xs text-gray-400 dark:text-gray-500">-</span>}
|
||||
>
|
||||
<div class="flex items-center justify-center gap-1">
|
||||
{(() => {
|
||||
const value = temperatureValue as number;
|
||||
const severityClass =
|
||||
value >= 80
|
||||
? 'text-red-600 dark:text-red-400'
|
||||
: value >= 70
|
||||
? 'text-yellow-600 dark:text-yellow-400'
|
||||
: 'text-green-600 dark:text-green-400';
|
||||
return (
|
||||
<>
|
||||
<span class={`text-xs font-medium ${severityClass}`}>
|
||||
{value}°C
|
||||
</span>
|
||||
<Show when={temperatureReading?.source === 'nvme'}>
|
||||
<span class="text-[9px] uppercase text-gray-500 dark:text-gray-400">
|
||||
{temperatureReading?.device ?? 'NVMe'}
|
||||
</span>
|
||||
</Show>
|
||||
</>
|
||||
);
|
||||
})()}
|
||||
</div>
|
||||
{(() => {
|
||||
const value = cpuTemperatureValue as number;
|
||||
const severityClass =
|
||||
value >= 80
|
||||
? 'text-red-600 dark:text-red-400'
|
||||
: value >= 70
|
||||
? 'text-yellow-600 dark:text-yellow-400'
|
||||
: 'text-green-600 dark:text-green-400';
|
||||
return <span class={`text-xs font-medium ${severityClass}`}>{value}°C</span>;
|
||||
})()}
|
||||
</Show>
|
||||
</td>
|
||||
</Show>
|
||||
|
|
|
|||
|
|
@ -1,53 +1,53 @@
|
|||
import type { Temperature } from '@/types/api';
|
||||
|
||||
export type PrimaryTemperatureReading = {
|
||||
value: number;
|
||||
source: 'cpu' | 'nvme';
|
||||
device?: string;
|
||||
};
|
||||
|
||||
const isValidTemperature = (value: unknown): value is number =>
|
||||
typeof value === 'number' && Number.isFinite(value);
|
||||
|
||||
export const getPrimaryTemperature = (
|
||||
temperature?: Temperature | null,
|
||||
): PrimaryTemperatureReading | null => {
|
||||
export const getCpuTemperature = (temperature?: Temperature | null): number | null => {
|
||||
if (!temperature?.available) return null;
|
||||
|
||||
const cpuCandidates: number[] = [];
|
||||
const candidates: number[] = [];
|
||||
|
||||
if (isValidTemperature(temperature.cpuPackage)) {
|
||||
cpuCandidates.push(temperature.cpuPackage);
|
||||
candidates.push(temperature.cpuPackage);
|
||||
}
|
||||
if (isValidTemperature(temperature.cpuMax)) {
|
||||
cpuCandidates.push(temperature.cpuMax);
|
||||
candidates.push(temperature.cpuMax);
|
||||
}
|
||||
if (Array.isArray(temperature.cores)) {
|
||||
temperature.cores.forEach((core) => {
|
||||
if (isValidTemperature(core.temp)) {
|
||||
candidates.push(core.temp);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (cpuCandidates.length > 0) {
|
||||
return {
|
||||
value: Math.max(...cpuCandidates),
|
||||
source: 'cpu',
|
||||
};
|
||||
if (candidates.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const nvmeCandidates = (temperature.nvme ?? [])
|
||||
.filter((nvme) => isValidTemperature(nvme.temp))
|
||||
.map((nvme) => ({
|
||||
device: nvme.device,
|
||||
temp: nvme.temp,
|
||||
}));
|
||||
|
||||
if (nvmeCandidates.length > 0) {
|
||||
const hottest = nvmeCandidates.reduce((max, current) =>
|
||||
current.temp > max.temp ? current : max,
|
||||
);
|
||||
|
||||
return {
|
||||
value: hottest.temp,
|
||||
source: 'nvme',
|
||||
device: hottest.device,
|
||||
};
|
||||
}
|
||||
|
||||
return null;
|
||||
return Math.max(...candidates);
|
||||
};
|
||||
|
||||
/** A single NVMe temperature reading: the value plus the reporting device, if known. */
export type NvmeTemperatureReading = {
  value: number;
  device?: string;
};

/**
 * Picks the NVMe reading with the highest temperature.
 *
 * @param temperature - Temperature payload for a node; may be absent.
 * @returns The hottest reading, or `null` when the payload is missing, not
 *   marked available, has no NVMe array, or holds no valid NVMe values.
 *   When several drives tie, the one listed first wins.
 */
export const getHottestNvmeTemperature = (
  temperature?: Temperature | null,
): NvmeTemperatureReading | null => {
  const drives = temperature?.available ? temperature.nvme : undefined;
  if (!Array.isArray(drives)) {
    return null;
  }

  let hottest: NvmeTemperatureReading | null = null;
  for (const drive of drives) {
    if (!isValidTemperature(drive.temp)) {
      continue;
    }
    // Strictly-greater comparison keeps the earliest drive on ties.
    if (hottest === null || drive.temp > hottest.value) {
      hottest = { value: drive.temp, device: drive.device };
    }
  }

  return hottest;
};
|
||||
|
|
|
|||
|
|
@ -3072,6 +3072,7 @@ echo ""
|
|||
# SSH public key embedded from Pulse server
|
||||
SSH_PUBLIC_KEY="%s"
|
||||
SSH_RESTRICTED_KEY_ENTRY="command=\"sensors -j\",no-port-forwarding,no-X11-forwarding,no-agent-forwarding,no-pty $SSH_PUBLIC_KEY"
|
||||
TEMPERATURE_ENABLED=false
|
||||
|
||||
# Check if SSH key is already configured and whether it needs upgrading
|
||||
SSH_ALREADY_CONFIGURED=false
|
||||
|
|
@ -3087,6 +3088,7 @@ if [ -n "$SSH_PUBLIC_KEY" ] && [ -f /root/.ssh/authorized_keys ]; then
|
|||
fi
|
||||
|
||||
if [ "$SSH_ALREADY_CONFIGURED" = true ]; then
|
||||
TEMPERATURE_ENABLED=true
|
||||
echo "Temperature monitoring is currently ENABLED on this node."
|
||||
echo ""
|
||||
echo "What would you like to do?"
|
||||
|
|
@ -3127,6 +3129,7 @@ if [ "$SSH_ALREADY_CONFIGURED" = true ]; then
|
|||
echo ""
|
||||
echo "To completely remove lm-sensors (optional):"
|
||||
echo " apt-get remove --purge lm-sensors"
|
||||
TEMPERATURE_ENABLED=false
|
||||
elif [[ $SSH_ACTION =~ ^[Ss]$ ]]; then
|
||||
echo "Temperature monitoring configuration unchanged."
|
||||
else
|
||||
|
|
@ -3207,6 +3210,7 @@ else
|
|||
echo ""
|
||||
echo "To disable later, re-run this setup script or manually remove the key:"
|
||||
echo " grep -v 'pulse' /root/.ssh/authorized_keys > /tmp/ak && mv /tmp/ak /root/.ssh/authorized_keys"
|
||||
TEMPERATURE_ENABLED=true
|
||||
else
|
||||
echo ""
|
||||
echo "Warning: SSH key not available from Pulse server."
|
||||
|
|
@ -3218,6 +3222,126 @@ else
|
|||
fi
|
||||
fi
|
||||
|
||||
# Offer to configure other Proxmox cluster nodes if temperature monitoring is enabled here
|
||||
if [ "$TEMPERATURE_ENABLED" = true ] && command -v pvecm >/dev/null 2>&1 && command -v ssh >/dev/null 2>&1; then
|
||||
CLUSTER_OUTPUT=$(pvecm nodes 2>/dev/null || true)
|
||||
if [ -n "$CLUSTER_OUTPUT" ]; then
|
||||
LOCAL_NODE=$(hostname -s 2>/dev/null || hostname)
|
||||
CLUSTER_NODES=$(echo "$CLUSTER_OUTPUT" | awk 'NR>1 && $1 ~ /^[0-9]+$/ {print $3}')
|
||||
|
||||
if [ -n "$CLUSTER_NODES" ]; then
|
||||
OTHER_NODES_LIST=()
|
||||
while read -r NODE_NAME; do
|
||||
if [ -n "$NODE_NAME" ] && [ "$NODE_NAME" != "$LOCAL_NODE" ]; then
|
||||
# Avoid duplicates
|
||||
SKIP_NODE=false
|
||||
for EXISTING in "${OTHER_NODES_LIST[@]}"; do
|
||||
if [ "$EXISTING" = "$NODE_NAME" ]; then
|
||||
SKIP_NODE=true
|
||||
break
|
||||
fi
|
||||
done
|
||||
if [ "$SKIP_NODE" = false ]; then
|
||||
OTHER_NODES_LIST+=("$NODE_NAME")
|
||||
fi
|
||||
fi
|
||||
done <<< "$CLUSTER_NODES"
|
||||
|
||||
if [ ${#OTHER_NODES_LIST[@]} -gt 0 ]; then
|
||||
echo ""
|
||||
echo "Detected additional Proxmox nodes in cluster:"
|
||||
for NODE in "${OTHER_NODES_LIST[@]}"; do
|
||||
echo " • $NODE"
|
||||
done
|
||||
echo ""
|
||||
echo "Configure temperature monitoring on these nodes as well?"
|
||||
echo -n "[y/N]: "
|
||||
|
||||
if [ -t 0 ]; then
|
||||
read -p "> " -n 1 -r REMOTE_REPLY
|
||||
else
|
||||
if read -p "> " -n 1 -r REMOTE_REPLY </dev/tty 2>/dev/null; then
|
||||
:
|
||||
else
|
||||
echo "(No terminal available - skipping remote configuration)"
|
||||
REMOTE_REPLY="n"
|
||||
fi
|
||||
fi
|
||||
echo ""
|
||||
echo ""
|
||||
|
||||
if [[ $REMOTE_REPLY =~ ^[Yy]$ ]]; then
|
||||
for NODE in "${OTHER_NODES_LIST[@]}"; do
|
||||
echo "Configuring temperature monitoring on $NODE..."
|
||||
# Push the restricted Pulse key and lm-sensors setup to the remote cluster node.
#
# The heredoc is intentionally UNQUOTED: SSH_PUBLIC_KEY and
# SSH_RESTRICTED_KEY_ENTRY are expanded here, on the local node, so the real
# key text is embedded in the script sent to the remote shell. Every other
# dollar sign is backslash-escaped so that it is evaluated on the remote node.
# The key values are wrapped in single quotes remotely, which assumes they
# contain no single-quote character.
if ssh -o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=5 -o LogLevel=ERROR root@"$NODE" "bash -s" <<EOF
set -e
SSH_PUBLIC_KEY='$SSH_PUBLIC_KEY'
SSH_RESTRICTED_KEY_ENTRY='$SSH_RESTRICTED_KEY_ENTRY'
mkdir -p /root/.ssh
chmod 700 /root/.ssh
AUTH_KEYS=/root/.ssh/authorized_keys
if [ -f "\$AUTH_KEYS" ] && grep -qF "\$SSH_PUBLIC_KEY" "\$AUTH_KEYS" 2>/dev/null; then
  # grep -v exits 1 when nothing is left to print (the Pulse key was the only
  # entry); tolerate that so set -e does not abort, but still fail on real errors.
  grep -vF "\$SSH_PUBLIC_KEY" "\$AUTH_KEYS" > "\$AUTH_KEYS.tmp" || [ \$? -eq 1 ]
  mv "\$AUTH_KEYS.tmp" "\$AUTH_KEYS"
fi
if [ ! -f "\$AUTH_KEYS" ] || ! grep -qF "\$SSH_RESTRICTED_KEY_ENTRY" "\$AUTH_KEYS" 2>/dev/null; then
  echo "\$SSH_RESTRICTED_KEY_ENTRY" >> "\$AUTH_KEYS"
fi
chmod 600 "\$AUTH_KEYS"
if ! command -v sensors >/dev/null 2>&1; then
  echo " - Installing lm-sensors..."
  export DEBIAN_FRONTEND=noninteractive
  APT_LOG=\$(mktemp)
  if ! apt-get update -qq >"\$APT_LOG" 2>&1; then
    echo " ! apt-get update failed."
    if grep -qi "enterprise.proxmox.com" "\$APT_LOG"; then
      echo " - Detected Proxmox enterprise repository without subscription; switching to no-subscription repository."
      if [ -f /etc/apt/sources.list.d/pve-enterprise.list ]; then
        cp /etc/apt/sources.list.d/pve-enterprise.list /etc/apt/sources.list.d/pve-enterprise.list.pulsebak 2>/dev/null || true
        if grep -q "^[[:space:]]*deb" /etc/apt/sources.list.d/pve-enterprise.list; then
          sed -i 's|^[[:space:]]*deb|# Pulse auto-disabled: deb|' /etc/apt/sources.list.d/pve-enterprise.list
        fi
      fi
      if [ ! -f /etc/apt/sources.list.d/pve-no-subscription.list ]; then
        # A failed lookup must not trip set -e, otherwise the fallback below is unreachable.
        CODENAME=\$(. /etc/os-release 2>/dev/null && echo "\$VERSION_CODENAME") || true
        if [ -z "\$CODENAME" ]; then
          CODENAME=\$(lsb_release -cs 2>/dev/null || echo "bookworm")
        fi
        echo "deb http://download.proxmox.com/debian/pve \$CODENAME pve-no-subscription" > /etc/apt/sources.list.d/pve-no-subscription.list
      fi
      if apt-get update -qq >>"\$APT_LOG" 2>&1; then
        echo " ✓ Switched to no-subscription repository."
      else
        echo " ! apt-get update still failed after switching repositories."
      fi
    else
      echo " ! apt-get update error was not recognized. Please review apt configuration on this node."
    fi
  fi
  if apt-get install -y -qq lm-sensors >/dev/null 2>&1; then
    sensors-detect --auto >/dev/null 2>&1 || true
    echo " ✓ lm-sensors installed"
  else
    echo " ! Failed to install lm-sensors automatically. Please resolve apt issues and rerun this script."
  fi
  rm -f "\$APT_LOG"
else
  echo " ✓ lm-sensors package verified"
fi
EOF
then
  echo " ✓ Temperature monitoring enabled on $NODE"
else
  echo " ✗ Failed to configure $NODE (check SSH/cluster connectivity)"
fi
|
||||
echo ""
|
||||
done
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
||||
echo "Setup Complete"
|
||||
|
|
|
|||
|
|
@ -98,6 +98,74 @@ func getNodeDisplayName(instance *config.PVEInstance, nodeName string) string {
|
|||
return baseName
|
||||
}
|
||||
|
||||
func mergeNVMeTempsIntoDisks(disks []models.PhysicalDisk, nodes []models.Node) []models.PhysicalDisk {
|
||||
if len(disks) == 0 || len(nodes) == 0 {
|
||||
return disks
|
||||
}
|
||||
|
||||
nvmeTempsByNode := make(map[string][]models.NVMeTemp)
|
||||
for _, node := range nodes {
|
||||
if node.Temperature == nil || !node.Temperature.Available || len(node.Temperature.NVMe) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
temps := make([]models.NVMeTemp, len(node.Temperature.NVMe))
|
||||
copy(temps, node.Temperature.NVMe)
|
||||
sort.Slice(temps, func(i, j int) bool {
|
||||
return temps[i].Device < temps[j].Device
|
||||
})
|
||||
|
||||
nvmeTempsByNode[node.Name] = temps
|
||||
}
|
||||
|
||||
if len(nvmeTempsByNode) == 0 {
|
||||
return disks
|
||||
}
|
||||
|
||||
updated := make([]models.PhysicalDisk, len(disks))
|
||||
copy(updated, disks)
|
||||
|
||||
disksByNode := make(map[string][]int)
|
||||
for i := range updated {
|
||||
if strings.EqualFold(updated[i].Type, "nvme") {
|
||||
disksByNode[updated[i].Node] = append(disksByNode[updated[i].Node], i)
|
||||
}
|
||||
}
|
||||
|
||||
for nodeName, diskIndexes := range disksByNode {
|
||||
temps, ok := nvmeTempsByNode[nodeName]
|
||||
if !ok || len(temps) == 0 {
|
||||
for _, idx := range diskIndexes {
|
||||
updated[idx].Temperature = 0
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
sort.Slice(diskIndexes, func(i, j int) bool {
|
||||
return updated[diskIndexes[i]].DevPath < updated[diskIndexes[j]].DevPath
|
||||
})
|
||||
|
||||
for _, idx := range diskIndexes {
|
||||
updated[idx].Temperature = 0
|
||||
}
|
||||
|
||||
for idx, diskIdx := range diskIndexes {
|
||||
if idx >= len(temps) {
|
||||
break
|
||||
}
|
||||
|
||||
tempVal := temps[idx].Temp
|
||||
if tempVal <= 0 || math.IsNaN(tempVal) {
|
||||
continue
|
||||
}
|
||||
|
||||
updated[diskIdx].Temperature = int(math.Round(tempVal))
|
||||
}
|
||||
}
|
||||
|
||||
return updated
|
||||
}
|
||||
|
||||
func lookupClusterEndpointLabel(instance *config.PVEInstance, nodeName string) string {
|
||||
if instance == nil {
|
||||
return ""
|
||||
|
|
@ -2100,7 +2168,18 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
|
|||
Dur("sinceLastPoll", time.Since(lastPoll)).
|
||||
Dur("interval", pollingInterval).
|
||||
Msg("Skipping physical disk poll - interval not elapsed")
|
||||
// Don't clear existing data, just skip the poll
|
||||
// Refresh NVMe temperatures using the latest sensor data even when we skip the disk poll
|
||||
currentState := m.state.GetSnapshot()
|
||||
existing := make([]models.PhysicalDisk, 0)
|
||||
for _, disk := range currentState.PhysicalDisks {
|
||||
if disk.Instance == instanceName {
|
||||
existing = append(existing, disk)
|
||||
}
|
||||
}
|
||||
if len(existing) > 0 {
|
||||
updated := mergeNVMeTempsIntoDisks(existing, modelNodes)
|
||||
m.state.UpdatePhysicalDisks(instanceName, updated)
|
||||
}
|
||||
} else {
|
||||
log.Debug().
|
||||
Int("nodeCount", len(nodes)).
|
||||
|
|
@ -2228,6 +2307,8 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
|
|||
}
|
||||
}
|
||||
|
||||
allDisks = mergeNVMeTempsIntoDisks(allDisks, modelNodes)
|
||||
|
||||
// Update physical disks in state
|
||||
log.Debug().
|
||||
Str("instance", instanceName).
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue