Add SMART temperature collection for physical disks (related to #652)

Extends temperature monitoring to collect SMART temps for SATA/SAS disks,
addressing issue #652 where physical disk temperatures showed as empty.

Architecture:
- Deploys pulse-sensor-wrapper.sh as SSH forced command on Proxmox nodes
- Wrapper collects both CPU/GPU temps (sensors -j) and disk temps (smartctl)
- Implements 30-min cache with background refresh to avoid performance impact
- Uses smartctl -n standby,after to skip sleeping drives without waking them
- Returns unified JSON: {sensors: {...}, smart: [...]}

Backend changes:
- Add DiskTemp model with device, serial, WWN, temperature, lastUpdated
- Extend Temperature model with SMART []DiskTemp field and HasSMART flag
- Add WWN field to PhysicalDisk for reliable disk matching
- Update parseSensorsJSON to handle both legacy and new wrapper formats
- Rewrite mergeNVMeTempsIntoDisks to match SMART temps by WWN → serial → devpath
- Preserve legacy NVMe temperature support for backward compatibility

Performance considerations:
- SMART data cached for 30 minutes per node to avoid excessive smartctl calls
- Background refresh prevents blocking temperature requests
- Respects drive standby state to avoid spinning up idle arrays
- Staggered disk scanning with 0.1s delay to avoid saturating SATA controllers

Install script:
- Deploys wrapper to /usr/local/bin/pulse-sensor-wrapper.sh
- Updates SSH forced command from "sensors -j" to wrapper script
- Backward compatible - falls back to direct sensors output if wrapper missing

Testing note:
- Requires real hardware with smartmontools installed for full functionality
- Empty smart array returned gracefully when smartctl unavailable
- Legacy sensor-only nodes continue working without changes
This commit is contained in:
rcourtman 2025-11-07 11:46:57 +00:00
parent 50cf34a2da
commit 2a79d57f73
4 changed files with 293 additions and 35 deletions

View file

@ -491,6 +491,7 @@ type PhysicalDisk struct {
DevPath string `json:"devPath"` // /dev/nvme0n1, /dev/sda
Model string `json:"model"`
Serial string `json:"serial"`
WWN string `json:"wwn"` // World Wide Name
Type string `json:"type"` // nvme, sata, sas
Size int64 `json:"size"` // bytes
Health string `json:"health"` // PASSED, FAILED, UNKNOWN
@ -774,11 +775,13 @@ type Temperature struct {
MaxRecorded time.Time `json:"maxRecorded,omitempty"` // When maximum temperature was recorded
Cores []CoreTemp `json:"cores,omitempty"` // Individual core temperatures
GPU []GPUTemp `json:"gpu,omitempty"` // GPU temperatures
NVMe []NVMeTemp `json:"nvme,omitempty"` // NVMe drive temperatures
NVMe []NVMeTemp `json:"nvme,omitempty"` // NVMe drive temperatures (legacy, from sensor proxy)
SMART []DiskTemp `json:"smart,omitempty"` // Physical disk temperatures from SMART data
Available bool `json:"available"` // Whether any temperature data is available
HasCPU bool `json:"hasCPU"` // Whether CPU temperature data is available
HasGPU bool `json:"hasGPU"` // Whether GPU temperature data is available
HasNVMe bool `json:"hasNVMe"` // Whether NVMe temperature data is available
HasSMART bool `json:"hasSMART"` // Whether SMART disk temperature data is available
LastUpdate time.Time `json:"lastUpdate"` // When this data was collected
}
@ -802,6 +805,18 @@ type NVMeTemp struct {
Temp float64 `json:"temp"`
}
// DiskTemp is one physical-disk temperature sample derived from SMART data.
type DiskTemp struct {
	// Device is the device path (e.g., /dev/sda).
	Device string `json:"device"`
	// Serial is the disk serial number; omitted from JSON when empty.
	Serial string `json:"serial,omitempty"`
	// WWN is the disk's World Wide Name; omitted from JSON when empty.
	WWN string `json:"wwn,omitempty"`
	// Model is the disk model; omitted from JSON when empty.
	Model string `json:"model,omitempty"`
	// Type is the transport type (sata, sas, nvme); omitted from JSON when empty.
	Type string `json:"type,omitempty"`
	// Temperature is the reading in degrees Celsius.
	Temperature int `json:"temperature"`
	// LastUpdated is when this reading was taken.
	LastUpdated time.Time `json:"lastUpdated"`
	// StandbySkipped is true if the disk was in standby and was not queried.
	StandbySkipped bool `json:"standbySkipped,omitempty"`
}
// Metric represents a time-series metric
type Metric struct {
Timestamp time.Time `json:"timestamp"`

View file

@ -121,31 +121,104 @@ func mergeNVMeTempsIntoDisks(disks []models.PhysicalDisk, nodes []models.Node) [
return disks
}
// Build temperature maps by node for both SMART and legacy NVMe data
smartTempsByNode := make(map[string][]models.DiskTemp)
nvmeTempsByNode := make(map[string][]models.NVMeTemp)
for _, node := range nodes {
if node.Temperature == nil || !node.Temperature.Available || len(node.Temperature.NVMe) == 0 {
if node.Temperature == nil || !node.Temperature.Available {
continue
}
temps := make([]models.NVMeTemp, len(node.Temperature.NVMe))
copy(temps, node.Temperature.NVMe)
sort.Slice(temps, func(i, j int) bool {
return temps[i].Device < temps[j].Device
})
// Collect SMART temps (preferred source)
if len(node.Temperature.SMART) > 0 {
temps := make([]models.DiskTemp, len(node.Temperature.SMART))
copy(temps, node.Temperature.SMART)
smartTempsByNode[node.Name] = temps
}
nvmeTempsByNode[node.Name] = temps
// Collect legacy NVMe temps as fallback
if len(node.Temperature.NVMe) > 0 {
temps := make([]models.NVMeTemp, len(node.Temperature.NVMe))
copy(temps, node.Temperature.NVMe)
sort.Slice(temps, func(i, j int) bool {
return temps[i].Device < temps[j].Device
})
nvmeTempsByNode[node.Name] = temps
}
}
if len(nvmeTempsByNode) == 0 {
if len(smartTempsByNode) == 0 && len(nvmeTempsByNode) == 0 {
return disks
}
updated := make([]models.PhysicalDisk, len(disks))
copy(updated, disks)
// Process SMART temperatures first (preferred method)
for i := range updated {
smartTemps, ok := smartTempsByNode[updated[i].Node]
if !ok || len(smartTemps) == 0 {
continue
}
// Try to match by WWN (most reliable)
if updated[i].WWN != "" {
for _, temp := range smartTemps {
if temp.WWN != "" && strings.EqualFold(temp.WWN, updated[i].WWN) {
if temp.Temperature > 0 && !temp.StandbySkipped {
updated[i].Temperature = temp.Temperature
log.Debug().
Str("disk", updated[i].DevPath).
Str("wwn", updated[i].WWN).
Int("temp", temp.Temperature).
Msg("Matched SMART temperature by WWN")
}
continue
}
}
}
// Fall back to serial number match (case-insensitive)
if updated[i].Serial != "" && updated[i].Temperature == 0 {
for _, temp := range smartTemps {
if temp.Serial != "" && strings.EqualFold(temp.Serial, updated[i].Serial) {
if temp.Temperature > 0 && !temp.StandbySkipped {
updated[i].Temperature = temp.Temperature
log.Debug().
Str("disk", updated[i].DevPath).
Str("serial", updated[i].Serial).
Int("temp", temp.Temperature).
Msg("Matched SMART temperature by serial")
}
continue
}
}
}
// Last resort: match by device path (normalized)
if updated[i].Temperature == 0 {
normalizedDevPath := strings.TrimPrefix(updated[i].DevPath, "/dev/")
for _, temp := range smartTemps {
normalizedTempDev := strings.TrimPrefix(temp.Device, "/dev/")
if normalizedTempDev == normalizedDevPath {
if temp.Temperature > 0 && !temp.StandbySkipped {
updated[i].Temperature = temp.Temperature
log.Debug().
Str("disk", updated[i].DevPath).
Int("temp", temp.Temperature).
Msg("Matched SMART temperature by device path")
}
break
}
}
}
}
// Process legacy NVMe temperatures for disks that didn't get SMART data
disksByNode := make(map[string][]int)
for i := range updated {
if strings.EqualFold(updated[i].Type, "nvme") {
if strings.EqualFold(updated[i].Type, "nvme") && updated[i].Temperature == 0 {
disksByNode[updated[i].Node] = append(disksByNode[updated[i].Node], i)
}
}
@ -153,9 +226,6 @@ func mergeNVMeTempsIntoDisks(disks []models.PhysicalDisk, nodes []models.Node) [
for nodeName, diskIndexes := range disksByNode {
temps, ok := nvmeTempsByNode[nodeName]
if !ok || len(temps) == 0 {
for _, idx := range diskIndexes {
updated[idx].Temperature = 0
}
continue
}
@ -163,10 +233,6 @@ func mergeNVMeTempsIntoDisks(disks []models.PhysicalDisk, nodes []models.Node) [
return updated[diskIndexes[i]].DevPath < updated[diskIndexes[j]].DevPath
})
for _, idx := range diskIndexes {
updated[idx].Temperature = 0
}
for idx, diskIdx := range diskIndexes {
if idx >= len(temps) {
break
@ -178,6 +244,10 @@ func mergeNVMeTempsIntoDisks(disks []models.PhysicalDisk, nodes []models.Node) [
}
updated[diskIdx].Temperature = int(math.Round(tempVal))
log.Debug().
Str("disk", updated[diskIdx].DevPath).
Int("temp", updated[diskIdx].Temperature).
Msg("Matched legacy NVMe temperature by index")
}
}
@ -5787,6 +5857,7 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
DevPath: disk.DevPath,
Model: disk.Model,
Serial: disk.Serial,
WWN: disk.WWN,
Type: disk.Type,
Size: disk.Size,
Health: disk.Health,

View file

@ -302,32 +302,41 @@ func (tc *TemperatureCollector) disableLegacySSHOnAuthFailure(err error, nodeNam
return true
}
// parseSensorsJSON parses the JSON output from `sensors -j`
// parseSensorsJSON parses the JSON output from the sensor wrapper
func (tc *TemperatureCollector) parseSensorsJSON(jsonStr string) (*models.Temperature, error) {
if strings.TrimSpace(jsonStr) == "" {
return nil, fmt.Errorf("empty sensors output")
}
// sensors -j output structure:
// {
// "coretemp-isa-0000": {
// "Package id 0": {"temp1_input": 45.0},
// "Core 0": {"temp2_input": 43.0},
// ...
// },
// "nvme-pci-0400": {
// "Composite": {"temp1_input": 38.9}
// }
// }
// Try to parse as wrapper format first: {sensors: {...}, smart: [...]}
// Fall back to legacy format for backward compatibility
var wrapperData struct {
Sensors map[string]interface{} `json:"sensors"`
SMART []models.DiskTemp `json:"smart"`
}
var sensorsData map[string]interface{}
if err := json.Unmarshal([]byte(jsonStr), &sensorsData); err != nil {
return nil, fmt.Errorf("failed to parse sensors JSON: %w", err)
var smartData []models.DiskTemp
if err := json.Unmarshal([]byte(jsonStr), &wrapperData); err == nil && wrapperData.Sensors != nil {
// New wrapper format
sensorsData = wrapperData.Sensors
smartData = wrapperData.SMART
log.Debug().
Int("smartDisks", len(smartData)).
Msg("Parsed new wrapper format with SMART data")
} else {
// Legacy format: direct sensors -j output
if err := json.Unmarshal([]byte(jsonStr), &sensorsData); err != nil {
return nil, fmt.Errorf("failed to parse sensors JSON: %w", err)
}
log.Debug().Msg("Parsed legacy sensors format (no SMART data)")
}
temp := &models.Temperature{
Cores: []models.CoreTemp{},
NVMe: []models.NVMeTemp{},
SMART: smartData,
}
foundCPUChip := false
@ -405,9 +414,10 @@ func (tc *TemperatureCollector) parseSensorsJSON(jsonStr string) (*models.Temper
temp.HasCPU = foundCPUChip
temp.HasNVMe = len(temp.NVMe) > 0
temp.HasGPU = len(temp.GPU) > 0
temp.HasSMART = len(temp.SMART) > 0
// Available means any temperature data exists (backward compatibility)
temp.Available = temp.HasCPU || temp.HasNVMe || temp.HasGPU
temp.Available = temp.HasCPU || temp.HasNVMe || temp.HasGPU || temp.HasSMART
// Log summary of what was detected
if !foundCPUChip {
@ -424,11 +434,13 @@ func (tc *TemperatureCollector) parseSensorsJSON(jsonStr string) (*models.Temper
Bool("hasCPU", temp.HasCPU).
Bool("hasNVMe", temp.HasNVMe).
Bool("hasGPU", temp.HasGPU).
Bool("hasSMART", temp.HasSMART).
Float64("cpuPackage", temp.CPUPackage).
Float64("cpuMax", temp.CPUMax).
Int("coreCount", len(temp.Cores)).
Int("nvmeCount", len(temp.NVMe)).
Int("gpuCount", len(temp.GPU)).
Int("smartCount", len(temp.SMART)).
Msg("Temperature data parsed successfully")
}

View file

@ -61,6 +61,7 @@ configure_local_authorized_key() {
}
BINARY_PATH="/usr/local/bin/pulse-sensor-proxy"
WRAPPER_SCRIPT="/usr/local/bin/pulse-sensor-wrapper.sh"
SERVICE_PATH="/etc/systemd/system/pulse-sensor-proxy.service"
RUNTIME_DIR="/run/pulse-sensor-proxy"
SOCKET_PATH="${RUNTIME_DIR}/pulse-sensor-proxy.sock"
@ -783,6 +784,165 @@ fi
print_info "Socket ready at $SOCKET_PATH"
# Install sensor wrapper script for combined sensor and SMART data collection
print_info "Installing sensor wrapper script..."
cat > "$WRAPPER_SCRIPT" << 'WRAPPER_EOF'
#!/bin/bash
#
# pulse-sensor-wrapper.sh
# Combined sensor and SMART temperature collection for Pulse monitoring
#
# This script is deployed as the SSH forced command for the sensor proxy.
# It collects CPU/GPU temps via sensors and disk temps via smartctl,
# returning a unified JSON payload.
# Strict mode: abort on any failing command (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Configuration
# CACHE_DIR holds the cached SMART results (smart-temps.json).
CACHE_DIR="/var/cache/pulse-sensor-proxy"
SMART_CACHE_TTL=1800 # 30 minutes; a cache file older than this is treated as stale
MAX_SMARTCTL_TIME=5 # seconds per disk; used as the timeout for each smartctl call
# Ensure cache directory exists (best effort: errors are silenced and ignored)
mkdir -p "$CACHE_DIR" 2>/dev/null || true
# Function to get cached SMART data
#
# Prints a JSON array on stdout and always returns 0:
#   - the contents of $CACHE_DIR/smart-temps.json when that file is younger
#     than SMART_CACHE_TTL seconds AND parses as a JSON array;
#   - "[]" otherwise, in which case a background refresh is started unless a
#     process matching "pulse-sensor-wrapper-refresh" is already running.
get_cached_smart() {
    local cache_file="$CACHE_DIR/smart-temps.json"
    local now mtime age
    now=$(date +%s)

    # Check if cache exists and is fresh
    if [[ -f "$cache_file" ]]; then
        mtime=$(stat -c %Y "$cache_file" 2>/dev/null || echo 0)
        age=$((now - mtime))
        # An empty or corrupt cache file must not be handed to the caller:
        # the output is fed to `jq --argjson`, which rejects invalid JSON.
        if [[ $age -lt $SMART_CACHE_TTL ]] && jq -e 'type == "array"' "$cache_file" >/dev/null 2>&1; then
            cat "$cache_file"
            return 0
        fi
    fi

    # Cache miss, stale or unusable - return empty array and trigger background refresh
    echo "[]"

    # Trigger async refresh if not already running.
    # The child's stdin/stdout/stderr are detached on purpose: this function is
    # called inside a command substitution, which waits until every process
    # holding the write end of its pipe has exited. Without the redirections
    # the "background" refresh would block the caller and could append text to
    # the JSON printed above.
    if ! pgrep -f "pulse-sensor-wrapper-refresh" >/dev/null 2>&1; then
        (refresh_smart_cache </dev/null >/dev/null 2>&1 &)
    fi

    return 0
}
# Function to refresh SMART cache in background
#
# Scans every whole disk reported by lsblk with smartctl and atomically
# rewrites $CACHE_DIR/smart-temps.json with a JSON array of entries:
#   {device, serial, wwn, model, type, temperature, lastUpdated, standbySkipped}
# Disks reported as sleeping/standby are recorded with temperature null and
# standbySkipped true; they are not woken up. Disks without a usable integer
# temperature are left out.
#
# Concurrency: the whole refresh runs in a subshell holding a non-blocking
# flock on $CACHE_DIR/smart-refresh.lock, so overlapping invocations exit
# immediately instead of scanning the disks twice. (Previously the function
# began with `exec -a ... bash`, which replaced the process and meant none of
# the code below ever ran.)
refresh_smart_cache() {
    local cache_file="$CACHE_DIR/smart-temps.json"
    local lock_file="$CACHE_DIR/smart-refresh.lock"

    (
        # Single-flight guard; skipped when flock is not installed.
        if command -v flock >/dev/null 2>&1; then
            flock -n 9 || exit 0
        fi

        temp_file="${cache_file}.tmp.${BASHPID}"
        # Never leave a half-written temp file behind if the scan aborts.
        trap 'rm -f "$temp_file"' EXIT

        # Find all physical disks (skip partitions, loop devices, etc.)
        disks=()
        while IFS= read -r dev; do
            if [[ -b "$dev" ]]; then
                disks+=("$dev")
            fi
        done < <(lsblk -nd -o NAME,TYPE 2>/dev/null | awk '$2=="disk" {print "/dev/"$1}')

        results=()
        # The ${arr[@]+...} form keeps `set -u` happy on bash versions that
        # treat an empty array as unset.
        for dev in ${disks[@]+"${disks[@]}"}; do
            # -n standby : do not query (and so do not spin up) a drive that is
            #              in sleep/standby mode
            # -i -A      : identity (serial, model, WWN) plus attributes
            # --json=o   : JSON output that also carries the original text lines
            # timeout    : prevent hanging on problematic drives
            # smartctl's exit status is a bit mask in which several bits only
            # flag drive-health conditions, so the result is judged by its
            # content rather than by the exit code.
            output=$(timeout "${MAX_SMARTCTL_TIME}s" smartctl -n standby -i -A --json=o "$dev" 2>/dev/null) || true

            # First non-null of: generic temperature, ATA attribute 194, NVMe log.
            temp=$(jq -r 'first(
                    (.temperature.current,
                     (.ata_smart_attributes.table[]? | select(.id == 194) | .raw.value),
                     .nvme_smart_health_information_log.temperature)
                    | select(. != null)
                )' <<<"$output" 2>/dev/null) || temp=""

            if [[ "$temp" =~ ^[0-9]+$ ]]; then
                # smartctl reports the WWN as three integers (naa, oui, id).
                # Join them into the usual 0x-prefixed 16-hex-digit form.
                # NOTE(review): assumes the backend's disk WWNs use this same
                # 0x... representation - confirm against the Proxmox disk list.
                wwn=""
                wwn_parts=$(jq -r 'select((.wwn | type) == "object")
                    | "\(.wwn.naa // 0) \(.wwn.oui // 0) \(.wwn.id // 0)"' <<<"$output" 2>/dev/null) || wwn_parts=""
                if [[ "$wwn_parts" =~ ^([0-9]+)\ ([0-9]+)\ ([0-9]+)$ ]]; then
                    printf -v wwn '0x%x%06x%09x' \
                        "${BASH_REMATCH[1]}" "${BASH_REMATCH[2]}" "${BASH_REMATCH[3]}"
                fi

                entry=$(jq -c \
                    --arg dev "$dev" \
                    --arg wwn "$wwn" \
                    --argjson temp "$temp" \
                    --arg updated "$(date -Iseconds)" \
                    '{
                        device: $dev,
                        serial: (.serial_number // ""),
                        wwn: $wwn,
                        model: (.model_name // .model_family // ""),
                        type: (.device.type // ""),
                        temperature: $temp,
                        lastUpdated: $updated,
                        standbySkipped: false
                    }' <<<"$output" 2>/dev/null) || continue
                results+=("$entry")
            elif grep -qiE 'is in (sleep|standby)' <<<"$output"; then
                # Drive is in standby - record it but don't wake it.
                # smartctl prints the mode in upper case, hence the -i.
                entry=$(jq -n -c \
                    --arg dev "$dev" \
                    --arg updated "$(date -Iseconds)" \
                    '{
                        device: $dev,
                        temperature: null,
                        lastUpdated: $updated,
                        standbySkipped: true
                    }') || continue
                results+=("$entry")
            fi

            # Small delay between disks to avoid saturating SATA controller
            sleep 0.1
        done

        # Build final JSON array
        json="[]"
        if [[ ${#results[@]} -gt 0 ]]; then
            json=$(printf '%s\n' "${results[@]}" | jq -s '.')
        fi

        # Atomic write to cache: finish the temp file (including its mode)
        # first, then rename it over the published path.
        echo "$json" > "$temp_file"
        chmod 644 "$temp_file" 2>/dev/null || true
        mv -f "$temp_file" "$cache_file"
    ) 9>"$lock_file"
}
# Main execution
#
# Emits the unified payload {sensors: {...}, smart: [...]} on stdout.
# Each input is validated before it is handed to `jq --argjson`, because an
# invalid value there would abort the script (set -e) and leave the caller
# with no output at all.

# Collect sensor data (CPU, GPU temps).
# Keep whatever `sensors -j` printed even if it exited non-zero, then accept
# it only if it is exactly one JSON object; otherwise fall back to {}.
sensors_data=$(sensors -j 2>/dev/null) || true
if ! jq -e -s 'length == 1 and (.[0] | type == "object")' <<<"$sensors_data" >/dev/null 2>&1; then
    sensors_data='{}'
fi

# Get SMART data from cache; fall back to [] unless it is exactly one JSON array.
smart_data=$(get_cached_smart) || true
if ! jq -e -s 'length == 1 and (.[0] | type == "array")' <<<"$smart_data" >/dev/null 2>&1; then
    smart_data='[]'
fi

# Combine into unified payload
jq -n \
    --argjson sensors "$sensors_data" \
    --argjson smart "$smart_data" \
    '{
        sensors: $sensors,
        smart: $smart
    }'
WRAPPER_EOF
chmod +x "$WRAPPER_SCRIPT"
print_success "Sensor wrapper installed at $WRAPPER_SCRIPT"
# Install cleanup system for automatic SSH key removal when nodes are deleted
print_info "Installing cleanup system..."
@ -1022,7 +1182,7 @@ if command -v pvecm >/dev/null 2>&1; then
print_info "Discovered cluster nodes: $(echo $CLUSTER_NODES | tr '\n' ' ')"
# Configure SSH key with forced command restriction
FORCED_CMD='command="sensors -j",no-port-forwarding,no-X11-forwarding,no-agent-forwarding,no-pty'
FORCED_CMD='command="/usr/local/bin/pulse-sensor-wrapper.sh",no-port-forwarding,no-X11-forwarding,no-agent-forwarding,no-pty'
AUTH_LINE="${FORCED_CMD} ${PROXY_PUBLIC_KEY} # pulse-managed-key"
# Track SSH key push results
@ -1099,7 +1259,7 @@ if command -v pvecm >/dev/null 2>&1; then
print_info "No cluster detected, configuring standalone node..."
# Configure SSH key with forced command restriction
FORCED_CMD='command="sensors -j",no-port-forwarding,no-X11-forwarding,no-agent-forwarding,no-pty'
FORCED_CMD='command="/usr/local/bin/pulse-sensor-wrapper.sh",no-port-forwarding,no-X11-forwarding,no-agent-forwarding,no-pty'
AUTH_LINE="${FORCED_CMD} ${PROXY_PUBLIC_KEY} # pulse-managed-key"
print_info "Authorizing proxy key on localhost..."
@ -1113,7 +1273,7 @@ else
print_info "Configuring SSH key for localhost..."
# Configure localhost as fallback
FORCED_CMD='command="sensors -j",no-port-forwarding,no-X11-forwarding,no-agent-forwarding,no-pty'
FORCED_CMD='command="/usr/local/bin/pulse-sensor-wrapper.sh",no-port-forwarding,no-X11-forwarding,no-agent-forwarding,no-pty'
AUTH_LINE="${FORCED_CMD} ${PROXY_PUBLIC_KEY} # pulse-managed-key"
configure_local_authorized_key "$AUTH_LINE"