mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-20 01:01:20 +00:00
Use disk type thresholds for SMART temperatures
This commit is contained in:
parent
3cecf9576d
commit
8197fe6b1e
2 changed files with 160 additions and 2 deletions
|
|
@ -304,6 +304,16 @@ func (m *Manager) CheckHost(host models.Host) {
|
|||
if len(host.Sensors.SMART) > 0 {
|
||||
for _, disk := range host.Sensors.SMART {
|
||||
if disk.Temperature > 0 && !disk.Standby {
|
||||
effectiveTempThreshold := thresholds.DiskTemperature
|
||||
if diskType := strings.ToLower(strings.TrimSpace(disk.Type)); diskType != "" {
|
||||
m.mu.RLock()
|
||||
if th, ok := m.config.DiskTempByType[diskType]; ok {
|
||||
t := th
|
||||
effectiveTempThreshold = &t
|
||||
}
|
||||
m.mu.RUnlock()
|
||||
}
|
||||
|
||||
// Use a disk-specific resource ID: <host resource ID>/disk_temp:<sanitized device>
|
||||
tempResourceID := fmt.Sprintf("%s/disk_temp:%s", hostResourceID(host.ID), sanitizeHostComponent(disk.Device))
|
||||
tempResourceName := fmt.Sprintf("%s (%s Temp)", host.DisplayName, disk.Device)
|
||||
|
|
@ -313,7 +323,8 @@ func (m *Manager) CheckHost(host models.Host) {
|
|||
diskTempMetadata["device"] = disk.Device
|
||||
diskTempMetadata["temperature"] = disk.Temperature
|
||||
diskTempMetadata["model"] = disk.Model
|
||||
spec, err := buildCanonicalMetricSpec(tempResourceID, tempResourceName, unifiedresources.ResourceType("agent-disk"), "diskTemperature", thresholds.DiskTemperature)
|
||||
diskTempMetadata["diskType"] = disk.Type
|
||||
spec, err := buildCanonicalMetricSpec(tempResourceID, tempResourceName, unifiedresources.ResourceType("agent-disk"), "diskTemperature", effectiveTempThreshold)
|
||||
if err != nil {
|
||||
log.Warn().
|
||||
Err(err).
|
||||
|
|
@ -324,7 +335,7 @@ func (m *Manager) CheckHost(host models.Host) {
|
|||
continue
|
||||
}
|
||||
|
||||
m.checkMetricWithCanonicalSpec(spec, tempResourceName, nodeName, disk.Device, "agent", float64(disk.Temperature), thresholds.DiskTemperature, &metricOptions{Metadata: diskTempMetadata})
|
||||
m.checkMetricWithCanonicalSpec(spec, tempResourceName, nodeName, disk.Device, "agent", float64(disk.Temperature), effectiveTempThreshold, &metricOptions{Metadata: diskTempMetadata})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,9 @@
|
|||
package alerts
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/rcourtman/pulse-go-rewrite/internal/models"
|
||||
)
|
||||
|
|
@ -42,6 +44,66 @@ func configureDiskTypeHostManager(t *testing.T) *Manager {
|
|||
return m
|
||||
}
|
||||
|
||||
func configureDiskTempTypeHostManager(t *testing.T) *Manager {
|
||||
t.Helper()
|
||||
|
||||
m := newTestManager(t)
|
||||
cfg := AlertConfig{
|
||||
Enabled: true,
|
||||
ActivationState: ActivationActive,
|
||||
AgentDefaults: ThresholdConfig{
|
||||
DiskTemperature: &HysteresisThreshold{Trigger: 55, Clear: 50},
|
||||
},
|
||||
DiskTempByType: map[string]HysteresisThreshold{
|
||||
"nvme": {Trigger: 70, Clear: 65},
|
||||
"sas": {Trigger: 65, Clear: 60},
|
||||
"sata": {Trigger: 55, Clear: 50},
|
||||
},
|
||||
Overrides: map[string]ThresholdConfig{},
|
||||
TimeThresholds: map[string]int{},
|
||||
SuppressionWindow: 0,
|
||||
MinimumDelta: 0,
|
||||
}
|
||||
m.UpdateConfig(cfg)
|
||||
|
||||
m.mu.Lock()
|
||||
m.config.TimeThresholds = map[string]int{}
|
||||
m.config.MetricTimeThresholds = nil
|
||||
m.config.SuppressionWindow = 0
|
||||
m.config.MinimumDelta = 0
|
||||
m.mu.Unlock()
|
||||
|
||||
m.ClearActiveAlerts()
|
||||
return m
|
||||
}
|
||||
|
||||
func hostWithSMARTDiskTemp(id, diskType string, temperature int) models.Host {
|
||||
return models.Host{
|
||||
ID: id,
|
||||
DisplayName: id,
|
||||
Hostname: id,
|
||||
Status: "online",
|
||||
Sensors: models.HostSensorSummary{
|
||||
SMART: []models.HostDiskSMART{
|
||||
{
|
||||
Device: "/dev/" + id,
|
||||
Model: "test-disk",
|
||||
Type: diskType,
|
||||
Temperature: temperature,
|
||||
},
|
||||
},
|
||||
},
|
||||
IntervalSeconds: 30,
|
||||
LastSeen: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func hostDiskTempAlertID(host models.Host) string {
|
||||
disk := host.Sensors.SMART[0]
|
||||
resourceID := fmt.Sprintf("%s/disk_temp:%s", hostResourceID(host.ID), sanitizeHostComponent(disk.Device))
|
||||
return canonicalMetricStateID(resourceID, "diskTemperature")
|
||||
}
|
||||
|
||||
func TestHostDiskFillUsesPerTypeThresholdForNVMe(t *testing.T) {
|
||||
m := configureDiskTypeHostManager(t)
|
||||
|
||||
|
|
@ -167,3 +229,88 @@ func TestStorageTypeBranchNotRegressed(t *testing.T) {
|
|||
|
||||
assertAlertPresent(t, m, canonicalMetricStateID("storage-1", "usage"))
|
||||
}
|
||||
|
||||
func TestHostDiskTempUsesNVMeThreshold(t *testing.T) {
|
||||
m := configureDiskTempTypeHostManager(t)
|
||||
|
||||
hostBelow := hostWithSMARTDiskTemp("host-temp-nvme", "nvme", 62)
|
||||
m.CheckHost(hostBelow)
|
||||
|
||||
trackingKey := hostDiskTempAlertID(hostBelow)
|
||||
if _, exists := testLookupActiveAlert(t, m, trackingKey); exists {
|
||||
t.Fatalf("expected no alert for nvme disk at 62C (nvme trigger 70), got active: %v", alertKeys(m))
|
||||
}
|
||||
m.mu.RLock()
|
||||
if _, pending := m.pendingAlerts[trackingKey]; pending {
|
||||
m.mu.RUnlock()
|
||||
t.Fatalf("expected no pending alert for nvme disk at 62C, but pendingAlerts has %q", trackingKey)
|
||||
}
|
||||
m.mu.RUnlock()
|
||||
|
||||
hostAbove := hostWithSMARTDiskTemp("host-temp-nvme", "nvme", 71)
|
||||
m.CheckHost(hostAbove)
|
||||
|
||||
if _, exists := testLookupActiveAlert(t, m, trackingKey); !exists {
|
||||
t.Fatalf("expected alert for nvme disk at 71C (nvme trigger 70), active: %v", alertKeys(m))
|
||||
}
|
||||
}
|
||||
|
||||
func TestHostDiskTempUsesSASThreshold(t *testing.T) {
|
||||
m := configureDiskTempTypeHostManager(t)
|
||||
|
||||
hostBelow := hostWithSMARTDiskTemp("host-temp-sas", "sas", 64)
|
||||
m.CheckHost(hostBelow)
|
||||
|
||||
trackingKey := hostDiskTempAlertID(hostBelow)
|
||||
if _, exists := testLookupActiveAlert(t, m, trackingKey); exists {
|
||||
t.Fatalf("expected no alert for sas disk at 64C (sas trigger 65), got active: %v", alertKeys(m))
|
||||
}
|
||||
|
||||
hostAbove := hostWithSMARTDiskTemp("host-temp-sas", "sas", 66)
|
||||
m.CheckHost(hostAbove)
|
||||
|
||||
if _, exists := testLookupActiveAlert(t, m, trackingKey); !exists {
|
||||
t.Fatalf("expected alert for sas disk at 66C (sas trigger 65), active: %v", alertKeys(m))
|
||||
}
|
||||
}
|
||||
|
||||
func TestHostDiskTempUsesSATAThreshold(t *testing.T) {
|
||||
m := configureDiskTempTypeHostManager(t)
|
||||
|
||||
hostBelow := hostWithSMARTDiskTemp("host-temp-sata", "sata", 54)
|
||||
m.CheckHost(hostBelow)
|
||||
|
||||
trackingKey := hostDiskTempAlertID(hostBelow)
|
||||
if _, exists := testLookupActiveAlert(t, m, trackingKey); exists {
|
||||
t.Fatalf("expected no alert for sata disk at 54C (sata trigger 55), got active: %v", alertKeys(m))
|
||||
}
|
||||
|
||||
hostAbove := hostWithSMARTDiskTemp("host-temp-sata", "sata", 56)
|
||||
m.CheckHost(hostAbove)
|
||||
|
||||
if _, exists := testLookupActiveAlert(t, m, trackingKey); !exists {
|
||||
t.Fatalf("expected alert for sata disk at 56C (sata trigger 55), active: %v", alertKeys(m))
|
||||
}
|
||||
}
|
||||
|
||||
func TestHostDiskTempPerTypeThresholdDoesNotOverrideDisabledGlobalDefault(t *testing.T) {
|
||||
m := configureDiskTempTypeHostManager(t)
|
||||
|
||||
m.mu.Lock()
|
||||
m.config.AgentDefaults.DiskTemperature = &HysteresisThreshold{Trigger: 0, Clear: 0}
|
||||
m.mu.Unlock()
|
||||
|
||||
host := hostWithSMARTDiskTemp("host-temp-disabled-nvme", "nvme", 71)
|
||||
m.CheckHost(host)
|
||||
|
||||
trackingKey := hostDiskTempAlertID(host)
|
||||
if _, exists := testLookupActiveAlert(t, m, trackingKey); exists {
|
||||
t.Fatalf("expected no alert when global agent disk temperature threshold is disabled, active: %v", alertKeys(m))
|
||||
}
|
||||
m.mu.RLock()
|
||||
if _, pending := m.pendingAlerts[trackingKey]; pending {
|
||||
m.mu.RUnlock()
|
||||
t.Fatalf("expected no pending alert when global agent disk temperature threshold is disabled, but pendingAlerts has %q", trackingKey)
|
||||
}
|
||||
m.mu.RUnlock()
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue