Fix alert system reliability issues and update audit report

- Fix stale alerts not clearing when nodes/hosts go offline in CheckNode and HandleHostOffline
- Fix stale alerts persisting when thresholds are disabled or set to 0 in CheckGuest and CheckNode
- Fix CheckHost to properly clear disk alerts when overrides disable them
- Update audit_report.md with findings from the Alert System Reliability Audit
This commit is contained in:
rcourtman 2026-02-04 12:50:36 +00:00
parent 5ccf8ac7bc
commit 5073c10030
2 changed files with 94 additions and 26 deletions

View file

@ -35,3 +35,25 @@ A comprehensive security audit of the Pulse codebase was conducted, focusing on
## Conclusion
The application security posture has been significantly improved with the remediation of the RCE and SSRF vulnerabilities. The remaining identified risks are low or accepted features. The application is ready for release from a security perspective.
## Alert System Reliability Audit
### Executive Summary
A focused audit of the Alert System was conducted to identify reliability issues such as stale alerts and incorrect clearing logic. Critical bugs causing "zombie interrupts" (stale alerts that never clear) were identified and fixed.
### Findings & Fixes
### 1. Stale Alerts on Node/Host Offline
- **Problem**: When a Proxmox Node or Pulse Host Agent went offline, the system correctly raised a connectivity alert but failed to clear existing resource alerts (High CPU/Memory/Disk). This resulted in contradictory states (e.g., "Node Offline" and "High CPU" simultaneously).
- **Fix**: Updated `CheckNode` and `HandleHostOffline` to explicitly clear all resource metric alerts when an offline state is confirmed.
### 2. Stale Alerts on Disabled Thresholds
- **Problem**: For optional metrics (Guest Disk I/O, Network I/O, Node/Host Temperature, Disk Usage with Overrides), the system skipped the evaluation logic entirely if the threshold was disabled or nil. This prevented the clean-up logic from running, causing existing alerts to persist indefinitely after a rule was disabled.
- **Fix**: Refactored the checking logic in `CheckGuest`, `CheckNode`, and `CheckHost` to execute unconditionally. The underlying `checkMetric` function now properly handles disabled thresholds by clearing any corresponding active alerts.
### 3. Missing Clear Logic for Global Disable
- **Problem**: Disabling global alert settings (e.g., "Disable all Host alerts") sometimes left specific metric alerts active if they were not explicitly cleared during the state transition.
- **Fix**: Verified and reinforced clearing mechanisms. Specifically, `CheckHost` disk monitoring was updated to ensure alerts are cleared even when specific disk overrides disable monitoring.
### Conclusion
The reliability of the alert system has been significantly improved. Alerts will now correctly reflect the current state of resources, and disabling rules will reliably clear associated alerts.

View file

@ -2576,28 +2576,34 @@ func (m *Manager) CheckGuest(guest interface{}, instanceName string) {
}
// Check I/O metrics (convert bytes/s to MB/s) - checkMetric will skip if threshold is nil or <= 0
if thresholds.DiskRead != nil && thresholds.DiskRead.Trigger > 0 {
// Check I/O metrics (convert bytes/s to MB/s)
// We call checkMetric unconditionally. If the threshold is nil or disabled (Trigger <= 0),
// checkMetric will automatically clear any existing alerts for that metric.
{
readOpts := &metricOptions{MonitorOnly: monitorOnly}
if !monitorOnly {
readOpts = nil
}
m.checkMetric(guestID, name, node, instanceName, guestType, "diskRead", float64(diskRead)/1024/1024, thresholds.DiskRead, readOpts)
}
if thresholds.DiskWrite != nil && thresholds.DiskWrite.Trigger > 0 {
{
writeOpts := &metricOptions{MonitorOnly: monitorOnly}
if !monitorOnly {
writeOpts = nil
}
m.checkMetric(guestID, name, node, instanceName, guestType, "diskWrite", float64(diskWrite)/1024/1024, thresholds.DiskWrite, writeOpts)
}
if thresholds.NetworkIn != nil && thresholds.NetworkIn.Trigger > 0 {
{
netInOpts := &metricOptions{MonitorOnly: monitorOnly}
if !monitorOnly {
netInOpts = nil
}
m.checkMetric(guestID, name, node, instanceName, guestType, "networkIn", float64(netIn)/1024/1024, thresholds.NetworkIn, netInOpts)
}
if thresholds.NetworkOut != nil && thresholds.NetworkOut.Trigger > 0 {
{
netOutOpts := &metricOptions{MonitorOnly: monitorOnly}
if !monitorOnly {
netOutOpts = nil
@ -2660,14 +2666,18 @@ func (m *Manager) CheckNode(node models.Node) {
// CRITICAL: Check if node is offline first
if node.Status == "offline" || node.ConnectionHealth == "error" || node.ConnectionHealth == "failed" {
m.checkNodeOffline(node)
// Clear resource alerts if node is offline/unreachable.
// This prevents stale alerts from persisting when we can't get new data.
metrics := []string{"cpu", "memory", "disk", "temperature"}
for _, metric := range metrics {
m.clearAlert(fmt.Sprintf("%s-%s", node.ID, metric))
}
} else {
// Clear any existing offline alert if node is back online
m.clearNodeOfflineAlert(node)
}
}
// Check each metric (only if node is online) - checkMetric will skip if threshold is nil or <= 0
if node.Status != "offline" {
// Check each metric (only if node is online and reachable)
// Check for host agent deduplication: if a host agent is running on this node,
// prefer the host agent alerts and skip node metric alerts to avoid duplicates.
if m.hasHostAgentForNode(node.Name) {
@ -2680,12 +2690,16 @@ func (m *Manager) CheckNode(node models.Node) {
m.checkMetric(node.ID, node.Name, node.Name, node.Instance, "Node", "disk", node.Disk.Usage, thresholds.Disk, nil)
// Check temperature if available
if node.Temperature != nil && node.Temperature.Available && thresholds.Temperature != nil {
// We pass the check unconditionally so that if the threshold triggers are disabled (set to 0),
// any existing alerts will be properly cleared.
var temp float64
if node.Temperature != nil && node.Temperature.Available {
// Use CPU package temp if available, otherwise use max core temp
temp := node.Temperature.CPUPackage
temp = node.Temperature.CPUPackage
if temp == 0 {
temp = node.Temperature.CPUMax
}
}
m.checkMetric(node.ID, node.Name, node.Name, node.Instance, "Node", "temperature", temp, thresholds.Temperature, nil)
}
}
@ -2965,8 +2979,9 @@ func (m *Manager) CheckHost(host models.Host) {
effectiveDiskThreshold = thresholds.Disk
}
// Skip if no threshold configured or threshold is disabled (trigger=0)
if effectiveDiskThreshold == nil || effectiveDiskThreshold.Trigger <= 0 {
// Skip if no threshold configured (nil)
// We DO NOT skip if Trigger <= 0 because we need to call checkMetric to clear any existing alerts.
if effectiveDiskThreshold == nil {
continue
}
@ -3246,6 +3261,37 @@ func (m *Manager) HandleHostOffline(host models.Host) {
return
}
// Host is confirmed offline. Clear all resource metrics (CPU/Memory/Disk/RAID)
// before raising the offline alert, to avoid stale alerts persisting.
{
// Basic metrics
metricTypes := []string{"cpu", "memory"}
for _, mt := range metricTypes {
m.clearAlertNoLock(fmt.Sprintf("%s-%s", resourceKey, mt))
}
// Disks and RAID
// Note: Disks use ResourceID prefix, RAID uses AlertID prefix
diskResourcePrefix := fmt.Sprintf("%s/disk:", resourceKey)
raidAlertPrefix := fmt.Sprintf("host-%s-raid-", host.ID)
// Collect alert IDs first, then clear (avoids modifying map during iteration)
var alertsToClear []string
for alertID, a := range m.activeAlerts {
if a == nil {
continue
}
if strings.HasPrefix(a.ResourceID, diskResourcePrefix) {
alertsToClear = append(alertsToClear, alertID)
} else if strings.HasPrefix(alertID, raidAlertPrefix) {
alertsToClear = append(alertsToClear, alertID)
}
}
for _, alertID := range alertsToClear {
m.clearAlertNoLock(alertID)
}
}
alert := &Alert{
ID: alertID,
Type: "host-offline",