mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 03:20:11 +00:00
fix(alerts): make --disk-exclude suppress Proxmox SSD wear/health alerts (#1142)
The --disk-exclude agent flag only filtered local metric collection; it had no effect on the server-side Proxmox disk health and SSD wearout alerts, because the server generates those by polling the Proxmox API directly. Users excluding disks (e.g. --disk-exclude sda) still received alerts for those disks. The agent now sends its DiskExclude patterns in each report. The server stores them on the Host model and consults them during Proxmox disk polling — excluded disks get a synthetic healthy status passed to CheckDiskHealth so any existing alerts clear immediately. This commit also adds FreeBSD pseudo-filesystem types (fdescfs, devfs, linprocfs, linsysfs) to the virtual FS filter and /var/run/ to the special mount prefixes, fixing false disk-full alerts on FreeBSD for fdescfs mounts.
This commit is contained in:
parent
bc378f0f60
commit
8c7d507ea4
6 changed files with 60 additions and 7 deletions
|
|
@ -374,6 +374,7 @@ func (a *Agent) buildReport(ctx context.Context) (agentshost.Report, error) {
|
|||
Hostname: a.hostname,
|
||||
UpdatedFrom: a.updatedFrom,
|
||||
CommandsEnabled: a.cfg.EnableCommands,
|
||||
DiskExclude: a.cfg.DiskExclude,
|
||||
},
|
||||
Host: agentshost.HostInfo{
|
||||
ID: a.machineID,
|
||||
|
|
|
|||
|
|
@ -199,6 +199,7 @@ type Host struct {
|
|||
TokenLastUsedAt *time.Time `json:"tokenLastUsedAt,omitempty"`
|
||||
Tags []string `json:"tags,omitempty"`
|
||||
IsLegacy bool `json:"isLegacy,omitempty"`
|
||||
DiskExclude []string `json:"diskExclude,omitempty"` // Agent's --disk-exclude patterns
|
||||
|
||||
// Linking: When this host agent is running on a known PVE node/VM/container
|
||||
LinkedNodeID string `json:"linkedNodeId,omitempty"` // ID of the PVE node this agent is running on
|
||||
|
|
|
|||
|
|
@ -2679,6 +2679,7 @@ func (m *Monitor) ApplyHostReport(report agentshost.Report, tokenRecord *config.
|
|||
ReportIP: strings.TrimSpace(report.Host.ReportIP),
|
||||
Tags: append([]string(nil), report.Tags...),
|
||||
IsLegacy: isLegacyHostAgent(report.Agent.Type),
|
||||
DiskExclude: append([]string(nil), report.Agent.DiskExclude...),
|
||||
}
|
||||
|
||||
// Apply any pending commands execution override from server config
|
||||
|
|
@ -6278,6 +6279,23 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
|
|||
}
|
||||
}
|
||||
|
||||
// Build a map of node name -> disk exclusion patterns from linked host agents.
|
||||
// This allows --disk-exclude on the agent to also suppress server-side
|
||||
// Proxmox disk health/wearout alerts for the same disks.
|
||||
diskExcludeByNode := make(map[string][]string)
|
||||
hostByID := make(map[string]models.Host, len(currentState.Hosts))
|
||||
for _, h := range currentState.Hosts {
|
||||
hostByID[h.ID] = h
|
||||
}
|
||||
for _, n := range currentState.Nodes {
|
||||
if n.LinkedHostAgentID == "" || n.Instance != inst {
|
||||
continue
|
||||
}
|
||||
if linkedHost, ok := hostByID[n.LinkedHostAgentID]; ok && len(linkedHost.DiskExclude) > 0 && linkedHost.Status == "online" {
|
||||
diskExcludeByNode[n.Name] = linkedHost.DiskExclude
|
||||
}
|
||||
}
|
||||
|
||||
var allDisks []models.PhysicalDisk
|
||||
polledNodes := make(map[string]bool) // Track which nodes we successfully polled
|
||||
|
||||
|
|
@ -6356,6 +6374,25 @@ func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, clie
|
|||
Int("wearout", disk.Wearout).
|
||||
Msg("Checking disk health")
|
||||
|
||||
// If the linked host agent has --disk-exclude for this disk, pass a
|
||||
// synthetic healthy disk to CheckDiskHealth so any existing alerts
|
||||
// get cleared naturally, then skip the normal health/wearout checks.
|
||||
if excludePatterns, ok := diskExcludeByNode[node.Node]; ok {
|
||||
if fsfilters.MatchesDeviceExclude(disk.DevPath, excludePatterns) {
|
||||
log.Debug().
|
||||
Str("node", node.Node).
|
||||
Str("disk", disk.DevPath).
|
||||
Msg("Disk matches agent --disk-exclude, clearing any alerts")
|
||||
// Synthetic healthy disk: health="PASSED", wearout=100 (full life)
|
||||
// This causes CheckDiskHealth to clear both health and wearout alerts.
|
||||
healthyDisk := disk
|
||||
healthyDisk.Health = "PASSED"
|
||||
healthyDisk.Wearout = 100
|
||||
m.alertManager.CheckDiskHealth(inst, node.Node, healthyDisk)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
normalizedHealth := strings.ToUpper(strings.TrimSpace(disk.Health))
|
||||
if normalizedHealth != "" && normalizedHealth != "UNKNOWN" && normalizedHealth != "PASSED" && normalizedHealth != "OK" {
|
||||
// Disk has failed or is failing - alert manager will handle this
|
||||
|
|
|
|||
|
|
@ -20,13 +20,14 @@ type Report struct {
|
|||
|
||||
// AgentInfo describes the reporting agent.
|
||||
type AgentInfo struct {
|
||||
ID string `json:"id"`
|
||||
Version string `json:"version,omitempty"`
|
||||
Type string `json:"type,omitempty"` // "unified", "host", or "docker" - empty means legacy
|
||||
IntervalSeconds int `json:"intervalSeconds,omitempty"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
UpdatedFrom string `json:"updatedFrom,omitempty"` // Previous version if recently auto-updated
|
||||
CommandsEnabled bool `json:"commandsEnabled,omitempty"` // Whether AI command execution is enabled
|
||||
ID string `json:"id"`
|
||||
Version string `json:"version,omitempty"`
|
||||
Type string `json:"type,omitempty"` // "unified", "host", or "docker" - empty means legacy
|
||||
IntervalSeconds int `json:"intervalSeconds,omitempty"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
UpdatedFrom string `json:"updatedFrom,omitempty"` // Previous version if recently auto-updated
|
||||
CommandsEnabled bool `json:"commandsEnabled,omitempty"` // Whether AI command execution is enabled
|
||||
DiskExclude []string `json:"diskExclude,omitempty"` // Disk exclusion patterns from --disk-exclude flag
|
||||
}
|
||||
|
||||
// HostInfo contains platform and identification details about the monitored host.
|
||||
|
|
|
|||
|
|
@ -77,6 +77,10 @@ var virtualFSTypes = map[string]bool{
|
|||
"overlay": true, // Docker/container overlay filesystems (issue #942)
|
||||
"overlayfs": true, // Alternative overlay name
|
||||
"autofs": true, // Systemd automount placeholders (issue #942)
|
||||
"fdescfs": true, // FreeBSD file descriptor filesystem (issue #1142)
|
||||
"devfs": true, // FreeBSD device filesystem
|
||||
"linprocfs": true, // FreeBSD Linux proc compatibility
|
||||
"linsysfs": true, // FreeBSD Linux sys compatibility
|
||||
}
|
||||
|
||||
// networkFSPatterns are substrings that indicate network/remote filesystems.
|
||||
|
|
@ -88,6 +92,7 @@ var specialMountPrefixes = []string{
|
|||
"/proc",
|
||||
"/sys",
|
||||
"/run",
|
||||
"/var/run/", // FreeBSD (not a symlink to /run like on Linux)
|
||||
"/var/lib/containers",
|
||||
"/snap",
|
||||
}
|
||||
|
|
|
|||
|
|
@ -193,6 +193,14 @@ func TestShouldSkipFilesystem(t *testing.T) {
|
|||
{"Windows C drive - should NOT skip", "NTFS", "C:\\", 500 * 1024 * 1024 * 1024, 200 * 1024 * 1024 * 1024, false},
|
||||
{"Windows D drive - should NOT skip", "NTFS", "D:\\", 1000 * 1024 * 1024 * 1024, 500 * 1024 * 1024 * 1024, false},
|
||||
|
||||
// FreeBSD pseudo filesystems (issue #1142)
|
||||
{"FreeBSD fdescfs", "fdescfs", "/var/run/samba/fd", 1024, 1024, true},
|
||||
{"FreeBSD devfs", "devfs", "/dev", 1024, 100, true},
|
||||
{"FreeBSD linprocfs", "linprocfs", "/compat/linux/proc", 0, 0, true},
|
||||
{"FreeBSD linsysfs", "linsysfs", "/compat/linux/sys", 0, 0, true},
|
||||
{"/var/run/ prefix FreeBSD", "ufs", "/var/run/something", 1024, 100, true},
|
||||
{"/var/runtime should NOT skip", "ufs", "/var/runtime", 1000000, 500000, false},
|
||||
|
||||
// Regular filesystems that should NOT be skipped
|
||||
{"ext4 root", "ext4", "/", 100 * 1024 * 1024 * 1024, 50 * 1024 * 1024 * 1024, false},
|
||||
{"xfs data", "xfs", "/data", 500 * 1024 * 1024 * 1024, 200 * 1024 * 1024 * 1024, false},
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue