Mirror of https://github.com/rcourtman/Pulse.git, synced 2026-04-28 03:20:11 +00:00.
Fix linked host agent threshold overrides
This commit is contained in:
parent
5f3a4b79ba
commit
754aa0e39c
3 changed files with 268 additions and 10 deletions
|
|
@ -1776,10 +1776,7 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
|
|||
continue
|
||||
}
|
||||
thresholds := m.config.HostDefaults
|
||||
// Overrides are keyed by raw host ID (without the "host:" prefix
|
||||
// that hostResourceID adds to the resource ID used in alert IDs).
|
||||
rawHostID := strings.TrimPrefix(resourceID, "host:")
|
||||
if override, exists := m.config.Overrides[rawHostID]; exists {
|
||||
if override, exists := m.resolveHostAlertThresholdOverrideNoLock(alert, resourceID); exists {
|
||||
if override.Disabled {
|
||||
alertsToResolve = append(alertsToResolve, alertID)
|
||||
continue
|
||||
|
|
@ -2908,6 +2905,68 @@ func hostInstanceName(host models.Host) string {
|
|||
return "Host Agent"
|
||||
}
|
||||
|
||||
// metadataStringValue returns the whitespace-trimmed string stored under key
// in an alert metadata map. A nil map, a missing key, or a non-string value
// all yield the empty string.
func metadataStringValue(metadata map[string]interface{}, key string) string {
	if metadata == nil {
		return ""
	}
	if raw, ok := metadata[key].(string); ok {
		return strings.TrimSpace(raw)
	}
	return ""
}
|
||||
|
||||
// resolveHostThresholdOverrideNoLock resolves the most specific threshold override for a host agent.
|
||||
// Explicit host-agent overrides win. When no host override exists, linked node/guest overrides are
|
||||
// inherited so alerts follow the logical resource the host agent is augmenting.
|
||||
// Caller must hold m.mu because guest override lookups may migrate legacy keys in-place.
|
||||
func (m *Manager) resolveHostThresholdOverrideNoLock(hostID, linkedNodeID, linkedVMID, linkedContainerID string) (ThresholdConfig, bool) {
|
||||
if hostID = strings.TrimSpace(hostID); hostID != "" {
|
||||
if override, exists := m.config.Overrides[hostID]; exists {
|
||||
return override, true
|
||||
}
|
||||
}
|
||||
|
||||
if linkedNodeID = strings.TrimSpace(linkedNodeID); linkedNodeID != "" {
|
||||
if override, exists := m.config.Overrides[linkedNodeID]; exists {
|
||||
return override, true
|
||||
}
|
||||
}
|
||||
|
||||
if linkedVMID = strings.TrimSpace(linkedVMID); linkedVMID != "" {
|
||||
if override, exists := m.lookupGuestOverride(nil, linkedVMID); exists {
|
||||
return override, true
|
||||
}
|
||||
}
|
||||
|
||||
if linkedContainerID = strings.TrimSpace(linkedContainerID); linkedContainerID != "" {
|
||||
if override, exists := m.lookupGuestOverride(nil, linkedContainerID); exists {
|
||||
return override, true
|
||||
}
|
||||
}
|
||||
|
||||
return ThresholdConfig{}, false
|
||||
}
|
||||
|
||||
// resolveHostAlertThresholdOverrideNoLock resolves threshold overrides for persisted host alerts.
|
||||
// It uses alert metadata to inherit linked node/guest overrides when the alert came from a linked host agent.
|
||||
// Caller must hold m.mu.
|
||||
func (m *Manager) resolveHostAlertThresholdOverrideNoLock(alert *Alert, resourceID string) (ThresholdConfig, bool) {
|
||||
hostID := strings.TrimSpace(strings.TrimPrefix(resourceID, "host:"))
|
||||
if idx := strings.Index(hostID, "/"); idx >= 0 {
|
||||
hostID = hostID[:idx]
|
||||
}
|
||||
|
||||
var linkedNodeID, linkedVMID, linkedContainerID string
|
||||
if alert != nil && alert.Metadata != nil {
|
||||
if metadataHostID := metadataStringValue(alert.Metadata, "hostId"); metadataHostID != "" {
|
||||
hostID = metadataHostID
|
||||
}
|
||||
linkedNodeID = metadataStringValue(alert.Metadata, "linkedNodeId")
|
||||
linkedVMID = metadataStringValue(alert.Metadata, "linkedVmId")
|
||||
linkedContainerID = metadataStringValue(alert.Metadata, "linkedContainerId")
|
||||
}
|
||||
|
||||
return m.resolveHostThresholdOverrideNoLock(hostID, linkedNodeID, linkedVMID, linkedContainerID)
|
||||
}
|
||||
|
||||
func sanitizeHostComponent(value string) string {
|
||||
value = strings.TrimSpace(strings.ToLower(value))
|
||||
if value == "" {
|
||||
|
|
@ -2977,12 +3036,17 @@ func (m *Manager) CheckHost(host models.Host) {
|
|||
// Fresh telemetry marks the host as online and clears offline tracking.
|
||||
m.HandleHostOnline(host)
|
||||
|
||||
m.mu.RLock()
|
||||
m.mu.Lock()
|
||||
alertsEnabled := m.config.Enabled
|
||||
disableAllHosts := m.config.DisableAllHosts
|
||||
thresholds := m.config.HostDefaults
|
||||
override, hasOverride := m.config.Overrides[host.ID]
|
||||
m.mu.RUnlock()
|
||||
override, hasOverride := m.resolveHostThresholdOverrideNoLock(
|
||||
host.ID,
|
||||
host.LinkedNodeID,
|
||||
host.LinkedVMID,
|
||||
host.LinkedContainerID,
|
||||
)
|
||||
m.mu.Unlock()
|
||||
|
||||
if !alertsEnabled {
|
||||
return
|
||||
|
|
@ -3022,6 +3086,15 @@ func (m *Manager) CheckHost(host models.Host) {
|
|||
"agentVersion": host.AgentVersion,
|
||||
"architecture": host.Architecture,
|
||||
}
|
||||
if linkedNodeID := strings.TrimSpace(host.LinkedNodeID); linkedNodeID != "" {
|
||||
baseMetadata["linkedNodeId"] = linkedNodeID
|
||||
}
|
||||
if linkedVMID := strings.TrimSpace(host.LinkedVMID); linkedVMID != "" {
|
||||
baseMetadata["linkedVmId"] = linkedVMID
|
||||
}
|
||||
if linkedContainerID := strings.TrimSpace(host.LinkedContainerID); linkedContainerID != "" {
|
||||
baseMetadata["linkedContainerId"] = linkedContainerID
|
||||
}
|
||||
if len(host.Tags) > 0 {
|
||||
baseMetadata["tags"] = append([]string(nil), host.Tags...)
|
||||
}
|
||||
|
|
@ -3358,11 +3431,16 @@ func (m *Manager) HandleHostOffline(host models.Host) {
|
|||
}
|
||||
|
||||
var disableConnectivity bool
|
||||
m.mu.RLock()
|
||||
if override, exists := m.config.Overrides[host.ID]; exists {
|
||||
m.mu.Lock()
|
||||
if override, exists := m.resolveHostThresholdOverrideNoLock(
|
||||
host.ID,
|
||||
host.LinkedNodeID,
|
||||
host.LinkedVMID,
|
||||
host.LinkedContainerID,
|
||||
); exists {
|
||||
disableConnectivity = override.DisableConnectivity || override.Disabled
|
||||
}
|
||||
m.mu.RUnlock()
|
||||
m.mu.Unlock()
|
||||
|
||||
if disableConnectivity {
|
||||
m.clearAlert(alertID)
|
||||
|
|
|
|||
|
|
@ -15154,6 +15154,128 @@ func TestCheckHostComprehensive(t *testing.T) {
|
|||
t.Fatalf("expected qualified host resource name, got %q", alert.ResourceName)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("inherits linked node overrides for host agent metrics", func(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
m.ClearActiveAlerts()
|
||||
|
||||
m.mu.Lock()
|
||||
m.config.TimeThreshold = 0
|
||||
m.config.TimeThresholds = map[string]int{}
|
||||
m.config.HostDefaults = ThresholdConfig{
|
||||
Memory: &HysteresisThreshold{Trigger: 85.0, Clear: 80.0},
|
||||
}
|
||||
m.config.Overrides = map[string]ThresholdConfig{
|
||||
"ProxmoxCluster-proxmoxn3": {
|
||||
Memory: &HysteresisThreshold{Trigger: 97.0, Clear: 92.0},
|
||||
},
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
host := models.Host{
|
||||
ID: "host-proxmoxn3",
|
||||
DisplayName: "proxmoxn3",
|
||||
Hostname: "proxmoxn3",
|
||||
LinkedNodeID: "ProxmoxCluster-proxmoxn3",
|
||||
Memory: models.Memory{
|
||||
Usage: 90.6,
|
||||
Total: 1024,
|
||||
Used: 928,
|
||||
Free: 96,
|
||||
},
|
||||
Status: "online",
|
||||
LastSeen: time.Now(),
|
||||
}
|
||||
|
||||
m.CheckHost(host)
|
||||
|
||||
m.mu.RLock()
|
||||
_, exists := m.activeAlerts["host:host-proxmoxn3-memory"]
|
||||
m.mu.RUnlock()
|
||||
|
||||
if exists {
|
||||
t.Fatal("expected linked node override to suppress host-agent memory alert")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("inherits linked guest overrides for host agent metrics", func(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
m.ClearActiveAlerts()
|
||||
|
||||
m.mu.Lock()
|
||||
m.config.TimeThreshold = 0
|
||||
m.config.TimeThresholds = map[string]int{}
|
||||
m.config.HostDefaults = ThresholdConfig{
|
||||
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0},
|
||||
}
|
||||
m.config.Overrides = map[string]ThresholdConfig{
|
||||
"Main:node3:101": {
|
||||
CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0},
|
||||
},
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
host := models.Host{
|
||||
ID: "host-hamster",
|
||||
DisplayName: "Hamster",
|
||||
Hostname: "hamster.local",
|
||||
LinkedVMID: "Main:node3:101",
|
||||
CPUUsage: 97.5,
|
||||
Status: "online",
|
||||
LastSeen: time.Now(),
|
||||
}
|
||||
|
||||
m.CheckHost(host)
|
||||
|
||||
m.mu.RLock()
|
||||
_, exists := m.activeAlerts["host:host-hamster-cpu"]
|
||||
m.mu.RUnlock()
|
||||
|
||||
if exists {
|
||||
t.Fatal("expected linked guest override to suppress host-agent cpu alert")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("prefers explicit host overrides over linked resource overrides", func(t *testing.T) {
|
||||
m := newTestManager(t)
|
||||
m.ClearActiveAlerts()
|
||||
|
||||
m.mu.Lock()
|
||||
m.config.TimeThreshold = 0
|
||||
m.config.TimeThresholds = map[string]int{}
|
||||
m.config.HostDefaults = ThresholdConfig{
|
||||
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0},
|
||||
}
|
||||
m.config.Overrides = map[string]ThresholdConfig{
|
||||
"Main:node3:101": {
|
||||
CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0},
|
||||
},
|
||||
"host-hamster": {
|
||||
CPU: &HysteresisThreshold{Trigger: 90.0, Clear: 85.0},
|
||||
},
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
host := models.Host{
|
||||
ID: "host-hamster",
|
||||
DisplayName: "Hamster",
|
||||
Hostname: "hamster.local",
|
||||
LinkedVMID: "Main:node3:101",
|
||||
CPUUsage: 97.5,
|
||||
Status: "online",
|
||||
LastSeen: time.Now(),
|
||||
}
|
||||
|
||||
m.CheckHost(host)
|
||||
|
||||
m.mu.RLock()
|
||||
alert := m.activeAlerts["host:host-hamster-cpu"]
|
||||
m.mu.RUnlock()
|
||||
|
||||
if alert == nil {
|
||||
t.Fatal("expected explicit host override to take precedence and trigger alert")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestCheckPBSComprehensive(t *testing.T) {
|
||||
|
|
|
|||
|
|
@ -157,6 +157,64 @@ func TestReevaluateActiveAlertsWithOverride(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestReevaluateActiveAlertsWithLinkedHostOverride(t *testing.T) {
|
||||
manager := NewManager()
|
||||
|
||||
manager.mu.Lock()
|
||||
manager.activeAlerts = make(map[string]*Alert)
|
||||
manager.mu.Unlock()
|
||||
|
||||
initialConfig := AlertConfig{
|
||||
Enabled: true,
|
||||
HostDefaults: ThresholdConfig{
|
||||
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
|
||||
},
|
||||
Overrides: make(map[string]ThresholdConfig),
|
||||
}
|
||||
manager.UpdateConfig(initialConfig)
|
||||
|
||||
alertID := "host:host-proxmoxn3-memory"
|
||||
alert := &Alert{
|
||||
ID: alertID,
|
||||
Type: "memory",
|
||||
Level: AlertLevelWarning,
|
||||
ResourceID: "host:host-proxmoxn3",
|
||||
ResourceName: "proxmoxn3 (Host Agent)",
|
||||
Node: "proxmoxn3",
|
||||
Instance: "linux",
|
||||
Message: "Host memory at 90.6%",
|
||||
Value: 90.6,
|
||||
Threshold: 85.0,
|
||||
StartTime: time.Now().Add(-5 * time.Minute),
|
||||
LastSeen: time.Now(),
|
||||
Metadata: map[string]interface{}{
|
||||
"resourceType": "Host",
|
||||
"hostId": "host-proxmoxn3",
|
||||
"linkedNodeId": "ProxmoxCluster-proxmoxn3",
|
||||
},
|
||||
}
|
||||
|
||||
manager.mu.Lock()
|
||||
manager.activeAlerts[alertID] = alert
|
||||
manager.mu.Unlock()
|
||||
|
||||
updatedConfig := initialConfig
|
||||
updatedConfig.Overrides["ProxmoxCluster-proxmoxn3"] = ThresholdConfig{
|
||||
Memory: &HysteresisThreshold{Trigger: 97, Clear: 92},
|
||||
}
|
||||
manager.UpdateConfig(updatedConfig)
|
||||
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
manager.mu.RLock()
|
||||
_, alertStillActive := manager.activeAlerts[alertID]
|
||||
manager.mu.RUnlock()
|
||||
|
||||
if alertStillActive {
|
||||
t.Errorf("expected linked host alert to be resolved after linked node override increase")
|
||||
}
|
||||
}
|
||||
|
||||
// TestReevaluateActiveAlertsStillAboveThreshold tests that alerts stay active if still above threshold
|
||||
func TestReevaluateActiveAlertsStillAboveThreshold(t *testing.T) {
|
||||
manager := NewManager()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue