Fix linked host agent threshold overrides
Some checks are pending
Build and Test / Secret Scan (push) Waiting to run
Build and Test / Frontend & Backend (push) Waiting to run
Core E2E Tests / Playwright Core E2E (push) Waiting to run

This commit is contained in:
rcourtman 2026-04-12 22:47:34 +01:00
parent 5f3a4b79ba
commit 754aa0e39c
3 changed files with 268 additions and 10 deletions

View file

@ -1776,10 +1776,7 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
continue
}
thresholds := m.config.HostDefaults
// Overrides are keyed by raw host ID (without the "host:" prefix
// that hostResourceID adds to the resource ID used in alert IDs).
rawHostID := strings.TrimPrefix(resourceID, "host:")
if override, exists := m.config.Overrides[rawHostID]; exists {
if override, exists := m.resolveHostAlertThresholdOverrideNoLock(alert, resourceID); exists {
if override.Disabled {
alertsToResolve = append(alertsToResolve, alertID)
continue
@ -2908,6 +2905,68 @@ func hostInstanceName(host models.Host) string {
return "Host Agent"
}
// metadataStringValue returns the whitespace-trimmed string stored under key
// in metadata. A nil map, a missing key, or a non-string value all yield "".
func metadataStringValue(metadata map[string]interface{}, key string) string {
	if metadata == nil {
		return ""
	}
	if raw, ok := metadata[key].(string); ok {
		return strings.TrimSpace(raw)
	}
	return ""
}
// resolveHostThresholdOverrideNoLock resolves the most specific threshold
// override for a host agent. Precedence, most specific first: an explicit
// host-agent override, then the linked node's override, then a linked VM's,
// then a linked container's — so alerts follow the logical resource the host
// agent is augmenting when no host-specific override exists.
// Caller must hold m.mu because guest override lookups may migrate legacy
// keys in-place.
func (m *Manager) resolveHostThresholdOverrideNoLock(hostID, linkedNodeID, linkedVMID, linkedContainerID string) (ThresholdConfig, bool) {
	// Direct override-map lookups: the host itself, then the linked node.
	for _, key := range []string{strings.TrimSpace(hostID), strings.TrimSpace(linkedNodeID)} {
		if key == "" {
			continue
		}
		if override, ok := m.config.Overrides[key]; ok {
			return override, true
		}
	}
	// Guest lookups go through lookupGuestOverride (VM before container),
	// which handles legacy key migration — hence the write-lock requirement.
	for _, guestID := range []string{strings.TrimSpace(linkedVMID), strings.TrimSpace(linkedContainerID)} {
		if guestID == "" {
			continue
		}
		if override, ok := m.lookupGuestOverride(nil, guestID); ok {
			return override, true
		}
	}
	return ThresholdConfig{}, false
}
// resolveHostAlertThresholdOverrideNoLock resolves threshold overrides for a
// persisted host alert. The host ID is first derived from the alert's
// resource ID ("host:<id>", optionally followed by a "/"-separated suffix);
// alert metadata then takes precedence: a "hostId" entry replaces the derived
// ID, and "linkedNodeId"/"linkedVmId"/"linkedContainerId" entries let the
// alert inherit overrides from the linked resource the host agent augments.
// Caller must hold m.mu.
func (m *Manager) resolveHostAlertThresholdOverrideNoLock(alert *Alert, resourceID string) (ThresholdConfig, bool) {
	hostID := strings.TrimSpace(strings.TrimPrefix(resourceID, "host:"))
	if slash := strings.Index(hostID, "/"); slash >= 0 {
		hostID = hostID[:slash]
	}
	var nodeID, vmID, containerID string
	if alert != nil && alert.Metadata != nil {
		if metaHostID := metadataStringValue(alert.Metadata, "hostId"); metaHostID != "" {
			hostID = metaHostID
		}
		nodeID = metadataStringValue(alert.Metadata, "linkedNodeId")
		vmID = metadataStringValue(alert.Metadata, "linkedVmId")
		containerID = metadataStringValue(alert.Metadata, "linkedContainerId")
	}
	return m.resolveHostThresholdOverrideNoLock(hostID, nodeID, vmID, containerID)
}
func sanitizeHostComponent(value string) string {
value = strings.TrimSpace(strings.ToLower(value))
if value == "" {
@ -2977,12 +3036,17 @@ func (m *Manager) CheckHost(host models.Host) {
// Fresh telemetry marks the host as online and clears offline tracking.
m.HandleHostOnline(host)
m.mu.RLock()
m.mu.Lock()
alertsEnabled := m.config.Enabled
disableAllHosts := m.config.DisableAllHosts
thresholds := m.config.HostDefaults
override, hasOverride := m.config.Overrides[host.ID]
m.mu.RUnlock()
override, hasOverride := m.resolveHostThresholdOverrideNoLock(
host.ID,
host.LinkedNodeID,
host.LinkedVMID,
host.LinkedContainerID,
)
m.mu.Unlock()
if !alertsEnabled {
return
@ -3022,6 +3086,15 @@ func (m *Manager) CheckHost(host models.Host) {
"agentVersion": host.AgentVersion,
"architecture": host.Architecture,
}
if linkedNodeID := strings.TrimSpace(host.LinkedNodeID); linkedNodeID != "" {
baseMetadata["linkedNodeId"] = linkedNodeID
}
if linkedVMID := strings.TrimSpace(host.LinkedVMID); linkedVMID != "" {
baseMetadata["linkedVmId"] = linkedVMID
}
if linkedContainerID := strings.TrimSpace(host.LinkedContainerID); linkedContainerID != "" {
baseMetadata["linkedContainerId"] = linkedContainerID
}
if len(host.Tags) > 0 {
baseMetadata["tags"] = append([]string(nil), host.Tags...)
}
@ -3358,11 +3431,16 @@ func (m *Manager) HandleHostOffline(host models.Host) {
}
var disableConnectivity bool
m.mu.RLock()
if override, exists := m.config.Overrides[host.ID]; exists {
m.mu.Lock()
if override, exists := m.resolveHostThresholdOverrideNoLock(
host.ID,
host.LinkedNodeID,
host.LinkedVMID,
host.LinkedContainerID,
); exists {
disableConnectivity = override.DisableConnectivity || override.Disabled
}
m.mu.RUnlock()
m.mu.Unlock()
if disableConnectivity {
m.clearAlert(alertID)

View file

@ -15154,6 +15154,128 @@ func TestCheckHostComprehensive(t *testing.T) {
t.Fatalf("expected qualified host resource name, got %q", alert.ResourceName)
}
})
t.Run("inherits linked node overrides for host agent metrics", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.HostDefaults = ThresholdConfig{
Memory: &HysteresisThreshold{Trigger: 85.0, Clear: 80.0},
}
m.config.Overrides = map[string]ThresholdConfig{
"ProxmoxCluster-proxmoxn3": {
Memory: &HysteresisThreshold{Trigger: 97.0, Clear: 92.0},
},
}
m.mu.Unlock()
host := models.Host{
ID: "host-proxmoxn3",
DisplayName: "proxmoxn3",
Hostname: "proxmoxn3",
LinkedNodeID: "ProxmoxCluster-proxmoxn3",
Memory: models.Memory{
Usage: 90.6,
Total: 1024,
Used: 928,
Free: 96,
},
Status: "online",
LastSeen: time.Now(),
}
m.CheckHost(host)
m.mu.RLock()
_, exists := m.activeAlerts["host:host-proxmoxn3-memory"]
m.mu.RUnlock()
if exists {
t.Fatal("expected linked node override to suppress host-agent memory alert")
}
})
t.Run("inherits linked guest overrides for host agent metrics", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.HostDefaults = ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0},
}
m.config.Overrides = map[string]ThresholdConfig{
"Main:node3:101": {
CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0},
},
}
m.mu.Unlock()
host := models.Host{
ID: "host-hamster",
DisplayName: "Hamster",
Hostname: "hamster.local",
LinkedVMID: "Main:node3:101",
CPUUsage: 97.5,
Status: "online",
LastSeen: time.Now(),
}
m.CheckHost(host)
m.mu.RLock()
_, exists := m.activeAlerts["host:host-hamster-cpu"]
m.mu.RUnlock()
if exists {
t.Fatal("expected linked guest override to suppress host-agent cpu alert")
}
})
t.Run("prefers explicit host overrides over linked resource overrides", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.config.TimeThreshold = 0
m.config.TimeThresholds = map[string]int{}
m.config.HostDefaults = ThresholdConfig{
CPU: &HysteresisThreshold{Trigger: 80.0, Clear: 75.0},
}
m.config.Overrides = map[string]ThresholdConfig{
"Main:node3:101": {
CPU: &HysteresisThreshold{Trigger: 105.0, Clear: 100.0},
},
"host-hamster": {
CPU: &HysteresisThreshold{Trigger: 90.0, Clear: 85.0},
},
}
m.mu.Unlock()
host := models.Host{
ID: "host-hamster",
DisplayName: "Hamster",
Hostname: "hamster.local",
LinkedVMID: "Main:node3:101",
CPUUsage: 97.5,
Status: "online",
LastSeen: time.Now(),
}
m.CheckHost(host)
m.mu.RLock()
alert := m.activeAlerts["host:host-hamster-cpu"]
m.mu.RUnlock()
if alert == nil {
t.Fatal("expected explicit host override to take precedence and trigger alert")
}
})
}
func TestCheckPBSComprehensive(t *testing.T) {

View file

@ -157,6 +157,64 @@ func TestReevaluateActiveAlertsWithOverride(t *testing.T) {
}
}
// TestReevaluateActiveAlertsWithLinkedHostOverride verifies that an active
// host-agent alert is resolved when an override for the alert's *linked node*
// (carried in the alert's "linkedNodeId" metadata) raises the memory trigger
// above the alert's current value — i.e. linked-resource overrides are
// inherited during reevaluation, not just explicit host overrides.
func TestReevaluateActiveAlertsWithLinkedHostOverride(t *testing.T) {
manager := NewManager()
// Start from a clean alert map so only the seeded alert is in play.
manager.mu.Lock()
manager.activeAlerts = make(map[string]*Alert)
manager.mu.Unlock()
initialConfig := AlertConfig{
Enabled: true,
HostDefaults: ThresholdConfig{
// Default trigger 85 / clear 80: the seeded 90.6% reading is alerting.
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
},
Overrides: make(map[string]ThresholdConfig),
}
manager.UpdateConfig(initialConfig)
alertID := "host:host-proxmoxn3-memory"
// Seed an already-active memory alert for a host agent that is linked to
// the Proxmox node "ProxmoxCluster-proxmoxn3" via metadata.
alert := &Alert{
ID: alertID,
Type: "memory",
Level: AlertLevelWarning,
ResourceID: "host:host-proxmoxn3",
ResourceName: "proxmoxn3 (Host Agent)",
Node: "proxmoxn3",
Instance: "linux",
Message: "Host memory at 90.6%",
Value: 90.6,
Threshold: 85.0,
StartTime: time.Now().Add(-5 * time.Minute),
LastSeen: time.Now(),
Metadata: map[string]interface{}{
"resourceType": "Host",
"hostId": "host-proxmoxn3",
// The linked-node ID is what the override below is keyed by.
"linkedNodeId": "ProxmoxCluster-proxmoxn3",
},
}
manager.mu.Lock()
manager.activeAlerts[alertID] = alert
manager.mu.Unlock()
// Raise the trigger for the *linked node* (not the host ID) above the
// alert's 90.6% value; the alert should be resolved on reevaluation.
updatedConfig := initialConfig
updatedConfig.Overrides["ProxmoxCluster-proxmoxn3"] = ThresholdConfig{
Memory: &HysteresisThreshold{Trigger: 97, Clear: 92},
}
manager.UpdateConfig(updatedConfig)
// NOTE(review): presumably UpdateConfig reevaluates asynchronously, hence
// the sleep — a synchronization hook would make this non-flaky; confirm.
time.Sleep(100 * time.Millisecond)
manager.mu.RLock()
_, alertStillActive := manager.activeAlerts[alertID]
manager.mu.RUnlock()
if alertStillActive {
t.Errorf("expected linked host alert to be resolved after linked node override increase")
}
}
// TestReevaluateActiveAlertsStillAboveThreshold tests that alerts stay active if still above threshold
func TestReevaluateActiveAlertsStillAboveThreshold(t *testing.T) {
manager := NewManager()