From a2e2e1f2241955e078a2e712cbcd899ecc62ff71 Mon Sep 17 00:00:00 2001
From: rcourtman
Date: Wed, 1 Apr 2026 17:24:30 +0100
Subject: [PATCH] Make alert node display name resolution instance-aware

---
 .../v6/internal/subsystems/alerts.md      |  6 ++
 internal/alerts/alerts.go                 | 61 ++++++++++++++-----
 internal/alerts/alerts_test.go            | 34 +++++++++++
 internal/alerts/canonical_metric.go       |  4 +-
 internal/alerts/guest_metric_migration.go |  2 +-
 internal/alerts/unified_eval_test.go      | 11 ++++
 internal/alerts/unified_incidents.go      |  2 +-
 internal/alerts/unified_incidents_test.go | 11 ++++
 8 files changed, 112 insertions(+), 19 deletions(-)

diff --git a/docs/release-control/v6/internal/subsystems/alerts.md b/docs/release-control/v6/internal/subsystems/alerts.md
index 153ecb8b6..634dfa348 100644
--- a/docs/release-control/v6/internal/subsystems/alerts.md
+++ b/docs/release-control/v6/internal/subsystems/alerts.md
@@ -82,6 +82,12 @@ runtime must migrate the active alert, history entry, acknowledgment record,
 suppression/rate-limit/flapping tracking, and guest per-disk metric identity to
 the current canonical state instead of reopening a duplicate alert or resolving
 only the stale node-scoped identity.
+That same alerts runtime also owns instance-scoped node display-name
+resolution. Raw node names are not globally unique across configured
+infrastructure instances, so cached node display names must key on instance +
+node identity whenever the alert carries instance context. Alert updates,
+incident rebuilds, and guest-metric migrations may fall back to the legacy
+name-only cache only for instance-less resources like standalone host agents.
 
 Alert history persistence is also part of that canonical boundary. The history
 manager may choose the owned runtime data directory, but it must normalize that
diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go
index 7c374a983..5279799e8 100644
--- a/internal/alerts/alerts.go
+++ b/internal/alerts/alerts.go
@@ -582,10 +582,11 @@ type Manager struct {
 	// When a host agent is running on a Proxmox node, we prefer the host agent
 	// alerts and suppress the node alerts to avoid duplicate monitoring.
 	hostAgentHostnames map[string]struct{} // Normalized hostnames (lowercase)
-	// Node display name cache: maps raw node/host name → user-configured display name.
-	// Populated by CheckNode and CheckHost so that checkMetric (and direct alert
-	// creation sites) can resolve display names without signature changes.
-	nodeDisplayNames map[string]string
+	// Node display name caches. Proxmox nodes can share the same raw node name
+	// across multiple configured instances, so keep instance-scoped entries in
+	// addition to the legacy raw-name cache used by instance-less resources.
+	nodeDisplayNames         map[string]string
+	instanceNodeDisplayNames map[string]string
 
 	// License checking for Pro-only alert features
 	hasProFeature func(feature string) bool
@@ -655,6 +656,7 @@ func NewManagerWithDataDir(dataDir string) *Manager {
 		cleanupStop:        make(chan struct{}),
 		hostAgentHostnames: make(map[string]struct{}),
 		nodeDisplayNames:   make(map[string]string),
+		instanceNodeDisplayNames: make(map[string]string),
 		config: AlertConfig{
 			Enabled:         true,
 			ActivationState: ActivationPending,
@@ -2863,7 +2865,7 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
 // CheckNode checks a node against thresholds
 func (m *Manager) CheckNode(node models.Node) {
 	// Cache display name so all alerts (including guest alerts on this node) can resolve it.
-	m.UpdateNodeDisplayName(node.Name, node.DisplayName)
+	m.UpdateNodeDisplayName(node.Instance, node.Name, node.DisplayName)
 
 	m.mu.RLock()
 	if !m.config.Enabled {
@@ -3034,24 +3036,53 @@ func (m *Manager) hasHostAgentForNode(nodeName string) bool {
 
 // UpdateNodeDisplayName caches the display name for a node/host so alerts
 // can resolve it without needing the full model object.
-func (m *Manager) UpdateNodeDisplayName(name, displayName string) {
+func nodeDisplayNameCacheKey(instance, name string) string {
+	return strings.TrimSpace(instance) + "\x00" + strings.TrimSpace(name)
+}
+
+func (m *Manager) UpdateNodeDisplayName(parts ...string) {
+	var instance, name, displayName string
+	switch len(parts) {
+	case 2:
+		name, displayName = parts[0], parts[1]
+	case 3:
+		instance, name, displayName = parts[0], parts[1], parts[2]
+	default:
+		return
+	}
+
+	instance = strings.TrimSpace(instance)
 	name = strings.TrimSpace(name)
 	if name == "" {
 		return
 	}
 	displayName = strings.TrimSpace(displayName)
 	m.mu.Lock()
-	if displayName != "" && displayName != name {
-		m.nodeDisplayNames[name] = displayName
+	if instance != "" {
+		key := nodeDisplayNameCacheKey(instance, name)
+		if displayName != "" && displayName != name {
+			m.instanceNodeDisplayNames[key] = displayName
+		} else {
+			delete(m.instanceNodeDisplayNames, key)
+		}
 	} else {
-		delete(m.nodeDisplayNames, name)
+		if displayName != "" && displayName != name {
+			m.nodeDisplayNames[name] = displayName
+		} else {
+			delete(m.nodeDisplayNames, name)
+		}
 	}
 	m.mu.Unlock()
 }
 
 // resolveNodeDisplayName returns the cached display name for a node, or empty
 // string if none is set. Caller must hold m.mu (read or write).
-func (m *Manager) resolveNodeDisplayName(node string) string {
+func (m *Manager) resolveNodeDisplayName(instance, node string) string {
+	if instance = strings.TrimSpace(instance); instance != "" {
+		if displayName, ok := m.instanceNodeDisplayNames[nodeDisplayNameCacheKey(instance, node)]; ok {
+			return displayName
+		}
+	}
 	return m.nodeDisplayNames[node]
 }
 
@@ -3180,7 +3211,7 @@ func (m *Manager) CheckHost(host models.Host) {
 	}
 
 	// Cache display name so host alerts show the user-configured name.
-	m.UpdateNodeDisplayName(host.Hostname, host.DisplayName)
+	m.UpdateNodeDisplayName("", host.Hostname, host.DisplayName)
 
 	// Fresh telemetry marks the host as online and clears offline tracking.
 	m.HandleHostOnline(host)
@@ -6989,7 +7020,7 @@ func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resource
 		ResourceID:      resourceID,
 		ResourceName:    resourceName,
 		Node:            node,
-		NodeDisplayName: m.resolveNodeDisplayName(node),
+		NodeDisplayName: m.resolveNodeDisplayName(instance, node),
 		Instance:        instance,
 		Message:         message,
 		Value:           value,
@@ -7085,7 +7116,7 @@ func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resource
 		existingAlert.LastSeen = time.Now()
 		existingAlert.Value = value
 		// Keep display name current (handles upgrades and renames).
-		if dn := m.resolveNodeDisplayName(existingAlert.Node); dn != "" {
+		if dn := m.resolveNodeDisplayName(existingAlert.Instance, existingAlert.Node); dn != "" {
 			existingAlert.NodeDisplayName = dn
 		}
 		if existingAlert.Metadata == nil {
@@ -7395,7 +7426,7 @@ func (m *Manager) preserveAlertState(alertID string, updated *Alert) {
 
 	// Auto-resolve node display name if not already set.
 	if updated.NodeDisplayName == "" && updated.Node != "" {
-		updated.NodeDisplayName = m.resolveNodeDisplayName(updated.Node)
+		updated.NodeDisplayName = m.resolveNodeDisplayName(updated.Instance, updated.Node)
 	}
 
 	existing, exists := m.getActiveAlertNoLock(alertID)
@@ -7481,7 +7512,7 @@ func (m *Manager) GetActiveAlerts() []Alert {
 		a := *cloneAlertForOutput(alert)
 		// Ensure display name is current (handles upgrades, renames, and
 		// alerts created before the cache was populated).
-		if dn := m.resolveNodeDisplayName(a.Node); dn != "" {
+		if dn := m.resolveNodeDisplayName(a.Instance, a.Node); dn != "" {
 			a.NodeDisplayName = dn
 		}
 		alerts = append(alerts, a)
diff --git a/internal/alerts/alerts_test.go b/internal/alerts/alerts_test.go
index d91fdc379..86dd5be04 100644
--- a/internal/alerts/alerts_test.go
+++ b/internal/alerts/alerts_test.go
@@ -223,6 +223,40 @@ func TestCheckMetricClearsAlertWhenThresholdDisabled(t *testing.T) {
 	}
 }
 
+func TestGetActiveAlertsKeepsInstanceScopedNodeDisplayNames(t *testing.T) {
+	m := newTestManager(t)
+	m.ClearActiveAlerts()
+	m.mu.Lock()
+	m.config.TimeThresholds = map[string]int{}
+	m.config.SuppressionWindow = 0
+	m.config.MinimumDelta = 0
+	m.mu.Unlock()
+
+	threshold := &HysteresisThreshold{Trigger: 80, Clear: 70}
+
+	m.UpdateNodeDisplayName("cluster-a", "pve", "Alpha")
+	m.UpdateNodeDisplayName("cluster-b", "pve", "Beta")
+
+	m.checkMetric("guest-a", "vm-a", "pve", "cluster-a", "guest", "cpu", 90, threshold, nil)
+	m.checkMetric("guest-b", "vm-b", "pve", "cluster-b", "guest", "cpu", 91, threshold, nil)
+
+	m.UpdateNodeDisplayName("cluster-a", "pve", "Alpha Updated")
+	m.checkMetric("guest-a", "vm-a", "pve", "cluster-a", "guest", "cpu", 92, threshold, nil)
+	m.checkMetric("guest-b", "vm-b", "pve", "cluster-b", "guest", "cpu", 93, threshold, nil)
+
+	gotByResourceID := make(map[string]Alert)
+	for _, alert := range m.GetActiveAlerts() {
+		gotByResourceID[alert.ResourceID] = alert
+	}
+
+	if gotByResourceID["guest-a"].NodeDisplayName != "Alpha Updated" {
+		t.Fatalf("guest-a node display name = %q, want %q", gotByResourceID["guest-a"].NodeDisplayName, "Alpha Updated")
+	}
+	if gotByResourceID["guest-b"].NodeDisplayName != "Beta" {
+		t.Fatalf("guest-b node display name = %q, want %q", gotByResourceID["guest-b"].NodeDisplayName, "Beta")
+	}
+}
+
 func TestQuietHoursCategoryForResourceIncidentUsesIncidentCategoryMetadata(t *testing.T) {
 	availability := &Alert{
 		Type: "resource-incident",
diff --git a/internal/alerts/canonical_metric.go b/internal/alerts/canonical_metric.go
index 37cb0023d..23b29d404 100644
--- a/internal/alerts/canonical_metric.go
+++ b/internal/alerts/canonical_metric.go
@@ -196,7 +196,7 @@ func (m *Manager) evaluateCanonicalMetricAlert(spec alertspecs.ResourceAlertSpec
 		ResourceID:      spec.ResourceID,
 		ResourceName:    resourceName,
 		Node:            node,
-		NodeDisplayName: m.resolveNodeDisplayName(node),
+		NodeDisplayName: m.resolveNodeDisplayName(instance, node),
 		Instance:        instance,
 		Message:         message,
 		Value:           value,
@@ -249,7 +249,7 @@ func (m *Manager) evaluateCanonicalMetricAlert(spec alertspecs.ResourceAlertSpec
 		existingAlert.Value = value
 		existingAlert.Threshold = spec.MetricThreshold.Trigger
 		existingAlert.Level = level
-		if dn := m.resolveNodeDisplayName(existingAlert.Node); dn != "" {
+		if dn := m.resolveNodeDisplayName(existingAlert.Instance, existingAlert.Node); dn != "" {
 			existingAlert.NodeDisplayName = dn
 		}
 		if opts != nil && opts.Message != "" {
diff --git a/internal/alerts/guest_metric_migration.go b/internal/alerts/guest_metric_migration.go
index 3efbe5ed2..dc71a27cd 100644
--- a/internal/alerts/guest_metric_migration.go
+++ b/internal/alerts/guest_metric_migration.go
@@ -134,7 +134,7 @@ func (m *Manager) migrateGuestMetricAlertNoLock(storageKey, specID, kind, resour
 	matchedAlert.ResourceName = resourceName
 	matchedAlert.Node = normalizedNode
 	matchedAlert.Instance = normalizedInstance
-	if dn := m.resolveNodeDisplayName(normalizedNode); dn != "" {
+	if dn := m.resolveNodeDisplayName(normalizedInstance, normalizedNode); dn != "" {
 		matchedAlert.NodeDisplayName = dn
 	} else {
 		matchedAlert.NodeDisplayName = ""
diff --git a/internal/alerts/unified_eval_test.go b/internal/alerts/unified_eval_test.go
index 3f23d2650..3e1e8214c 100644
--- a/internal/alerts/unified_eval_test.go
+++ b/internal/alerts/unified_eval_test.go
@@ -313,6 +313,17 @@ func TestCheckUnifiedResourceKeepsInstanceScopedNodeDisplayNames(t *testing.T) {
 	if alertB.NodeDisplayName != "Beta" {
 		t.Fatalf("vm-b node display name = %q, want %q", alertB.NodeDisplayName, "Beta")
 	}
+
+	gotByResourceID := make(map[string]Alert)
+	for _, alert := range m.GetActiveAlerts() {
+		gotByResourceID[alert.ResourceID] = alert
+	}
+	if gotByResourceID["vm-a"].NodeDisplayName != "Alpha Updated" {
+		t.Fatalf("GetActiveAlerts vm-a node display name = %q, want %q", gotByResourceID["vm-a"].NodeDisplayName, "Alpha Updated")
+	}
+	if gotByResourceID["vm-b"].NodeDisplayName != "Beta" {
+		t.Fatalf("GetActiveAlerts vm-b node display name = %q, want %q", gotByResourceID["vm-b"].NodeDisplayName, "Beta")
+	}
 }
 
 func TestCheckGuestPerDiskAnnotatesCanonicalSpecMetadata(t *testing.T) {
diff --git a/internal/alerts/unified_incidents.go b/internal/alerts/unified_incidents.go
index 3fdbc6d0b..baf8c09ec 100644
--- a/internal/alerts/unified_incidents.go
+++ b/internal/alerts/unified_incidents.go
@@ -116,7 +116,7 @@ func (m *Manager) SyncUnifiedResourceIncidents(resources []unifiedresources.Reso
 			existing.ResourceID = alert.ResourceID
 			existing.ResourceName = alert.ResourceName
 			existing.Node = alert.Node
-			existing.NodeDisplayName = m.resolveNodeDisplayName(alert.Node)
+			existing.NodeDisplayName = m.resolveNodeDisplayName(alert.Instance, alert.Node)
 			existing.Instance = alert.Instance
 			existing.Message = alert.Message
 			existing.Metadata = alert.Metadata
diff --git a/internal/alerts/unified_incidents_test.go b/internal/alerts/unified_incidents_test.go
index d0143218f..5c9209998 100644
--- a/internal/alerts/unified_incidents_test.go
+++ b/internal/alerts/unified_incidents_test.go
@@ -128,6 +128,17 @@ func TestSyncUnifiedResourceIncidentsKeepsInstanceScopedNodeDisplayNames(t *test
 	if alertB.NodeDisplayName != "Beta" {
 		t.Fatalf("resourceB node display name = %q, want %q", alertB.NodeDisplayName, "Beta")
 	}
+
+	gotByResourceID := make(map[string]Alert)
+	for _, alert := range m.GetActiveAlerts() {
+		gotByResourceID[alert.ResourceID] = alert
+	}
+	if gotByResourceID[resourceA.ID].NodeDisplayName != "Alpha Updated" {
+		t.Fatalf("GetActiveAlerts resourceA node display name = %q, want %q", gotByResourceID[resourceA.ID].NodeDisplayName, "Alpha Updated")
+	}
+	if gotByResourceID[resourceB.ID].NodeDisplayName != "Beta" {
+		t.Fatalf("GetActiveAlerts resourceB node display name = %q, want %q", gotByResourceID[resourceB.ID].NodeDisplayName, "Beta")
+	}
 }
 
 func TestSyncUnifiedResourceIncidentsIncludesConsumerImpact(t *testing.T) {
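
Reviewer note (not part of the patch): below is a minimal, self-contained Go sketch of the cache behaviour this change introduces, namely an instance-scoped map keyed on instance plus a NUL separator plus node, with a legacy name-only fallback for instance-less resources such as standalone host agents. The names here (displayNameCache, cacheKey, set, resolve) are illustrative only; the real implementation is Manager.UpdateNodeDisplayName and Manager.resolveNodeDisplayName in the diff above.

package main

import (
	"fmt"
	"strings"
)

// displayNameCache is a toy stand-in for the two maps the Manager keeps.
type displayNameCache struct {
	byInstanceNode map[string]string // instance-scoped: instance + "\x00" + node
	byNode         map[string]string // legacy cache for instance-less resources
}

func cacheKey(instance, node string) string {
	return strings.TrimSpace(instance) + "\x00" + strings.TrimSpace(node)
}

// set stores a display name under the instance-scoped key when an instance is
// known, otherwise under the legacy name-only key.
func (c *displayNameCache) set(instance, node, displayName string) {
	if strings.TrimSpace(instance) != "" {
		c.byInstanceNode[cacheKey(instance, node)] = displayName
		return
	}
	c.byNode[strings.TrimSpace(node)] = displayName
}

// resolve prefers the instance-scoped entry and falls back to the legacy map.
func (c *displayNameCache) resolve(instance, node string) string {
	if strings.TrimSpace(instance) != "" {
		if dn, ok := c.byInstanceNode[cacheKey(instance, node)]; ok {
			return dn
		}
	}
	return c.byNode[node]
}

func main() {
	c := &displayNameCache{
		byInstanceNode: map[string]string{},
		byNode:         map[string]string{},
	}

	// Two instances may legitimately reuse the raw node name "pve".
	c.set("cluster-a", "pve", "Alpha")
	c.set("cluster-b", "pve", "Beta")
	c.set("", "standalone-host", "Edge Box") // instance-less resource

	fmt.Println(c.resolve("cluster-a", "pve"))    // Alpha
	fmt.Println(c.resolve("cluster-b", "pve"))    // Beta
	fmt.Println(c.resolve("", "standalone-host")) // Edge Box
}

The NUL separator mirrors the patch's nodeDisplayNameCacheKey helper and keeps instance and node from colliding even if either contains characters that could otherwise look like a delimiter.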