Make alert node display name resolution instance-aware

rcourtman 2026-04-01 17:24:30 +01:00
parent 3e33012081
commit a2e2e1f224
8 changed files with 112 additions and 19 deletions

View file

@@ -82,6 +82,12 @@ runtime must migrate the active alert, history entry, acknowledgment record,
suppression/rate-limit/flapping tracking, and guest per-disk metric identity
to the current canonical state instead of reopening a duplicate alert or
resolving only the stale node-scoped identity.
That same alerts runtime also owns instance-scoped node display-name
resolution. Raw node names are not globally unique across configured
infrastructure instances, so cached node display names must key on instance +
node identity whenever the alert carries instance context. Alert updates,
incident rebuilds, and guest-metric migrations may fall back to the legacy
name-only cache only for instance-less resources like standalone host agents.
Alert history persistence is also part of that canonical boundary. The history
manager may choose the owned runtime data directory, but it must normalize that
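A minimal, self-contained sketch of the collision the paragraph above describes: two configured instances can each expose a raw node name "pve", so a name-only cache lets the second write clobber the first. The key helper mirrors nodeDisplayNameCacheKey from the diff below; the instance and display names are illustrative.

package main

import "fmt"

// key mirrors nodeDisplayNameCacheKey: a NUL byte separates instance from
// node name so distinct (instance, name) pairs cannot collide.
func key(instance, node string) string { return instance + "\x00" + node }

func main() {
	// Legacy name-only cache: the second instance clobbers the first.
	legacy := map[string]string{}
	legacy["pve"] = "Alpha"
	legacy["pve"] = "Beta"
	fmt.Println(legacy["pve"]) // Beta; Alpha is lost

	// Instance-scoped cache keeps both entries distinct.
	scoped := map[string]string{}
	scoped[key("cluster-a", "pve")] = "Alpha"
	scoped[key("cluster-b", "pve")] = "Beta"
	fmt.Println(scoped[key("cluster-a", "pve")]) // Alpha
	fmt.Println(scoped[key("cluster-b", "pve")]) // Beta
}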

View file

@@ -582,10 +582,11 @@ type Manager struct {
// When a host agent is running on a Proxmox node, we prefer the host agent
// alerts and suppress the node alerts to avoid duplicate monitoring.
hostAgentHostnames map[string]struct{} // Normalized hostnames (lowercase)
// Node display name cache: maps raw node/host name → user-configured display name.
// Populated by CheckNode and CheckHost so that checkMetric (and direct alert
// creation sites) can resolve display names without signature changes.
nodeDisplayNames map[string]string
// Node display name caches. Proxmox nodes can share the same raw node name
// across multiple configured instances, so keep instance-scoped entries in
// addition to the legacy raw-name cache used by instance-less resources.
nodeDisplayNames map[string]string
instanceNodeDisplayNames map[string]string
// License checking for Pro-only alert features
hasProFeature func(feature string) bool
@@ -655,6 +656,7 @@ func NewManagerWithDataDir(dataDir string) *Manager {
cleanupStop: make(chan struct{}),
hostAgentHostnames: make(map[string]struct{}),
nodeDisplayNames: make(map[string]string),
instanceNodeDisplayNames: make(map[string]string),
config: AlertConfig{
Enabled: true,
ActivationState: ActivationPending,
@@ -2863,7 +2865,7 @@ func (m *Manager) CheckGuest(guest any, instanceName string) {
// CheckNode checks a node against thresholds
func (m *Manager) CheckNode(node models.Node) {
// Cache display name so all alerts (including guest alerts on this node) can resolve it.
m.UpdateNodeDisplayName(node.Name, node.DisplayName)
m.UpdateNodeDisplayName(node.Instance, node.Name, node.DisplayName)
m.mu.RLock()
if !m.config.Enabled {
@@ -3034,24 +3036,53 @@ func (m *Manager) hasHostAgentForNode(nodeName string) bool {
// UpdateNodeDisplayName caches the display name for a node/host so alerts
// can resolve it without needing the full model object.
func (m *Manager) UpdateNodeDisplayName(name, displayName string) {
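// nodeDisplayNameCacheKey joins instance and node name with a NUL separator,
// a byte that will not appear in either field, so distinct (instance, name)
// pairs can never produce the same composite key.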
func nodeDisplayNameCacheKey(instance, name string) string {
return strings.TrimSpace(instance) + "\x00" + strings.TrimSpace(name)
}
func (m *Manager) UpdateNodeDisplayName(parts ...string) {
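// Accepts either the legacy (name, displayName) arity or the new
// (instance, name, displayName) arity; any other argument count is a no-op.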
var instance, name, displayName string
switch len(parts) {
case 2:
name, displayName = parts[0], parts[1]
case 3:
instance, name, displayName = parts[0], parts[1], parts[2]
default:
return
}
instance = strings.TrimSpace(instance)
name = strings.TrimSpace(name)
if name == "" {
return
}
displayName = strings.TrimSpace(displayName)
m.mu.Lock()
if displayName != "" && displayName != name {
m.nodeDisplayNames[name] = displayName
if instance != "" {
key := nodeDisplayNameCacheKey(instance, name)
if displayName != "" && displayName != name {
m.instanceNodeDisplayNames[key] = displayName
} else {
delete(m.instanceNodeDisplayNames, key)
}
} else {
delete(m.nodeDisplayNames, name)
if displayName != "" && displayName != name {
m.nodeDisplayNames[name] = displayName
} else {
delete(m.nodeDisplayNames, name)
}
}
m.mu.Unlock()
}
// resolveNodeDisplayName returns the cached display name for a node, or empty
// string if none is set. Instance-scoped entries take precedence when instance
// is non-empty; otherwise the legacy name-only cache is consulted. Caller must
// hold m.mu (read or write).
func (m *Manager) resolveNodeDisplayName(node string) string {
func (m *Manager) resolveNodeDisplayName(instance, node string) string {
if instance = strings.TrimSpace(instance); instance != "" {
if displayName, ok := m.instanceNodeDisplayNames[nodeDisplayNameCacheKey(instance, node)]; ok {
return displayName
}
}
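// Fall back to the legacy name-only cache used by instance-less resources.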
return m.nodeDisplayNames[node]
}
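A hedged usage sketch of the shim and resolver above, written as an in-package helper; the Manager value and the host name "delly" are illustrative, not part of the commit.

// exampleNodeDisplayNameUsage assumes an initialized *Manager.
func exampleNodeDisplayNameUsage(m *Manager) {
	m.UpdateNodeDisplayName("cluster-a", "pve", "Alpha") // instance-scoped entry
	m.UpdateNodeDisplayName("delly", "Delly")            // legacy two-arg form: name-only entry
	m.UpdateNodeDisplayName("orphan")                    // unsupported arity: silently ignored

	m.mu.RLock()
	_ = m.resolveNodeDisplayName("cluster-a", "pve") // "Alpha": instance-scoped hit
	_ = m.resolveNodeDisplayName("cluster-b", "pve") // "": cache miss, no name-only entry for "pve"
	_ = m.resolveNodeDisplayName("", "delly")        // "Delly": legacy name-only lookup
	m.mu.RUnlock()
}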
@@ -3180,7 +3211,7 @@ func (m *Manager) CheckHost(host models.Host) {
}
// Cache display name so host alerts show the user-configured name.
m.UpdateNodeDisplayName(host.Hostname, host.DisplayName)
m.UpdateNodeDisplayName("", host.Hostname, host.DisplayName)
// Fresh telemetry marks the host as online and clears offline tracking.
m.HandleHostOnline(host)
@@ -6989,7 +7020,7 @@ func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resource
ResourceID: resourceID,
ResourceName: resourceName,
Node: node,
NodeDisplayName: m.resolveNodeDisplayName(node),
NodeDisplayName: m.resolveNodeDisplayName(instance, node),
Instance: instance,
Message: message,
Value: value,
@@ -7085,7 +7116,7 @@ func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resource
existingAlert.LastSeen = time.Now()
existingAlert.Value = value
// Keep display name current (handles upgrades and renames).
if dn := m.resolveNodeDisplayName(existingAlert.Node); dn != "" {
if dn := m.resolveNodeDisplayName(existingAlert.Instance, existingAlert.Node); dn != "" {
existingAlert.NodeDisplayName = dn
}
if existingAlert.Metadata == nil {
@@ -7395,7 +7426,7 @@ func (m *Manager) preserveAlertState(alertID string, updated *Alert) {
// Auto-resolve node display name if not already set.
if updated.NodeDisplayName == "" && updated.Node != "" {
updated.NodeDisplayName = m.resolveNodeDisplayName(updated.Node)
updated.NodeDisplayName = m.resolveNodeDisplayName(updated.Instance, updated.Node)
}
existing, exists := m.getActiveAlertNoLock(alertID)
@@ -7481,7 +7512,7 @@ func (m *Manager) GetActiveAlerts() []Alert {
a := *cloneAlertForOutput(alert)
// Ensure display name is current (handles upgrades, renames, and
// alerts created before the cache was populated).
if dn := m.resolveNodeDisplayName(a.Node); dn != "" {
if dn := m.resolveNodeDisplayName(a.Instance, a.Node); dn != "" {
a.NodeDisplayName = dn
}
alerts = append(alerts, a)

View file

@@ -223,6 +223,40 @@ func TestCheckMetricClearsAlertWhenThresholdDisabled(t *testing.T) {
}
}
func TestGetActiveAlertsKeepsInstanceScopedNodeDisplayNames(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
m.mu.Lock()
m.config.TimeThresholds = map[string]int{}
m.config.SuppressionWindow = 0
m.config.MinimumDelta = 0
m.mu.Unlock()
threshold := &HysteresisThreshold{Trigger: 80, Clear: 70}
m.UpdateNodeDisplayName("cluster-a", "pve", "Alpha")
m.UpdateNodeDisplayName("cluster-b", "pve", "Beta")
m.checkMetric("guest-a", "vm-a", "pve", "cluster-a", "guest", "cpu", 90, threshold, nil)
m.checkMetric("guest-b", "vm-b", "pve", "cluster-b", "guest", "cpu", 91, threshold, nil)
m.UpdateNodeDisplayName("cluster-a", "pve", "Alpha Updated")
m.checkMetric("guest-a", "vm-a", "pve", "cluster-a", "guest", "cpu", 92, threshold, nil)
m.checkMetric("guest-b", "vm-b", "pve", "cluster-b", "guest", "cpu", 93, threshold, nil)
gotByResourceID := make(map[string]Alert)
for _, alert := range m.GetActiveAlerts() {
gotByResourceID[alert.ResourceID] = alert
}
if gotByResourceID["guest-a"].NodeDisplayName != "Alpha Updated" {
t.Fatalf("guest-a node display name = %q, want %q", gotByResourceID["guest-a"].NodeDisplayName, "Alpha Updated")
}
if gotByResourceID["guest-b"].NodeDisplayName != "Beta" {
t.Fatalf("guest-b node display name = %q, want %q", gotByResourceID["guest-b"].NodeDisplayName, "Beta")
}
}
func TestQuietHoursCategoryForResourceIncidentUsesIncidentCategoryMetadata(t *testing.T) {
availability := &Alert{
Type: "resource-incident",

View file

@@ -196,7 +196,7 @@ func (m *Manager) evaluateCanonicalMetricAlert(spec alertspecs.ResourceAlertSpec
ResourceID: spec.ResourceID,
ResourceName: resourceName,
Node: node,
NodeDisplayName: m.resolveNodeDisplayName(node),
NodeDisplayName: m.resolveNodeDisplayName(instance, node),
Instance: instance,
Message: message,
Value: value,
@@ -249,7 +249,7 @@ func (m *Manager) evaluateCanonicalMetricAlert(spec alertspecs.ResourceAlertSpec
existingAlert.Value = value
existingAlert.Threshold = spec.MetricThreshold.Trigger
existingAlert.Level = level
if dn := m.resolveNodeDisplayName(existingAlert.Node); dn != "" {
if dn := m.resolveNodeDisplayName(existingAlert.Instance, existingAlert.Node); dn != "" {
existingAlert.NodeDisplayName = dn
}
if opts != nil && opts.Message != "" {

View file

@@ -134,7 +134,7 @@ func (m *Manager) migrateGuestMetricAlertNoLock(storageKey, specID, kind, resour
matchedAlert.ResourceName = resourceName
matchedAlert.Node = normalizedNode
matchedAlert.Instance = normalizedInstance
if dn := m.resolveNodeDisplayName(normalizedNode); dn != "" {
if dn := m.resolveNodeDisplayName(normalizedInstance, normalizedNode); dn != "" {
matchedAlert.NodeDisplayName = dn
} else {
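// No cached name for the migrated identity: clear any stale display name.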
matchedAlert.NodeDisplayName = ""

View file

@@ -313,6 +313,17 @@ func TestCheckUnifiedResourceKeepsInstanceScopedNodeDisplayNames(t *testing.T) {
if alertB.NodeDisplayName != "Beta" {
t.Fatalf("vm-b node display name = %q, want %q", alertB.NodeDisplayName, "Beta")
}
gotByResourceID := make(map[string]Alert)
for _, alert := range m.GetActiveAlerts() {
gotByResourceID[alert.ResourceID] = alert
}
if gotByResourceID["vm-a"].NodeDisplayName != "Alpha Updated" {
t.Fatalf("GetActiveAlerts vm-a node display name = %q, want %q", gotByResourceID["vm-a"].NodeDisplayName, "Alpha Updated")
}
if gotByResourceID["vm-b"].NodeDisplayName != "Beta" {
t.Fatalf("GetActiveAlerts vm-b node display name = %q, want %q", gotByResourceID["vm-b"].NodeDisplayName, "Beta")
}
}
func TestCheckGuestPerDiskAnnotatesCanonicalSpecMetadata(t *testing.T) {

View file

@@ -116,7 +116,7 @@ func (m *Manager) SyncUnifiedResourceIncidents(resources []unifiedresources.Reso
existing.ResourceID = alert.ResourceID
existing.ResourceName = alert.ResourceName
existing.Node = alert.Node
existing.NodeDisplayName = m.resolveNodeDisplayName(alert.Node)
existing.NodeDisplayName = m.resolveNodeDisplayName(alert.Instance, alert.Node)
existing.Instance = alert.Instance
existing.Message = alert.Message
existing.Metadata = alert.Metadata

View file

@@ -128,6 +128,17 @@ func TestSyncUnifiedResourceIncidentsKeepsInstanceScopedNodeDisplayNames(t *test
if alertB.NodeDisplayName != "Beta" {
t.Fatalf("resourceB node display name = %q, want %q", alertB.NodeDisplayName, "Beta")
}
gotByResourceID := make(map[string]Alert)
for _, alert := range m.GetActiveAlerts() {
gotByResourceID[alert.ResourceID] = alert
}
if gotByResourceID[resourceA.ID].NodeDisplayName != "Alpha Updated" {
t.Fatalf("GetActiveAlerts resourceA node display name = %q, want %q", gotByResourceID[resourceA.ID].NodeDisplayName, "Alpha Updated")
}
if gotByResourceID[resourceB.ID].NodeDisplayName != "Beta" {
t.Fatalf("GetActiveAlerts resourceB node display name = %q, want %q", gotByResourceID[resourceB.ID].NodeDisplayName, "Beta")
}
}
func TestSyncUnifiedResourceIncidentsIncludesConsumerImpact(t *testing.T) {