From b5ee2c1f98ca719caa1cfde23b2eb9c5cc24c5b2 Mon Sep 17 00:00:00 2001 From: rcourtman Date: Wed, 25 Mar 2026 10:13:10 +0000 Subject: [PATCH] Fix guest override migration for canonical IDs (#1334) --- internal/alerts/alerts.go | 29 +++--------- internal/alerts/alerts_test.go | 56 +++++++++++++++++++++++ internal/alerts/filter_evaluation.go | 22 +++++++-- internal/alerts/filter_evaluation_test.go | 39 ++++++++++++++++ 4 files changed, 121 insertions(+), 25 deletions(-) diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go index cf1dad381..8ac73e284 100644 --- a/internal/alerts/alerts.go +++ b/internal/alerts/alerts.go @@ -9516,51 +9516,36 @@ func (m *Manager) LoadActiveAlerts() error { seen := make(map[string]bool) for _, alert := range alerts { - // Migrate legacy guest alert IDs (instance-node-VMID -> instance-VMID) + // Migrate legacy guest alert IDs to the canonical guest format. // Check if this is a guest-related alert by looking at common alert types isGuestAlert := strings.Contains(alert.Type, "cpu") || strings.Contains(alert.Type, "memory") || strings.Contains(alert.Type, "disk") || strings.Contains(alert.Type, "network") || alert.Type == "guest-offline" if isGuestAlert { - // Try to extract instance, node, and VMID from resource ID - // Legacy format: instance-node-VMID or node-VMID (standalone) parts := strings.Split(alert.ResourceID, "-") - // Check if this looks like a legacy format (has node in the ID) - // We can detect this if we have Node field and it appears in the ResourceID if alert.Node != "" && len(parts) >= 2 { var newResourceID string + oldResourceID := alert.ResourceID // Try to extract VMID (should be last part) vmidStr := parts[len(parts)-1] if _, err := strconv.Atoi(vmidStr); err == nil { - // VMID is valid, now check if we need to migrate - if len(parts) == 3 && alert.Instance != "" && alert.Instance != alert.Node { - // Format: instance-node-VMID -> instance-VMID - newResourceID = fmt.Sprintf("%s-%s", alert.Instance, vmidStr) - } else if len(parts) == 2 && alert.Instance == alert.Node { - // Format: node-VMID -> instance-VMID (standalone) - newResourceID = fmt.Sprintf("%s-%s", alert.Instance, vmidStr) - } + vmid, _ := strconv.Atoi(vmidStr) + newResourceID = BuildGuestKey(alert.Instance, alert.Node, vmid) - if newResourceID != "" && newResourceID != alert.ResourceID { + if newResourceID != "" && newResourceID != oldResourceID { log.Info(). - Str("oldID", alert.ResourceID). + Str("oldID", oldResourceID). Str("newID", newResourceID). Str("alertType", alert.Type). Msg("Migrating active alert from legacy guest ID format") - oldAlertID := alert.ID - // Update resource ID alert.ResourceID = newResourceID // Update alert ID (usually contains resource ID) - alert.ID = strings.Replace(alert.ID, alert.ResourceID, newResourceID, 1) - - // Update seen map to use new ID - seen[alert.ID] = true - delete(seen, oldAlertID) + alert.ID = strings.Replace(alert.ID, oldResourceID, newResourceID, 1) } } } diff --git a/internal/alerts/alerts_test.go b/internal/alerts/alerts_test.go index 8c677d9ad..cf5d3c2b6 100644 --- a/internal/alerts/alerts_test.go +++ b/internal/alerts/alerts_test.go @@ -16831,6 +16831,62 @@ func TestLoadActiveAlerts(t *testing.T) { t.Errorf("first alert should win, got type %s", alert.Type) } }) + + t.Run("migrates legacy guest alert resource IDs to canonical format", func(t *testing.T) { + m := newTestManager(t) + m.ClearActiveAlerts() + + alertsDir := filepath.Join(utils.GetDataDir(), "alerts") + if err := os.MkdirAll(alertsDir, 0755); err != nil { + t.Fatalf("failed to create alerts dir: %v", err) + } + + startTime := time.Now().Add(-10 * time.Minute) + legacyResourceID := "pve1-100" + canonicalResourceID := BuildGuestKey("pve1", "node1", 100) + alerts := []Alert{ + { + ID: legacyResourceID + "-cpu", + Type: "cpu", + Level: AlertLevelWarning, + ResourceID: legacyResourceID, + ResourceName: "test-vm", + Node: "node1", + Instance: "pve1", + StartTime: startTime, + LastSeen: time.Now(), + }, + } + + data, _ := json.Marshal(alerts) + alertsFile := filepath.Join(alertsDir, "active-alerts.json") + if err := os.WriteFile(alertsFile, data, 0644); err != nil { + t.Fatalf("failed to write alerts json: %v", err) + } + + err := m.LoadActiveAlerts() + if err != nil { + t.Fatalf("failed to load alerts: %v", err) + } + + m.mu.RLock() + alert, exists := m.activeAlerts[canonicalResourceID+"-cpu"] + _, oldExists := m.activeAlerts[legacyResourceID+"-cpu"] + m.mu.RUnlock() + + if !exists { + t.Fatal("expected canonical guest alert to be loaded") + } + if oldExists { + t.Fatal("expected legacy guest alert ID to be replaced") + } + if alert.ResourceID != canonicalResourceID { + t.Fatalf("expected resource ID %q, got %q", canonicalResourceID, alert.ResourceID) + } + if alert.ID != canonicalResourceID+"-cpu" { + t.Fatalf("expected alert ID %q, got %q", canonicalResourceID+"-cpu", alert.ID) + } + }) } func TestNamespaceMatchesInstance(t *testing.T) { diff --git a/internal/alerts/filter_evaluation.go b/internal/alerts/filter_evaluation.go index 1ea9ebc19..617d8a1cd 100644 --- a/internal/alerts/filter_evaluation.go +++ b/internal/alerts/filter_evaluation.go @@ -268,7 +268,7 @@ func (m *Manager) getGuestThresholds(guest interface{}, guestID string) Threshol } // Finally check guest-specific overrides (highest priority) - // First try the new stable ID format (instance-VMID) + // First try the current canonical ID format (instance:node:vmid) override, exists := m.config.Overrides[guestID] // If not found, try legacy ID formats for migration @@ -331,8 +331,8 @@ func (m *Manager) getGuestThresholds(guest interface{}, guestID string) Threshol return thresholds } -// tryLegacyOverrideMigration attempts to find and migrate legacy override formats. -// Returns the override and true if found, or zero value and false otherwise. +// tryLegacyOverrideMigration attempts to find and migrate legacy override formats +// into the canonical guest ID format (instance:node:vmid). func (m *Manager) tryLegacyOverrideMigration(guest interface{}, guestID string) (ThresholdConfig, bool) { var node string var vmid int @@ -370,6 +370,22 @@ func (m *Manager) tryLegacyOverrideMigration(guest interface{}, guestID string) } } + // Try legacy clustered format: instance-VMID + if instance != "" { + legacyID := fmt.Sprintf("%s-%d", instance, vmid) + if legacyOverride, legacyExists := m.config.Overrides[legacyID]; legacyExists { + log.Info(). + Str("legacyID", legacyID). + Str("newID", guestID). + Msg("Migrating guest override from legacy cluster ID format") + + m.config.Overrides[guestID] = legacyOverride + delete(m.config.Overrides, legacyID) + + return legacyOverride, true + } + } + // Try standalone format: node-VMID if instance == node { legacyID := fmt.Sprintf("%s-%d", node, vmid) diff --git a/internal/alerts/filter_evaluation_test.go b/internal/alerts/filter_evaluation_test.go index 892212517..314de3a78 100644 --- a/internal/alerts/filter_evaluation_test.go +++ b/internal/alerts/filter_evaluation_test.go @@ -1595,6 +1595,45 @@ func TestGetGuestThresholds(t *testing.T) { } }) + t.Run("legacy ID migration for clustered instance-vmid format", func(t *testing.T) { + canonicalID := BuildGuestKey("pve1", "node1", 100) + m := &Manager{ + config: AlertConfig{ + GuestDefaults: ThresholdConfig{}, + Overrides: map[string]ThresholdConfig{ + // Legacy cluster format: instance-vmid + "pve1-100": { + CPU: &HysteresisThreshold{Trigger: 65, Clear: 60}, + }, + }, + CustomRules: []CustomAlertRule{}, + }, + } + + vm := models.VM{ + Name: "test-vm", + Node: "node1", + Instance: "pve1", + VMID: 100, + } + + result := m.getGuestThresholds(vm, canonicalID) + + if result.CPU == nil { + t.Fatal("CPU threshold should not be nil after cluster legacy migration") + } + if result.CPU.Trigger != 65 { + t.Errorf("expected CPU trigger 65 from migrated cluster legacy override, got %v", result.CPU.Trigger) + } + + if _, exists := m.config.Overrides[canonicalID]; !exists { + t.Error("override should be migrated to canonical guest ID") + } + if _, exists := m.config.Overrides["pve1-100"]; exists { + t.Error("old cluster legacy override should be removed after migration") + } + }) + t.Run("works with container type", func(t *testing.T) { m := &Manager{ config: AlertConfig{