Fix guest override migration for canonical IDs (#1334)

This commit is contained in:
rcourtman 2026-03-25 10:13:10 +00:00
parent ab85c5a936
commit b5ee2c1f98
4 changed files with 121 additions and 25 deletions

View file

@ -9516,51 +9516,36 @@ func (m *Manager) LoadActiveAlerts() error {
seen := make(map[string]bool)
for _, alert := range alerts {
// Migrate legacy guest alert IDs (instance-node-VMID -> instance-VMID)
// Migrate legacy guest alert IDs to the canonical guest format.
// Check if this is a guest-related alert by looking at common alert types
isGuestAlert := strings.Contains(alert.Type, "cpu") || strings.Contains(alert.Type, "memory") ||
strings.Contains(alert.Type, "disk") || strings.Contains(alert.Type, "network") ||
alert.Type == "guest-offline"
if isGuestAlert {
// Try to extract instance, node, and VMID from resource ID
// Legacy format: instance-node-VMID or node-VMID (standalone)
parts := strings.Split(alert.ResourceID, "-")
// Check if this looks like a legacy format (has node in the ID)
// We can detect this if we have Node field and it appears in the ResourceID
if alert.Node != "" && len(parts) >= 2 {
var newResourceID string
oldResourceID := alert.ResourceID
// Try to extract VMID (should be last part)
vmidStr := parts[len(parts)-1]
if _, err := strconv.Atoi(vmidStr); err == nil {
// VMID is valid, now check if we need to migrate
if len(parts) == 3 && alert.Instance != "" && alert.Instance != alert.Node {
// Format: instance-node-VMID -> instance-VMID
newResourceID = fmt.Sprintf("%s-%s", alert.Instance, vmidStr)
} else if len(parts) == 2 && alert.Instance == alert.Node {
// Format: node-VMID -> instance-VMID (standalone)
newResourceID = fmt.Sprintf("%s-%s", alert.Instance, vmidStr)
}
vmid, _ := strconv.Atoi(vmidStr)
newResourceID = BuildGuestKey(alert.Instance, alert.Node, vmid)
if newResourceID != "" && newResourceID != alert.ResourceID {
if newResourceID != "" && newResourceID != oldResourceID {
log.Info().
Str("oldID", alert.ResourceID).
Str("oldID", oldResourceID).
Str("newID", newResourceID).
Str("alertType", alert.Type).
Msg("Migrating active alert from legacy guest ID format")
oldAlertID := alert.ID
// Update resource ID
alert.ResourceID = newResourceID
// Update alert ID (usually contains resource ID)
alert.ID = strings.Replace(alert.ID, alert.ResourceID, newResourceID, 1)
// Update seen map to use new ID
seen[alert.ID] = true
delete(seen, oldAlertID)
alert.ID = strings.Replace(alert.ID, oldResourceID, newResourceID, 1)
}
}
}

View file

@ -16831,6 +16831,62 @@ func TestLoadActiveAlerts(t *testing.T) {
t.Errorf("first alert should win, got type %s", alert.Type)
}
})
t.Run("migrates legacy guest alert resource IDs to canonical format", func(t *testing.T) {
m := newTestManager(t)
m.ClearActiveAlerts()
alertsDir := filepath.Join(utils.GetDataDir(), "alerts")
if err := os.MkdirAll(alertsDir, 0755); err != nil {
t.Fatalf("failed to create alerts dir: %v", err)
}
startTime := time.Now().Add(-10 * time.Minute)
legacyResourceID := "pve1-100"
canonicalResourceID := BuildGuestKey("pve1", "node1", 100)
alerts := []Alert{
{
ID: legacyResourceID + "-cpu",
Type: "cpu",
Level: AlertLevelWarning,
ResourceID: legacyResourceID,
ResourceName: "test-vm",
Node: "node1",
Instance: "pve1",
StartTime: startTime,
LastSeen: time.Now(),
},
}
data, _ := json.Marshal(alerts)
alertsFile := filepath.Join(alertsDir, "active-alerts.json")
if err := os.WriteFile(alertsFile, data, 0644); err != nil {
t.Fatalf("failed to write alerts json: %v", err)
}
err := m.LoadActiveAlerts()
if err != nil {
t.Fatalf("failed to load alerts: %v", err)
}
m.mu.RLock()
alert, exists := m.activeAlerts[canonicalResourceID+"-cpu"]
_, oldExists := m.activeAlerts[legacyResourceID+"-cpu"]
m.mu.RUnlock()
if !exists {
t.Fatal("expected canonical guest alert to be loaded")
}
if oldExists {
t.Fatal("expected legacy guest alert ID to be replaced")
}
if alert.ResourceID != canonicalResourceID {
t.Fatalf("expected resource ID %q, got %q", canonicalResourceID, alert.ResourceID)
}
if alert.ID != canonicalResourceID+"-cpu" {
t.Fatalf("expected alert ID %q, got %q", canonicalResourceID+"-cpu", alert.ID)
}
})
}
func TestNamespaceMatchesInstance(t *testing.T) {

View file

@ -268,7 +268,7 @@ func (m *Manager) getGuestThresholds(guest interface{}, guestID string) Threshol
}
// Finally check guest-specific overrides (highest priority)
// First try the new stable ID format (instance-VMID)
// First try the current canonical ID format (instance:node:vmid)
override, exists := m.config.Overrides[guestID]
// If not found, try legacy ID formats for migration
@ -331,8 +331,8 @@ func (m *Manager) getGuestThresholds(guest interface{}, guestID string) Threshol
return thresholds
}
// tryLegacyOverrideMigration attempts to find and migrate legacy override formats.
// Returns the override and true if found, or zero value and false otherwise.
// tryLegacyOverrideMigration attempts to find and migrate legacy override formats
// into the canonical guest ID format (instance:node:vmid).
func (m *Manager) tryLegacyOverrideMigration(guest interface{}, guestID string) (ThresholdConfig, bool) {
var node string
var vmid int
@ -370,6 +370,22 @@ func (m *Manager) tryLegacyOverrideMigration(guest interface{}, guestID string)
}
}
// Try legacy clustered format: instance-VMID
if instance != "" {
legacyID := fmt.Sprintf("%s-%d", instance, vmid)
if legacyOverride, legacyExists := m.config.Overrides[legacyID]; legacyExists {
log.Info().
Str("legacyID", legacyID).
Str("newID", guestID).
Msg("Migrating guest override from legacy cluster ID format")
m.config.Overrides[guestID] = legacyOverride
delete(m.config.Overrides, legacyID)
return legacyOverride, true
}
}
// Try standalone format: node-VMID
if instance == node {
legacyID := fmt.Sprintf("%s-%d", node, vmid)

View file

@ -1595,6 +1595,45 @@ func TestGetGuestThresholds(t *testing.T) {
}
})
t.Run("legacy ID migration for clustered instance-vmid format", func(t *testing.T) {
canonicalID := BuildGuestKey("pve1", "node1", 100)
m := &Manager{
config: AlertConfig{
GuestDefaults: ThresholdConfig{},
Overrides: map[string]ThresholdConfig{
// Legacy cluster format: instance-vmid
"pve1-100": {
CPU: &HysteresisThreshold{Trigger: 65, Clear: 60},
},
},
CustomRules: []CustomAlertRule{},
},
}
vm := models.VM{
Name: "test-vm",
Node: "node1",
Instance: "pve1",
VMID: 100,
}
result := m.getGuestThresholds(vm, canonicalID)
if result.CPU == nil {
t.Fatal("CPU threshold should not be nil after cluster legacy migration")
}
if result.CPU.Trigger != 65 {
t.Errorf("expected CPU trigger 65 from migrated cluster legacy override, got %v", result.CPU.Trigger)
}
if _, exists := m.config.Overrides[canonicalID]; !exists {
t.Error("override should be migrated to canonical guest ID")
}
if _, exists := m.config.Overrides["pve1-100"]; exists {
t.Error("old cluster legacy override should be removed after migration")
}
})
t.Run("works with container type", func(t *testing.T) {
m := &Manager{
config: AlertConfig{