Stabilize guest threshold overrides across node moves (#1334)
Some checks are pending
Build and Test / Secret Scan (push) Waiting to run
Build and Test / Frontend & Backend (push) Waiting to run
Core E2E Tests / Playwright Core E2E (push) Waiting to run

This commit is contained in:
rcourtman 2026-03-31 23:18:19 +01:00
parent 70a15e0801
commit 31753e5536
6 changed files with 332 additions and 21 deletions

View file

@ -486,9 +486,12 @@ export const extractTriggerValues = (
export const normalizeRawOverrideConfigKeys = (
rawOverrides: Record<string, RawOverrideConfig>,
storage: Storage[] = [],
guests: Array<Pick<VM, 'id' | 'instance' | 'node' | 'vmid'> | Pick<Container, 'id' | 'instance' | 'node' | 'vmid'>> = [],
): Record<string, RawOverrideConfig> => {
const normalized: Record<string, RawOverrideConfig> = {};
const sharedStorageLegacyKeyMap = new Map<string, string>();
const guestLegacyKeyMap = new Map<string, string>();
const guestLegacyMatchers: Array<{ canonicalID: string; instance: string; vmid: number }> = [];
storage.forEach((entry) => {
if (!entry.shared || !entry.id || !entry.name) {
@ -504,6 +507,21 @@ export const normalizeRawOverrideConfigKeys = (
});
});
guests.forEach((guest) => {
const instance = guest.instance?.trim() || guest.node?.trim() || '';
const node = guest.node?.trim() || '';
const canonicalID = guest.id || `${instance}:${node}:${guest.vmid}`;
if (!instance || !node || !guest.vmid || !canonicalID) {
return;
}
guestLegacyKeyMap.set(`${instance}-${guest.vmid}`, canonicalID);
if (instance !== node) {
guestLegacyKeyMap.set(`${instance}-${node}-${guest.vmid}`, canonicalID);
guestLegacyMatchers.push({ canonicalID, instance, vmid: guest.vmid });
}
});
for (const [rawKey, value] of Object.entries(rawOverrides || {})) {
let key = rawKey;
@ -522,6 +540,18 @@ export const normalizeRawOverrideConfigKeys = (
key = sharedStorageKey;
}
const guestKey = guestLegacyKeyMap.get(key);
if (guestKey) {
key = guestKey;
} else {
const matchedGuest = guestLegacyMatchers.find(({ instance, vmid }) =>
key.startsWith(`${instance}-`) && key.endsWith(`-${vmid}`),
);
if (matchedGuest) {
key = matchedGuest.canonicalID;
}
}
normalized[key] = value;
}
@ -689,7 +719,11 @@ export function Alerts() {
createEffect(() => {
const currentRawOverrides = rawOverridesConfig();
const normalized = normalizeRawOverrideConfigKeys(currentRawOverrides, state.storage || []);
const normalized = normalizeRawOverrideConfigKeys(
currentRawOverrides,
state.storage || [],
[...(state.vms || []), ...(state.containers || [])],
);
if (JSON.stringify(normalized) !== JSON.stringify(currentRawOverrides)) {
setRawOverridesConfig(normalized);
}
@ -1304,7 +1338,13 @@ export function Alerts() {
setDisableAllPMGOffline(config.disableAllPMGOffline ?? false);
setDisableAllDockerHostsOffline(config.disableAllDockerHostsOffline ?? false);
setRawOverridesConfig(normalizeRawOverrideConfigKeys(config.overrides || {}, state.storage || []));
setRawOverridesConfig(
normalizeRawOverrideConfigKeys(
config.overrides || {},
state.storage || [],
[...(state.vms || []), ...(state.containers || [])],
),
);
if (config.schedule) {
if (config.schedule.quietHours) {

View file

@ -227,4 +227,38 @@ describe('threshold helper utilities', () => {
},
});
});
it('maps stable clustered guest override keys onto the current canonical guest id', () => {
  // The override is stored under the node-independent "instance-vmid" key, while the
  // guest list reports the same guest under its current canonical id.
  const storedOverrides = {
    'Main-101': {
      cpu: { trigger: 95, clear: 90 },
    },
  };
  const clusteredGuest = {
    id: 'Main:node2:101',
    instance: 'Main',
    node: 'node2',
    vmid: 101,
  } as any;

  const normalized = normalizeRawOverrideConfigKeys(storedOverrides, [], [clusteredGuest]);

  // The override must come back keyed by the guest's canonical id, values untouched.
  expect(normalized).toEqual({
    'Main:node2:101': {
      cpu: { trigger: 95, clear: 90 },
    },
  });
});
it('maps legacy clustered instance-node-vmid guest override keys onto the current canonical guest id', () => {
  // The stored key names node1, but the guest list reports the guest on node2.
  const storedOverrides = {
    'Main-node1-101': {
      memory: { trigger: 96, clear: 91 },
    },
  };
  const clusteredGuest = {
    id: 'Main:node2:101',
    instance: 'Main',
    node: 'node2',
    vmid: 101,
  } as any;

  const normalized = normalizeRawOverrideConfigKeys(storedOverrides, [], [clusteredGuest]);

  // The legacy key must be rewritten to the guest's current canonical id.
  expect(normalized).toEqual({
    'Main:node2:101': {
      memory: { trigger: 96, clear: 91 },
    },
  });
});
});

View file

@ -1553,7 +1553,14 @@ func normalizeOverrides(overrides map[string]ThresholdConfig) {
if override.Usage != nil {
override.Usage = ensureHysteresisThreshold(override.Usage)
}
overrides[id] = override
normalizedKey := id
if ident, ok := parseCanonicalGuestKey(id); ok && ident.instance != ident.node {
normalizedKey = stableGuestOverrideKey(ident.instance, ident.node, ident.vmid)
}
if normalizedKey != id {
delete(overrides, id)
}
overrides[normalizedKey] = override
}
}
@ -1892,7 +1899,7 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
// The next poll cycle will properly evaluate them with custom rules.
// Check if there's an override for this specific guest
if override, exists := m.config.Overrides[resourceID]; exists {
if override, exists := m.lookupGuestOverride(nil, resourceID); exists {
if override.Disabled {
// Alert is now disabled for this resource, resolve it
alertsToResolve = append(alertsToResolve, alertID)
@ -8874,7 +8881,7 @@ func (m *Manager) checkGuestPoweredOff(guestID, name, node, instanceName, guestT
// Get thresholds to check if powered-off alerts are disabled
var thresholds ThresholdConfig
if override, exists := m.config.Overrides[guestID]; exists {
if override, exists := m.lookupGuestOverride(nil, guestID); exists {
thresholds = override
} else {
thresholds = m.config.GuestDefaults

View file

@ -2,6 +2,7 @@ package alerts
import (
"fmt"
"strconv"
"strings"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
@ -188,6 +189,152 @@ func (m *Manager) evaluateFilterStack(guest interface{}, stack FilterStack) bool
return false
}
// guestOverrideIdentity bundles the three values that guest override keys are
// built from.
type guestOverrideIdentity struct {
	// instance and node are the names used as the first two components of a
	// guest key.
	instance, node string
	// vmid is the numeric guest ID used as the last component of a guest key.
	vmid int
}
// stableGuestOverrideKey builds the node-independent override key
// "<instance>-<vmid>". Surrounding whitespace is ignored, and when the
// instance name is blank the node name takes its place.
func stableGuestOverrideKey(instance, node string, vmid int) string {
	owner := strings.TrimSpace(instance)
	if owner == "" {
		owner = strings.TrimSpace(node)
	}
	return fmt.Sprintf("%s-%d", owner, vmid)
}
// parseCanonicalGuestKey splits a canonical guest key of the form
// "instance:node:vmid" into its components.
//
// The second result is false when the key does not have exactly three
// colon-separated parts, when the last part is not a positive integer, or when
// the node part is blank. A blank instance part falls back to the node name.
// Whitespace around the whole key and around the instance and node parts is
// ignored.
func parseCanonicalGuestKey(guestID string) (guestOverrideIdentity, bool) {
	parts := strings.Split(strings.TrimSpace(guestID), ":")
	if len(parts) != 3 {
		return guestOverrideIdentity{}, false
	}

	// Reject non-positive VMIDs so that key-derived identities obey the same
	// rule as identities derived from guest models (VMID must be > 0).
	vmid, err := strconv.Atoi(parts[2])
	if err != nil || vmid <= 0 {
		return guestOverrideIdentity{}, false
	}

	instance := strings.TrimSpace(parts[0])
	node := strings.TrimSpace(parts[1])
	if instance == "" {
		instance = node
	}
	if instance == "" || node == "" {
		return guestOverrideIdentity{}, false
	}

	return guestOverrideIdentity{
		instance: instance,
		node:     node,
		vmid:     vmid,
	}, true
}
// guestOverrideIdentityFromGuestOrID derives the override identity for a guest.
//
// When guest is a models.VM or models.Container value, the identity is taken
// from its Instance, Node and VMID fields. For any other value (including nil)
// the identity is parsed from guestID via parseCanonicalGuestKey. The second
// result is false when no valid identity can be produced.
func guestOverrideIdentityFromGuestOrID(guest interface{}, guestID string) (guestOverrideIdentity, bool) {
	switch g := guest.(type) {
	case models.VM:
		return buildGuestOverrideIdentity(g.Instance, g.Node, g.VMID)
	case models.Container:
		return buildGuestOverrideIdentity(g.Instance, g.Node, g.VMID)
	default:
		return parseCanonicalGuestKey(guestID)
	}
}

// buildGuestOverrideIdentity validates raw model fields and turns them into an
// identity. Names are trimmed, a blank instance falls back to the node name,
// and the result is rejected when the node is blank or the VMID is not positive.
func buildGuestOverrideIdentity(instance, node string, vmid int) (guestOverrideIdentity, bool) {
	instance = strings.TrimSpace(instance)
	node = strings.TrimSpace(node)
	if instance == "" {
		instance = node
	}
	if instance == "" || node == "" || vmid <= 0 {
		return guestOverrideIdentity{}, false
	}
	return guestOverrideIdentity{
		instance: instance,
		node:     node,
		vmid:     vmid,
	}, true
}
// isCanonicalGuestOverrideKey reports whether key parses as a canonical guest
// key whose instance and VMID equal those of ident. The node component of the
// key is deliberately not compared.
func isCanonicalGuestOverrideKey(key string, ident guestOverrideIdentity) bool {
	if candidate, ok := parseCanonicalGuestKey(key); ok {
		return candidate.vmid == ident.vmid && candidate.instance == ident.instance
	}
	return false
}
// lookupGuestOverride finds the threshold override that applies to a guest and
// reports whether one exists.
//
// Resolution order:
//  1. An entry stored under guestID itself. If the guest's instance and node
//     names differ and guestID is not already the stable "instance-vmid" key,
//     the entry is moved to the stable key.
//  2. An entry stored under the stable "instance-vmid" key.
//  3. When instance and node differ, an entry under the legacy
//     "instance-node-vmid" key, which is moved to the stable key.
//  4. When instance and node differ, an entry under a canonical
//     "instance:node:vmid" key for the same instance and VMID on any node,
//     which is moved to the stable key.
//
// guest may be nil; the identity is then parsed from guestID.
//
// NOTE(review): this method may write to m.config.Overrides; callers presumably
// hold the manager's write lock — confirm at each call site.
func (m *Manager) lookupGuestOverride(guest interface{}, guestID string) (ThresholdConfig, bool) {
	guestID = strings.TrimSpace(guestID)
	ident, identified := guestOverrideIdentityFromGuestOrID(guest, guestID)

	if guestID != "" {
		if override, exists := m.config.Overrides[guestID]; exists {
			if identified && ident.instance != ident.node {
				// Cluster guest overrides should live under the stable instance-vmid key so
				// they survive node moves. Preserve the override in-memory immediately.
				if stableKey := stableGuestOverrideKey(ident.instance, ident.node, ident.vmid); guestID != stableKey {
					m.config.Overrides[stableKey] = override
					delete(m.config.Overrides, guestID)
				}
			}
			return override, true
		}
	}

	if !identified {
		return ThresholdConfig{}, false
	}

	stableKey := stableGuestOverrideKey(ident.instance, ident.node, ident.vmid)
	if override, exists := m.config.Overrides[stableKey]; exists {
		return override, true
	}

	// The remaining fallbacks only apply when instance and node names differ.
	if ident.instance == ident.node {
		return ThresholdConfig{}, false
	}

	legacyKey := fmt.Sprintf("%s-%s-%d", ident.instance, ident.node, ident.vmid)
	if legacyOverride, legacyExists := m.config.Overrides[legacyKey]; legacyExists {
		log.Info().
			Str("legacyID", legacyKey).
			Str("newID", stableKey).
			Msg("Migrating guest override from legacy cluster-node ID format")
		m.config.Overrides[stableKey] = legacyOverride
		delete(m.config.Overrides, legacyKey)
		return legacyOverride, true
	}

	// Map iteration order is unspecified, so when several node-bound canonical
	// keys match, always migrate the lexicographically smallest one. This keeps
	// the chosen override the same from run to run.
	nodeBoundKey, found := "", false
	for key := range m.config.Overrides {
		if !isCanonicalGuestOverrideKey(key, ident) {
			continue
		}
		if !found || key < nodeBoundKey {
			nodeBoundKey, found = key, true
		}
	}
	if !found {
		return ThresholdConfig{}, false
	}

	override := m.config.Overrides[nodeBoundKey]
	m.config.Overrides[stableKey] = override
	delete(m.config.Overrides, nodeBoundKey)
	log.Info().
		Str("legacyID", nodeBoundKey).
		Str("newID", stableKey).
		Msg("Migrating clustered guest override from node-bound canonical key to stable key")
	return override, true
}
// getGuestThresholds returns the appropriate thresholds for a guest
// Priority: Guest-specific overrides > Custom rules (by priority) > Global defaults
func (m *Manager) getGuestThresholds(guest interface{}, guestID string) ThresholdConfig {
@ -267,14 +414,10 @@ func (m *Manager) getGuestThresholds(guest interface{}, guestID string) Threshol
Msg("Applied custom alert rule")
}
// Finally check guest-specific overrides (highest priority)
// First try the current canonical ID format (instance:node:vmid)
override, exists := m.config.Overrides[guestID]
// If not found, try legacy ID formats for migration
if !exists {
override, exists = m.tryLegacyOverrideMigration(guest, guestID)
}
// Finally check guest-specific overrides (highest priority).
// Cluster guest overrides should resolve via a stable instance-vmid key so they
// continue to apply after node moves. Older node-bound formats are migrated lazily.
override, exists := m.lookupGuestOverride(guest, guestID)
if exists {
// Apply the disabled flag if set

View file

@ -1544,8 +1544,8 @@ func TestGetGuestThresholds(t *testing.T) {
VMID: 100,
}
// Query with new format
result := m.getGuestThresholds(vm, "pve1-100")
// Query with the current canonical format
result := m.getGuestThresholds(vm, BuildGuestKey("pve1", "node1", 100))
if result.CPU == nil {
t.Fatal("CPU threshold should not be nil after legacy migration")
@ -1554,9 +1554,9 @@ func TestGetGuestThresholds(t *testing.T) {
t.Errorf("expected CPU trigger 60 from migrated legacy override, got %v", result.CPU.Trigger)
}
// Verify the override was migrated to new ID
// Verify the override was migrated to the stable clustered key
if _, exists := m.config.Overrides["pve1-100"]; !exists {
t.Error("override should be migrated to new ID format")
t.Error("override should be migrated to stable clustered key")
}
if _, exists := m.config.Overrides["pve1-node1-100"]; exists {
t.Error("old legacy override should be removed after migration")
@ -1626,11 +1626,50 @@ func TestGetGuestThresholds(t *testing.T) {
t.Errorf("expected CPU trigger 65 from migrated cluster legacy override, got %v", result.CPU.Trigger)
}
if _, exists := m.config.Overrides[canonicalID]; !exists {
t.Error("override should be migrated to canonical guest ID")
if _, exists := m.config.Overrides["pve1-100"]; !exists {
t.Error("stable clustered guest override should remain on instance-vmid key")
}
if _, exists := m.config.Overrides["pve1-100"]; exists {
t.Error("old cluster legacy override should be removed after migration")
if _, exists := m.config.Overrides[canonicalID]; exists {
t.Error("clustered guest override should not be migrated onto a node-bound canonical key")
}
})
t.Run("migrates clustered canonical override to stable key across node moves", func(t *testing.T) {
	// The override is stored under the guest's previous node (node1) while the
	// guest is now reported on node2.
	staleKey := BuildGuestKey("pve1", "node1", 100)
	liveKey := BuildGuestKey("pve1", "node2", 100)

	storedOverrides := map[string]ThresholdConfig{
		staleKey: {
			Memory: &HysteresisThreshold{Trigger: 92, Clear: 87},
		},
	}
	m := &Manager{
		config: AlertConfig{
			GuestDefaults: ThresholdConfig{},
			Overrides:     storedOverrides,
			CustomRules:   []CustomAlertRule{},
		},
	}
	movedVM := models.VM{
		Name:     "test-vm",
		Node:     "node2",
		Instance: "pve1",
		VMID:     100,
	}

	result := m.getGuestThresholds(movedVM, liveKey)

	// The override values must still apply after the node move.
	switch {
	case result.Memory == nil:
		t.Fatal("Memory threshold should not be nil after clustered canonical migration")
	case result.Memory.Trigger != 92:
		t.Errorf("expected Memory trigger 92 from migrated clustered canonical override, got %v", result.Memory.Trigger)
	}

	// The entry must now live under the stable key and no longer under the old one.
	_, onStableKey := m.config.Overrides["pve1-100"]
	if !onStableKey {
		t.Error("override should be migrated to stable clustered key")
	}
	_, onStaleKey := m.config.Overrides[staleKey]
	if onStaleKey {
		t.Error("old node-bound canonical override should be removed after migration")
	}
})

View file

@ -329,3 +329,51 @@ func TestReevaluateActiveAlertsGuestNotMisclassifiedAsNode(t *testing.T) {
t.Errorf("Guest alert should have been resolved when guest memory threshold is disabled, but it was misclassified as a node alert")
}
}
// TestReevaluateActiveAlertsGuestUsesStableClusterOverrideAcrossNodeMove seeds
// an active memory alert (value 88, default trigger 85) for a guest keyed by
// its node2 canonical ID, with an override stored under the stable key
// "pve1-101" that raises the trigger to 95. After the config is applied again
// the alert is expected to be gone from the active set.
func TestReevaluateActiveAlertsGuestUsesStableClusterOverrideAcrossNodeMove(t *testing.T) {
	mgr := NewManager()

	// Start from an empty active-alert set.
	mgr.mu.Lock()
	mgr.activeAlerts = make(map[string]*Alert)
	mgr.mu.Unlock()

	resourceID := BuildGuestKey("pve1", "node2", 101)
	alertID := resourceID + "-memory"

	cfg := AlertConfig{
		Enabled: true,
		GuestDefaults: ThresholdConfig{
			Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
		},
		NodeDefaults:   ThresholdConfig{},
		StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
		Overrides: map[string]ThresholdConfig{
			"pve1-101": {Memory: &HysteresisThreshold{Trigger: 95, Clear: 90}},
		},
	}
	mgr.UpdateConfig(cfg)

	// Inject an alert that exceeds the default trigger but not the override.
	seeded := &Alert{
		ID:         alertID,
		Type:       "memory",
		Level:      AlertLevelWarning,
		ResourceID: resourceID,
		Node:       "node2",
		Instance:   "pve1",
		Value:      88,
		Threshold:  85,
		StartTime:  time.Now().Add(-5 * time.Minute),
		LastSeen:   time.Now(),
	}
	mgr.mu.Lock()
	mgr.activeAlerts[alertID] = seeded
	mgr.mu.Unlock()

	// Re-apply the same config with the alert in place.
	mgr.UpdateConfig(cfg)

	// NOTE(review): presumably gives any asynchronous work started by
	// UpdateConfig time to finish — confirm.
	time.Sleep(100 * time.Millisecond)

	mgr.mu.RLock()
	_, stillActive := mgr.activeAlerts[alertID]
	mgr.mu.RUnlock()

	if stillActive {
		t.Fatalf("expected guest alert to be resolved when stable clustered override threshold is above current value")
	}
}