mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 03:20:11 +00:00
Stabilize guest threshold overrides across node moves (#1334)
This commit is contained in:
parent
70a15e0801
commit
31753e5536
6 changed files with 332 additions and 21 deletions
|
|
@ -486,9 +486,12 @@ export const extractTriggerValues = (
|
|||
export const normalizeRawOverrideConfigKeys = (
|
||||
rawOverrides: Record<string, RawOverrideConfig>,
|
||||
storage: Storage[] = [],
|
||||
guests: Array<Pick<VM, 'id' | 'instance' | 'node' | 'vmid'> | Pick<Container, 'id' | 'instance' | 'node' | 'vmid'>> = [],
|
||||
): Record<string, RawOverrideConfig> => {
|
||||
const normalized: Record<string, RawOverrideConfig> = {};
|
||||
const sharedStorageLegacyKeyMap = new Map<string, string>();
|
||||
const guestLegacyKeyMap = new Map<string, string>();
|
||||
const guestLegacyMatchers: Array<{ canonicalID: string; instance: string; vmid: number }> = [];
|
||||
|
||||
storage.forEach((entry) => {
|
||||
if (!entry.shared || !entry.id || !entry.name) {
|
||||
|
|
@ -504,6 +507,21 @@ export const normalizeRawOverrideConfigKeys = (
|
|||
});
|
||||
});
|
||||
|
||||
guests.forEach((guest) => {
|
||||
const instance = guest.instance?.trim() || guest.node?.trim() || '';
|
||||
const node = guest.node?.trim() || '';
|
||||
const canonicalID = guest.id || `${instance}:${node}:${guest.vmid}`;
|
||||
if (!instance || !node || !guest.vmid || !canonicalID) {
|
||||
return;
|
||||
}
|
||||
|
||||
guestLegacyKeyMap.set(`${instance}-${guest.vmid}`, canonicalID);
|
||||
if (instance !== node) {
|
||||
guestLegacyKeyMap.set(`${instance}-${node}-${guest.vmid}`, canonicalID);
|
||||
guestLegacyMatchers.push({ canonicalID, instance, vmid: guest.vmid });
|
||||
}
|
||||
});
|
||||
|
||||
for (const [rawKey, value] of Object.entries(rawOverrides || {})) {
|
||||
let key = rawKey;
|
||||
|
||||
|
|
@ -522,6 +540,18 @@ export const normalizeRawOverrideConfigKeys = (
|
|||
key = sharedStorageKey;
|
||||
}
|
||||
|
||||
const guestKey = guestLegacyKeyMap.get(key);
|
||||
if (guestKey) {
|
||||
key = guestKey;
|
||||
} else {
|
||||
const matchedGuest = guestLegacyMatchers.find(({ instance, vmid }) =>
|
||||
key.startsWith(`${instance}-`) && key.endsWith(`-${vmid}`),
|
||||
);
|
||||
if (matchedGuest) {
|
||||
key = matchedGuest.canonicalID;
|
||||
}
|
||||
}
|
||||
|
||||
normalized[key] = value;
|
||||
}
|
||||
|
||||
|
|
@ -689,7 +719,11 @@ export function Alerts() {
|
|||
|
||||
createEffect(() => {
|
||||
const currentRawOverrides = rawOverridesConfig();
|
||||
const normalized = normalizeRawOverrideConfigKeys(currentRawOverrides, state.storage || []);
|
||||
const normalized = normalizeRawOverrideConfigKeys(
|
||||
currentRawOverrides,
|
||||
state.storage || [],
|
||||
[...(state.vms || []), ...(state.containers || [])],
|
||||
);
|
||||
if (JSON.stringify(normalized) !== JSON.stringify(currentRawOverrides)) {
|
||||
setRawOverridesConfig(normalized);
|
||||
}
|
||||
|
|
@ -1304,7 +1338,13 @@ export function Alerts() {
|
|||
setDisableAllPMGOffline(config.disableAllPMGOffline ?? false);
|
||||
setDisableAllDockerHostsOffline(config.disableAllDockerHostsOffline ?? false);
|
||||
|
||||
setRawOverridesConfig(normalizeRawOverrideConfigKeys(config.overrides || {}, state.storage || []));
|
||||
setRawOverridesConfig(
|
||||
normalizeRawOverrideConfigKeys(
|
||||
config.overrides || {},
|
||||
state.storage || [],
|
||||
[...(state.vms || []), ...(state.containers || [])],
|
||||
),
|
||||
);
|
||||
|
||||
if (config.schedule) {
|
||||
if (config.schedule.quietHours) {
|
||||
|
|
|
|||
|
|
@ -227,4 +227,38 @@ describe('threshold helper utilities', () => {
|
|||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('maps stable clustered guest override keys onto the current canonical guest id', () => {
  // The guest currently lives on node2; the stored override uses the
  // node-independent "<instance>-<vmid>" key.
  const currentGuest = {
    id: 'Main:node2:101',
    instance: 'Main',
    node: 'node2',
    vmid: 101,
  } as any;

  const normalized = normalizeRawOverrideConfigKeys(
    { 'Main-101': { cpu: { trigger: 95, clear: 90 } } },
    [],
    [currentGuest],
  );

  // The override must be re-keyed under the guest's canonical id, unchanged.
  expect(normalized).toEqual({
    'Main:node2:101': { cpu: { trigger: 95, clear: 90 } },
  });
});
|
||||
|
||||
it('maps legacy clustered instance-node-vmid guest override keys onto the current canonical guest id', () => {
  // The stored key still names node1, while the guest now reports node2, so
  // the key can only match by its instance prefix and vmid suffix.
  const currentGuest = {
    id: 'Main:node2:101',
    instance: 'Main',
    node: 'node2',
    vmid: 101,
  } as any;

  const normalized = normalizeRawOverrideConfigKeys(
    { 'Main-node1-101': { memory: { trigger: 96, clear: 91 } } },
    [],
    [currentGuest],
  );

  // The override must be re-keyed under the guest's canonical id, unchanged.
  expect(normalized).toEqual({
    'Main:node2:101': { memory: { trigger: 96, clear: 91 } },
  });
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1553,7 +1553,14 @@ func normalizeOverrides(overrides map[string]ThresholdConfig) {
|
|||
if override.Usage != nil {
|
||||
override.Usage = ensureHysteresisThreshold(override.Usage)
|
||||
}
|
||||
overrides[id] = override
|
||||
normalizedKey := id
|
||||
if ident, ok := parseCanonicalGuestKey(id); ok && ident.instance != ident.node {
|
||||
normalizedKey = stableGuestOverrideKey(ident.instance, ident.node, ident.vmid)
|
||||
}
|
||||
if normalizedKey != id {
|
||||
delete(overrides, id)
|
||||
}
|
||||
overrides[normalizedKey] = override
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1892,7 +1899,7 @@ func (m *Manager) reevaluateActiveAlertsLocked() {
|
|||
// The next poll cycle will properly evaluate them with custom rules.
|
||||
|
||||
// Check if there's an override for this specific guest
|
||||
if override, exists := m.config.Overrides[resourceID]; exists {
|
||||
if override, exists := m.lookupGuestOverride(nil, resourceID); exists {
|
||||
if override.Disabled {
|
||||
// Alert is now disabled for this resource, resolve it
|
||||
alertsToResolve = append(alertsToResolve, alertID)
|
||||
|
|
@ -8874,7 +8881,7 @@ func (m *Manager) checkGuestPoweredOff(guestID, name, node, instanceName, guestT
|
|||
|
||||
// Get thresholds to check if powered-off alerts are disabled
|
||||
var thresholds ThresholdConfig
|
||||
if override, exists := m.config.Overrides[guestID]; exists {
|
||||
if override, exists := m.lookupGuestOverride(nil, guestID); exists {
|
||||
thresholds = override
|
||||
} else {
|
||||
thresholds = m.config.GuestDefaults
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package alerts
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/rcourtman/pulse-go-rewrite/internal/models"
|
||||
|
|
@ -188,6 +189,152 @@ func (m *Manager) evaluateFilterStack(guest interface{}, stack FilterStack) bool
|
|||
return false
|
||||
}
|
||||
|
||||
// guestOverrideIdentity holds the pieces of a guest's identity that the
// override lookup helpers use to build and compare override keys.
type guestOverrideIdentity struct {
	// instance and node are stored whitespace-trimmed by the helpers that
	// build this value; instance falls back to node when it is blank.
	instance, node string
	// vmid is the guest's numeric ID.
	vmid int
}
|
||||
|
||||
// stableGuestOverrideKey returns the node-independent override key
// "<instance>-<vmid>" for a guest. Both names are whitespace-trimmed, and node
// is only used as the prefix when instance is blank.
func stableGuestOverrideKey(instance, node string, vmid int) string {
	prefix := strings.TrimSpace(instance)
	if prefix == "" {
		// No instance name available: fall back to the node name.
		prefix = strings.TrimSpace(node)
	}
	return fmt.Sprintf("%s-%d", prefix, vmid)
}
|
||||
|
||||
func parseCanonicalGuestKey(guestID string) (guestOverrideIdentity, bool) {
|
||||
parts := strings.Split(strings.TrimSpace(guestID), ":")
|
||||
if len(parts) != 3 {
|
||||
return guestOverrideIdentity{}, false
|
||||
}
|
||||
|
||||
vmid, err := strconv.Atoi(parts[2])
|
||||
if err != nil {
|
||||
return guestOverrideIdentity{}, false
|
||||
}
|
||||
|
||||
instance := strings.TrimSpace(parts[0])
|
||||
node := strings.TrimSpace(parts[1])
|
||||
if instance == "" {
|
||||
instance = node
|
||||
}
|
||||
if instance == "" || node == "" {
|
||||
return guestOverrideIdentity{}, false
|
||||
}
|
||||
|
||||
return guestOverrideIdentity{
|
||||
instance: instance,
|
||||
node: node,
|
||||
vmid: vmid,
|
||||
}, true
|
||||
}
|
||||
|
||||
func guestOverrideIdentityFromGuestOrID(guest interface{}, guestID string) (guestOverrideIdentity, bool) {
|
||||
switch g := guest.(type) {
|
||||
case models.VM:
|
||||
instance := strings.TrimSpace(g.Instance)
|
||||
node := strings.TrimSpace(g.Node)
|
||||
if instance == "" {
|
||||
instance = node
|
||||
}
|
||||
if instance == "" || node == "" || g.VMID <= 0 {
|
||||
return guestOverrideIdentity{}, false
|
||||
}
|
||||
return guestOverrideIdentity{
|
||||
instance: instance,
|
||||
node: node,
|
||||
vmid: g.VMID,
|
||||
}, true
|
||||
case models.Container:
|
||||
instance := strings.TrimSpace(g.Instance)
|
||||
node := strings.TrimSpace(g.Node)
|
||||
if instance == "" {
|
||||
instance = node
|
||||
}
|
||||
if instance == "" || node == "" || g.VMID <= 0 {
|
||||
return guestOverrideIdentity{}, false
|
||||
}
|
||||
return guestOverrideIdentity{
|
||||
instance: instance,
|
||||
node: node,
|
||||
vmid: g.VMID,
|
||||
}, true
|
||||
default:
|
||||
return parseCanonicalGuestKey(guestID)
|
||||
}
|
||||
}
|
||||
|
||||
func isCanonicalGuestOverrideKey(key string, ident guestOverrideIdentity) bool {
|
||||
parsed, ok := parseCanonicalGuestKey(key)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
return parsed.instance == ident.instance && parsed.vmid == ident.vmid
|
||||
}
|
||||
|
||||
func (m *Manager) lookupGuestOverride(guest interface{}, guestID string) (ThresholdConfig, bool) {
|
||||
if guestID = strings.TrimSpace(guestID); guestID != "" {
|
||||
if override, exists := m.config.Overrides[guestID]; exists {
|
||||
ident, ok := guestOverrideIdentityFromGuestOrID(guest, guestID)
|
||||
if !ok || ident.instance == ident.node {
|
||||
return override, true
|
||||
}
|
||||
|
||||
stableKey := stableGuestOverrideKey(ident.instance, ident.node, ident.vmid)
|
||||
if guestID != stableKey {
|
||||
// Cluster guest overrides should live under the stable instance-vmid key so
|
||||
// they survive node moves. Preserve the override in-memory immediately.
|
||||
m.config.Overrides[stableKey] = override
|
||||
delete(m.config.Overrides, guestID)
|
||||
}
|
||||
return override, true
|
||||
}
|
||||
}
|
||||
|
||||
ident, ok := guestOverrideIdentityFromGuestOrID(guest, guestID)
|
||||
if !ok {
|
||||
return ThresholdConfig{}, false
|
||||
}
|
||||
|
||||
stableKey := stableGuestOverrideKey(ident.instance, ident.node, ident.vmid)
|
||||
if override, exists := m.config.Overrides[stableKey]; exists {
|
||||
return override, true
|
||||
}
|
||||
|
||||
if ident.instance != ident.node {
|
||||
legacyKey := fmt.Sprintf("%s-%s-%d", ident.instance, ident.node, ident.vmid)
|
||||
if legacyOverride, legacyExists := m.config.Overrides[legacyKey]; legacyExists {
|
||||
log.Info().
|
||||
Str("legacyID", legacyKey).
|
||||
Str("newID", stableKey).
|
||||
Msg("Migrating guest override from legacy cluster-node ID format")
|
||||
|
||||
m.config.Overrides[stableKey] = legacyOverride
|
||||
delete(m.config.Overrides, legacyKey)
|
||||
return legacyOverride, true
|
||||
}
|
||||
|
||||
for key, override := range m.config.Overrides {
|
||||
if !isCanonicalGuestOverrideKey(key, ident) {
|
||||
continue
|
||||
}
|
||||
m.config.Overrides[stableKey] = override
|
||||
delete(m.config.Overrides, key)
|
||||
|
||||
log.Info().
|
||||
Str("legacyID", key).
|
||||
Str("newID", stableKey).
|
||||
Msg("Migrating clustered guest override from node-bound canonical key to stable key")
|
||||
|
||||
return override, true
|
||||
}
|
||||
}
|
||||
|
||||
return ThresholdConfig{}, false
|
||||
}
|
||||
|
||||
// getGuestThresholds returns the appropriate thresholds for a guest
|
||||
// Priority: Guest-specific overrides > Custom rules (by priority) > Global defaults
|
||||
func (m *Manager) getGuestThresholds(guest interface{}, guestID string) ThresholdConfig {
|
||||
|
|
@ -267,14 +414,10 @@ func (m *Manager) getGuestThresholds(guest interface{}, guestID string) Threshol
|
|||
Msg("Applied custom alert rule")
|
||||
}
|
||||
|
||||
// Finally check guest-specific overrides (highest priority)
|
||||
// First try the current canonical ID format (instance:node:vmid)
|
||||
override, exists := m.config.Overrides[guestID]
|
||||
|
||||
// If not found, try legacy ID formats for migration
|
||||
if !exists {
|
||||
override, exists = m.tryLegacyOverrideMigration(guest, guestID)
|
||||
}
|
||||
// Finally check guest-specific overrides (highest priority).
|
||||
// Cluster guest overrides should resolve via a stable instance-vmid key so they
|
||||
// continue to apply after node moves. Older node-bound formats are migrated lazily.
|
||||
override, exists := m.lookupGuestOverride(guest, guestID)
|
||||
|
||||
if exists {
|
||||
// Apply the disabled flag if set
|
||||
|
|
|
|||
|
|
@ -1544,8 +1544,8 @@ func TestGetGuestThresholds(t *testing.T) {
|
|||
VMID: 100,
|
||||
}
|
||||
|
||||
// Query with new format
|
||||
result := m.getGuestThresholds(vm, "pve1-100")
|
||||
// Query with the current canonical format
|
||||
result := m.getGuestThresholds(vm, BuildGuestKey("pve1", "node1", 100))
|
||||
|
||||
if result.CPU == nil {
|
||||
t.Fatal("CPU threshold should not be nil after legacy migration")
|
||||
|
|
@ -1554,9 +1554,9 @@ func TestGetGuestThresholds(t *testing.T) {
|
|||
t.Errorf("expected CPU trigger 60 from migrated legacy override, got %v", result.CPU.Trigger)
|
||||
}
|
||||
|
||||
// Verify the override was migrated to new ID
|
||||
// Verify the override was migrated to the stable clustered key
|
||||
if _, exists := m.config.Overrides["pve1-100"]; !exists {
|
||||
t.Error("override should be migrated to new ID format")
|
||||
t.Error("override should be migrated to stable clustered key")
|
||||
}
|
||||
if _, exists := m.config.Overrides["pve1-node1-100"]; exists {
|
||||
t.Error("old legacy override should be removed after migration")
|
||||
|
|
@ -1626,11 +1626,50 @@ func TestGetGuestThresholds(t *testing.T) {
|
|||
t.Errorf("expected CPU trigger 65 from migrated cluster legacy override, got %v", result.CPU.Trigger)
|
||||
}
|
||||
|
||||
if _, exists := m.config.Overrides[canonicalID]; !exists {
|
||||
t.Error("override should be migrated to canonical guest ID")
|
||||
if _, exists := m.config.Overrides["pve1-100"]; !exists {
|
||||
t.Error("stable clustered guest override should remain on instance-vmid key")
|
||||
}
|
||||
if _, exists := m.config.Overrides["pve1-100"]; exists {
|
||||
t.Error("old cluster legacy override should be removed after migration")
|
||||
if _, exists := m.config.Overrides[canonicalID]; exists {
|
||||
t.Error("clustered guest override should not be migrated onto a node-bound canonical key")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("migrates clustered canonical override to stable key across node moves", func(t *testing.T) {
	// The override was saved while the guest ran on node1; the guest is now
	// reported on node2 of the same instance.
	previousID := BuildGuestKey("pve1", "node1", 100)
	movedID := BuildGuestKey("pve1", "node2", 100)

	overrides := map[string]ThresholdConfig{
		previousID: {
			Memory: &HysteresisThreshold{Trigger: 92, Clear: 87},
		},
	}
	m := &Manager{
		config: AlertConfig{
			GuestDefaults: ThresholdConfig{},
			Overrides:     overrides,
			CustomRules:   []CustomAlertRule{},
		},
	}

	movedVM := models.VM{
		Name:     "test-vm",
		Node:     "node2",
		Instance: "pve1",
		VMID:     100,
	}

	thresholds := m.getGuestThresholds(movedVM, movedID)

	// The override must still apply even though its key names the old node.
	if thresholds.Memory == nil {
		t.Fatal("Memory threshold should not be nil after clustered canonical migration")
	}
	if thresholds.Memory.Trigger != 92 {
		t.Errorf("expected Memory trigger 92 from migrated clustered canonical override, got %v", thresholds.Memory.Trigger)
	}

	// The lookup must also have re-keyed the override to the stable key.
	if _, migrated := m.config.Overrides["pve1-100"]; !migrated {
		t.Error("override should be migrated to stable clustered key")
	}
	if _, stale := m.config.Overrides[previousID]; stale {
		t.Error("old node-bound canonical override should be removed after migration")
	}
})
|
||||
|
||||
|
|
|
|||
|
|
@ -329,3 +329,51 @@ func TestReevaluateActiveAlertsGuestNotMisclassifiedAsNode(t *testing.T) {
|
|||
t.Errorf("Guest alert should have been resolved when guest memory threshold is disabled, but it was misclassified as a node alert")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReevaluateActiveAlertsGuestUsesStableClusterOverrideAcrossNodeMove(t *testing.T) {
|
||||
manager := NewManager()
|
||||
|
||||
manager.mu.Lock()
|
||||
manager.activeAlerts = make(map[string]*Alert)
|
||||
manager.mu.Unlock()
|
||||
|
||||
currentResourceID := BuildGuestKey("pve1", "node2", 101)
|
||||
alertID := currentResourceID + "-memory"
|
||||
|
||||
config := AlertConfig{
|
||||
Enabled: true,
|
||||
GuestDefaults: ThresholdConfig{
|
||||
Memory: &HysteresisThreshold{Trigger: 85, Clear: 80},
|
||||
},
|
||||
NodeDefaults: ThresholdConfig{},
|
||||
StorageDefault: HysteresisThreshold{Trigger: 85, Clear: 80},
|
||||
Overrides: map[string]ThresholdConfig{"pve1-101": {Memory: &HysteresisThreshold{Trigger: 95, Clear: 90}}},
|
||||
}
|
||||
manager.UpdateConfig(config)
|
||||
|
||||
manager.mu.Lock()
|
||||
manager.activeAlerts[alertID] = &Alert{
|
||||
ID: alertID,
|
||||
Type: "memory",
|
||||
Level: AlertLevelWarning,
|
||||
ResourceID: currentResourceID,
|
||||
Node: "node2",
|
||||
Instance: "pve1",
|
||||
Value: 88,
|
||||
Threshold: 85,
|
||||
StartTime: time.Now().Add(-5 * time.Minute),
|
||||
LastSeen: time.Now(),
|
||||
}
|
||||
manager.mu.Unlock()
|
||||
|
||||
manager.UpdateConfig(config)
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
manager.mu.RLock()
|
||||
_, exists := manager.activeAlerts[alertID]
|
||||
manager.mu.RUnlock()
|
||||
|
||||
if exists {
|
||||
t.Fatalf("expected guest alert to be resolved when stable clustered override threshold is above current value")
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue