Pulse/internal/monitoring/vm_memory_telemetry_guard.go
rcourtman c1fa084100
Some checks are pending
Build and Test / Secret Scan (push) Waiting to run
Build and Test / Frontend & Backend (push) Waiting to run
Helm CI / Lint and Render Chart (push) Waiting to run
Core E2E Tests / Playwright Core E2E (push) Waiting to run
Stabilize low-trust 100 percent VM memory across mixed sizes (#1319)
2026-03-27 11:50:51 +00:00

483 lines
13 KiB
Go

package monitoring
import (
"fmt"
"sort"
"strings"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rs/zerolog/log"
)
// Thresholds shared by the repeated-VM-memory detectors in this file.
const (
// minRunningVMsForRepeatedMemoryCheck is the minimum number of counted running
// qemu VMs; with fewer than this the detectors never flag a pattern.
minRunningVMsForRepeatedMemoryCheck = 4
// minRepeatedVMsForSuspicion is the minimum number of VMs that must share the
// repeated reading before it is considered suspicious.
minRepeatedVMsForSuspicion = 3
// minRepeatedMemoryShare is the minimum fraction (repeated / running) that the
// repeated reading must represent before it is considered suspicious.
minRepeatedMemoryShare = 0.60
// maxRepeatedMemorySampleNames caps how many VM names are collected as samples
// for a repeated reading.
maxRepeatedMemorySampleNames = 5
)
// repeatedVMMemoryUsage is the result produced by the repeated-memory detectors
// in this file. When a detector does not flag a pattern, at most runningCount
// is populated.
type repeatedVMMemoryUsage struct {
// suspicious is true only when the detector's thresholds were met.
suspicious bool
// signature is a string built from the repeated reading, its count and the
// source breakdown; it is compared between polls to avoid repeating a warning.
signature string
// pattern names the kind of repetition; the detectors in this file set it to
// "exact-used" or "low-trust-full-usage", or leave it empty.
pattern string
// runningCount is the number of running qemu VMs the detector counted.
runningCount int
// repeatedCount is the number of VMs that share the repeated reading.
repeatedCount int
// repeatedMemUsed is the shared memory-used value; it is logged as
// "repeatedMemoryUsedBytes" and stays zero for the full-usage pattern.
repeatedMemUsed int64
// sourceBreakdown lists "source:count" entries for the repeated VMs.
sourceBreakdown []string
// sampleVMNames holds a sorted, capped sample of the repeated VMs' names.
sampleVMNames []string
}
// repeatedMemoryGroup accumulates the VMs that share one memory-used value
// while a detector scans the VM list.
type repeatedMemoryGroup struct {
// count is the number of VMs observed with this value.
count int
// sources maps a memory source label to how many VMs in the group used it.
sources map[string]int
// names collects up to maxRepeatedMemorySampleNames VM names (or IDs when a
// name is blank) in the order they were encountered.
names []string
}
// filterVMsByInstance returns the VMs whose Instance field equals instanceName,
// in their original order. The returned slice is always non-nil and does not
// share storage with vms.
func filterVMsByInstance(vms []models.VM, instanceName string) []models.VM {
	matches := make([]models.VM, 0, len(vms))
	for idx := range vms {
		if vms[idx].Instance != instanceName {
			continue
		}
		matches = append(matches, vms[idx])
	}
	return matches
}
// detectRepeatedVMMemoryUsage reports whether a dominant share of the running
// qemu VMs in vms report exactly the same Memory.Used value.
//
// Only VMs with Type "qemu", Status "running" and positive Memory.Total and
// Memory.Used are counted. The result is marked suspicious (pattern
// "exact-used") when at least minRunningVMsForRepeatedMemoryCheck VMs were
// counted and the most common used value is shared by at least
// minRepeatedVMsForSuspicion of them, making up at least minRepeatedMemoryShare
// of the counted VMs. Otherwise only runningCount is populated.
func detectRepeatedVMMemoryUsage(vms []models.VM) repeatedVMMemoryUsage {
	byUsed := make(map[int64]*repeatedMemoryGroup)
	running := 0

	for _, guest := range vms {
		eligible := guest.Type == "qemu" && guest.Status == "running" &&
			guest.Memory.Total > 0 && guest.Memory.Used > 0
		if !eligible {
			continue
		}
		running++

		bucket := byUsed[guest.Memory.Used]
		if bucket == nil {
			bucket = &repeatedMemoryGroup{sources: make(map[string]int)}
			byUsed[guest.Memory.Used] = bucket
		}
		bucket.count++

		// A blank source is still tallied, under a placeholder label.
		label := strings.TrimSpace(guest.MemorySource)
		if label == "" {
			label = "unknown"
		}
		bucket.sources[label]++

		if len(bucket.names) >= maxRepeatedMemorySampleNames {
			continue
		}
		display := strings.TrimSpace(guest.Name)
		if display == "" {
			display = guest.ID
		}
		bucket.names = append(bucket.names, display)
	}

	notSuspicious := repeatedVMMemoryUsage{runningCount: running}
	if running < minRunningVMsForRepeatedMemoryCheck {
		return notSuspicious
	}

	// Pick the used value shared by the most VMs.
	var (
		dominantUsed int64
		dominant     *repeatedMemoryGroup
	)
	for used, bucket := range byUsed {
		if dominant != nil && bucket.count <= dominant.count {
			continue
		}
		dominantUsed, dominant = used, bucket
	}
	if dominant == nil {
		return notSuspicious
	}

	share := float64(dominant.count) / float64(running)
	if dominant.count < minRepeatedVMsForSuspicion || share < minRepeatedMemoryShare {
		return notSuspicious
	}

	sources := formatMemorySourceBreakdown(dominant.sources)
	// Sort a copy so the group's own name list keeps its encounter order.
	names := append([]string(nil), dominant.names...)
	sort.Strings(names)

	return repeatedVMMemoryUsage{
		suspicious:      true,
		signature:       fmt.Sprintf("%d:%d:%s", dominantUsed, dominant.count, strings.Join(sources, ",")),
		pattern:         "exact-used",
		runningCount:    running,
		repeatedCount:   dominant.count,
		repeatedMemUsed: dominantUsed,
		sourceBreakdown: sources,
		sampleVMNames:   names,
	}
}
// formatMemorySourceBreakdown renders per-source counts as "source:count"
// strings ordered by descending count, with ties broken by ascending source
// name so the output does not depend on map iteration order. The returned
// slice is non-nil even when counts is empty.
func formatMemorySourceBreakdown(counts map[string]int) []string {
	type tally struct {
		label string
		n     int
	}

	tallies := make([]tally, 0, len(counts))
	for label, n := range counts {
		tallies = append(tallies, tally{label: label, n: n})
	}

	sort.Slice(tallies, func(a, b int) bool {
		left, right := tallies[a], tallies[b]
		if left.n != right.n {
			return left.n > right.n
		}
		return left.label < right.label
	})

	formatted := make([]string, len(tallies))
	for idx, entry := range tallies {
		formatted[idx] = fmt.Sprintf("%s:%d", entry.label, entry.n)
	}
	return formatted
}
// detectRepeatedLowTrustFullUsage reports whether a dominant share of the
// running qemu VMs show near-full memory usage that comes from a low-trust
// source, regardless of the absolute used value.
//
// A VM is counted as running when it has Type "qemu", Status "running" and a
// positive Memory.Total. It is counted as repeated when its Memory.Usage is not
// below 99 and its source reliability does not exceed
// vmMemorySourceReliabilityFallback. When snapshots pairs one-to-one with vms,
// the source is taken from lowTrustMemoryCandidate and VMs without a positive
// candidate are skipped; otherwise the VM's own trimmed MemorySource is used.
// The result is marked suspicious (pattern "low-trust-full-usage") using the
// same count and share thresholds as the other detectors in this file.
func detectRepeatedLowTrustFullUsage(vms []models.VM, snapshots []GuestMemorySnapshot) repeatedVMMemoryUsage {
	if len(vms) == 0 {
		return repeatedVMMemoryUsage{}
	}

	// Snapshots are only consulted when they line up index-for-index with vms.
	paired := len(snapshots) == len(vms)

	var running, flagged int
	perSource := make(map[string]int)
	examples := make([]string, 0, maxRepeatedMemorySampleNames)

	for idx := range vms {
		guest := vms[idx]
		if guest.Type != "qemu" || guest.Status != "running" || guest.Memory.Total <= 0 {
			continue
		}
		running++

		origin := strings.TrimSpace(guest.MemorySource)
		if paired {
			used, rawSource, ok := lowTrustMemoryCandidate(guest, snapshots[idx])
			if !ok || used <= 0 {
				continue
			}
			origin = rawSource
		}
		if vmMemorySourceReliability(origin) > vmMemorySourceReliabilityFallback {
			continue
		}
		if guest.Memory.Usage < 99 {
			continue
		}

		flagged++
		perSource[origin]++
		if len(examples) >= maxRepeatedMemorySampleNames {
			continue
		}
		display := strings.TrimSpace(guest.Name)
		if display == "" {
			display = guest.ID
		}
		examples = append(examples, display)
	}

	notSuspicious := repeatedVMMemoryUsage{runningCount: running}
	if running < minRunningVMsForRepeatedMemoryCheck {
		return notSuspicious
	}
	if flagged < minRepeatedVMsForSuspicion {
		return notSuspicious
	}
	if float64(flagged)/float64(running) < minRepeatedMemoryShare {
		return notSuspicious
	}

	sources := formatMemorySourceBreakdown(perSource)
	sort.Strings(examples)

	return repeatedVMMemoryUsage{
		suspicious:      true,
		signature:       fmt.Sprintf("full-usage:%d:%s", flagged, strings.Join(sources, ",")),
		pattern:         "low-trust-full-usage",
		runningCount:    running,
		repeatedCount:   flagged,
		sourceBreakdown: sources,
		sampleVMNames:   examples,
	}
}
// logSuspiciousRepeatedVMMemoryUsage emits a warning when currentVMs show a
// suspicious repeated-memory pattern that differs from the one found in
// previousVMs.
//
// Both VM lists are run through the same detector chain: the exact-used
// detector first, then the low-trust full-usage detector as a fallback. Using
// the same chain for the previous poll is what lets a persisting
// "low-trust-full-usage" pattern be recognised as already reported; comparing
// it only against the exact-used detector could never match its signature and
// would repeat the warning on every poll.
//
// instanceName is attached to the log entry only; it does not filter the VMs.
func (m *Monitor) logSuspiciousRepeatedVMMemoryUsage(instanceName string, currentVMs []models.VM, previousVMs []models.VM) {
	detect := func(vms []models.VM) repeatedVMMemoryUsage {
		usage := detectRepeatedVMMemoryUsage(vms)
		if !usage.suspicious {
			usage = detectRepeatedLowTrustFullUsage(vms, nil)
		}
		return usage
	}

	current := detect(currentVMs)
	if !current.suspicious {
		return
	}

	// Skip the warning when the previous poll already showed the same pattern.
	if previous := detect(previousVMs); previous.suspicious && previous.signature == current.signature {
		return
	}

	log.Warn().
		Str("instance", instanceName).
		Str("pattern", current.pattern).
		Int("runningVMs", current.runningCount).
		Int("repeatedVMs", current.repeatedCount).
		Int64("repeatedMemoryUsedBytes", current.repeatedMemUsed).
		Strs("memorySources", current.sourceBreakdown).
		Strs("sampleVMs", current.sampleVMNames).
		Msg("Suspicious repeated VM memory-used pattern detected")
}
// stabilizeSuspiciousRepeatedVMMemory replaces suspicious low-trust memory
// readings in vms with the readings the same VMs had in previousVMs, and
// returns how many VMs were changed.
//
// Detection tries, in order, the low-trust exact-used detector, the plain
// exact-used detector and the low-trust full-usage detector; if none flags a
// pattern nothing is modified. A VM is then a candidate when it is a running
// qemu VM with a positive Memory.Total, its source reliability does not exceed
// vmMemorySourceReliabilityFallback, and it matches the detected pattern
// (Memory.Usage not below 99 for "low-trust-full-usage", otherwise Memory.Used
// equal to the repeated value). Candidates are only rewritten when a previous
// entry with the same ID exists and shouldStabilizeSuspiciousRepeatedVMMemory
// accepts it.
//
// vms is modified in place. alertVMs and snapshots receive the same memory
// values at the same index, but only when their length equals len(vms).
func stabilizeSuspiciousRepeatedVMMemory(vms []models.VM, alertVMs []models.VM, snapshots []GuestMemorySnapshot, previousVMs []models.VM, now time.Time) int {
	detected := detectRepeatedLowTrustVMMemoryUsage(vms, snapshots)
	if !detected.suspicious {
		detected = detectRepeatedVMMemoryUsage(vms)
	}
	if !detected.suspicious {
		detected = detectRepeatedLowTrustFullUsage(vms, snapshots)
	}
	if !detected.suspicious {
		return 0
	}

	previousByID := make(map[string]models.VM, len(previousVMs))
	for idx := range previousVMs {
		previousByID[previousVMs[idx].ID] = previousVMs[idx]
	}

	mirrorAlerts := len(alertVMs) == len(vms)
	mirrorSnapshots := len(snapshots) == len(vms)
	fullUsagePattern := detected.pattern == "low-trust-full-usage"

	stabilized := 0
	for idx := range vms {
		guest := &vms[idx]
		if guest.Type != "qemu" || guest.Status != "running" || guest.Memory.Total <= 0 {
			continue
		}
		if vmMemorySourceReliability(guest.MemorySource) > vmMemorySourceReliabilityFallback {
			continue
		}
		if fullUsagePattern {
			if guest.Memory.Usage < 99 {
				continue
			}
		} else if guest.Memory.Used != detected.repeatedMemUsed {
			continue
		}

		earlier, known := previousByID[guest.ID]
		if !known || !shouldStabilizeSuspiciousRepeatedVMMemory(earlier, *guest, now) {
			continue
		}

		// Restore the previous memory reading but keep the current balloon
		// value when one was reported.
		liveBalloon := guest.Memory.Balloon
		guest.Memory = earlier.Memory
		if liveBalloon > 0 {
			guest.Memory.Balloon = liveBalloon
		}
		guest.MemorySource = "previous-snapshot"

		if mirrorAlerts {
			alertVMs[idx].Memory = guest.Memory
			alertVMs[idx].MemorySource = guest.MemorySource
		}
		if mirrorSnapshots {
			snap := &snapshots[idx]
			snap.Memory = guest.Memory
			snap.MemorySource = guest.MemorySource
			snap.Notes = appendSnapshotNote(snap.Notes, "preserved-previous-memory-after-repeated-low-trust-pattern")
		}
		stabilized++
	}

	if stabilized == 0 {
		return 0
	}

	log.Warn().
		Str("pattern", detected.pattern).
		Int("runningVMs", detected.runningCount).
		Int("repeatedVMs", detected.repeatedCount).
		Int("stabilizedVMs", stabilized).
		Int64("repeatedMemoryUsedBytes", detected.repeatedMemUsed).
		Strs("memorySources", detected.sourceBreakdown).
		Strs("sampleVMs", detected.sampleVMNames).
		Msg("Stabilized suspicious repeated VM memory-used pattern with previous readings")
	return stabilized
}
// detectRepeatedLowTrustVMMemoryUsage reports whether a dominant share of the
// running qemu VMs share the same low-trust memory-used value, as derived from
// each VM's paired snapshot by lowTrustMemoryCandidate.
//
// snapshots must pair one-to-one with vms; otherwise (or when vms is empty) a
// zero result is returned. Every running qemu VM with a positive Memory.Total
// counts toward runningCount, but only VMs with a positive candidate value are
// grouped. The result is marked suspicious using the same count and share
// thresholds as detectRepeatedVMMemoryUsage. The pattern field is left unset.
func detectRepeatedLowTrustVMMemoryUsage(vms []models.VM, snapshots []GuestMemorySnapshot) repeatedVMMemoryUsage {
	if len(vms) == 0 || len(snapshots) != len(vms) {
		return repeatedVMMemoryUsage{}
	}

	byUsed := make(map[int64]*repeatedMemoryGroup)
	running := 0

	for idx := range vms {
		guest := vms[idx]
		if guest.Type != "qemu" || guest.Status != "running" || guest.Memory.Total <= 0 {
			continue
		}
		running++

		used, origin, ok := lowTrustMemoryCandidate(guest, snapshots[idx])
		if !ok || used <= 0 {
			continue
		}

		bucket := byUsed[used]
		if bucket == nil {
			bucket = &repeatedMemoryGroup{sources: make(map[string]int)}
			byUsed[used] = bucket
		}
		bucket.count++
		bucket.sources[origin]++

		if len(bucket.names) >= maxRepeatedMemorySampleNames {
			continue
		}
		display := strings.TrimSpace(guest.Name)
		if display == "" {
			display = guest.ID
		}
		bucket.names = append(bucket.names, display)
	}

	notSuspicious := repeatedVMMemoryUsage{runningCount: running}
	if running < minRunningVMsForRepeatedMemoryCheck {
		return notSuspicious
	}

	// Pick the candidate value shared by the most VMs.
	var (
		dominantUsed int64
		dominant     *repeatedMemoryGroup
	)
	for used, bucket := range byUsed {
		if dominant != nil && bucket.count <= dominant.count {
			continue
		}
		dominantUsed, dominant = used, bucket
	}
	if dominant == nil {
		return notSuspicious
	}

	share := float64(dominant.count) / float64(running)
	if dominant.count < minRepeatedVMsForSuspicion || share < minRepeatedMemoryShare {
		return notSuspicious
	}

	sources := formatMemorySourceBreakdown(dominant.sources)
	// Sort a copy so the group's own name list keeps its encounter order.
	names := append([]string(nil), dominant.names...)
	sort.Strings(names)

	return repeatedVMMemoryUsage{
		suspicious:      true,
		signature:       fmt.Sprintf("%d:%d:%s", dominantUsed, dominant.count, strings.Join(sources, ",")),
		runningCount:    running,
		repeatedCount:   dominant.count,
		repeatedMemUsed: dominantUsed,
		sourceBreakdown: sources,
		sampleVMNames:   names,
	}
}
// lowTrustMemoryCandidate derives the memory-used value that a low-trust source
// would report for vm, using the raw readings stored in snapshot.
//
// The source is snapshot.MemorySource, falling back to vm.MemorySource when the
// snapshot's is blank. It returns the used value, the source label it was
// derived from, and true on success. It returns (0, "", false) when the source
// is not one of the handled low-trust sources, when the needed raw reading is
// missing or larger than the VM's total, or when vm.Memory.Total is not
// positive.
//
// For a "previous-snapshot" source a value is only derived when the snapshot
// carries one of the "preserved-previous-memory-…" notes; the raw readings are
// then tried in the order status-mem, meminfo-total-minus-used, listing-mem.
func lowTrustMemoryCandidate(vm models.VM, snapshot GuestMemorySnapshot) (int64, string, bool) {
	total := vm.Memory.Total
	// Reject non-positive totals up front: converting a negative total to
	// uint64 wraps to a huge value and would defeat the range checks below.
	if total <= 0 {
		return 0, "", false
	}

	source := strings.TrimSpace(snapshot.MemorySource)
	if source == "" {
		source = strings.TrimSpace(vm.MemorySource)
	}

	switch source {
	case "status-mem":
		return clampLowTrustMemoryCandidate(total, snapshot.Raw.StatusMem, "status-mem")
	case "status-freemem":
		if snapshot.Raw.StatusFreeMem == 0 || uint64(total) < snapshot.Raw.StatusFreeMem {
			return 0, "", false
		}
		return int64(uint64(total) - snapshot.Raw.StatusFreeMem), "status-freemem", true
	case "listing-mem", "cluster-resources":
		return clampLowTrustMemoryCandidate(total, snapshot.Raw.ListingMem, source)
	case "meminfo-total-minus-used":
		if snapshot.Raw.MemInfoTotalMinusUsed == 0 || uint64(total) < snapshot.Raw.MemInfoTotalMinusUsed {
			return 0, "", false
		}
		return int64(uint64(total) - snapshot.Raw.MemInfoTotalMinusUsed), "meminfo-total-minus-used", true
	case "previous-snapshot":
		if !snapshotHasNote(snapshot.Notes, "preserved-previous-memory-after-low-trust-fallback") &&
			!snapshotHasNote(snapshot.Notes, "preserved-previous-memory-after-repeated-low-trust-pattern") {
			return 0, "", false
		}
		switch {
		case snapshot.Raw.StatusMem > 0:
			return clampLowTrustMemoryCandidate(total, snapshot.Raw.StatusMem, "status-mem")
		case snapshot.Raw.MemInfoTotalMinusUsed > 0 && uint64(total) >= snapshot.Raw.MemInfoTotalMinusUsed:
			return int64(uint64(total) - snapshot.Raw.MemInfoTotalMinusUsed), "meminfo-total-minus-used", true
		case snapshot.Raw.ListingMem > 0:
			return clampLowTrustMemoryCandidate(total, snapshot.Raw.ListingMem, "listing-mem")
		}
	}
	return 0, "", false
}
// clampLowTrustMemoryCandidate converts a raw used reading into a candidate
// value that never exceeds total. It returns (0, "", false) when total is not
// positive or used is zero; otherwise it returns the clamped value, the given
// source label and true.
func clampLowTrustMemoryCandidate(total int64, used uint64, source string) (int64, string, bool) {
	if total <= 0 || used == 0 {
		return 0, "", false
	}
	if limit := uint64(total); used <= limit {
		return int64(used), source, true
	}
	// The reading exceeds the VM's total, so cap it at the total.
	return total, source, true
}
// snapshotHasNote reports whether notes contains an entry exactly equal to
// target.
func snapshotHasNote(notes []string, target string) bool {
	for idx := 0; idx < len(notes); idx++ {
		if notes[idx] == target {
			return true
		}
	}
	return false
}
// shouldStabilizeSuspiciousRepeatedVMMemory reports whether prev's memory
// reading may be carried forward in place of current's.
//
// It returns false when any of the following holds:
//   - either VM is not a running qemu VM;
//   - prev has a non-positive Memory.Total or a negative Memory.Used;
//   - prev.LastSeen is zero or older than vmMemoryCarryForwardMaxAge at now;
//   - both totals are positive and differ;
//   - prev's source is not "previous-snapshot" and its reliability is below
//     vmMemorySourceReliabilityTrusted;
//   - current's source reliability exceeds vmMemorySourceReliabilityFallback;
//   - prev.Memory.Usage is positive and differs from current's by less than
//     vmMemoryCarryForwardMinUsageDelta.
func shouldStabilizeSuspiciousRepeatedVMMemory(prev, current models.VM, now time.Time) bool {
	switch {
	case current.Type != "qemu", current.Status != "running",
		prev.Type != "qemu", prev.Status != "running":
		return false
	case prev.Memory.Total <= 0, prev.Memory.Used < 0:
		return false
	case prev.LastSeen.IsZero(), now.Sub(prev.LastSeen) > vmMemoryCarryForwardMaxAge:
		return false
	case current.Memory.Total > 0 && prev.Memory.Total > 0 && prev.Memory.Total != current.Memory.Total:
		return false
	case prev.MemorySource != "previous-snapshot" &&
		vmMemorySourceReliability(prev.MemorySource) < vmMemorySourceReliabilityTrusted:
		return false
	case vmMemorySourceReliability(current.MemorySource) > vmMemorySourceReliabilityFallback:
		return false
	case prev.Memory.Usage > 0 &&
		absFloat64(prev.Memory.Usage-current.Memory.Usage) < vmMemoryCarryForwardMinUsageDelta:
		return false
	}
	return true
}
// appendSnapshotNote returns notes with note appended, unless an identical
// entry is already present, in which case notes is returned unchanged.
func appendSnapshotNote(notes []string, note string) []string {
	for idx := range notes {
		if notes[idx] == note {
			return notes
		}
	}
	return append(notes, note)
}
// absFloat64 returns the magnitude of v: negative inputs are negated and all
// other inputs are returned as-is.
func absFloat64(v float64) float64 {
	switch {
	case v < 0:
		return -v
	default:
		return v
	}
}